diff --git a/pyQuARC/code/url_validator.py b/pyQuARC/code/url_validator.py index 98acd05a..fc4d7efa 100644 --- a/pyQuARC/code/url_validator.py +++ b/pyQuARC/code/url_validator.py @@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls): starts_with_http.add(text) return starts_with_http + @staticmethod + def _status_code_from_request(url): + """ + Return HTTP status code for url, raising requests exceptions to caller. + """ + headers = get_headers() + return requests.get(url, headers=headers, timeout=10).status_code + + @staticmethod + def _extract_and_normalize_urls(text_with_urls): + """ + Extract URLs from text, include tokens that start with 'http', strip trailing dots, + and return (set_of_urls, joined_value_string). + """ + extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) + urls = extractor.find_urls(text_with_urls) + urls.extend(UrlValidator._extract_http_texts(text_with_urls)) + # remove dots at the end and deduplicate + urls = set(url[:-1] if url.endswith(".") else url for url in urls) + value = ", ".join(urls) + return urls, value + @staticmethod @if_arg def health_and_status_check(text_with_urls): @@ -45,48 +67,87 @@ def health_and_status_check(text_with_urls): (dict) An object with the validity of the check and the instance/results """ - def status_code_from_request(url): - headers = get_headers() - # timeout = 10 seconds, to allow for slow but not invalid connections - return requests.get(url, headers=headers, timeout=10).status_code - results = [] validity = True - # extract URLs from text - extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR")) - urls = extractor.find_urls(text_with_urls) - urls.extend(UrlValidator._extract_http_texts(text_with_urls)) - - # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' 
at the end) - # remove duplicated urls - urls = set(url[:-1] if url.endswith(".") else url for url in urls) - value = ", ".join(urls) + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) - # check that URL returns a valid response for url in urls: - if not url.startswith("http"): - url = f"http://{url}" - try: - response_code = status_code_from_request(url) - if response_code == 200: - if url.startswith("http://"): - secure_url = url.replace("http://", "https://") - if status_code_from_request(secure_url) == 200: - result = { - "url": url, - "error": "The URL is secure. Please use 'https' instead of 'http'.", - } + if url.startswith("https"): + try: + response_code = UrlValidator._status_code_from_request(url) + if response_code != 200: + result = { + "url": url, + "error": f"The URL {url} is broken.", + } + results.append(result) else: continue - else: - result = {"url": url, "error": f"Status code {response_code}"} - except requests.ConnectionError: - result = {"url": url, "error": "The URL does not exist on Internet."} - except: - result = {"url": url, "error": "Some unknown error occurred."} - results.append(result) + except requests.ConnectionError: + result = {"url": url, "error": f"The URL {url} does not exist on Internet."} + results.append(result) + + if results: + validity = False + value = results + + return {"valid": validity, "value": value} + + @staticmethod + @if_arg + def protocol_checks(text_with_urls): + """ + Checks for ftp links included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains ftp + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("ftp://"): + results.append({ + "url": url, + "error": f"The URL {url} uses the insecure ftp protocol" + }) + + if results: + validity = False + value = results + + return {"valid": validity, 
"value": value} + + @staticmethod + @if_arg + def secure_url_checks(text_with_urls): + """ + Checks whether the secure link (https) is included in `text_with_urls` + Args: + text_with_urls (str, required): The text that contains https + Returns: + (dict) An object with the validity of the check and the instance/results + """ + + results = [] + + validity = True + + urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls) + + for url in urls: + if url.startswith("http://"): + results.append({ + "url": url, + "error": f"The URL {url} is not secure" + }) if results: validity = False diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 3c6353e4..ffe03742 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -40,12 +40,28 @@ "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time." }, "url_check": { - "failure": "A URL with a status code other than 200 has been identified: `{}`.", + "failure": "`{}`.", "help": { "message": "", "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" }, - "remediation": "This often indicates a broken link. If the URL is broken, recommend revising." + "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https." + }, + "protocol_check": { + "failure": "The following URL `{}` uses the ftp protocol.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend removing the ftp access link." 
+ }, + "secure_url_check": { + "failure": "`{}`.", + "help": { + "message": "", + "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" + }, + "remediation": "Recommend updating the following link(s) from 'http' to 'https':" }, "url_update_email_check": { "failure": "The listed email contact information must be updated.", @@ -221,7 +237,7 @@ "message": "", "url": "" }, - "remediation": "If data collection is ongoing, provide an EndsAtPresentFlag of \"true\"" + "remediation": "Since data collection is no longer ongoing, recommend updating the EndsAtPresentFlag to 'false'." }, "ends_at_present_flag_presence_check": { "failure": "Potential issue with:\n - No EndingDateTime provided; no EndsAtPresentFlag provided for a potentially active collection. \n - CollectionState is not \"COMPLETE\"; no EndsAtPresentFlag provided for a potentially active collection.", @@ -752,12 +768,12 @@ "remediation": "Recommend providing an entry of 'true' or 'false'." }, "collection_progress_consistency_check": { - "failure": "The Collection State/Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.", + "failure": "The Collection Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Collection+Progress" }, - "remediation": "Recommend updating the Collection State/Progress based on the Ending Date Time and Ends At Present Flag values." + "remediation": "Recommend updating the Collection Progress based on the Ending Date Time and Ends At Present Flag values." 
}, "online_resource_type_gcmd_check": { "failure": "The provided Online Resource/Related URLs Type `{}` is not consistent with GCMD.", diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index c55d2e6a..4fa0df4c 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -24,6 +24,16 @@ "check_function": "health_and_status_check", "available": true }, + "protocol_check": { + "data_type": "url", + "check_function": "protocol_checks", + "available": true + }, + "secure_url_check": { + "data_type": "url", + "check_function": "secure_url_checks", + "available": true + }, "url_update_email_check": { "data_type": "url", "check_function": "url_update_email_check", diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 633bb584..64214155 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -812,6 +812,262 @@ "severity": "error", "check_id": "url_check" }, + + "protocol_check": { + "rule_name": "protocol_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + 
"DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "error", + "check_id": "protocol_check" + }, + "secure_url_check": { + "rule_name": "secure_url_checks", + "fields_to_apply": { + "echo-c": [ + { + "fields": [ + "Collection/Description" + ] + }, + { + "fields": [ + "Collection/SuggestedUsage" + ] + }, + { + "fields": [ + "Collection/CitationforExternalPublication" + ] + }, + { + "fields": [ + "Collection/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Collection/OnlineResources/OnlineResource/URL" + ] + } + ], + "dif10": [ + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + }, + { + "fields": [ + "DIF/Dataset_Citation/Online_Resource" + ] + }, + { + "fields": [ + "DIF/Summary/Abstract" + ] + }, + { + "fields": [ + "DIF/Organization/Organization_URL" + ] + }, + { + "fields": [ + "DIF/Related_URL/URL" + ] + }, + { + "fields": [ + "DIF/Extended_Metadata/Metadata/Value" + ] + } + ], + "umm-c": [ + { + "fields": [ + "DataCenters/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "DataCenters/ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactPersons/ContactInformation/RelatedUrls/URL" + ] + }, + { + "fields": [ + "ContactGroups/ContactInformation/RelatedUrls/URL" + ] + }, + { + 
"fields": [ + "RelatedUrls/URL" + ] + } + ], + "umm-g": [ + { + "fields": [ + "RelatedUrls/URL" + ] + }, + { + "fields": [ + "MetadataSpecification/URL" + ] + } + ], + "echo-g": [ + { + "fields": [ + "Granule/OnlineAccessURLs/OnlineAccessURL/URL" + ] + }, + { + "fields": [ + "Granule/OnlineResources/OnlineResource/URL" + ] + }, + { + "fields": [ + "Granule/AssociatedBrowseImageUrls/ProviderBrowseUrl/URL" + ] + } + ] + }, + "severity": "info", + "check_id": "secure_url_check" + }, + "shortname_uniqueness": { "rule_name": "Short Name uniqueness check", "fields_to_apply": { @@ -1341,7 +1597,7 @@ } ] }, - "severity": "warning", + "severity": "error", "check_id": "ends_at_present_flag_logic_check" }, "ends_at_present_flag_presence_check": {