129 changes: 95 additions & 34 deletions pyQuARC/code/url_validator.py
@@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls):
                starts_with_http.add(text)
        return starts_with_http

    @staticmethod
    def _status_code_from_request(url):
        """
        Return HTTP status code for url, raising requests exceptions to caller.
        """
        headers = get_headers()
        return requests.get(url, headers=headers, timeout=10).status_code
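        # Behavior sketch (illustrative, not part of the diff): a healthy
        # endpoint yields
        #   _status_code_from_request("https://example.com") -> 200
        # while an unreachable host raises requests.ConnectionError and a
        # response slower than the 10-second timeout raises
        # requests.exceptions.Timeout; both propagate to the caller.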

    @staticmethod
    def _extract_and_normalize_urls(text_with_urls):
        """
        Extract URLs from text, include tokens that start with 'http', strip trailing dots,
        and return (set_of_urls, joined_value_string).
        """
        extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
        urls = extractor.find_urls(text_with_urls)
        urls.extend(UrlValidator._extract_http_texts(text_with_urls))
        # remove dots at the end and deduplicate
        urls = set(url[:-1] if url.endswith(".") else url for url in urls)
        value = ", ".join(urls)
        return urls, value
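        # Minimal behavior sketch with a hypothetical input (assuming the
        # extractor picks up both tokens): trailing dots are stripped and
        # duplicates collapse into one entry, e.g.
        #   _extract_and_normalize_urls("See https://example.com. or https://example.com")
        #   -> ({"https://example.com"}, "https://example.com")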

    @staticmethod
    @if_arg
    def health_and_status_check(text_with_urls):
@@ -45,48 +67,87 @@ def health_and_status_check(text_with_urls):
            (dict) An object with the validity of the check and the instance/results
        """

        def status_code_from_request(url):
            headers = get_headers()
            # timeout = 10 seconds, to allow for slow but not invalid connections
            return requests.get(url, headers=headers, timeout=10).status_code

        results = []

        validity = True

        # extract URLs from text
        extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
        urls = extractor.find_urls(text_with_urls)
        urls.extend(UrlValidator._extract_http_texts(text_with_urls))

        # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' at the end)
        # remove duplicated urls
        urls = set(url[:-1] if url.endswith(".") else url for url in urls)
        value = ", ".join(urls)
        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        # check that URL returns a valid response
        for url in urls:
            if not url.startswith("http"):
                url = f"http://{url}"
            try:
                response_code = status_code_from_request(url)
                if response_code == 200:
                    if url.startswith("http://"):
                        secure_url = url.replace("http://", "https://")
                        if status_code_from_request(secure_url) == 200:
                            result = {
                                "url": url,
                                "error": "The URL is secure. Please use 'https' instead of 'http'.",
                            }
            if url.startswith("https"):
                try:
                    response_code = UrlValidator._status_code_from_request(url)
                    if response_code != 200:
                        result = {
                            "url": url,
                            "error": f"The url {url} is broken.",
                        }
                        results.append(result)
                    else:
                        continue
                else:
                    result = {"url": url, "error": f"Status code {response_code}"}
            except requests.ConnectionError:
                result = {"url": url, "error": "The URL does not exist on Internet."}
            except:
                result = {"url": url, "error": "Some unknown error occurred."}
            results.append(result)
                except requests.ConnectionError:
                    result = {"url": url, "error": f"The URL {url} does not exist on Internet."}
                    results.append(result)

        if results:
            validity = False
            value = results

        return {"valid": validity, "value": value}

    @staticmethod
    @if_arg
    def protocol_checks(text_with_urls):
        """
        Checks for ftp links included in `text_with_urls`
        Args:
            text_with_urls (str, required): The text that contains ftp links
        Returns:
            (dict) An object with the validity of the check and the instance/results
        """

        results = []

        validity = True

        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        for url in urls:
            if url.startswith("ftp://"):
                results.append({
                    "url": url,
                    "error": f"The URL {url} exists"
                })

        if results:
            validity = False
            value = results

        return {"valid": validity, "value": value}

    @staticmethod
    @if_arg
    def secure_url_checks(text_with_urls):
        """
        Checks whether the links included in `text_with_urls` are secure (https)
        Args:
            text_with_urls (str, required): The text that contains the links
        Returns:
            (dict) An object with the validity of the check and the instance/results
        """

        results = []

        validity = True

        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        for url in urls:
            if url.startswith("http://"):
                results.append({
                    "url": url,
                    "error": f"The URL {url} is not secure"
                })

        if results:
            validity = False
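For reference, a minimal sketch of how the two new checks behave end to end. The text and URLs below are illustrative, and health_and_status_check is omitted because it performs live GET requests:

from pyQuARC.code.url_validator import UrlValidator

# Hypothetical metadata text with one ftp link and one insecure http link
text = "Access at ftp://example.nasa.gov/granule and docs at http://example.nasa.gov/docs."

print(UrlValidator.protocol_checks(text))
# -> {'valid': False, 'value': [{'url': 'ftp://example.nasa.gov/granule',
#                                'error': 'The URL ftp://example.nasa.gov/granule exists'}]}

print(UrlValidator.secure_url_checks(text))
# -> {'valid': False, 'value': [{'url': 'http://example.nasa.gov/docs',
#                                'error': 'The URL http://example.nasa.gov/docs is not secure'}]}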
26 changes: 21 additions & 5 deletions pyQuARC/schemas/check_messages.json
@@ -40,12 +40,28 @@
        "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time."
    },
    "url_check": {
        "failure": "A URL with a status code other than 200 has been identified: `{}`.",
        "failure": "`{}`.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "This often indicates a broken link. If the URL is broken, recommend revising."
        "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https."
    },
    "protocol_check": {
        "failure": "The following URL `{}` does not exist.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "Recommend removing the ftp access link."
    },
    "secure_url_check": {
        "failure": "`{}`.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "Recommend updating the following link(s) from 'http' to 'https':"
    },
    "url_update_email_check": {
        "failure": "The listed email contact information must be updated.",
@@ -221,7 +237,7 @@
            "message": "",
            "url": ""
        },
        "remediation": "If data collection is ongoing, provide an EndsAtPresentFlag of \"true\""
        "remediation": "Since data collection is no longer ongoing, recommend updating the EndsAtPresentFlag to 'false'."
    },
    "ends_at_present_flag_presence_check": {
        "failure": "Potential issue with:\n - No EndingDateTime provided; no EndsAtPresentFlag provided for a potentially active collection. \n - CollectionState is not \"COMPLETE\"; no EndsAtPresentFlag provided for a potentially active collection.",
@@ -752,12 +768,12 @@
        "remediation": "Recommend providing an entry of 'true' or 'false'."
    },
    "collection_progress_consistency_check": {
        "failure": "The Collection State/Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.",
        "failure": "The Collection Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.",
        "help": {
            "message": "",
            "url": "https://wiki.earthdata.nasa.gov/display/CMR/Collection+Progress"
        },
        "remediation": "Recommend updating the Collection State/Progress based on the Ending Date Time and Ends At Present Flag values."
        "remediation": "Recommend updating the Collection Progress based on the Ending Date Time and Ends At Present Flag values."
    },
    "online_resource_type_gcmd_check": {
        "failure": "The provided Online Resource/Related URLs Type `{}` is not consistent with GCMD.",
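For context, a rough sketch of how a failure template above becomes a reader-facing message, assuming pyQuARC fills the `{}` placeholder with the value returned by the check (the variable names here are illustrative):

# Illustrative message assembly from a check_messages.json entry
failure_template = "The following URL `{}` does not exist."  # protocol_check "failure"
check_result = {"valid": False, "value": "ftp://example.nasa.gov/granule"}

if not check_result["valid"]:
    print(failure_template.format(check_result["value"]))
    # The following URL `ftp://example.nasa.gov/granule` does not exist.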
10 changes: 10 additions & 0 deletions pyQuARC/schemas/checks.json
@@ -24,6 +24,16 @@
        "check_function": "health_and_status_check",
        "available": true
    },
    "protocol_check": {
        "data_type": "url",
        "check_function": "protocol_checks",
        "available": true
    },
    "secure_url_check": {
        "data_type": "url",
        "check_function": "secure_url_checks",
        "available": true
    },
    "url_update_email_check": {
        "data_type": "url",
        "check_function": "url_update_email_check",
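To show how these entries plug in, a minimal dispatch sketch. This is an assumption for illustration only: pyQuARC's actual runner also resolves data_type to the validator class, while here the lookup is hard-wired to UrlValidator:

import json
from pyQuARC.code.url_validator import UrlValidator

# Illustrative: resolve a check entry to its implementing function by name
with open("pyQuARC/schemas/checks.json") as f:
    checks = json.load(f)

entry = checks["secure_url_check"]
if entry["available"]:
    check_function = getattr(UrlValidator, entry["check_function"])
    result = check_function("See http://example.nasa.gov/docs")
    print(result["valid"])  # False, because the link is not https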