129 changes: 95 additions & 34 deletions pyQuARC/code/url_validator.py
@@ -34,6 +34,28 @@ def _extract_http_texts(text_with_urls):
                starts_with_http.add(text)
        return starts_with_http

    @staticmethod
    def _status_code_from_request(url):
        """
        Return HTTP status code for url, raising requests exceptions to caller.
        """
        headers = get_headers()
        return requests.get(url, headers=headers, timeout=10).status_code
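        # Behavior sketch (illustrative, not part of the diff): a healthy
        # endpoint yields
        #   _status_code_from_request("https://example.com") -> 200
        # while an unreachable host raises requests.ConnectionError and a
        # response slower than the 10-second timeout raises
        # requests.exceptions.Timeout; both propagate to the caller.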

    @staticmethod
    def _extract_and_normalize_urls(text_with_urls):
        """
        Extract URLs from text, include tokens that start with 'http', strip trailing dots,
        and return (set_of_urls, joined_value_string).
        """
        extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
        urls = extractor.find_urls(text_with_urls)
        urls.extend(UrlValidator._extract_http_texts(text_with_urls))
        # remove dots at the end and deduplicate
        urls = set(url[:-1] if url.endswith(".") else url for url in urls)
        value = ", ".join(urls)
        return urls, value
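        # Minimal behavior sketch with a hypothetical input (assuming the
        # extractor picks up both tokens): trailing dots are stripped and
        # duplicates collapse into one entry, e.g.
        #   _extract_and_normalize_urls("See https://example.com. or https://example.com")
        #   -> ({"https://example.com"}, "https://example.com")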

    @staticmethod
    @if_arg
    def health_and_status_check(text_with_urls):
@@ -45,48 +67,87 @@ def health_and_status_check(text_with_urls):
            (dict) An object with the validity of the check and the instance/results
        """

        def status_code_from_request(url):
            headers = get_headers()
            # timeout = 10 seconds, to allow for slow but not invalid connections
            return requests.get(url, headers=headers, timeout=10).status_code

        results = []

        validity = True

        # extract URLs from text
        extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
        urls = extractor.find_urls(text_with_urls)
        urls.extend(UrlValidator._extract_http_texts(text_with_urls))

        # remove dots at the end (The URLExtract library catches URLs, but sometimes appends a '.' at the end)
        # remove duplicated urls
        urls = set(url[:-1] if url.endswith(".") else url for url in urls)
        value = ", ".join(urls)
        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        # check that URL returns a valid response
        for url in urls:
            if not url.startswith("http"):
                url = f"http://{url}"
            try:
                response_code = status_code_from_request(url)
                if response_code == 200:
                    if url.startswith("http://"):
                        secure_url = url.replace("http://", "https://")
                        if status_code_from_request(secure_url) == 200:
                            result = {
                                "url": url,
                                "error": "The URL is secure. Please use 'https' instead of 'http'.",
                            }
            if url.startswith("https"):
                try:
                    response_code = UrlValidator._status_code_from_request(url)
                    if response_code != 200:
                        result = {
                            "url": url,
                            "error": f"The url {url} is broken.",
                        }
                        results.append(result)
                    else:
                        continue
                else:
                    result = {"url": url, "error": f"Status code {response_code}"}
            except requests.ConnectionError:
                result = {"url": url, "error": "The URL does not exist on Internet."}
            except:
                result = {"url": url, "error": "Some unknown error occurred."}
            results.append(result)
                except requests.ConnectionError:
                    result = {"url": url, "error": f"The URL {url} does not exist on Internet."}
                    results.append(result)

        if results:
            validity = False
            value = results

        return {"valid": validity, "value": value}

    @staticmethod
    @if_arg
    def protocol_checks(text_with_urls):
        """
        Checks for ftp links included in `text_with_urls`
        Args:
            text_with_urls (str, required): The text that contains ftp links
        Returns:
            (dict) An object with the validity of the check and the instance/results
        """

        results = []

        validity = True

        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        for url in urls:
            if url.startswith("ftp://"):
                results.append({
                    "url": url,
                    "error": f"The URL {url} exists"
                })

        if results:
            validity = False
            value = results

        return {"valid": validity, "value": value}

    @staticmethod
    @if_arg
    def secure_url_checks(text_with_urls):
        """
        Checks whether the links included in `text_with_urls` are secure (https)
        Args:
            text_with_urls (str, required): The text that contains the links
        Returns:
            (dict) An object with the validity of the check and the instance/results
        """

        results = []

        validity = True

        urls, value = UrlValidator._extract_and_normalize_urls(text_with_urls)

        for url in urls:
            if url.startswith("http://"):
                results.append({
                    "url": url,
                    "error": f"The URL {url} is not secure"
                })

        if results:
            validity = False
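For reference, a minimal sketch of how the two new checks behave end to end. The text and URLs below are illustrative, and health_and_status_check is omitted because it performs live GET requests:

from pyQuARC.code.url_validator import UrlValidator

# Hypothetical metadata text with one ftp link and one insecure http link
text = "Access at ftp://example.nasa.gov/granule and docs at http://example.nasa.gov/docs."

print(UrlValidator.protocol_checks(text))
# -> {'valid': False, 'value': [{'url': 'ftp://example.nasa.gov/granule',
#                                'error': 'The URL ftp://example.nasa.gov/granule exists'}]}

print(UrlValidator.secure_url_checks(text))
# -> {'valid': False, 'value': [{'url': 'http://example.nasa.gov/docs',
#                                'error': 'The URL http://example.nasa.gov/docs is not secure'}]}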
26 changes: 21 additions & 5 deletions pyQuARC/schemas/check_messages.json
@@ -40,12 +40,28 @@
        "remediation": "Recommend updating the Revision date so that it comes chronologically after the Insert/Creation time."
    },
    "url_check": {
        "failure": "A URL with a status code other than 200 has been identified: `{}`.",
        "failure": "`{}`.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "This often indicates a broken link. If the URL is broken, recommend revising."
        "remediation": "The following link is broken. Recommend replacing the OnlineAccessURL with a link to directly access the granule via https."
    },
    "protocol_check": {
        "failure": "The following URL `{}` does not exist.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "Recommend removing the ftp access link."
    },
    "secure_url_check": {
        "failure": "`{}`.",
        "help": {
            "message": "",
            "url": "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"
        },
        "remediation": "Recommend updating the following link(s) from 'http' to 'https':"
    },
    "url_update_email_check": {
        "failure": "The listed email contact information must be updated.",
@@ -221,7 +237,7 @@
            "message": "",
            "url": ""
        },
        "remediation": "If data collection is ongoing, provide an EndsAtPresentFlag of \"true\""
        "remediation": "Since data collection is no longer ongoing, recommend updating the EndsAtPresentFlag to 'false'."
    },
    "ends_at_present_flag_presence_check": {
        "failure": "Potential issue with:\n - No EndingDateTime provided; no EndsAtPresentFlag provided for a potentially active collection. \n - CollectionState is not \"COMPLETE\"; no EndsAtPresentFlag provided for a potentially active collection.",
@@ -752,12 +768,12 @@
        "remediation": "Recommend providing an entry of 'true' or 'false'."
    },
    "collection_progress_consistency_check": {
        "failure": "The Collection State/Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.",
        "failure": "The Collection Progress `{}` is not consistent with the Ending Date Time and/or the Ends At Present Flag.",
        "help": {
            "message": "",
            "url": "https://wiki.earthdata.nasa.gov/display/CMR/Collection+Progress"
        },
        "remediation": "Recommend updating the Collection State/Progress based on the Ending Date Time and Ends At Present Flag values."
        "remediation": "Recommend updating the Collection Progress based on the Ending Date Time and Ends At Present Flag values."
    },
    "online_resource_type_gcmd_check": {
        "failure": "The provided Online Resource/Related URLs Type `{}` is not consistent with GCMD.",
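For context, a rough sketch of how a failure template above becomes a reader-facing message, assuming pyQuARC fills the `{}` placeholder with the value returned by the check (the variable names here are illustrative):

# Illustrative message assembly from a check_messages.json entry
failure_template = "The following URL `{}` does not exist."  # protocol_check "failure"
check_result = {"valid": False, "value": "ftp://example.nasa.gov/granule"}

if not check_result["valid"]:
    print(failure_template.format(check_result["value"]))
    # The following URL `ftp://example.nasa.gov/granule` does not exist.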
10 changes: 10 additions & 0 deletions pyQuARC/schemas/checks.json
@@ -24,6 +24,16 @@
        "check_function": "health_and_status_check",
        "available": true
    },
    "protocol_check": {
        "data_type": "url",
        "check_function": "protocol_checks",
        "available": true
    },
    "secure_url_check": {
        "data_type": "url",
        "check_function": "secure_url_checks",
        "available": true
    },
    "url_update_email_check": {
        "data_type": "url",
        "check_function": "url_update_email_check",
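To show how these entries plug in, a minimal dispatch sketch. This is an assumption for illustration only: pyQuARC's actual runner also resolves data_type to the validator class, while here the lookup is hard-wired to UrlValidator:

import json
from pyQuARC.code.url_validator import UrlValidator

# Illustrative: resolve a check entry to its implementing function by name
with open("pyQuARC/schemas/checks.json") as f:
    checks = json.load(f)

entry = checks["secure_url_check"]
if entry["available"]:
    check_function = getattr(UrlValidator, entry["check_function"])
    result = check_function("See http://example.nasa.gov/docs")
    print(result["valid"])  # False, because the link is not https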