 3 |  3 | import shutil
 4 |  4 | import subprocess
 5 |  5 | import sys
 6 |    | -from urllib import request
 7 |    | -import urllib.error
   |  6 | +import requests
 8 |  7 | import webbrowser
 9 |  8 | from itertools import repeat
10 |  9 | from pathlib import Path

15 | 14 |     Tuple,
16 | 15 | )
17 | 16 |
18 |    | -
   | 17 | +import re
19 | 18 | import nox
20 | 19 | from nox import Session
21 | 20 |
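Note for reviewers: requests is a third-party dependency, unlike the urllib modules it replaces, and it behaves differently on HTTP error statuses. A rough side-by-side sketch of the two call styles (not part of the commit; the url value is illustrative):

import urllib.request
import requests

url = "https://example.com"  # illustrative

# before: stdlib urllib; urlopen raises urllib.error.HTTPError on 4xx/5xx
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/10.0"})
with urllib.request.urlopen(req) as result:
    status, reason = result.code, result.msg

# after: requests; get() returns normally on 4xx/5xx, so the status must be inspected
result = requests.get(url, headers={"User-Agent": "Mozilla/10.0"}, timeout=5)
status, reason = result.status_code, result.reason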
@@ -67,27 +66,18 @@ def should_filter(url: str) -> bool:
 67 | 66 |         return url.startswith("mailto") or url in _filtered
 68 | 67 |
 69 | 68 |     for file in files:
 70 |    | -        cmd = ["python", "-m", "urlscan", "-n", f"{file}"]
 71 |    | -        result = subprocess.run(cmd, capture_output=True)
 72 |    | -        if result.returncode != 0:
 73 |    | -            stderr = result.stderr.decode("utf8")
 74 |    | -            msg = f"Could not retrieve url's from file: {file}, details: {stderr}"
 75 |    | -            raise Exception(msg)
 76 |    | -        stdout = result.stdout.decode("utf8").strip()
 77 |    | -        _urls = (url.strip() for url in stdout.split("\n"))
 78 |    | -        _urls = (url for url in _urls if url)  # filter empty strings and none
 79 |    | -        yield from zip(repeat(file), filter(lambda url: not should_filter(url), _urls))
    | 69 | +        urls = re.findall(r"http[s]?://[^\s<>'\"\,\)\]]+[^\s<>'\"\,\.\)\]]", file.read_text())
    | 70 | +        yield from zip(repeat(file), filter(lambda url: not should_filter(url), urls))
 80 | 71 |
 81 | 72 |
 82 | 73 | def _doc_links_check(url: str) -> Tuple[Optional[int], str]:
 83 | 74 |     """Checks if an url is still working (can be accessed)"""
 84 | 75 |     try:
 85 | 76 |         # User-Agent needs to be faked otherwise some webpages will deny access with a 403
 86 |    | -        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/10.0"})
 87 |    | -        result = request.urlopen(req)
 88 |    | -        return result.code, f"{result.msg}"
 89 |    | -    except urllib.error.HTTPError as ex:
 90 |    | -        return ex.code, f"{ex}"
    | 77 | +        result = requests.get(url, headers={"User-Agent": "Mozilla/10.0"}, timeout=5)
    | 78 | +        return result.status_code, f"{result.reason}"
    | 79 | +    except requests.exceptions.RequestException as ex:
    | 80 | +        return None, f"{ex}"
 91 | 81 |
 92 | 82 |
 93 | 83 | def _git_diff_changes_main() -> int:
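For context, a minimal standalone sketch of the new extract-then-check flow (the sample text and URLs are illustrative, not from the repository):

import re
import requests

URL_PATTERN = r"http[s]?://[^\s<>'\"\,\)\]]+[^\s<>'\"\,\.\)\]]"

text = "See https://example.com/docs, or http://example.org."
# findall with a group-free pattern returns each match as a plain string
urls = re.findall(URL_PATTERN, text)

for url in urls:
    try:
        # fake the User-Agent so servers that reject default clients do not 403
        response = requests.get(url, headers={"User-Agent": "Mozilla/10.0"}, timeout=5)
        print(url, response.status_code, response.reason)
    except requests.exceptions.RequestException as ex:
        print(url, None, ex)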
@@ -150,10 +140,15 @@ def docs_list_links(session: Session) -> None:
150 | 140 | def docs_links_check(session: Session) -> None:
151 | 141 |     """Checks whether all links in the documentation are accessible."""
152 | 142 |     errors = []
153 |     | -    for path, url in _doc_urls(_doc_files(PROJECT_CONFIG.root)):
    | 143 | +    urls = list(_doc_urls(_doc_files(PROJECT_CONFIG.root)))
    | 144 | +    urls_count = len(urls)
    | 145 | +    count = 1
    | 146 | +    for path, url in urls:
    | 147 | +        print(f"({count}/{urls_count}): {url}")
154 | 148 |         status, details = _doc_links_check(url)
155 | 149 |         if status != 200:
156 | 150 |             errors.append((path, url, status, details))
    | 151 | +        count += 1
157 | 152 |
158 | 153 |     if errors:
159 | 154 |         session.error(
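The manual counter could also be folded into the loop with enumerate; a sketch of that variant, reusing the names from the hunk above (not part of the commit):

for count, (path, url) in enumerate(urls, start=1):
    print(f"({count}/{urls_count}): {url}")
    status, details = _doc_links_check(url)
    if status != 200:
        errors.append((path, url, status, details))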