From d531040396a0ebc3297ea5ee7664fe6984411afd Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:29:57 -0800
Subject: [PATCH 1/8] feat(na): add manual link checker scripts

---
 .github/scripts/check.urls.sh | 47 +++++++++++++++++++++++++++
 .github/scripts/get.urls.sh   | 60 +++++++++++++++++++++++++++++++++++
 .gitignore                    |  1 +
 3 files changed, 108 insertions(+)
 create mode 100755 .github/scripts/check.urls.sh
 create mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
new file mode 100755
index 000000000..89abba1de
--- /dev/null
+++ b/.github/scripts/check.urls.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Function to get HTTP response code of a URL
+get_response_code() {
+    local url=$1
+    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    echo "$response_code"
+}
+
+# Function to check for meta refresh tag in HTML content
+check_meta_refresh() {
+    local html_content=$1
+    url=$2
+    if grep -q '<meta http-equiv="refresh"

diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
new file mode 100755
index 000000000..f0a83b654
--- /dev/null
+++ b/.github/scripts/get.urls.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Function to crawl URLs recursively
+function crawl_urls {
+    local base_url="$1"
+    local path="$2"
+    local url="$base_url$path"
+    local visited_urls=("${@:3}")
+
+    # Check if the URL has already been visited
+    if [[ " ${visited_urls[@]} " =~ " $url " ]]; then
+        return
+    fi
+
+    # Add the current URL to the visited list
+    visited_urls+=("$url")
+
+    # Fetch the HTML content of the URL and suppress all output
+    html_content=$(wget -qO- "$url" 2>/dev/null)
+    wget_exit_status=$?
+
+    # Check if wget command was successful
+    if [ $wget_exit_status -ne 0 ]; then
+        return
+    fi
+
+    # Extract all anchor tags and their href attributes
+    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
+
+    # Output each URL found under the current URL
+    for link in $links; do
+        # Construct absolute URL if the link is relative
+        if [[ $link == /* ]]; then
+            link="$base_url$link"
+        fi
+
+        # Check if the URL is under the specified path and has not been visited before
+        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
+            echo "$link"
+            # Recursively crawl the URL
+            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
+        fi
+    done
+}
+
+echo;echo "Are you on 'main' branch and running hugo on http://localhost:1313?";echo
+read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
+echo # (optional) move to a new line
+if [[ ! $REPLY =~ ^[nN]$ ]]
+then
+    # Start crawling from the base URL with the specified path
+    base_url="http://localhost:1313"
+    path=""
+    declare -a visited_urls=()
+    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
+    count=$(wc -l urls.txt)
+    echo "Saved $count URLs in urls.txt"
+fi
+
+
diff --git a/.gitignore b/.gitignore
index 36fdfda29..7e1030df6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
+urls.txt
\ No newline at end of file

From eaa5c103b8f9634f6c6b13191d90c9ba203ad3be Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:41:06 -0800
Subject: [PATCH 2/8] fix URL count output

---
 .github/scripts/get.urls.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
index f0a83b654..7f505f3f9 100755
--- a/.github/scripts/get.urls.sh
+++ b/.github/scripts/get.urls.sh
@@ -53,7 +53,7 @@ then
     path=""
     declare -a visited_urls=()
     crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
-    count=$(wc -l urls.txt)
+    count=$(wc -l urls.txt|cut -d' ' -f1)
     echo "Saved $count URLs in urls.txt"
 fi
 

From 23f31b9dd8d5a4eb36fa79521bfba138f9f2527a Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Mon, 10 Feb 2025 11:19:42 -0800
Subject: [PATCH 3/8] simplify url check scripts per feedback

---
 .github/scripts/check.urls.sh | 54 +++++++++++++++++++------------
 .github/scripts/get.urls.sh   | 60 -----------------------------------
 2 files changed, 34 insertions(+), 80 deletions(-)
 delete mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index 89abba1de..a94a44b41 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -2,8 +2,9 @@
 
 # Function to get HTTP response code of a URL
 get_response_code() {
+    local response_code
     local url=$1
-    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
     echo "$response_code"
 }
 
 # Function to check for meta refresh tag in HTML content
 check_meta_refresh() {
     local html_content=$1
     url=$2
     if grep -q '<meta http-equiv="refresh"
+get_urls_from_prod_site_map() {
+    urls=$(curl -s https://docs.communityhealthtoolkit.org/sitemap.xml 2>&1 | grep -o "<loc>[^<]*</loc>" | sed -e 's/<[^>]*>//g')
+    urls="${urls//https:\/\/docs.communityhealthtoolkit.org/http:\/\/localhost:1313}"
+    echo "$urls"
+}
+
+echo;echo "Are you on a test branch and is hugo running on http://localhost:1313 ?";echo
 read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
 echo # (optional) move to a new line
 if [[ ! $REPLY =~ ^[nN]$ ]]
 then
-    run_checks
-    echo;echo "Done!";echo
+    prod_urls=$(get_urls_from_prod_site_map)
+    url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
+    run_checks "$prod_urls"
+    echo;echo "Successfully checked ${url_count} URLs!"
 fi
-
diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
deleted file mode 100755
index 7f505f3f9..000000000
--- a/.github/scripts/get.urls.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Function to crawl URLs recursively
-function crawl_urls {
-    local base_url="$1"
-    local path="$2"
-    local url="$base_url$path"
-    local visited_urls=("${@:3}")
-
-    # Check if the URL has already been visited
-    if [[ " ${visited_urls[@]} " =~ " $url " ]]; then
-        return
-    fi
-
-    # Add the current URL to the visited list
-    visited_urls+=("$url")
-
-    # Fetch the HTML content of the URL and suppress all output
-    html_content=$(wget -qO- "$url" 2>/dev/null)
-    wget_exit_status=$?
-
-    # Check if wget command was successful
-    if [ $wget_exit_status -ne 0 ]; then
-        return
-    fi
-
-    # Extract all anchor tags and their href attributes
-    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
-
-    # Output each URL found under the current URL
-    for link in $links; do
-        # Construct absolute URL if the link is relative
-        if [[ $link == /* ]]; then
-            link="$base_url$link"
-        fi
-
-        # Check if the URL is under the specified path and has not been visited before
-        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
-            echo "$link"
-            # Recursively crawl the URL
-            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
-        fi
-    done
-}
-
-echo;echo "Are you on 'main' branch and running hugo on http://localhost:1313?";echo
-read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
-echo # (optional) move to a new line
-if [[ ! $REPLY =~ ^[nN]$ ]]
-then
-    # Start crawling from the base URL with the specified path
-    base_url="http://localhost:1313"
-    path=""
-    declare -a visited_urls=()
-    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
-    count=$(wc -l urls.txt|cut -d' ' -f1)
-    echo "Saved $count URLs in urls.txt"
-fi
-
-

From 033e2fcb19164382efc39f1bb0664748cf0c656e Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Wed, 19 Feb 2025 14:35:08 -0800
Subject: [PATCH 4/8] remove file from gitignore per feedback

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7e1030df6..1b9c52df1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
-urls.txt
\ No newline at end of file
+

From c35385fac9519b86c177be795e58c29a88936008 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Wed, 19 Feb 2025 15:29:16 -0800
Subject: [PATCH 5/8] add a bit more info text about running script

---
 .github/scripts/check.urls.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index a94a44b41..8c62b382a 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -56,6 +56,7 @@ if [[ ! $REPLY =~ ^[nN]$ ]]
 then
     prod_urls=$(get_urls_from_prod_site_map)
     url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
+    echo;echo "Checking ${url_count} URLs, be patent. Any non 200 URLs will be listed here:"
     run_checks "$prod_urls"
-    echo;echo "Successfully checked ${url_count} URLs!"
+    echo "Successfully checked ${url_count} URLs!"
 fi

From 9fdd0ccbb7abb0e3d835dbe620d22b6a56c67144 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Thu, 20 Feb 2025 12:54:19 -0800
Subject: [PATCH 6/8] update compose to install curl, add more UI to checker script

---
 .github/scripts/check.urls.sh | 3 ++-
 compose.yml                   | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index 8c62b382a..49036ca18 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -54,9 +54,10 @@ read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
 echo # (optional) move to a new line
 if [[ ! $REPLY =~ ^[nN]$ ]]
 then
+    echo;echo "Fetching URLs from production."
     prod_urls=$(get_urls_from_prod_site_map)
     url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
-    echo;echo "Checking ${url_count} URLs, be patent. Any non 200 URLs will be listed here:"
+    echo;echo "Checking ${url_count} URLs, be patient. Any non 200 URLs will be listed here:"
     run_checks "$prod_urls"
     echo "Successfully checked ${url_count} URLs!"
 fi
diff --git a/compose.yml b/compose.yml
index d33599496..dc1d78f89 100644
--- a/compose.yml
+++ b/compose.yml
@@ -6,4 +6,7 @@ services:
       - 1313:1313
     volumes:
       - ./:/src
-    command: hugo server --buildDrafts --buildFuture --bind 0.0.0.0
+    command: >
+      sh -c "apk add bash curl grep &&
+      hugo server --buildDrafts --buildFuture --bind 0.0.0.0"
+

From f8e96be9ec7af7e03c8645cfeb0dbcabe6b6de52 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 21 Feb 2025 16:30:39 -0800
Subject: [PATCH 7/8] update readme with link checker info per feedback

---
 README.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bd02d62c2..0def8061d 100644
--- a/README.md
+++ b/README.md
@@ -52,18 +52,62 @@ Any users who experience errors running `hugo server`, please see our [Troublesh
 
 ## Link Checking [Optional]
 
+We have two types of link checking:
+
+* All links - tests all links within docs and outbound
+* Internal links - takes all internal links from [production site](https://docs.communityhealthtoolkit.org/) and tests them on your branch
+
+### All links, including outbound
+
 We validate that all links on the docs site work (do not 404) using a tool called [Muffet](https://github.com/raviqqe/muffet) along with [Actions](https://github.com/features/actions). If you're creating a lot of new links, or editing a lot of existing links, you may optionally run Muffet locally before pushing your commits. Running Muffet locally can save time by exposing broken links before pushing a build since you can avoid waiting for the Action to run, finding you have a broken link, fixing it, and pushing a new change.
 
-1. Install [Go](https://golang.org/doc/install) as a prerequisite
-2. Install Muffet: `go get -u github.com/raviqqe/muffet`
-   - If using `asdf` you need to reshim (`asdf reshim golang`)
-3. Ensure you've run `hugo server` so your local docs instance is reachable at http://localhost:1313/
-4. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script. If you're in the root of this repo, that'd be: `./.github/scripts/muffet.sh`
+1. Start the docker container: `docker compose up -d`
+2. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script: `docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"`
 
-It should take about 60 seconds depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
+It can take many minutes depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
 
 _Note_: The `muffet.sh` script here is the identical script we run on GitHub. If you simply run `muffet http://localhost:1313` you will hit GitHub's rate limiting and get lots of [429's](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429). Our script intentionally reduces concurrency and excludes some repetitive GitHub URLs.
 
+Note that you may see transient errors as shown here with lookup errors:
+
+```shell
+$ docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"
+
+http://localhost:1313/hosting/4.x/production/docker/adding-tls-certificates/
+    lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
+http://localhost:1313/core/overview/offline-first/
+    lookup blog.couchdb.org: i/o timeout https://blog.couchdb.org/2017/09/19/couchdb-takes-medic-mobile-to-the-front-lines-of-healthcare-work/
+http://localhost:1313/hosting/monitoring/production/
+    lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
+http://localhost:1313/building/forms/app/
+    lookup www.w3.org: i/o timeout https://www.w3.org/TR/1999/REC-xpath-19991116/
+```
+
+### Internal links after major re-organization
+
+If you're moving more than ~5 pages around, you should check that they either correctly redirect with the `aliases` [feature](https://hugo-docs.netlify.app/en/content-management/urls/#aliases) or 404 if the page is indeed removed with no replacement. There's a script that will get all the URLs from the [production site](https://docs.communityhealthtoolkit.org/) and then check your branch for the result of every URL. If it gets a `200` with no redirect, nothing is shown. All other results, like `404` or a `200` which results in a redirect are shown.
+
+This is mainly to help preserve SEO and to help folks who bookmark specific pages.
+
+To test:
+
+1. Make your changes, for example moving 10s of pages to a new location
+2. Check that `hugo` compiles and doesn't complain of any broken links
+3. Start the docker container: `docker compose up -d`
+4. Test the links with the bash script: `docker exec cht-hugo .github/scripts/check.urls.sh`
+
+```shell
+$ docker exec cht-hugo .github/scripts/check.urls.sh
+
+Are you on a test branch and is hugo running on http://localhost:1313 ?
+
+Fetching URLs from production.
+
+Checking 435 URLs, be patient. Any non 200 URLs will be listed here:
+
+Successfully checked 435 URLs!
+```
+
 ## Continuous Deployment
 
 All changes to `main` branch run a [GitHub action](.github/workflows/ci.yml) to first check for any broken links ([per above](#link-checking-optional)) and then deploy to the documentation site: [docs.communityhealthtoolkit.org](https://docs.communityhealthtoolkit.org)

From e0fafa96701a1bc218966bb0e1ffab522143985f Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 21 Feb 2025 16:31:59 -0800
Subject: [PATCH 8/8] add more subheads to readme

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0def8061d..5bf8bce30 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,8 @@ We have two types of link checking:
 We validate that all links on the docs site work (do not 404) using a tool called [Muffet](https://github.com/raviqqe/muffet) along with [Actions](https://github.com/features/actions). If you're creating a lot of new links, or editing a lot of existing links, you may optionally run Muffet locally before pushing your commits. Running Muffet locally can save time by exposing broken links before pushing a build since you can avoid waiting for the Action to run, finding you have a broken link, fixing it, and pushing a new change.
 
+#### Running
+
 1. Start the docker container: `docker compose up -d`
 2. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script: `docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"`
 
 It can take many minutes depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
 
 _Note_: The `muffet.sh` script here is the identical script we run on GitHub. If you simply run `muffet http://localhost:1313` you will hit GitHub's rate limiting and get lots of [429's](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429). Our script intentionally reduces concurrency and excludes some repetitive GitHub URLs.
 
+#### Example
+
 Note that you may see transient errors as shown here with lookup errors:
+
 ```shell
 $ docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"
 
 http://localhost:1313/hosting/4.x/production/docker/adding-tls-certificates/
     lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
 http://localhost:1313/core/overview/offline-first/
     lookup blog.couchdb.org: i/o timeout https://blog.couchdb.org/2017/09/19/couchdb-takes-medic-mobile-to-the-front-lines-of-healthcare-work/
 http://localhost:1313/hosting/monitoring/production/
     lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
 http://localhost:1313/building/forms/app/
     lookup www.w3.org: i/o timeout https://www.w3.org/TR/1999/REC-xpath-19991116/
 ```
 
@@ -89,13 +94,15 @@ If you're moving more than ~5 pages around, you should check that they either co
 
 This is mainly to help preserve SEO and to help folks who bookmark specific pages.
 
-To test:
+#### Running
 
 1. Make your changes, for example moving 10s of pages to a new location
 2. Check that `hugo` compiles and doesn't complain of any broken links
 3. Start the docker container: `docker compose up -d`
 4. Test the links with the bash script: `docker exec cht-hugo .github/scripts/check.urls.sh`
 
+#### Example
+
 ```shell
 $ docker exec cht-hugo .github/scripts/check.urls.sh