From d531040396a0ebc3297ea5ee7664fe6984411afd Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:29:57 -0800
Subject: [PATCH 1/8] feat(na): add manual link checker scripts

---
 .github/scripts/check.urls.sh | 47 +++++++++++++++++++++++++++
 .github/scripts/get.urls.sh   | 60 +++++++++++++++++++++++++++++++++++
 .gitignore                    |  1 +
 3 files changed, 108 insertions(+)
 create mode 100755 .github/scripts/check.urls.sh
 create mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
new file mode 100755
index 000000000..89abba1de
--- /dev/null
+++ b/.github/scripts/check.urls.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Function to get HTTP response code of a URL
+get_response_code() {
+    local url=$1
+    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    echo "$response_code"
+}
+
+# Function to check for meta refresh tag in HTML content
+check_meta_refresh() {
+    local html_content=$1
+    url=$2
+    if grep -q '<meta http-equiv="refresh"

diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
new file mode 100755
index 000000000..f0a83b654
--- /dev/null
+++ b/.github/scripts/get.urls.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# Function to crawl URLs recursively
+function crawl_urls {
+    local base_url="$1"
+    local path="$2"
+    local url="$base_url$path"
+    local visited_urls=("${@:3}")
+
+    # Check if the URL has already been visited
+    if [[ " ${visited_urls[@]} " =~ " $url " ]]; then
+        return
+    fi
+
+    # Add the current URL to the visited list
+    visited_urls+=("$url")
+
+    # Fetch the HTML content of the URL and suppress all output
+    html_content=$(wget -qO- "$url" 2>/dev/null)
+    wget_exit_status=$?
+
+    # Check if wget command was successful
+    if [ $wget_exit_status -ne 0 ]; then
+        return
+    fi
+
+    # Extract all anchor tags and their href attributes
+    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
+
+    # Output each URL found under the current URL
+    for link in $links; do
+        # Construct absolute URL if the link is relative
+        if [[ $link == /* ]]; then
+            link="$base_url$link"
+        fi
+
+        # Check if the URL is under the specified path and has not been visited before
+        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
+            echo "$link"
+            # Recursively crawl the URL
+            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
+        fi
+    done
+}
+
+echo;echo "Are you on 'main' branch and running hugo on http://localhost:1313?";echo
+read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
+echo # (optional) move to a new line
+if [[ ! $REPLY =~ ^[nN]$ ]]
+then
+    # Start crawling from the base URL with the specified path
+    base_url="http://localhost:1313"
+    path=""
+    declare -a visited_urls=()
+    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
+    count=$(wc -l urls.txt)
+    echo "Saved $count URLs in urls.txt"
+fi
+
+
diff --git a/.gitignore b/.gitignore
index 36fdfda29..7e1030df6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
+urls.txt
\ No newline at end of file

From eaa5c103b8f9634f6c6b13191d90c9ba203ad3be Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 7 Feb 2025 12:41:06 -0800
Subject: [PATCH 2/8] fix URL count output

---
 .github/scripts/get.urls.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
index f0a83b654..7f505f3f9 100755
--- a/.github/scripts/get.urls.sh
+++ b/.github/scripts/get.urls.sh
@@ -53,7 +53,7 @@ then
     path=""
     declare -a visited_urls=()
     crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
-    count=$(wc -l urls.txt)
+    count=$(wc -l urls.txt|cut -d' ' -f1)
     echo "Saved $count URLs in urls.txt"
 fi
 

From 23f31b9dd8d5a4eb36fa79521bfba138f9f2527a Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Mon, 10 Feb 2025 11:19:42 -0800
Subject: [PATCH 3/8] simplify url check scripts per feedback

---
 .github/scripts/check.urls.sh | 54 +++++++++++++++++++------------
 .github/scripts/get.urls.sh   | 60 -----------------------------------
 2 files changed, 34 insertions(+), 80 deletions(-)
 delete mode 100755 .github/scripts/get.urls.sh

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index 89abba1de..a94a44b41 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -2,8 +2,9 @@
 
 # Function to get HTTP response code of a URL
 get_response_code() {
+    local response_code
     local url=$1
-    local response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
+    response_code=$(curl -s -o /dev/null -w "%{http_code}" "$url")
     echo "$response_code"
 }
 
 # Function to check for meta refresh tag in HTML content
 check_meta_refresh() {
     local html_content=$1
     url=$2
     if grep -q '<meta http-equiv="refresh"
+get_urls_from_prod_site_map() {
+    urls=$(curl -s https://docs.communityhealthtoolkit.org/sitemap.xml 2>&1 | grep -o "<loc>[^<]*</loc>" | sed -e 's/<[^>]*>//g')
+    urls="${urls//https:\/\/docs.communityhealthtoolkit.org/http:\/\/localhost:1313}"
+    echo "$urls"
+}
+
+echo;echo "Are you on a test branch and is hugo running on http://localhost:1313 ?";echo
 read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
 echo # (optional) move to a new line
 if [[ ! $REPLY =~ ^[nN]$ ]]
 then
-    run_checks
-    echo;echo "Done!";echo
+    prod_urls=$(get_urls_from_prod_site_map)
+    url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
+    run_checks "$prod_urls"
+    echo;echo "Successfully checked ${url_count} URLs!"
 fi
-
diff --git a/.github/scripts/get.urls.sh b/.github/scripts/get.urls.sh
deleted file mode 100755
index 7f505f3f9..000000000
--- a/.github/scripts/get.urls.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Function to crawl URLs recursively
-function crawl_urls {
-    local base_url="$1"
-    local path="$2"
-    local url="$base_url$path"
-    local visited_urls=("${@:3}")
-
-    # Check if the URL has already been visited
-    if [[ " ${visited_urls[@]} " =~ " $url " ]]; then
-        return
-    fi
-
-    # Add the current URL to the visited list
-    visited_urls+=("$url")
-
-    # Fetch the HTML content of the URL and suppress all output
-    html_content=$(wget -qO- "$url" 2>/dev/null)
-    wget_exit_status=$?
-
-    # Check if wget command was successful
-    if [ $wget_exit_status -ne 0 ]; then
-        return
-    fi
-
-    # Extract all anchor tags and their href attributes
-    local links=$(echo "$html_content" | grep -oE '<a [^>]+>' | grep -oE 'href="([^"#]+)"' | sed -e 's/^href="//' -e 's/"$//')
-
-    # Output each URL found under the current URL
-    for link in $links; do
-        # Construct absolute URL if the link is relative
-        if [[ $link == /* ]]; then
-            link="$base_url$link"
-        fi
-
-        # Check if the URL is under the specified path and has not been visited before
-        if [[ $link == "$base_url$path/"* && ! " ${visited_urls[@]} " =~ " $link " ]]; then
-            echo "$link"
-            # Recursively crawl the URL
-            crawl_urls "$base_url" "$path" "$link" "${visited_urls[@]}"
-        fi
-    done
-}
-
-echo;echo "Are you on 'main' branch and running hugo on http://localhost:1313?";echo
-read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
-echo # (optional) move to a new line
-if [[ ! $REPLY =~ ^[nN]$ ]]
-then
-    # Start crawling from the base URL with the specified path
-    base_url="http://localhost:1313"
-    path=""
-    declare -a visited_urls=()
-    crawl_urls "$base_url" "$path" "${visited_urls[@]}" | sort -u > urls.txt
-    count=$(wc -l urls.txt|cut -d' ' -f1)
-    echo "Saved $count URLs in urls.txt"
-fi
-
-

From 033e2fcb19164382efc39f1bb0664748cf0c656e Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Wed, 19 Feb 2025 14:35:08 -0800
Subject: [PATCH 4/8] remove file from gitignore per feedback

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 7e1030df6..1b9c52df1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,4 @@ tech-doc-hugo
 .DS_Store
 .idea
 .hugo_build.lock
-urls.txt
\ No newline at end of file
+

From c35385fac9519b86c177be795e58c29a88936008 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Wed, 19 Feb 2025 15:29:16 -0800
Subject: [PATCH 5/8] add a bit more info text about running script

---
 .github/scripts/check.urls.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index a94a44b41..8c62b382a 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -56,6 +56,7 @@ if [[ ! $REPLY =~ ^[nN]$ ]]
 then
     prod_urls=$(get_urls_from_prod_site_map)
     url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
+    echo;echo "Checking ${url_count} URLs, be patent. Any non 200 URLs will be listed here:"
     run_checks "$prod_urls"
-    echo;echo "Successfully checked ${url_count} URLs!"
+    echo "Successfully checked ${url_count} URLs!"
 fi

From 9fdd0ccbb7abb0e3d835dbe620d22b6a56c67144 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Thu, 20 Feb 2025 12:54:19 -0800
Subject: [PATCH 6/8] update compose to install curl, add more UI to checker script

---
 .github/scripts/check.urls.sh | 3 ++-
 compose.yml                   | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/check.urls.sh b/.github/scripts/check.urls.sh
index 8c62b382a..49036ca18 100755
--- a/.github/scripts/check.urls.sh
+++ b/.github/scripts/check.urls.sh
@@ -54,9 +54,10 @@ read -p "\"y\" to proceed, \"n\" to cancel " -n 1 -r
 echo # (optional) move to a new line
 if [[ ! $REPLY =~ ^[nN]$ ]]
 then
+    echo;echo "Fetching URLs from production."
     prod_urls=$(get_urls_from_prod_site_map)
     url_count=$(echo "$prod_urls" | wc -l | cut -d' ' -f1)
-    echo;echo "Checking ${url_count} URLs, be patent. Any non 200 URLs will be listed here:"
+    echo;echo "Checking ${url_count} URLs, be patient. Any non 200 URLs will be listed here:"
     run_checks "$prod_urls"
     echo "Successfully checked ${url_count} URLs!"
 fi
diff --git a/compose.yml b/compose.yml
index d33599496..dc1d78f89 100644
--- a/compose.yml
+++ b/compose.yml
@@ -6,4 +6,7 @@ services:
       - 1313:1313
     volumes:
       - ./:/src
-    command: hugo server --buildDrafts --buildFuture --bind 0.0.0.0
+    command: >
+      sh -c "apk add bash curl grep &&
+      hugo server --buildDrafts --buildFuture --bind 0.0.0.0"
+

From f8e96be9ec7af7e03c8645cfeb0dbcabe6b6de52 Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 21 Feb 2025 16:30:39 -0800
Subject: [PATCH 7/8] update readme with link checker info per feedback

---
 README.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index bd02d62c2..0def8061d 100644
--- a/README.md
+++ b/README.md
@@ -52,18 +52,62 @@ Any users who experience errors running `hugo server`, please see our [Troublesh
 
 ## Link Checking [Optional]
 
+We have two types of link checking:
+
+* All links - tests all links within docs and outbound
+* Internal links - takes all internal links from [production site](https://docs.communityhealthtoolkit.org/) and tests them on your branch
+
+### All links, including outbound
+
 We validate that all links on the docs site work (do not 404) using a tool called [Muffet](https://github.com/raviqqe/muffet) along with [Actions](https://github.com/features/actions). If you're creating a lot of new links, or editing a lot of existing links, you may optionally run Muffet locally before pushing your commits. Running Muffet locally can save time by exposing broken links before pushing a build since you can avoid waiting for the Action to run, finding you have a broken link, fixing it, and pushing a new change.
 
-1. Install [Go](https://golang.org/doc/install) as a prerequisite
-2. Install Muffet: `go get -u github.com/raviqqe/muffet`
-   - If using `asdf` you need to reshim (`asdf reshim golang`)
-3. Ensure you've run `hugo server` so your local docs instance is reachable at http://localhost:1313/
-4. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script. If you're in the root of this repo, that'd be: `./.github/scripts/muffet.sh`
+1. Start the docker container: `docker compose up -d`
+2. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script: `docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"`
 
-It should take about 60 seconds depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
+It can take many minutes depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
 
 _Note_: The `muffet.sh` script here is the identical script we run on GitHub. If you simply run `muffet http://localhost:1313` you will hit GitHub's rate limiting and get lots of [429's](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429). Our script intentionally reduces concurrency and excludes some repetitive GitHub URLs.
 
+Note that you may see transient errors as shown here with lookup errors:
+
+```shell
+$ docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"
+
+http://localhost:1313/hosting/4.x/production/docker/adding-tls-certificates/
+    lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
+http://localhost:1313/core/overview/offline-first/
+    lookup blog.couchdb.org: i/o timeout https://blog.couchdb.org/2017/09/19/couchdb-takes-medic-mobile-to-the-front-lines-of-healthcare-work/
+http://localhost:1313/hosting/monitoring/production/
+    lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
+http://localhost:1313/building/forms/app/
+    lookup www.w3.org: i/o timeout https://www.w3.org/TR/1999/REC-xpath-19991116/
+```
+
+### Internal links after major re-organization
+
+If you're moving more than ~5 pages around, you should check that they either correctly redirect with the `aliases` [feature](https://hugo-docs.netlify.app/en/content-management/urls/#aliases) or 404 if the page is indeed removed with no replacement. There's a script that will get all the URLs from the [production site](https://docs.communityhealthtoolkit.org/) and then check your branch for the result of every URL. If it gets a `200` with no redirect, nothing is shown. All other results, like `404` or a `200` which results in a redirect are shown.
+
+This is mainly to help preserve SEO and to help folks who bookmark specific pages.
+
+To test:
+
+1. Make your changes, for example moving 10s of pages to a new location
+2. Check that `hugo` compiles and doesn't complain of any broken links
+3. Start the docker container: `docker compose up -d`
+4. Test the links with the bash script: `docker exec cht-hugo .github/scripts/check.urls.sh`
+
+```shell
+$ docker exec cht-hugo .github/scripts/check.urls.sh
+
+Are you on a test branch and is hugo running on http://localhost:1313 ?
+
+Fetching URLs from production.
+
+Checking 435 URLs, be patient. Any non 200 URLs will be listed here:
+
+Successfully checked 435 URLs!
+```
+
 ## Continuous Deployment
 
 All changes to `main` branch run a [GitHub action](.github/workflows/ci.yml) to first check for any broken links ([per above](#link-checking-optional)) and then deploy to the documentation site: [docs.communityhealthtoolkit.org](https://docs.communityhealthtoolkit.org)

From e0fafa96701a1bc218966bb0e1ffab522143985f Mon Sep 17 00:00:00 2001
From: mrjones-plip
Date: Fri, 21 Feb 2025 16:31:59 -0800
Subject: [PATCH 8/8] add more subheads to readme

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0def8061d..5bf8bce30 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,8 @@ We have two types of link checking:
 We validate that all links on the docs site work (do not 404) using a tool called [Muffet](https://github.com/raviqqe/muffet) along with [Actions](https://github.com/features/actions). If you're creating a lot of new links, or editing a lot of existing links, you may optionally run Muffet locally before pushing your commits. Running Muffet locally can save time by exposing broken links before pushing a build since you can avoid waiting for the Action to run, finding you have a broken link, fixing it, and pushing a new change.
 
+#### Running
+
 1. Start the docker container: `docker compose up -d`
 2. Test the links with the [`muffet.sh`](https://github.com/medic/cht-docs/blob/main/.github/scripts/muffet.sh) script: `docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"`
 
 It can take many minutes depending on your Internet connection. If Muffet returns no output, you have no broken links, congrats!
 
 _Note_: The `muffet.sh` script here is the identical script we run on GitHub. If you simply run `muffet http://localhost:1313` you will hit GitHub's rate limiting and get lots of [429's](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429). Our script intentionally reduces concurrency and excludes some repetitive GitHub URLs.
 
+#### Example
+
 Note that you may see transient errors as shown here with lookup errors:
+
 ```shell
 $ docker exec cht-hugo sh -c "cd .github/scripts/; ./muffet.sh"
 
 http://localhost:1313/hosting/4.x/production/docker/adding-tls-certificates/
     lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
 http://localhost:1313/core/overview/offline-first/
     lookup blog.couchdb.org: i/o timeout https://blog.couchdb.org/2017/09/19/couchdb-takes-medic-mobile-to-the-front-lines-of-healthcare-work/
 http://localhost:1313/hosting/monitoring/production/
     lookup letsencrypt.org: i/o timeout https://letsencrypt.org/
 http://localhost:1313/building/forms/app/
     lookup www.w3.org: i/o timeout https://www.w3.org/TR/1999/REC-xpath-19991116/
 ```
 
@@ -89,13 +94,15 @@ If you're moving more than ~5 pages around, you should check that they either co
 
 This is mainly to help preserve SEO and to help folks who bookmark specific pages.
 
-To test:
+#### Running
 
 1. Make your changes, for example moving 10s of pages to a new location
 2. Check that `hugo` compiles and doesn't complain of any broken links
 3. Start the docker container: `docker compose up -d`
 4. Test the links with the bash script: `docker exec cht-hugo .github/scripts/check.urls.sh`
 
+#### Example
+
 ```shell
 $ docker exec cht-hugo .github/scripts/check.urls.sh