# Check Website links #855
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Ultralytics YOLO 🚀, AGPL-3.0 license
# Continuous Integration (CI) GitHub Actions tests broken link checker using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
# - 401(Vimeo, 'unauthorized')
# - 403(OpenVINO, 'forbidden')
# - 429(Instagram, 'too many requests')
# - 500(Zenodo, 'cached')
# - 502(Zenodo, 'bad gateway')
# - 999(LinkedIn, 'unknown status code')
# Workflow: for each website in the matrix, collect its page URLs from sitemap.xml,
# download the pages with wget, scan the downloaded HTML with lychee for broken links,
# and post a Slack message when a scheduled run fails.
name: Check Website links

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # runs at 00:00 UTC every day

jobs:
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # This ensures that if one job fails, the others will still run
      matrix:
        website:
          - www.ultralytics.com
          - docs.ultralytics.com
          - community.ultralytics.com
          - handbook.ultralytics.com
    steps:
      - name: Download and install lychee
        run: |
          # Look up the x86_64 Linux tarball of the latest lychee release via the GitHub API.
          # -f makes curl fail on HTTP errors (e.g. API rate limiting) instead of passing an error body downstream.
          LYCHEE_URL=$(
            curl -fsSL https://api.github.com/repos/lycheeverse/lychee/releases/latest |
              grep "browser_download_url" |
              grep "x86_64-unknown-linux-gnu.tar.gz" |
              cut -d '"' -f 4 |
              head -n 1
          )
          # Stop with a clear message if no matching release asset was found
          if [ -z "$LYCHEE_URL" ]; then
            echo "Failed to resolve lychee download URL"
            exit 1
          fi
          curl -fsSL "$LYCHEE_URL" | tar xz -C /usr/local/bin

      - name: Get Website URLs
        run: |
          # Function to parse sitemap URLs: reads XML on stdin, prints the text of every <loc> element, one per line.
          # '|| true' keeps the step alive when grep finds no <loc> entries.
          parse_sitemap() {
            tr '\n' ' ' | sed 's/<loc>/\n<loc>/g' | grep -oP '(?<=<loc>).*?(?=</loc>)' || true
          }

          # Download initial sitemap and process
          echo "Downloading sitemap..."
          SITEMAP=$(wget -qO- "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
          echo "$SITEMAP" | parse_sitemap > urls.txt

          # Process any subsitemaps if they exist (entries whose URL contains 'sitemap')
          if grep -q 'sitemap' urls.txt; then
            echo "Found subsitemaps, processing..."
            grep 'sitemap' urls.txt > subsitemaps.txt
            # Keep the non-sitemap URLs; '|| true' because grep -v exits non-zero when every line is a sitemap
            grep -v 'sitemap' urls.txt > urls.tmp || true
            while read -r submap; do
              echo "Processing submap: $submap"
              # A submap that fails to download is skipped rather than failing the step
              SUBMAP_CONTENT=$(wget -qO- "$submap") || { echo "Failed to download submap: $submap"; continue; }
              echo "$SUBMAP_CONTENT" | parse_sitemap >> urls.tmp
            done < subsitemaps.txt
            mv urls.tmp urls.txt || true
          fi

          # Count URLs
          total_urls=$(wc -l < urls.txt)
          echo "Total URLs to be downloaded: $total_urls"

      - name: Download Website
        run: |
          # Set higher wait seconds for discourse community to avoid 429 rate limit errors
          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
            WAIT=1
          else
            WAIT=0.001
          fi

          # Download all URLs listed in urls.txt into a directory tree named after the host,
          # skipping image and .txt files
          wget \
            --adjust-extension \
            --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
            --input-file=urls.txt \
            --no-clobber \
            --no-parent \
            --wait=$WAIT \
            --random-wait \
            --tries=3 \
            --no-verbose \
            --force-directories

      - name: Run Broken Link Checks on Website
        id: lychee
        uses: ultralytics/actions/retry@main
        with:
          timeout_minutes: 60
          retry_delay_seconds: 900
          retries: 3
          run: |
            # Count successfully downloaded files
            downloaded_files=$(find ${{ matrix.website }} -type f | wc -l)
            echo "Scanning $downloaded_files downloaded pages for broken links..."

            # Create summary.txt with the total page count
            echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
            echo "" >> summary.txt

            # Start every attempt without a previous lychee cache
            rm -rf .lycheecache
            lychee \
              --scheme 'https' \
              --timeout 60 \
              --insecure \
              --accept 401,403,429,500,502,999 \
              --exclude-all-private \
              --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
              --exclude-path '**/ci.yaml' \
              --github-token ${{ secrets.GITHUB_TOKEN }} \
              --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
              --header "Accept=*/*" \
              --header "Accept-Language=*" \
              --header "Accept-Encoding=*" \
              './${{ matrix.website }}/**/*.html' | tee -a summary.txt

            # Add the summary to GitHub step summary
            cat summary.txt >> $GITHUB_STEP_SUMMARY

            # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters).
            # Lines starting with [TIMEOUT] are kept; to drop them, use the awk program '!/^\[TIMEOUT\]/ {printf "%s\\n", $0}' instead.
            ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
            echo "SUMMARY<<EOF" >> $GITHUB_ENV
            echo "$ESCAPED_SUMMARY" >> $GITHUB_ENV
            echo "EOF" >> $GITHUB_ENV

            # Check if lychee found any broken links.
            # The match requires a standalone zero so that counts such as "10 Errors" or "100 Errors"
            # are not mistaken for "0 Errors".
            if grep -qE '(^|[^0-9])0 Errors' summary.txt; then
              echo "No broken links found."
              exit 0
            else
              echo "Broken links found."
              exit 1
            fi

      - name: Check for failure and notify
        # Notify only on the first attempt of a scheduled run whose lychee step failed
        if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
        # NOTE(review): the action reference was garbled to "[email protected]" in the extracted source;
        # restored as slack-github-action v2 because the inputs below use 'webhook-type' — confirm the exact tag.
        uses: slackapi/slack-github-action@v2.0.0
        with:
          webhook-type: incoming-webhook
          # The main website reports to its own Slack webhook; all other sites share the YOLO webhook
          webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
          payload: |
            text: "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"