# Check Website links #852
#
# Workflow file for this run

# Ultralytics YOLO 🚀, AGPL-3.0 license
# Continuous Integration (CI) GitHub Actions workflow that checks websites for broken links using https://github.com/lycheeverse/lychee
# Ignores the following status codes to reduce false positives:
# - 401 (Vimeo, 'unauthorized')
# - 403 (OpenVINO, 'forbidden')
# - 429 (Instagram, 'too many requests')
# - 500 (Zenodo, 'cached')
# - 502 (Zenodo, 'bad gateway')
# - 999 (LinkedIn, 'unknown status code')
name: Check Website links

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"  # runs at 00:00 UTC every day

jobs:
  Links:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false  # This ensures that if one job fails, the others will still run
      matrix:
        # One job per website; the value is used as the sitemap host and as the local download directory name
        website:
          - www.ultralytics.com
          - docs.ultralytics.com
          - community.ultralytics.com
          - handbook.ultralytics.com
    steps:
      # Install the lychee binary from the latest GitHub release into /usr/local/bin
      - name: Download and install lychee
        run: |
          # Resolve the x86_64 Linux tarball URL; the trailing quote in the pattern matches only the exact .tar.gz asset
          LYCHEE_URL=$(curl -s https://api.github.com/repos/lycheeverse/lychee/releases/latest | grep "browser_download_url" | grep 'x86_64-unknown-linux-gnu\.tar\.gz"' | head -n 1 | cut -d '"' -f 4)
          # Fail with a clear message when no matching asset is found (e.g. an API error response)
          if [ -z "$LYCHEE_URL" ]; then
            echo "Failed to resolve lychee download URL"
            exit 1
          fi
          curl -L "$LYCHEE_URL" | tar xz -C /usr/local/bin

      # Build urls.txt: every <loc> entry of the site's sitemap.xml, with sub-sitemaps expanded one level
      - name: Get Website URLs
        run: |
          # Function to parse sitemap URLs: reads XML on stdin and prints the content of each <loc> element, one per line
          # '|| true' keeps the step alive when a sitemap contains no <loc> entries (grep exits non-zero on no match)
          parse_sitemap() {
            tr '\n' ' ' | sed 's/<loc>/\n<loc>/g' | grep -oP '(?<=<loc>).*?(?=</loc>)' || true
          }

          # Download initial sitemap and process
          echo "Downloading sitemap..."
          SITEMAP=$(wget -qO- "https://${{ matrix.website }}/sitemap.xml") || { echo "Failed to download sitemap"; exit 1; }
          echo "Downloaded sitemap content:"
          echo "$SITEMAP" | head -n 5
          echo "Parsing sitemap..."
          echo "$SITEMAP" | parse_sitemap > urls.txt
          echo "Initial parsed URLs:"
          head -n 5 urls.txt

          # Process any subsitemaps if they exist (any parsed URL containing 'sitemap' is treated as a sub-sitemap)
          if grep -q 'sitemap' urls.txt; then
            echo "Found subsitemaps, processing..."
            grep 'sitemap' urls.txt > subsitemaps.txt
            # '|| true' so that a sitemap index made up only of sub-sitemaps does not abort the step
            grep -v 'sitemap' urls.txt > urls.tmp || true
            # 'read -r' keeps backslashes literal; default IFS still trims surrounding whitespace
            while read -r submap; do
              echo "Processing submap: $submap"
              wget -qO- "$submap" | parse_sitemap >> urls.tmp
            done < subsitemaps.txt
            mv urls.tmp urls.txt
          fi

          # Count URLs
          total_urls=$(wc -l < urls.txt)
          echo "Total URLs to be downloaded: $total_urls"

      # Mirror every URL from urls.txt into a local directory tree (created by --force-directories)
      - name: Download Website
        run: |
          # Set higher wait seconds for discourse community to avoid 429 rate limit errors
          if [ "${{ matrix.website }}" = "community.ultralytics.com" ]; then
            WAIT=1
          else
            WAIT=0.001
          fi

          # Download all URLs (images and .txt files are rejected)
          wget \
            --adjust-extension \
            --reject "*.jpg*,*.jpeg*,*.png*,*.gif*,*.webp*,*.svg*,*.txt" \
            --input-file=urls.txt \
            --no-clobber \
            --no-parent \
            --wait="$WAIT" \
            --random-wait \
            --tries=3 \
            --no-verbose \
            --force-directories

      # Run lychee over the downloaded HTML, publish the report, and export SUMMARY for the Slack step
      - name: Run Broken Link Checks on Website
        id: lychee
        uses: ultralytics/actions/retry@main
        with:
          timeout_minutes: 60
          retry_delay_seconds: 900
          retries: 3
          run: |
            # Count successfully downloaded files
            downloaded_files=$(find "${{ matrix.website }}" -type f | wc -l)
            echo "Scanning $downloaded_files downloaded pages for broken links..."

            # Create summary.txt with the total page count
            echo "*Results for $downloaded_files pages in https://${{ matrix.website }}*" > summary.txt
            echo "" >> summary.txt

            rm -rf .lycheecache
            lychee \
              --scheme 'https' \
              --timeout 60 \
              --insecure \
              --accept 401,403,429,500,502,999 \
              --exclude-all-private \
              --exclude 'https?://(www\.)?(linkedin\.com|twitter\.com|instagram\.com|kaggle\.com|tiktok\.com|fonts\.gstatic\.com|fonts\.googleapis\.com|url\.com|tesla\.com|wellfound\.com|.*\.cloudfunctions\.net|0\.0\.0\.0:5543/predict/from_files)' \
              --exclude-path '**/ci.yaml' \
              --github-token ${{ secrets.GITHUB_TOKEN }} \
              --header "User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.183 Safari/537.36" \
              --header "Accept=*/*" \
              --header "Accept-Language=*" \
              --header "Accept-Encoding=*" \
              './${{ matrix.website }}/**/*.html' | tee -a summary.txt

            # Add the summary to GitHub step summary
            cat summary.txt >> "$GITHUB_STEP_SUMMARY"

            # Prepare the summary for Slack (escape newlines, remove [], remove .html, and escape special characters)
            # Lines starting with [TIMEOUT] are kept; to drop them, prefix the awk program with the pattern !/^\[TIMEOUT\]/
            ESCAPED_SUMMARY=$(awk '{printf "%s\\n", $0}' summary.txt | sed 's/\[//g; s/\]//g; s/\.html//g; s/"/\\"/g')
            {
              echo "SUMMARY<<EOF"
              echo "$ESCAPED_SUMMARY"
              echo "EOF"
            } >> "$GITHUB_ENV"

            # Check if lychee found any broken links
            # The count must not be preceded by another digit, otherwise "10 Errors" would be read as "0 Errors"
            if grep -qE '(^|[^0-9])0 Errors' summary.txt; then
              echo "No broken links found."
              exit 0
            else
              echo "Broken links found."
              exit 1
            fi

      # Notify Slack only for failed scheduled runs on their first attempt (not manual runs or re-runs)
      - name: Check for failure and notify
        if: always() && steps.lychee.outcome == 'failure' && github.event_name == 'schedule' && github.run_attempt == '1'
        # NOTE(review): the version tag was garbled in the source; v2.0.0 is assumed because the
        # 'webhook-type' input below belongs to the v2 interface. Confirm the intended release.
        uses: slackapi/slack-github-action@v2.0.0
        with:
          webhook-type: incoming-webhook
          webhook: ${{ matrix.website == 'www.ultralytics.com' && secrets.SLACK_WEBHOOK_URL_WEBSITE || secrets.SLACK_WEBHOOK_URL_YOLO }}
          payload: |
            text: "GitHub Actions: Errors found in ${{ github.workflow }} for ${{ matrix.website }} ❌\n\n\n*Repository:* https://github.com/${{ github.repository }}\n*Action:* https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}\n*Author:* ${{ github.actor }}\n*Event:* ${{ github.event_name }}\n\n\n${{ env.SUMMARY }}\n"