|
59 | 59 | import os
|
60 | 60 | import re
|
61 | 61 | import sys
|
| 62 | +import time |
62 | 63 | from concurrent.futures import ThreadPoolExecutor, as_completed
|
63 | 64 | from typing import Dict, List, Optional, Tuple
|
64 | 65 |
|
@@ -242,6 +243,10 @@ def check_link_validity(
|
242 | 243 | if not HAS_REQUESTS:
|
243 | 244 | return url, False, "requests module not installed", None
|
244 | 245 |
|
| 246 | + # Skip GitHub links |
| 247 | + if "github.com" in url: |
| 248 | + return url, True, "GitHub link validation skipped", None |
| 249 | + |
245 | 250 | # Clean up escaped characters in URLs
|
246 | 251 | # This helps with Markdown URLs that have escaped underscores, etc.
|
247 | 252 | cleaned_url = clean_url(url)
|
@@ -326,32 +331,49 @@ def validate_urls(
|
326 | 331 | Returns:
|
327 | 332 | Dictionary of {url: (is_valid, error_message, status_code)}
|
328 | 333 | """
|
| 334 | + if not urls: |
| 335 | + return {} |
| 336 | + |
329 | 337 | results = {}
|
330 | 338 |
|
| 339 | + # Count and report GitHub links that will be skipped in validation |
| 340 | + from urllib.parse import urlparse |
| 341 | + github_urls = [url for url in urls if urlparse(url).hostname and urlparse(url).hostname.endswith("github.com")] |
| 342 | + other_urls = [url for url in urls if urlparse(url).hostname and not urlparse(url).hostname.endswith("github.com")] |
| 343 | + |
331 | 344 | print(f"Validating {len(urls)} links...")
|
332 |  | -
333 |  | -    with ThreadPoolExecutor(max_workers=max_workers) as executor:
334 |  | -        future_to_url = {
335 |  | -            executor.submit(check_link_validity, url): url for url in urls
336 |  | -        }
337 |  | -
| 345 | + print(f"Note: {len(github_urls)} GitHub links will be automatically marked as valid (skipping validation)") |
| 346 | + |
| 347 | + # Use moderate settings for non-GitHub URLs |
| 348 | + actual_max_workers = min(6, max_workers) |
| 349 | + |
| 350 | + print(f"Using {actual_max_workers} workers for remaining {len(other_urls)} links...") |
| 351 | + |
| 352 | + with ThreadPoolExecutor(max_workers=actual_max_workers) as executor: |
| 353 | + future_to_url = {} |
| 354 | + |
| 355 | + # Submit all URLs (GitHub links will be auto-skipped in check_link_validity) |
| 356 | + for url in urls: |
| 357 | + future_to_url[executor.submit(check_link_validity, url, timeout=15)] = url |
| 358 | + |
| 359 | + # Process results |
338 | 360 | for i, future in enumerate(as_completed(future_to_url), 1):
|
339 | 361 | url = future_to_url[future]
|
340 | 362 | try:
|
341 | 363 | _, is_valid, error_message, status_code = future.result()
|
342 | 364 | results[url] = (is_valid, error_message, status_code)
|
343 |  | -
344 |  | -                # Print progress indicator
345 |  | -                if i % 10 == 0 or i == len(urls):
346 |  | -                    print(
347 |  | -                        f"  Checked {i}/{len(urls)} links",
348 |  | -                        end="\r",
349 |  | -                        flush=True,
350 |  | -                    )
| 365 | + |
| 366 | + if "github.com" in url: |
| 367 | + print(f" Checked URL {i}/{len(urls)} [github.com]: ✓ Skipped (automatically marked valid)") |
| 368 | + else: |
| 369 | + status = "✅ Valid" if is_valid else f"❌ {error_message}" |
| 370 | + domain = url.split('/')[2] if '://' in url and '/' in url.split('://', 1)[1] else 'unknown' |
| 371 | + print(f" Checked URL {i}/{len(urls)} [{domain}]: {status}") |
| 372 | + |
351 | 373 | except Exception as e:
|
352 | 374 | results[url] = (False, str(e), None)
|
| 375 | + print(f" Error checking URL {i}/{len(urls)}: {e}") |
353 | 376 |
|
354 |  | -    print()  # New line after progress
355 | 377 | return results
|
356 | 378 |
|
357 | 379 |
|
|
0 commit comments