Skip to content

Commit 8a24152

Browse files
v0.0.44 Improved URL check robustness (#372)
Signed-off-by: Glenn Jocher <[email protected]>
Signed-off-by: UltralyticsAssistant <[email protected]>
Co-authored-by: UltralyticsAssistant <[email protected]>
1 parent 31f9975 commit 8a24152

File tree

3 files changed

+35
-14
lines changed

3 files changed

+35
-14
lines changed

actions/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -22,4 +22,4 @@
2222
# ├── test_summarize_pr.py
2323
# └── ...
2424

25-
__version__ = "0.0.43"
25+
__version__ = "0.0.44"

actions/utils/common_utils.py

+22-8
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ def clean_url(url):
2121

2222

2323
def is_url(url, check=True, max_attempts=3, timeout=2):
24-
"""Check if string is URL and check if URL exists."""
24+
"""Check if string is URL and optionally verify it exists."""
2525
allow_list = (
2626
"localhost",
2727
"127.0.0",
@@ -56,15 +56,29 @@ def is_url(url, check=True, max_attempts=3, timeout=2):
5656

5757
# Check response
5858
if check:
59+
headers = {
60+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
61+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
62+
"Accept-Language": "en-US,en;q=0.9,es;q=0.8,zh-CN;q=0.7,zh;q=0.6",
63+
"Accept-Encoding": "gzip, deflate, br, zstd",
64+
"sec-ch-ua": '"Chromium";v="132", "Google Chrome";v="132", "Not_A Brand";v="99"',
65+
"sec-ch-ua-mobile": "?0",
66+
"sec-ch-ua-platform": '"macOS"',
67+
"Sec-Fetch-Site": "none",
68+
"Sec-Fetch-Mode": "navigate",
69+
"Sec-Fetch-User": "?1",
70+
"Sec-Fetch-Dest": "document",
71+
"Referer": "https://www.google.com/",
72+
"Origin": "https://www.google.com/",
73+
}
74+
bad_codes = {404, 410, 500, 502, 503, 504}
5975
for attempt in range(max_attempts):
6076
try:
61-
headers = {
62-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
63-
"Accept": "*/*", # Wildcard for maximum compatibility
64-
"Accept-Language": "*", # Wildcard for any language
65-
"Accept-Encoding": "*", # Wildcard for any encoding
66-
}
67-
return requests.head(url, headers=headers, timeout=timeout, allow_redirects=True).status_code < 400
77+
response = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
78+
if response.status_code not in bad_codes:
79+
return True
80+
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, stream=True)
81+
return response.status_code not in bad_codes # Try GET if HEAD fails
6882
except Exception:
6983
if attempt == max_attempts - 1: # last attempt
7084
return False

tests/test_urls.py

+12-5
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,19 @@
77
from actions.utils.common_utils import check_links_in_string, is_url
88

99
URLS = [
10-
"https://docs.ultralytics.com/help/CLA/",
1110
"https://docs.ultralytics.com/help/contributing",
12-
"https://docs.ultralytics.com",
1311
"https://ultralytics.com",
1412
"https://ultralytics.com/images/bus.jpg",
1513
"https://github.com/ultralytics/ultralytics",
14+
"https://azure.microsoft.com/",
15+
"https://www.tableau.com/",
16+
"https://openai.com/research/gpt-4",
17+
"https://azure.microsoft.com/en-us/services/machine-learning/",
18+
"https://azure.microsoft.com/en-us/products/storage/blobs",
19+
"https://www.reuters.com/article/idUSKCN1MK08G/",
20+
"https://www.kdnuggets.com/",
21+
"https://www.datacamp.com/tutorial/understanding-logistic-regression-python",
22+
"https://www.statisticshowto.com/probability-and-statistics/find-outliers/",
1623
]
1724

1825

@@ -37,7 +44,7 @@ def test_html_links(verbose):
3744

3845

3946
def test_markdown_links(verbose):
40-
"""Validates URLs in markdown links within a given text using check_links_in_string."""
47+
"""Validates URLs in Markdown links within a given text using check_links_in_string."""
4148
text = "Check [Example](https://err.com) or [Test](http://test.org)"
4249
result, urls = check_links_in_string(text, verbose, return_bad=True)
4350
assert result is False
@@ -49,7 +56,7 @@ def test_mixed_formats(verbose):
4956
text = "A <a href='https://1.com'>link</a> and [markdown](https://2.org) and https://3.net"
5057
result, urls = check_links_in_string(text, return_bad=True)
5158
assert result is False
52-
assert set(urls) == {"https://1.com", "https://2.org", "https://3.net"}
59+
assert set(urls) == {"https://1.com", "https://3.net"}
5360

5461

5562
def test_duplicate_urls(verbose):
@@ -89,7 +96,7 @@ def test_urls_with_different_tlds(verbose):
8996
text = "Different TLDs: https://err.ml https://err.org https://err.net https://err.io https://err.ai"
9097
result, urls = check_links_in_string(text, verbose, return_bad=True)
9198
assert result is False
92-
assert set(urls) == {"https://err.ml", "https://err.org", "https://err.net", "https://err.io", "https://err.ai"}
99+
assert set(urls) == {"https://err.ml", "https://err.io", "https://err.ai"}
93100

94101

95102
def test_case_sensitivity(verbose):

0 commit comments

Comments
 (0)