@@ -21,7 +21,7 @@ def clean_url(url):
21
21
22
22
23
23
def is_url (url , check = True , max_attempts = 3 , timeout = 2 ):
24
- """Check if string is URL and check if URL exists."""
24
+ """Check if string is URL and optionally verify it exists."""
25
25
allow_list = (
26
26
"localhost" ,
27
27
"127.0.0" ,
@@ -56,15 +56,29 @@ def is_url(url, check=True, max_attempts=3, timeout=2):
56
56
57
57
# Check response
58
58
if check :
59
+ headers = {
60
+ "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36" ,
61
+ "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" ,
62
+ "Accept-Language" : "en-US,en;q=0.9,es;q=0.8,zh-CN;q=0.7,zh;q=0.6" ,
63
+ "Accept-Encoding" : "gzip, deflate, br, zstd" ,
64
+ "sec-ch-ua" : '"Chromium";v="132", "Google Chrome";v="132", "Not_A Brand";v="99"' ,
65
+ "sec-ch-ua-mobile" : "?0" ,
66
+ "sec-ch-ua-platform" : '"macOS"' ,
67
+ "Sec-Fetch-Site" : "none" ,
68
+ "Sec-Fetch-Mode" : "navigate" ,
69
+ "Sec-Fetch-User" : "?1" ,
70
+ "Sec-Fetch-Dest" : "document" ,
71
+ "Referer" : "https://www.google.com/" ,
72
+ "Origin" : "https://www.google.com/" ,
73
+ }
74
+ bad_codes = {404 , 410 , 500 , 502 , 503 , 504 }
59
75
for attempt in range (max_attempts ):
60
76
try :
61
- headers = {
62
- "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ,
63
- "Accept" : "*/*" , # Wildcard for maximum compatibility
64
- "Accept-Language" : "*" , # Wildcard for any language
65
- "Accept-Encoding" : "*" , # Wildcard for any encoding
66
- }
67
- return requests .head (url , headers = headers , timeout = timeout , allow_redirects = True ).status_code < 400
77
+ response = requests .head (url , headers = headers , timeout = timeout , allow_redirects = True )
78
+ if response .status_code not in bad_codes :
79
+ return True
80
+ response = requests .get (url , headers = headers , timeout = timeout , allow_redirects = True , stream = True )
81
+ return response .status_code not in bad_codes # Result of the GET fallback, reached only when HEAD returned a status in bad_codes
68
82
except Exception :
69
83
if attempt == max_attempts - 1 : # last attempt
70
84
return False
0 commit comments