Skip to content

Commit

Permalink
enhanced backend address extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
wzdnzd committed Nov 13, 2024
1 parent 66708f3 commit ac7111e
Showing 1 changed file with 95 additions and 29 deletions.
124 changes: 95 additions & 29 deletions subscribe/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1599,39 +1599,100 @@ def run_crawl(url: str, separator: str, address_regex: str, coupon_regex: str) -

return result

def extract_real_url(domain: str, retry: int = 2) -> str:
count, retry = 0, max(retry, 1)
url = urllib.parse.urljoin(domain, "env.js")
def extract_backend_url(domain: str, retry: int = 2) -> str:
# TODO: exploring a more generalized approach to backend addresses
def request_once(suffix: str) -> tuple[bool, str]:
count, suffix = 0, utils.trim(suffix)
url = urllib.parse.urljoin(domain, suffix)

while count < retry:
count += 1
while count < retry:
count += 1

try:
request = urllib.request.Request(url=url, headers=utils.DEFAULT_HTTP_HEADERS, method="GET")
response = urllib.request.urlopen(request, timeout=6, context=utils.CTX)
try:
request = urllib.request.Request(url=url, headers=utils.DEFAULT_HTTP_HEADERS, method="GET")
response = urllib.request.urlopen(request, timeout=6, context=utils.CTX)

word = "" if not suffix else (suffix if suffix.startswith("/") else "/" + suffix)
if word and not utils.trim(response.geturl()).endswith(word):
return True, ""

content = response.read()
try:
content = str(content, encoding="utf8")
except:
content = gzip.decompress(content).decode("utf8")

return False, content
except urllib.error.HTTPError as e:
if e.code == 404:
return False, ""
except urllib.error.URLError as e:
if isinstance(e.reason, (socket.gaierror, ssl.SSLError, socket.timeout)):
return True, ""
except Exception as e:
pass

return False, ""

def attempt_env() -> str:
status, content = request_once(suffix="/env.js")
if status:
return terminated

# do not redirect
# opener = urllib.request.build_opener(utils.NoRedirect)
# response = opener.open(request, timeout=6)
if not content:
return ""

if not utils.trim(response.geturl()).endswith("/env.js"):
return ""
if groups := re.findall(r"\bhost\b:(?:\s+)?[\"\'](https?://[^\s\r\t]+)[\"\']", content, flags=re.I):
return utils.trim(groups[0])

content = response.read()
try:
content = str(content, encoding="utf8")
except:
content = gzip.decompress(content).decode("utf8")

groups = re.findall(r"window.routerBase(?:\s+)?=(?:\s+)?['\"](https?://.*)['\"]", content, flags=re.I)
return groups[0].rstrip("/") if groups and groups[0] else domain
except urllib.error.HTTPError:
return domain
except urllib.error.URLError as e:
if isinstance(e.reason, (socket.gaierror, ssl.SSLError, socket.timeout)):
return ""
except Exception as e:
pass
groups = re.findall(r"window.routerBase(?:\s+)?=(?:\s+)?['\"](https?://.*)['\"]", content, flags=re.I)
return groups[0].rstrip("/") if groups and groups[0] else ""

def attempt_zero() -> str:
status, content = request_once(suffix="/config.json")
if status:
return terminated

if not content:
return ""

try:
data = json.loads(content)
# for https://github.com/amyouran/v2board-Zero-Theme
link = utils.trim(data.get("api_base", ""))
if not link:
# for https://github.com/DyAxy/V2B-Theme-Nest
link = utils.trim(data.get("apiUrl", ""))

return utils.extract_domain(url=link, include_protocal=True)
except:
return ""

def attempt_buddy() -> str:
# for https://github.com/vlesstop/v2board-theme-buddy
status, content = request_once(suffix="/config.js")
if status:
return terminated

group = re.findall(r"\bhost\b:(?:\s+)?[\"\'](https?://[^\s\r\t]+)[\"\']", content, flags=re.I)
return "" if not group else utils.trim(group[0])

def attempt_aurora() -> str:
# for https://github.com/krsunm/Aurora
status, content = request_once(suffix="")
if status:
return terminated

group = re.findall(r"\bserverUrl\b:(?:\s+)?[\"\'](https?://[^\s\r\t]+)[\"\']", content, flags=re.I)
return "" if not group else utils.trim(group[0])

terminated, retry = "terminated", max(retry, 1)
for func in [attempt_env, attempt_zero, attempt_buddy, attempt_aurora]:
backend = func()
if terminated == backend:
return ""
if backend:
return backend

return domain

Expand Down Expand Up @@ -1668,7 +1729,12 @@ def extract_real_url(domain: str, retry: int = 2) -> str:

# extract real routing base url
logger.info(f"[AirPortCollector] fetched {len(domains)} airport, start extracting real routing addresses")
sites = utils.multi_thread_run(func=extract_real_url, tasks=domains, num_threads=num_thread, show_progress=display)
sites = utils.multi_thread_run(
func=extract_backend_url,
tasks=domains,
num_threads=num_thread,
show_progress=display,
)

tasks = [[site, rigid, chuck] for site in sites if site]
records = {sites[i]: candidates.get(domains[i], "") for i in range(len(sites)) if sites[i]}
Expand Down

0 comments on commit ac7111e

Please sign in to comment.