diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 1d252d0d..f579b98a 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -360,7 +360,8 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
             else:
                 raise ValueError(f"Invalid browser name: {browser_name}")
             context = await browser.new_context(
-                storage_state=self.storage_state
+                storage_state=self.storage_state,
+                ignore_https_errors=True,
             )
             await Malenia.apply_stealth(context)
             page = await context.new_page()
diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py
index 903c15ad..6da03a90 100644
--- a/scrapegraphai/utils/cleanup_html.py
+++ b/scrapegraphai/utils/cleanup_html.py
@@ -3,12 +3,44 @@
 """
 
 import re
+import json
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Comment
 from minify_html import minify
 
 
+def extract_from_script_tags(soup):
+    script_content = []
+
+    for script in soup.find_all("script"):
+        content = script.string
+        if content:
+            try:
+                json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
+                json_matches = re.findall(json_pattern, content)
+
+                for potential_json in json_matches:
+                    try:
+                        parsed = json.loads(potential_json)
+                        if parsed:
+                            script_content.append(f"JSON data from script: {json.dumps(parsed, indent=2)}")
+                    except json.JSONDecodeError:
+                        pass
+
+                if "window." in content or "document." in content:
+                    data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
+                    data_matches = re.findall(data_pattern, content)
+
+                    for var_name, var_value in data_matches:
+                        script_content.append(f"Dynamic data - {var_name}: {var_value.strip()}")
+            except Exception:
+                if len(content) < 1000:
+                    script_content.append(f"Script content: {content.strip()}")
+
+    return "\n\n".join(script_content)
+
+
 def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
 
     title_tag = soup.find("title")
     title = title_tag.get_text() if title_tag else ""
-
-    for tag in soup.find_all(["script", "style"]):
+
+    script_content = extract_from_script_tags(soup)
+
+    for tag in soup.find_all("style"):
         tag.extract()
 
     link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     body_content = soup.find("body")
     if body_content:
         minimized_body = minify(str(body_content))
-        return title, minimized_body, link_urls, image_urls
+        return title, minimized_body, link_urls, image_urls, script_content
 
     else:
         raise ValueError(
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
         comment.extract()
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.string = ""
 
-    attrs_to_keep = ["class", "id", "href", "src"]
+    attrs_to_keep = ["class", "id", "href", "src", "type"]
     for tag in soup.find_all(True):
         for attr in list(tag.attrs):
             if attr not in attrs_to_keep:
@@ -118,7 +152,7 @@
     if reduction == 1:
         return minify_html(str(soup))
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.decompose()
 
     body = soup.body
@@ -126,7 +160,7 @@
         return "No tag found in the HTML"
 
     for tag in body.find_all(string=True):
-        if tag.parent.name not in ["script", "style"]:
+        if tag.parent.name not in ["script"]:
            tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])
 
     reduced_html = str(body)
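Note on the cleanup_html.py hunks above: cleanup_html now returns a five-element tuple, with the text gathered by extract_from_script_tags appended as the last element, so callers that unpack four values need a matching update. A minimal usage sketch under that assumption; the HTML snippet and variable names are illustrative only and not taken from the codebase:

    from scrapegraphai.utils.cleanup_html import cleanup_html

    # Illustrative page: one inline script assigning JSON-like data to window.appState.
    html = (
        "<html><head><title>Example</title>"
        '<script>window.appState = {"user": "anna"};</script>'
        "</head><body><a href='/docs'>Docs</a></body></html>"
    )

    # The fifth element carries data recovered from <script> tags; for this snippet it
    # contains both a pretty-printed JSON blob and a "Dynamic data - appState: ..." line.
    title, body, link_urls, image_urls, script_content = cleanup_html(
        html, base_url="https://example.com"
    )
    print(script_content)
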
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
index 8c1fdb09..8e8534e1 100644
--- a/scrapegraphai/utils/proxy_rotation.py
+++ b/scrapegraphai/utils/proxy_rotation.py
@@ -10,7 +10,7 @@
 import requests
 from fp.errors import FreeProxyException
 from fp.fp import FreeProxy
 
-
+from urllib.parse import urlparse
 class ProxyBrokerCriteria(TypedDict, total=False):
     """
@@ -188,59 +188,21 @@ def is_ipv4_address(address: str) -> bool:
 
 
 def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
-    """parses a proxy configuration or searches for a new one matching
-    the specified broker criteria
-
-    Args:
-        proxy: The proxy configuration to parse or search for.
-
-    Returns:
-        A 'playwright' compliant proxy configuration.
-
-    Notes:
-        - If the proxy server is a IP address, it is assumed to be
-          a proxy server address.
-        - If the proxy server is 'broker', a proxy server is searched for
-          based on the provided broker criteria.
-
-    Example:
-        >>> proxy = {
-        ...     "server": "broker",
-        ...     "criteria": {
-        ...         "anonymous": True,
-        ...         "countryset": {"GB", "US"},
-        ...         "secure": True,
-        ...         "timeout": 5.0
-        ...         "search_outside_if_empty": False
-        ...     }
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "",
-        }
-
-    Example:
-        >>> proxy = {
-        ...     "server": "192.168.1.1:8080",
-        ...     "username": "",
-        ...     "password": ""
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "192.168.1.1:8080",
-            "username": "",
-            "password": ""
-        }
     """
-    assert "server" in proxy, "missing server in the proxy configuration"
+    Parses a proxy configuration or searches for a matching one via broker.
+    """
+    assert "server" in proxy, "Missing 'server' field in the proxy configuration."
+
+    parsed_url = urlparse(proxy["server"])
+    server_address = parsed_url.hostname
 
-    server_address = re.sub(r"^\w+://", "", proxy["server"]).split(":", maxsplit=1)[0]
+    if server_address is None:
+        raise ValueError(f"Invalid proxy server format: {proxy['server']}")
 
-    if is_ipv4_address(server_address):
+    # Accept both IP addresses and domain names like 'gate.nodemaven.com'
+    if is_ipv4_address(server_address) or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address):
         return _parse_proxy(proxy)
 
-    assert proxy["server"] == "broker", "unknown proxy server"
+    assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"
 
     return _search_proxy(proxy)
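Note on the proxy_rotation.py hunks above: parse_or_search_proxy now derives the host with urlparse, so the server value needs an explicit scheme (e.g. http://) for a hostname to be extracted, and domain-name proxies are routed to _parse_proxy instead of hitting the "broker" assertion. A rough sketch of the newly accepted shape; the port and credentials are placeholders, gate.nodemaven.com comes from the comment in the diff, and the result is the Playwright-compliant proxy settings described in the function's original docstring:

    from scrapegraphai.utils.proxy_rotation import parse_or_search_proxy

    # Hostname-based proxy: matched by the new domain-name regex and handed to _parse_proxy.
    hostname_proxy = {
        "server": "http://gate.nodemaven.com:8080",  # scheme required for urlparse to find the host
        "username": "proxy_user",                    # placeholder credentials
        "password": "proxy_pass",
    }
    settings = parse_or_search_proxy(hostname_proxy)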