tune scraper #962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 4 commits, Apr 14, 2025
3 changes: 2 additions & 1 deletion scrapegraphai/docloaders/chromium.py
@@ -360,7 +360,8 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
        else:
            raise ValueError(f"Invalid browser name: {browser_name}")
        context = await browser.new_context(
-            storage_state=self.storage_state
+            storage_state=self.storage_state,
+            ignore_https_errors=True,
        )
        await Malenia.apply_stealth(context)
        page = await context.new_page()
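For context, a minimal standalone sketch of what the new option does in plain Playwright, outside this loader; the helper name and test URL are illustrative, not part of the PR:

import asyncio

from playwright.async_api import async_playwright


async def fetch_insecure(url: str) -> str:
    """Hypothetical helper: fetch a page even if its TLS certificate is invalid."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # ignore_https_errors lets navigation proceed on self-signed or expired
        # certificates, mirroring the option added to ascrape_playwright above.
        context = await browser.new_context(ignore_https_errors=True)
        page = await context.new_page()
        await page.goto(url)
        html = await page.content()
        await browser.close()
        return html


html = asyncio.run(fetch_insecure("https://self-signed.badssl.com/"))
print(len(html))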
48 changes: 41 additions & 7 deletions scrapegraphai/utils/cleanup_html.py
@@ -3,12 +3,44 @@
"""

import re
+import json
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Comment
from minify_html import minify


+def extract_from_script_tags(soup):
+    script_content = []
+
+    for script in soup.find_all("script"):
+        content = script.string
+        if content:
+            try:
+                json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
+                json_matches = re.findall(json_pattern, content)
+
+                for potential_json in json_matches:
+                    try:
+                        parsed = json.loads(potential_json)
+                        if parsed:
+                            script_content.append(f"JSON data from script: {json.dumps(parsed, indent=2)}")
+                    except json.JSONDecodeError:
+                        pass
+
+                if "window." in content or "document." in content:
+                    data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
+                    data_matches = re.findall(data_pattern, content)
+
+                    for var_name, var_value in data_matches:
+                        script_content.append(f"Dynamic data - {var_name}: {var_value.strip()}")
+            except Exception:
+                if len(content) < 1000:
+                    script_content.append(f"Script content: {content.strip()}")
+
+    return "\n\n".join(script_content)
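A hypothetical usage sketch of the new helper; the sample markup and the productData / pageType names are invented for illustration:

from bs4 import BeautifulSoup

sample_html = """
<html><body>
<script>var productData = {"name": "Widget", "price": 9.99};</script>
<script>window.pageType = "product";</script>
</body></html>
"""

soup = BeautifulSoup(sample_html, "html.parser")
print(extract_from_script_tags(soup))
# The first script should come back as a pretty-printed "JSON data from script" entry,
# the second as a "Dynamic data - pageType" entry.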


def cleanup_html(html_content: str, base_url: str) -> str:
    """
    Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:

    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag else ""

-    for tag in soup.find_all(["script", "style"]):
+    script_content = extract_from_script_tags(soup)
+
+    for tag in soup.find_all("style"):
        tag.extract()

    link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
    body_content = soup.find("body")
    if body_content:
        minimized_body = minify(str(body_content))
-        return title, minimized_body, link_urls, image_urls
+        return title, minimized_body, link_urls, image_urls, script_content

    else:
        raise ValueError(
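Because the function now returns a fifth element, call sites that unpacked four values need a matching update; a hypothetical caller-side sketch with placeholder HTML and base URL:

from scrapegraphai.utils.cleanup_html import cleanup_html

raw_html = "<html><head><title>Demo</title></head><body><p>Hello</p></body></html>"
# script_content is the new fifth element; the first four keep their old meaning.
title, minimized_body, link_urls, image_urls, script_content = cleanup_html(
    raw_html, "https://example.com"
)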
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
        tag.string = ""

-    attrs_to_keep = ["class", "id", "href", "src"]
+    attrs_to_keep = ["class", "id", "href", "src", "type"]
    for tag in soup.find_all(True):
        for attr in list(tag.attrs):
            if attr not in attrs_to_keep:
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
    if reduction == 1:
        return minify_html(str(soup))

-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
        tag.decompose()

    body = soup.body
    if not body:
        return "No <body> tag found in the HTML"

    for tag in body.find_all(string=True):
-        if tag.parent.name not in ["script", "style"]:
+        if tag.parent.name not in ["script"]:
            tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])

    reduced_html = str(body)
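Taken together, reduce_html now keeps <script> blocks (and their type attribute) while still stripping styles; an illustrative sketch with invented markup, assuming reduction levels above 1 take the deeper path shown here:

from scrapegraphai.utils.cleanup_html import reduce_html

sample = (
    "<html><body>"
    "<style>p { color: red; }</style>"
    '<script type="application/json">{"sku": 42}</script>'
    "<p>A long run of visible text that will be truncated.</p>"
    "</body></html>"
)
# The style block is removed, the JSON script block survives intact,
# and visible text is shortened as before.
print(reduce_html(sample, 2))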
62 changes: 12 additions & 50 deletions scrapegraphai/utils/proxy_rotation.py
@@ -10,7 +10,7 @@
import requests
from fp.errors import FreeProxyException
from fp.fp import FreeProxy
+from urllib.parse import urlparse

class ProxyBrokerCriteria(TypedDict, total=False):
    """
@@ -188,59 +188,21 @@ def is_ipv4_address(address: str) -> bool:


def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
-    """parses a proxy configuration or searches for a new one matching
-    the specified broker criteria
-
-    Args:
-        proxy: The proxy configuration to parse or search for.
-
-    Returns:
-        A 'playwright' compliant proxy configuration.
-
-    Notes:
-        - If the proxy server is a IP address, it is assumed to be
-          a proxy server address.
-        - If the proxy server is 'broker', a proxy server is searched for
-          based on the provided broker criteria.
-
-    Example:
-        >>> proxy = {
-        ...     "server": "broker",
-        ...     "criteria": {
-        ...         "anonymous": True,
-        ...         "countryset": {"GB", "US"},
-        ...         "secure": True,
-        ...         "timeout": 5.0
-        ...         "search_outside_if_empty": False
-        ...     }
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "<proxy-server-matching-criteria>",
-        }
-
-    Example:
-        >>> proxy = {
-        ...     "server": "192.168.1.1:8080",
-        ...     "username": "<username>",
-        ...     "password": "<password>"
-        ... }
-
-        >>> parse_or_search_proxy(proxy)
-        {
-            "server": "192.168.1.1:8080",
-            "username": "<username>",
-            "password": "<password>"
-        }
-    """
-    assert "server" in proxy, "missing server in the proxy configuration"
+    """
+    Parses a proxy configuration or searches for a matching one via broker.
+    """
+    assert "server" in proxy, "Missing 'server' field in the proxy configuration."

-    server_address = re.sub(r"^\w+://", "", proxy["server"]).split(":", maxsplit=1)[0]
+    parsed_url = urlparse(proxy["server"])
+    server_address = parsed_url.hostname
+    if server_address is None:
+        raise ValueError(f"Invalid proxy server format: {proxy['server']}")

-    if is_ipv4_address(server_address):
+    # Accept both IP addresses and domain names like 'gate.nodemaven.com'
+    if is_ipv4_address(server_address) or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address):
        return _parse_proxy(proxy)

-    assert proxy["server"] == "broker", "unknown proxy server"
+    assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"

    return _search_proxy(proxy)
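With the hostname-based check, a domain proxy such as gate.nodemaven.com now flows straight to _parse_proxy instead of being rejected; a hypothetical configuration sketch with placeholder credentials (note the explicit scheme, which urlparse needs to extract the hostname):

from scrapegraphai.utils.proxy_rotation import parse_or_search_proxy

proxy = {
    "server": "http://gate.nodemaven.com:8080",
    "username": "<username>",
    "password": "<password>",
}
settings = parse_or_search_proxy(proxy)
# Expected to return playwright-style proxy settings with the same server,
# username, and password, rather than falling through to the broker branch.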