Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#772 added functionality to change browser to firefox #848

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions examples/extras/chromium_selenium.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to separate the Firefox and Chromium examples into different files? What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VinciGit00 I believe that combining the Firefox and Chromium examples in the same file might be less approachable for the user, as it could cause confusion about the two implementations. However, from the perspective of cleaner code, you make a valid point.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VinciGit00, you are all doing a great job, and I am really impressed by your drive. If there is any possibility that I could officially dedicate myself to this fast-growing project, I would be very happy and excited to talk about it.

Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,11 @@ async def main():
# Test with Playwright backend
print("\n--- Testing Playwright Backend ---")
try:
scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True)
await test_scraper_with_analysis(scraper_playwright, urls_to_scrape)
scraper_playwright_chromium = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "chromium")
await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape)

scraper_playwright_firefox = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True, browser_name = "firefox")
await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape)
except ImportError as ie:
print(f"❌ Playwright ImportError: {ie}")
except Exception as e:
Expand All @@ -97,8 +100,11 @@ async def main():
# Test with Selenium backend
print("\n--- Testing Selenium Backend ---")
try:
scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True)
await test_scraper_with_analysis(scraper_selenium, urls_to_scrape)
scraper_selenium_chromium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "chromium")
await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape)

scraper_selenium_firefox = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True, browser_name = "firefox")
await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape)
except ImportError as ie:
print(f"❌ Selenium ImportError: {ie}")
except Exception as e:
Expand Down
75 changes: 64 additions & 11 deletions scrapegraphai/docloaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from langchain_core.documents import Document
import aiohttp
import async_timeout
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from typing import Union
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

Expand Down Expand Up @@ -36,6 +38,7 @@ def __init__(
load_state: str = "domcontentloaded",
requires_js_support: bool = False,
storage_state: Optional[str] = None,
browser_name: str = "chromium", #default chromium
**kwargs: Any,
):
"""Initialize the loader with a list of URL paths.
Expand Down Expand Up @@ -66,6 +69,7 @@ def __init__(
self.load_state = load_state
self.requires_js_support = requires_js_support
self.storage_state = storage_state
self.browser_name = browser_name

async def scrape(self, url:str) -> str:
if self.backend == "playwright":
Expand Down Expand Up @@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
while attempt < self.RETRY_LIMIT:
try:
async with async_timeout.timeout(self.TIMEOUT):
driver = uc.Chrome(headless=self.headless)
driver.get(url)
results = driver.page_source
logger.info(f"Successfully scraped {url}")
break
# Handling browser selection
if self.backend == "selenium":
if self.browser_name == "chromium":
options = ChromeOptions()
options.headless = self.headless
# Initialize undetected chromedriver for Selenium
driver = uc.Chrome(options=options)
driver.get(url)
results = driver.page_source
logger.info(f"Successfully scraped {url} with {self.browser_name}")
break
elif self.browser_name == "firefox":
from selenium.webdriver.firefox.options import Options as FirefoxOptions
options = FirefoxOptions()
options.headless = self.headless
# Initialize undetected Firefox driver (if required)
driver = webdriver.Firefox(options=options)
driver.get(url)
results = driver.page_source
logger.info(f"Successfully scraped {url} with {self.browser_name}")
break
else:
logger.error(f"Unsupported browser {self.browser_name} for Selenium.")
results = f"Error: Unsupported browser {self.browser_name}."
break
else:
logger.error(f"Unsupported backend {self.backend}.")
results = f"Error: Unsupported backend {self.backend}."
break
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
Expand All @@ -118,7 +146,8 @@ async def ascrape_playwright_scroll(
timeout: Union[int, None]=30,
scroll: int=15000,
sleep: float=2,
scroll_to_bottom: bool=False
scroll_to_bottom: bool=False,
browser_name: str = "chromium" #default chrome is added
) -> str:
"""
Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
Expand Down Expand Up @@ -175,9 +204,17 @@ async def ascrape_playwright_scroll(
while attempt < self.RETRY_LIMIT:
try:
async with async_playwright() as p:
browser = await p.chromium.launch(
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
elif browser_name == "firefox":
browser = await p.firefox.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
else:
raise ValueError(f"Invalid browser name: {browser_name}")
context = await browser.new_context()
await Malenia.apply_stealth(context)
page = await context.new_page()
Expand Down Expand Up @@ -235,7 +272,7 @@ async def ascrape_playwright_scroll(

return results

async def ascrape_playwright(self, url: str) -> str:
async def ascrape_playwright(self, url: str, browser_name: str = "chromium") -> str:
"""
Asynchronously scrape the content of a given URL using Playwright's async API.

Expand All @@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str:
while attempt < self.RETRY_LIMIT:
try:
async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
browser = await p.chromium.launch(
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
elif browser_name == "firefox":
browser = await p.firefox.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
else:
raise ValueError(f"Invalid browser name: {browser_name}")
context = await browser.new_context(
storage_state=self.storage_state
)
Expand All @@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str:



async def ascrape_with_js_support(self, url: str) -> str:
async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium") -> str:
"""
Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.

Expand All @@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str:
while attempt < self.RETRY_LIMIT:
try:
async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
browser = await p.chromium.launch(
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
elif browser_name == "firefox":
browser = await p.firefox.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
else:
raise ValueError(f"Invalid browser name: {browser_name}")
context = await browser.new_context(
storage_state=self.storage_state
)
Expand Down