diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
index 255fc3c2..f61f4d51 100644
--- a/scrapegraphai/docloaders/chromium.py
+++ b/scrapegraphai/docloaders/chromium.py
@@ -4,6 +4,7 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -102,14 +103,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         return results
 
+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False
+    ) -> str:
+        """
+        Asynchronously scrape the content of a given URL using Playwright's async API and scrolling.
+
+        Notes:
+        - The user gets to decide between scrolling to the bottom of the page or scrolling for a finite
+          amount of time (see the Example below).
+        - If the user chooses to scroll to the bottom, the scraper stops when the page height stops changing
+          or when the timeout is reached. In this case, the user should opt for a larger timeout value than usual.
+        - Sleep must be set to a value greater than 0 to allow lazy-loaded content to load.
+          Additionally, when used with scroll_to_bottom=True, the sleep value should be set higher, to
+          make sure that the scrolling actually happens, thereby allowing the page height to change.
+        - Probably the best website to test this on is https://www.reddit.com/, as it has infinite scrolling.
+
+        Args:
+        - url (str): The URL to scrape.
+        - timeout (Union[int, None]): The maximum time to spend scrolling, in seconds. This is separate from
+          the global timeout. If set, must be greater than 0. Can also be set to None, in which case the
+          scraper only stops when the page height stops changing.
+        - scroll (int): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than
+          5000 pixels; less than this and we don't scroll enough to see any content change.
+        - sleep (float): The number of seconds to sleep after each scroll, to allow the page to load.
+          Defaults to 2. Must be greater than 0.
+        - scroll_to_bottom (bool): Whether to keep scrolling until the page height stops changing. Defaults to False.
+
+        Returns:
+            str: The scraped HTML content
+
+        Raises:
+        - ValueError: If the timeout value is less than or equal to 0.
+        - ValueError: If the sleep value is less than or equal to 0.
+        - ValueError: If the scroll value is less than 5000.
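+
+        Example:
+            A minimal sketch, assuming `loader` is a configured ChromiumLoader instance;
+            a larger timeout and a higher sleep suit scroll_to_bottom=True:
+
+            >>> html = asyncio.run(loader.ascrape_playwright_scroll(
+            ...     "https://www.reddit.com/", timeout=60, scroll_to_bottom=True, sleep=3))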
+        """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some sites like
+        # https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful in case we scroll with a timer and want to stop shortly after
+                    # reaching the bottom, or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to not run out of memory
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to
+                        # lazy loading or other reasons. In such cases, the user should set
+                        # scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        # Allow some time for any lazy-loaded content to load; asyncio.sleep
+                        # (rather than time.sleep) keeps the event loop free for other tasks.
+                        await asyncio.sleep(sleep)
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                await browser.close()
+
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
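+
+        Example:
+            A minimal sketch, assuming `loader` is a ChromiumLoader configured
+            with backend="playwright":
+
+            >>> html = asyncio.run(loader.ascrape_playwright("https://example.com"))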
""" from playwright.async_api import async_playwright from undetected_playwright import Malenia logger.info(f"Starting scraping with {self.backend}...") + results = "" attempt = 0 while attempt < self.RETRY_LIMIT: @@ -127,16 +258,16 @@ async def ascrape_playwright(self, url: str) -> str: await page.wait_for_load_state(self.load_state) results = await page.content() logger.info("Content scraped") - return results + break except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e: attempt += 1 logger.error(f"Attempt {attempt} failed: {e}") if attempt == self.RETRY_LIMIT: - raise RuntimeError( - f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}" - ) + results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}" finally: - if "browser" in locals(): + await browser.close() + + return results async def ascrape_with_js_support(self, url: str) -> str: