Pre/beta #838

Merged on Dec 5, 2024 (21 commits)

Commits:
7285ab0  fix: error on fetching the code (VinciGit00, Nov 24, 2024)
faf0c01  feat: revert search function (VinciGit00, Nov 24, 2024)
92bb8bb  Merge pull request #824 from ScrapeGraphAI/main (VinciGit00, Nov 24, 2024)
8aa9103  feat: add api integration (VinciGit00, Nov 24, 2024)
7ff1051  Merge pull request #825 from ScrapeGraphAI/revert-to-1.19 (VinciGit00, Nov 24, 2024)
b98dd39  ci(release): 1.32.0-beta.1 [skip ci] (semantic-release-bot, Nov 24, 2024)
7da7bfe  fix: improved links extraction for parse_node, resolves #822 (Levyathanus, Nov 24, 2024)
c84ff56  Merge pull request #826 from ScrapeGraphAI/fix-runtime-error (VinciGit00, Nov 25, 2024)
8b17764  ci(release): 1.32.0-beta.2 [skip ci] (semantic-release-bot, Nov 25, 2024)
adddd64  Merge pull request #828 from Levyathanus/pre/beta (VinciGit00, Nov 26, 2024)
0769fce  ci(release): 1.32.0-beta.3 [skip ci] (semantic-release-bot, Nov 26, 2024)
5fe528a  chore: migrate from rye to uv (f-aguzzi, Nov 27, 2024)
209b445  feat: add sdk integration (VinciGit00, Nov 28, 2024)
58ca0f5  Merge pull request #833 from ScrapeGraphAI/776-migrate-to-uv (VinciGit00, Nov 28, 2024)
1ff7f88  Merge branch 'api-integration' into pre/beta (VinciGit00, Dec 2, 2024)
67c9859  ci(release): 1.32.0-beta.4 [skip ci] (semantic-release-bot, Dec 2, 2024)
ba6e931  feat: add API integration (VinciGit00, Dec 2, 2024)
74985fb  Merge branch 'pre/beta' of https://github.com/ScrapeGraphAI/Scrapegra… (VinciGit00, Dec 2, 2024)
fbb4252  ci(release): 1.32.0-beta.5 [skip ci] (semantic-release-bot, Dec 2, 2024)
a86e7d6  enhancement: add support for Playwright's `storage_state` parameter (… (aflansburg, Dec 3, 2024)
7f0e1d2  Merge branch 'main' into pre/beta (VinciGit00, Dec 5, 2024)
3 changes: 1 addition & 2 deletions CHANGELOG.md
@@ -1,9 +1,8 @@
## [1.32.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.4...v1.32.0-beta.5) (2024-12-02)
## [1.32.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.31.1...v1.32.0) (2024-12-02)


### Features

* add API integration ([46373af](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/46373afe6d8c05ad26039e68190f13d82b20a349))

## [1.32.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.32.0-beta.3...v1.32.0-beta.4) (2024-12-02)

93 changes: 93 additions & 0 deletions examples/extras/authenticated_playwright.py
@@ -0,0 +1,93 @@
"""
Example leveraging a state file containing session cookies which
might be leveraged to authenticate to a website and scrape protected
content.
"""

import os
import random
from dotenv import load_dotenv

# import playwright so we can use it to create the state file
from playwright.async_api import async_playwright

from scrapegraphai.graphs import OmniScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Leveraging Playwright external to the invocation of the graph to
# login and create the state file
# ************************************************


# note this is just an example and probably won't actually work on
# LinkedIn, the implementation of the login is highly dependent on the website
async def do_login():
async with async_playwright() as playwright:
browser = await playwright.chromium.launch(
timeout=30000,
headless=False,
slow_mo=random.uniform(500, 1500),
)
page = await browser.new_page()

# very basic implementation of a login, in reality it may be trickier
await page.goto("https://www.linkedin.com/login")
await page.get_by_label("Email or phone").fill("some_bloke@some_domain.com")
await page.get_by_label("Password").fill("test1234")
await page.get_by_role("button", name="Sign in").click()
await page.wait_for_timeout(3000)

# assuming a successful login, we save the cookies to a file
await page.context.storage_state(path="./state.json")


async def main():
await do_login()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_api_key = os.getenv("OPENAI_APIKEY")

graph_config = {
"llm": {
"api_key": openai_api_key,
"model": "openai/gpt-4o",
},
"max_images": 10,
"headless": False,
# provide the path to the state file
"storage_state": "./state.json",
}

# ************************************************
# Create the OmniScraperGraph instance and run it
# ************************************************

omni_scraper_graph = OmniScraperGraph(
prompt="List me all the projects with their description.",
source="https://www.linkedin.com/feed/",
config=graph_config,
)

# the storage_state is used to load the cookies from the state file
# so we are authenticated and able to scrape protected content
result = omni_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = omni_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))


if __name__ == "__main__":
import asyncio

asyncio.run(main())
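If the login step is flaky, it can help to sanity-check the saved state file before handing it to the graph. A minimal sketch, assuming the ./state.json path used above (Playwright storage-state files keep cookies under a top-level "cookies" list):

# Quick sanity check on the storage state produced by do_login().
# Assumes the "./state.json" path from the example above.
import json
from pathlib import Path

state = json.loads(Path("./state.json").read_text())
cookies = state.get("cookies", [])
print(f"{len(cookies)} cookies saved")
if not cookies:
    print("Login probably failed; re-run do_login() before scraping.")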
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -3,7 +3,8 @@ name = "scrapegraphai"



version = "1.32.0"
version = "1.32.0b5"




37 changes: 27 additions & 10 deletions scrapegraphai/docloaders/chromium.py
@@ -8,6 +8,7 @@

logger = get_logger("web-loader")


class ChromiumLoader(BaseLoader):
"""Scrapes HTML pages from URLs using a (headless) instance of the
Chromium web driver with proxy protection.
@@ -33,6 +34,7 @@ def __init__(
proxy: Optional[Proxy] = None,
load_state: str = "domcontentloaded",
requires_js_support: bool = False,
storage_state: Optional[str] = None,
**kwargs: Any,
):
"""Initialize the loader with a list of URL paths.
@@ -62,6 +64,7 @@
self.urls = urls
self.load_state = load_state
self.requires_js_support = requires_js_support
self.storage_state = storage_state

async def ascrape_undetected_chromedriver(self, url: str) -> str:
"""
@@ -91,7 +94,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
if attempt == self.RETRY_LIMIT:
results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
results = (
f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
)
finally:
driver.quit()

@@ -113,7 +118,9 @@ async def ascrape_playwright(self, url: str) -> str:
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
context = await browser.new_context()
context = await browser.new_context(
storage_state=self.storage_state
)
await Malenia.apply_stealth(context)
page = await context.new_page()
await page.goto(url, wait_until="domcontentloaded")
@@ -125,10 +132,12 @@ async def ascrape_with_js_support(self, url: str) -> str:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
if attempt == self.RETRY_LIMIT:
raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
raise RuntimeError(
f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
)
finally:
if 'browser' in locals():
await browser.close()
if "browser" in locals():


async def ascrape_with_js_support(self, url: str) -> str:
"""
@@ -138,7 +147,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
url (str): The URL to scrape.

Returns:
str: The fully rendered HTML content after JavaScript execution,
str: The fully rendered HTML content after JavaScript execution,
or an error message if an exception occurs.
"""
from playwright.async_api import async_playwright
@@ -153,7 +162,9 @@
browser = await p.chromium.launch(
headless=self.headless, proxy=self.proxy, **self.browser_config
)
context = await browser.new_context()
context = await browser.new_context(
storage_state=self.storage_state
)
page = await context.new_page()
await page.goto(url, wait_until="networkidle")
results = await page.content()
@@ -163,7 +174,9 @@
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
if attempt == self.RETRY_LIMIT:
results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
results = (
f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
)
finally:
await browser.close()

@@ -180,7 +193,9 @@ def lazy_load(self) -> Iterator[Document]:
Document: The scraped content encapsulated within a Document object.
"""
scraping_fn = (
self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)

for url in self.urls:
@@ -202,7 +217,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
source URL as metadata.
"""
scraping_fn = (
self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)

tasks = [scraping_fn(url) for url in self.urls]
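The `storage_state` key set in `graph_config` in the example presumably ends up in `ChromiumLoader`, whose new parameter is passed to Playwright's `browser.new_context()` (that last step is visible in the diff above). Below is a minimal sketch of driving the loader directly; it assumes the default Playwright backend, that `headless` is accepted as a keyword, and that ./state.json was produced by an earlier login as in the example.

# Hypothetical direct use of ChromiumLoader with the new storage_state
# parameter; the URL and state-file path are placeholders.
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader


async def fetch_protected_pages():
    loader = ChromiumLoader(
        urls=["https://example.com/protected"],  # placeholder URL
        headless=True,
        storage_state="./state.json",  # cookies saved by a prior Playwright login
    )
    # Each yielded Document carries the scraped HTML plus the source URL as
    # metadata; the browser context is created with the stored cookies applied.
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))


asyncio.run(fetch_protected_pages())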