@@ -8,6 +8,7 @@
 
 logger = get_logger("web-loader")
 
+
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
     Chromium web driver with proxy protection.
@@ -33,6 +34,7 @@ def __init__(
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
+        storage_state: Optional[str] = None,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -62,6 +64,7 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
         self.requires_js_support = requires_js_support
+        self.storage_state = storage_state
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -91,7 +94,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 driver.quit()
 
@@ -113,7 +118,9 @@ async def ascrape_playwright(self, url: str) -> str:
                 browser = await p.chromium.launch(
                     headless=self.headless, proxy=self.proxy, **self.browser_config
                 )
-                context = await browser.new_context()
+                context = await browser.new_context(
+                    storage_state=self.storage_state
+                )
                 await Malenia.apply_stealth(context)
                 page = await context.new_page()
                 await page.goto(url, wait_until="domcontentloaded")
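For reference, Playwright's `new_context(storage_state=...)` accepts either a dict or a path to a JSON file previously written by `BrowserContext.storage_state(path=...)`. A sketch of producing such a file, where the login URL, the login steps, and the file name are placeholders:

```python
# Sketch: capture a storage-state file after logging in, for later reuse by
# ChromiumLoader via new_context(storage_state=...).
import asyncio
from playwright.async_api import async_playwright

async def save_state(path: str = "auth_state.json") -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://example.com/login")  # placeholder URL
        # ... perform the login steps here (scripted or manual) ...
        await context.storage_state(path=path)        # writes cookies + localStorage
        await browser.close()

asyncio.run(save_state())
```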
@@ -125,10 +132,12 @@ async def ascrape_playwright(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
+                    raise RuntimeError(
+                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
+                    )
             finally:
-                if 'browser' in locals():
-                    await browser.close()
+                if "browser" in locals():
+                    await browser.close()
 
     async def ascrape_with_js_support(self, url: str) -> str:
         """
@@ -138,7 +147,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
             url (str): The URL to scrape.
 
         Returns:
-            str: The fully rendered HTML content after JavaScript execution, 
+            str: The fully rendered HTML content after JavaScript execution,
                 or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
@@ -153,7 +162,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                 browser = await p.chromium.launch(
                     headless=self.headless, proxy=self.proxy, **self.browser_config
                 )
-                context = await browser.new_context()
+                context = await browser.new_context(
+                    storage_state=self.storage_state
+                )
                 page = await context.new_page()
                 await page.goto(url, wait_until="networkidle")
                 results = await page.content()
@@ -163,7 +174,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 await browser.close()
 
@@ -180,7 +193,9 @@ def lazy_load(self) -> Iterator[Document]:
             Document: The scraped content encapsulated within a Document object.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         for url in self.urls:
@@ -202,7 +217,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             source URL as metadata.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         tasks = [scraping_fn(url) for url in self.urls]
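`alazy_load` resolves the scraping coroutine by backend name and builds one coroutine per URL, yielding a `Document` per page with the source URL as metadata. A small consumption sketch (URLs and the storage-state file are placeholders; the import path is assumed):

```python
# Sketch: stream scraped pages through the async iterator touched above.
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed import path

async def main() -> None:
    loader = ChromiumLoader(
        ["https://example.com/a", "https://example.com/b"],  # placeholder URLs
        storage_state="auth_state.json",                     # optional, per this diff
    )
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))

asyncio.run(main())
```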