-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Browser manager refactor #42
Open
MatthewZMSU
wants to merge
21
commits into
master
Choose a base branch
from
browser-manager-refactor
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
0d99f78
add skip line in requirements.txt
MatthewZMSU 87babe5
Revert "Local mode (#34)"
MatthewZMSU c94a0d2
additional merging from service_browser_manager
MatthewZMSU 7bbc8fc
linter
MatthewZMSU 2f21358
Smth changed
MatthewZMSU 3b0a315
Changed playwright browser_manager
MatthewZMSU b113c69
PlayWrightBrowserManager is now ready!
MatthewZMSU a537f6b
added async
MatthewZMSU bbdf23d
working playwright?
MatthewZMSU 7fefd7f
working playwright?
MatthewZMSU a5dde45
exceptions in playwright!
MatthewZMSU 7bfb246
ruff
MatthewZMSU 61e82f9
pyppeteer
MatthewZMSU fbbb374
ruff
MatthewZMSU c009449
temporarily delete pyppeteer support
MatthewZMSU a970695
example spider
MatthewZMSU a64ed18
docs
MatthewZMSU c899533
ruff
MatthewZMSU a36c72f
README and requirements
MatthewZMSU 3b65883
README
MatthewZMSU 91882af
CI
MatthewZMSU File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,5 +10,3 @@ | |
} | ||
|
||
PUPPETEER_SERVICE_URL = "http://localhost:3000" | ||
|
||
PUPPETEER_LOCAL = False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from logging import ERROR | ||
|
||
import scrapy | ||
from scrapy.utils.log import failure_to_exc_info | ||
from twisted.python.failure import Failure | ||
|
||
from scrapypuppeteer import ( | ||
PuppeteerRequest, | ||
PuppeteerResponse, | ||
PuppeteerScreenshotResponse, | ||
) | ||
from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll | ||
|
||
|
||
class PlaywrightSpider(scrapy.Spider):
    """
    Example spider for the Playwright execution backend.

    Mostly, it is Compose spider, but it is very convenient for PlayWright
    testing: it opens pptr.dev, clicks through to a docs page, scrolls,
    and takes a full-page screenshot via one composed action.
    """

    name = "playwright_test"

    # Route downloads through the browser handler and force the asyncio
    # reactor, which the Playwright backend requires.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
        "DOWNLOAD_HANDLERS": {
            "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
            "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "EXECUTION_METHOD": "playwright",
    }

    def start_requests(self):
        """Yield one PuppeteerRequest composing goto, clicks, scroll, screenshot."""
        goto = GoTo("https://pptr.dev")
        click_1 = Click(
            "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)"
        )
        click_2 = Click(
            "#__docusaurus_skipToContent_fallback > div > div > aside > div > "
            "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a"
        )
        click = Compose(click_1, click_2)
        scroll = Scroll()
        screenshot = Screenshot(options={"full_page": True, "type": "jpeg"})

        compose_action = Compose(
            goto,
            click,
            scroll,
            screenshot,
        )

        yield PuppeteerRequest(
            compose_action,
            callback=self.parse,
            errback=self.errback,
            close_page=True,
        )

    def parse(self, response: PuppeteerResponse):
        """Check that the composed chain ended with a screenshot response."""
        assert isinstance(response, PuppeteerScreenshotResponse)
        self.log("Spider worked fine!")

    def errback(self, failure: Failure):
        """Report a failed request through the spider logger.

        Fixes: the previous version printed to stdout and passed the
        exc_info tuple from ``failure_to_exc_info`` as the log *message*;
        exc_info must go into the ``exc_info=`` keyword so the traceback
        is rendered properly.
        """
        self.logger.error(
            "Request failed: %r", failure, exc_info=failure_to_exc_info(failure)
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,2 @@ | ||
scrapy>=2.6 | ||
pyppeteer | ||
syncer | ||
bs4 | ||
playwright | ||
playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,89 @@ | ||
__all__ = ["BrowserManager"] | ||
__all__ = ["BrowserManager", "ContextManager"] | ||
|
||
import uuid
from abc import ABC, abstractmethod
from collections.abc import Coroutine
from typing import Any, Dict, Union

from scrapy import Request
from scrapy.utils.defer import deferred_from_coro
from twisted.internet.defer import Deferred

from scrapypuppeteer import CloseContextRequest
|
||
|
||
class ContextManager(ABC):
    """
    Owns a browser instance and tracks its contexts and pages by generated
    string ids. Subclasses supply the browser-specific coroutines for
    creating contexts and pages. Each context opened here owns exactly one
    page (see open_new_page).
    """

    def __init__(self, browser):
        self.browser = browser
        # Fix: `Dict[str, ...]` used Ellipsis as a generic parameter, which
        # is not a valid typing form (only Tuple[X, ...] admits it); it went
        # unnoticed because function-local annotations are never evaluated.
        # Values are backend-specific context/page objects, hence Any.
        self.contexts: Dict[str, Any] = {}
        self.pages: Dict[str, Any] = {}
        self.context2page: Dict[str, str] = {}  # context_id -> page_id

    @classmethod
    @abstractmethod
    async def async_init(cls):
        """Asynchronously construct a manager holding a live browser."""
        ...

    @staticmethod
    @abstractmethod
    async def _create_context(browser):
        """Create and return a new context inside *browser*."""
        ...

    @staticmethod
    @abstractmethod
    async def _create_page(context):
        """Create and return a new page inside *context*."""
        ...

    async def check_context_and_page(self, context_id, page_id):
        """Return a usable (context_id, page_id) pair, opening a fresh one
        if either id is missing/falsy."""
        if not context_id or not page_id:
            context_id, page_id = await self.open_new_page()
        return context_id, page_id

    async def open_new_page(self):
        """Open a new context with a single page; return their generated ids."""
        context_id = uuid.uuid4().hex.upper()
        page_id = uuid.uuid4().hex.upper()

        self.contexts[context_id] = await self._create_context(self.browser)
        self.pages[page_id] = await self._create_page(self.contexts[context_id])
        self.context2page[context_id] = page_id

        return context_id, page_id

    def get_page_by_id(self, context_id, page_id):
        """Return the page for *page_id*.

        ``context_id`` is currently unused; kept for interface symmetry
        with callers that track both ids.
        """
        return self.pages[page_id]

    async def close_browser(self):
        """Close the underlying browser, if one was created."""
        if self.browser:
            await self.browser.close()

    async def close_contexts(self, request: CloseContextRequest):
        """Close and forget every context listed in *request.contexts*,
        dropping the page each context owned."""
        for context_id in request.contexts:
            if context_id in self.contexts:
                await self.contexts[context_id].close()
                page_id = self.context2page.get(context_id)
                self.pages.pop(page_id, None)

                del self.contexts[context_id]
                del self.context2page[context_id]
|
||
|
||
class BrowserManager(ABC):
    """
    Template base for browser backends.

    Subclasses implement the private async hooks; this base adapts them to
    the Twisted world by wrapping coroutines into Deferreds.
    """

    @abstractmethod
    def _download_request(
        self, request: Request, spider
    ) -> Union[Coroutine, Request]: ...

    @abstractmethod
    async def _start_browser_manager(self) -> None: ...

    @abstractmethod
    async def _stop_browser_manager(self) -> None: ...

    def download_request(self, request: Request, spider) -> Union[Deferred, Request]:
        """Run the backend download.

        A coroutine result is wrapped into a Deferred; a plain Request is
        handed back unchanged for the caller to download itself.
        """
        outcome = self._download_request(request, spider)
        is_coro = isinstance(outcome, Coroutine)
        return deferred_from_coro(outcome) if is_coro else outcome

    def start_browser_manager(self) -> Deferred:
        """Start the backend, returning a Deferred for completion."""
        return deferred_from_coro(self._start_browser_manager())

    def stop_browser_manager(self) -> Deferred:
        """Stop the backend, returning a Deferred for completion."""
        return deferred_from_coro(self._stop_browser_manager())
76 changes: 76 additions & 0 deletions
76
scrapypuppeteer/browser_managers/browser_downloader_handler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from scrapy import signals | ||
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler | ||
from scrapy.crawler import Crawler | ||
from scrapy.utils.reactor import verify_installed_reactor | ||
from twisted.internet.defer import Deferred | ||
|
||
from scrapypuppeteer import CloseContextRequest | ||
from scrapypuppeteer.browser_managers import BrowserManager | ||
from scrapypuppeteer.browser_managers.playwright_browser_manager import ( | ||
PlaywrightBrowserManager, | ||
) | ||
from scrapypuppeteer.browser_managers.service_browser_manager import ( | ||
ServiceBrowserManager, | ||
) | ||
from scrapypuppeteer.request import ActionRequest | ||
|
||
|
||
class BrowserDownloaderHandler(HTTPDownloadHandler): | ||
""" | ||
Browser downloader handler. | ||
If instantiated, executes actions in the browser with provided requests. | ||
If given, then installed reactor must be `AsyncioSelectorReactor`. | ||
|
||
Currently, supports 3 browser types via EXECUTION_METHOD setting: | ||
* puppeteer -- see scrapy-puppeteer-service | ||
* pyppeteer -- not available, since the package is not actively supported | ||
* playwright -- see https://playwright.dev/python/ | ||
""" | ||
|
||
EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" | ||
|
||
def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None: | ||
super().__init__(settings, crawler=crawler) | ||
verify_installed_reactor( | ||
"twisted.internet.asyncioreactor.AsyncioSelectorReactor" | ||
) | ||
|
||
self.browser_manager = browser_manager | ||
|
||
@classmethod | ||
def from_crawler(cls, crawler: Crawler): | ||
settings = crawler.settings | ||
|
||
execution_method = crawler.settings.get( | ||
cls.EXECUTION_METHOD_SETTING, "PUPPETEER" | ||
).lower() | ||
|
||
match execution_method: | ||
case "puppeteer": | ||
browser_manager = ServiceBrowserManager() | ||
case "pyppeteer": | ||
raise ValueError( | ||
"Currently, pyppeteer cannot run on some machines since it is not actively supported." | ||
) | ||
case "playwright": | ||
browser_manager = PlaywrightBrowserManager() | ||
case _: | ||
raise ValueError( | ||
f"Invalid execution method: {execution_method.upper()}" | ||
) | ||
|
||
bdh = cls(settings, browser_manager, crawler=crawler) | ||
crawler.signals.connect( | ||
bdh.browser_manager.start_browser_manager, signals.spider_opened | ||
) # This makes the start VERY slow | ||
crawler.signals.connect( | ||
bdh.browser_manager.stop_browser_manager, signals.engine_stopped | ||
) | ||
return bdh | ||
|
||
def download_request(self, request, spider): | ||
if isinstance(request, (ActionRequest, CloseContextRequest)): | ||
dfd_or_request = self.browser_manager.download_request(request, spider) | ||
if isinstance(dfd_or_request, Deferred): | ||
return dfd_or_request | ||
return super().download_request(request, spider) |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GitHub dropped Python 3.7 support