Browser manager refactor #42

Status: Open. Wants to merge 21 commits into base: master.
4 changes: 2 additions & 2 deletions .github/workflows/python-test.yml
@@ -8,12 +8,12 @@ jobs:
strategy:
matrix:
include:
- python-version: "3.7.x" # Min Python version (No 3.6 version in GitHub repository)
- python-version: "3.8.x"
- python-version: "3.8.x" # Min Python version (No 3.7 version in GitHub repository)
Contributor Author: GitHub dropped Python 3.7 support.

- python-version: "3.9.x"
- python-version: "3.10.x"
- python-version: "3.11.x"
- python-version: "3.12.x"
- python-version: "3.13.x"
- python-version: "3.x" # Last Python version
steps:
- uses: actions/checkout@v3
16 changes: 14 additions & 2 deletions README.md
@@ -24,9 +24,21 @@ DOWNLOADER_MIDDLEWARES = {
}

PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods
```
You may prefer not to use scrapy-puppeteer-service.
In that case, you can run the browser locally with Playwright or Pyppeteer.

# To change the execution method, you must add the corresponding setting:
EXECUTION_METHOD = "Puppeteer"
```python
DOWNLOADER_MIDDLEWARES = {
'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
}

DOWNLOAD_HANDLERS = { # Add browser download handler
"http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
"https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" # The asyncio reactor is required
EXECUTION_METHOD = "playwright" # Choose execution method
```
Available methods: `Puppeteer`, `Pyppeteer`, `Playwright`
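
The same configuration can also be scoped to a single spider through `custom_settings`; a minimal sketch based on the example spider added in this PR (the spider name is a placeholder, and `1042` is simply the middleware priority used throughout the examples):

```python
import scrapy


class LocalBrowserSpider(scrapy.Spider):
    name = "local_browser"  # placeholder name

    # Per-spider equivalent of the project-wide settings shown above
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
        "DOWNLOAD_HANDLERS": {
            "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
            "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "EXECUTION_METHOD": "playwright",
    }
```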

2 changes: 0 additions & 2 deletions examples/settings.py
@@ -10,5 +10,3 @@
}

PUPPETEER_SERVICE_URL = "http://localhost:3000"

PUPPETEER_LOCAL = False
67 changes: 67 additions & 0 deletions examples/spiders/playwright_test.py
@@ -0,0 +1,67 @@
from logging import ERROR

import scrapy
from scrapy.utils.log import failure_to_exc_info
from twisted.python.failure import Failure

from scrapypuppeteer import (
PuppeteerRequest,
PuppeteerResponse,
PuppeteerScreenshotResponse,
)
from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll


class PlaywrightSpider(scrapy.Spider):
"""
This is mostly a Compose spider, but it is very convenient for Playwright testing.
"""

name = "playwright_test"

custom_settings = {
"DOWNLOADER_MIDDLEWARES": {
"scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
},
"DOWNLOAD_HANDLERS": {
"http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
"https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
},
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"EXECUTION_METHOD": "playwright",
}

def start_requests(self):
goto = GoTo("https://pptr.dev")
click_1 = Click(
"#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)"
)
click_2 = Click(
"#__docusaurus_skipToContent_fallback > div > div > aside > div > "
"div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a"
)
click = Compose(click_1, click_2)
scroll = Scroll()
screenshot = Screenshot(options={"full_page": True, "type": "jpeg"})

compose_action = Compose(
goto,
click,
scroll,
screenshot,
)

yield PuppeteerRequest(
compose_action,
callback=self.parse,
errback=self.errback,
close_page=True,
)

def parse(self, response: PuppeteerResponse):
assert isinstance(response, PuppeteerScreenshotResponse)
self.log("Spider worked fine!")

def errback(self, failure: Failure):
print(failure)
self.log(failure_to_exc_info(failure), level=ERROR)
5 changes: 1 addition & 4 deletions requirements.txt
@@ -1,5 +1,2 @@
scrapy>=2.6
pyppeteer
syncer
bs4
playwright
playwright
86 changes: 79 additions & 7 deletions scrapypuppeteer/browser_managers/__init__.py
@@ -1,17 +1,89 @@
__all__ = ["BrowserManager"]
__all__ = ["BrowserManager", "ContextManager"]

import uuid
from abc import ABC, abstractmethod
from collections.abc import Coroutine
from typing import Dict, Union

from scrapy import Request
from scrapy.utils.defer import deferred_from_coro
from twisted.internet.defer import Deferred

from scrapypuppeteer import CloseContextRequest


class ContextManager(ABC):
def __init__(self, browser):
self.browser = browser
self.contexts: Dict[str, ...] = {}
self.pages: Dict[str, ...] = {}
self.context2page: Dict[str, str] = {}

@classmethod
@abstractmethod
async def async_init(cls): ...

@staticmethod
@abstractmethod
async def _create_context(browser): ...

@staticmethod
@abstractmethod
async def _create_page(context): ...

async def check_context_and_page(self, context_id, page_id):
if not context_id or not page_id:
context_id, page_id = await self.open_new_page()
return context_id, page_id

async def open_new_page(self):
context_id = uuid.uuid4().hex.upper()
page_id = uuid.uuid4().hex.upper()

self.contexts[context_id] = await self._create_context(self.browser)
self.pages[page_id] = await self._create_page(self.contexts[context_id])
self.context2page[context_id] = page_id

return context_id, page_id

def get_page_by_id(self, context_id, page_id):
return self.pages[page_id]

async def close_browser(self):
if self.browser:
await self.browser.close()

async def close_contexts(self, request: CloseContextRequest):
for context_id in request.contexts:
if context_id in self.contexts:
await self.contexts[context_id].close()
page_id = self.context2page.get(context_id)
self.pages.pop(page_id, None)

del self.contexts[context_id]
del self.context2page[context_id]


class BrowserManager(ABC):
@abstractmethod
def process_request(self, request, spider):
pass
def _download_request(
self, request: Request, spider
) -> Union[Coroutine, Request]: ...

@abstractmethod
def close_used_contexts(self):
pass
async def _start_browser_manager(self) -> None: ...

@abstractmethod
def process_response(self, middleware, request, response, spider):
pass
async def _stop_browser_manager(self) -> None: ...

def download_request(self, request: Request, spider) -> Union[Deferred, Request]:
coro_or_request = self._download_request(request, spider)
if isinstance(coro_or_request, Coroutine):
return deferred_from_coro(coro_or_request)
return coro_or_request

def start_browser_manager(self) -> Deferred:
return deferred_from_coro(self._start_browser_manager())

def stop_browser_manager(self) -> Deferred:
return deferred_from_coro(self._stop_browser_manager())
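
The abstract `ContextManager` leaves the browser-specific pieces (`async_init`, `_create_context`, `_create_page`) to concrete subclasses, while `BrowserManager.download_request` bridges their coroutines into Twisted `Deferred`s via `deferred_from_coro`. A minimal sketch of how a Playwright-backed subclass could fill in those hooks (illustrative only; the class name and launch options are assumptions, not necessarily what `playwright_browser_manager.py` in this PR does):

```python
# Illustrative sketch only; the real PlaywrightBrowserManager in this PR may differ.
from playwright.async_api import async_playwright

from scrapypuppeteer.browser_managers import ContextManager


class SketchPlaywrightContextManager(ContextManager):
    @classmethod
    async def async_init(cls):
        # Start Playwright and launch a single shared browser for all contexts.
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(headless=True)
        return cls(browser)

    @staticmethod
    async def _create_context(browser):
        # One isolated browser context per scrapy-puppeteer context id.
        return await browser.new_context()

    @staticmethod
    async def _create_page(context):
        # A fresh page inside the given context.
        return await context.new_page()
```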
76 changes: 76 additions & 0 deletions scrapypuppeteer/browser_managers/browser_downloader_handler.py
@@ -0,0 +1,76 @@
from scrapy import signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred

from scrapypuppeteer import CloseContextRequest
from scrapypuppeteer.browser_managers import BrowserManager
from scrapypuppeteer.browser_managers.playwright_browser_manager import (
PlaywrightBrowserManager,
)
from scrapypuppeteer.browser_managers.service_browser_manager import (
ServiceBrowserManager,
)
from scrapypuppeteer.request import ActionRequest


class BrowserDownloaderHandler(HTTPDownloadHandler):
"""
Browser downloader handler.
If instantiated, executes actions in the browser with provided requests.
If given, then installed reactor must be `AsyncioSelectorReactor`.

Currently, supports 3 browser types via EXECUTION_METHOD setting:
* puppeteer -- see scrapy-puppeteer-service
* pyppeteer -- not available, since the package is not actively supported
* playwright -- see https://playwright.dev/python/
"""

EXECUTION_METHOD_SETTING = "EXECUTION_METHOD"

def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None:
super().__init__(settings, crawler=crawler)
verify_installed_reactor(
"twisted.internet.asyncioreactor.AsyncioSelectorReactor"
)

self.browser_manager = browser_manager

@classmethod
def from_crawler(cls, crawler: Crawler):
settings = crawler.settings

execution_method = crawler.settings.get(
cls.EXECUTION_METHOD_SETTING, "PUPPETEER"
).lower()

match execution_method:
case "puppeteer":
browser_manager = ServiceBrowserManager()
case "pyppeteer":
raise ValueError(
"Currently, pyppeteer cannot run on some machines since it is not actively supported."
)
case "playwright":
browser_manager = PlaywrightBrowserManager()
case _:
raise ValueError(
f"Invalid execution method: {execution_method.upper()}"
)

bdh = cls(settings, browser_manager, crawler=crawler)
crawler.signals.connect(
bdh.browser_manager.start_browser_manager, signals.spider_opened
) # This makes the start VERY slow
crawler.signals.connect(
bdh.browser_manager.stop_browser_manager, signals.engine_stopped
)
return bdh

def download_request(self, request, spider):
if isinstance(request, (ActionRequest, CloseContextRequest)):
dfd_or_request = self.browser_manager.download_request(request, spider)
if isinstance(dfd_or_request, Deferred):
return dfd_or_request
return super().download_request(request, spider)
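
With this handler installed, only `ActionRequest` and `CloseContextRequest` objects are routed to the browser manager; every other request falls through to the standard `HTTPDownloadHandler` path. A hedged sketch of a spider mixing both kinds of traffic (spider name and URLs are placeholders; it assumes the settings from the README are configured project-wide):

```python
import scrapy

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo


class MixedTrafficSpider(scrapy.Spider):
    name = "mixed_traffic"  # placeholder name

    def start_requests(self):
        # Plain HTTP request: served by the inherited HTTPDownloadHandler path.
        yield scrapy.Request("https://example.com", callback=self.parse_plain)
        # Browser request: the middleware converts it into an ActionRequest,
        # which this handler forwards to the configured browser manager.
        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse_browser)

    def parse_plain(self, response):
        self.log(f"plain response: {response.status}")

    def parse_browser(self, response):
        self.log(f"browser response: {response.url}")
```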