From 0d99f780e7e5856176380820eb271ecaae3c23c4 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 14 Nov 2024 16:56:42 +0300 Subject: [PATCH 01/21] add skip line in requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7846630..7e351a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ scrapy>=2.6 pyppeteer syncer bs4 -playwright \ No newline at end of file +playwright From 87babe50bbefc1a23d1faef32e20d3876a7277df Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 14 Nov 2024 17:33:54 +0300 Subject: [PATCH 02/21] Revert "Local mode (#34)" This reverts commit f0f96410 --- README.md | 1 - scrapypuppeteer/browser_managers/__init__.py | 17 -- scrapypuppeteer/middleware.py | 225 +++++++++++++++---- setup.py | 7 +- 4 files changed, 189 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 7daee5a..545c586 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,6 @@ There is a parent `PuppeteerResponse` class from which other response classes ar Here is a list of them all: - `PuppeteerHtmlResponse` - has `html` and `cookies` properties - `PuppeteerScreenshotResponse` - has `screenshot` property -- `PuppeteerHarResponse` - has `har` property - `PuppeteerJsonResponse` - has `data` property and `to_html()` method which tries to transform itself to `PuppeteerHtmlResponse` - `PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse)` - has `recaptcha_data` property diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py index c7f77b3..e69de29 100644 --- a/scrapypuppeteer/browser_managers/__init__.py +++ b/scrapypuppeteer/browser_managers/__init__.py @@ -1,17 +0,0 @@ -__all__ = ["BrowserManager"] - -from abc import ABC, abstractmethod - - -class BrowserManager(ABC): - @abstractmethod - def process_request(self, request, spider): - pass - - @abstractmethod - def close_used_contexts(self): - pass - - @abstractmethod - def process_response(self, middleware, request, response, spider): - pass diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index b051ed0..f5419ff 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,33 +1,36 @@ +import json import logging from collections import defaultdict from typing import List, Union +from urllib.parse import urlencode, urljoin from scrapy import signals from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured +from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider +from scrapy.http import Headers, TextResponse, Response +from scrapy.utils.log import failure_to_exc_info +from twisted.python.failure import Failure from scrapypuppeteer.actions import ( Click, - CustomJsAction, + GoBack, + GoForward, + GoTo, RecaptchaSolver, Screenshot, Scroll, + CustomJsAction, + Har ) -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - PlaywrightBrowserManager, -) -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, -) -from scrapypuppeteer.browser_managers.service_browser_manager import ( - ServiceBrowserManager, -) -from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest from scrapypuppeteer.response import ( - PuppeteerHtmlResponse, PuppeteerResponse, + PuppeteerHtmlResponse, + PuppeteerScreenshotResponse, + 
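# ----- editor's note (illustrative, not part of the patch) -----------------
# The README hunk above enumerates the PuppeteerResponse subclasses and their
# properties. A minimal spider sketch showing how they are consumed; the
# spider name and target URL are assumptions for illustration, while
# PuppeteerRequest and GoTo are the library's own public classes:
from scrapy import Spider

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo


class ExampleSpider(Spider):
    name = "example"

    def start_requests(self):
        # GoTo drives the managed browser to the URL before the callback runs
        yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)

    def parse(self, response):
        # a PuppeteerHtmlResponse carries `html` and `cookies` (see README above)
        self.logger.info("fetched %d bytes of html", len(response.html))
# ---------------------------------------------------------------------------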
PuppeteerHarResponse, + PuppeteerRecaptchaSolverResponse, + PuppeteerJsonResponse, ) +from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest class PuppeteerServiceDownloaderMiddleware: @@ -63,8 +66,6 @@ class PuppeteerServiceDownloaderMiddleware: SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" - service_logger = logging.getLogger(__name__) def __init__( @@ -73,18 +74,18 @@ def __init__( service_url: str, include_headers: Union[bool, List[str]], include_meta: bool, - browser_manager: BrowserManager, ): self.service_base_url = service_url self.include_headers = include_headers self.include_meta = include_meta self.crawler = crawler self.used_contexts = defaultdict(set) - self.browser_manager = browser_manager @classmethod def from_crawler(cls, crawler): service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + # if service_url is None: + # raise ValueError("Puppeteer service URL must be provided") if cls.INCLUDE_HEADERS_SETTING in crawler.settings: try: include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) @@ -93,35 +94,181 @@ def from_crawler(cls, crawler): else: include_headers = cls.DEFAULT_INCLUDE_HEADERS include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + middleware = cls(crawler, service_url, include_headers, include_meta) + crawler.signals.connect( + middleware.close_used_contexts, signal=signals.spider_idle + ) + return middleware - execution_method = crawler.settings.get( - cls.EXECUTION_METHOD_SETTING, "PUPPETEER" - ).lower() + def process_request(self, request, **_): + if isinstance(request, CloseContextRequest): + return self.process_close_context_request(request) + + if isinstance(request, PuppeteerRequest): + return self.process_puppeteer_request(request) - if execution_method == "pyppeteer": - browser_manager = PyppeteerBrowserManager() - elif execution_method == "puppeteer": - browser_manager = ServiceBrowserManager( - service_url, include_meta, include_headers, crawler + def process_close_context_request(self, request: CloseContextRequest): + if not request.is_valid_url: + return request.replace( + url=urljoin(self.service_base_url, "/close_context"), ) - elif execution_method == "playwright": - browser_manager = PlaywrightBrowserManager() - else: - raise NameError("Wrong EXECUTION_METHOD") - middleware = cls( - crawler, service_url, include_headers, include_meta, browser_manager - ) - crawler.signals.connect( - middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + def process_puppeteer_request(self, request: PuppeteerRequest): + action = request.action + service_url = urljoin(self.service_base_url, action.endpoint) + service_params = self._encode_service_params(request) + if service_params: + service_url += "?" 
+ service_params + + meta = { + "puppeteer_request": request, + "dont_obey_robotstxt": True, + "proxy": None, + } + if self.include_meta: + meta = {**request.meta, **meta} + + return ActionRequest( + url=service_url, + action=action, + method="POST", + headers=Headers({"Content-Type": action.content_type}), + body=self._serialize_body(action, request), + dont_filter=True, + cookies=request.cookies, + priority=request.priority, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta=meta, ) - return middleware - def process_request(self, request, spider): - return self.browser_manager.process_request(request) + @staticmethod + def _encode_service_params(request): + service_params = {} + if request.context_id is not None: + service_params["contextId"] = request.context_id + if request.page_id is not None: + service_params["pageId"] = request.page_id + if request.close_page: + service_params["closePage"] = 1 + return urlencode(service_params) + + def _serialize_body(self, action, request): + payload = action.payload() + if action.content_type == "application/json": + if isinstance(payload, dict): + # disallow null values in top-level request parameters + payload = {k: v for k, v in payload.items() if v is not None} + proxy = request.meta.get("proxy") + if proxy: + payload["proxy"] = proxy + include_headers = ( + self.include_headers + if request.include_headers is None + else request.include_headers + ) + if include_headers: + headers = request.headers.to_unicode_dict() + if isinstance(include_headers, list): + headers = { + h.lower(): headers[h] for h in include_headers if h in headers + } + payload["headers"] = headers + return json.dumps(payload) + return str(payload) def process_response(self, request, response, spider): - return self.browser_manager.process_response(self, request, response, spider) + if not isinstance(response, TextResponse): + return response + + puppeteer_request = request.meta.get("puppeteer_request") + if puppeteer_request is None: + return response + + if b"application/json" not in response.headers.get(b"Content-Type", b""): + return response.replace(request=request) + + response_data = json.loads(response.text) + if response.status != 200: + reason = response_data.pop("error", f"undefined, status {response.status}") + self.service_logger.warning( + f"Request {request} is not succeeded. 
Reason: {reason}" + ) + context_id = response_data.get("contextId") + if context_id: + self.used_contexts[id(spider)].add(context_id) + return response + + response_cls = self._get_response_class(puppeteer_request.action) + + return self._form_response( + response_cls, + response_data, + puppeteer_request.url, + request, + puppeteer_request, + spider, + ) + + def _form_response( + self, response_cls, response_data, url, request, puppeteer_request, spider + ): + context_id = response_data.pop("contextId", puppeteer_request.context_id) + page_id = response_data.pop("pageId", puppeteer_request.page_id) + + self.used_contexts[id(spider)].add(context_id) + + return response_cls( + url=url, + puppeteer_request=puppeteer_request, + context_id=context_id, + page_id=page_id, + request=request, + **response_data, + ) + + @staticmethod + def _get_response_class(request_action): + if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)): + return PuppeteerHtmlResponse + if isinstance(request_action, Screenshot): + return PuppeteerScreenshotResponse + if isinstance(request_action, Har): + return PuppeteerHarResponse + if isinstance(request_action, RecaptchaSolver): + return PuppeteerRecaptchaSolverResponse + return PuppeteerJsonResponse + + def close_used_contexts(self, spider): + contexts = list(self.used_contexts.pop(id(spider), set())) + if contexts: + request = CloseContextRequest( + contexts, + meta={"proxy": None}, + ) + + def handle_close_contexts_result(result): + if isinstance(result, Response): + if result.status == 200: + self.service_logger.debug( + f"Successfully closed {len(request.contexts)} " + f"contexts with request {result.request}" + ) + else: + self.service_logger.warning( + f"Could not close contexts: {result.text}" + ) + elif isinstance(result, Failure): + self.service_logger.warning( + f"Could not close contexts: {result.value}", + exc_info=failure_to_exc_info(result), + ) + + dfd = self.crawler.engine.download(request) + dfd.addBoth(handle_close_contexts_result) + + raise DontCloseSpider() class PuppeteerRecaptchaDownloaderMiddleware: diff --git a/setup.py b/setup.py index b9b7750..e005f87 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,15 @@ from setuptools import find_packages, setup -def read_long_description(file_path): - with open(file_path, "r") as file: - return file.read() +with open("README.md", "r") as readme: + long_description = readme.read() setup( name="scrapy-puppeteer-client", version="0.3.8", description="A library to use Puppeteer-managed browser in Scrapy spiders", - long_description=read_long_description("README.md"), + long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/ispras/scrapy-puppeteer", author="MODIS @ ISP RAS", From c94a0d29d85c033a774cbeb2ef3b3653e4c4a270 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 14 Nov 2024 17:53:49 +0300 Subject: [PATCH 03/21] additional merging from service_browser_manager --- scrapypuppeteer/middleware.py | 37 ++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index f5419ff..c7c430d 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -13,9 +13,12 @@ from scrapypuppeteer.actions import ( Click, + Compose, + FillForm, GoBack, GoForward, GoTo, + Har, RecaptchaSolver, Screenshot, Scroll, @@ -157,9 +160,7 @@ def _encode_service_params(request): def _serialize_body(self, action, request): payload = action.payload() if 
action.content_type == "application/json": - if isinstance(payload, dict): - # disallow null values in top-level request parameters - payload = {k: v for k, v in payload.items() if v is not None} + payload = self.__clean_payload(payload) proxy = request.meta.get("proxy") if proxy: payload["proxy"] = proxy @@ -178,6 +179,18 @@ def _serialize_body(self, action, request): return json.dumps(payload) return str(payload) + def __clean_payload(self, payload): + """ + disallow null values in request parameters + """ + if isinstance(payload, dict): + payload = { + k: self.__clean_payload(v) for k, v in payload.items() if v is not None + } + elif isinstance(payload, list): + payload = [self.__clean_payload(v) for v in payload if v is not None] + return payload + def process_response(self, request, response, spider): if not isinstance(response, TextResponse): return response @@ -212,7 +225,13 @@ def process_response(self, request, response, spider): ) def _form_response( - self, response_cls, response_data, url, request, puppeteer_request, spider + self, + response_cls, + response_data, + url, + request, + puppeteer_request, + spider, ): context_id = response_data.pop("contextId", puppeteer_request.context_id) page_id = response_data.pop("pageId", puppeteer_request.page_id) @@ -228,9 +247,10 @@ def _form_response( **response_data, ) - @staticmethod - def _get_response_class(request_action): - if isinstance(request_action, (GoTo, GoForward, GoBack, Click, Scroll)): + def _get_response_class(self, request_action): + if isinstance( + request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm) + ): return PuppeteerHtmlResponse if isinstance(request_action, Screenshot): return PuppeteerScreenshotResponse @@ -238,6 +258,9 @@ def _get_response_class(request_action): return PuppeteerHarResponse if isinstance(request_action, RecaptchaSolver): return PuppeteerRecaptchaSolverResponse + if isinstance(request_action, Compose): + # Response class is a last action's response class + return self._get_response_class(request_action.actions[-1]) return PuppeteerJsonResponse def close_used_contexts(self, spider): From 7bbc8fc4dcedc865c37dca4bdd7720520d61afd9 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 14 Nov 2024 17:56:48 +0300 Subject: [PATCH 04/21] linter --- scrapypuppeteer/middleware.py | 17 ++++++++--------- setup.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index c7c430d..e33ec73 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -6,14 +6,15 @@ from scrapy import signals from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider -from scrapy.http import Headers, TextResponse, Response +from scrapy.exceptions import DontCloseSpider, IgnoreRequest, NotConfigured +from scrapy.http import Headers, Response, TextResponse from scrapy.utils.log import failure_to_exc_info from twisted.python.failure import Failure from scrapypuppeteer.actions import ( Click, Compose, + CustomJsAction, FillForm, GoBack, GoForward, @@ -22,18 +23,16 @@ RecaptchaSolver, Screenshot, Scroll, - CustomJsAction, - Har ) +from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest from scrapypuppeteer.response import ( - PuppeteerResponse, - PuppeteerHtmlResponse, - PuppeteerScreenshotResponse, PuppeteerHarResponse, - PuppeteerRecaptchaSolverResponse, + PuppeteerHtmlResponse, PuppeteerJsonResponse, + PuppeteerRecaptchaSolverResponse, + 
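# ----- editor's note (standalone sketch) -----------------------------------
# The __clean_payload helper added above strips None values at every nesting
# level before the action payload is serialized, not just at the top level as
# before. The same idea as a free function (the name is illustrative):
def clean_payload(payload):
    """Recursively drop None values from nested dicts and lists."""
    if isinstance(payload, dict):
        return {k: clean_payload(v) for k, v in payload.items() if v is not None}
    if isinstance(payload, list):
        return [clean_payload(v) for v in payload if v is not None]
    return payload


# clean_payload({"a": 1, "b": None, "c": {"d": None, "e": [None, 2]}})
# -> {"a": 1, "c": {"e": [2]}}
# ---------------------------------------------------------------------------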
PuppeteerResponse, + PuppeteerScreenshotResponse, ) -from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest class PuppeteerServiceDownloaderMiddleware: diff --git a/setup.py b/setup.py index e005f87..ba1d4de 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ from setuptools import find_packages, setup - with open("README.md", "r") as readme: long_description = readme.read() From 2f2135844a0213adcad2b3ba3537716d09715abf Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 18 Nov 2024 12:36:19 +0300 Subject: [PATCH 05/21] Smth changed --- scrapypuppeteer/browser_managers/__init__.py | 21 ++ .../browser_downloader_handler.py | 55 +++++ .../playwright_browser_manager.py | 2 +- .../pyppeteer_browser_manager.py | 2 +- .../service_browser_manager.py | 209 +----------------- 5 files changed, 82 insertions(+), 207 deletions(-) create mode 100644 scrapypuppeteer/browser_managers/browser_downloader_handler.py diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py index e69de29..2648d5c 100644 --- a/scrapypuppeteer/browser_managers/__init__.py +++ b/scrapypuppeteer/browser_managers/__init__.py @@ -0,0 +1,21 @@ +__all__ = ["BrowserManager"] + +from abc import ABC, abstractmethod +from collections.abc import Coroutine +from typing import Union + +from scrapy import Request + + +class BrowserManager(ABC): + @abstractmethod + def download_request(self, request: Request, spider) -> Union[Coroutine, Request]: + ... + + # @abstractmethod + # def close_used_contexts(self): + # ... + # + # @abstractmethod + # def process_response(self, middleware, request, response, spider): + # ... diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py new file mode 100644 index 0000000..eda570e --- /dev/null +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -0,0 +1,55 @@ +from collections.abc import Coroutine + +from scrapy.core.downloader.handlers.http import HTTPDownloadHandler +from scrapy.crawler import Crawler +from scrapy.exceptions import NotConfigured +from scrapy.utils.defer import deferred_from_coro +from scrapy.utils.reactor import verify_installed_reactor + +from scrapypuppeteer import CloseContextRequest +from scrapypuppeteer.browser_managers import BrowserManager +from scrapypuppeteer.browser_managers.playwright_browser_manager import PlaywrightBrowserManager +from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager +from scrapypuppeteer.browser_managers.service_browser_manager import ServiceBrowserManager +from scrapypuppeteer.request import ActionRequest + + +class BrowserDownloaderHandler(HTTPDownloadHandler): + """ + docstring: TODO + """ + + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" + + def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None: + super().__init__(settings, crawler=crawler) + verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") + + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler: Crawler): + settings = crawler.settings + + execution_method = crawler.settings.get( + cls.EXECUTION_METHOD_SETTING, "PUPPETEER" + ).lower() + + match execution_method: + case "puppeteer": + browser_manager = ServiceBrowserManager() + case "pyppeteer": + browser_manager = PyppeteerBrowserManager() + case "playwright": + browser_manager = PlaywrightBrowserManager() + case _: + raise 
ValueError(f"Invalid execution method: {execution_method.upper()}") + + return cls(settings, browser_manager, crawler=crawler) + + def download_request(self, request, spider): + if isinstance(request, (ActionRequest, CloseContextRequest)): + coro_or_request = self.browser_manager.download_request(request, spider) + if isinstance(coro_or_request, Coroutine): + return deferred_from_coro(coro_or_request) + return super().download_request(request, spider) diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py index 1228e29..f3172e8 100644 --- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py +++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py @@ -74,7 +74,7 @@ def __init__(self): "fill_form": self.fill_form, } - def process_request(self, request): + def download_request(self, request, spider): if isinstance(request, PuppeteerRequest): endpoint = request.action.endpoint action_function = self.action_map.get(endpoint) diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py index 6998e0c..7872fc8 100644 --- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py +++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py @@ -70,7 +70,7 @@ def __init__(self): "fill_form": self.fill_form, } - def process_request(self, request): + def download_request(self, request, spider): if isinstance(request, PuppeteerRequest): endpoint = request.action.endpoint action_function = self.action_map.get(endpoint) diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py index f016f14..8f98baa 100644 --- a/scrapypuppeteer/browser_managers/service_browser_manager.py +++ b/scrapypuppeteer/browser_managers/service_browser_manager.py @@ -21,212 +21,11 @@ Scroll, ) from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest -from scrapypuppeteer.response import ( - PuppeteerHarResponse, - PuppeteerHtmlResponse, - PuppeteerJsonResponse, - PuppeteerRecaptchaSolverResponse, - PuppeteerScreenshotResponse, -) class ServiceBrowserManager(BrowserManager): - def __init__(self, service_base_url, include_meta, include_headers, crawler): - self.service_base_url = service_base_url - self.include_meta = include_meta - self.include_headers = include_headers - self.used_contexts = defaultdict(set) - self.service_logger = logging.getLogger(__name__) - self.crawler = crawler - - if self.service_base_url is None: - raise ValueError("Puppeteer service URL must be provided") - - def process_request(self, request): - if isinstance(request, CloseContextRequest): - return self.process_close_context_request(request) - - if isinstance(request, PuppeteerRequest): - return self.process_puppeteer_request(request) - - def process_close_context_request(self, request: CloseContextRequest): - if not request.is_valid_url: - return request.replace( - url=urljoin(self.service_base_url, "/close_context"), - ) - - def process_puppeteer_request(self, request: PuppeteerRequest): - action = request.action - service_url = urljoin(self.service_base_url, action.endpoint) - service_params = self._encode_service_params(request) - if service_params: - service_url += "?" 
+ service_params - meta = { - "puppeteer_request": request, - "dont_obey_robotstxt": True, - "proxy": None, - } - if self.include_meta: - meta = {**request.meta, **meta} - action_request = ActionRequest( - url=service_url, - action=action, - method="POST", - headers=Headers({"Content-Type": action.content_type}), - body=self._serialize_body(action, request), - dont_filter=True, - cookies=request.cookies, - priority=request.priority, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta=meta, - ) - return action_request - - @staticmethod - def _encode_service_params(request): - service_params = {} - if request.context_id is not None: - service_params["contextId"] = request.context_id - if request.page_id is not None: - service_params["pageId"] = request.page_id - if request.close_page: - service_params["closePage"] = 1 - return urlencode(service_params) - - def _serialize_body(self, action, request): - payload = action.payload() - if action.content_type == "application/json": - payload = self.__clean_payload(payload) - proxy = request.meta.get("proxy") - if proxy: - payload["proxy"] = proxy - include_headers = ( - self.include_headers - if request.include_headers is None - else request.include_headers - ) - if include_headers: - headers = request.headers.to_unicode_dict() - if isinstance(include_headers, list): - headers = { - h.lower(): headers[h] for h in include_headers if h in headers - } - payload["headers"] = headers - return json.dumps(payload) - return str(payload) - - def __clean_payload(self, payload): - """ - disallow null values in request parameters - """ - if isinstance(payload, dict): - payload = { - k: self.__clean_payload(v) for k, v in payload.items() if v is not None - } - elif isinstance(payload, list): - payload = [self.__clean_payload(v) for v in payload if v is not None] - return payload - - def close_used_contexts(self, spider): - contexts = list(self.used_contexts.pop(id(spider), set())) - if contexts: - request = CloseContextRequest( - contexts, - meta={"proxy": None}, - ) - - def handle_close_contexts_result(result): - if isinstance(result, Response): - if result.status == 200: - self.service_logger.debug( - f"Successfully closed {len(request.contexts)} " - f"contexts with request {result.request}" - ) - else: - self.service_logger.warning( - f"Could not close contexts: {result.text}" - ) - elif isinstance(result, Failure): - self.service_logger.warning( - f"Could not close contexts: {result.value}", - exc_info=failure_to_exc_info(result), - ) - - dfd = self.crawler.engine.download(request) - dfd.addBoth(handle_close_contexts_result) - - raise DontCloseSpider() - - def process_response(self, middleware, request, response, spider): - if not isinstance(response, TextResponse): - return response - - puppeteer_request = request.meta.get("puppeteer_request") - if puppeteer_request is None: - return response - - if b"application/json" not in response.headers.get(b"Content-Type", b""): - return response.replace(request=request) - - response_data = json.loads(response.text) - if response.status != 200: - reason = response_data.pop("error", f"undefined, status {response.status}") - middleware.service_logger.warning( - f"Request {request} is not succeeded. 
Reason: {reason}" - ) - context_id = response_data.get("contextId") - if context_id: - self.used_contexts[id(spider)].add(context_id) - return response - - response_cls = self._get_response_class(puppeteer_request.action) - - return self._form_response( - response_cls, - response_data, - puppeteer_request.url, - request, - puppeteer_request, - spider, - ) - - def _form_response( - self, - response_cls, - response_data, - url, - request, - puppeteer_request, - spider, - ): - context_id = response_data.pop("contextId", puppeteer_request.context_id) - page_id = response_data.pop("pageId", puppeteer_request.page_id) - self.used_contexts[id(spider)].add(context_id) - - return response_cls( - url=url, - puppeteer_request=puppeteer_request, - context_id=context_id, - page_id=page_id, - request=request, - **response_data, - ) + def __init__(self): + super().__init__() - def _get_response_class(self, request_action): - if isinstance( - request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm) - ): - return PuppeteerHtmlResponse - if isinstance(request_action, Screenshot): - return PuppeteerScreenshotResponse - if isinstance(request_action, Har): - return PuppeteerHarResponse - if isinstance(request_action, RecaptchaSolver): - return PuppeteerRecaptchaSolverResponse - if isinstance(request_action, Compose): - # Response class is a last action's response class - return self._get_response_class(request_action.actions[-1]) - return PuppeteerJsonResponse + def download_request(self, request, spider): + return request From 3b0a3154c71016274fc8b95008e5bbf0539e56ec Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 18 Nov 2024 13:18:52 +0300 Subject: [PATCH 06/21] Changed playwright browser_manager --- .../playwright_browser_manager.py | 332 +++++++++--------- .../service_browser_manager.py | 24 +- 2 files changed, 161 insertions(+), 195 deletions(-) diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py index f3172e8..6f1263b 100644 --- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py +++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py @@ -1,12 +1,14 @@ import asyncio import base64 import uuid +from typing import Dict, Callable, Coroutine import syncer -from playwright.async_api import async_playwright +from playwright.async_api import async_playwright, Browser +from scrapypuppeteer import PuppeteerResponse, PuppeteerRequest from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest +from scrapypuppeteer.request import CloseContextRequest, ActionRequest from scrapypuppeteer.response import ( PuppeteerHtmlResponse, PuppeteerScreenshotResponse, @@ -14,13 +16,19 @@ class ContextManager: - def __init__(self): - self.browser = syncer.sync(self.launch_browser()) + def __init__(self, browser: Browser): + self.browser = browser self.contexts = {} self.pages = {} self.context_page_map = {} - async def launch_browser(self): + @classmethod + async def async_init(cls): + browser = await cls.launch_browser() + return cls(browser) + + @staticmethod + async def launch_browser(): playwright = await async_playwright().start() return await playwright.chromium.launch(headless=False) @@ -42,9 +50,9 @@ async def open_new_page(self): def get_page_by_id(self, context_id, page_id): return self.pages[page_id] - def close_browser(self): + async def close_browser(self): if self.browser: - syncer.sync(self.browser.close()) + await 
self.browser.close() def close_contexts(self, request: CloseContextRequest): for context_id in request.contexts: @@ -58,9 +66,9 @@ def close_contexts(self, request: CloseContextRequest): class PlaywrightBrowserManager(BrowserManager): - def __init__(self): - self.context_manager = ContextManager() - self.action_map = { + def __init__(self, context_manager: ContextManager): + self.context_manager = context_manager + self.action_map: Dict[str, Callable[[ActionRequest], Coroutine[PuppeteerResponse]]] = { "goto": self.goto, "click": self.click, "compose": self.compose, @@ -74,8 +82,13 @@ def __init__(self): "fill_form": self.fill_form, } + @classmethod + async def async_init(cls): + context_manager = await ContextManager.async_init() + return cls(context_manager) + def download_request(self, request, spider): - if isinstance(request, PuppeteerRequest): + if isinstance(request, ActionRequest): endpoint = request.action.endpoint action_function = self.action_map.get(endpoint) if action_function: @@ -90,9 +103,6 @@ def close_contexts(self, request: CloseContextRequest): def close_used_contexts(self): self.context_manager.close_browser() - def process_response(self, middleware, request, response, spider): - return response - def map_navigation_options(self, navigation_options): if not navigation_options: return {} @@ -121,6 +131,8 @@ def map_navigation_options(self, navigation_options): ) elif isinstance(waitUntil, str): strictest_event = waitUntil + else: + raise TypeError(f"waitUntil should be a list or a string, got {type(waitUntil)}") if strictest_event in event_map: mapped_navigation_options["wait_until"] = event_map[strictest_event] @@ -177,201 +189,177 @@ async def wait_with_options(self, page, wait_options): elif timeout: await asyncio.sleep(timeout / 1000) - def get_page_from_request(self, request): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id + async def get_page_from_request(self, request: ActionRequest): + pptr_request: PuppeteerRequest = request.meta["puppeteer_request"] + context_id, page_id = await self.context_manager.check_context_and_page( + pptr_request.context_id, pptr_request.page_id ) - ) return ( self.context_manager.get_page_by_id(context_id, page_id), context_id, page_id, ) - def goto(self, request: PuppeteerRequest): + async def goto(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_goto(): - url = request.action.payload()["url"] - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.goto(url, **navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_goto()) + url = request.action.payload()["url"] + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options + ) + await page.goto(url, **navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + return PuppeteerHtmlResponse( + url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=response_html, + 
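# ----- editor's note (standalone sketch) -----------------------------------
# ContextManager.async_init above exists because __init__ cannot be awaited:
# browser startup is asynchronous, so construction moves into an async factory
# classmethod that awaits the launch and then builds the instance. The pattern
# in isolation, assuming only the playwright package:
import asyncio

from playwright.async_api import Browser, async_playwright


class AsyncConstructed:
    def __init__(self, browser: Browser):
        self.browser = browser  # __init__ receives only ready dependencies

    @classmethod
    async def create(cls) -> "AsyncConstructed":
        playwright = await async_playwright().start()
        browser = await playwright.chromium.launch(headless=True)
        return cls(browser)


# usage: instance = asyncio.run(AsyncConstructed.create())
# ---------------------------------------------------------------------------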
cookies=cookies, + ) - def click(self, request: PuppeteerRequest): + async def click(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_click(): - selector = request.action.payload().get("selector") - cookies = request.cookies - click_options = self.map_click_options(request.action.click_options) - await page.click(selector, **click_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_click()) + selector = request.action.payload().get("selector") + cookies = request.cookies + click_options = self.map_click_options(request.action.click_options) + await page.click(selector, **click_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + return PuppeteerHtmlResponse( + request.url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=response_html, + cookies=cookies, + ) - def go_back(self, request: PuppeteerRequest): + async def go_back(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_go_back(): - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.go_back(**navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_back()) + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options + ) + await page.go_back(**navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + return PuppeteerHtmlResponse( + request.url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=response_html, + cookies=cookies, + ) - def go_forward(self, request: PuppeteerRequest): + async def go_forward(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_go_forward(): - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.go_forward(**navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_forward()) + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options + ) + await page.go_forward(**navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + return PuppeteerHtmlResponse( + request.url, + 
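# ----- editor's note (standalone sketch) -----------------------------------
# The navigation methods above funnel Puppeteer-style options through
# map_navigation_options. The full event table is outside the visible hunks,
# so the mapping below is an assumption based on the two libraries' documented
# event names (Playwright collapses networkidle0/networkidle2 into one event):
PUPPETEER_TO_PLAYWRIGHT_EVENT = {
    "load": "load",
    "domcontentloaded": "domcontentloaded",
    "networkidle0": "networkidle",
    "networkidle2": "networkidle",
}


def map_wait_until(navigation_options: dict) -> dict:
    """Translate {'waitUntil': ..., 'timeout': ...} into Playwright kwargs."""
    mapped = {}
    wait_until = navigation_options.get("waitUntil")
    if isinstance(wait_until, str):
        mapped["wait_until"] = PUPPETEER_TO_PLAYWRIGHT_EVENT[wait_until]
    if "timeout" in navigation_options:
        mapped["timeout"] = navigation_options["timeout"]
    return mapped
# ---------------------------------------------------------------------------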
request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=await page.content(), + cookies=cookies, + ) - def screenshot(self, request: PuppeteerRequest): + async def screenshot(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_screenshot(): - screenshot_options = request.action.options or {} - screenshot_bytes = await page.screenshot( - **self.map_screenshot_options(screenshot_options) - ) - screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") - return PuppeteerScreenshotResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - screenshot=screenshot_base64, - ) - - return syncer.sync(async_screenshot()) + screenshot_options = request.action.options or {} + screenshot_bytes = await page.screenshot( + **self.map_screenshot_options(screenshot_options) + ) + screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") + return PuppeteerScreenshotResponse( + request.url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + screenshot=screenshot_base64, + ) - def scroll(self, request: PuppeteerRequest): + async def scroll(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_scroll(): - cookies = request.cookies - selector = request.action.payload().get("selector", None) + cookies = request.cookies + selector = request.action.payload().get("selector", None) - if selector: - script = f""" - document.querySelector('{selector}').scrollIntoView(); - """ - else: - script = """ - window.scrollBy(0, document.body.scrollHeight); - """ - await page.evaluate(script) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_scroll()) + if selector: + script = f""" + document.querySelector('{selector}').scrollIntoView(); + """ + else: + script = """ + window.scrollBy(0, document.body.scrollHeight); + """ + + await page.evaluate(script) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + return PuppeteerHtmlResponse( + request.url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=await page.content(), + cookies=cookies, + ) - def fill_form(self, request: PuppeteerRequest): + async def fill_form(self, request: ActionRequest): page, context_id, page_id = self.get_page_from_request(request) - async def async_fill_form(): - input_mapping = request.action.payload().get("inputMapping") - submit_button = request.action.payload().get("submitButton", None) - cookies = request.cookies - - for selector, params in input_mapping.items(): - text = params.get("value", None) - delay = params.get("delay", 0) - await page.type(selector, text=text, delay=delay) - - if submit_button: - await page.click(submit_button) - - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_fill_form()) + input_mapping = request.action.payload().get("inputMapping") + submit_button = request.action.payload().get("submitButton", None) + cookies = request.cookies + + for selector, 
params in input_mapping.items(): + text = params.get("value", None) + delay = params.get("delay", 0) + await page.type(selector, text=text, delay=delay) + + if submit_button: + await page.click(submit_button) + + return PuppeteerHtmlResponse( + request.url, + request.meta["puppeteer_request"], + context_id=context_id, + page_id=page_id, + html=await page.content(), + cookies=cookies, + ) - def compose(self, request: PuppeteerRequest): + async def compose(self, request: ActionRequest): _, context_id, page_id = self.get_page_from_request(request) request.page_id = page_id request.context_id = context_id for action in request.action.actions: - response = self.action_map[action.endpoint](request.replace(action=action)) - return response.replace(puppeteer_request=request) + response = await self.action_map[action.endpoint](request.replace(action=action)) + return response.replace(puppeteer_request=request.meta["puppeteer_request"]) - def action(self, request: PuppeteerRequest): + async def action(self, request: ActionRequest): raise ValueError("CustomJsAction is not available in local mode") - def recaptcha_solver(self, request: PuppeteerRequest): + async def recaptcha_solver(self, request: ActionRequest): raise ValueError("RecaptchaSolver is not available in local mode") - def har(self, request: PuppeteerRequest): + async def har(self, request: ActionRequest): raise ValueError("Har is not available in local mode") diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py index 8f98baa..600e5f8 100644 --- a/scrapypuppeteer/browser_managers/service_browser_manager.py +++ b/scrapypuppeteer/browser_managers/service_browser_manager.py @@ -1,26 +1,4 @@ -import json -import logging -from collections import defaultdict -from urllib.parse import urlencode, urljoin - -from scrapy.exceptions import DontCloseSpider -from scrapy.http import Headers, Response, TextResponse -from scrapy.utils.log import failure_to_exc_info -from twisted.python.failure import Failure - -from scrapypuppeteer.actions import ( - Click, - Compose, - FillForm, - GoBack, - GoForward, - GoTo, - Har, - RecaptchaSolver, - Screenshot, - Scroll, -) -from scrapypuppeteer.browser_managers import BrowserManager +from . import BrowserManager class ServiceBrowserManager(BrowserManager): From b113c69a14fa444c4dae22b3a896488eb6d68c98 Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 18 Nov 2024 16:54:32 +0300 Subject: [PATCH 07/21] PlayWrightBrowserManager is now ready! --- scrapypuppeteer/browser_managers/__init__.py | 30 ++++++++++---- .../browser_downloader_handler.py | 23 ++++++----- .../playwright_browser_manager.py | 40 ++++++++++++------- .../pyppeteer_browser_manager.py | 2 +- .../service_browser_manager.py | 8 +++- 5 files changed, 67 insertions(+), 36 deletions(-) diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py index 2648d5c..7504dbe 100644 --- a/scrapypuppeteer/browser_managers/__init__.py +++ b/scrapypuppeteer/browser_managers/__init__.py @@ -5,17 +5,31 @@ from typing import Union from scrapy import Request +from scrapy.utils.defer import deferred_from_coro +from twisted.internet.defer import Deferred class BrowserManager(ABC): @abstractmethod - def download_request(self, request: Request, spider) -> Union[Coroutine, Request]: + def _download_request(self, request: Request, spider) -> Union[Coroutine, Request]: ... - # @abstractmethod - # def close_used_contexts(self): - # ... 
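# ----- editor's note (standalone sketch) -----------------------------------
# The concrete download_request just below bridges asyncio and Twisted:
# browser work is an asyncio coroutine, but Scrapy's engine consumes Twisted
# Deferreds, so coroutines are wrapped with scrapy.utils.defer.deferred_from_coro.
# The bridge in isolation:
from collections.abc import Coroutine

from scrapy.utils.defer import deferred_from_coro


def as_deferred(result):
    """Wrap a coroutine for the Twisted reactor; pass other results through."""
    if isinstance(result, Coroutine):
        # requires the AsyncioSelectorReactor, which the download handler
        # verifies at startup with verify_installed_reactor
        return deferred_from_coro(result)
    return result
# ---------------------------------------------------------------------------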
- # - # @abstractmethod - # def process_response(self, middleware, request, response, spider): - # ... + @abstractmethod + async def _start_browser_manager(self) -> None: + ... + + @abstractmethod + async def _stop_browser_manager(self) -> None: + ... + + def download_request(self, request: Request, spider) -> Union[Deferred, Request]: + coro_or_request = self._download_request(request, spider) + if isinstance(coro_or_request, Coroutine): + return deferred_from_coro(coro_or_request) + return coro_or_request + + def start_browser_manager(self) -> Deferred: + return deferred_from_coro(self._start_browser_manager()) + + def stop_browser_manager(self) -> Deferred: + return deferred_from_coro(self._stop_browser_manager()) diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index eda570e..6588be6 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -1,15 +1,13 @@ -from collections.abc import Coroutine - from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler -from scrapy.exceptions import NotConfigured -from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import verify_installed_reactor +from scrapy import signals +from twisted.internet.defer import Deferred from scrapypuppeteer import CloseContextRequest from scrapypuppeteer.browser_managers import BrowserManager from scrapypuppeteer.browser_managers.playwright_browser_manager import PlaywrightBrowserManager -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager +# from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager from scrapypuppeteer.browser_managers.service_browser_manager import ServiceBrowserManager from scrapypuppeteer.request import ActionRequest @@ -38,18 +36,21 @@ def from_crawler(cls, crawler: Crawler): match execution_method: case "puppeteer": browser_manager = ServiceBrowserManager() - case "pyppeteer": - browser_manager = PyppeteerBrowserManager() + # case "pyppeteer": + # browser_manager = PyppeteerBrowserManager() case "playwright": browser_manager = PlaywrightBrowserManager() case _: raise ValueError(f"Invalid execution method: {execution_method.upper()}") - return cls(settings, browser_manager, crawler=crawler) + bdh = cls(settings, browser_manager, crawler=crawler) + crawler.signals.connect(bdh.browser_manager.start_browser_manager, signals.engine_started) + crawler.signals.connect(bdh.browser_manager.stop_browser_manager, signals.engine_stopped) + return bdh def download_request(self, request, spider): if isinstance(request, (ActionRequest, CloseContextRequest)): - coro_or_request = self.browser_manager.download_request(request, spider) - if isinstance(coro_or_request, Coroutine): - return deferred_from_coro(coro_or_request) + dfd_or_request = self.browser_manager.download_request(request, spider) + if isinstance(dfd_or_request, Deferred): + return dfd_or_request return super().download_request(request, spider) diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py index 6f1263b..df39972 100644 --- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py +++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py @@ -1,10 +1,11 @@ import asyncio import base64 import uuid -from typing import 
Dict, Callable, Coroutine +from typing import Dict, Callable, Awaitable, Union import syncer from playwright.async_api import async_playwright, Browser +from scrapy.http import TextResponse from scrapypuppeteer import PuppeteerResponse, PuppeteerRequest from scrapypuppeteer.browser_managers import BrowserManager @@ -54,10 +55,10 @@ async def close_browser(self): if self.browser: await self.browser.close() - def close_contexts(self, request: CloseContextRequest): + async def close_contexts(self, request: CloseContextRequest): for context_id in request.contexts: if context_id in self.contexts: - syncer.sync(self.contexts[context_id].close()) + await self.contexts[context_id].close() page_id = self.context_page_map.get(context_id) self.pages.pop(page_id, None) @@ -66,9 +67,9 @@ def close_contexts(self, request: CloseContextRequest): class PlaywrightBrowserManager(BrowserManager): - def __init__(self, context_manager: ContextManager): - self.context_manager = context_manager - self.action_map: Dict[str, Callable[[ActionRequest], Coroutine[PuppeteerResponse]]] = { + def __init__(self): + self.context_manager: Union[ContextManager, None] = None # Will be initialized later + self.action_map: Dict[str, Callable[[ActionRequest], Awaitable[PuppeteerResponse]]] = { "goto": self.goto, "click": self.click, "compose": self.compose, @@ -82,12 +83,14 @@ def __init__(self, context_manager: ContextManager): "fill_form": self.fill_form, } - @classmethod - async def async_init(cls): - context_manager = await ContextManager.async_init() - return cls(context_manager) + async def _start_browser_manager(self) -> None: + self.context_manager = await ContextManager.async_init() + + async def _stop_browser_manager(self) -> None: + if self.context_manager: + await self.context_manager.close_browser() - def download_request(self, request, spider): + def _download_request(self, request, spider): if isinstance(request, ActionRequest): endpoint = request.action.endpoint action_function = self.action_map.get(endpoint) @@ -97,11 +100,18 @@ def download_request(self, request, spider): if isinstance(request, CloseContextRequest): return self.close_contexts(request) - def close_contexts(self, request: CloseContextRequest): - self.context_manager.close_contexts(request) + async def close_contexts(self, request: CloseContextRequest) -> TextResponse: + await self.context_manager.close_contexts(request) + return TextResponse( + request.url, + encoding="utf-8", + status=200, + headers={}, + body=b"Successfully closed context", + ) - def close_used_contexts(self): - self.context_manager.close_browser() + async def close_used_contexts(self): + await self.context_manager.close_browser() def map_navigation_options(self, navigation_options): if not navigation_options: diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py index 7872fc8..00fee21 100644 --- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py +++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py @@ -70,7 +70,7 @@ def __init__(self): "fill_form": self.fill_form, } - def download_request(self, request, spider): + def _download_request(self, request, spider): if isinstance(request, PuppeteerRequest): endpoint = request.action.endpoint action_function = self.action_map.get(endpoint) diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py index 600e5f8..9cafa92 100644 --- 
a/scrapypuppeteer/browser_managers/service_browser_manager.py +++ b/scrapypuppeteer/browser_managers/service_browser_manager.py @@ -5,5 +5,11 @@ class ServiceBrowserManager(BrowserManager): def __init__(self): super().__init__() - def download_request(self, request, spider): + def _download_request(self, request, spider): return request + + def _start_browser_manager(self) -> None: + return + + def _stop_browser_manager(self) -> None: + return From a537f6b1b1d86bd6b8c8020eb1a3f923351d1cc5 Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 18 Nov 2024 17:07:35 +0300 Subject: [PATCH 08/21] added async --- scrapypuppeteer/browser_managers/service_browser_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py index 9cafa92..22051c1 100644 --- a/scrapypuppeteer/browser_managers/service_browser_manager.py +++ b/scrapypuppeteer/browser_managers/service_browser_manager.py @@ -8,8 +8,8 @@ def __init__(self): def _download_request(self, request, spider): return request - def _start_browser_manager(self) -> None: + async def _start_browser_manager(self) -> None: return - def _stop_browser_manager(self) -> None: + async def _stop_browser_manager(self) -> None: return From bbdf23d8a8fd5f0b755b2a8b774ef9d824dd8bcb Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 21 Nov 2024 12:56:33 +0300 Subject: [PATCH 09/21] working playwright? --- scrapypuppeteer/browser_managers/__init__.py | 11 +- .../browser_downloader_handler.py | 27 ++- .../playwright_browser_manager.py | 174 +++++++++--------- 3 files changed, 112 insertions(+), 100 deletions(-) diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py index 7504dbe..2f17f20 100644 --- a/scrapypuppeteer/browser_managers/__init__.py +++ b/scrapypuppeteer/browser_managers/__init__.py @@ -11,16 +11,15 @@ class BrowserManager(ABC): @abstractmethod - def _download_request(self, request: Request, spider) -> Union[Coroutine, Request]: - ... + def _download_request( + self, request: Request, spider + ) -> Union[Coroutine, Request]: ... @abstractmethod - async def _start_browser_manager(self) -> None: - ... + async def _start_browser_manager(self) -> None: ... @abstractmethod - async def _stop_browser_manager(self) -> None: - ... + async def _stop_browser_manager(self) -> None: ... 
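# ----- editor's note (standalone sketch) -----------------------------------
# PATCH 08 above turns the ServiceBrowserManager no-ops into `async def`,
# presumably because the base class feeds _start_browser_manager() and
# _stop_browser_manager() into deferred_from_coro: making them coroutines
# keeps startup/shutdown uniformly awaitable across managers even when there
# is nothing to do locally (the remote service owns the browser). Illustration:
import asyncio


class NoOpLifecycle:
    async def _start_browser_manager(self) -> None:
        return  # nothing to start locally; the puppeteer service owns the browser

    async def _stop_browser_manager(self) -> None:
        return


# the coroutine form still runs cleanly under asyncio:
# asyncio.run(NoOpLifecycle()._start_browser_manager())
# ---------------------------------------------------------------------------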
def download_request(self, request: Request, spider) -> Union[Deferred, Request]:
    coro_or_request = self._download_request(request, spider)

diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
index 6588be6..3695561 100644
--- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py
+++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
@@ -6,22 +6,29 @@
 from scrapypuppeteer import CloseContextRequest
 from scrapypuppeteer.browser_managers import BrowserManager
-from scrapypuppeteer.browser_managers.playwright_browser_manager import PlaywrightBrowserManager
+from scrapypuppeteer.browser_managers.playwright_browser_manager import (
+    PlaywrightBrowserManager,
+)
+
 # from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager
-from scrapypuppeteer.browser_managers.service_browser_manager import ServiceBrowserManager
+from scrapypuppeteer.browser_managers.service_browser_manager import (
+    ServiceBrowserManager,
+)
 from scrapypuppeteer.request import ActionRequest


 class BrowserDownloaderHandler(HTTPDownloadHandler):
     """
-    docstring: TODO
+    docstring: TODO
     """

     EXECUTION_METHOD_SETTING = "EXECUTION_METHOD"

     def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None:
         super().__init__(settings, crawler=crawler)
-        verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
+        verify_installed_reactor(
+            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+        )
         self.browser_manager = browser_manager

@@ -41,11 +48,17 @@ def from_crawler(cls, crawler: Crawler):
             case "playwright":
                 browser_manager = PlaywrightBrowserManager()
             case _:
-                raise ValueError(f"Invalid execution method: {execution_method.upper()}")
+                raise ValueError(
+                    f"Invalid execution method: {execution_method.upper()}"
+                )

         bdh = cls(settings, browser_manager, crawler=crawler)
-        crawler.signals.connect(bdh.browser_manager.start_browser_manager, signals.engine_started)
-        crawler.signals.connect(bdh.browser_manager.stop_browser_manager, signals.engine_stopped)
+        crawler.signals.connect(
+            bdh.browser_manager.start_browser_manager, signals.engine_started
+        )  # This makes the start VERY slow
+        crawler.signals.connect(
+            bdh.browser_manager.stop_browser_manager, signals.engine_stopped
+        )
         return bdh

     def download_request(self, request, spider):

diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index df39972..028c46e 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -2,9 +2,9 @@
 import base64
 import uuid
 from typing import Dict, Callable, Awaitable, Union
+from dataclasses import dataclass

-import syncer
-from playwright.async_api import async_playwright, Browser
+from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 from scrapy.http import TextResponse

 from scrapypuppeteer import PuppeteerResponse, PuppeteerRequest
@@ -16,12 +16,19 @@
 )


+@dataclass
+class BrowserPage:
+    context_id: str
+    page_id: str
+    page: Page
+
+
 class ContextManager:
     def __init__(self, browser: Browser):
         self.browser = browser
-        self.contexts = {}
-        self.pages = {}
-        self.context_page_map = {}
+        self.contexts: Dict[str, BrowserContext] = {}
+        self.pages: Dict[str, BrowserPage] = {}
+        self.context2page: Dict[str, str] = {}

     @classmethod
     async def async_init(cls):
@@ -43,8 +50,8 @@ async def open_new_page(self):
         page_id = uuid.uuid4().hex.upper()

         self.contexts[context_id] = await self.browser.new_context()
-        self.pages[page_id] = await self.contexts[context_id].new_page()
-        self.context_page_map[context_id] = page_id
+        self.pages[page_id] = BrowserPage(context_id, page_id, await self.contexts[context_id].new_page())
+        self.context2page[context_id] = page_id

         return context_id, page_id

@@ -59,17 +66,21 @@ async def close_contexts(self, request: CloseContextRequest):
         for context_id in request.contexts:
             if context_id in self.contexts:
                 await self.contexts[context_id].close()
-                page_id = self.context_page_map.get(context_id)
+                page_id = self.context2page.get(context_id)
                 self.pages.pop(page_id, None)

                 del self.contexts[context_id]
-                del self.context_page_map[context_id]
+                del self.context2page[context_id]


 class PlaywrightBrowserManager(BrowserManager):
     def __init__(self):
-        self.context_manager: Union[ContextManager, None] = None  # Will be initialized later
-        self.action_map: Dict[str, Callable[[ActionRequest], Awaitable[PuppeteerResponse]]] = {
+        self.context_manager: Union[ContextManager, None] = (
+            None  # Will be initialized later
+        )
+        self.action_map: Dict[
+            str, Callable[[BrowserPage, ActionRequest], Awaitable[PuppeteerResponse]]
+        ] = {
             "goto": self.goto,
             "click": self.click,
             "compose": self.compose,
@@ -83,6 +94,13 @@ def __init__(self):
             "fill_form": self.fill_form,
         }

+    def _download_request(self, request, spider):
+        if isinstance(request, ActionRequest):
+            return self.__make_action(request)
+
+        if isinstance(request, CloseContextRequest):
+            return self.close_contexts(request)
+
     async def _start_browser_manager(self) -> None:
         self.context_manager = await ContextManager.async_init()

@@ -90,15 +108,13 @@ async def _stop_browser_manager(self) -> None:
         if self.context_manager:
             await self.context_manager.close_browser()

-    def _download_request(self, request, spider):
-        if isinstance(request, ActionRequest):
-            endpoint = request.action.endpoint
-            action_function = self.action_map.get(endpoint)
-            if action_function:
-                return action_function(request)
-
-        if isinstance(request, CloseContextRequest):
-            return self.close_contexts(request)
+    async def __make_action(self, request):
+        endpoint = request.action.endpoint
+        action_function = self.action_map.get(endpoint)
+        if action_function:
+            page = await self.get_page_from_request(request)
+            return action_function(page, request)
+        raise ValueError(f"No such action: {endpoint}")

     async def close_contexts(self, request: CloseContextRequest) -> TextResponse:
         await self.context_manager.close_contexts(request)
@@ -142,7 +158,9 @@ def map_navigation_options(self, navigation_options):
         elif isinstance(waitUntil, str):
             strictest_event = waitUntil
         else:
-            raise TypeError(f"waitUntil should be a list or a string, got {type(waitUntil)}")
+            raise TypeError(
+                f"waitUntil should be a list or a string, got {type(waitUntil)}"
+            )

         if strictest_event in event_map:
             mapped_navigation_options["wait_until"] = event_map[strictest_event]
@@ -202,112 +220,96 @@ async def wait_with_options(self, page, wait_options):
     async def get_page_from_request(self, request: ActionRequest):
         pptr_request: PuppeteerRequest = request.meta["puppeteer_request"]
         context_id, page_id = await self.context_manager.check_context_and_page(
-            pptr_request.context_id, pptr_request.page_id
-        )
-        return (
-            self.context_manager.get_page_by_id(context_id, page_id),
-            context_id,
-            page_id,
+            pptr_request.context_id, pptr_request.page_id
         )
+        return self.context_manager.get_page_by_id(context_id, page_id)

-    async def goto(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def goto(self, page: BrowserPage, request: ActionRequest):
         url = request.action.payload()["url"]
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.goto(url, **navigation_options)
+        await page.page.goto(url, **navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page, wait_options)
-        response_html = await page.content()
+        await self.wait_with_options(page.page, wait_options)
+        response_html = await page.page.content()
         return PuppeteerHtmlResponse(
             url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
+            context_id=page.context_id,
+            page_id=page.page_id,
             html=response_html,
             cookies=cookies,
         )

-    async def click(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def click(self, page: BrowserPage, request: ActionRequest):
         selector = request.action.payload().get("selector")
         cookies = request.cookies
         click_options = self.map_click_options(request.action.click_options)
-        await page.click(selector, **click_options)
+        await page.page.click(selector, **click_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page, wait_options)
-        response_html = await page.content()
+        await self.wait_with_options(page.page, wait_options)
+        response_html = await page.page.content()
         return PuppeteerHtmlResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
+            context_id=page.context_id,
+            page_id=page.page_id,
             html=response_html,
             cookies=cookies,
         )

-    async def go_back(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def go_back(self, page: BrowserPage, request: ActionRequest):
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.go_back(**navigation_options)
+        await page.page.go_back(**navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page, wait_options)
-        response_html = await page.content()
+        await self.wait_with_options(page.page, wait_options)
+        response_html = await page.page.content()
         return PuppeteerHtmlResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
+            context_id=page.context_id,
+            page_id=page.page_id,
             html=response_html,
             cookies=cookies,
         )

-    async def go_forward(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def go_forward(self, page: BrowserPage, request: ActionRequest):
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.go_forward(**navigation_options)
+        await page.page.go_forward(**navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page, wait_options)
+        await self.wait_with_options(page.page, wait_options)
         return PuppeteerHtmlResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
-            html=await page.content(),
+            context_id=page.context_id,
+            page_id=page.page_id,
+            html=await page.page.content(),
             cookies=cookies,
         )

-    async def screenshot(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def screenshot(self, page: BrowserPage, request: ActionRequest):
         screenshot_options = request.action.options or {}
-        screenshot_bytes = await page.screenshot(
+        screenshot_bytes = await page.page.screenshot(
             **self.map_screenshot_options(screenshot_options)
         )
         screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
         return PuppeteerScreenshotResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
+            context_id=page.context_id,
+            page_id=page.page_id,
             screenshot=screenshot_base64,
         )

-    async def scroll(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    async def scroll(self, page: BrowserPage, request: ActionRequest):
         cookies = request.cookies
         selector = request.action.payload().get("selector", None)

@@ -320,21 +322,20 @@ async def scroll(self, request: ActionRequest):
             window.scrollBy(0, document.body.scrollHeight);
             """
-        await page.evaluate(script)
+        await page.page.evaluate(script)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page, wait_options)
+        await self.wait_with_options(page.page, wait_options)
         return PuppeteerHtmlResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
-            html=await page.content(),
+            context_id=page.context_id,
+            page_id=page.page_id,
+            html=await page.page.content(),
             cookies=cookies,
         )

-    async def fill_form(self, request: ActionRequest):
-        page, context_id, page_id = self.get_page_from_request(request)
-
+    @staticmethod
+    async def fill_form(page: BrowserPage, request: ActionRequest):
         input_mapping = request.action.payload().get("inputMapping")
         submit_button = request.action.payload().get("submitButton", None)
         cookies = request.cookies
@@ -342,27 +343,26 @@ async def fill_form(self, request: ActionRequest):
         for selector, params in input_mapping.items():
             text = params.get("value", None)
             delay = params.get("delay", 0)
-            await page.type(selector, text=text, delay=delay)
+            await page.page.type(selector, text=text, delay=delay)

         if submit_button:
-            await page.click(submit_button)
+            await page.page.click(submit_button)

         return PuppeteerHtmlResponse(
             request.url,
             request.meta["puppeteer_request"],
-            context_id=context_id,
-            page_id=page_id,
-            html=await page.content(),
+            context_id=page.context_id,
+            page_id=page.page_id,
+            html=await page.page.content(),
             cookies=cookies,
         )

-    async def compose(self, request: ActionRequest):
-        _, context_id, page_id = self.get_page_from_request(request)
-        request.page_id = page_id
-        request.context_id = context_id
-
+    async def compose(self, page: BrowserPage, request: ActionRequest):
         for action in request.action.actions:
-            response = await self.action_map[action.endpoint](request.replace(action=action))
+            response = await self.action_map[action.endpoint](
+                page,
+                request.replace(action=action),
+            )
         return response.replace(puppeteer_request=request.meta["puppeteer_request"])

     async def action(self, request: ActionRequest):

From 7fefd7fca345cdee5056348ec8b10d336e0755ce Mon Sep 17 00:00:00 2001
From: matthew
Date: Thu, 21 Nov 2024 12:57:16 +0300
Subject: [PATCH 10/21] working playwright?
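
Import and formatting cleanup that gets the local Playwright path into a
runnable state. A minimal sketch of how this mode can be exercised from a
project's settings (the values mirror the example settings introduced
later in this series; treat it as an illustration, not part of this
commit's diff):

    # settings.py (sketch)
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
    DOWNLOAD_HANDLERS = {
        "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
        "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
    }
    EXECUTION_METHOD = "playwright"

The handler verifies that the asyncio reactor is installed, so the
TWISTED_REACTOR line is required for the download handler to load.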
---
 .../browser_managers/browser_downloader_handler.py |  2 +-
 .../browser_managers/playwright_browser_manager.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
index 3695561..a692ab3 100644
--- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py
+++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
@@ -1,7 +1,7 @@
+from scrapy import signals
 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.crawler import Crawler
 from scrapy.utils.reactor import verify_installed_reactor
-from scrapy import signals
 from twisted.internet.defer import Deferred

 from scrapypuppeteer import CloseContextRequest

diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index 028c46e..d4c43d3 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -1,15 +1,15 @@
 import asyncio
 import base64
 import uuid
-from typing import Dict, Callable, Awaitable, Union
 from dataclasses import dataclass
+from typing import Awaitable, Callable, Dict, Union

-from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+from playwright.async_api import Browser, BrowserContext, Page, async_playwright
 from scrapy.http import TextResponse

-from scrapypuppeteer import PuppeteerResponse, PuppeteerRequest
+from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse
 from scrapypuppeteer.browser_managers import BrowserManager
-from scrapypuppeteer.request import CloseContextRequest, ActionRequest
+from scrapypuppeteer.request import ActionRequest, CloseContextRequest
 from scrapypuppeteer.response import (
     PuppeteerHtmlResponse,
     PuppeteerScreenshotResponse,
@@ -50,7 +50,9 @@ async def open_new_page(self):
         page_id = uuid.uuid4().hex.upper()

         self.contexts[context_id] = await self.browser.new_context()
-        self.pages[page_id] = BrowserPage(context_id, page_id, await self.contexts[context_id].new_page())
+        self.pages[page_id] = BrowserPage(
+            context_id, page_id, await self.contexts[context_id].new_page()
+        )
         self.context2page[context_id] = page_id

         return context_id, page_id

From a5dde45256252334617afc5122192f1301010ca8 Mon Sep 17 00:00:00 2001
From: matthew
Date: Thu, 21 Nov 2024 16:05:46 +0300
Subject: [PATCH 11/21] exceptions in playwright!
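
Browser-side failures no longer raise out of the downloader: each action
is wrapped in try/except and serialized into a JSON TextResponse with
status 500 that still carries "contextId" and "pageId" alongside the
traceback captured via format_exc(). A rough, illustrative sketch of
detecting that envelope (not part of this diff, and the middleware may
transform the response before a spider sees it):

    import json

    def is_browser_error(response) -> bool:
        """True if the downloader serialized a browser-side failure."""
        if response.status != 500:
            return False
        data = json.loads(response.text)
        return "error" in data and "contextId" in data and "pageId" in data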
---
 .../playwright_browser_manager.py | 185 +++++++++---------
 scrapypuppeteer/middleware.py     |   2 +
 2 files changed, 93 insertions(+), 94 deletions(-)

diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index d4c43d3..6664968 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -2,18 +2,16 @@
 import base64
 import uuid
 from dataclasses import dataclass
-from typing import Awaitable, Callable, Dict, Union
+from typing import Awaitable, Callable, Dict, Union, Any
+from json import dumps
+from traceback import format_exc

 from playwright.async_api import Browser, BrowserContext, Page, async_playwright
 from scrapy.http import TextResponse

-from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse
+from scrapypuppeteer import PuppeteerRequest
 from scrapypuppeteer.browser_managers import BrowserManager
 from scrapypuppeteer.request import ActionRequest, CloseContextRequest
-from scrapypuppeteer.response import (
-    PuppeteerHtmlResponse,
-    PuppeteerScreenshotResponse,
-)


 @dataclass
@@ -81,7 +79,7 @@ def __init__(self):
             None  # Will be initialized later
         )
         self.action_map: Dict[
-            str, Callable[[BrowserPage, ActionRequest], Awaitable[PuppeteerResponse]]
+            str, Callable[[Page, ActionRequest], Awaitable[Dict[str, Any]]]
         ] = {
             "goto": self.goto,
             "click": self.click,
@@ -98,8 +96,7 @@ def __init__(self):

     def _download_request(self, request, spider):
         if isinstance(request, ActionRequest):
-            return self.__make_action(request)
-
+            return self.__perform_action(request)
         if isinstance(request, CloseContextRequest):
             return self.close_contexts(request)

@@ -110,12 +107,31 @@ async def _stop_browser_manager(self) -> None:
         if self.context_manager:
             await self.context_manager.close_browser()

-    async def __make_action(self, request):
+    async def __perform_action(self, request):
         endpoint = request.action.endpoint
         action_function = self.action_map.get(endpoint)
         if action_function:
             page = await self.get_page_from_request(request)
-            return action_function(page, request)
+
+            try:
+                response_data = await action_function(page.page, request)
+            except:
+                return TextResponse(
+                    request.url,
+                    headers={"Content-Type": "application/json"},
+                    body=dumps({"error": format_exc(), "contextId": page.context_id, "pageId": page.page_id}),
+                    status=500,
+                    encoding="utf-8",
+                )
+
+            response_data["contextId"] = page.context_id
+            response_data["pageId"] = page.page_id
+            return TextResponse(
+                request.url,
+                headers={"Content-Type": "application/json"},
+                body=dumps(response_data),
+                encoding="utf-8",
+            )
         raise ValueError(f"No such action: {endpoint}")

     async def close_contexts(self, request: CloseContextRequest) -> TextResponse:
         await self.context_manager.close_contexts(request)
@@ -226,92 +242,77 @@ async def get_page_from_request(self, request: ActionRequest):
         )
         return self.context_manager.get_page_by_id(context_id, page_id)

-    async def goto(self, page: BrowserPage, request: ActionRequest):
+    async def goto(self, page: Page, request: ActionRequest):
         url = request.action.payload()["url"]
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.page.goto(url, **navigation_options)
+        await page.goto(url, **navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page.page, wait_options)
-        response_html = await page.page.content()
-        return PuppeteerHtmlResponse(
-            url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=response_html,
-            cookies=cookies,
-        )
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    async def click(self, page: BrowserPage, request: ActionRequest):
+    async def click(self, page: Page, request: ActionRequest):
         selector = request.action.payload().get("selector")
         cookies = request.cookies
         click_options = self.map_click_options(request.action.click_options)
-        await page.page.click(selector, **click_options)
+        await page.click(selector, **click_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page.page, wait_options)
-        response_html = await page.page.content()
-        return PuppeteerHtmlResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=response_html,
-            cookies=cookies,
-        )
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    async def go_back(self, page: BrowserPage, request: ActionRequest):
+    async def go_back(self, page: Page, request: ActionRequest):
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.page.go_back(**navigation_options)
+        await page.go_back(**navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page.page, wait_options)
-        response_html = await page.page.content()
-        return PuppeteerHtmlResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=response_html,
-            cookies=cookies,
-        )
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    async def go_forward(self, page: BrowserPage, request: ActionRequest):
+    async def go_forward(self, page: Page, request: ActionRequest):
         cookies = request.cookies
         navigation_options = self.map_navigation_options(
             request.action.navigation_options
         )
-        await page.page.go_forward(**navigation_options)
+        await page.go_forward(**navigation_options)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page.page, wait_options)
-        return PuppeteerHtmlResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=await page.page.content(),
-            cookies=cookies,
-        )
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    async def screenshot(self, page: BrowserPage, request: ActionRequest):
+    async def screenshot(self, page: Page, request: ActionRequest):
         screenshot_options = request.action.options or {}
-        screenshot_bytes = await page.page.screenshot(
+        screenshot_bytes = await page.screenshot(
             **self.map_screenshot_options(screenshot_options)
         )
         screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
-        return PuppeteerScreenshotResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            screenshot=screenshot_base64,
-        )
+        return {
+            "screenshot": screenshot_base64,
+        }

-    async def scroll(self, page: BrowserPage, request: ActionRequest):
+    async def scroll(self, page: Page, request: ActionRequest):
         cookies = request.cookies
         selector = request.action.payload().get("selector", None)

@@ -324,20 +325,18 @@ async def scroll(self, page: BrowserPage, request: ActionRequest):
             window.scrollBy(0, document.body.scrollHeight);
             """
-        await page.page.evaluate(script)
+        await page.evaluate(script)
         wait_options = request.action.payload().get("waitOptions", {}) or {}
-        await self.wait_with_options(page.page, wait_options)
-        return PuppeteerHtmlResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=await page.page.content(),
-            cookies=cookies,
-        )
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

     @staticmethod
-    async def fill_form(page: BrowserPage, request: ActionRequest):
+    async def fill_form(page: Page, request: ActionRequest):
         input_mapping = request.action.payload().get("inputMapping")
         submit_button = request.action.payload().get("submitButton", None)
         cookies = request.cookies
@@ -345,33 +344,31 @@ async def fill_form(page: BrowserPage, request: ActionRequest):
         for selector, params in input_mapping.items():
             text = params.get("value", None)
             delay = params.get("delay", 0)
-            await page.page.type(selector, text=text, delay=delay)
+            await page.type(selector, text=text, delay=delay)

         if submit_button:
-            await page.page.click(submit_button)
+            await page.click(submit_button)

-        return PuppeteerHtmlResponse(
-            request.url,
-            request.meta["puppeteer_request"],
-            context_id=page.context_id,
-            page_id=page.page_id,
-            html=await page.page.content(),
-            cookies=cookies,
-        )
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

     async def compose(self, page: BrowserPage, request: ActionRequest):
         for action in request.action.actions:
-            response = await self.action_map[action.endpoint](
+            response_data = await self.action_map[action.endpoint](
                 page,
                 request.replace(action=action),
             )
-        return response.replace(puppeteer_request=request.meta["puppeteer_request"])
+        return response_data

     async def action(self, request: ActionRequest):
-        raise ValueError("CustomJsAction is not available in local mode")
+        raise NotImplementedError("CustomJsAction is not available in local mode")

     async def recaptcha_solver(self, request: ActionRequest):
-        raise ValueError("RecaptchaSolver is not available in local mode")
+        raise NotImplementedError("RecaptchaSolver is not available in local mode")

     async def har(self, request: ActionRequest):
-        raise ValueError("Har is not available in local mode")
+        raise NotImplementedError("Har is not available in local mode")

diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py
index e33ec73..0417f70 100644
--- a/scrapypuppeteer/middleware.py
+++ b/scrapypuppeteer/middleware.py
@@ -286,6 +286,8 @@ def handle_close_contexts_result(result):
                     f"Could not close contexts: {result.value}",
                     exc_info=failure_to_exc_info(result),
                 )
+            else:
+                print(f"Not a Response or Failure: {type(result)}, {result}")

         dfd = self.crawler.engine.download(request)
         dfd.addBoth(handle_close_contexts_result)

From 7bfb2464a9f2ec3bb6a017ebdffec51d676c6b87 Mon Sep 17 00:00:00 2001
From: matthew
Date: Thu, 21 Nov 2024 16:06:18 +0300
Subject: [PATCH 12/21] ruff

---
 .../browser_managers/playwright_browser_manager.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index 6664968..a462a95 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -2,9 +2,9 @@
 import base64
 import uuid
 from dataclasses import dataclass
-from typing import Awaitable, Callable, Dict, Union, Any
 from json import dumps
 from traceback import format_exc
+from typing import Any, Awaitable, Callable, Dict, Union

 from playwright.async_api import Browser, BrowserContext, Page, async_playwright
 from scrapy.http import TextResponse
@@ -119,7 +119,13 @@ async def __perform_action(self, request):
                 return TextResponse(
                     request.url,
                     headers={"Content-Type": "application/json"},
-                    body=dumps({"error": format_exc(), "contextId": page.context_id, "pageId": page.page_id}),
+                    body=dumps(
+                        {
+                            "error": format_exc(),
+                            "contextId": page.context_id,
+                            "pageId": page.page_id,
+                        }
+                    ),
                     status=500,
                     encoding="utf-8",
                 )

From 61e82f979626d8cf4327f44dba5565def9dc46f8 Mon Sep 17 00:00:00 2001
From: matthew
Date: Thu, 28 Nov 2024 14:54:55 +0300
Subject: [PATCH 13/21] pyppeteer

---
 examples/spiders/compose.py                  |   5 +
 scrapypuppeteer/browser_managers/__init__.py |  65 ++-
 .../browser_downloader_handler.py            |   8 +-
 .../playwright_browser_manager.py            | 100 ++--
 .../pyppeteer_browser_manager.py             | 431 +++++++-----------
 5 files changed, 270 insertions(+), 339 deletions(-)

diff --git a/examples/spiders/compose.py b/examples/spiders/compose.py
index b0af7ad..d037752 100644
--- a/examples/spiders/compose.py
+++ b/examples/spiders/compose.py
@@ -19,6 +19,11 @@ class ComposeSpider(scrapy.Spider):
         "DOWNLOADER_MIDDLEWARES": {
             "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
         },
+        "DOWNLOAD_HANDLERS": {
+            "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
+            "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
+        },
+        "EXECUTION_METHOD": "pyppeteer",
     }

     def start_requests(self):

diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py
index 2f17f20..1d87054 100644
--- a/scrapypuppeteer/browser_managers/__init__.py
+++ b/scrapypuppeteer/browser_managers/__init__.py
@@ -1,19 +1,78 @@
-__all__ = ["BrowserManager"]
+__all__ = ["BrowserManager", "ContextManager"]

+import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Coroutine
-from typing import Union
+from typing import Union, Dict

 from scrapy import Request
 from scrapy.utils.defer import deferred_from_coro
 from twisted.internet.defer import Deferred

+from scrapypuppeteer import CloseContextRequest
+
+
+class ContextManager(ABC):
+    def __init__(self, browser):
+        self.browser = browser
+        self.contexts: Dict[str, ...] = {}
+        self.pages: Dict[str, ...] = {}
+        self.context2page: Dict[str, str] = {}
+
+    @classmethod
+    @abstractmethod
+    async def async_init(cls):
+        ...
+
+    @staticmethod
+    @abstractmethod
+    async def _create_context(browser):
+        ...
+
+    @staticmethod
+    @abstractmethod
+    async def _create_page(context):
+        ...
+
+    async def check_context_and_page(self, context_id, page_id):
+        if not context_id or not page_id:
+            context_id, page_id = await self.open_new_page()
+        return context_id, page_id
+
+    async def open_new_page(self):
+        context_id = uuid.uuid4().hex.upper()
+        page_id = uuid.uuid4().hex.upper()
+
+        self.contexts[context_id] = await self._create_context(self.browser)
+        self.pages[page_id] = await self._create_page(self.contexts[context_id])
+        self.context2page[context_id] = page_id
+
+        return context_id, page_id
+
+    def get_page_by_id(self, context_id, page_id):
+        return self.pages[page_id]
+
+    async def close_browser(self):
+        if self.browser:
+            await self.browser.close()
+
+    async def close_contexts(self, request: CloseContextRequest):
+        for context_id in request.contexts:
+            if context_id in self.contexts:
+                await self.contexts[context_id].close()
+                page_id = self.context2page.get(context_id)
+                self.pages.pop(page_id, None)
+
+                del self.contexts[context_id]
+                del self.context2page[context_id]
+

 class BrowserManager(ABC):
     @abstractmethod
     def _download_request(
         self, request: Request, spider
-    ) -> Union[Coroutine, Request]: ...
+    ) -> Union[Coroutine, Request]:
+        ...

     @abstractmethod
     async def _start_browser_manager(self) -> None: ...

diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
index a692ab3..77c6d30 100644
--- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py
+++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
@@ -9,8 +9,8 @@
 from scrapypuppeteer.browser_managers.playwright_browser_manager import (
     PlaywrightBrowserManager,
 )
+from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager

-# from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager
 from scrapypuppeteer.browser_managers.service_browser_manager import (
     ServiceBrowserManager,
 )
@@ -43,8 +43,8 @@ def from_crawler(cls, crawler: Crawler):
         match execution_method:
             case "puppeteer":
                 browser_manager = ServiceBrowserManager()
-            # case "pyppeteer":
-            #     browser_manager = PyppeteerBrowserManager()
+            case "pyppeteer":
+                browser_manager = PyppeteerBrowserManager()
             case "playwright":
                 browser_manager = PlaywrightBrowserManager()
             case _:
@@ -54,7 +54,7 @@ def from_crawler(cls, crawler: Crawler):

         bdh = cls(settings, browser_manager, crawler=crawler)
         crawler.signals.connect(
-            bdh.browser_manager.start_browser_manager, signals.engine_started
+            bdh.browser_manager.start_browser_manager, signals.spider_opened
         )  # This makes the start VERY slow
         crawler.signals.connect(
             bdh.browser_manager.stop_browser_manager, signals.engine_stopped

diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index a462a95..562b839 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -1,81 +1,36 @@
 import asyncio
 import base64
-import uuid
-from dataclasses import dataclass
 from json import dumps
-from traceback import format_exc
 from typing import Any, Awaitable, Callable, Dict, Union

-from playwright.async_api import Browser, BrowserContext, Page, async_playwright
+from playwright.async_api import Page, async_playwright
 from scrapy.http import TextResponse

 from scrapypuppeteer import PuppeteerRequest
 from scrapypuppeteer.browser_managers import BrowserManager
 from scrapypuppeteer.request import ActionRequest, CloseContextRequest
+from scrapypuppeteer.browser_managers import ContextManager


-@dataclass
-class BrowserPage:
-    context_id: str
-    page_id: str
-    page: Page
-
-
-class ContextManager:
-    def __init__(self, browser: Browser):
-        self.browser = browser
-        self.contexts: Dict[str, BrowserContext] = {}
-        self.pages: Dict[str, BrowserPage] = {}
-        self.context2page: Dict[str, str] = {}
-
+class PlaywrightContextManager(ContextManager):
     @classmethod
     async def async_init(cls):
-        browser = await cls.launch_browser()
+        playwright = await async_playwright().start()
+        browser = await playwright.chromium.launch(headless=False)
         return cls(browser)

     @staticmethod
-    async def launch_browser():
-        playwright = await async_playwright().start()
-        return await playwright.chromium.launch(headless=False)
-
-    async def check_context_and_page(self, context_id, page_id):
-        if not context_id or not page_id:
-            context_id, page_id = await self.open_new_page()
-        return context_id, page_id
-
-    async def open_new_page(self):
-        context_id = uuid.uuid4().hex.upper()
-        page_id = uuid.uuid4().hex.upper()
-
-        self.contexts[context_id] = await self.browser.new_context()
-        self.pages[page_id] = BrowserPage(
-            context_id, page_id, await self.contexts[context_id].new_page()
-        )
-        self.context2page[context_id] = page_id
-
-        return context_id, page_id
+    async def _create_context(browser):
+        return await browser.new_context()

-    def get_page_by_id(self, context_id, page_id):
-        return self.pages[page_id]
-
-    async def close_browser(self):
-        if self.browser:
-            await self.browser.close()
-
-    async def close_contexts(self, request: CloseContextRequest):
-        for context_id in request.contexts:
-            if context_id in self.contexts:
-                await self.contexts[context_id].close()
-                page_id = self.context2page.get(context_id)
-                self.pages.pop(page_id, None)
-
-                del self.contexts[context_id]
-                del self.context2page[context_id]
+    @staticmethod
+    async def _create_page(context):
+        return await context.new_page()


 class PlaywrightBrowserManager(BrowserManager):
     def __init__(self):
-        self.context_manager: Union[ContextManager, None] = (
+        self.context_manager: Union[PlaywrightContextManager, None] = (
             None  # Will be initialized later
         )
         self.action_map: Dict[
@@ -101,37 +56,41 @@ def _download_request(self, request, spider):
             return self.close_contexts(request)

     async def _start_browser_manager(self) -> None:
-        self.context_manager = await ContextManager.async_init()
+        self.context_manager = await PlaywrightContextManager.async_init()

     async def _stop_browser_manager(self) -> None:
         if self.context_manager:
             await self.context_manager.close_browser()

-    async def __perform_action(self, request):
+    async def __perform_action(self, request: ActionRequest):
+        pptr_request: PuppeteerRequest = request.meta["puppeteer_request"]
         endpoint = request.action.endpoint
         action_function = self.action_map.get(endpoint)
         if action_function:
-            page = await self.get_page_from_request(request)
+            context_id, page_id = await self.context_manager.check_context_and_page(
+                pptr_request.context_id, pptr_request.page_id
+            )
+            page = self.context_manager.get_page_by_id(context_id, page_id)

             try:
-                response_data = await action_function(page.page, request)
-            except:
+                response_data = await action_function(page, request)
+            except Exception as e:
                 return TextResponse(
                     request.url,
                     headers={"Content-Type": "application/json"},
                     body=dumps(
                         {
-                            "error": format_exc(),
-                            "contextId": page.context_id,
-                            "pageId": page.page_id,
+                            "error": str(e),
+                            "contextId": context_id,
+                            "pageId": page_id,
                         }
                     ),
                     status=500,
                     encoding="utf-8",
                 )

-            response_data["contextId"] = page.context_id
-            response_data["pageId"] = page.page_id
+            response_data["contextId"] = context_id
+            response_data["pageId"] = page_id
             return TextResponse(
                 request.url,
                 headers={"Content-Type": "application/json"},
                 body=dumps(response_data),
@@ -241,13 +200,6 @@ async def wait_with_options(self, page, wait_options):
         elif timeout:
             await asyncio.sleep(timeout / 1000)

-    async def get_page_from_request(self, request: ActionRequest):
-        pptr_request: PuppeteerRequest = request.meta["puppeteer_request"]
-        context_id, page_id = await self.context_manager.check_context_and_page(
-            pptr_request.context_id, pptr_request.page_id
-        )
-        return self.context_manager.get_page_by_id(context_id, page_id)
-
     async def goto(self, page: Page, request: ActionRequest):
         url = request.action.payload()["url"]
         cookies = request.cookies
@@ -362,7 +314,7 @@ async def fill_form(page: Page, request: ActionRequest):
             "cookies": cookies,
         }

-    async def compose(self, page: BrowserPage, request: ActionRequest):
+    async def compose(self, page: Page, request: ActionRequest):
         for action in request.action.actions:
             response_data = await self.action_map[action.endpoint](
                 page,

diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
index 00fee21..988b988 100644
--- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
@@ -1,62 +1,37 @@
 import asyncio
 import base64
-import uuid
+from json import dumps
+from typing import Dict, Callable, Awaitable, Any, Union

-import syncer
 from pyppeteer import launch
+from scrapy.http import TextResponse

-from scrapypuppeteer.browser_managers import BrowserManager
-from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest
-from scrapypuppeteer.response import (
-    PuppeteerHtmlResponse,
-    PuppeteerScreenshotResponse,
-)
+from scrapypuppeteer.browser_managers import BrowserManager, ContextManager
+from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest, ActionRequest


-class ContextManager:
-    def __init__(self):
-        self.browser = syncer.sync(launch())
-        self.contexts = {}
-        self.pages = {}
-        self.context_page_map = {}
-
-    async def check_context_and_page(self, context_id, page_id):
-        if not context_id or not page_id:
-            context_id, page_id = await self.open_new_page()
-        return context_id, page_id
-
-    async def open_new_page(self):
-        context_id = uuid.uuid4().hex.upper()
-        page_id = uuid.uuid4().hex.upper()
-
-        self.contexts[context_id] = await self.browser.createIncognitoBrowserContext()
-        self.pages[page_id] = await self.contexts[context_id].newPage()
-        self.context_page_map[context_id] = page_id
+class PyppeteerContextManager(ContextManager):
+    @classmethod
+    async def async_init(cls):
+        browser = await launch(headless=False)
+        return cls(browser)

-        return context_id, page_id
+    @staticmethod
+    async def _create_context(browser):
+        return await browser.createIncognitoBrowserContext()

-    def get_page_by_id(self, context_id, page_id):
-        return self.pages[page_id]
-
-    def close_browser(self):
-        if self.browser:
-            syncer.sync(self.browser.close())
-
-    def close_contexts(self, request: CloseContextRequest):
-        for context_id in request.contexts:
-            if context_id in self.contexts:
-                syncer.sync(self.contexts[context_id].close())
-                page_id = self.context_page_map.get(context_id)
-                self.pages.pop(page_id, None)
-
-                del self.contexts[context_id]
-                del self.context_page_map[context_id]
+    @staticmethod
+    async def _create_page(context):
+        return await context.newPage()


 class PyppeteerBrowserManager(BrowserManager):
     def __init__(self):
-        self.context_manager = ContextManager()
-        self.action_map = {
+        self.__flag = False
+        self.context_manager: Union[PyppeteerContextManager, None] = None
+        self.action_map: Dict[
+            str, Callable[[..., ActionRequest], Awaitable[Dict[str, Any]]]
+        ] = {
             "goto": self.goto,
             "click": self.click,
             "compose": self.compose,
@@ -72,23 +47,60 @@ def __init__(self):

     def _download_request(self, request, spider):
         if isinstance(request, PuppeteerRequest):
-            endpoint = request.action.endpoint
-            action_function = self.action_map.get(endpoint)
-            if action_function:
-                return action_function(request)
+            return self.__perform_action(request)

         if isinstance(request, CloseContextRequest):
             return self.close_contexts(request)

+    async def _start_browser_manager(self) -> None:
+        if not self.__flag:
+            self.__flag = True
+            self.context_manager = await PyppeteerContextManager.async_init()
+
+    async def _stop_browser_manager(self) -> None:
+        if self.context_manager:
+            await self.context_manager.close_browser()
+
+    async def __perform_action(self, request: ActionRequest):
+        pptr_request: PuppeteerRequest = request.meta["puppeteer_request"]
+        endpoint = request.action.endpoint
+        action_function = self.action_map.get(endpoint)
+        if action_function:
+            context_id, page_id = await self.context_manager.check_context_and_page(
+                pptr_request.context_id, pptr_request.page_id
+            )
+            page = self.context_manager.get_page_by_id(context_id, page_id)
+
+            try:
+                response_data = await action_function(page, request)
+            except Exception as e:
+                return TextResponse(
+                    request.url,
+                    headers={"Content-Type": "application/json"},
+                    body=dumps(
+                        {
+                            "error": str(e),
+                            "contextId": context_id,
+                            "pageId": page_id,
+                        }
+                    ),
+                    status=500,
+                    encoding="utf-8",
+                )
+
+            response_data["contextId"] = context_id
+            response_data["pageId"] = page_id
+            return TextResponse(
+                request.url,
+                headers={"Content-Type": "application/json"},
+                body=dumps(response_data),
+                encoding="utf-8",
+            )
+        raise ValueError(f"No such action: {endpoint}")
+
     def close_contexts(self, request: CloseContextRequest):
         self.context_manager.close_contexts(request)

-    def close_used_contexts(self):
-        self.context_manager.close_browser()
-
-    def process_response(self, middleware, request, response, spider):
-        return response
-
     async def wait_with_options(self, page, wait_options: dict):
         selector = wait_options.get("selector")
         xpath = wait_options.get("xpath")
@@ -117,224 +129,127 @@ async def wait_with_options(self, page, wait_options: dict):
         elif timeout:
             await asyncio.sleep(timeout / 1000)

-    def goto(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_goto():
-            url = request.action.payload()["url"]
-            cookies = request.cookies
-            navigation_options = request.action.navigation_options
-            await page.goto(url, navigation_options)
-            wait_options = request.action.payload().get("waitOptions", {}) or {}
-            await self.wait_with_options(page, wait_options)
-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
-
-        return syncer.sync(async_goto())
-
-    def click(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_click():
-            selector = request.action.payload().get("selector")
-            cookies = request.cookies
-            click_options = request.action.click_options or {}
-            navigation_options = request.action.navigation_options or {}
-            options = {**click_options, **navigation_options}
-            await page.click(selector, options)
-            wait_options = request.action.payload().get("waitOptions", {}) or {}
-            await self.wait_with_options(page, wait_options)
-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
-
-        return syncer.sync(async_click())
-
-    def go_back(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_go_back():
-            cookies = request.cookies
-            navigation_options = request.action.navigation_options
-            await page.goBack(navigation_options)
-            wait_options = request.action.payload().get("waitOptions", {}) or {}
-            await self.wait_with_options(page, wait_options)
-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
-
-        return syncer.sync(async_go_back())
-
-    def go_forward(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_go_forward():
-            cookies = request.cookies
-            navigation_options = request.action.navigation_options
-            await page.goForward(navigation_options)
-            wait_options = request.action.payload().get("waitOptions", {}) or {}
-            await self.wait_with_options(page, wait_options)
-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
-
-        return syncer.sync(async_go_forward())
+    async def goto(self, page, request: ActionRequest):
+        url = request.action.payload()["url"]
+        cookies = request.cookies
+        navigation_options = request.action.navigation_options
+        await page.goto(url, navigation_options)
+        wait_options = request.action.payload().get("waitOptions", {}) or {}
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    def screenshot(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_screenshot():
-            request_options = request.action.options or {}
-            screenshot_options = {"encoding": "binary"}
-            screenshot_options.update(request_options)
-            screenshot_bytes = await page.screenshot(screenshot_options)
-            screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
-            return PuppeteerScreenshotResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                screenshot=screenshot_base64,
-            )
+    async def click(self, page, request: ActionRequest):
+        selector = request.action.payload().get("selector")
+        cookies = request.cookies
+        click_options = request.action.click_options or {}
+        navigation_options = request.action.navigation_options or {}
+        options = {**click_options, **navigation_options}
+        await page.click(selector, options)
+        wait_options = request.action.payload().get("waitOptions", {}) or {}
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-        return syncer.sync(async_screenshot())
+    async def go_back(self, page, request: ActionRequest):
+        cookies = request.cookies
+        navigation_options = request.action.navigation_options
+        await page.goBack(navigation_options)
+        wait_options = request.action.payload().get("waitOptions", {}) or {}
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    def scroll(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
-
-        async def async_scroll():
-            cookies = request.cookies
-            selector = request.action.payload().get("selector", None)
-
-            if selector:
-                script = f"""
-                document.querySelector('{selector}').scrollIntoView();
-                """
-            else:
-                script = """
-                window.scrollBy(0, document.body.scrollHeight);
-                """
-            await page.evaluate(script)
-            wait_options = request.action.payload().get("waitOptions", {}) or {}
-            await self.wait_with_options(page, wait_options)
-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
+    async def go_forward(self, page, request: ActionRequest):
+        cookies = request.cookies
+        navigation_options = request.action.navigation_options
+        await page.goForward(navigation_options)
+        wait_options = request.action.payload().get("waitOptions", {}) or {}
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-        return syncer.sync(async_scroll())
+    async def screenshot(self, page, request: ActionRequest):
+        request_options = request.action.options or {}
+        screenshot_options = {"encoding": "binary"}
+        screenshot_options.update(request_options)
+        screenshot_bytes = await page.screenshot(screenshot_options)
+        screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+        return {
+            "screenshot": screenshot_base64,
+        }

-    def fill_form(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        page = self.context_manager.get_page_by_id(context_id, page_id)
+    async def scroll(self, page, request: ActionRequest):
+        cookies = request.cookies
+        selector = request.action.payload().get("selector", None)

-        async def async_fill_form():
-            input_mapping = request.action.payload().get("inputMapping")
-            submit_button = request.action.payload().get("submitButton", None)
-            cookies = request.cookies
+        if selector:
+            script = f"""
+            document.querySelector('{selector}').scrollIntoView();
+            """
+        else:
+            script = """
+            window.scrollBy(0, document.body.scrollHeight);
+            """
+        await page.evaluate(script)
+        wait_options = request.action.payload().get("waitOptions", {}) or {}
+        await self.wait_with_options(page, wait_options)
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-            for selector, params in input_mapping.items():
-                value = params.get("value", None)
-                delay = params.get("delay", 0)
-                await page.type(selector, value, {"delay": delay})
+    async def fill_form(self, page, request: ActionRequest):
+        input_mapping = request.action.payload().get("inputMapping")
+        submit_button = request.action.payload().get("submitButton", None)
+        cookies = request.cookies

-            if submit_button:
-                await page.click(submit_button)
+        for selector, params in input_mapping.items():
+            value = params.get("value", None)
+            delay = params.get("delay", 0)
+            await page.type(selector, value, {"delay": delay})

-            response_html = await page.content()
-            return PuppeteerHtmlResponse(
-                request.url,
-                request,
-                context_id=context_id,
-                page_id=page_id,
-                html=response_html,
-                cookies=cookies,
-            )
+        if submit_button:
+            await page.click(submit_button)

-        return syncer.sync(async_fill_form())
+        response_html = await page.content()
+
+        return {
+            "html": response_html,
+            "cookies": cookies,
+        }

-    def compose(self, request: PuppeteerRequest):
-        context_id, page_id = syncer.sync(
-            self.context_manager.check_context_and_page(
-                request.context_id, request.page_id
-            )
-        )
-        request.page_id = page_id
-        request.context_id = context_id
+    async def compose(self, page, request: ActionRequest):
         for action in request.action.actions:
-            response = self.action_map[action.endpoint](request.replace(action=action))
-        return response.replace(puppeteer_request=request)
+            response_data = await self.action_map[action.endpoint](
+                page,
+                request.replace(action=action),
+            )
+        return response_data

-    def action(self, request: PuppeteerRequest):
+    async def action(self, request: PuppeteerRequest):
         raise ValueError("CustomJsAction is not available in local mode")

-    def recaptcha_solver(self, request: PuppeteerRequest):
+    async def recaptcha_solver(self, request: PuppeteerRequest):
         raise ValueError("RecaptchaSolver is not available in local mode")

-    def har(self, request: PuppeteerRequest):
+    async def har(self, request: PuppeteerRequest):
         raise ValueError("Har is not available in local mode")

From fbbb374c93af98a025e9ffa3591832c840a803fb Mon Sep 17 00:00:00 2001
From: matthew
Date: Thu, 28 Nov 2024 14:55:28 +0300
Subject: [PATCH 14/21] ruff

---
 scrapypuppeteer/browser_managers/__init__.py        | 14 +++++---------
 .../browser_managers/browser_downloader_handler.py  |  5 +++--
 .../browser_managers/playwright_browser_manager.py  |  3 +--
 .../browser_managers/pyppeteer_browser_manager.py   |  4 ++--
 4 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py
index 1d87054..20985d3 100644
--- a/scrapypuppeteer/browser_managers/__init__.py
+++ b/scrapypuppeteer/browser_managers/__init__.py
@@ -3,7 +3,7 @@
 import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Coroutine
-from typing import Union, Dict
+from typing import Dict, Union

 from scrapy import Request
 from scrapy.utils.defer import deferred_from_coro
@@ -21,18 +21,15 @@ def __init__(self, browser):

     @classmethod
     @abstractmethod
-    async def async_init(cls):
-        ...
+    async def async_init(cls): ...

     @staticmethod
     @abstractmethod
-    async def _create_context(browser):
-        ...
+    async def _create_context(browser): ...

     @staticmethod
     @abstractmethod
-    async def _create_page(context):
-        ...
+    async def _create_page(context): ...
async def check_context_and_page(self, context_id, page_id): if not context_id or not page_id: @@ -71,8 +68,7 @@ class BrowserManager(ABC): @abstractmethod def _download_request( self, request: Request, spider - ) -> Union[Coroutine, Request]: - ... + ) -> Union[Coroutine, Request]: ... @abstractmethod async def _start_browser_manager(self) -> None: ... diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index 77c6d30..d6c2c11 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -9,8 +9,9 @@ from scrapypuppeteer.browser_managers.playwright_browser_manager import ( PlaywrightBrowserManager, ) -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import PyppeteerBrowserManager - +from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( + PyppeteerBrowserManager, +) from scrapypuppeteer.browser_managers.service_browser_manager import ( ServiceBrowserManager, ) diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py index 562b839..d41e558 100644 --- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py +++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py @@ -7,9 +7,8 @@ from scrapy.http import TextResponse from scrapypuppeteer import PuppeteerRequest -from scrapypuppeteer.browser_managers import BrowserManager +from scrapypuppeteer.browser_managers import BrowserManager, ContextManager from scrapypuppeteer.request import ActionRequest, CloseContextRequest -from scrapypuppeteer.browser_managers import ContextManager class PlaywrightContextManager(ContextManager): diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py index 988b988..c00e2a9 100644 --- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py +++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py @@ -1,13 +1,13 @@ import asyncio import base64 from json import dumps -from typing import Dict, Callable, Awaitable, Any, Union +from typing import Any, Awaitable, Callable, Dict, Union from pyppeteer import launch from scrapy.http import TextResponse from scrapypuppeteer.browser_managers import BrowserManager, ContextManager -from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest, ActionRequest +from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest class PyppeteerContextManager(ContextManager): From c0094494d5f01b5e7af3493832b226f0cfb85687 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 13:18:12 +0300 Subject: [PATCH 15/21] temporarily delete pyppeteer support --- examples/settings.py | 1 + examples/spiders/compose.py | 3 +- .../browser_downloader_handler.py | 2 +- .../pyppeteer_browser_manager.py | 256 +----------------- 4 files changed, 5 insertions(+), 257 deletions(-) diff --git a/examples/settings.py b/examples/settings.py index 1ac96c6..e950b5c 100644 --- a/examples/settings.py +++ b/examples/settings.py @@ -11,4 +11,5 @@ PUPPETEER_SERVICE_URL = "http://localhost:3000" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" PUPPETEER_LOCAL = False diff --git a/examples/spiders/compose.py b/examples/spiders/compose.py index d037752..f2fd133 100644 --- a/examples/spiders/compose.py +++ b/examples/spiders/compose.py @@ -21,8 
+21,9 @@ class ComposeSpider(scrapy.Spider): }, "DOWNLOAD_HANDLERS": { "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", - "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + # "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", }, + "EXECUTION_METHOD": "pyppeteer", } diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index d6c2c11..ab17034 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -45,7 +45,7 @@ def from_crawler(cls, crawler: Crawler): case "puppeteer": browser_manager = ServiceBrowserManager() case "pyppeteer": - browser_manager = PyppeteerBrowserManager() + raise ValueError("Currently, pyppeteer cannot run on some machines since it is not actively supported.") case "playwright": browser_manager = PlaywrightBrowserManager() case _: diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py index c00e2a9..673c437 100644 --- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py +++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py @@ -1,255 +1 @@ -import asyncio -import base64 -from json import dumps -from typing import Any, Awaitable, Callable, Dict, Union - -from pyppeteer import launch -from scrapy.http import TextResponse - -from scrapypuppeteer.browser_managers import BrowserManager, ContextManager -from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest - - -class PyppeteerContextManager(ContextManager): - @classmethod - async def async_init(cls): - browser = await launch(headless=False) - return cls(browser) - - @staticmethod - async def _create_context(browser): - return await browser.createIncognitoBrowserContext() - - @staticmethod - async def _create_page(context): - return await context.newPage() - - -class PyppeteerBrowserManager(BrowserManager): - def __init__(self): - self.__flag = False - self.context_manager: Union[PyppeteerContextManager, None] = None - self.action_map: Dict[ - str, Callable[[..., ActionRequest], Awaitable[Dict[str, Any]]] - ] = { - "goto": self.goto, - "click": self.click, - "compose": self.compose, - "back": self.go_back, - "forward": self.go_forward, - "scroll": self.scroll, - "screenshot": self.screenshot, - "action": self.action, - "recaptcha_solver": self.recaptcha_solver, - "har": self.har, - "fill_form": self.fill_form, - } - - def _download_request(self, request, spider): - if isinstance(request, PuppeteerRequest): - return self.__perform_action(request) - - if isinstance(request, CloseContextRequest): - return self.close_contexts(request) - - async def _start_browser_manager(self) -> None: - if not self.__flag: - self.__flag = True - self.context_manager = await PyppeteerContextManager.async_init() - - async def _stop_browser_manager(self) -> None: - if self.context_manager: - await self.context_manager.close_browser() - - async def __perform_action(self, request: ActionRequest): - pptr_request: PuppeteerRequest = request.meta["puppeteer_request"] - endpoint = request.action.endpoint - action_function = self.action_map.get(endpoint) - if action_function: - context_id, page_id = await self.context_manager.check_context_and_page( - pptr_request.context_id, pptr_request.page_id - ) - 
page = self.context_manager.get_page_by_id(context_id, page_id) - - try: - response_data = await action_function(page, request) - except Exception as e: - return TextResponse( - request.url, - headers={"Content-Type": "application/json"}, - body=dumps( - { - "error": str(e), - "contextId": context_id, - "pageId": page_id, - } - ), - status=500, - encoding="utf-8", - ) - - response_data["contextId"] = context_id - response_data["pageId"] = page_id - return TextResponse( - request.url, - headers={"Content-Type": "application/json"}, - body=dumps(response_data), - encoding="utf-8", - ) - raise ValueError(f"No such action: {endpoint}") - - def close_contexts(self, request: CloseContextRequest): - self.context_manager.close_contexts(request) - - async def wait_with_options(self, page, wait_options: dict): - selector = wait_options.get("selector") - xpath = wait_options.get("xpath") - timeout = wait_options.get("timeout", None) - options = wait_options.get("options", {}) - - selector_or_timeout = wait_options.get("selectorOrTimeout") - if selector_or_timeout: - if isinstance(selector_or_timeout, (int, float)): - timeout = selector_or_timeout - elif isinstance(selector_or_timeout, str): - if selector_or_timeout.startswith("//"): - xpath = selector_or_timeout - else: - selector = selector_or_timeout - - if len([item for item in [selector, xpath, timeout] if item]) > 1: - raise ValueError( - "Wait options must contain either a selector, an xpath, or a timeout" - ) - - if selector: - await page.waitForSelector(selector, options) - elif xpath: - await page.waitForXPath(xpath, options) - elif timeout: - await asyncio.sleep(timeout / 1000) - - async def goto(self, page, request: ActionRequest): - url = request.action.payload()["url"] - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goto(url, navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def click(self, page, request: ActionRequest): - selector = request.action.payload().get("selector") - cookies = request.cookies - click_options = request.action.click_options or {} - navigation_options = request.action.navigation_options or {} - options = {**click_options, **navigation_options} - await page.click(selector, options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def go_back(self, page, request: ActionRequest): - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goBack(navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def go_forward(self, page, request: ActionRequest): - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goForward(navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def screenshot(self, page, request:
ActionRequest): - request_options = request.action.options or {} - screenshot_options = {"encoding": "binary"} - screenshot_options.update(request_options) - screenshot_bytes = await page.screenshot(screenshot_options) - screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") - return { - "screenshot": screenshot_base64, - } - - async def scroll(self, page, request: ActionRequest): - cookies = request.cookies - selector = request.action.payload().get("selector", None) - - if selector: - script = f""" - document.querySelector('{selector}').scrollIntoView(); - """ - else: - script = """ - window.scrollBy(0, document.body.scrollHeight); - """ - await page.evaluate(script) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def fill_form(self, page, request: ActionRequest): - input_mapping = request.action.payload().get("inputMapping") - submit_button = request.action.payload().get("submitButton", None) - cookies = request.cookies - - for selector, params in input_mapping.items(): - value = params.get("value", None) - delay = params.get("delay", 0) - await page.type(selector, value, {"delay": delay}) - - if submit_button: - await page.click(submit_button) - - response_html = await page.content() - - return { - "html": response_html, - "cookies": cookies, - } - - async def compose(self, page, request: ActionRequest): - for action in request.action.actions: - response_data = await self.action_map[action.endpoint]( - page, - request.replace(action=action), - ) - return response_data - - async def action(self, request: PuppeteerRequest): - raise ValueError("CustomJsAction is not available in local mode") - - async def recaptcha_solver(self, request: PuppeteerRequest): - raise ValueError("RecaptchaSolver is not available in local mode") - - async def har(self, request: PuppeteerRequest): - raise ValueError("Har is not available in local mode") +# TODO: wait for fixing the `pyppeteer` package From a970695f3668922d54a90f87789b1ad06d1b7730 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:05:36 +0300 Subject: [PATCH 16/21] example spider --- examples/settings.py | 3 - examples/spiders/compose.py | 6 -- examples/spiders/playwright_test.py | 67 +++++++++++++++++++ .../browser_downloader_handler.py | 6 +- 4 files changed, 68 insertions(+), 14 deletions(-) create mode 100644 examples/spiders/playwright_test.py diff --git a/examples/settings.py b/examples/settings.py index e950b5c..bdfcff7 100644 --- a/examples/settings.py +++ b/examples/settings.py @@ -10,6 +10,3 @@ } PUPPETEER_SERVICE_URL = "http://localhost:3000" - -TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" -PUPPETEER_LOCAL = False diff --git a/examples/spiders/compose.py b/examples/spiders/compose.py index f2fd133..b0af7ad 100644 --- a/examples/spiders/compose.py +++ b/examples/spiders/compose.py @@ -19,12 +19,6 @@ class ComposeSpider(scrapy.Spider): "DOWNLOADER_MIDDLEWARES": { "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, }, - "DOWNLOAD_HANDLERS": { - "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", - # "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", - }, - - "EXECUTION_METHOD": "pyppeteer", } def start_requests(self): diff --git a/examples/spiders/playwright_test.py
b/examples/spiders/playwright_test.py new file mode 100644 index 0000000..4e0e5a3 --- /dev/null +++ b/examples/spiders/playwright_test.py @@ -0,0 +1,67 @@ +from logging import ERROR + +import scrapy +from scrapy.utils.log import failure_to_exc_info +from twisted.python.failure import Failure + +from scrapypuppeteer import ( + PuppeteerRequest, + PuppeteerResponse, + PuppeteerScreenshotResponse, +) +from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll + + +class PlaywrightSpider(scrapy.Spider): + """ + Mostly, it is a Compose spider, but it is very convenient for Playwright testing. + """ + + name = "playwright_test" + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, + }, + "DOWNLOAD_HANDLERS": { + "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + }, + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "EXECUTION_METHOD": "playwright", + } + + def start_requests(self): + goto = GoTo("https://pptr.dev") + click_1 = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" + ) + click_2 = Click( + "#__docusaurus_skipToContent_fallback > div > div > aside > div > " + "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a" + ) + click = Compose(click_1, click_2) + scroll = Scroll() + screenshot = Screenshot(options={"full_page": True, "type": "jpeg"}) + + compose_action = Compose( + goto, + click, + scroll, + screenshot, + ) + + yield PuppeteerRequest( + compose_action, + callback=self.parse, + errback=self.errback, + close_page=True, + ) + + def parse(self, response: PuppeteerResponse): + assert isinstance(response, PuppeteerScreenshotResponse) + self.log("Spider worked fine!") + + def errback(self, failure: Failure): + print(failure) + self.log(failure_to_exc_info(failure), level=ERROR) diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index ab17034..b4c9132 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -9,9 +9,6 @@ from scrapypuppeteer.browser_managers.playwright_browser_manager import ( PlaywrightBrowserManager, ) -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, -) from scrapypuppeteer.browser_managers.service_browser_manager import ( ServiceBrowserManager, ) @@ -20,9 +17,8 @@ class BrowserDownloaderHandler(HTTPDownloadHandler): """ - docstring: TODO - """ + """ EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None: From a64ed1833e662dc8732c946f3cd322f7e00d881b Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:11:43 +0300 Subject: [PATCH 17/21] docs --- .../browser_managers/browser_downloader_handler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index b4c9132..0715b82 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -17,8 +17,16 @@ class BrowserDownloaderHandler(HTTPDownloadHandler): """ + Browser downloader handler.
+ If instantiated, it executes actions in the browser for the provided requests. + If this handler is used, the installed reactor must be `AsyncioSelectorReactor`. + Currently supports 3 browser types via the EXECUTION_METHOD setting: + * puppeteer -- see scrapy-puppeteer-service + * pyppeteer -- not available, since the package is not actively supported + * playwright -- see https://playwright.dev/python/ """ + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None: From c899533a2ce428ddba3d7d63811442462df3f754 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:14:17 +0300 Subject: [PATCH 18/21] ruff --- .../browser_managers/browser_downloader_handler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py index 0715b82..fc96801 100644 --- a/scrapypuppeteer/browser_managers/browser_downloader_handler.py +++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py @@ -49,7 +49,9 @@ def from_crawler(cls, crawler: Crawler): case "puppeteer": browser_manager = ServiceBrowserManager() case "pyppeteer": - raise ValueError("Currently, pyppeteer cannot run on some machines since it is not actively supported.") + raise ValueError( + "Currently, pyppeteer cannot run on some machines since it is not actively supported." + ) case "playwright": browser_manager = PlaywrightBrowserManager() case _: From a36c72fa0b682c5ca8df6a17b8c8b525daacff7a Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:15:57 +0300 Subject: [PATCH 19/21] README and requirements --- README.md | 1 + requirements.txt | 3 --- setup.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 545c586..7daee5a 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ There is a parent `PuppeteerResponse` class from which other response classes ar Here is a list of them all: - `PuppeteerHtmlResponse` - has `html` and `cookies` properties - `PuppeteerScreenshotResponse` - has `screenshot` property +- `PuppeteerHarResponse` - has `har` property - `PuppeteerJsonResponse` - has `data` property and `to_html()` method which tries to transform itself to `PuppeteerHtmlResponse` - `PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse)` - has `recaptcha_data` property diff --git a/requirements.txt b/requirements.txt index 7e351a2..f30cc9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,2 @@ scrapy>=2.6 -pyppeteer -syncer -bs4 playwright diff --git a/setup.py b/setup.py index ba1d4de..56cef92 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ maintainer="Maksim Varlamov", maintainer_email="varlamov@ispras.ru", packages=find_packages(), - install_requires=["scrapy>=2.6", "pyppeteer", "syncer", "bs4", "playwright"], + install_requires=["scrapy>=2.6", "playwright"], python_requires=">=3.6", license="BSD", classifiers=[ From 3b658833c447a2e5978395780439a7aee39af0f0 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:19:28 +0300 Subject: [PATCH 20/21] README --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7daee5a..ab6718f 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,21 @@ DOWNLOADER_MIDDLEWARES = { } PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods +``` +You may prefer not to use scrapy-puppeteer-service.
+In this case you can run locally with playwright and pyppeteer. -# To change the execution method, you must add the corresponding setting: -EXECUTION_METHOD = "Puppeteer" +```python +DOWNLOADER_MIDDLEWARES = { + 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 +} + +DOWNLOAD_HANDLERS = { # Add browser download handler + "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", +} +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" # Requires the asyncio reactor +EXECUTION_METHOD = "playwright" # Choose execution method ``` Available methods: `Puppeteer`, `Pyppeteer`, `Playwright` From 91882af5125d30991047e33c467ba5557ded0d58 Mon Sep 17 00:00:00 2001 From: matthew Date: Fri, 20 Dec 2024 14:23:07 +0300 Subject: [PATCH 21/21] CI --- .github/workflows/python-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index fc0d5c8..558a95d 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -8,12 +8,12 @@ jobs: strategy: matrix: include: - - python-version: "3.7.x" # Min Python version (No 3.6 version in GitHub repository) - - python-version: "3.8.x" + - python-version: "3.8.x" # Min Python version (No 3.7 version in GitHub repository) - python-version: "3.9.x" - python-version: "3.10.x" - python-version: "3.11.x" - python-version: "3.12.x" + - python-version: "3.13.x" - python-version: "3.x" # Last Python version steps: - uses: actions/checkout@v3
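Taken together, the patches above leave `playwright` as the working local execution method. Below is a minimal sketch of a spider exercising that final state; it is an illustration, not part of the patch series. It assumes the settings introduced in the README patch (browser download handler on both schemes, asyncio reactor, `EXECUTION_METHOD = "playwright"`); the spider name and the target URL `https://example.com` are placeholders.

```python
# A minimal sketch (not part of the patches above): a spider that runs
# GoTo + Screenshot through the local playwright execution method.
import scrapy

from scrapypuppeteer import PuppeteerRequest, PuppeteerScreenshotResponse
from scrapypuppeteer.actions import Compose, GoTo, Screenshot


class LocalPlaywrightSketchSpider(scrapy.Spider):
    name = "local_playwright_sketch"  # placeholder name

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
        # Route http/https through the browser download handler ...
        "DOWNLOAD_HANDLERS": {
            "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
            "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
        },
        # ... which requires the asyncio reactor, per the handler's docstring.
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "EXECUTION_METHOD": "playwright",
    }

    def start_requests(self):
        # Compose chains several browser actions into a single request;
        # https://example.com is a placeholder target.
        action = Compose(
            GoTo("https://example.com"),
            Screenshot(options={"full_page": True, "type": "jpeg"}),
        )
        yield PuppeteerRequest(action, callback=self.parse, close_page=True)

    def parse(self, response: PuppeteerScreenshotResponse):
        # The screenshot property holds the captured image
        # (base64-encoded in the service implementation).
        self.log(f"Received screenshot payload of length {len(response.screenshot)}")
```

The same spider should also run against a scrapy-puppeteer-service instance: drop the `DOWNLOAD_HANDLERS`, `TWISTED_REACTOR`, and `EXECUTION_METHOD` settings and set `PUPPETEER_SERVICE_URL` instead, as shown in the README patch.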