diff --git a/README.md b/README.md
index 7daee5a..7963136 100644
--- a/README.md
+++ b/README.md
@@ -78,9 +78,9 @@ Here is the list of available actions:
 - `Compose(*actions)` - composition of several puppeteer action
 - `Scroll(selector, wait_options)` - scroll page
 - `Screenshot(options)` - take screenshot
+- `RecaptchaSolver(solve_recaptcha, close_on_empty)` - find or solve recaptcha on page
 - `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
 - `FillForm(input_mapping, submit_button)` - to fill out and submit forms on page.
-- `RecaptchaSolver(solve_recaptcha)` - find or solve recaptcha on page
 - `CustomJsAction(js_function)` - evaluate JS function on page

 Available options essentially mirror [service](https://github.com/ispras/scrapy-puppeteer-service) method parameters, which in turn mirror puppeteer API functions to some extent.
@@ -166,6 +166,45 @@
 and will notify you about number of found captchas on the page.
 If you don't want the middleware to work on specific request you may provide special meta key: `'dont_recaptcha': True`.
 In this case RecaptchaMiddleware will just skip the request.

+## Automatic context restoring
+
+Sometimes you may receive responses with status 422 (Unprocessable Entity).
+This means that scrapy-puppeteer-service could not find the provided context or page in its memory.
+In such situations you can use this middleware to restore the context.
+
+Enabling the middleware:
+```Python
+DOWNLOADER_MIDDLEWARES = {  # Strict order of middlewares
+    # 'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1040,  # You may also use the recaptcha middleware
+    'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041,
+    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042,
+}
+```
+
+Settings of the middleware:
+```Python
+N_RETRY_RESTORING = 3  # Number of tries to restore a context
+RESTORING_LENGTH = 2  # Number of restorable requests in a sequence
+```
+
+Currently, the middleware can only restart a request-response sequence from its beginning.
+Mark the first request of such a sequence with the `recover_context` meta-key set to `True`.
+Example:
+```Python
+...
+yield PuppeteerRequest(
+    url,
+    callback=self.click_on_navigation,
+    errback=self.errback,
+    close_page=False,
+    meta={'recover_context': True}
+)
+...
+```
+
+You can also look at the `dead_context` example spider and enable `PuppeteerContextRestoreDownloaderMiddleware`
+in its `custom_settings` to see the middleware in action.
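+
+Under the hood, the middleware remembers the action of every restorable request and,
+once the service responds with 422, resends the first request of the sequence with all
+recorded actions composed into a single one. Conceptually (a simplified sketch with a
+hypothetical URL and selectors, not the middleware's actual code), the replayed action
+looks like this:
+```Python
+from scrapypuppeteer.actions import Click, Compose, GoTo
+
+# The original navigation plus the follow-up clicks recorded for the lost context
+replayed_action = Compose(
+    GoTo("https://pptr.dev"),
+    Click("a.navbar__brand"),
+    Click("nav a:nth-child(3)"),
+)
+```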
+ ## TODO - [x] skeleton that could handle goto, click, scroll, and actions diff --git a/examples/spiders/dead_context.py b/examples/spiders/dead_context.py new file mode 100644 index 0000000..3252a8e --- /dev/null +++ b/examples/spiders/dead_context.py @@ -0,0 +1,74 @@ +from asyncio import sleep + +import scrapy +from twisted.python.failure import Failure + +from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse +from scrapypuppeteer.actions import Click, GoTo + + +class DeadContextSpider(scrapy.Spider): + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware": 1041, + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, + }, + "N_RETRY_RESTORING": 3, + "RESTORING_LENGTH": 2, + } + name = "dead_context" + + def start_requests(self): + urls = [ + "https://www.google.com/recaptcha/api2/demo", + "https://scrapy.org", + "https://pptr.dev", + ] + + for url in urls: + yield PuppeteerRequest( + url, + callback=self.click_on_navigation, + errback=self.errback, + close_page=False, + meta={"recover_context": True}, + ) + + async def click_on_navigation(self, response: PuppeteerResponse): + await sleep(4) + + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" + ) + yield response.follow( + click, callback=self.click_back, errback=self.errback, close_page=False + ) + + async def click_back(self, response: PuppeteerResponse): + await sleep(4) + + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a.navbar__brand > b" + ) + yield response.follow( + click, callback=self.goto_api, errback=self.errback, close_page=False + ) + + async def goto_api(self, response): + await sleep(4) + + yield response.follow( + GoTo("api/puppeteer.puppeteernode"), + callback=self.empty_action, + errback=self.errback, + close_page=False, + ) + + @staticmethod + async def empty_action(response, **kwargs): + await sleep(4) + + @staticmethod + def errback(failure: Failure): + print(failure) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index b051ed0..bbfbe2f 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,318 +1,23 @@ -import logging -from collections import defaultdict -from typing import List, Union +import warnings -from scrapy import signals -from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured +import scrapy.exceptions -from scrapypuppeteer.actions import ( - Click, - CustomJsAction, - RecaptchaSolver, - Screenshot, - Scroll, +from .middlewares import ( + PuppeteerContextRestoreDownloaderMiddleware, + PuppeteerRecaptchaDownloaderMiddleware, + PuppeteerServiceDownloaderMiddleware, ) -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - PlaywrightBrowserManager, -) -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, -) -from scrapypuppeteer.browser_managers.service_browser_manager import ( - ServiceBrowserManager, -) -from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest -from scrapypuppeteer.response import ( - PuppeteerHtmlResponse, - PuppeteerResponse, -) - - -class PuppeteerServiceDownloaderMiddleware: - """ - This downloader middleware converts PuppeteerRequest instances to - Puppeteer service API 
requests and then converts its responses to - PuppeteerResponse instances. Additionally, it tracks all browser contexts - that spider uses and performs cleanup request to service right before - spider is closed. - - Additionally, the middleware uses these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' - - Settings: - - PUPPETEER_SERVICE_URL (str) - Service URL, e.g. 'http://localhost:3000' - - PUPPETEER_INCLUDE_HEADERS (bool|list[str]) - Determines which request headers will be sent to remote site by puppeteer service. - Either True (all headers), False (no headers) or list of header names. - May be overridden per request. - By default, only cookies are sent. - - PUPPETEER_INCLUDE_META (bool) - Determines whether to send or not user's meta attached by user. - Default to False. - """ - - SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" - INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" - SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" - DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - - EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" - - service_logger = logging.getLogger(__name__) - - def __init__( - self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool, - browser_manager: BrowserManager, - ): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - self.browser_manager = browser_manager - - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - - execution_method = crawler.settings.get( - cls.EXECUTION_METHOD_SETTING, "PUPPETEER" - ).lower() - - if execution_method == "pyppeteer": - browser_manager = PyppeteerBrowserManager() - elif execution_method == "puppeteer": - browser_manager = ServiceBrowserManager( - service_url, include_meta, include_headers, crawler - ) - elif execution_method == "playwright": - browser_manager = PlaywrightBrowserManager() - else: - raise NameError("Wrong EXECUTION_METHOD") - - middleware = cls( - crawler, service_url, include_headers, include_meta, browser_manager - ) - crawler.signals.connect( - middleware.browser_manager.close_used_contexts, signal=signals.spider_idle - ) - return middleware - - def process_request(self, request, spider): - return self.browser_manager.process_request(request) - - def process_response(self, request, response, spider): - return self.browser_manager.process_response(self, request, response, spider) - - -class PuppeteerRecaptchaDownloaderMiddleware: - """ - This middleware is supposed to solve recaptcha on the page automatically. - If there is no captcha on the page then this middleware will do nothing - on the page, so your 2captcha balance will remain the same. - It can submit recaptcha if "submit button" is provided. - It will not "submit" captcha if there is no submit-selector. - - If you want to turn Recaptcha solving off on the exact request provide - meta-key 'dont_recaptcha' with True value. 
The middleware will skip the request - through itself. - The middleware uses additionally these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - '_captcha_submission', '_captcha_solving' - - Settings: - - RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) - RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not - RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and - these domains' submit selectors, e.g. - 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' - it could be also squeezed to - 'ecaptcha/api2/de': '#recaptcha-demo-submit' - also you can use not just strings but Click actions with required parameters: - 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') - In general - domain is a unique identifying string which is contained in web-page url - If there is no button to submit recaptcha then provide empty string to a domain. - This setting can also be a string. If so the middleware will only click the button - related to this selector. - This setting can also be unprovided. In this case every web-page you crawl is supposed to be - without submit button, or you manually do it yourself. - """ - - MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" - RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" - SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - - def __init__(self, recaptcha_solving: bool, submit_selectors: dict): - self.submit_selectors = submit_selectors - self.recaptcha_solving = recaptcha_solving - self._page_responses = dict() - self._page_closing = set() - - @classmethod - def from_crawler(cls, crawler: Crawler): - activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) - if not activation: - raise NotConfigured - recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) - - try: - submit_selectors = crawler.settings.getdict( - cls.SUBMIT_SELECTORS_SETTING, dict() - ) - except ValueError: - submit_selectors = { - "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") - } - except Exception as exception: - raise ValueError( - f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" - ) - - for key in submit_selectors.keys(): - submit_selector = submit_selectors[key] - if isinstance(submit_selector, str): - submit_selectors[key] = Click(selector=submit_selector) - elif not isinstance(submit_selector, Click): - raise ValueError( - "Submit selector must be str or Click," - f"but {type(submit_selector)} provided" - ) - return cls(recaptcha_solving, submit_selectors) - - @staticmethod - def is_recaptcha_producing_action(action) -> bool: - return not isinstance( - action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), - ) - - def process_request(self, request, **_): - if request.meta.get("dont_recaptcha", False): - return None - - # Checking if we need to close page after action - if isinstance(request, PuppeteerRequest): - if self.is_recaptcha_producing_action(request.action): - if request.close_page and not request.meta.get( - "_captcha_submission", False - ): - request.close_page = False - request.dont_filter = True - self._page_closing.add(request) - return request - - def process_response(self, request, response, spider): - if not isinstance( - response, PuppeteerResponse - ): # We only work with PuppeteerResponses - return response - - puppeteer_request = response.puppeteer_request - if 
puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses - return response - - if puppeteer_request.meta.pop( - "_captcha_submission", False - ): # Submitted captcha - return self.__gen_response(response) - - if puppeteer_request.meta.pop("_captcha_solving", False): - # RECaptchaSolver was called by recaptcha middleware - return self._submit_recaptcha(request, response, spider) - - if not self.is_recaptcha_producing_action(puppeteer_request.action): - # No recaptcha after these actions - return response - - # Any puppeteer response besides PuppeteerRecaptchaSolverResponse - return self._solve_recaptcha(request, response) - - def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = ( - response # Saving main response to return it later - ) - - recaptcha_solver = RecaptchaSolver( - solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False), - ) - return response.follow( - recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={"_captcha_solving": True}, - close_page=False, - ) - - def _submit_recaptcha(self, request, response, spider): - if not response.puppeteer_request.action.solve_recaptcha: - spider.log( - message=f"Found {len(response.recaptcha_data['captchas'])} captcha " - f"but did not solve due to argument", - level=logging.INFO, - ) - return self.__gen_response(response) - # Click "submit button"? - if response.recaptcha_data["captchas"] and self.submit_selectors: - # We need to click "submit button" - for domain, submitting in self.submit_selectors.items(): - if domain in response.url: - if not submitting.selector: - return self.__gen_response(response) - return response.follow( - action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={"_captcha_submission": True}, - ) - raise IgnoreRequest( - "No submit selector found to click on the page but captcha found" - ) - return self.__gen_response(response) - - def __gen_response(self, response): - main_response_data = dict() - main_response_data["page_id"] = ( - None if self.__is_closing(response) else response.puppeteer_request.page_id - ) - - main_response = self._page_responses.pop(response.page_id) - - if isinstance(main_response, PuppeteerHtmlResponse): - if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data["body"] = response.html - elif isinstance(response.puppeteer_request.action, Click): - main_response_data["body"] = response.body +warnings.warn( + "Import from `scrapypuppeteer.middleware` is deprecated. 
" + "Use `scrapypuppeteer.middlewares` instead.", + scrapy.exceptions.ScrapyDeprecationWarning, + stacklevel=2, +) - return main_response.replace(**main_response_data) - def __is_closing(self, response, remove_request: bool = True) -> bool: - main_request = self._page_responses[response.page_id].puppeteer_request - close_page = main_request in self._page_closing - if close_page and remove_request: - self._page_closing.remove(main_request) - return close_page +__all__ = [ + "PuppeteerServiceDownloaderMiddleware", + "PuppeteerRecaptchaDownloaderMiddleware", + "PuppeteerContextRestoreDownloaderMiddleware", +] diff --git a/scrapypuppeteer/middlewares/__init__.py b/scrapypuppeteer/middlewares/__init__.py new file mode 100644 index 0000000..80280d1 --- /dev/null +++ b/scrapypuppeteer/middlewares/__init__.py @@ -0,0 +1,3 @@ +from .recaptcha import PuppeteerRecaptchaDownloaderMiddleware +from .restore import PuppeteerContextRestoreDownloaderMiddleware +from .service import PuppeteerServiceDownloaderMiddleware diff --git a/scrapypuppeteer/middlewares/recaptcha.py b/scrapypuppeteer/middlewares/recaptcha.py new file mode 100644 index 0000000..da8bdd1 --- /dev/null +++ b/scrapypuppeteer/middlewares/recaptcha.py @@ -0,0 +1,204 @@ +import logging + +from scrapy.crawler import Crawler +from scrapy.exceptions import IgnoreRequest, NotConfigured + +from scrapypuppeteer.actions import ( + Click, + CustomJsAction, + RecaptchaSolver, + Screenshot, + Scroll, +) +from scrapypuppeteer.request import PuppeteerRequest +from scrapypuppeteer.response import PuppeteerHtmlResponse, PuppeteerResponse + +recaptcha_logger = logging.getLogger(__name__) + + +class PuppeteerRecaptchaDownloaderMiddleware: + """ + This middleware is supposed to solve recaptcha on the page automatically. + If there is no captcha on the page then this middleware will do nothing + on the page, so your 2captcha balance will remain the same. + It can submit recaptcha if "submit button" is provided. + It will not "submit" captcha if there is no submit-selector. + + If you want to turn Recaptcha solving off on the exact request provide + meta-key 'dont_recaptcha' with True value. The middleware will skip the request + through itself. + + The middleware uses additionally these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + '_captcha_submission', '_captcha_solving' + + Settings: + + RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) + RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not + RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and + these domains' submit selectors, e.g. + 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' + it could be also squeezed to + 'ecaptcha/api2/de': '#recaptcha-demo-submit' + also you can use not just strings but Click actions with required parameters: + 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') + In general - domain is a unique identifying string which is contained in web-page url + If there is no button to submit recaptcha then provide empty string to a domain. + This setting can also be a string. If so the middleware will only click the button + related to this selector. + This setting can also be unprovided. In this case every web-page you crawl is supposed to be + without submit button, or you manually do it yourself. 
+ """ + + MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" + RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" + SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" + + def __init__(self, recaptcha_solving: bool, submit_selectors: dict): + self.submit_selectors = submit_selectors + self.recaptcha_solving = recaptcha_solving + self._page_responses = dict() + self._page_closing = set() + + @classmethod + def from_crawler(cls, crawler: Crawler): + activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) + if not activation: + raise NotConfigured + recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) + + try: + submit_selectors = crawler.settings.getdict( + cls.SUBMIT_SELECTORS_SETTING, dict() + ) + except ValueError: + submit_selectors = { + "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") + } + except Exception as exception: + raise ValueError( + f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" + ) + + for key in submit_selectors.keys(): + submit_selector = submit_selectors[key] + if isinstance(submit_selector, str): + submit_selectors[key] = Click(selector=submit_selector) + elif not isinstance(submit_selector, Click): + raise TypeError( + f"Submit selector must be str or Click, got {type(submit_selector)}" + ) + return cls(recaptcha_solving, submit_selectors) + + def process_request(self, request, spider): + if request.meta.get("dont_recaptcha", False): + return None + + if isinstance(request, PuppeteerRequest): + if request.close_page and not request.meta.get( + "_captcha_submission", False + ): + request.close_page = False + request.dont_filter = True + self._page_closing.add(request) + return request + return None + + def process_response(self, request, response, spider): + if not isinstance( + response, PuppeteerResponse + ): # We only work with PuppeteerResponses + return response + + puppeteer_request = response.puppeteer_request + if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses + return response + + if puppeteer_request.meta.pop( + "_captcha_submission", False + ): # Submitted captcha + return self.__gen_response(response) + + if puppeteer_request.meta.pop("_captcha_solving", False): + # RECaptchaSolver was called by recaptcha middleware + return self._submit_recaptcha(request, response, spider) + + if isinstance( + puppeteer_request.action, + (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), + ): + # No recaptcha after these actions + return response + + # Any puppeteer response besides PuppeteerRecaptchaSolverResponse + return self._solve_recaptcha(request, response) + + def _solve_recaptcha(self, request, response): + self._page_responses[response.page_id] = ( + response # Saving main response to return it later + ) + + recaptcha_solver = RecaptchaSolver( + solve_recaptcha=self.recaptcha_solving, + close_on_empty=self.__is_closing(response, remove_request=False), + ) + return response.follow( + recaptcha_solver, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta={"_captcha_solving": True}, + close_page=False, + ) + + def _submit_recaptcha(self, request, response, spider): + if not response.puppeteer_request.action.solve_recaptcha: + recaptcha_logger.log( + level=logging.INFO, + msg=f"Found {len(response.recaptcha_data['captchas'])} captcha " + f"but did not solve due to argument", + ) + return self.__gen_response(response) + # Click "submit button"? 
+        if response.recaptcha_data["captchas"] and self.submit_selectors:
+            # We need to click the "submit button"
+            for domain, submitting in self.submit_selectors.items():
+                if domain in response.url:
+                    if not submitting.selector:
+                        return self.__gen_response(response)
+                    return response.follow(
+                        action=submitting,
+                        callback=request.callback,
+                        cb_kwargs=request.cb_kwargs,
+                        errback=request.errback,
+                        close_page=self.__is_closing(response),
+                        meta={"_captcha_submission": True},
+                    )
+            raise IgnoreRequest(
+                "Captcha found, but no submit selector matched the page URL"
+            )
+        return self.__gen_response(response)
+
+    def __gen_response(self, response):
+        main_response_data = dict()
+        main_response_data["page_id"] = (
+            None if self.__is_closing(response) else response.puppeteer_request.page_id
+        )
+
+        main_response = self._page_responses.pop(response.page_id)
+
+        if isinstance(main_response, PuppeteerHtmlResponse):
+            if isinstance(response.puppeteer_request.action, RecaptchaSolver):
+                main_response_data["body"] = response.html
+            elif isinstance(response.puppeteer_request.action, Click):
+                main_response_data["body"] = response.body
+
+        return main_response.replace(**main_response_data)
+
+    def __is_closing(self, response, remove_request: bool = True) -> bool:
+        main_request = self._page_responses[response.page_id].puppeteer_request
+        close_page = main_request in self._page_closing
+        if close_page and remove_request:
+            self._page_closing.remove(main_request)
+        return close_page
diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py
new file mode 100644
index 0000000..51f5df5
--- /dev/null
+++ b/scrapypuppeteer/middlewares/restore.py
@@ -0,0 +1,158 @@
+import json
+import logging
+from http import HTTPStatus
+from typing import Dict, Union
+
+from scrapy.crawler import Crawler
+
+from scrapypuppeteer.actions import Compose
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest
+from scrapypuppeteer.response import PuppeteerResponse
+
+
+class PuppeteerContextRestoreDownloaderMiddleware:
+    """
+    This middleware allows you to recover a puppeteer context.
+    It assumes that the restored requests have the same effect as the original requests.
+
+    If you want to recover the puppeteer context starting from a specific first request,
+    provide the `recover_context` meta-key with a `True` value.
+
+    The middleware additionally uses these meta-keys; do not modify them,
+    since changing them will almost certainly break the expected behaviour:
+    `__request_binding`
+
+    Settings:
+
+    RESTORING_LENGTH: int = 1 - number of restorable requests in a sequence.
+    N_RETRY_RESTORING: int = 1 - number of tries to restore a context.
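+
+    Example (enabling the middleware in project settings; values mirror the README):
+
+        DOWNLOADER_MIDDLEWARES = {
+            'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041,
+            'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042,
+        }
+        RESTORING_LENGTH = 2
+        N_RETRY_RESTORING = 3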
+ """ + + restore_logger = logging.getLogger(__name__) + + N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" + RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" + + def __init__(self, restoring_length: int, n_retry_restoring: int): + self.restoring_length = restoring_length + self.n_retry_restoring = n_retry_restoring + self.context_actions: Dict[str, Compose] = {} + + @classmethod + def from_crawler(cls, crawler: Crawler): + restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) + if not isinstance(restoring_length, int): + raise TypeError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}" + ) + elif restoring_length < 1: + raise ValueError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1, got {restoring_length}" + ) + + n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) + if not isinstance(n_retry_restoring, int): + raise TypeError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}" + ) + elif n_retry_restoring < 1: + raise ValueError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1, got {n_retry_restoring}" + ) + + return cls(restoring_length, n_retry_restoring) + + def process_request(self, request, spider): + if not isinstance(request, PuppeteerRequest): + return None + + if not request.meta.pop("recover_context", False): + return None + + if request.context_id or request.page_id: + self.restore_logger.warning( + f"Request {request} is not in the beginning of the request-response sequence." + "Cannot 'restore' this sequence, skipping." + ) + return None + + request.meta["__request_binding"] = True + return None + + def process_response(self, request, response, spider): + puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get( + "puppeteer_request", None + ) + request_binding = puppeteer_request is not None and puppeteer_request.meta.get( + "__request_binding", False + ) + + if isinstance(response, PuppeteerResponse): + if request_binding: + self.context_actions[response.context_id] = Compose(request.action) + elif response.context_id in self.context_actions: + # Update actions in context + self._update_context_actions(request, response) + elif ( + puppeteer_request is not None + and response.status == HTTPStatus.UNPROCESSABLE_ENTITY + ): + # One PuppeteerRequest has failed with 422 error + if request_binding: + # Could not get context, retry + if ( + request.meta.get("__request_binding_count", 0) + < self.n_retry_restoring + ): + new_request = request.copy() + new_request.meta["__request_binding_count"] += 1 + return new_request + else: + return self._restore_context(puppeteer_request, response) + return response + + def _update_context_actions( + self, request: ActionRequest, response: PuppeteerResponse + ): + context_id = response.context_id + context_actions = self.context_actions[context_id] + + if len(context_actions.actions) > self.restoring_length: + self.__delete_context( + context_id, + f"Too many actions in context ({context_id}). 
Deleting it.", + ) + else: + self.context_actions[response.context_id] = Compose( + context_actions, + request.action, + ) + + def _restore_context(self, puppeteer_request: PuppeteerRequest, response): + context_id = json.loads(response.text).get("contextId", None) + + if context_id in self.context_actions: + # Restoring + restoring_request = puppeteer_request.replace( + action=Compose( + self.context_actions.pop(context_id), puppeteer_request.action + ), + context_id=None, + page_id=None, + ) + restoring_request.meta["__request_binding"] = True + self.restore_logger.log( + level=logging.DEBUG, + msg=f"Restoring the context with context_id {context_id}", + ) + return restoring_request + + self.restore_logger.warning(f"Context_id {context_id} not in context_actions.") + return response + + def __delete_context(self, context_id: str, reason: Union[str, None]): + del self.context_actions[context_id] + + if reason is not None: + self.restore_logger.log(level=logging.INFO, msg=reason) diff --git a/scrapypuppeteer/middlewares/service.py b/scrapypuppeteer/middlewares/service.py new file mode 100644 index 0000000..b34d66e --- /dev/null +++ b/scrapypuppeteer/middlewares/service.py @@ -0,0 +1,111 @@ +import logging +from collections import defaultdict +from typing import List, Union + +from scrapy import signals +from scrapy.crawler import Crawler + +from scrapypuppeteer.browser_managers import BrowserManager +from scrapypuppeteer.browser_managers.playwright_browser_manager import ( + PlaywrightBrowserManager, +) +from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( + PyppeteerBrowserManager, +) +from scrapypuppeteer.browser_managers.service_browser_manager import ( + ServiceBrowserManager, +) + + +class PuppeteerServiceDownloaderMiddleware: + """ + This downloader middleware converts PuppeteerRequest instances to + Puppeteer service API requests and then converts its responses to + PuppeteerResponse instances. Additionally, it tracks all browser contexts + that spider uses and performs cleanup request to service right before + spider is closed. + + Additionally, the middleware uses these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' + + Settings: + + PUPPETEER_SERVICE_URL (str) + Service URL, e.g. 'http://localhost:3000' + + PUPPETEER_INCLUDE_HEADERS (bool|list[str]) + Determines which request headers will be sent to remote site by puppeteer service. + Either True (all headers), False (no headers) or list of header names. + May be overridden per request. + By default, only cookies are sent. + + PUPPETEER_INCLUDE_META (bool) + Determines whether to send or not user's meta attached by user. + Default to False. 
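+
+    Example (an illustrative settings sketch; the URL matches the example above,
+    the header list and execution method are hypothetical choices):
+
+        PUPPETEER_SERVICE_URL = 'http://localhost:3000'
+        PUPPETEER_INCLUDE_HEADERS = ['Cookie', 'Referer']
+        PUPPETEER_INCLUDE_META = False
+        EXECUTION_METHOD = 'puppeteer'  # or 'pyppeteer', 'playwright'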
+ """ + + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" + + service_logger = logging.getLogger(__name__) + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + browser_manager: BrowserManager, + ): + self.service_base_url = service_url + self.include_headers = include_headers + self.include_meta = include_meta + self.crawler = crawler + self.used_contexts = defaultdict(set) + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler): + service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + if cls.INCLUDE_HEADERS_SETTING in crawler.settings: + try: + include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) + except ValueError: + include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) + else: + include_headers = cls.DEFAULT_INCLUDE_HEADERS + include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + + execution_method = crawler.settings.get( + cls.EXECUTION_METHOD_SETTING, "PUPPETEER" + ).lower() + + if execution_method == "pyppeteer": + browser_manager = PyppeteerBrowserManager() + elif execution_method == "puppeteer": + browser_manager = ServiceBrowserManager( + service_url, include_meta, include_headers, crawler + ) + elif execution_method == "playwright": + browser_manager = PlaywrightBrowserManager() + else: + raise NameError("Wrong EXECUTION_METHOD") + + middleware = cls( + crawler, service_url, include_headers, include_meta, browser_manager + ) + crawler.signals.connect( + middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + ) + return middleware + + def process_request(self, request, spider): + return self.browser_manager.process_request(request) + + def process_response(self, request, response, spider): + return self.browser_manager.process_response(self, request, response, spider) diff --git a/setup.py b/setup.py index b9b7750..50b3597 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ def read_long_description(file_path): setup( name="scrapy-puppeteer-client", - version="0.3.8", + version="0.3.9", description="A library to use Puppeteer-managed browser in Scrapy spiders", long_description=read_long_description("README.md"), long_description_content_type="text/markdown",