diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
index fc0d5c8..558a95d 100644
--- a/.github/workflows/python-test.yml
+++ b/.github/workflows/python-test.yml
@@ -8,12 +8,12 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: "3.7.x" # Min Python version (No 3.6 version in GitHub repository)
-          - python-version: "3.8.x"
+          - python-version: "3.8.x" # Min Python version (No 3.7 version in GitHub repository)
           - python-version: "3.9.x"
           - python-version: "3.10.x"
           - python-version: "3.11.x"
           - python-version: "3.12.x"
+          - python-version: "3.13.x"
           - python-version: "3.x" # Last Python version
     steps:
       - uses: actions/checkout@v3
diff --git a/README.md b/README.md
index 7daee5a..ab6718f 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,21 @@ DOWNLOADER_MIDDLEWARES = {
 }
 
 PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods
+```
+You may not want to use scrapy-puppeteer-service.
+In this case, you can run locally with Playwright or Pyppeteer.
 
-# To change the execution method, you must add the corresponding setting:
-EXECUTION_METHOD = "Puppeteer"
+```python
+DOWNLOADER_MIDDLEWARES = {
+    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+}
+
+DOWNLOAD_HANDLERS = { # Add browser download handler
+    "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
+    "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler",
+}
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" # The asyncio reactor is required
+EXECUTION_METHOD = "playwright" # Choose execution method
 ```
 
 Available methods: `Puppeteer`, `Pyppeteer`, `Playwright`
diff --git a/examples/settings.py b/examples/settings.py
index 1ac96c6..bdfcff7 100644
--- a/examples/settings.py
+++ b/examples/settings.py
@@ -10,5 +10,3 @@
 }
 
 PUPPETEER_SERVICE_URL = "http://localhost:3000"
-
-PUPPETEER_LOCAL = False
diff --git a/examples/spiders/playwright_test.py b/examples/spiders/playwright_test.py
new file mode 100644
index 0000000..4e0e5a3
--- /dev/null
+++ b/examples/spiders/playwright_test.py
@@ -0,0 +1,67 @@
+from logging import ERROR
+
+import scrapy
+from scrapy.utils.log import failure_to_exc_info
+from twisted.python.failure import Failure
+
+from scrapypuppeteer import (
+    PuppeteerRequest,
+    PuppeteerResponse,
+    PuppeteerScreenshotResponse,
+)
+from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll
+
+
+class PlaywrightSpider(scrapy.Spider):
+    """
+    Mostly, it is a Compose spider, but it is very convenient for Playwright testing.
+ """ + + name = "playwright_test" + + custom_settings = { + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, + }, + "DOWNLOAD_HANDLERS": { + "http": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + "https": "scrapypuppeteer.browser_managers.browser_downloader_handler.BrowserDownloaderHandler", + }, + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "EXECUTION_METHOD": "playwright", + } + + def start_requests(self): + goto = GoTo("https://pptr.dev") + click_1 = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" + ) + click_2 = Click( + "#__docusaurus_skipToContent_fallback > div > div > aside > div > " + "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a" + ) + click = Compose(click_1, click_2) + scroll = Scroll() + screenshot = Screenshot(options={"full_page": True, "type": "jpeg"}) + + compose_action = Compose( + goto, + click, + scroll, + screenshot, + ) + + yield PuppeteerRequest( + compose_action, + callback=self.parse, + errback=self.errback, + close_page=True, + ) + + def parse(self, response: PuppeteerResponse): + assert isinstance(response, PuppeteerScreenshotResponse) + self.log("Spider worked fine!") + + def errback(self, failure: Failure): + print(failure) + self.log(failure_to_exc_info(failure), level=ERROR) diff --git a/requirements.txt b/requirements.txt index 7846630..f30cc9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,2 @@ scrapy>=2.6 -pyppeteer -syncer -bs4 -playwright \ No newline at end of file +playwright diff --git a/scrapypuppeteer/browser_managers/__init__.py b/scrapypuppeteer/browser_managers/__init__.py index c7f77b3..20985d3 100644 --- a/scrapypuppeteer/browser_managers/__init__.py +++ b/scrapypuppeteer/browser_managers/__init__.py @@ -1,17 +1,89 @@ -__all__ = ["BrowserManager"] +__all__ = ["BrowserManager", "ContextManager"] +import uuid from abc import ABC, abstractmethod +from collections.abc import Coroutine +from typing import Dict, Union + +from scrapy import Request +from scrapy.utils.defer import deferred_from_coro +from twisted.internet.defer import Deferred + +from scrapypuppeteer import CloseContextRequest + + +class ContextManager(ABC): + def __init__(self, browser): + self.browser = browser + self.contexts: Dict[str, ...] = {} + self.pages: Dict[str, ...] = {} + self.context2page: Dict[str, str] = {} + + @classmethod + @abstractmethod + async def async_init(cls): ... + + @staticmethod + @abstractmethod + async def _create_context(browser): ... + + @staticmethod + @abstractmethod + async def _create_page(context): ... 
+
+    async def check_context_and_page(self, context_id, page_id):
+        if not context_id or not page_id:
+            context_id, page_id = await self.open_new_page()
+        return context_id, page_id
+
+    async def open_new_page(self):
+        context_id = uuid.uuid4().hex.upper()
+        page_id = uuid.uuid4().hex.upper()
+
+        self.contexts[context_id] = await self._create_context(self.browser)
+        self.pages[page_id] = await self._create_page(self.contexts[context_id])
+        self.context2page[context_id] = page_id
+
+        return context_id, page_id
+
+    def get_page_by_id(self, context_id, page_id):
+        return self.pages[page_id]
+
+    async def close_browser(self):
+        if self.browser:
+            await self.browser.close()
+
+    async def close_contexts(self, request: CloseContextRequest):
+        for context_id in request.contexts:
+            if context_id in self.contexts:
+                await self.contexts[context_id].close()
+                page_id = self.context2page.get(context_id)
+                self.pages.pop(page_id, None)
+
+                del self.contexts[context_id]
+                del self.context2page[context_id]
 
 
 class BrowserManager(ABC):
     @abstractmethod
-    def process_request(self, request, spider):
-        pass
+    def _download_request(
+        self, request: Request, spider
+    ) -> Union[Coroutine, Request]: ...
 
     @abstractmethod
-    def close_used_contexts(self):
-        pass
+    async def _start_browser_manager(self) -> None: ...
 
     @abstractmethod
-    def process_response(self, middleware, request, response, spider):
-        pass
+    async def _stop_browser_manager(self) -> None: ...
+
+    def download_request(self, request: Request, spider) -> Union[Deferred, Request]:
+        coro_or_request = self._download_request(request, spider)
+        if isinstance(coro_or_request, Coroutine):
+            return deferred_from_coro(coro_or_request)
+        return coro_or_request
+
+    def start_browser_manager(self) -> Deferred:
+        return deferred_from_coro(self._start_browser_manager())
+
+    def stop_browser_manager(self) -> Deferred:
+        return deferred_from_coro(self._stop_browser_manager())
diff --git a/scrapypuppeteer/browser_managers/browser_downloader_handler.py b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
new file mode 100644
index 0000000..fc96801
--- /dev/null
+++ b/scrapypuppeteer/browser_managers/browser_downloader_handler.py
@@ -0,0 +1,76 @@
+from scrapy import signals
+from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
+from scrapy.crawler import Crawler
+from scrapy.utils.reactor import verify_installed_reactor
+from twisted.internet.defer import Deferred
+
+from scrapypuppeteer import CloseContextRequest
+from scrapypuppeteer.browser_managers import BrowserManager
+from scrapypuppeteer.browser_managers.playwright_browser_manager import (
+    PlaywrightBrowserManager,
+)
+from scrapypuppeteer.browser_managers.service_browser_manager import (
+    ServiceBrowserManager,
+)
+from scrapypuppeteer.request import ActionRequest
+
+
+class BrowserDownloaderHandler(HTTPDownloadHandler):
+    """
+    Browser downloader handler.
+    If instantiated, it executes the provided requests' actions in the browser.
+    The installed reactor must be `AsyncioSelectorReactor`.
+ + Currently, supports 3 browser types via EXECUTION_METHOD setting: + * puppeteer -- see scrapy-puppeteer-service + * pyppeteer -- not available, since the package is not actively supported + * playwright -- see https://playwright.dev/python/ + """ + + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" + + def __init__(self, settings, browser_manager: BrowserManager, crawler=None) -> None: + super().__init__(settings, crawler=crawler) + verify_installed_reactor( + "twisted.internet.asyncioreactor.AsyncioSelectorReactor" + ) + + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler: Crawler): + settings = crawler.settings + + execution_method = crawler.settings.get( + cls.EXECUTION_METHOD_SETTING, "PUPPETEER" + ).lower() + + match execution_method: + case "puppeteer": + browser_manager = ServiceBrowserManager() + case "pyppeteer": + raise ValueError( + "Currently, pyppeteer cannot run on some machines since it is not actively supported." + ) + case "playwright": + browser_manager = PlaywrightBrowserManager() + case _: + raise ValueError( + f"Invalid execution method: {execution_method.upper()}" + ) + + bdh = cls(settings, browser_manager, crawler=crawler) + crawler.signals.connect( + bdh.browser_manager.start_browser_manager, signals.spider_opened + ) # This makes the start VERY slow + crawler.signals.connect( + bdh.browser_manager.stop_browser_manager, signals.engine_stopped + ) + return bdh + + def download_request(self, request, spider): + if isinstance(request, (ActionRequest, CloseContextRequest)): + dfd_or_request = self.browser_manager.download_request(request, spider) + if isinstance(dfd_or_request, Deferred): + return dfd_or_request + return super().download_request(request, spider) diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py index 1228e29..d41e558 100644 --- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py +++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py @@ -1,66 +1,40 @@ import asyncio import base64 -import uuid +from json import dumps +from typing import Any, Awaitable, Callable, Dict, Union -import syncer -from playwright.async_api import async_playwright +from playwright.async_api import Page, async_playwright +from scrapy.http import TextResponse -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest -from scrapypuppeteer.response import ( - PuppeteerHtmlResponse, - PuppeteerScreenshotResponse, -) +from scrapypuppeteer import PuppeteerRequest +from scrapypuppeteer.browser_managers import BrowserManager, ContextManager +from scrapypuppeteer.request import ActionRequest, CloseContextRequest -class ContextManager: - def __init__(self): - self.browser = syncer.sync(self.launch_browser()) - self.contexts = {} - self.pages = {} - self.context_page_map = {} - - async def launch_browser(self): +class PlaywrightContextManager(ContextManager): + @classmethod + async def async_init(cls): playwright = await async_playwright().start() - return await playwright.chromium.launch(headless=False) - - async def check_context_and_page(self, context_id, page_id): - if not context_id or not page_id: - context_id, page_id = await self.open_new_page() - return context_id, page_id - - async def open_new_page(self): - context_id = uuid.uuid4().hex.upper() - page_id = uuid.uuid4().hex.upper() - - self.contexts[context_id] = await self.browser.new_context() - 
self.pages[page_id] = await self.contexts[context_id].new_page() - self.context_page_map[context_id] = page_id + browser = await playwright.chromium.launch(headless=False) + return cls(browser) - return context_id, page_id + @staticmethod + async def _create_context(browser): + return await browser.new_context() - def get_page_by_id(self, context_id, page_id): - return self.pages[page_id] - - def close_browser(self): - if self.browser: - syncer.sync(self.browser.close()) - - def close_contexts(self, request: CloseContextRequest): - for context_id in request.contexts: - if context_id in self.contexts: - syncer.sync(self.contexts[context_id].close()) - page_id = self.context_page_map.get(context_id) - self.pages.pop(page_id, None) - - del self.contexts[context_id] - del self.context_page_map[context_id] + @staticmethod + async def _create_page(context): + return await context.new_page() class PlaywrightBrowserManager(BrowserManager): def __init__(self): - self.context_manager = ContextManager() - self.action_map = { + self.context_manager: Union[PlaywrightContextManager, None] = ( + None # Will be initialized later + ) + self.action_map: Dict[ + str, Callable[[Page, ActionRequest], Awaitable[Dict[str, Any]]] + ] = { "goto": self.goto, "click": self.click, "compose": self.compose, @@ -74,24 +48,68 @@ def __init__(self): "fill_form": self.fill_form, } - def process_request(self, request): - if isinstance(request, PuppeteerRequest): - endpoint = request.action.endpoint - action_function = self.action_map.get(endpoint) - if action_function: - return action_function(request) - + def _download_request(self, request, spider): + if isinstance(request, ActionRequest): + return self.__perform_action(request) if isinstance(request, CloseContextRequest): return self.close_contexts(request) - def close_contexts(self, request: CloseContextRequest): - self.context_manager.close_contexts(request) + async def _start_browser_manager(self) -> None: + self.context_manager = await PlaywrightContextManager.async_init() + + async def _stop_browser_manager(self) -> None: + if self.context_manager: + await self.context_manager.close_browser() + + async def __perform_action(self, request: ActionRequest): + pptr_request: PuppeteerRequest = request.meta["puppeteer_request"] + endpoint = request.action.endpoint + action_function = self.action_map.get(endpoint) + if action_function: + context_id, page_id = await self.context_manager.check_context_and_page( + pptr_request.context_id, pptr_request.page_id + ) + page = self.context_manager.get_page_by_id(context_id, page_id) + + try: + response_data = await action_function(page, request) + except Exception as e: + return TextResponse( + request.url, + headers={"Content-Type": "application/json"}, + body=dumps( + { + "error": str(e), + "contextId": context_id, + "pageId": page_id, + } + ), + status=500, + encoding="utf-8", + ) - def close_used_contexts(self): - self.context_manager.close_browser() + response_data["contextId"] = context_id + response_data["pageId"] = page_id + return TextResponse( + request.url, + headers={"Content-Type": "application/json"}, + body=dumps(response_data), + encoding="utf-8", + ) + raise ValueError(f"No such action: {endpoint}") + + async def close_contexts(self, request: CloseContextRequest) -> TextResponse: + await self.context_manager.close_contexts(request) + return TextResponse( + request.url, + encoding="utf-8", + status=200, + headers={}, + body=b"Successfully closed context", + ) - def process_response(self, middleware, request, 
response, spider): - return response + async def close_used_contexts(self): + await self.context_manager.close_browser() def map_navigation_options(self, navigation_options): if not navigation_options: @@ -121,6 +139,10 @@ def map_navigation_options(self, navigation_options): ) elif isinstance(waitUntil, str): strictest_event = waitUntil + else: + raise TypeError( + f"waitUntil should be a list or a string, got {type(waitUntil)}" + ) if strictest_event in event_map: mapped_navigation_options["wait_until"] = event_map[strictest_event] @@ -177,201 +199,133 @@ async def wait_with_options(self, page, wait_options): elif timeout: await asyncio.sleep(timeout / 1000) - def get_page_from_request(self, request): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - return ( - self.context_manager.get_page_by_id(context_id, page_id), - context_id, - page_id, + async def goto(self, page: Page, request: ActionRequest): + url = request.action.payload()["url"] + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options ) + await page.goto(url, **navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + + return { + "html": response_html, + "cookies": cookies, + } - def goto(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) - - async def async_goto(): - url = request.action.payload()["url"] - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.goto(url, **navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_goto()) - - def click(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) - - async def async_click(): - selector = request.action.payload().get("selector") - cookies = request.cookies - click_options = self.map_click_options(request.action.click_options) - await page.click(selector, **click_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_click()) - - def go_back(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) - - async def async_go_back(): - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.go_back(**navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_back()) - - def go_forward(self, request: PuppeteerRequest): - page, context_id, page_id = 
self.get_page_from_request(request) - - async def async_go_forward(): - cookies = request.cookies - navigation_options = self.map_navigation_options( - request.action.navigation_options - ) - await page.go_forward(**navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_forward()) - - def screenshot(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) - - async def async_screenshot(): - screenshot_options = request.action.options or {} - screenshot_bytes = await page.screenshot( - **self.map_screenshot_options(screenshot_options) - ) - screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") - return PuppeteerScreenshotResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - screenshot=screenshot_base64, - ) - - return syncer.sync(async_screenshot()) - - def scroll(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) + async def click(self, page: Page, request: ActionRequest): + selector = request.action.payload().get("selector") + cookies = request.cookies + click_options = self.map_click_options(request.action.click_options) + await page.click(selector, **click_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + + return { + "html": response_html, + "cookies": cookies, + } - async def async_scroll(): - cookies = request.cookies - selector = request.action.payload().get("selector", None) + async def go_back(self, page: Page, request: ActionRequest): + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options + ) + await page.go_back(**navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + + return { + "html": response_html, + "cookies": cookies, + } - if selector: - script = f""" - document.querySelector('{selector}').scrollIntoView(); - """ - else: - script = """ - window.scrollBy(0, document.body.scrollHeight); - """ - await page.evaluate(script) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) + async def go_forward(self, page: Page, request: ActionRequest): + cookies = request.cookies + navigation_options = self.map_navigation_options( + request.action.navigation_options + ) + await page.go_forward(**navigation_options) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + + return { + "html": response_html, + "cookies": cookies, + } - return syncer.sync(async_scroll()) + async def screenshot(self, page: Page, request: ActionRequest): + screenshot_options = request.action.options or {} + screenshot_bytes = await page.screenshot( + **self.map_screenshot_options(screenshot_options) + ) + 
screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") + return { + "screenshot": screenshot_base64, + } - def fill_form(self, request: PuppeteerRequest): - page, context_id, page_id = self.get_page_from_request(request) + async def scroll(self, page: Page, request: ActionRequest): + cookies = request.cookies + selector = request.action.payload().get("selector", None) - async def async_fill_form(): - input_mapping = request.action.payload().get("inputMapping") - submit_button = request.action.payload().get("submitButton", None) - cookies = request.cookies + if selector: + script = f""" + document.querySelector('{selector}').scrollIntoView(); + """ + else: + script = """ + window.scrollBy(0, document.body.scrollHeight); + """ + + await page.evaluate(script) + wait_options = request.action.payload().get("waitOptions", {}) or {} + await self.wait_with_options(page, wait_options) + response_html = await page.content() + + return { + "html": response_html, + "cookies": cookies, + } - for selector, params in input_mapping.items(): - text = params.get("value", None) - delay = params.get("delay", 0) - await page.type(selector, text=text, delay=delay) + @staticmethod + async def fill_form(page: Page, request: ActionRequest): + input_mapping = request.action.payload().get("inputMapping") + submit_button = request.action.payload().get("submitButton", None) + cookies = request.cookies - if submit_button: - await page.click(submit_button) + for selector, params in input_mapping.items(): + text = params.get("value", None) + delay = params.get("delay", 0) + await page.type(selector, text=text, delay=delay) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) + if submit_button: + await page.click(submit_button) - return syncer.sync(async_fill_form()) + response_html = await page.content() - def compose(self, request: PuppeteerRequest): - _, context_id, page_id = self.get_page_from_request(request) - request.page_id = page_id - request.context_id = context_id + return { + "html": response_html, + "cookies": cookies, + } + async def compose(self, page: Page, request: ActionRequest): for action in request.action.actions: - response = self.action_map[action.endpoint](request.replace(action=action)) - return response.replace(puppeteer_request=request) + response_data = await self.action_map[action.endpoint]( + page, + request.replace(action=action), + ) + return response_data - def action(self, request: PuppeteerRequest): - raise ValueError("CustomJsAction is not available in local mode") + async def action(self, request: ActionRequest): + raise NotImplementedError("CustomJsAction is not available in local mode") - def recaptcha_solver(self, request: PuppeteerRequest): - raise ValueError("RecaptchaSolver is not available in local mode") + async def recaptcha_solver(self, request: ActionRequest): + raise NotImplementedError("RecaptchaSolver is not available in local mode") - def har(self, request: PuppeteerRequest): - raise ValueError("Har is not available in local mode") + async def har(self, request: ActionRequest): + raise NotImplementedError("Har is not available in local mode") diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py index 6998e0c..673c437 100644 --- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py +++ 
b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py @@ -1,340 +1 @@ -import asyncio -import base64 -import uuid - -import syncer -from pyppeteer import launch - -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest -from scrapypuppeteer.response import ( - PuppeteerHtmlResponse, - PuppeteerScreenshotResponse, -) - - -class ContextManager: - def __init__(self): - self.browser = syncer.sync(launch()) - self.contexts = {} - self.pages = {} - self.context_page_map = {} - - async def check_context_and_page(self, context_id, page_id): - if not context_id or not page_id: - context_id, page_id = await self.open_new_page() - return context_id, page_id - - async def open_new_page(self): - context_id = uuid.uuid4().hex.upper() - page_id = uuid.uuid4().hex.upper() - - self.contexts[context_id] = await self.browser.createIncognitoBrowserContext() - self.pages[page_id] = await self.contexts[context_id].newPage() - self.context_page_map[context_id] = page_id - - return context_id, page_id - - def get_page_by_id(self, context_id, page_id): - return self.pages[page_id] - - def close_browser(self): - if self.browser: - syncer.sync(self.browser.close()) - - def close_contexts(self, request: CloseContextRequest): - for context_id in request.contexts: - if context_id in self.contexts: - syncer.sync(self.contexts[context_id].close()) - page_id = self.context_page_map.get(context_id) - self.pages.pop(page_id, None) - - del self.contexts[context_id] - del self.context_page_map[context_id] - - -class PyppeteerBrowserManager(BrowserManager): - def __init__(self): - self.context_manager = ContextManager() - self.action_map = { - "goto": self.goto, - "click": self.click, - "compose": self.compose, - "back": self.go_back, - "forward": self.go_forward, - "scroll": self.scroll, - "screenshot": self.screenshot, - "action": self.action, - "recaptcha_solver": self.recaptcha_solver, - "har": self.har, - "fill_form": self.fill_form, - } - - def process_request(self, request): - if isinstance(request, PuppeteerRequest): - endpoint = request.action.endpoint - action_function = self.action_map.get(endpoint) - if action_function: - return action_function(request) - - if isinstance(request, CloseContextRequest): - return self.close_contexts(request) - - def close_contexts(self, request: CloseContextRequest): - self.context_manager.close_contexts(request) - - def close_used_contexts(self): - self.context_manager.close_browser() - - def process_response(self, middleware, request, response, spider): - return response - - async def wait_with_options(self, page, wait_options: dict): - selector = wait_options.get("selector") - xpath = wait_options.get("xpath") - timeout = wait_options.get("timeout", None) - options = wait_options.get("options", {}) - - selector_or_timeout = wait_options.get("selectorOrTimeout") - if selector_or_timeout: - if isinstance(selector_or_timeout, (int, float)): - timeout = selector_or_timeout - elif isinstance(selector_or_timeout, str): - if selector_or_timeout.startswith("//"): - xpath = selector_or_timeout - else: - selector = selector_or_timeout - - if len([item for item in [selector, xpath, timeout] if item]) > 1: - raise ValueError( - "Wait options must contain either a selector, an xpath, or a timeout" - ) - - if selector: - await page.waitForSelector(selector, options) - elif xpath: - await page.waitForXPath(xpath, options) - elif timeout: - await asyncio.sleep(timeout / 1000) - - def goto(self, request: 
PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_goto(): - url = request.action.payload()["url"] - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goto(url, navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_goto()) - - def click(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_click(): - selector = request.action.payload().get("selector") - cookies = request.cookies - click_options = request.action.click_options or {} - navigation_options = request.action.navigation_options or {} - options = {**click_options, **navigation_options} - await page.click(selector, options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_click()) - - def go_back(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_go_back(): - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goBack(navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_back()) - - def go_forward(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_go_forward(): - cookies = request.cookies - navigation_options = request.action.navigation_options - await page.goForward(navigation_options) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_go_forward()) - - def screenshot(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_screenshot(): - request_options = request.action.options or {} - screenshot_options = {"encoding": "binary"} - 
screenshot_options.update(request_options) - screenshot_bytes = await page.screenshot(screenshot_options) - screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") - return PuppeteerScreenshotResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - screenshot=screenshot_base64, - ) - - return syncer.sync(async_screenshot()) - - def scroll(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_scroll(): - cookies = request.cookies - selector = request.action.payload().get("selector", None) - - if selector: - script = f""" - document.querySelector('{selector}').scrollIntoView(); - """ - else: - script = """ - window.scrollBy(0, document.body.scrollHeight); - """ - await page.evaluate(script) - wait_options = request.action.payload().get("waitOptions", {}) or {} - await self.wait_with_options(page, wait_options) - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_scroll()) - - def fill_form(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - page = self.context_manager.get_page_by_id(context_id, page_id) - - async def async_fill_form(): - input_mapping = request.action.payload().get("inputMapping") - submit_button = request.action.payload().get("submitButton", None) - cookies = request.cookies - - for selector, params in input_mapping.items(): - value = params.get("value", None) - delay = params.get("delay", 0) - await page.type(selector, value, {"delay": delay}) - - if submit_button: - await page.click(submit_button) - - response_html = await page.content() - return PuppeteerHtmlResponse( - request.url, - request, - context_id=context_id, - page_id=page_id, - html=response_html, - cookies=cookies, - ) - - return syncer.sync(async_fill_form()) - - def compose(self, request: PuppeteerRequest): - context_id, page_id = syncer.sync( - self.context_manager.check_context_and_page( - request.context_id, request.page_id - ) - ) - request.page_id = page_id - request.context_id = context_id - - for action in request.action.actions: - response = self.action_map[action.endpoint](request.replace(action=action)) - return response.replace(puppeteer_request=request) - - def action(self, request: PuppeteerRequest): - raise ValueError("CustomJsAction is not available in local mode") - - def recaptcha_solver(self, request: PuppeteerRequest): - raise ValueError("RecaptchaSolver is not available in local mode") - - def har(self, request: PuppeteerRequest): - raise ValueError("Har is not available in local mode") +# TODO: wait for fixing the `pyppeteer` package diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py index f016f14..22051c1 100644 --- a/scrapypuppeteer/browser_managers/service_browser_manager.py +++ b/scrapypuppeteer/browser_managers/service_browser_manager.py @@ -1,232 +1,15 @@ -import json -import logging -from collections import defaultdict -from urllib.parse import urlencode, urljoin - -from scrapy.exceptions import DontCloseSpider -from scrapy.http import Headers, Response, TextResponse -from scrapy.utils.log 
import failure_to_exc_info -from twisted.python.failure import Failure - -from scrapypuppeteer.actions import ( - Click, - Compose, - FillForm, - GoBack, - GoForward, - GoTo, - Har, - RecaptchaSolver, - Screenshot, - Scroll, -) -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest -from scrapypuppeteer.response import ( - PuppeteerHarResponse, - PuppeteerHtmlResponse, - PuppeteerJsonResponse, - PuppeteerRecaptchaSolverResponse, - PuppeteerScreenshotResponse, -) +from . import BrowserManager class ServiceBrowserManager(BrowserManager): - def __init__(self, service_base_url, include_meta, include_headers, crawler): - self.service_base_url = service_base_url - self.include_meta = include_meta - self.include_headers = include_headers - self.used_contexts = defaultdict(set) - self.service_logger = logging.getLogger(__name__) - self.crawler = crawler - - if self.service_base_url is None: - raise ValueError("Puppeteer service URL must be provided") - - def process_request(self, request): - if isinstance(request, CloseContextRequest): - return self.process_close_context_request(request) - - if isinstance(request, PuppeteerRequest): - return self.process_puppeteer_request(request) - - def process_close_context_request(self, request: CloseContextRequest): - if not request.is_valid_url: - return request.replace( - url=urljoin(self.service_base_url, "/close_context"), - ) - - def process_puppeteer_request(self, request: PuppeteerRequest): - action = request.action - service_url = urljoin(self.service_base_url, action.endpoint) - service_params = self._encode_service_params(request) - if service_params: - service_url += "?" + service_params - meta = { - "puppeteer_request": request, - "dont_obey_robotstxt": True, - "proxy": None, - } - if self.include_meta: - meta = {**request.meta, **meta} - action_request = ActionRequest( - url=service_url, - action=action, - method="POST", - headers=Headers({"Content-Type": action.content_type}), - body=self._serialize_body(action, request), - dont_filter=True, - cookies=request.cookies, - priority=request.priority, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta=meta, - ) - return action_request - - @staticmethod - def _encode_service_params(request): - service_params = {} - if request.context_id is not None: - service_params["contextId"] = request.context_id - if request.page_id is not None: - service_params["pageId"] = request.page_id - if request.close_page: - service_params["closePage"] = 1 - return urlencode(service_params) - - def _serialize_body(self, action, request): - payload = action.payload() - if action.content_type == "application/json": - payload = self.__clean_payload(payload) - proxy = request.meta.get("proxy") - if proxy: - payload["proxy"] = proxy - include_headers = ( - self.include_headers - if request.include_headers is None - else request.include_headers - ) - if include_headers: - headers = request.headers.to_unicode_dict() - if isinstance(include_headers, list): - headers = { - h.lower(): headers[h] for h in include_headers if h in headers - } - payload["headers"] = headers - return json.dumps(payload) - return str(payload) - - def __clean_payload(self, payload): - """ - disallow null values in request parameters - """ - if isinstance(payload, dict): - payload = { - k: self.__clean_payload(v) for k, v in payload.items() if v is not None - } - elif isinstance(payload, list): - payload = 
[self.__clean_payload(v) for v in payload if v is not None] - return payload - - def close_used_contexts(self, spider): - contexts = list(self.used_contexts.pop(id(spider), set())) - if contexts: - request = CloseContextRequest( - contexts, - meta={"proxy": None}, - ) - - def handle_close_contexts_result(result): - if isinstance(result, Response): - if result.status == 200: - self.service_logger.debug( - f"Successfully closed {len(request.contexts)} " - f"contexts with request {result.request}" - ) - else: - self.service_logger.warning( - f"Could not close contexts: {result.text}" - ) - elif isinstance(result, Failure): - self.service_logger.warning( - f"Could not close contexts: {result.value}", - exc_info=failure_to_exc_info(result), - ) - - dfd = self.crawler.engine.download(request) - dfd.addBoth(handle_close_contexts_result) - - raise DontCloseSpider() - - def process_response(self, middleware, request, response, spider): - if not isinstance(response, TextResponse): - return response - - puppeteer_request = request.meta.get("puppeteer_request") - if puppeteer_request is None: - return response - - if b"application/json" not in response.headers.get(b"Content-Type", b""): - return response.replace(request=request) - - response_data = json.loads(response.text) - if response.status != 200: - reason = response_data.pop("error", f"undefined, status {response.status}") - middleware.service_logger.warning( - f"Request {request} is not succeeded. Reason: {reason}" - ) - context_id = response_data.get("contextId") - if context_id: - self.used_contexts[id(spider)].add(context_id) - return response - - response_cls = self._get_response_class(puppeteer_request.action) - - return self._form_response( - response_cls, - response_data, - puppeteer_request.url, - request, - puppeteer_request, - spider, - ) + def __init__(self): + super().__init__() - def _form_response( - self, - response_cls, - response_data, - url, - request, - puppeteer_request, - spider, - ): - context_id = response_data.pop("contextId", puppeteer_request.context_id) - page_id = response_data.pop("pageId", puppeteer_request.page_id) - self.used_contexts[id(spider)].add(context_id) + def _download_request(self, request, spider): + return request - return response_cls( - url=url, - puppeteer_request=puppeteer_request, - context_id=context_id, - page_id=page_id, - request=request, - **response_data, - ) + async def _start_browser_manager(self) -> None: + return - def _get_response_class(self, request_action): - if isinstance( - request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm) - ): - return PuppeteerHtmlResponse - if isinstance(request_action, Screenshot): - return PuppeteerScreenshotResponse - if isinstance(request_action, Har): - return PuppeteerHarResponse - if isinstance(request_action, RecaptchaSolver): - return PuppeteerRecaptchaSolverResponse - if isinstance(request_action, Compose): - # Response class is a last action's response class - return self._get_response_class(request_action.actions[-1]) - return PuppeteerJsonResponse + async def _stop_browser_manager(self) -> None: + return diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index b051ed0..0417f70 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,32 +1,37 @@ +import json import logging from collections import defaultdict from typing import List, Union +from urllib.parse import urlencode, urljoin from scrapy import signals from scrapy.crawler import Crawler -from scrapy.exceptions import 
IgnoreRequest, NotConfigured +from scrapy.exceptions import DontCloseSpider, IgnoreRequest, NotConfigured +from scrapy.http import Headers, Response, TextResponse +from scrapy.utils.log import failure_to_exc_info +from twisted.python.failure import Failure from scrapypuppeteer.actions import ( Click, + Compose, CustomJsAction, + FillForm, + GoBack, + GoForward, + GoTo, + Har, RecaptchaSolver, Screenshot, Scroll, ) -from scrapypuppeteer.browser_managers import BrowserManager -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - PlaywrightBrowserManager, -) -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, -) -from scrapypuppeteer.browser_managers.service_browser_manager import ( - ServiceBrowserManager, -) from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest from scrapypuppeteer.response import ( + PuppeteerHarResponse, PuppeteerHtmlResponse, + PuppeteerJsonResponse, + PuppeteerRecaptchaSolverResponse, PuppeteerResponse, + PuppeteerScreenshotResponse, ) @@ -63,8 +68,6 @@ class PuppeteerServiceDownloaderMiddleware: SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" - service_logger = logging.getLogger(__name__) def __init__( @@ -73,18 +76,18 @@ def __init__( service_url: str, include_headers: Union[bool, List[str]], include_meta: bool, - browser_manager: BrowserManager, ): self.service_base_url = service_url self.include_headers = include_headers self.include_meta = include_meta self.crawler = crawler self.used_contexts = defaultdict(set) - self.browser_manager = browser_manager @classmethod def from_crawler(cls, crawler): service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + # if service_url is None: + # raise ValueError("Puppeteer service URL must be provided") if cls.INCLUDE_HEADERS_SETTING in crawler.settings: try: include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) @@ -93,35 +96,203 @@ def from_crawler(cls, crawler): else: include_headers = cls.DEFAULT_INCLUDE_HEADERS include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + middleware = cls(crawler, service_url, include_headers, include_meta) + crawler.signals.connect( + middleware.close_used_contexts, signal=signals.spider_idle + ) + return middleware - execution_method = crawler.settings.get( - cls.EXECUTION_METHOD_SETTING, "PUPPETEER" - ).lower() + def process_request(self, request, **_): + if isinstance(request, CloseContextRequest): + return self.process_close_context_request(request) + + if isinstance(request, PuppeteerRequest): + return self.process_puppeteer_request(request) - if execution_method == "pyppeteer": - browser_manager = PyppeteerBrowserManager() - elif execution_method == "puppeteer": - browser_manager = ServiceBrowserManager( - service_url, include_meta, include_headers, crawler + def process_close_context_request(self, request: CloseContextRequest): + if not request.is_valid_url: + return request.replace( + url=urljoin(self.service_base_url, "/close_context"), ) - elif execution_method == "playwright": - browser_manager = PlaywrightBrowserManager() - else: - raise NameError("Wrong EXECUTION_METHOD") - middleware = cls( - crawler, service_url, include_headers, include_meta, browser_manager - ) - crawler.signals.connect( - middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + def process_puppeteer_request(self, request: 
PuppeteerRequest): + action = request.action + service_url = urljoin(self.service_base_url, action.endpoint) + service_params = self._encode_service_params(request) + if service_params: + service_url += "?" + service_params + + meta = { + "puppeteer_request": request, + "dont_obey_robotstxt": True, + "proxy": None, + } + if self.include_meta: + meta = {**request.meta, **meta} + + return ActionRequest( + url=service_url, + action=action, + method="POST", + headers=Headers({"Content-Type": action.content_type}), + body=self._serialize_body(action, request), + dont_filter=True, + cookies=request.cookies, + priority=request.priority, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta=meta, ) - return middleware - def process_request(self, request, spider): - return self.browser_manager.process_request(request) + @staticmethod + def _encode_service_params(request): + service_params = {} + if request.context_id is not None: + service_params["contextId"] = request.context_id + if request.page_id is not None: + service_params["pageId"] = request.page_id + if request.close_page: + service_params["closePage"] = 1 + return urlencode(service_params) + + def _serialize_body(self, action, request): + payload = action.payload() + if action.content_type == "application/json": + payload = self.__clean_payload(payload) + proxy = request.meta.get("proxy") + if proxy: + payload["proxy"] = proxy + include_headers = ( + self.include_headers + if request.include_headers is None + else request.include_headers + ) + if include_headers: + headers = request.headers.to_unicode_dict() + if isinstance(include_headers, list): + headers = { + h.lower(): headers[h] for h in include_headers if h in headers + } + payload["headers"] = headers + return json.dumps(payload) + return str(payload) + + def __clean_payload(self, payload): + """ + disallow null values in request parameters + """ + if isinstance(payload, dict): + payload = { + k: self.__clean_payload(v) for k, v in payload.items() if v is not None + } + elif isinstance(payload, list): + payload = [self.__clean_payload(v) for v in payload if v is not None] + return payload def process_response(self, request, response, spider): - return self.browser_manager.process_response(self, request, response, spider) + if not isinstance(response, TextResponse): + return response + + puppeteer_request = request.meta.get("puppeteer_request") + if puppeteer_request is None: + return response + + if b"application/json" not in response.headers.get(b"Content-Type", b""): + return response.replace(request=request) + + response_data = json.loads(response.text) + if response.status != 200: + reason = response_data.pop("error", f"undefined, status {response.status}") + self.service_logger.warning( + f"Request {request} is not succeeded. 
Reason: {reason}" + ) + context_id = response_data.get("contextId") + if context_id: + self.used_contexts[id(spider)].add(context_id) + return response + + response_cls = self._get_response_class(puppeteer_request.action) + + return self._form_response( + response_cls, + response_data, + puppeteer_request.url, + request, + puppeteer_request, + spider, + ) + + def _form_response( + self, + response_cls, + response_data, + url, + request, + puppeteer_request, + spider, + ): + context_id = response_data.pop("contextId", puppeteer_request.context_id) + page_id = response_data.pop("pageId", puppeteer_request.page_id) + + self.used_contexts[id(spider)].add(context_id) + + return response_cls( + url=url, + puppeteer_request=puppeteer_request, + context_id=context_id, + page_id=page_id, + request=request, + **response_data, + ) + + def _get_response_class(self, request_action): + if isinstance( + request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm) + ): + return PuppeteerHtmlResponse + if isinstance(request_action, Screenshot): + return PuppeteerScreenshotResponse + if isinstance(request_action, Har): + return PuppeteerHarResponse + if isinstance(request_action, RecaptchaSolver): + return PuppeteerRecaptchaSolverResponse + if isinstance(request_action, Compose): + # Response class is a last action's response class + return self._get_response_class(request_action.actions[-1]) + return PuppeteerJsonResponse + + def close_used_contexts(self, spider): + contexts = list(self.used_contexts.pop(id(spider), set())) + if contexts: + request = CloseContextRequest( + contexts, + meta={"proxy": None}, + ) + + def handle_close_contexts_result(result): + if isinstance(result, Response): + if result.status == 200: + self.service_logger.debug( + f"Successfully closed {len(request.contexts)} " + f"contexts with request {result.request}" + ) + else: + self.service_logger.warning( + f"Could not close contexts: {result.text}" + ) + elif isinstance(result, Failure): + self.service_logger.warning( + f"Could not close contexts: {result.value}", + exc_info=failure_to_exc_info(result), + ) + else: + print(f"Not a Response or Failure: {type(result)}, {result}") + + dfd = self.crawler.engine.download(request) + dfd.addBoth(handle_close_contexts_result) + + raise DontCloseSpider() class PuppeteerRecaptchaDownloaderMiddleware: diff --git a/setup.py b/setup.py index b9b7750..56cef92 100644 --- a/setup.py +++ b/setup.py @@ -2,24 +2,22 @@ from setuptools import find_packages, setup - -def read_long_description(file_path): - with open(file_path, "r") as file: - return file.read() +with open("README.md", "r") as readme: + long_description = readme.read() setup( name="scrapy-puppeteer-client", version="0.3.8", description="A library to use Puppeteer-managed browser in Scrapy spiders", - long_description=read_long_description("README.md"), + long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/ispras/scrapy-puppeteer", author="MODIS @ ISP RAS", maintainer="Maksim Varlamov", maintainer_email="varlamov@ispras.ru", packages=find_packages(), - install_requires=["scrapy>=2.6", "pyppeteer", "syncer", "bs4", "playwright"], + install_requires=["scrapy>=2.6", "playwright"], python_requires=">=3.6", license="BSD", classifiers=[