diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 53e88092..c2cb33c9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,13 @@ TBR
 ---
 * Use the new ``web_poet.HttpResponse`` which replaces ``web_poet.ResponseData``.
+* We have these **backward incompatible** changes since
+  ``web_poet.OverrideRule`` follows a different structure:
+
+  * Deprecated ``PerDomainOverridesRegistry`` in favor of the newer
+    ``OverridesRegistry``, which provides a wide variety of features
+    for better URL matching.
+  * This results in a new format for the ``SCRAPY_POET_OVERRIDES`` setting.
 
 0.3.0 (2022-01-28)
 
diff --git a/docs/conf.py b/docs/conf.py
index de2227c2..2e205d04 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -188,7 +188,8 @@ intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None, ),
     'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
-    'web_poet': ('https://web-poet.readthedocs.io/en/stable/', None),
+    'web-poet': ('https://web-poet.readthedocs.io/en/latest/', None),
+    'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None),
 }
 
 autodoc_default_options = {
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 20a82efa..2be5d731 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -9,7 +9,7 @@ system. If that’s not the case, see :ref:`intro-install`.
 
 .. note::
 
-    This tutorial can be followed without reading `web-poet docs`_, but
+    This tutorial can be followed without reading `web-poet`_ docs, but
     for a better understanding it is highly recommended to check them
     first.
 
@@ -26,7 +26,7 @@ This tutorial will walk you through these tasks:
 If you're not already familiar with Scrapy, and want to learn it quickly,
 the `Scrapy Tutorial`_ is a good resource.
 
-.. _web-poet docs: https://web-poet.readthedocs.io/en/stable/
+.. _web-poet: https://web-poet.readthedocs.io/en/stable/
 
 Creating a spider
 =================
@@ -125,8 +125,8 @@ To use ``scrapy-poet``, enable its downloader middleware in ``settings.py``:
 ``BookPage`` class we created previously can be used without ``scrapy-poet``,
 and even without Scrapy (note that imports were from ``web_poet`` so far).
 
-``scrapy-poet`` makes it easy to use ``web-poet`` Page Objects
-(such as BookPage) in Scrapy spiders.
+``scrapy-poet`` makes it easy to use `web-poet`_ Page Objects
+(such as ``BookPage``) in Scrapy spiders.
 
 Changing spider
 ===============
@@ -354,12 +354,10 @@ be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``:
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "toscrape.com": {
-            BookListPage: BTSBookListPage,
-            BookPage: BTSBookPage
-        }
-    }
+    SCRAPY_POET_OVERRIDES = [
+        ("toscrape.com", BTSBookListPage, BookListPage),
+        ("toscrape.com", BTSBookPage, BookPage)
+    ]
 
 The spider is back to life!
 ``SCRAPY_POET_OVERRIDES`` contains rules that override the Page Objects
@@ -390,7 +388,7 @@ to implement new ones:
 
     class BPBookListPage(WebPage):
         def book_urls(self):
-            return self.css('.article-info a::attr(href)').getall()
+            return self.css('article.post h4 a::attr(href)').getall()
 
 
     class BPBookPage(ItemWebPage):
@@ -398,7 +396,7 @@ to implement new ones:
         def to_item(self):
             return {
                 'url': self.url,
-                'name': self.css(".book-data h4::text").get().strip(),
+                'name': self.css("body div > h1::text").get().strip(),
             }
 
 The last step is configuring the overrides so that these new Page Objects
@@ -408,32 +406,82 @@ are used for the domain
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "toscrape.com": {
-            BookListPage: BTSBookListPage,
-            BookPage: BTSBookPage
-        },
-        "bookpage.com": {
-            BookListPage: BPBookListPage,
-            BookPage: BPBookPage
-        }
-    }
+    SCRAPY_POET_OVERRIDES = [
+        ("toscrape.com", BTSBookListPage, BookListPage),
+        ("toscrape.com", BTSBookPage, BookPage),
+        ("bookpage.com", BPBookListPage, BookListPage),
+        ("bookpage.com", BPBookPage, BookPage)
+    ]
 
 The spider is now ready to extract books from both sites 😀.
 The full example `can be seen here `_
 
-On a surface, it looks just like a different way to organize Scrapy spider
+On the surface, it looks just like a different way to organize Scrapy spider
 code - and indeed, it *is* just a different way to organize the code,
 but it opens some cool possibilities.
 
+In the examples above we have been configuring the overrides
+for a particular domain, but more complex URL patterns are also possible.
+For example, the pattern ``books.toscrape.com/catalogue/category/``
+is accepted, and it would restrict the override to category pages only
+(see the standalone pattern-matching sketch further below).
+
+It is even possible to configure more complex patterns by using the
+:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in
+the configuration. Another way of declaring the earlier config
+for ``SCRAPY_POET_OVERRIDES`` would be the following:
+
+.. code-block:: python
+
+    from url_matcher import Patterns
+    from web_poet import OverrideRule
+
+
+    SCRAPY_POET_OVERRIDES = [
+        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
+        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
+        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
+    ]
+
+As you can see, this could get verbose. The earlier tuple config simply offers
+a more concise shortcut.
+
+.. note::
+
+    Also see the `url-matcher `_
+    documentation for more information about the patterns syntax.
+
+Manually defining overrides like this would be inconvenient, especially
+for larger projects. Fortunately, `web-poet`_ provides the
+:py:func:`web_poet.handle_urls` decorator for annotating Page Objects,
+which defines and stores the :py:class:`web_poet.overrides.OverrideRule`
+for you. All of the stored :py:class:`web_poet.overrides.OverrideRule`
+rules can then be read as:
+
+.. code-block:: python
+
+    from web_poet import default_registry, consume_modules
+
+    # consume_modules() must be called first if you need to properly import
+    # rules from other packages. Otherwise, it can be omitted.
+    # More info about this caveat in the web-poet docs.
+    consume_modules("external_package_A", "another_ext_package.lib")
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
+For more info on this, you can refer to these docs:
+
+    * ``scrapy-poet``'s :ref:`overrides` section.
+    * External `web-poet`_ docs.
+
+        * Specifically, the :external:ref:`intro-overrides` Tutorial section.
+
 Next steps
 ==========
 
 Now that you know how ``scrapy-poet`` is supposed to work, what about trying
 to apply it to an existing or new Scrapy project?
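+
+Before wiring a pattern into ``SCRAPY_POET_OVERRIDES``, you can sanity-check
+it against sample URLs by using ``url-matcher`` directly, since this is the
+same machinery the default registry uses under the hood. A minimal sketch
+(the URLs below are illustrative):
+
+.. code-block:: python
+
+    from url_matcher import Patterns, URLMatcher
+
+    matcher = URLMatcher()
+    # Rule 0: restrict a hypothetical override to category pages only.
+    matcher.add_or_update(0, Patterns(["books.toscrape.com/catalogue/category/"]))
+
+    # A category page matches rule 0...
+    assert matcher.match(
+        "http://books.toscrape.com/catalogue/category/books/travel_2/index.html") == 0
+    # ...while a book detail page matches no rule at all.
+    assert matcher.match(
+        "http://books.toscrape.com/catalogue/soumission_998/index.html") is None
+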
-Also, please check :ref:`overrides`, :ref:`providers` and refer to spiders in the "example"
-folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders
+Also, please check the :ref:`overrides` and :ref:`providers` sections, and refer
+to the spiders in the "example" folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders
 
 .. _Scrapy Tutorial: https://docs.scrapy.org/en/latest/intro/tutorial.html
 
diff --git a/docs/overrides.rst b/docs/overrides.rst
index 9e0907d7..3ceb3d39 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -8,6 +8,18 @@ on the request URL domain. Please have a look to :ref:`intro-tutorial`
 to learn the basics about overrides before digging deeper in the content of
 this page.
 
+.. tip::
+
+    Some real-world examples on this topic can be found in:
+
+    - `Example 1 `_:
+      rules using tuples
+    - `Example 2 `_:
+      rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
+    - `Example 3 `_:
+      rules using the :py:func:`web_poet.handle_urls` decorator and retrieving them
+      via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
+
 Page Objects refinement
 =======================
@@ -47,13 +59,11 @@ And then override it for a particular domain using ``settings.py``:
 
 .. code-block:: python
 
-    SCRAPY_POET_OVERRIDES = {
-        "example.com": {
-            BookPage: ISBNBookPage
-        }
-    }
+    SCRAPY_POET_OVERRIDES = [
+        ("example.com", ISBNBookPage, BookPage)
+    ]
 
-This new Page Objects gets the original ``BookPage`` as dependency and enrich
+This new Page Object gets the original ``BookPage`` as a dependency and enriches
 the obtained item with the ISBN from the page HTML.
 
 .. note::
@@ -80,20 +90,118 @@ the obtained item with the ISBN from the page HTML.
 
             return item
 
+Override rules
+==============
+
+The default way of configuring override rules is using triplets
+of the form (``url pattern``, ``override_type``, ``overridden_type``). More
+complex rules can be introduced by using the :py:class:`web_poet.overrides.OverrideRule`
+class. The following example configures an override that is only applied for
+book pages from ``books.toscrape.com``:
+
+.. code-block:: python
+
+    from url_matcher import Patterns
+    from web_poet import OverrideRule
+
+
+    SCRAPY_POET_OVERRIDES = [
+        OverrideRule(
+            for_patterns=Patterns(
+                include=["books.toscrape.com/catalogue/*index.html|"],
+                exclude=["/catalogue/category/"]),
+            use=MyBookPage,
+            instead_of=BookPage
+        )
+    ]
+
+Note how category pages are excluded by using an ``exclude`` pattern.
+You can find more information about the patterns syntax in the
+`url-matcher `_
+documentation.
+
+
+Decorate Page Objects with the rules
+====================================
+
+Keeping the rules along with the Page Objects is a good idea,
+as you can identify at a glance what a Page Object does
+and where it is applied. This can be done by decorating the
+Page Objects with :py:func:`web_poet.handle_urls` provided by `web-poet`_.
+
+.. tip::
+    Make sure to read the :external:ref:`intro-overrides` Tutorial section of
+    `web-poet`_ to learn all of its other functionalities that are not covered
+    in this section.
+
+Let's see an example:
+
+.. code-block:: python
+
+    from web_poet import handle_urls
+
+
+    @handle_urls("toscrape.com", overrides=BookPage)
+    class BTSBookPage(BookPage):
+
+        def to_item(self):
+            return {
+                'url': self.url,
+                'name': self.css("title::text").get(),
+            }
+
+The :py:func:`web_poet.handle_urls` decorator in this case indicates that
+the class ``BTSBookPage`` should be used instead of ``BookPage``
+for the domain ``toscrape.com``.
+
+In order to configure the ``scrapy-poet`` overrides automatically
+using these annotations, you can directly interact with `web-poet`_'s
+``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`).
+
+For example:
+
+.. code-block:: python
+
+    from web_poet import default_registry, consume_modules
+
+    # consume_modules() must be called first if you need to properly import
+    # rules from other packages. Otherwise, it can be omitted.
+    # More info about this caveat in the web-poet docs.
+    consume_modules("external_package_A", "another_ext_package.lib")
+
+    # To get all of the Override Rules that were declared via annotations.
+    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
+The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the
+``default_registry`` above returns a ``List[OverrideRule]`` with the rules declared
+using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much
+more convenient than manually defining all of the :py:class:`web_poet.overrides.OverrideRule`.
+
+Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
+``List[OverrideRule]``, you can easily modify it later on if needed.
+
+.. note::
+
+    For more info and advanced features of `web-poet`_'s :py:func:`web_poet.handle_urls`
+    and its registry, see the `web-poet `_
+    documentation, specifically its :external:ref:`intro-overrides` tutorial
+    section.
+
+
 Overrides registry
 ==================
 
 The overrides registry is responsible for informing whether there exists an
-override for a particular type for a given response. The default overrides
-registry keeps a map of overrides for each domain and read this configuration
-from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial`
+override for a particular type for a given request. The default overrides
+registry allows configuring these rules using patterns that follow the
+`url-matcher `_ syntax. These rules can be configured using the
+``SCRAPY_POET_OVERRIDES`` setting, as seen in the :ref:`intro-tutorial`
 example.
 
 But the registry implementation can be changed at convenience. A different
 registry implementation can be configured using the property
 ``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry
-must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase``
-and must implement the method ``overrides_for``. As other Scrapy components,
-it can be initialized from the ``from_crawler`` class method if implemented.
-This might be handy to be able to access settings, stats, request meta, etc.
-
+must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase` and
+must implement the method :meth:`scrapy_poet.overrides.OverridesRegistryBase.overrides_for`.
+As with other Scrapy components, it can be initialized from the ``from_crawler``
+class method if implemented. This might be handy for accessing settings, stats,
+request meta, etc.
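+
+As a rough illustration of that contract, here is a minimal sketch of a custom
+registry (``request.meta["overrides"]`` is a made-up convention for this
+example, not a key that ``scrapy-poet`` itself knows about):
+
+.. code-block:: python
+
+    from typing import Callable, Mapping
+
+    from scrapy import Request
+
+    from scrapy_poet.overrides import OverridesRegistry
+
+
+    class MetaOverridesRegistry(OverridesRegistry):
+        """Extends the default registry with per-request rules taken from
+        request.meta["overrides"] (a hypothetical key used only here)."""
+
+        def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
+            overrides = dict(super().overrides_for(request))
+            overrides.update(request.meta.get("overrides", {}))
+            return overrides
+
+It could then be enabled in ``settings.py`` with
+``SCRAPY_POET_OVERRIDES_REGISTRY = "myproject.registries.MetaOverridesRegistry"``
+(a hypothetical path).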
diff --git a/docs/settings.rst b/docs/settings.rst
index c13a9580..2dbdec30 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -25,7 +25,7 @@ Default: ``None``
 
 Mapping of overrides for each domain. The format of this mapping depends
 on the currently set Registry. The default is currently
-:class:`~.PerDomainOverridesRegistry`. This can be overriden by the setting below:
+:class:`~.OverridesRegistry`. This can be overridden by the setting below:
 ``SCRAPY_POET_OVERRIDES_REGISTRY``.
 
 There are sections dedicated to this at :ref:`intro-tutorial` and :ref:`overrides`.
 
@@ -36,7 +36,7 @@ SCRAPY_POET_OVERRIDES_REGISTRY
 
 Default: ``None``
 
-Sets an alternative Registry to replace the default :class:`~.PerDomainOverridesRegistry`.
+Sets an alternative Registry to replace the default :class:`~.OverridesRegistry`.
 To use this, set a ``str`` which denotes the absolute object path of the new
 Registry.
 
diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py
index 266f019d..ab266c08 100644
--- a/example/example/spiders/books_04_overrides_01.py
+++ b/example/example/spiders/books_04_overrides_01.py
@@ -28,7 +28,7 @@ def to_item(self):
 class BPBookListPage(WebPage):
     """Logic to extract listings from pages like https://bookpage.com/reviews"""
     def book_urls(self):
-        return self.css('.article-info a::attr(href)').getall()
+        return self.css('article.post h4 a::attr(href)').getall()
 
 
 class BPBookPage(ItemWebPage):
@@ -36,7 +36,7 @@ class BPBookPage(ItemWebPage):
     def to_item(self):
         return {
             'url': self.url,
-            'name': self.css(".book-data h4::text").get().strip(),
+            'name': self.css("body div > h1::text").get().strip(),
         }
 
 
@@ -45,12 +45,10 @@ class BooksSpider(scrapy.Spider):
     start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
     # Configuring different page objects for the bookpage.com domain
     custom_settings = {
-        "SCRAPY_POET_OVERRIDES": {
-            "bookpage.com": {
-                BookListPage: BPBookListPage,
-                BookPage: BPBookPage
-            }
-        }
+        "SCRAPY_POET_OVERRIDES": [
+            ("bookpage.com", BPBookListPage, BookListPage),
+            ("bookpage.com", BPBookPage, BookPage)
+        ]
     }
 
     def parse(self, response, page: BookListPage):
diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py
index 9e6e8c2a..b4c366a7 100644
--- a/example/example/spiders/books_04_overrides_02.py
+++ b/example/example/spiders/books_04_overrides_02.py
@@ -8,6 +8,9 @@
 """
 import scrapy
 from web_poet import ItemWebPage, WebPage
+from web_poet.overrides import OverrideRule
+from url_matcher import Patterns
+
 from scrapy_poet import callback_for
 
 
@@ -41,7 +44,7 @@ def to_item(self):
 class BPBookListPage(BookListPage):
     """Logic to extract listings from pages like https://bookpage.com/reviews"""
     def book_urls(self):
-        return self.css('.article-info a::attr(href)').getall()
+        return self.css('article.post h4 a::attr(href)').getall()
 
 
 class BPBookPage(BookPage):
@@ -49,7 +52,7 @@ class BPBookPage(BookPage):
     def to_item(self):
         return {
             'url': self.url,
-            'name': self.css(".book-data h4::text").get().strip(),
+            'name': self.css("body div > h1::text").get().strip(),
         }
 
 
@@ -58,16 +61,14 @@ class BooksSpider(scrapy.Spider):
     start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
     # Configuring different page objects for different domains
     custom_settings = {
-        "SCRAPY_POET_OVERRIDES": {
-            "toscrape.com": {
-                BookListPage: BTSBookListPage,
-                BookPage: BTSBookPage
-            },
-            "bookpage.com": {
-                BookListPage: BPBookListPage,
-                BookPage: BPBookPage
-            },
-        }
+        "SCRAPY_POET_OVERRIDES": [
+            ("toscrape.com", BTSBookListPage, BookListPage),
+            ("toscrape.com", BTSBookPage, BookPage),
+
+            # We could also use the long-form version if we want to.
+            OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+            OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
+        ]
     }
 
     def parse(self, response, page: BookListPage):
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
new file mode 100644
index 00000000..f25fff07
--- /dev/null
+++ b/example/example/spiders/books_04_overrides_03.py
@@ -0,0 +1,76 @@
+"""
+Scrapy spider which uses Page Objects both for crawling and extraction,
+and uses overrides to support two different sites without changing
+the crawling logic (the spider is exactly the same).
+
+No configured default logic: if used for an unregistered domain, no logic
+at all is applied.
+
+This example is quite similar to books_04_overrides_02.py; the only
+difference is that it uses the ``@handle_urls`` decorator to
+store the rules in web-poet's registry.
+"""
+import scrapy
+from web_poet import ItemWebPage, WebPage, handle_urls, default_registry
+
+from scrapy_poet import callback_for
+
+
+class BookListPage(WebPage):
+
+    def book_urls(self):
+        return []
+
+
+class BookPage(ItemWebPage):
+
+    def to_item(self):
+        return None
+
+
+@handle_urls("toscrape.com", overrides=BookListPage)
+class BTSBookListPage(BookListPage):
+    """Logic to extract listings from pages like https://books.toscrape.com"""
+    def book_urls(self):
+        return self.css('.image_container a::attr(href)').getall()
+
+
+@handle_urls("toscrape.com", overrides=BookPage)
+class BTSBookPage(BookPage):
+    """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("title::text").get(),
+        }
+
+
+@handle_urls("bookpage.com", overrides=BookListPage)
+class BPBookListPage(BookListPage):
+    """Logic to extract listings from pages like https://bookpage.com/reviews"""
+    def book_urls(self):
+        return self.css('article.post h4 a::attr(href)').getall()
+
+
+@handle_urls("bookpage.com", overrides=BookPage)
+class BPBookPage(BookPage):
+    """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
+    def to_item(self):
+        return {
+            'url': self.url,
+            'name': self.css("body div > h1::text").get().strip(),
+        }
+
+
+class BooksSpider(scrapy.Spider):
+    name = 'books_04_overrides_03'
+    start_urls = ['http://books.toscrape.com/', 'https://bookpage.com/reviews']
+    # Configuring different page objects for different domains
+    custom_settings = {
+        "SCRAPY_POET_OVERRIDES": default_registry.get_overrides()
+    }
+
+    def parse(self, response, page: BookListPage):
+        yield from response.follow_all(page.book_urls(), callback_for(BookPage))
diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py
index 07b4ee5e..d1a9ef47 100644
--- a/scrapy_poet/cache.py
+++ b/scrapy_poet/cache.py
@@ -54,14 +54,14 @@ def decode(self, obj: Any) -> Any:
         return pickle.loads(data)
 
     def __str__(self) -> str:
-        return (
+        return (  # pragma: no cover
             f"SqlitedictCache <{self.db.filename} | "
             f"compressed: {self.compressed} | "
             f"{len(self.db)} records>"
         )
 
     def __repr__(self) -> str:
-        return f"SqlitedictCache({self.path!r}, compressed={self.compressed})"
+        return f"SqlitedictCache({self.path!r}, compressed={self.compressed})"  # pragma: no cover
 
     def __getitem__(self, fingerprint: str) -> Any:
         return self.db[fingerprint]
 
diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
index 79a585b8..034ee23a 100644
--- a/scrapy_poet/injection.py
+++ b/scrapy_poet/injection.py
@@ -15,14 +15,14 @@
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.conf import build_component_list
 from scrapy.utils.defer import maybeDeferred_coro
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, create_instance
 
 from scrapy_poet.cache import SqlitedictCache
 from scrapy_poet.injection_errors import (UndeclaredProvidedTypeError,
                                           NonCallableProviderError,
                                           InjectionError)
 from scrapy_poet.overrides import OverridesRegistryBase, \
-    PerDomainOverridesRegistry
+    OverridesRegistry
 from scrapy_poet.page_input_providers import PageObjectInputProvider
 from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse
 from web_poet.pages import is_injectable
 
@@ -43,7 +43,7 @@ def __init__(self,
                  overrides_registry: Optional[OverridesRegistryBase] = None):
         self.crawler = crawler
         self.spider = crawler.spider
-        self.overrides_registry = overrides_registry or PerDomainOverridesRegistry()
+        self.overrides_registry = overrides_registry or OverridesRegistry()
         self.load_providers(default_providers)
         self.init_cache()
 
@@ -352,6 +352,8 @@ class MySpider(Spider):
     spider = MySpider()
     spider.settings = settings
     crawler.spider = spider
+    if not overrides_registry:
+        overrides_registry = create_instance(OverridesRegistry, settings, crawler)
     return Injector(crawler, overrides_registry=overrides_registry)
 
diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py
index afc631f1..9b9978c8 100644
--- a/scrapy_poet/middleware.py
+++ b/scrapy_poet/middleware.py
@@ -11,8 +11,9 @@
 from twisted.internet.defer import inlineCallbacks
 from scrapy.utils.misc import create_instance, load_object
 
+
 from .api import DummyResponse
-from .overrides import PerDomainOverridesRegistry
+from .overrides import OverridesRegistry
 from .page_input_providers import HttpResponseProvider
 from .injection import Injector
 
@@ -38,7 +39,7 @@ def __init__(self, crawler: Crawler) -> None:
         self.crawler = crawler
         settings = self.crawler.settings
         registry_cls = load_object(settings.get("SCRAPY_POET_OVERRIDES_REGISTRY",
-                                                PerDomainOverridesRegistry))
+                                                OverridesRegistry))
         self.overrides_registry = create_instance(registry_cls, settings, crawler)
 
         self.injector = Injector(crawler,
                                  default_providers=DEFAULT_PROVIDERS,
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index dc3b9cf8..a5e330d3 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -1,13 +1,22 @@
+import logging
 from abc import ABC, abstractmethod
-from typing import Dict, Mapping, Callable
+from collections import defaultdict
+from typing import Dict, Mapping, Callable, Iterable, Union, Tuple, Optional, List
 
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet.utils import get_domain
+from url_matcher import Patterns, URLMatcher
+from url_matcher.util import get_domain
+from web_poet.overrides import OverrideRule
 
-class OverridesRegistryBase(ABC):
+logger = logging.getLogger(__name__)
+
+RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
+RuleFromUser = Union[RuleAsTuple, OverrideRule]
+
+
+class OverridesRegistryBase(ABC):
     @abstractmethod
     def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         """
@@ -18,27 +27,97 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
         pass
 
 
-class PerDomainOverridesRegistry(Dict[str, Dict[Callable, Callable]], OverridesRegistryBase):
+class OverridesRegistry(OverridesRegistryBase):
     """
-    Simple dictionary based registry that reads the overrides
-    from the option ``SCRAPY_POET_OVERRIDES`` in the spider settings
+    Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES``
+    setting in the spider settings. It is a list, and each rule can be a tuple or
+    an instance of the class :py:class:`web_poet.overrides.OverrideRule`.
+
+    If a tuple is provided:
+
+    - the **first** element is the pattern to match the URL,
+    - the **second** element is the type to be used instead of the type in
+      the **third** element.
+
+    Another way to see it: for the URLs that match the pattern ``tuple[0]``, use
+    ``tuple[1]`` instead of ``tuple[2]``.
 
     Example of overrides configuration:
 
     .. code-block:: python
 
-        SCRAPY_POET_OVERRIDES = {
-            "example.com": {
-                BookPage: ISBNBookPage
-            }
-        }
+        from url_matcher import Patterns
+        from web_poet import OverrideRule
+
+
+        SCRAPY_POET_OVERRIDES = [
+            # Option 1
+            ("books.toscrape.com", ISBNBookPage, BookPage),
+
+            # Option 2
+            OverrideRule(
+                for_patterns=Patterns(["books.toscrape.com"]),
+                use=MyBookListPage,
+                instead_of=BookListPage,
+            ),
+        ]
+
+    .. _web-poet: https://web-poet.readthedocs.io
+
+    Now, if you've used web-poet_'s built-in functionality to directly create
+    the :py:class:`web_poet.overrides.OverrideRule` in the Page Object via the
+    :py:func:`web_poet.handle_urls` annotation, you can quickly import them via
+    the code below. It finds all the rules registered into
+    ``web_poet.default_registry`` (an instance of
+    :py:class:`web_poet.overrides.PageObjectRegistry`) by web-poet_'s
+    :py:func:`web_poet.handle_urls` decorator:
+
+    .. code-block:: python
+
+        from web_poet import default_registry, consume_modules
+
+        # consume_modules() must be called first if you need to properly
+        # import rules from other packages. Otherwise, it can be omitted.
+        # More info about this caveat in the web-poet docs.
+        consume_modules("external_package_A.po", "another_ext_package.lib")
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+
+    Make sure to call :py:func:`web_poet.overrides.consume_modules` beforehand.
+    More info on this at web-poet_.
     """
 
     @classmethod
-    def from_crawler(cls, crawler: Crawler):
-        return cls(crawler.settings.getdict("SCRAPY_POET_OVERRIDES", {}))
+    def from_crawler(cls, crawler: Crawler) -> "OverridesRegistry":
+        return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", []))
 
-    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
-        return self.get(get_domain(request.url), {})
+    def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None:
+        self.rules: List[OverrideRule] = []
+        self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher)
+        for rule in rules or []:
+            self.add_rule(rule)
+        logger.debug(f"List of parsed OverrideRules:\n{self.rules}")
 
+    def add_rule(self, rule: RuleFromUser) -> None:
+        if isinstance(rule, (tuple, list)):
+            if len(rule) != 3:
+                raise ValueError(
+                    f"Invalid overrides rule: {rule}. Rules as tuples must have "
+                    f"3 elements: (1) the pattern, (2) the PO class used as a "
+                    f"replacement and (3) the PO class to be replaced."
+                )
+            pattern, use, instead_of = rule
+            rule = OverrideRule(
+                for_patterns=Patterns([pattern]), use=use, instead_of=instead_of
+            )
+        self.rules.append(rule)
+        self.matcher[rule.instead_of].add_or_update(
+            len(self.rules) - 1, rule.for_patterns
+        )
+
+    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
+        overrides: Dict[Callable, Callable] = {}
+        for instead_of, matcher in self.matcher.items():
+            rule_id = matcher.match(request.url)
+            if rule_id is not None:
+                overrides[instead_of] = self.rules[rule_id].use
+        return overrides
diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils.py
index 8cdcb6f0..80a7d715 100644
--- a/scrapy_poet/utils.py
+++ b/scrapy_poet/utils.py
@@ -1,25 +1,11 @@
 import os
 
 from scrapy.utils.project import project_data_dir, inside_project
-from tldextract import tldextract
-
-
-def get_domain(url):
-    """
-    Return the domain without any subdomain
-
-    >>> get_domain("http://blog.example.com")
-    'example.com'
-    >>> get_domain("http://www.example.com")
-    'example.com'
-    >>> get_domain("http://deeper.blog.example.co.uk")
-    'example.co.uk'
-    """
-    return ".".join(tldextract.extract(url)[-2:])
 
 
 def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
     """Return a path to a folder where Scrapy is storing data.
+
     Usually that's a .scrapy folder inside the project.
     """
     # This code is extracted from scrapy.utils.project.data_path function,
diff --git a/setup.py b/setup.py
index 6cc9740a..39e5a000 100755
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,7 @@
         'andi >= 0.4.1',
         'attrs',
         'parsel',
+        'url-matcher',
         'web-poet @ git+https://git@github.com/scrapinghub/web-poet@master#egg=web-poet',
         'tldextract',
         'sqlitedict',
diff --git a/tests/conftest.py b/tests/conftest.py
index 78538fc2..209ac514 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,6 @@
 import pytest
 from scrapy.settings import Settings
 
-from scrapy_poet.page_input_providers import HttpResponseProvider
-
 
 @pytest.fixture()
 def settings(request):
diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py
new file mode 100644
index 00000000..287bd7ea
--- /dev/null
+++ b/tests/po_lib/__init__.py
@@ -0,0 +1,27 @@
+"""
+This package is just for overrides testing purposes.
+""" +import socket +from typing import Dict, Any, Callable + +from url_matcher import Patterns +from url_matcher.util import get_domain +from web_poet import handle_urls, ItemWebPage + +from tests.mockserver import get_ephemeral_port + + +# Need to define it here since it's always changing +DOMAIN = get_domain(socket.gethostbyname(socket.gethostname())) +PORT = get_ephemeral_port() + + +class POOverriden(ItemWebPage): + def to_item(self): + return {"msg": "PO that will be replace"} + + +@handle_urls(f"{DOMAIN}:{PORT}", overrides=POOverriden) +class POIntegration(ItemWebPage): + def to_item(self): + return {"msg": "PO replacement"} diff --git a/tests/test_injection.py b/tests/test_injection.py index 81f0aabf..99393e52 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -8,7 +8,9 @@ import parsel from scrapy import Request from scrapy.http import Response -from scrapy_poet.utils import get_domain +from url_matcher import Patterns + +from url_matcher.util import get_domain from scrapy_poet import CacheDataProviderMixin, HttpResponseProvider, PageObjectInputProvider, \ DummyResponse @@ -16,9 +18,10 @@ get_injector_for_testing, get_response_for_testing from scrapy_poet.injection_errors import NonCallableProviderError, \ InjectionError, UndeclaredProvidedTypeError -from scrapy_poet.overrides import PerDomainOverridesRegistry +from scrapy_poet.overrides import OverridesRegistry from web_poet import Injectable, ItemPage from web_poet.mixins import ResponseShortcutsMixin +from web_poet.overrides import OverrideRule def get_provider(classes, content=None): @@ -306,13 +309,11 @@ def test_overrides(self, providers, override_should_happen): domain = "example.com" if override_should_happen else "other-example.com" # The request domain is example.com, so overrides shouldn't be applied # when we configure them for domain other-example.com - overrides = { - domain: { - PricePO: PriceInDollarsPO, - EurDollarRate: OtherEurDollarRate - } - } - registry = PerDomainOverridesRegistry(overrides) + overrides = [ + (domain, PriceInDollarsPO, PricePO), + OverrideRule(Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate) + ] + registry = OverridesRegistry(overrides) injector = get_injector_for_testing(providers, overrides_registry=registry) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index f4020af0..ac4ea36a 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -15,13 +15,16 @@ import attr from scrapy_poet import callback_for +from url_matcher.util import get_domain + +from tests.mockserver import get_ephemeral_port from scrapy_poet import InjectionMiddleware -from scrapy_poet.utils import get_domain from web_poet.pages import WebPage, ItemPage, ItemWebPage from scrapy_poet.cache import SqlitedictCache from scrapy_poet.page_input_providers import ( PageObjectInputProvider ) +from web_poet import default_registry from web_poet.page_inputs import HttpResponse from scrapy_poet import DummyResponse from tests.utils import (HtmlResource, @@ -107,10 +110,12 @@ def test_basic_case(settings): def test_overrides(settings): host = socket.gethostbyname(socket.gethostname()) domain = get_domain(host) - settings["SCRAPY_POET_OVERRIDES"] = { - domain: {BreadcrumbsExtraction: OverridenBreadcrumbsExtraction}} + port = get_ephemeral_port() + settings["SCRAPY_POET_OVERRIDES"] = [ + (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction) + ] item, url, _ = yield crawl_single_item(spider_for(ProductPage), - ProductHtml, settings) + ProductHtml, 
+                                           ProductHtml, settings, port=port)
     assert item == {
         'url': url,
         'name': 'Chocolate',
@@ -346,3 +351,29 @@ def get_middleware(settings):
         mock.call('/tmp/cache', compressed=True),
         mock.call().close()
     ]
+
+
+@inlineCallbacks
+def test_web_poet_integration(settings):
+    """This tests scrapy-poet's integration with web-poet, especially when
+    populating override settings via:
+
+        from web_poet import default_registry
+
+        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    """
+
+    # Only import them in this test scope since they need to be synced with
+    # the URL of the Page Object annotated with @handle_urls.
+    from tests.po_lib import DOMAIN, PORT, POOverriden
+
+    # Override rules are defined in `tests/po_lib/__init__.py`.
+    rules = default_registry.get_overrides()
+
+    # Converting it to a set removes potential duplicate OverrideRules.
+    settings["SCRAPY_POET_OVERRIDES"] = set(rules)
+
+    item, url, _ = yield crawl_single_item(
+        spider_for(POOverriden), ProductHtml, settings, port=PORT
+    )
+    assert item == {"msg": "PO replacement"}
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..05e55542
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,21 @@
+from unittest import mock
+from pathlib import PosixPath
+
+from scrapy_poet.utils import get_scrapy_data_path
+
+
+@mock.patch("scrapy_poet.utils.os.makedirs")
+@mock.patch("scrapy_poet.utils.inside_project")
+def test_get_scrapy_data_path(mock_inside_project, mock_makedirs, tmp_path):
+    mock_inside_project.return_value = False
+
+    path = tmp_path / "test_dir"
+    result = get_scrapy_data_path(createdir=True, default_dir=path)
+
+    assert isinstance(result, PosixPath)
+    assert str(result)  # should be non-empty
+
+    mock_inside_project.assert_called_once()
+
+    mock_makedirs.assert_called_once()
+    mock_makedirs.assert_called_with(path, exist_ok=True)
diff --git a/tests/utils.py b/tests/utils.py
index 7dd46b7d..55b26f5b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -24,26 +24,26 @@ def render_GET(self, request):
 
 
 @inlineCallbacks
-def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None):
+def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None, port=None):
     """Use spider_cls to crawl resource_cls. URL of the resource is passed
     to the spider as ``url`` argument.
     Return ``(items, resource_url, crawler)`` tuple.
     """
     spider_kwargs = {} if spider_kwargs is None else spider_kwargs
     crawler = make_crawler(spider_cls, settings)
-    with MockServer(resource_cls) as s:
+    with MockServer(resource_cls, port=port) as s:
         root_url = s.root_url
         yield crawler.crawl(url=root_url, **spider_kwargs)
     return crawler.spider.collected_items, s.root_url, crawler
 
 
 @inlineCallbacks
-def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None):
+def crawl_single_item(spider_cls, resource_cls, settings, spider_kwargs=None, port=None):
     """Run a spider where a single item is expected. Use in combination
     with ``capture_exceptions`` and ``CollectorPipeline``
     """
     items, url, crawler = yield crawl_items(spider_cls, resource_cls, settings,
-                                            spider_kwargs=spider_kwargs)
+                                            spider_kwargs=spider_kwargs, port=port)
     assert len(items) == 1
     resp = items[0]
     if 'exception' in resp: