diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fb3152eb..abed64be 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,13 @@ Changelog ========= +TBR +------------------ + +* More powerful overrides configuration by the introduction of + the ``HierarchicalOverridesRegistry``. The old + ``PerDomainOverridesRegistry`` is no longer the default. + 0.2.1 (2021-06-11) ------------------ diff --git a/docs/overrides.rst b/docs/overrides.rst index 5d115757..e17e17cf 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -78,20 +78,36 @@ the obtained item with the ISBN from the page HTML. item['isbn'] = self.css(".isbn-class::text").get() return item +Overrides rules +--------------- + +The previous example showed how to configure the overrides for a particular +domain. This is by far the most common case, but sometimes this is not +enough: in some cases you may need different overrides for some subdomains +(e.g. ``uk.somesite.com`` and ``us.somesite.com``); in other cases +you may want to have specific overrides for a subsection of a site +(e.g. ``somesite.com`` and ``somesite.com/deals``). This is entirely possible. +In fact, the examples presented above are already valid keys to be used +in the setting dictionary ``SCRAPY_POET_OVERRIDES``. + +There is more information about how to configure ``SCRAPY_POET_OVERRIDES`` +and the supported rules in the :class:`scrapy_poet.overrides.HierarchicalOverridesRegistry` +documentation. + Overrides registry ================== The overrides registry is responsible for informing whether there exists an override for a particular type for a given response. The default overrides -registry keeps a map of overrides for each domain and read this configuration -from settings ``SCRAPY_POET_OVERRIDES`` as has been seen in the :ref:`intro-tutorial` -example. +registry allows configuring the overriding rules and reads the configuration +from the ``SCRAPY_POET_OVERRIDES`` setting. See :class:`scrapy_poet.overrides.HierarchicalOverridesRegistry` +for more information. 
But the registry implementation can be changed at convenience. A different registry implementation can be configured using the property ``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry -must be a subclass of ``scrapy_poet.overrides.OverridesRegistryBase`` +must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase` and must implement the method ``overrides_for``. As other Scrapy components, it can be initialized from the ``from_crawler`` class method if implemented. This might be handy to be able to access settings, stats, request meta, etc. diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 5ac5e5a2..8ecb3eea 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -18,7 +18,7 @@ NonCallableProviderError, InjectionError) from scrapy_poet.overrides import OverridesRegistryBase, \ - PerDomainOverridesRegistry + HierarchicalOverridesRegistry from scrapy_poet.page_input_providers import PageObjectInputProvider from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse from web_poet.pages import is_injectable @@ -39,7 +39,7 @@ def __init__(self, overrides_registry: Optional[OverridesRegistryBase] = None): self.crawler = crawler self.spider = crawler.spider - self.overrides_registry = overrides_registry or PerDomainOverridesRegistry() + self.overrides_registry = overrides_registry or HierarchicalOverridesRegistry() self.load_providers(default_providers) def load_providers(self, default_providers: Optional[Mapping] = None): diff --git a/scrapy_poet/middleware.py b/scrapy_poet/middleware.py index c2584c62..d347e7e9 100644 --- a/scrapy_poet/middleware.py +++ b/scrapy_poet/middleware.py @@ -11,7 +11,7 @@ from scrapy.utils.misc import create_instance, load_object from . 
class OverridesRecord:
    """Pair a hierarchical URL key with the overrides registered for it."""

    def __init__(self, hierarchical_url: str, overrides: Mapping[Callable, Callable]):
        # Key in the form produced by ``url_hierarchical_str``; the same
        # string is stored in the registry trie.
        self.hierarchical_url = hierarchical_url
        self.overrides = overrides


class HierarchicalOverridesRegistry(OverridesRegistryBase):
    """
    Overrides registry that reads the overrides
    from the ``SCRAPY_POET_OVERRIDES`` setting.

    Example of overrides configuration:

    .. code-block:: python

        SCRAPY_POET_OVERRIDES = {
            "example.com": {
                BookPage: ExampleBookPage,
                BookListPage: ExampleListBookPage,
            }
        }

    The example above configures ``ExampleBookPage``
    and ``ExampleListBookPage`` to be used instead
    of ``BookPage`` and ``BookListPage`` respectively
    for any request to the domain ``example.com``.

    Each set of rules can be configured to override a particular
    domain, subdomain or even a specific path. The following
    table shows some examples of keys and what their effect is.

    .. list-table:: Overrides keys examples
       :widths: auto
       :width: 80%
       :header-rows: 1

       * - Key
         - The overrides apply to
       * - ``"subdomain.example.com"``
         - any request belonging to ``subdomain.example.com`` or any of its
           subdomains
       * - ``"example.com/path_to_content"``
         - any request to the netlocs ``example.com`` or ``www.example.com`` whose
           URL path is ``/path_to_content`` or a child of it
       * - ``""``
         - any request. Useful to set default overrides

    Paths and ports are matched on whole components: the key
    ``"example.com/deals"`` applies to ``/deals`` and ``/deals/today``
    but not to ``/deals2``.

    **The most specific rule is applied** when several rules could be
    applied to the same URL. Imagine, for example, the case where you have rules
    for ``""``, ``"toscrape.com"``, ``"books.toscrape.com"`` and ``"books.toscrape.com/catalogue"``:

    * The rules for ``""`` would be applied for the URL ``http://example.com``
    * The rules for ``"toscrape.com"`` would be applied for the URL ``http://toscrape.com/index.html``
    * The rules for ``"books.toscrape.com"`` would be applied for the URL ``http://books.toscrape.com``
    * The rules for ``"books.toscrape.com/catalogue"`` would be applied for the URL ``http://books.toscrape.com/catalogue/category``

    This is useful as it allows configuring some general overrides for a site
    and also some more specific overrides for some subsections of the site.
    """

    def __init__(self, all_overrides: Optional[Mapping[str, Mapping[Callable, Callable]]] = None) -> None:
        """Build the registry, registering every ``key -> overrides`` pair given."""
        super().__init__()
        # Invariant: ``self.overrides[self.trie[key]]`` is the record for ``key``.
        self.overrides: List[OverridesRecord] = []
        self.trie = Trie()
        for domain_or_more, overrides in (all_overrides or {}).items():
            self.register(domain_or_more, overrides)

    def register(self, domain_or_more: str, overrides: Mapping[Callable, Callable]) -> None:
        """
        Register (or replace) the overrides for a key.

        ``domain_or_more`` is a domain, optionally followed by a port and/or
        a path (e.g. ``"books.toscrape.com/catalogue"``). It is normalized
        with ``url_hierarchical_str``, so a ``ValueError`` raised there
        propagates to the caller.
        """
        hurl = url_hierarchical_str(f"http://{domain_or_more}")
        record = OverridesRecord(hurl, overrides)

        # Update case: the key already has an id, so replace the record in place.
        if hurl in self.trie:
            self.overrides[self.trie[hurl]] = record
            return

        # Insert case. We have to rebuild the trie and then reindex the
        # overrides list based on the ids assigned by the new trie.
        # Note that this is O(N), but register is expected to be executed only
        # at initialization and we expect N to be low enough.
        records = self.overrides + [record]
        self.trie = Trie([rec.hierarchical_url for rec in records])
        self.overrides = sorted(records, key=lambda rec: self.trie[rec.hierarchical_url])

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create the registry from the ``SCRAPY_POET_OVERRIDES`` crawler setting."""
        return cls(crawler.settings.getdict("SCRAPY_POET_OVERRIDES", {}))

    @staticmethod
    def _ends_on_boundary(key: str, hurl: str) -> bool:
        """Tell whether ``key``, a string prefix of ``hurl``, ends on a component boundary."""
        if not key or len(key) == len(hurl):
            # The global key matches everything; an exact match is always valid.
            return True
        if key.endswith(".") and "/" not in key:
            # Host-only key: subdomains, a port or a path may follow.
            return True
        # Key ends inside the port or the path: the next component must start
        # right after it, otherwise "/deals" would capture "/deals2".
        return hurl[len(key)] == "/"

    def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
        """
        Return the overrides of the most specific registered key that
        applies to ``request.url``, or an empty dict if none applies.
        """
        hurl = url_hierarchical_str(request.url)
        candidates = [
            key for key in self.trie.prefixes(hurl)
            if self._ends_on_boundary(key, hurl)
        ]
        if not candidates:
            return {}
        # All candidates are prefixes of the same string, so the longest one
        # is the most specific rule.
        most_specific = max(candidates, key=len)
        return self.overrides[self.trie[most_specific]].overrides
# Matches dotted-quad IPv4 addresses. Regex taken from
# https://www.oreilly.com/library/view/regular-expressions-cookbook/9780596802837/ch07s16.html
_IS_IP_ADDRESS_RE = re.compile(
    r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
)


def url_hierarchical_str(url: str) -> str:
    """
    Return a string that represents the url in a way that its
    components are ordered by their hierarchical importance. That is, the top
    level domain is the most important, so it is the first element in the string.
    Then goes the rest of the levels in the domain, the port and finally the path.

    Can be very useful to verify if a URL is a subpath of the other just
    by checking if one url hierarchical str is the prefix of the other.

    The host is lowercased, a leading ``www`` label and a trailing dot
    (fully-qualified form) are dropped, and IPv4 addresses are kept in their
    natural order. Ports 80 and 443 are omitted whatever the scheme is.
    One trailing slash is removed from the path, and the query and the
    fragment are ignored.

    Raises ``ValueError`` if the scheme is not ``http`` or ``https`` or if
    the port is not a valid integer.

    >>> url_hierarchical_str("http://")
    ''
    >>> url_hierarchical_str("http://example.com:343")
    'com.example.:343'
    >>> url_hierarchical_str("http://example.com:343/")
    'com.example.:343'
    >>> url_hierarchical_str("http://WWW.example.com:343/")
    'com.example.:343'
    >>> url_hierarchical_str("http://www.EXAMPLE.com:343/?id=23")
    'com.example.:343'
    >>> url_hierarchical_str("http://www.example.com:343/page?id=23")
    'com.example.:343/page'
    >>> url_hierarchical_str("http://www.example.com:343/page?id=23;params#fragment")
    'com.example.:343/page'
    >>> url_hierarchical_str("http://example.com./page")
    'com.example./page'
    >>> url_hierarchical_str("http://127.0.0.1:80/page?id=23;params#fragment")
    '127.0.0.1./page'
    >>> url_hierarchical_str("https://127.0.0.1:443/page?id=23;params#fragment")
    '127.0.0.1./page'
    >>> url_hierarchical_str("https://127.0.0.1:333/page?id=23;params#fragment")
    '127.0.0.1.:333/page'
    >>> url_hierarchical_str("http://example.com:333/path/to/something")
    'com.example.:333/path/to/something'
    >>> url_hierarchical_str("mailto://example.com")
    Traceback (most recent call last):
    ...
    ValueError: Unsupported scheme for URL mailto://example.com
    >>> url_hierarchical_str("http://example.com:k34")  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    ValueError: Port could not be cast to integer value as 'k34'
    >>> url_hierarchical_str("/path")
    Traceback (most recent call last):
    ...
    ValueError: Unsupported scheme for URL /path
    """
    parts = urlsplit(url.strip())
    if parts.scheme.lower() not in ("http", "https"):
        raise ValueError(f"Unsupported scheme for URL {url}")

    # "example.com." is the fully-qualified spelling of "example.com"; without
    # stripping the dot the reversed host would start with an empty label.
    host = (parts.hostname or "").lower().rstrip(".")
    # Accessing ``parts.port`` raises ValueError for a non-numeric port.
    port = f":{parts.port}" if parts.port and parts.port not in (80, 443) else ""

    if not _IS_IP_ADDRESS_RE.match(host):
        # Remove www and reverse the domain labels
        labels = host.split(".")
        if labels[0] == "www":
            labels = labels[1:]
        host = ".".join(reversed(labels))
    if host:
        # The trailing dot keeps "com.toscrape." from being a prefix of
        # "com.toscrape2."
        host += "."

    path = parts.path
    if path.endswith("/"):
        path = path[:-1]

    return f"{host}{port}{path}"
class _str(str, Mapping):  # type: ignore
    """
    String that also passes as a ``Mapping``.

    Lets the tests register a readable label (e.g. ``"TOSCRAPE"``) in place
    of a real overrides dict and compare the lookup result against a plain
    string.
    """
    ...


def _r(url: str) -> Request:
    """Build a Scrapy ``Request`` for ``url``."""
    return Request(url)


@pytest.fixture
def reg():
    """Provide an empty ``HierarchicalOverridesRegistry``."""
    return HierarchicalOverridesRegistry()


class TestHierarchicalOverridesRegistry:
    """Lookup and registration behaviour of ``HierarchicalOverridesRegistry``."""

    def test_replace(self, reg):
        """Registering the same key twice replaces the record instead of adding one."""
        reg.register("toscrape.com", _str("ORIGINAL"))
        # A non-default port still resolves to the domain-level key.
        assert reg.overrides_for(_r("http://toscrape.com:442/path")) == "ORIGINAL"
        reg.register("toscrape.com", _str("REPLACED"))
        assert reg.overrides_for(_r("http://www.toscrape.com/path")) == "REPLACED"
        assert len(reg.overrides) == 1
        assert len(reg.trie) == 1

    def test_init_and_global(self):
        """Overrides passed to the constructor are registered; ``""`` acts as fallback."""
        overrides = {
            "": _str("GLOBAL"),
            "toscrape.com": _str("TOSCRAPE")
        }
        reg = HierarchicalOverridesRegistry(overrides)
        assert reg.overrides_for(_r("http://example.com/blabla")) == "GLOBAL"
        assert reg.overrides_for(_r("http://toscrape.com/blabla")) == "TOSCRAPE"

    def test_register(self, reg):
        """Each registration step only affects the URLs under the new key."""
        # Nothing registered yet: no overrides.
        assert reg.overrides_for(_r("http://books.toscrape.com/")) == {}

        # Subdomain key: applies to the subdomain but not to its parent domain.
        reg.register("books.toscrape.com", _str("BOOKS_TO_SCRAPE"))
        assert reg.overrides_for(_r("http://books.toscrape.com/")) == "BOOKS_TO_SCRAPE"
        assert reg.overrides_for(_r("http://books.toscrape.com/path")) == "BOOKS_TO_SCRAPE"
        assert reg.overrides_for(_r("http://toscrape.com/")) == {}

        # Parent domain key: the more specific subdomain key keeps winning.
        reg.register("toscrape.com", _str("TO_SCRAPE"))
        assert reg.overrides_for(_r("http://books.toscrape.com/")) == "BOOKS_TO_SCRAPE"
        assert reg.overrides_for(_r("http://books.toscrape.com/path")) == "BOOKS_TO_SCRAPE"
        assert reg.overrides_for(_r("http://toscrape.com/")) == "TO_SCRAPE"
        assert reg.overrides_for(_r("http://www.toscrape.com/")) == "TO_SCRAPE"
        assert reg.overrides_for(_r("http://toscrape.com/path")) == "TO_SCRAPE"
        assert reg.overrides_for(_r("http://zz.com")) == {}

        # Path key (registered with a trailing slash): applies to that path
        # and below, for both http and https.
        reg.register("books.toscrape.com/category/books/classics_6/", _str("CLASSICS"))
        assert reg.overrides_for(_r("http://books.toscrape.com/path?arg=1")) == "BOOKS_TO_SCRAPE"
        assert reg.overrides_for(_r("http://toscrape.com")) == "TO_SCRAPE"
        assert reg.overrides_for(_r("http://aa.com")) == {}
        assert reg.overrides_for(
            _r("https://books.toscrape.com/category/books/classics_6")) == "CLASSICS"
        assert reg.overrides_for(
            _r("http://books.toscrape.com/category/books/classics_6/path")) == "CLASSICS"
        assert reg.overrides_for(
            _r("http://books.toscrape.com/category/books/")) == "BOOKS_TO_SCRAPE"

    def test_from_crawler(self):
        """``from_crawler`` reads the ``SCRAPY_POET_OVERRIDES`` setting."""
        # Setting absent: the registry is empty.
        crawler = get_crawler(Spider)
        reg = HierarchicalOverridesRegistry.from_crawler(crawler)
        assert len(reg.overrides) == 0

        settings = {
            "SCRAPY_POET_OVERRIDES": {
                "toscrape.com": _str("TOSCRAPE")
            }
        }
        crawler = get_crawler(Spider, settings)
        reg = HierarchicalOverridesRegistry.from_crawler(crawler)
        assert len(reg.overrides) == 1
        assert reg.overrides_for(_r("http://toscrape.com/path")) == "TOSCRAPE"

    def test_domain_subdomain_case(self, reg):
        """A subdomain key wins only for that subdomain and the hosts below it."""
        reg.register("toscrape.com", _str("DOMAIN"))
        reg.register("books.toscrape.com", _str("SUBDOMAIN"))
        assert reg.overrides_for(_r("http://toscrape.com/blabla")) == "DOMAIN"
        assert reg.overrides_for(_r("http://cars.toscrape.com/")) == "DOMAIN"
        # Sibling subdomains that merely share text with "books" fall back
        # to the domain key.
        assert reg.overrides_for(_r("http://books2.toscrape.com:123/blabla")) == "DOMAIN"
        assert reg.overrides_for(_r("https://mybooks.toscrape.com/blabla")) == "DOMAIN"
        assert reg.overrides_for(_r("http://books.toscrape.com/blabla")) == "SUBDOMAIN"
        assert reg.overrides_for(_r("http://www.books.toscrape.com")) == "SUBDOMAIN"
        assert reg.overrides_for(_r("http://uk.books.toscrape.com/blabla")) == "SUBDOMAIN"

    def test_common_prefix_domains(self, reg):
        """Domains whose names share a textual prefix resolve to their own keys."""
        reg.register("toscrape.com", _str("TOSCRAPE"))
        reg.register("toscrape2.com", _str("TOSCRAPE2"))
        assert reg.overrides_for(_r("http://toscrape.com/blabla")) == "TOSCRAPE"
        assert reg.overrides_for(_r("http://toscrape2.com")) == "TOSCRAPE2"


class TestPerDomainOverridesRegistry:
    """Lookup behaviour of ``PerDomainOverridesRegistry``."""

    def test(self):
        """Overrides are looked up by registered domain; subdomains share them."""
        settings = {
            "SCRAPY_POET_OVERRIDES": {
                "toscrape.com": _str("TOSCRAPE")
            }
        }
        crawler = get_crawler(Spider, settings)
        reg = PerDomainOverridesRegistry.from_crawler(crawler)
        assert reg.overrides_for(_r("http://toscrape.com/path")) == "TOSCRAPE"
        assert reg.overrides_for(_r("http://books.toscrape.com/path")) == "TOSCRAPE"
        assert reg.overrides_for(_r("http://toscrape2.com/path")) == {}