Skip to content
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a5a8f42
meta module
ivanprado Nov 29, 2021
ec80b69
CMD for listing overrides
ivanprado Nov 29, 2021
308bd1d
Refactoring with better names and structures and meta inclusion
ivanprado Nov 30, 2021
aa8000d
docstring
ivanprado Nov 30, 2021
a2d5cb6
Fix url_matcher dep
ivanprado Nov 30, 2021
1f1f410
Fix CI tests
ivanprado Nov 30, 2021
bdb8987
Make mypy happy again
ivanprado Nov 30, 2021
a3e3eea
Documentation fixed
ivanprado Nov 30, 2021
ef9945b
Minor changes
ivanprado Nov 30, 2021
f6fdac4
url-matcher has now been released.
ivanprado Dec 1, 2021
b050d01
Merge branch 'master' into handle_urls
BurnzZ Dec 20, 2021
ba52ce0
add entry point for CLI command
BurnzZ Dec 20, 2021
ba61626
fix import which fails tests
BurnzZ Dec 20, 2021
f5cffef
refactor namespace to be classes instead
BurnzZ Dec 20, 2021
c3579b9
fix failing mypy tests after refactoring
BurnzZ Dec 20, 2021
234b8d9
Merge branch 'master' of github.com:scrapinghub/web-poet into handle_…
BurnzZ Dec 21, 2021
531752f
update tests to improve coverage
BurnzZ Dec 21, 2021
7495b58
add missing import for find_page_object_overrides
BurnzZ Dec 21, 2021
0a0ee12
add docs for overrides
BurnzZ Dec 21, 2021
46d40e7
refactor by removing the need for find_page_object_overrides()
BurnzZ Dec 22, 2021
495642b
add docs about using multiple PageObjectRegistries
BurnzZ Dec 23, 2021
75593ed
add docs regarding organizing Page Object Overrides
BurnzZ Jan 4, 2022
0a2d779
update override docs to showcase url-matcher patterns
BurnzZ Jan 6, 2022
c000cbc
rename get_overrides_from_module into get_overrides_from
BurnzZ Jan 6, 2022
10dff5b
fix bug where module substring paths are not filtered out correctly
BurnzZ Jan 6, 2022
daa3ff9
create consume_modules() to properly load annotations in get_overrides()
BurnzZ Jan 6, 2022
3b05c07
update get_overrides_from to accept an arbitrary number of str inputs
BurnzZ Jan 6, 2022
f626efc
add more warning docs to get_overrides() to use consume_modules()
BurnzZ Jan 6, 2022
bd3a88e
enable ease of combining external Page Object packages
BurnzZ Jan 7, 2022
0cbeb0b
refactor get_overrides() to have a simpler interface with consume_mod…
BurnzZ Jan 12, 2022
de5563a
introduce concept of 'registry_pool' to access all PageObjectRegistry…
BurnzZ Jan 13, 2022
e7cca69
implement __hash__() in OverrideRule to easily identify uniqueness
BurnzZ Jan 13, 2022
eab277a
polish documentation with better examples and discussion
BurnzZ Jan 17, 2022
38e56cd
add more tests when PageObjectRegistry is instantiated
BurnzZ Jan 18, 2022
bf0b3e5
update PageObjectRegistry API for manipulating rules from different r…
BurnzZ Jan 20, 2022
d5a5d75
update OverrideRule __hash__() implementation after url-matcher==0.2.…
BurnzZ Feb 1, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ Changelog
TBR
------------------

* ``handle_urls`` decorator and ``find_page_object_overrides`` function added.
* new CLI tool for displaying all available Page Objects: ``web_poet <path>``
* removed support for Python 3.6
* added support for Python 3.10


0.1.1 (2021-06-02)
------------------

Expand Down
10 changes: 10 additions & 0 deletions docs/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,13 @@ Mixins
.. autoclass:: web_poet.mixins.ResponseShortcutsMixin
:members:
:no-special-members:


Overrides
=========

.. autofunction:: web_poet.handle_urls

.. automodule:: web_poet.overrides
:members:
:exclude-members: handle_urls
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,5 @@
# Sphinx intersphinx targets: maps a project key to a
# (documentation base URL, inventory location) pair so that cross-references
# to those projects can be resolved.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None, ),
    'scrapy': ('https://docs.scrapy.org/en/latest', None, ),
    'url-matcher': ('https://url-matcher.readthedocs.io/en/stable/', None, ),
}
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
sphinxcontrib-serializinghtml==1.1.5
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
author='Scrapinghub',
author_email='[email protected]',
url='https://github.com/scrapinghub/web-poet',
entry_points={'console_scripts': ['web_poet = web_poet.__main__:main']},
packages=find_packages(
exclude=(
'tests',
Expand All @@ -22,6 +23,8 @@
install_requires=(
'attrs',
'parsel',
'url-matcher',
'tabulate',
),
classifiers=(
'Development Status :: 2 - Pre-Alpha',
Expand Down
43 changes: 43 additions & 0 deletions tests/po_lib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
This package is just for overrides testing purposes.
"""
from typing import Dict, Any, Callable

from url_matcher import Patterns

from web_poet import handle_urls, PageObjectRegistry


class POBase:
    """Base class for the fixture Page Objects of this test package.

    Each subclass declares the values that the override rule created for it
    is expected to carry, so tests can compare a rule against its own
    Page Object.
    """

    # Class the rule is expected to report as ``instead_of``.
    expected_overrides: Callable
    # URL patterns the rule is expected to report as ``for_patterns``.
    expected_patterns: Patterns
    # Extra metadata the rule is expected to report as ``meta``.
    expected_meta: Dict[str, Any]


class POTopLevelOverriden1:
    """Placeholder used as the ``overrides`` target of ``POTopLevel1``."""
    ...


class POTopLevelOverriden2:
    """Placeholder used as the ``overrides`` target of ``POTopLevel2``."""
    ...


# Additional registry, separate from the one used by the module-level
# ``handle_urls``; fixtures annotate themselves on it through
# ``secondary_registry.handle_urls``.
secondary_registry = PageObjectRegistry(name="secondary")


# This first annotation is ignored. A single annotation per registry is allowed.
# Both decorators below target the same registry; the expected values declared
# in the class body match the lower decorator (the one carrying ``exclude`` and
# ``priority=300``).
@handle_urls("example.com", POTopLevelOverriden1)
@handle_urls("example.com", POTopLevelOverriden1, exclude="/*.jpg|", priority=300)
class POTopLevel1(POBase):
    """Fixture Page Object annotated twice on the same registry."""

    expected_overrides = POTopLevelOverriden1
    expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300)
    expected_meta = {}  # type: ignore


# The second annotation is for a different registry (``secondary_registry``),
# so both annotations are kept. The expected values declared in the class body
# describe the rule created by the plain ``handle_urls`` annotation.
@handle_urls("example.com", POTopLevelOverriden2)
@secondary_registry.handle_urls("example.org", POTopLevelOverriden2)
class POTopLevel2(POBase):
    """Fixture Page Object annotated once on each of two registries."""

    expected_overrides = POTopLevelOverriden2
    expected_patterns = Patterns(["example.com"])
    expected_meta = {}  # type: ignore
16 changes: 16 additions & 0 deletions tests/po_lib/a_module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from url_matcher import Patterns

from tests.po_lib import POBase
from web_poet import handle_urls


class POModuleOverriden:
    """Placeholder used as the ``overrides`` target of ``POModule``."""
    ...


@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo")
class POModule(POBase):
    """Fixture Page Object declared in a plain module of the test package.

    The annotation passes an additional keyword argument (``extra_arg``),
    which is expected to show up in the rule's ``meta``.
    """

    expected_overrides = POModuleOverriden
    expected_patterns = Patterns(["example.com"])
    expected_meta = {"extra_arg": "foo"}  # type: ignore

Empty file added tests/po_lib/an_empty_module.py
Empty file.
Empty file.
15 changes: 15 additions & 0 deletions tests/po_lib/nested_package/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from url_matcher import Patterns

from tests.po_lib import POBase
from web_poet import handle_urls


class PONestedPkgOverriden:
    """Placeholder used as the ``overrides`` target of ``PONestedPkg``."""
    ...


@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedPkgOverriden)
class PONestedPkg(POBase):
    """Fixture Page Object declared in a nested package's ``__init__``.

    Annotated with several include patterns and one exclude pattern, all
    passed as keyword arguments.
    """

    expected_overrides = PONestedPkgOverriden
    expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"])
    expected_meta = {}  # type: ignore
21 changes: 21 additions & 0 deletions tests/po_lib/nested_package/a_nested_module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from url_matcher import Patterns

from tests.po_lib import POBase, secondary_registry
from web_poet import handle_urls


class PONestedModuleOverriden:
    """Placeholder overridden by ``PONestedModule`` via the plain ``handle_urls``."""
    ...


class PONestedModuleOverridenSecondary:
    """Placeholder overridden by ``PONestedModule`` in ``secondary_registry``."""
    ...


# Annotated once per registry; the expected values declared in the class body
# describe the rule created by the plain ``handle_urls`` annotation.
@handle_urls(include=["example.com", "example.org"], exclude=["/*.jpg|"], overrides=PONestedModuleOverriden)
@secondary_registry.handle_urls("example.com", PONestedModuleOverridenSecondary)
class PONestedModule(POBase):
    """Fixture Page Object declared in a module of a nested package."""

    expected_overrides = PONestedModuleOverriden
    expected_patterns = Patterns(include=["example.com", "example.org"], exclude=["/*.jpg|"])
    expected_meta = {}  # type: ignore

69 changes: 69 additions & 0 deletions tests/test_overrides.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pytest
from url_matcher import Patterns

from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2
from tests.po_lib.a_module import POModule
from tests.po_lib.nested_package import PONestedPkg
from tests.po_lib.nested_package.a_nested_module import PONestedModule, PONestedModuleOverridenSecondary
from web_poet.overrides import find_page_object_overrides


# Every fixture Page Object that the package-wide lookup over ``tests.po_lib``
# is expected to return.
POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule}


def test_list_page_objects_from_pkg():
    """Every rule found in the ``po_lib`` package carries the metadata its Page Object declares."""
    found = find_page_object_overrides("tests.po_lib")

    used_page_objects = set()
    for entry in found:
        used_page_objects.add(entry.use)
    assert used_page_objects == POS

    # Each fixture Page Object states what its own rule should look like.
    for entry in found:
        page_object = entry.use
        assert entry.instead_of == page_object.expected_overrides, page_object
        assert entry.for_patterns == page_object.expected_patterns, page_object
        assert entry.meta == page_object.expected_meta, page_object


def test_list_page_objects_from_module():
    """A single module yields exactly the one rule declared inside it."""
    found = find_page_object_overrides("tests.po_lib.a_module")
    assert len(found) == 1

    only_rule = found[0]
    assert only_rule.use == POModule
    assert only_rule.for_patterns == POModule.expected_patterns
    assert only_rule.instead_of == POModule.expected_overrides


def test_list_page_objects_from_empty_module():
    """A module without any annotation yields no rules."""
    target = "tests.po_lib.an_empty_module"
    assert len(find_page_object_overrides(target)) == 0


def test_list_page_objects_from_empty_pkg():
    """A package without any annotation yields no rules."""
    target = "tests.po_lib.an_empty_package"
    assert len(find_page_object_overrides(target)) == 0


def test_list_page_objects_from_unknown_module():
    """Looking up a module that cannot be imported raises ``ImportError``."""
    missing = "tests.po_lib.unknown_module"
    with pytest.raises(ImportError):
        find_page_object_overrides(missing)


def test_list_page_objects_from_imported_registry():
    """Rules annotated on the ``secondary`` registry are listed on their own."""
    found = find_page_object_overrides("tests.po_lib", registry_name="secondary")
    assert len(found) == 2

    # Index the rules by the Page Object they use for direct lookup.
    by_page_object = {}
    for entry in found:
        by_page_object[entry.use] = entry

    top_level_rule = by_page_object[POTopLevel2]
    assert top_level_rule.for_patterns == Patterns(["example.org"])
    assert top_level_rule.instead_of == POTopLevelOverriden2

    nested_rule = by_page_object[PONestedModule]
    assert nested_rule.for_patterns == Patterns(["example.com"])
    assert nested_rule.instead_of == PONestedModuleOverridenSecondary


def test_list_page_objects_from_non_existing_registry():
    """An unknown registry name yields an empty list rather than an error."""
    found = find_page_object_overrides("tests.po_lib", registry_name="not-exist")
    assert found == []


def test_cmd(capsys):
    """Run the CLI entry point on the fixture package and check what it prints.

    Calling ``main`` alone only proves it does not raise; capturing stdout
    also verifies that the table header and at least one Page Object of the
    fixture package are actually rendered.
    """
    # Imported locally so that an import failure of the CLI module is reported
    # against this test only.
    from web_poet.__main__ import main

    main(["tests.po_lib"])

    printed = capsys.readouterr().out
    assert "Use this" in printed
    assert "instead of" in printed
    # NOTE(review): assumes the CLI's default registry is the one the
    # module-level ``handle_urls`` writes to — confirm.
    assert "tests.po_lib.POTopLevel1" in printed
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ commands =
[testenv:mypy]
deps =
mypy
types-tabulate

commands = mypy --ignore-missing-imports web_poet tests

Expand Down
3 changes: 2 additions & 1 deletion web_poet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .pages import WebPage, ItemPage, ItemWebPage, Injectable
from .page_inputs import ResponseData
from .page_inputs import ResponseData
from .overrides import handle_urls, PageObjectRegistry
57 changes: 57 additions & 0 deletions web_poet/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import argparse
from typing import Callable

import tabulate

from web_poet.overrides import find_page_object_overrides


def qualified_name(cls: Callable) -> str:
    """Return the dotted path ``<module>.<qualified name>`` of ``cls``.

    ``__qualname__`` is preferred over ``__name__`` so that classes or
    functions defined inside another class or function keep their enclosing
    scope in the result (``pkg.mod.Outer.Inner`` rather than
    ``pkg.mod.Inner``). For top-level definitions both attributes are equal,
    so the output is unchanged.

    :param cls: class or callable exposing ``__module__`` and a name.
    :return: the module path and the (qualified) name joined by a dot.
    :raises AttributeError: if ``cls`` has neither ``__qualname__`` nor
        ``__name__``, or lacks ``__module__``.
    """
    # Fall back to ``__name__`` for objects that do not define ``__qualname__``.
    name = getattr(cls, "__qualname__", None) or cls.__name__
    return f"{cls.__module__}.{name}"


def main(args=None):
    """Print a table of the Page Object override rules of a package or module.

    :param args: command-line arguments to parse; when ``None``, ``argparse``
        reads them from ``sys.argv``.

    One row is printed per rule found by ``find_page_object_overrides`` for
    the requested module and registry name, below a header row.
    """
    cli = argparse.ArgumentParser(
        description="Tool that list the Page Object overrides from a package or module recursively"
    )
    cli.add_argument(
        "module",
        metavar="PKG_OR_MODULE",
        type=str,
        help="A package or module to list overrides from",
    )
    cli.add_argument(
        "--registry",
        "-n",
        metavar="REGISTRY_NAME",
        type=str,
        help="Registry name to list overrides from",
        default="default",
    )
    options = cli.parse_args(args)

    header = (
        "Use this",
        "instead of",
        "for the URL patterns",
        "except for the patterns",
        "with priority",
        "meta",
    )

    rows = []
    for rule in find_page_object_overrides(options.module, registry_name=options.registry):
        patterns = rule.for_patterns
        rows.append(
            (
                qualified_name(rule.use),
                qualified_name(rule.instead_of),
                patterns.include,
                patterns.exclude,
                patterns.priority,
                rule.meta,
            )
        )

    # The header travels as the first row; ``headers="firstrow"`` tells
    # tabulate to render it as the column titles.
    print(tabulate.tabulate([header, *rows], headers="firstrow"))


# Allow running the tool as ``python -m web_poet``; arguments are then taken
# from the command line because ``main`` is called without any.
if __name__ == "__main__":
    main()
Loading