diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f6c97ef9..1e97b227 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,8 @@ TBR * Using the said additional requests needs ``async/await`` support in ``asyncio``. This raises the minimum scrapy requirement to ``scrapy>=2.6.0``. +* Add ``async`` support for ``callback_for``. + 0.3.0 (2022-01-28) ------------------ diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index 7fea1ac5..d592d19c 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -15,6 +15,7 @@ These are mainly achieved by **scrapy-poet** implementing **providers** for them * :class:`scrapy_poet.page_input_providers.HttpClientProvider` * :class:`scrapy_poet.page_input_providers.MetaProvider` +.. _`intro-additional-requests`: Additional Requests =================== diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 9ee1fb08..6881fdac 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -198,6 +198,30 @@ returning the result of the ``to_item`` method call. We could use ``response.follow_all(links, callback_for(BookPage))``, without creating an attribute, but currently it won't work with Scrapy disk queues. +.. tip:: + + :func:`~.callback_for` also supports `async generators`. So having the + following: + + .. code-block:: python + + class BookPage(web_poet.ItemWebPage): + async def to_item(self): + return await do_something_async() + + callback_for(BookPage) + + would result in: + + .. code-block:: python + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + + This is useful when the Page Object uses additional requests, which rely + heavily on ``async/await`` syntax. More info on this in this tutorial + section: :ref:`intro-additional-requests`. 
+ Final result ============ diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 53454809..bb0da171 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -1,4 +1,5 @@ from typing import Callable, Optional, Type +from inspect import iscoroutinefunction from scrapy.http import Request, Response @@ -67,6 +68,25 @@ def parse(self, response): parse_book = callback_for(BookPage) + This also produces an async generator callable if the Page Object's + ``to_item()`` method is a coroutine which uses the ``async/await`` syntax. + So having the following: + + .. code-block:: python + + class BookPage(web_poet.ItemWebPage): + async def to_item(self): + return await do_something_async() + + callback_for(BookPage) + + would result in: + + .. code-block:: python + + async def parse_book(self, response: DummyResponse, page: BookPage): + yield await page.to_item() + The generated callback could be used as a spider instance method or passed as an inline/anonymous argument. Make sure to define it as a spider attribute (as shown in the example above) if you're planning to use @@ -90,5 +110,12 @@ def parse(self, response): def parse(*args, page: page_cls, **kwargs): # type: ignore yield page.to_item() # type: ignore + async def async_parse(*args, page: page_cls, **kwargs): # type: ignore + yield await page.to_item() # type: ignore + + if iscoroutinefunction(page_cls.to_item): + setattr(async_parse, _CALLBACK_FOR_MARKER, True) + return async_parse + setattr(parse, _CALLBACK_FOR_MARKER, True) return parse diff --git a/scrapy_poet/backend.py b/scrapy_poet/backend.py index 5dba5808..c72c0ec0 100644 --- a/scrapy_poet/backend.py +++ b/scrapy_poet/backend.py @@ -7,7 +7,6 @@ from scrapy_poet.utils import scrapy_response_to_http_response - logger = logging.getLogger(__name__) diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 7a830712..3002de71 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -14,6 +14,11 @@ class 
FakeItemPage(ItemPage): def to_item(self): return 'fake item page' +class FakeItemPageAsync(ItemPage): + + async def to_item(self): + return 'fake item page' + class FakeItemWebPage(ItemWebPage): @@ -28,6 +33,12 @@ class MySpider(scrapy.Spider): parse_web = callback_for(FakeItemWebPage) +class MySpiderAsync(scrapy.Spider): + + name = 'my_spider_async' + parse_item = callback_for(FakeItemPageAsync) + + def test_callback_for(): """Simple test case to ensure it works as expected.""" cb = callback_for(FakeItemPage) @@ -39,6 +50,20 @@ def test_callback_for(): assert list(result) == ['fake item page'] +@pytest.mark.asyncio +async def test_callback_for_async(): + cb = callback_for(FakeItemPageAsync) + assert callable(cb) + + fake_page = FakeItemPageAsync() + response = DummyResponse('http://example.com/') + result = cb(response=response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() + + def test_callback_for_instance_method(): spider = MySpider() response = DummyResponse('http://example.com/') @@ -47,12 +72,16 @@ def test_callback_for_instance_method(): assert list(result) == ['fake item page'] -def test_callback_for_inline(): - callback = callback_for(FakeItemPage) +@pytest.mark.asyncio +async def test_callback_for_instance_method_async(): + spider = MySpiderAsync() response = DummyResponse('http://example.com/') - fake_page = FakeItemPage() - result = callback(response, page=fake_page) - assert list(result) == ['fake item page'] + fake_page = FakeItemPageAsync() + result = spider.parse_item(response, page=fake_page) + + assert await result.__anext__() == 'fake item page' + with pytest.raises(StopAsyncIteration): + assert await result.__anext__() def test_default_callback(): @@ -93,6 +122,18 @@ def test_inline_callback(): assert str(exc.value) == msg +def test_inline_callback_async(): + """Sample request with inline callback using async callback_for.""" + spider = 
MySpiderAsync() + cb = callback_for(FakeItemPageAsync) + request = scrapy.Request('http://example.com/', callback=cb) + with pytest.raises(ValueError) as exc: + request_to_dict(request, spider) + + msg = f'Function {cb} is not an instance method in: {spider}' + assert str(exc.value) == msg + + def test_invalid_subclass(): """Classes should inherit from ItemPage."""