Commit
Set dont_filter=True on additional requests (#199)
Gallaecio authored Jun 10, 2024
1 parent 045f3bf commit 7c079f0
Showing 2 changed files with 44 additions and 2 deletions.
scrapy_poet/downloader.py (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ async def scrapy_downloader(request: HttpRequest):
             f"one of type: {type(request)!r}."
         )

-    scrapy_request = http_request_to_scrapy_request(request)
+    scrapy_request = http_request_to_scrapy_request(request, dont_filter=True)

     if scrapy_request.method == "HEAD":
         scrapy_request.meta["dont_redirect"] = True
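For context, a minimal sketch of what the added flag does, using plain Scrapy rather than code from this repository (the URL is hypothetical): a request created with dont_filter=True bypasses the scheduler's duplicate filter, and the offsite middleware honors the same flag, which is what the new test below relies on.

from scrapy import Request

# Hypothetical URL, for illustration only.
url = "https://example.com/page"

# A regular request: the scheduler asks its dupefilter whether this
# request's fingerprint was already seen, and drops the request if so.
regular = Request(url)

# With dont_filter=True the request skips the dupefilter (and the offsite
# checks); scrapy_downloader now sets this for every additional request.
additional = Request(url, dont_filter=True)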
tests/test_downloader.py (43 additions, 1 deletion)

@@ -3,6 +3,7 @@
 from functools import partial
 from typing import Any, Callable, List, Optional, Sequence, Set
 from unittest import mock
+from urllib.parse import urlparse

 import attr
 import pytest
@@ -347,7 +348,7 @@ async def parse(self, response, page: ItemPage):


 @inlineCallbacks
-def test_additional_requests_dont_filter() -> None:
+def test_additional_requests_dont_filter_duplicate() -> None:
     """Verify that while duplicate regular requests are filtered out,
     additional requests are not (neither relative to the main requests nor
     relative to each other).
@@ -392,6 +393,47 @@ async def parse(self, response, page: ItemPage):
     assert items == [{"a": "a"}]


+@inlineCallbacks
+def test_additional_requests_dont_filter_offsite() -> None:
+    pytest.importorskip("scrapy.downloadermiddlewares.offsite")
+
+    items = []
+
+    with MockServer(EchoResource) as server:
+
+        @attr.define
+        class ItemPage(WebPage):
+            http: HttpClient
+
+            async def to_item(self):
+                response1 = await self.http.request(
+                    server.root_url,
+                    body=b"a",
+                )
+                # Not filtered out by the offsite middleware because it is an
+                # additional request.
+                response2 = await self.http.request("data:,b")
+                return {response1.body.decode(): response2.body.decode()}
+
+        class TestSpider(Spider):
+            name = "test_spider"
+            allowed_domains = [urlparse(server.root_url).hostname]
+
+            def start_requests(self):
+                yield Request(server.root_url, callback=self.parse)
+                # Filtered out by the offsite middleware:
+                yield Request("data:,", callback=self.parse)
+
+            async def parse(self, response, page: ItemPage):
+                item = await page.to_item()
+                items.append(item)
+
+        crawler = make_crawler(TestSpider)
+        yield crawler.crawl()
+
+    assert items == [{"a": "b"}]
+
+
 @inlineCallbacks
 def test_additional_requests_no_cb_deps() -> None:
     # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
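The offsite test above mirrors what page objects do in practice. As a minimal sketch, assuming web_poet's HttpClient API (ProductPage and the URL are hypothetical, not from this repository), an additional request issued from to_item() is exactly the kind of request that scrapy_downloader now marks with dont_filter=True:

import attr
from web_poet import HttpClient, WebPage


@attr.define
class ProductPage(WebPage):
    http: HttpClient

    async def to_item(self):
        # Before this commit, this additional request could be dropped by
        # Scrapy's dupefilter (if the spider had already crawled the URL) or
        # by the offsite middleware (if the URL is outside allowed_domains).
        response = await self.http.get("https://example.com/extra-data")
        return {"extra": response.body.decode()}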
