
Commit c53d39b

Set dont_filter=True on additional requests

1 parent 045f3bf · commit c53d39b

2 files changed: +42 -2 lines
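In Scrapy, dont_filter=True marks a request to skip the scheduler's duplicate filter, and the offsite middleware likewise lets such requests through. A minimal sketch of the flag in plain Scrapy (the URL is illustrative):

import scrapy

# dont_filter=True: the scheduler's dupefilter and the offsite middleware
# both let this request through, even if an identical request was already
# seen or its host is outside allowed_domains.
request = scrapy.Request("https://example.com", dont_filter=True)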

Diff for: scrapy_poet/downloader.py (+1 -1)

@@ -21,7 +21,7 @@ async def scrapy_downloader(request: HttpRequest):
             f"one of type: {type(request)!r}."
         )
 
-    scrapy_request = http_request_to_scrapy_request(request)
+    scrapy_request = http_request_to_scrapy_request(request, dont_filter=True)
 
     if scrapy_request.method == "HEAD":
         scrapy_request.meta["dont_redirect"] = True
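This one-line change works because http_request_to_scrapy_request forwards extra keyword arguments to scrapy.Request. A rough sketch of such a converter, assuming a signature along these lines (the actual helper in scrapy-poet may differ in detail):

import scrapy
from web_poet import HttpRequest

def http_request_to_scrapy_request(request: HttpRequest, **kwargs) -> scrapy.Request:
    # Assumed shape: copy URL, method, headers, and body over, and pass any
    # extra keyword arguments (such as dont_filter=True) through to
    # scrapy.Request.
    return scrapy.Request(
        url=str(request.url),
        method=request.method,
        headers=dict(request.headers),
        body=request.body,
        **kwargs,
    )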

Diff for: tests/test_downloader.py (+41 -1)

@@ -3,6 +3,7 @@
 from functools import partial
 from typing import Any, Callable, List, Optional, Sequence, Set
 from unittest import mock
+from urllib.parse import urlparse
 
 import attr
 import pytest
@@ -347,7 +348,7 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_dont_filter() -> None:
+def test_additional_requests_dont_filter_duplicate() -> None:
     """Verify that while duplicate regular requests are filtered out,
     additional requests are not (neither relative to the main requests not
     relative to each other).
@@ -392,6 +393,45 @@ async def parse(self, response, page: ItemPage):
     assert items == [{"a": "a"}]
 
 
+@inlineCallbacks
+def test_additional_requests_dont_filter_offsite() -> None:
+    items = []
+
+    with MockServer(EchoResource) as server:
+
+        @attr.define
+        class ItemPage(WebPage):
+            http: HttpClient
+
+            async def to_item(self):
+                response1 = await self.http.request(
+                    server.root_url,
+                    body=b"a",
+                )
+                # Not filtered out by the offsite middleware because it is an
+                # additional request.
+                response2 = await self.http.request("data:,b")
+                return {response1.body.decode(): response2.body.decode()}
+
+        class TestSpider(Spider):
+            name = "test_spider"
+            allowed_domains = [urlparse(server.root_url).hostname]
+
+            def start_requests(self):
+                yield Request(server.root_url, callback=self.parse)
+                # Filtered out by the offsite middleware:
+                yield Request("data:,", callback=self.parse)
+
+            async def parse(self, response, page: ItemPage):
+                item = await page.to_item()
+                items.append(item)
+
+        crawler = make_crawler(TestSpider)
+        yield crawler.crawl()
+
+    assert items == [{"a": "b"}]
+
+
 @inlineCallbacks
 def test_additional_requests_no_cb_deps() -> None:
     # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/135
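The new test leans on Scrapy's offsite filtering: with allowed_domains set, requests to other hosts are dropped, while additional requests made through HttpClient now carry dont_filter=True and go through. A minimal sketch of that middleware behavior, with a hypothetical spider not taken from this commit:

import scrapy

class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # Dropped by the offsite middleware: host not in allowed_domains.
        yield scrapy.Request("https://other.org/")
        # Let through: dont_filter=True bypasses the offsite check, which is
        # why additional requests are no longer dropped after this commit.
        yield scrapy.Request("https://other.org/", dont_filter=True)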
