Skip to content

Commit 1ca82f8

Browse files
authored
Update of Scrapy integration, fixing downloader middlewares (#186)
1 parent ea6c7f7 commit 1ca82f8

File tree

8 files changed

+47
-142
lines changed

8 files changed

+47
-142
lines changed

CHANGELOG.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
# Changelog
22

3-
## [1.5.6](../../releases/tag/v1.5.6) - Unreleased
3+
## [1.6.0](../../releases/tag/v1.6.0) - Unreleased
44

5-
...
5+
### Fixed
6+
7+
- Update of Scrapy integration, fixes in `ApifyScheduler`, `to_apify_request` and `apply_apify_settings`.
8+
9+
### Removed
10+
11+
- Removed `ApifyRetryMiddleware` in favor of Scrapy's default `RetryMiddleware`
612

713
## [1.5.5](../../releases/tag/v1.5.5) - 2024-02-01
814

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "apify"
3-
version = "1.5.6"
3+
version = "1.6.0"
44
description = "Apify SDK for Python"
55
readme = "README.md"
66
license = { text = "Apache Software License" }
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
from .apify_proxy import ApifyHttpProxyMiddleware
2-
from .apify_retry import ApifyRetryMiddleware

src/apify/scrapy/middlewares/apify_retry.py

Lines changed: 0 additions & 117 deletions
This file was deleted.

src/apify/scrapy/requests.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
from apify.actor import Actor
1717

1818

19+
def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
20+
"""Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
21+
22+
Works for RetryMiddleware and RedirectMiddleware.
23+
"""
24+
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
25+
26+
1927
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
2028
"""Convert a Scrapy request to an Apify request.
2129
@@ -48,13 +56,16 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
4856
f'scrapy_request.headers is not an instance of the scrapy.http.headers.Headers class, scrapy_request.headers = {scrapy_request.headers}',
4957
)
5058

51-
# Add 'id' to the apify_request
52-
if scrapy_request.meta.get('apify_request_id'):
53-
apify_request['id'] = scrapy_request.meta['apify_request_id']
59+
if _is_request_produced_by_middleware(scrapy_request):
60+
apify_request['uniqueKey'] = scrapy_request.url
61+
else:
62+
# Add 'id' to the apify_request
63+
if scrapy_request.meta.get('apify_request_id'):
64+
apify_request['id'] = scrapy_request.meta['apify_request_id']
5465

55-
# Add 'uniqueKey' to the apify_request
56-
if scrapy_request.meta.get('apify_request_unique_key'):
57-
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
66+
# Add 'uniqueKey' to the apify_request
67+
if scrapy_request.meta.get('apify_request_unique_key'):
68+
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
5869

5970
# Serialize the Scrapy Request and store it in the apify_request.
6071
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,

src/apify/scrapy/scheduler.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def has_pending_requests(self: ApifyScheduler) -> bool:
7070
def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
7171
"""Add a request to the scheduler.
7272
73+
This could be called either from a spider or from a downloader middleware (e.g. redirect, retry, ...).
74+
7375
Args:
7476
request: The request to add to the scheduler.
7577
@@ -94,7 +96,7 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
9496
traceback.print_exc()
9597
raise
9698

97-
Actor.log.debug(f'[{call_id}]: apify_request was added to the RQ (apify_request={apify_request})')
99+
Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
98100
return bool(result['wasAlreadyPresent'])
99101

100102
def next_request(self: ApifyScheduler) -> Request | None:
@@ -109,6 +111,7 @@ def next_request(self: ApifyScheduler) -> Request | None:
109111
if not isinstance(self._rq, RequestQueue):
110112
raise TypeError('self._rq must be an instance of the RequestQueue class')
111113

114+
# Fetch the next request from the Request Queue
112115
try:
113116
apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
114117
except BaseException:
@@ -123,6 +126,14 @@ def next_request(self: ApifyScheduler) -> Request | None:
123126
if not isinstance(self.spider, Spider):
124127
raise TypeError('self.spider must be an instance of the Spider class')
125128

129+
# Let the Request Queue know that the request is being handled. Every request should be marked as handled,
130+
# retrying is handled by the Scrapy's RetryMiddleware.
131+
try:
132+
nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
133+
except BaseException:
134+
traceback.print_exc()
135+
raise
136+
126137
scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
127138
Actor.log.debug(
128139
f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',

src/apify/scrapy/utils.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,13 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
5959
# ensuring it is executed as the final step in the pipeline sequence
6060
settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
6161

62-
# Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
63-
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
62+
# Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
63+
# during process_response, but currently we have no way of detecting it and handling it properly.
64+
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None
6465

65-
# Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
66+
# Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
6667
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
67-
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
68-
69-
# Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
70-
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
71-
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
68+
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
7269

7370
# Store the proxy configuration
7471
settings['APIFY_PROXY_SETTINGS'] = proxy_config

tests/unit/scrapy/utils/test_apply_apify_settings.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,18 @@ def test__apply_apify_settings__update_downloader_middlewares() -> None:
3434
'DOWNLOADER_MIDDLEWARES': {
3535
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
3636
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
37-
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
3837
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
3938
},
4039
}
4140
)
4241
new_settings = apply_apify_settings(settings=settings)
4342

4443
assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
45-
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
46-
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
47-
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
48-
'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
49-
'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
44+
'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 750,
45+
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': None,
5046
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
47+
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
48+
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
5149
}
5250

5351

0 commit comments

Comments
 (0)