Skip to content

Commit 72a37f1

Browse files
authored
Add apply_apify_settings to Scrapy subpackage (#178)
1 parent 6294a1c commit 72a37f1

File tree

3 files changed

+106
-1
lines changed

3 files changed

+106
-1
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
## [1.5.3](../../releases/tag/v1.5.3) - Unreleased
44

5-
...
5+
### Added
6+
7+
- Add `apply_apify_settings` function to Scrapy subpackage
68

79
## [1.5.2](../../releases/tag/v1.5.2) - 2024-01-19
810

src/apify/scrapy/utils.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
try:
1010
from scrapy import Request, Spider
11+
from scrapy.settings import Settings # noqa: TCH002
12+
from scrapy.utils.project import get_project_settings
1113
from scrapy.utils.python import to_bytes
1214
from scrapy.utils.request import request_from_dict
1315
except ImportError as exc:
@@ -153,6 +155,45 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
153155
return scrapy_request
154156

155157

158+
def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
    """Integrate Apify-specific components into a Scrapy project's settings.

    Note: The passed `settings` object is modified in place and returned for convenience.

    Args:
        settings: Scrapy project settings to be modified. When omitted, the active
            project settings are loaded via `get_project_settings()`.
        proxy_config: Proxy configuration to be stored in the settings.

    Returns:
        The (modified) Scrapy project settings.
    """
    if settings is None:
        settings = get_project_settings()

    # Swap Scrapy's default scheduler for the Apify-aware one.
    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

    item_pipelines = settings['ITEM_PIPELINES']
    downloader_middlewares = settings['DOWNLOADER_MIDDLEWARES']

    # Register the dataset-push pipeline with order 1000 (the maximum),
    # so it always runs as the last step of the pipeline chain.
    item_pipelines['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

    # Setting a middleware's order to None disables it. RobotsTxtMiddleware is
    # disabled because Apify's custom scheduler already honours robots.txt.
    downloader_middlewares['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

    # Replace the stock proxy middleware with Apify's implementation.
    downloader_middlewares['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
    downloader_middlewares['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

    # Replace the stock retry middleware with Apify's, ordered last (1000).
    downloader_middlewares['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    downloader_middlewares['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

    # Make the proxy configuration available to the Apify middlewares.
    settings['APIFY_PROXY_SETTINGS'] = proxy_config

    return settings
195+
196+
156197
async def open_queue_with_custom_client() -> RequestQueue:
157198
"""Open a Request Queue with custom Apify Client.
158199
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from __future__ import annotations
2+
3+
from scrapy.settings import Settings
4+
5+
from apify.scrapy.utils import apply_apify_settings
6+
7+
8+
def test__apply_apify_settings__overrides_scheduler() -> None:
    # Applying the Apify settings must install the Apify scheduler.
    updated = apply_apify_settings(settings=Settings())

    assert updated.get('SCHEDULER') == 'apify.scrapy.scheduler.ApifyScheduler'
13+
14+
15+
def test__apply_apify_settings__update_item_pipelines() -> None:
    # Pre-existing pipelines must be kept; the Apify pipeline is appended last.
    initial = Settings({'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1}})

    updated = apply_apify_settings(settings=initial)

    expected = {
        'scrapy.pipelines.files.FilesPipeline': 1,
        'apify.scrapy.pipelines.ActorDatasetPushPipeline': 1000,
    }
    assert updated.get('ITEM_PIPELINES') == expected
29+
30+
31+
def test__apply_apify_settings__update_downloader_middlewares() -> None:
    # Stock robotstxt/httpproxy/retry middlewares get disabled (None), the Apify
    # replacements get registered, and unrelated middlewares stay untouched.
    initial = Settings(
        {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
                'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
            },
        }
    )

    updated = apply_apify_settings(settings=initial)

    expected = {
        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
        'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
    }
    assert updated.get('DOWNLOADER_MIDDLEWARES') == expected
52+
53+
54+
def test__apply_apify_settings__add_proxy_config() -> None:
    # Without an explicit proxy configuration, nothing is stored.
    assert apply_apify_settings(settings=Settings()).get('APIFY_PROXY_SETTINGS') is None

    # A provided proxy configuration is stored verbatim.
    proxy_config = {'useApifyProxy': True, 'apifyProxyGroups': []}
    updated = apply_apify_settings(settings=Settings(), proxy_config=proxy_config)
    assert updated.get('APIFY_PROXY_SETTINGS') == {'useApifyProxy': True, 'apifyProxyGroups': []}

0 commit comments

Comments
 (0)