Skip to content

Commit 1c68f62

Browse files
authored
Fixes conversion of userData and headers fields in Apify-Scrapy request translation (#179)
1 parent 6708f6f commit 1c68f62

File tree

12 files changed

+387
-302
lines changed

12 files changed

+387
-302
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,13 @@
22

33
## [1.5.4](../../releases/tag/v1.5.4) - Unreleased
44

5-
...
5+
### Added
6+
7+
- Add support for `headers` field in Apify <-> Scrapy request translation
8+
9+
### Fixed
10+
11+
- Fix conversion of `userData` field in Apify <-> Scrapy request translation
612

713
## [1.5.3](../../releases/tag/v1.5.3) - 2024-01-23
814

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ indent-style = "space"
135135
"**/{tests}/*" = [
136136
"D", # Everything from the pydocstyle
137137
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
138+
"PT011", # `pytest.raises({ExceptionType})` is too broad, set the `match` parameter or use a more specific exception
138139
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
139140
"S101", # Use of assert detected
140141
"T20", # flake8-print

src/apify/scrapy/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
from .requests import to_apify_request, to_scrapy_request
12
from .scheduler import ApifyScheduler
2-
from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
3+
from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client

src/apify/scrapy/middlewares/apify_retry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
) from exc
1515

1616
from apify.actor import Actor
17-
from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
17+
from apify.scrapy.requests import to_apify_request
18+
from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
1819

1920
if TYPE_CHECKING:
2021
from apify.storages import RequestQueue

src/apify/scrapy/requests.py

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from __future__ import annotations
2+
3+
import codecs
4+
import pickle
5+
6+
try:
7+
from scrapy import Request, Spider
8+
from scrapy.utils.request import request_from_dict
9+
except ImportError as exc:
10+
raise ImportError(
11+
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
12+
) from exc
13+
14+
from apify._crypto import crypto_random_object_id
15+
from apify.actor import Actor
16+
17+
18+
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Convert a Scrapy request to an Apify request.

    The result carries the URL, method, headers and user data of the Scrapy request. In addition, the whole
    Scrapy request is pickled, base64-encoded and stored under ``userData['scrapy_request']``.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {
        'url': scrapy_request.url,
        'method': scrapy_request.method,
        # Scrapy keeps headers in its own Headers container; export them as a plain dict of strings
        # instead of leaking the Scrapy-specific object into the Apify request.
        'headers': dict(scrapy_request.headers.to_unicode_dict()),
        # Shallow copy: the 'scrapy_request' entry added below must not end up in scrapy_request.meta['userData'].
        'userData': dict(scrapy_request.meta.get('userData') or {}),
    }

    # Add 'id' to the apify_request
    if scrapy_request.meta.get('apify_request_id'):
        apify_request['id'] = scrapy_request.meta['apify_request_id']

    # Add 'uniqueKey' to the apify_request
    if scrapy_request.meta.get('apify_request_unique_key'):
        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']

    # Serialize the Scrapy Request and store it in the apify_request.
    #   - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
    #     and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
    #   - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
    scrapy_request_dict = scrapy_request.to_dict(spider=spider)
    scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

    apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request
63+
64+
65+
def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Convert an Apify request to a Scrapy request.

    If the Apify request carries a serialized Scrapy request under ``userData['scrapy_request']``, that request
    is restored; otherwise a new Scrapy request is built from the URL and method. In both cases the Apify
    ``id`` and ``uniqueKey`` are stored in the Scrapy request's ``meta``.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary, or the stored Scrapy request has an unexpected type.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    # Imported here so this block does not depend on a change to the module-level import section.
    from scrapy.http.headers import Headers

    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    required_keys = ['url', 'method', 'id', 'uniqueKey']
    missing_keys = [key for key in required_keys if key not in apify_request]

    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # A 'userData' key may be present with a non-dict value (e.g. None); only a dict can hold 'scrapy_request'.
    user_data = apify_request.get('userData')

    # If the apify_request comes from the Scrapy
    if isinstance(user_data, dict) and 'scrapy_request' in user_data:
        # Deserialize the Scrapy Request from the apify_request.
        #   - This process involves decoding the base64-encoded request data and reconstructing
        #     the Scrapy Request object from its dictionary representation.
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        scrapy_request_dict_encoded = user_data['scrapy_request']
        if not isinstance(scrapy_request_dict_encoded, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        # NOTE(review): pickle.loads executes arbitrary code if the payload is attacker-controlled.
        # This is only safe while every writer of 'scrapy_request' is trusted - confirm for the queue in use.
        scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
        if not isinstance(scrapy_request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Update the meta field with the meta field from the apify_request
        meta = scrapy_request.meta or {}
        meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    # If the apify_request comes directly from the Request Queue, typically start URLs
    else:
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta={
                'apify_request_id': apify_request['id'],
                'apify_request_unique_key': apify_request['uniqueKey'],
            },
        )

    # Add optional 'headers' field
    if 'headers' in apify_request:
        # Wrap in Scrapy's Headers container so the attribute keeps the type Scrapy components expect,
        # rather than becoming a plain dict.
        scrapy_request.headers = Headers(apify_request['headers'])

    # Add optional 'userData' field
    if 'userData' in apify_request:
        scrapy_request.meta['userData'] = apify_request['userData']

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request

src/apify/scrapy/scheduler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414

1515
from apify._crypto import crypto_random_object_id
1616
from apify.actor import Actor
17-
from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client, to_apify_request, to_scrapy_request
17+
from apify.scrapy.requests import to_apify_request, to_scrapy_request
18+
from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
1819
from apify.storages import RequestQueue
1920

2021

src/apify/scrapy/utils.py

Lines changed: 0 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import codecs
5-
import pickle
64
from base64 import b64encode
75
from urllib.parse import unquote
86

97
try:
10-
from scrapy import Request, Spider
118
from scrapy.settings import Settings # noqa: TCH002
129
from scrapy.utils.project import get_project_settings
1310
from scrapy.utils.python import to_bytes
14-
from scrapy.utils.request import request_from_dict
1511
except ImportError as exc:
1612
raise ImportError(
1713
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
1814
) from exc
1915

20-
from apify._crypto import crypto_random_object_id
2116
from apify.actor import Actor
2217
from apify.storages import RequestQueue, StorageClientManager
2318

@@ -42,119 +37,6 @@ def get_running_event_loop_id() -> int:
4237
return id(asyncio.get_running_loop())
4338

4439

45-
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
46-
"""Convert a Scrapy request to an Apify request.
47-
48-
Args:
49-
scrapy_request: The Scrapy request to be converted.
50-
spider: The Scrapy spider that the request is associated with.
51-
52-
Raises:
53-
TypeError: If the scrapy_request is not an instance of the scrapy.Request class.
54-
55-
Returns:
56-
The converted Apify request.
57-
"""
58-
if not isinstance(scrapy_request, Request):
59-
raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
60-
61-
call_id = crypto_random_object_id(8)
62-
Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
63-
64-
apify_request = {
65-
'url': scrapy_request.url,
66-
'method': scrapy_request.method,
67-
}
68-
69-
# Add 'id' to the apify_request
70-
if scrapy_request.meta.get('apify_request_id'):
71-
apify_request['id'] = scrapy_request.meta['apify_request_id']
72-
73-
# Add 'uniqueKey' to the apify_request
74-
if scrapy_request.meta.get('apify_request_unique_key'):
75-
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
76-
77-
# Serialize the Scrapy Request and store it in the apify_request.
78-
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
79-
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
80-
# - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
81-
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
82-
scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
83-
apify_request['userData'] = {'scrapy_request': scrapy_request_dict_encoded}
84-
85-
Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
86-
return apify_request
87-
88-
89-
def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
90-
"""Convert an Apify request to a Scrapy request.
91-
92-
Args:
93-
apify_request: The Apify request to be converted.
94-
spider: The Scrapy spider that the request is associated with.
95-
96-
Raises:
97-
TypeError: If the apify_request is not a dictionary.
98-
ValueError: If the apify_request does not contain the required keys.
99-
100-
Returns:
101-
The converted Scrapy request.
102-
"""
103-
if not isinstance(apify_request, dict):
104-
raise TypeError('apify_request must be a dictionary')
105-
106-
required_keys = ['url', 'method', 'id', 'uniqueKey']
107-
missing_keys = [key for key in required_keys if key not in apify_request]
108-
109-
if missing_keys:
110-
raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')
111-
112-
call_id = crypto_random_object_id(8)
113-
Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
114-
115-
# If the apify_request comes from the Scrapy
116-
if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
117-
# Deserialize the Scrapy Request from the apify_request.
118-
# - This process involves decoding the base64-encoded request data and reconstructing
119-
# the Scrapy Request object from its dictionary representation.
120-
Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
121-
122-
scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
123-
if not isinstance(scrapy_request_dict_encoded, str):
124-
raise TypeError('scrapy_request_dict_encoded must be a string')
125-
126-
scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
127-
if not isinstance(scrapy_request_dict, dict):
128-
raise TypeError('scrapy_request_dict must be a dictionary')
129-
130-
scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
131-
if not isinstance(scrapy_request, Request):
132-
raise TypeError('scrapy_request must be an instance of the Request class')
133-
134-
Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
135-
136-
# Update the meta field with the meta field from the apify_request
137-
meta = scrapy_request.meta or {}
138-
meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']})
139-
scrapy_request._meta = meta # scrapy_request.meta is a property, so we have to set it like this
140-
141-
# If the apify_request comes directly from the Request Queue, typically start URLs
142-
else:
143-
Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')
144-
145-
scrapy_request = Request(
146-
url=apify_request['url'],
147-
method=apify_request['method'],
148-
meta={
149-
'apify_request_id': apify_request['id'],
150-
'apify_request_unique_key': apify_request['uniqueKey'],
151-
},
152-
)
153-
154-
Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
155-
return scrapy_request
156-
157-
15840
def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict | None = None) -> Settings:
15941
"""Integrates Apify configuration into a Scrapy project settings.
16042

tests/unit/scrapy/requests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)