1
1
from __future__ import annotations
2
2
3
3
import asyncio
4
- import codecs
5
- import pickle
6
4
from base64 import b64encode
7
5
from urllib .parse import unquote
8
6
9
7
try :
10
- from scrapy import Request , Spider
11
8
from scrapy .settings import Settings # noqa: TCH002
12
9
from scrapy .utils .project import get_project_settings
13
10
from scrapy .utils .python import to_bytes
14
- from scrapy .utils .request import request_from_dict
15
11
except ImportError as exc :
16
12
raise ImportError (
17
13
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".' ,
18
14
) from exc
19
15
20
- from apify ._crypto import crypto_random_object_id
21
16
from apify .actor import Actor
22
17
from apify .storages import RequestQueue , StorageClientManager
23
18
@@ -42,119 +37,6 @@ def get_running_event_loop_id() -> int:
42
37
return id (asyncio .get_running_loop ())
43
38
44
39
45
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
    """Translate a Scrapy request into its Apify request representation.

    Args:
        scrapy_request: The Scrapy request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the scrapy_request is not an instance of the scrapy.Request class.

    Returns:
        The converted Apify request.
    """
    if not isinstance(scrapy_request, Request):
        raise TypeError('scrapy_request must be an instance of the scrapy.Request class')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')

    apify_request = {'url': scrapy_request.url, 'method': scrapy_request.method}

    # Carry over the Apify bookkeeping fields if they were previously stashed in the meta
    # (they are present on requests that round-tripped through to_scrapy_request).
    request_id = scrapy_request.meta.get('apify_request_id')
    if request_id:
        apify_request['id'] = request_id

    unique_key = scrapy_request.meta.get('apify_request_unique_key')
    if unique_key:
        apify_request['uniqueKey'] = unique_key

    # Serialize the full Scrapy Request (dict -> pickle -> base64 text) and tuck it into
    # userData so the exact request can be reconstructed later.
    # Serialization approach per https://stackoverflow.com/questions/30469575/.
    serialized_request = codecs.encode(pickle.dumps(scrapy_request.to_dict(spider=spider)), 'base64').decode()
    apify_request['userData'] = {'scrapy_request': serialized_request}

    Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
    return apify_request
87
-
88
-
89
def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
    """Translate an Apify request into its Scrapy request counterpart.

    Args:
        apify_request: The Apify request to be converted.
        spider: The Scrapy spider that the request is associated with.

    Raises:
        TypeError: If the apify_request is not a dictionary.
        ValueError: If the apify_request does not contain the required keys.

    Returns:
        The converted Scrapy request.
    """
    if not isinstance(apify_request, dict):
        raise TypeError('apify_request must be a dictionary')

    missing_keys = [key for key in ('url', 'method', 'id', 'uniqueKey') if key not in apify_request]
    if missing_keys:
        raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')

    call_id = crypto_random_object_id(8)
    Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

    # Meta entries that let a later to_apify_request call recover the Apify identifiers.
    apify_meta = {'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']}

    if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']:
        # The request originated from Scrapy: rebuild it from its serialized form
        # (base64 text -> pickle -> dict -> Request).
        Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')

        encoded_request = apify_request['userData']['scrapy_request']
        if not isinstance(encoded_request, str):
            raise TypeError('scrapy_request_dict_encoded must be a string')

        # NOTE(review): pickle.loads is only safe here if the queue contents are trusted,
        # i.e. produced by to_apify_request — confirm nothing else writes to this queue.
        request_dict = pickle.loads(codecs.decode(encoded_request.encode(), 'base64'))
        if not isinstance(request_dict, dict):
            raise TypeError('scrapy_request_dict must be a dictionary')

        scrapy_request = request_from_dict(request_dict, spider=spider)
        if not isinstance(scrapy_request, Request):
            raise TypeError('scrapy_request must be an instance of the Request class')

        Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')

        # Merge the Apify identifiers into the restored request's meta.
        meta = scrapy_request.meta or {}
        meta.update(apify_meta)
        scrapy_request._meta = meta  # scrapy_request.meta is a property, so we have to set it like this

    else:
        # No serialized form available — typically a start URL pushed straight into the
        # Request Queue, so build a fresh Request from the plain fields.
        Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)')

        scrapy_request = Request(
            url=apify_request['url'],
            method=apify_request['method'],
            meta=apify_meta,
        )

    Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}')
    return scrapy_request
156
-
157
-
158
40
def apply_apify_settings (* , settings : Settings | None = None , proxy_config : dict | None = None ) -> Settings :
159
41
"""Integrates Apify configuration into a Scrapy project settings.
160
42
0 commit comments