Skip to content

Commit f3fba07

Browse files
author
Alessio Pollero
committed
Implement multiple URLs in a single request
1 parent 853d0a3 commit f3fba07

File tree

2 files changed

+36
-30
lines changed

2 files changed

+36
-30
lines changed

scrapyrt/core.py

+17-11
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,15 @@ def __init__(self, spider_name, request_kwargs, max_requests=None, start_request
145145
# callback will be added after instantiation of crawler object
146146
# because we need to know if spider has method available
147147
self.callback_name = request_kwargs.pop('callback', None) or 'parse'
148+
148149
if request_kwargs.get("url"):
149-
self.request = self.create_spider_request(deepcopy(request_kwargs))
150+
urls = request_kwargs.pop('url').split('|')
151+
self.requests = []
152+
for url in urls:
153+
self.requests.append(self.create_spider_request(request_kwargs, url))
150154
else:
151-
self.request = None
155+
self.requests = None
156+
152157
self.start_requests = start_requests
153158
self._request_scheduled = False
154159

@@ -193,15 +198,16 @@ def spider_idle(self, spider):
193198
which is totally wrong.
194199
195200
"""
196-
if spider is self.crawler.spider and self.request and not self._request_scheduled:
201+
if spider is self.crawler.spider and self.requests and not self._request_scheduled:
197202
callback = getattr(self.crawler.spider, self.callback_name)
198203
assert callable(callback), 'Invalid callback'
199-
self.request = self.request.replace(callback=callback)
200-
modify_request = getattr(
201-
self.crawler.spider, "modify_realtime_request", None)
202-
if callable(modify_request):
203-
self.request = modify_request(self.request)
204-
spider.crawler.engine.crawl(self.request, spider)
204+
for i in range(len(self.requests)):
205+
self.requests[i] = self.requests[i].replace(callback=callback)
206+
modify_request = getattr(
207+
self.crawler.spider, "modify_realtime_request", None)
208+
if callable(modify_request):
209+
self.requests[i] = modify_request(self.requests[i])
210+
spider.crawler.engine.crawl(self.requests[i], spider)
205211
self._request_scheduled = True
206212
raise DontCloseSpider
207213

@@ -262,8 +268,8 @@ def return_items(self, result):
262268
results["errors"] = self.errors
263269
return results
264270

265-
def create_spider_request(self, kwargs):
266-
url = kwargs.pop('url')
271+
def create_spider_request(self, kwargs, url):
272+
267273
try:
268274
req = Request(url, **kwargs)
269275
except (TypeError, ValueError) as e:

tests/test_crawl_manager.py

+19-19
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
from .spiders import MetaSpider
2222

23-
2423
class TestCrawlManager(unittest.TestCase):
2524

2625
def setUp(self):
@@ -90,7 +89,7 @@ def setUp(self):
9089
# test callback
9190
self.spider.parse_something = lambda: None
9291
self.crawl_manager.callback_name = 'parse_something'
93-
self.request = self.crawl_manager.request
92+
self.request = self.crawl_manager.requests[0]
9493

9594
def _call_spider_idle(self):
9695
try:
@@ -99,13 +98,13 @@ def _call_spider_idle(self):
9998
pass
10099

101100
def test_spider_opened(self):
102-
self.assertIsNone(self.crawl_manager.request.callback)
101+
self.assertIsNone(self.crawl_manager.requests[0].callback)
103102
self._call_spider_idle()
104103
self.crawler.engine.crawl.assert_called_once_with(
105-
self.crawl_manager.request, self.spider)
106-
self.assertNotEqual(self.request, self.crawl_manager.request)
104+
self.crawl_manager.requests[0], self.spider)
105+
self.assertNotEqual(self.request, self.crawl_manager.requests[0])
107106
self.assertEquals(
108-
self.crawl_manager.request.callback, self.spider.parse_something)
107+
self.crawl_manager.requests[0].callback, self.spider.parse_something)
109108

110109
def test_raise_error_if_not_callable(self):
111110
self.spider.parse_something = None
@@ -114,8 +113,9 @@ def test_raise_error_if_not_callable(self):
114113
self.assertFalse(self.crawler.engine.crawl.called)
115114

116115
def test_modify_realtime_request(self):
117-
self.assertDictEqual(self.crawl_manager.request.meta, {})
118-
self.assertEqual(self.crawl_manager.request.method, 'GET')
116+
for request in self.crawl_manager.requests:
117+
self.assertDictEqual(request.meta, {})
118+
self.assertEqual(request.method, 'GET')
119119

120120
def modify_realtime_request(request):
121121
request = request.replace(method='POST')
@@ -125,16 +125,16 @@ def modify_realtime_request(request):
125125
self.spider.modify_realtime_request = modify_realtime_request
126126
self._call_spider_idle()
127127
self.crawler.engine.crawl.assert_called_once_with(
128-
self.crawl_manager.request, self.spider)
129-
self.assertEqual(self.crawl_manager.request.method, 'POST')
130-
self.assertEqual(self.crawl_manager.request.meta['foo'], 'bar')
128+
self.crawl_manager.requests[0], self.spider)
129+
self.assertEqual(self.crawl_manager.requests[0].method, 'POST')
130+
self.assertEqual(self.crawl_manager.requests[0].meta['foo'], 'bar')
131131

132132
def test_modify_realtime_request_is_not_callable(self):
133133
self.spider.modify_realtime_request = None
134134
self._call_spider_idle()
135135
self.crawler.engine.crawl.assert_called_once_with(
136-
self.crawl_manager.request, self.spider)
137-
self.assertNotEqual(self.request, self.crawl_manager.request)
136+
self.crawl_manager.requests[0], self.spider)
137+
self.assertNotEqual(self.request, self.crawl_manager.requests[0])
138138

139139

140140
class TestHandleScheduling(TestCrawlManager):
@@ -146,13 +146,13 @@ def setUp(self):
146146

147147
def test_handle_scheduling(self):
148148
self.crawl_manager.handle_scheduling(
149-
self.crawl_manager.request, self.spider)
149+
self.crawl_manager.requests[0], self.spider)
150150
self.crawl_manager.limit_requests.assert_called_once_with(self.spider)
151151
self.crawl_manager.limit_runtime.assert_called_once_with(self.spider)
152152

153153
def test_handle_scheduling_another_spider(self):
154154
self.crawl_manager.handle_scheduling(
155-
self.crawl_manager.request, self.another_spider)
155+
self.crawl_manager.requests[0], self.another_spider)
156156
self.assertFalse(self.crawl_manager.limit_requests.called)
157157
self.assertFalse(self.crawl_manager.limit_runtime.called)
158158

@@ -321,24 +321,24 @@ def test_return_items_without_debug(self):
321321
class TestCreateSpiderRequest(TestCrawlManager):
322322

323323
def test_valid_arguments(self):
324-
req = self.crawl_manager.create_spider_request(self.kwargs)
324+
req = self.crawl_manager.create_spider_request(self.kwargs, self.kwargs['url'])
325325
self.assertTrue(req.dont_filter)
326326
self.assertEqual(req.url, self.url)
327327

328328
def test_invalid_arguments(self):
329329
self.kwargs['url1'] = 'http://localhost/foo'
330330
exception = self.assertRaises(
331-
Error, self.crawl_manager.create_spider_request, self.kwargs)
331+
Error, self.crawl_manager.create_spider_request, self.kwargs, self.kwargs['url1'])
332332
self.assertEqual(exception.status, '400')
333333

334334
def test_invalid_url(self):
335335
self.kwargs['url'] = '//localhost/foo'
336336
exception = self.assertRaises(
337-
Error, self.crawl_manager.create_spider_request, self.kwargs)
337+
Error, self.crawl_manager.create_spider_request, self.kwargs, self.kwargs['url'])
338338
self.assertEqual(exception.status, '400')
339339
self.kwargs['url'] = 'localhost/foo'
340340
exception = self.assertRaises(
341-
Error, self.crawl_manager.create_spider_request, self.kwargs)
341+
Error, self.crawl_manager.create_spider_request, self.kwargs, self.kwargs['url'])
342342
self.assertEqual(exception.status, '400')
343343

344344

0 commit comments

Comments
 (0)