From 94ddc8d11afe2eb10bdcc95b3eadb051ffe3e119 Mon Sep 17 00:00:00 2001
From: Alessio Pollero
Date: Thu, 26 Oct 2017 16:31:31 +0200
Subject: [PATCH 1/2] Implement multiple urls in single request

---
 scrapyrt/core.py            | 28 ++++++++++++++++-----------
 tests/test_crawl_manager.py | 38 ++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/scrapyrt/core.py b/scrapyrt/core.py
index 1df6f4a..10a0992 100644
--- a/scrapyrt/core.py
+++ b/scrapyrt/core.py
@@ -145,10 +145,15 @@ def __init__(self, spider_name, request_kwargs, max_requests=None, start_request
         # callback will be added after instantiation of crawler object
         # because we need to know if spider has method available
         self.callback_name = request_kwargs.pop('callback', None) or 'parse'
+
         if request_kwargs.get("url"):
-            self.request = self.create_spider_request(deepcopy(request_kwargs))
+            urls = request_kwargs.pop('url').split('|')
+            self.requests = []
+            for url in urls:
+                self.requests.append(self.create_spider_request(url, request_kwargs))
         else:
-            self.request = None
+            self.requests = None
+
         self.start_requests = start_requests
         self._request_scheduled = False
 
@@ -193,15 +198,16 @@ def spider_idle(self, spider):
         which is totally wrong.
 
         """
-        if spider is self.crawler.spider and self.request and not self._request_scheduled:
+        if spider is self.crawler.spider and self.requests and not self._request_scheduled:
             callback = getattr(self.crawler.spider, self.callback_name)
             assert callable(callback), 'Invalid callback'
-            self.request = self.request.replace(callback=callback)
-            modify_request = getattr(
-                self.crawler.spider, "modify_realtime_request", None)
-            if callable(modify_request):
-                self.request = modify_request(self.request)
-            spider.crawler.engine.crawl(self.request, spider)
+            for i in range(len(self.requests)):
+                self.requests[i] = self.requests[i].replace(callback=callback)
+                modify_request = getattr(
+                    self.crawler.spider, "modify_realtime_request", None)
+                if callable(modify_request):
+                    self.requests[i] = modify_request(self.requests[i])
+                spider.crawler.engine.crawl(self.requests[i], spider)
             self._request_scheduled = True
         raise DontCloseSpider
 
@@ -262,8 +268,8 @@ def return_items(self, result):
             results["errors"] = self.errors
         return results
 
-    def create_spider_request(self, kwargs):
-        url = kwargs.pop('url')
+    def create_spider_request(self, url, kwargs):
+
         try:
             req = Request(url, **kwargs)
         except (TypeError, ValueError) as e:
diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py
index f0df003..f1b9b18 100644
--- a/tests/test_crawl_manager.py
+++ b/tests/test_crawl_manager.py
@@ -20,7 +20,6 @@
 from .spiders import MetaSpider
 
 
-
 class TestCrawlManager(unittest.TestCase):
 
     def setUp(self):
@@ -90,7 +89,7 @@ def setUp(self):
         # test callback
         self.spider.parse_something = lambda: None
         self.crawl_manager.callback_name = 'parse_something'
-        self.request = self.crawl_manager.request
+        self.request = self.crawl_manager.requests[0]
 
     def _call_spider_idle(self):
         try:
@@ -99,13 +98,13 @@ def _call_spider_idle(self):
             pass
 
     def test_spider_opened(self):
-        self.assertIsNone(self.crawl_manager.request.callback)
+        self.assertIsNone(self.crawl_manager.requests[0].callback)
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertNotEqual(self.request, self.crawl_manager.request)
+            self.crawl_manager.requests[0], self.spider)
+        self.assertNotEqual(self.request, self.crawl_manager.requests[0])
         self.assertEquals(
-            self.crawl_manager.request.callback, self.spider.parse_something)
+            self.crawl_manager.requests[0].callback, self.spider.parse_something)
 
     def test_raise_error_if_not_callable(self):
         self.spider.parse_something = None
@@ -114,8 +113,9 @@ def test_raise_error_if_not_callable(self):
         self.assertFalse(self.crawler.engine.crawl.called)
 
     def test_modify_realtime_request(self):
-        self.assertDictEqual(self.crawl_manager.request.meta, {})
-        self.assertEqual(self.crawl_manager.request.method, 'GET')
+        for request in self.crawl_manager.requests:
+            self.assertDictEqual(request.meta, {})
+            self.assertEqual(request.method, 'GET')
 
         def modify_realtime_request(request):
             request = request.replace(method='POST')
@@ -125,16 +125,16 @@ def modify_realtime_request(request):
         self.spider.modify_realtime_request = modify_realtime_request
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertEqual(self.crawl_manager.request.method, 'POST')
-        self.assertEqual(self.crawl_manager.request.meta['foo'], 'bar')
+            self.crawl_manager.requests[0], self.spider)
+        self.assertEqual(self.crawl_manager.requests[0].method, 'POST')
+        self.assertEqual(self.crawl_manager.requests[0].meta['foo'], 'bar')
 
     def test_modify_realtime_request_is_not_callable(self):
         self.spider.modify_realtime_request = None
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertNotEqual(self.request, self.crawl_manager.request)
+            self.crawl_manager.requests[0], self.spider)
+        self.assertNotEqual(self.request, self.crawl_manager.requests[0])
 
 
 class TestHandleScheduling(TestCrawlManager):
@@ -146,13 +146,13 @@ def setUp(self):
 
     def test_handle_scheduling(self):
         self.crawl_manager.handle_scheduling(
-            self.crawl_manager.request, self.spider)
+            self.crawl_manager.requests[0], self.spider)
         self.crawl_manager.limit_requests.assert_called_once_with(self.spider)
         self.crawl_manager.limit_runtime.assert_called_once_with(self.spider)
 
     def test_handle_scheduling_another_spider(self):
         self.crawl_manager.handle_scheduling(
-            self.crawl_manager.request, self.another_spider)
+            self.crawl_manager.requests[0], self.another_spider)
         self.assertFalse(self.crawl_manager.limit_requests.called)
         self.assertFalse(self.crawl_manager.limit_runtime.called)
 
@@ -321,24 +321,24 @@ def test_return_items_without_debug(self):
 
 
 class TestCreateSpiderRequest(TestCrawlManager):
 
     def test_valid_arguments(self):
-        req = self.crawl_manager.create_spider_request(self.kwargs)
+        req = self.crawl_manager.create_spider_request(self.kwargs.pop('url'), self.kwargs)
         self.assertTrue(req.dont_filter)
         self.assertEqual(req.url, self.url)
 
     def test_invalid_arguments(self):
         self.kwargs['url1'] = 'http://localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url1'), self.kwargs)
         self.assertEqual(exception.status, '400')
 
     def test_invalid_url(self):
         self.kwargs['url'] = '//localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url'), self.kwargs)
         self.assertEqual(exception.status, '400')
         self.kwargs['url'] = 'localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url'), self.kwargs)
         self.assertEqual(exception.status, '400')
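With this first patch, CrawlManager fans a pipe-separated "url" argument out
into one scrapy Request per URL before scheduling. A minimal standalone sketch
of that fan-out follows; the build_requests helper and the sample values are
illustrative, not part of the patch, and the deepcopy is an assumption of this
sketch so the requests do not share mutable kwargs such as meta (the patch
itself passes one shared dict to every create_spider_request call):

    from copy import deepcopy

    from scrapy import Request


    def build_requests(request_kwargs):
        # Same splitting rule as CrawlManager.__init__ in the patch: "|"
        # separates URLs, so it cannot occur inside any single URL.
        urls = request_kwargs.pop('url').split('|')
        # deepcopy keeps the resulting requests from sharing one meta dict;
        # this is an addition of the sketch, not something the patch does.
        return [Request(url, **deepcopy(request_kwargs)) for url in urls]


    spider_requests = build_requests({
        'url': 'http://example.com/a|http://example.com/b',
        'meta': {'depth': 0},
    })
    print([r.url for r in spider_requests])
    # ['http://example.com/a', 'http://example.com/b']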
From 3094319c70752877d0a5bbaf638fea6ca209c645 Mon Sep 17 00:00:00 2001
From: Alessio Pollero
Date: Tue, 20 Mar 2018 12:55:14 +0100
Subject: [PATCH 2/2] Allow passing spider parameters in url

---
 scrapyrt/resources.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py
index 1103fc0..9dd1dd8 100644
--- a/scrapyrt/resources.py
+++ b/scrapyrt/resources.py
@@ -212,6 +212,10 @@ def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
             max_requests = api_params['max_requests']
         except (KeyError, IndexError):
             max_requests = None
+        crawler_params = api_params.copy()
+        for api_param in ['max_requests', 'start_requests', 'spider_name','url']:
+            crawler_params.pop(api_param, None)
+        kwargs.update(crawler_params)
         dfd = self.run_crawl(
             spider_name, scrapy_request_args, max_requests,
             start_requests=start_requests, *args, **kwargs)
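With both patches applied, one ScrapyRT call can carry several start URLs and
extra spider arguments. A client-side sketch, assuming a ScrapyRT instance on
its default port 9080 and a spider named "toscrape" — host, port and spider
name are assumptions for illustration:

    import requests

    params = {
        'spider_name': 'toscrape',
        # PATCH 1/2: several start URLs in one call, separated by "|"
        'url': 'http://example.com/page1|http://example.com/page2',
        # PATCH 2/2: parameters other than 'max_requests', 'start_requests',
        # 'spider_name' and 'url' are forwarded to the crawl as spider
        # arguments
        'category': 'books',
    }
    # requests percent-encodes the "|" separator; Twisted decodes it again
    # on the server before CrawlManager splits the url parameter.
    response = requests.get('http://localhost:9080/crawl.json', params=params)
    print(response.json().get('items'))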