Implement multiple urls in single request #70

Open · wants to merge 2 commits into base: master
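
This PR makes the `url` parameter of the ScrapyRT API accept several URLs separated by a literal pipe (`|`); the crawl manager then creates one spider request per URL. A rough client-side sketch (the spider name is an assumption, and the port is ScrapyRT's usual default; `urlencode` percent-encodes the pipe, which the server decodes back before splitting):

    # Hypothetical usage of the pipe-separated 'url' parameter added by this PR.
    import urllib.parse
    import urllib.request

    params = urllib.parse.urlencode({
        'spider_name': 'example',  # assumed spider in the target project
        # two URLs in a single request, joined with '|'
        'url': 'http://example.com/page1|http://example.com/page2',
    })
    with urllib.request.urlopen('http://localhost:9080/crawl.json?' + params) as resp:
        print(resp.read().decode())
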
28 changes: 17 additions & 11 deletions scrapyrt/core.py
@@ -145,10 +145,15 @@ def __init__(self, spider_name, request_kwargs, max_requests=None, start_request
         # callback will be added after instantiation of crawler object
         # because we need to know if spider has method available
         self.callback_name = request_kwargs.pop('callback', None) or 'parse'
 
         if request_kwargs.get("url"):
-            self.request = self.create_spider_request(deepcopy(request_kwargs))
+            urls = request_kwargs.pop('url').split('|')
+            self.requests = []
+            for url in urls:
+                self.requests.append(self.create_spider_request(url, request_kwargs))
         else:
-            self.request = None
+            self.requests = None
 
         self.start_requests = start_requests
         self._request_scheduled = False
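
The constructor now splits the incoming `url` value on a literal pipe and builds one request per URL, with the remaining request kwargs shared across all of them. A minimal sketch of that step (the kwargs dict is an assumed example):

    # Mirrors the new __init__ logic above; 'request_kwargs' is hypothetical.
    request_kwargs = {'url': 'http://example.com/a|http://example.com/b'}
    urls = request_kwargs.pop('url').split('|')
    # urls == ['http://example.com/a', 'http://example.com/b']
    # each URL becomes its own spider request; leftover kwargs are reused for all

Note that the `deepcopy` call is gone, so every created request now draws on the same `request_kwargs` dict.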

@@ -193,15 +198,16 @@ def spider_idle(self, spider):
         which is totally wrong.
 
         """
-        if spider is self.crawler.spider and self.request and not self._request_scheduled:
+        if spider is self.crawler.spider and self.requests and not self._request_scheduled:
             callback = getattr(self.crawler.spider, self.callback_name)
             assert callable(callback), 'Invalid callback'
-            self.request = self.request.replace(callback=callback)
-            modify_request = getattr(
-                self.crawler.spider, "modify_realtime_request", None)
-            if callable(modify_request):
-                self.request = modify_request(self.request)
-            spider.crawler.engine.crawl(self.request, spider)
+            for i in range(len(self.requests)):
+                self.requests[i] = self.requests[i].replace(callback=callback)
+                modify_request = getattr(
+                    self.crawler.spider, "modify_realtime_request", None)
+                if callable(modify_request):
+                    self.requests[i] = modify_request(self.requests[i])
+                spider.crawler.engine.crawl(self.requests[i], spider)
             self._request_scheduled = True
             raise DontCloseSpider
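
`spider_idle` now loops over all pending requests, attaching the callback and passing each one through the spider's optional `modify_realtime_request` hook before scheduling it. A hedged sketch of such a hook on a spider (spider name and the tweak applied are illustrative):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'  # hypothetical spider

        def modify_realtime_request(self, request):
            # invoked once per scheduled request by the loop above
            return request.replace(method='POST')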

@@ -262,8 +268,8 @@ def return_items(self, result):
             results["errors"] = self.errors
         return results
 
-    def create_spider_request(self, kwargs):
-        url = kwargs.pop('url')
+    def create_spider_request(self, url, kwargs):
+
         try:
             req = Request(url, **kwargs)
         except (TypeError, ValueError) as e:
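
`create_spider_request` now receives the URL explicitly instead of popping it out of the kwargs dict. An assumed call with the new signature:

    # 'manager' stands in for a CrawlManager instance built elsewhere.
    req = manager.create_spider_request('http://localhost/foo',
                                        {'meta': {'debug': True}})
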
4 changes: 4 additions & 0 deletions scrapyrt/resources.py
@@ -212,6 +212,10 @@ def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
             max_requests = api_params['max_requests']
         except (KeyError, IndexError):
             max_requests = None
+        crawler_params = api_params.copy()
+        for api_param in ['max_requests', 'start_requests', 'spider_name', 'url']:
+            crawler_params.pop(api_param, None)
+        kwargs.update(crawler_params)
         dfd = self.run_crawl(
             spider_name, scrapy_request_args, max_requests,
             start_requests=start_requests, *args, **kwargs)
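
`prepare_crawl` now copies the API params, strips the keys consumed by the API itself, and forwards whatever remains to the crawl as extra keyword arguments. A small sketch of the filtering step with an assumed parameter dict:

    # Hypothetical api_params; mirrors the filtering added above.
    api_params = {'spider_name': 'example',
                  'url': 'http://example.com/a|http://example.com/b',
                  'max_requests': '5',
                  'custom_arg': 'value'}
    crawler_params = api_params.copy()
    for api_param in ['max_requests', 'start_requests', 'spider_name', 'url']:
        crawler_params.pop(api_param, None)
    # crawler_params == {'custom_arg': 'value'} -> merged into kwargs
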
38 changes: 19 additions & 19 deletions tests/test_crawl_manager.py
@@ -20,7 +20,6 @@
 
 from .spiders import MetaSpider
 
-
 class TestCrawlManager(unittest.TestCase):
 
     def setUp(self):
@@ -90,7 +89,7 @@ def setUp(self):
         # test callback
         self.spider.parse_something = lambda: None
         self.crawl_manager.callback_name = 'parse_something'
-        self.request = self.crawl_manager.request
+        self.request = self.crawl_manager.requests[0]
 
     def _call_spider_idle(self):
         try:
@@ -99,13 +98,13 @@ def _call_spider_idle(self):
             pass
 
     def test_spider_opened(self):
-        self.assertIsNone(self.crawl_manager.request.callback)
+        self.assertIsNone(self.crawl_manager.requests[0].callback)
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertNotEqual(self.request, self.crawl_manager.request)
+            self.crawl_manager.requests[0], self.spider)
+        self.assertNotEqual(self.request, self.crawl_manager.requests[0])
         self.assertEquals(
-            self.crawl_manager.request.callback, self.spider.parse_something)
+            self.crawl_manager.requests[0].callback, self.spider.parse_something)
 
     def test_raise_error_if_not_callable(self):
         self.spider.parse_something = None
@@ -114,8 +113,9 @@ def test_raise_error_if_not_callable(self):
         self.assertFalse(self.crawler.engine.crawl.called)
 
     def test_modify_realtime_request(self):
-        self.assertDictEqual(self.crawl_manager.request.meta, {})
-        self.assertEqual(self.crawl_manager.request.method, 'GET')
+        for request in self.crawl_manager.requests:
+            self.assertDictEqual(request.meta, {})
+            self.assertEqual(request.method, 'GET')
 
         def modify_realtime_request(request):
             request = request.replace(method='POST')
@@ -125,16 +125,16 @@ def modify_realtime_request(request):
         self.spider.modify_realtime_request = modify_realtime_request
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertEqual(self.crawl_manager.request.method, 'POST')
-        self.assertEqual(self.crawl_manager.request.meta['foo'], 'bar')
+            self.crawl_manager.requests[0], self.spider)
+        self.assertEqual(self.crawl_manager.requests[0].method, 'POST')
+        self.assertEqual(self.crawl_manager.requests[0].meta['foo'], 'bar')
 
     def test_modify_realtime_request_is_not_callable(self):
         self.spider.modify_realtime_request = None
         self._call_spider_idle()
         self.crawler.engine.crawl.assert_called_once_with(
-            self.crawl_manager.request, self.spider)
-        self.assertNotEqual(self.request, self.crawl_manager.request)
+            self.crawl_manager.requests[0], self.spider)
+        self.assertNotEqual(self.request, self.crawl_manager.requests[0])
 
 
 class TestHandleScheduling(TestCrawlManager):
@@ -146,13 +146,13 @@ def setUp(self):
 
     def test_handle_scheduling(self):
         self.crawl_manager.handle_scheduling(
-            self.crawl_manager.request, self.spider)
+            self.crawl_manager.requests[0], self.spider)
         self.crawl_manager.limit_requests.assert_called_once_with(self.spider)
         self.crawl_manager.limit_runtime.assert_called_once_with(self.spider)
 
     def test_handle_scheduling_another_spider(self):
         self.crawl_manager.handle_scheduling(
-            self.crawl_manager.request, self.another_spider)
+            self.crawl_manager.requests[0], self.another_spider)
         self.assertFalse(self.crawl_manager.limit_requests.called)
         self.assertFalse(self.crawl_manager.limit_runtime.called)
@@ -321,24 +321,24 @@ def test_return_items_without_debug(self):
 class TestCreateSpiderRequest(TestCrawlManager):
 
     def test_valid_arguments(self):
-        req = self.crawl_manager.create_spider_request(self.kwargs)
+        req = self.crawl_manager.create_spider_request(self.kwargs.pop('url'), self.kwargs)
         self.assertTrue(req.dont_filter)
         self.assertEqual(req.url, self.url)
 
     def test_invalid_arguments(self):
         self.kwargs['url1'] = 'http://localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url1'), self.kwargs)
         self.assertEqual(exception.status, '400')
 
     def test_invalid_url(self):
         self.kwargs['url'] = '//localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url'), self.kwargs)
         self.assertEqual(exception.status, '400')
         self.kwargs['url'] = 'localhost/foo'
         exception = self.assertRaises(
-            Error, self.crawl_manager.create_spider_request, self.kwargs)
+            Error, self.crawl_manager.create_spider_request, self.kwargs.pop('url'), self.kwargs)
         self.assertEqual(exception.status, '400')

