From 83a8ec00264ac2a7a0133412842c64ccb72e5e69 Mon Sep 17 00:00:00 2001 From: Serhii Abarovskyi <aserhii@protonmail.com> Date: Tue, 1 Jun 2021 15:08:07 +0300 Subject: [PATCH] Add max_items --- scrapyrt/core.py | 13 ++++++++++++- tests/test_crawl_manager.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/scrapyrt/core.py b/scrapyrt/core.py index 314cb1f..2f63110 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -96,14 +96,16 @@ class CrawlManager(object): Runs crawls """ - def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False): + def __init__(self, spider_name, request_kwargs, max_items=None, max_requests=None, start_requests=False): self.spider_name = spider_name self.log_dir = settings.LOG_DIR self.items = [] self.items_dropped = [] self.errors = [] + self.max_items = int(max_items) if max_items else None self.max_requests = int(max_requests) if max_requests else None self.timeout_limit = int(settings.TIMEOUT_LIMIT) + self.items_count = 0 self.request_count = 0 self.debug = settings.DEBUG self.crawler_process = None @@ -196,6 +198,15 @@ def limit_runtime(self, spider): if (time_now - start_time).seconds >= self.timeout_limit: spider.crawler.engine.close_spider(spider, reason="timeout") + def limit_items(self, spider): + """Stop crawl after reaching max_items.""" + if self.max_items and self.max_items <= self.items_count: + reason = "stop generating items, only {} items allowed".format( + self.max_items) + spider.crawler.engine.close_spider(spider, reason=reason) + else: + self.items_count += 1 + def limit_requests(self, spider): """Stop crawl after reaching max_requests.""" if self.max_requests and self.max_requests <= self.request_count: diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py index 3f166d2..7590c22 100644 --- a/tests/test_crawl_manager.py +++ b/tests/test_crawl_manager.py @@ -252,6 +252,22 @@ def test_handle_spider_error_another_spider(self): self.assertEqual(len(self.crawl_manager.errors), 0) +class TestLimitItems(TestCrawlManager): + + def test_max_items_not_set(self): + for i in range(100): + self.crawl_manager.limit_items(self.spider) + self.assertFalse(self.crawler.engine.close_spider.called) + + def test_max_items_set(self): + self.crawl_manager.max_items = 10 + for i in range(self.crawl_manager.max_items): + self.crawl_manager.limit_items(self.spider) + self.assertFalse(self.crawler.engine.close_spider.called) + self.crawl_manager.limit_items(self.spider) + self.assertTrue(self.crawler.engine.close_spider.called) + + class TestLimitRequests(TestCrawlManager): def test_max_requests_not_set(self):