From 83a8ec00264ac2a7a0133412842c64ccb72e5e69 Mon Sep 17 00:00:00 2001
From: Serhii Abarovskyi <aserhii@protonmail.com>
Date: Tue, 1 Jun 2021 15:08:07 +0300
Subject: [PATCH] Add max_items limit to CrawlManager

---
 scrapyrt/core.py            | 13 ++++++++++++-
 tests/test_crawl_manager.py | 16 ++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/scrapyrt/core.py b/scrapyrt/core.py
index 314cb1f..2f63110 100644
--- a/scrapyrt/core.py
+++ b/scrapyrt/core.py
@@ -96,14 +96,16 @@ class CrawlManager(object):
     Runs crawls
     """
 
-    def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False):
+    def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False, max_items=None):
         self.spider_name = spider_name
         self.log_dir = settings.LOG_DIR
         self.items = []
         self.items_dropped = []
         self.errors = []
+        self.max_items = int(max_items) if max_items else None
         self.max_requests = int(max_requests) if max_requests else None
         self.timeout_limit = int(settings.TIMEOUT_LIMIT)
+        self.items_count = 0
         self.request_count = 0
         self.debug = settings.DEBUG
         self.crawler_process = None
@@ -196,6 +198,15 @@ def limit_runtime(self, spider):
         if (time_now - start_time).seconds >= self.timeout_limit:
             spider.crawler.engine.close_spider(spider, reason="timeout")
 
+    def limit_items(self, spider):
+        """Count each call and close the spider once max_items is reached."""
+        limit = self.max_items
+        if not limit or self.items_count < limit:  # no limit set, or below it
+            self.items_count += 1
+            return
+        message = "stop generating items, only {} items allowed".format(limit)
+        spider.crawler.engine.close_spider(spider, reason=message)
+
     def limit_requests(self, spider):
         """Stop crawl after reaching max_requests."""
         if self.max_requests and self.max_requests <= self.request_count:
diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py
index 3f166d2..7590c22 100644
--- a/tests/test_crawl_manager.py
+++ b/tests/test_crawl_manager.py
@@ -252,6 +252,22 @@ def test_handle_spider_error_another_spider(self):
         self.assertEqual(len(self.crawl_manager.errors), 0)
 
 
+class TestLimitItems(TestCrawlManager):  # exercises limit_items
+
+    def test_max_items_not_set(self):
+        for _ in range(100):
+            self.crawl_manager.limit_items(self.spider)
+        self.assertFalse(self.crawler.engine.close_spider.called)  # never closed
+
+    def test_max_items_set(self):
+        limit = self.crawl_manager.max_items = 10
+        for _ in range(limit):
+            self.crawl_manager.limit_items(self.spider)
+        self.assertFalse(self.crawler.engine.close_spider.called)
+        self.crawl_manager.limit_items(self.spider)  # one call past the limit
+        self.assertTrue(self.crawler.engine.close_spider.called)
+
+
 class TestLimitRequests(TestCrawlManager):
 
     def test_max_requests_not_set(self):