diff --git a/setup.py b/setup.py
index 30c8b94..690b9de 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
         "scrapy-poet>=0.24.0",
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.24.0",
+        "text-unidecode>=1.3.0",
         "web-poet>=0.17.1",
         "xtractmime>=0.2.1",
         "zyte-common-items>=0.26.2",
diff --git a/tests/__init__.py b/tests/__init__.py
index df7b124..e57b600 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,3 +1,5 @@
+import contextlib
+import os
 from typing import Any, Dict, Optional, Type
 
 import pytest
@@ -20,3 +22,15 @@ def get_crawler(
     runner = CrawlerRunner(settings)
     crawler = runner.create_crawler(spider_cls)
     return crawler
+
+
+# https://stackoverflow.com/a/34333710
+@contextlib.contextmanager
+def set_env(**environ):
+    old_environ = dict(os.environ)
+    os.environ.update(environ)
+    try:
+        yield
+    finally:
+        os.environ.clear()
+        os.environ.update(old_environ)
diff --git a/tests/incremental/__init__.py b/tests/incremental/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/incremental/test_collection_fp_manager.py b/tests/incremental/test_collection_fp_manager.py
index c6cdcf2..75059f3 100644
--- a/tests/incremental/test_collection_fp_manager.py
+++ b/tests/incremental/test_collection_fp_manager.py
@@ -2,14 +2,20 @@
 from unittest.mock import MagicMock, patch
 
 import pytest
+from scrapy import Spider
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.request import RequestFingerprinter
+from scrapy.utils.test import get_crawler as _get_crawler
 from twisted.internet.defer import Deferred, inlineCallbacks
 
-from tests import get_crawler
-from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager
+from zyte_spider_templates._incremental.manager import (
+    CollectionsFingerprintsManager,
+    _get_collection_name,
+)
 from zyte_spider_templates.spiders.article import ArticleSpider
 
+from .. import get_crawler, set_env
+
 
 @pytest.fixture
 def mock_crawler():
@@ -207,3 +213,39 @@ def test_spider_closed(mock_scrapinghub_client):
     fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch)  # type: ignore
     fp_manager.spider_closed()
     fp_manager.save_batch.assert_called_once()
+
+
+@pytest.mark.parametrize(
+    ("env_vars", "settings", "spider_name", "collection_name"),
+    (
+        # INCREMENTAL_CRAWL_COLLECTION_NAME > SHUB_VIRTUAL_SPIDER > Spider.name
+        # INCREMENTAL_CRAWL_COLLECTION_NAME is used as is, others are
+        # slugified, length-limited, and get an "_incremental" suffix.
+        (
+            {},
+            {},
+            "a A-1.α" + "a" * 2048,
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+        ),
+        (
+            {"SHUB_VIRTUAL_SPIDER": "a A-1.α" + "a" * 2048},
+            {},
+            "foo",
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+        ),
+        (
+            {"SHUB_VIRTUAL_SPIDER": "bar"},
+            {"INCREMENTAL_CRAWL_COLLECTION_NAME": "a A-1.α" + "a" * 2048},
+            "foo",
+            "a A-1.α" + "a" * 2048,
+        ),
+    ),
+)
+def test_collection_name(env_vars, settings, spider_name, collection_name):
+    class TestSpider(Spider):
+        name = spider_name
+
+    crawler = _get_crawler(settings_dict=settings, spidercls=TestSpider)
+    crawler.spider = TestSpider()
+    with set_env(**env_vars):
+        assert _get_collection_name(crawler) == collection_name
diff --git a/tox.ini b/tox.ini
index 2a65da7..523441f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -34,6 +34,7 @@ deps =
     scrapy-poet==0.24.0
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.24.0
+    text-unidecode==1.3.0
     web-poet==0.17.1
     xtractmime==0.2.1
     zyte-common-items==0.26.2
diff --git a/zyte_spider_templates/_incremental/manager.py b/zyte_spider_templates/_incremental/manager.py
index 6e4ff28..b65d665 100644
--- a/zyte_spider_templates/_incremental/manager.py
+++ b/zyte_spider_templates/_incremental/manager.py
@@ -1,5 +1,6 @@
 import asyncio
 import logging
+import re
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Set, Tuple, Union
@@ -10,6 +11,7 @@
 from scrapy import signals
 from scrapy.crawler import Crawler
 from scrapy.http.request import Request
+from text_unidecode import unidecode
 from zyte_common_items import Item
 
 from zyte_spider_templates.utils import (
@@ -22,11 +24,22 @@
 logger = logging.getLogger(__name__)
 
 INCREMENTAL_SUFFIX = "_incremental"
+_MAX_LENGTH = 2048 - len(INCREMENTAL_SUFFIX)
 COLLECTION_API_URL = "https://storage.scrapinghub.com/collections"
 
 THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10)
 
 
+def _get_collection_name(crawler: Crawler) -> str:
+    if name := crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME"):
+        return name
+    # Slugify the spider name (transliterate it to ASCII, replace every
+    # character outside [a-zA-Z0-9_] with an underscore), drop trailing
+    # underscores, and cap the length so the suffixed name fits 2048 chars.
+    name = re.sub(r"[^a-zA-Z0-9_]", "_", unidecode(get_spider_name(crawler)))
+    return name.rstrip("_")[:_MAX_LENGTH] + INCREMENTAL_SUFFIX
+
+
 class CollectionsFingerprintsManager:
     def __init__(self, crawler: Crawler) -> None:
         self.writer = None
@@ -37,7 +50,7 @@ def __init__(self, crawler: Crawler) -> None:
         self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50)
 
         project_id = get_project_id(crawler)
-        collection_name = self.get_collection_name(crawler)
+        collection_name = _get_collection_name(crawler)
 
         self.init_collection(project_id, collection_name)
         self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}"
@@ -51,12 +64,6 @@ def __init__(self, crawler: Crawler) -> None:
 
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
 
-    def get_collection_name(self, crawler):
-        return (
-            crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME")
-            or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}"
-        )
-
     def init_collection(self, project_id, collection_name) -> None:
         client = get_client()
         collection = client.get_project(project_id).collections.get_store(
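
For reference, a minimal standalone sketch of the fallback naming behavior that test_collection_name exercises. The example_collection_name helper below is hypothetical (it mirrors _get_collection_name without needing a crawler) and assumes only the text-unidecode dependency added above:

    import re

    from text_unidecode import unidecode

    INCREMENTAL_SUFFIX = "_incremental"
    # Keep the final name, suffix included, within 2048 characters.
    _MAX_LENGTH = 2048 - len(INCREMENTAL_SUFFIX)


    def example_collection_name(spider_name: str) -> str:
        # Transliterate to ASCII, replace anything outside [a-zA-Z0-9_]
        # with underscores, strip trailing underscores, truncate, and
        # append the suffix.
        slug = re.sub(r"[^a-zA-Z0-9_]", "_", unidecode(spider_name))
        return slug.rstrip("_")[:_MAX_LENGTH] + INCREMENTAL_SUFFIX


    print(example_collection_name("a A-1.α"))        # a_A_1_a_incremental
    print(len(example_collection_name("a" * 3000)))  # 2048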