
refactor!: Introduce new storage client system #1194


Open · wants to merge 26 commits into base: master

26 commits
f285707
refactor!: Introduce new storage client system
vdusek May 10, 2025
dd9be6e
Cleanup
vdusek May 10, 2025
89bfa5b
Address feedback
vdusek May 15, 2025
4050c75
Add purge_if_needed method and improve some typing based on Pylance
vdusek May 16, 2025
26f46e2
Address more feedback
vdusek May 20, 2025
c83a36a
RQ FS client improvements
vdusek Jun 4, 2025
c967fe5
Add caching to RQ FS client
vdusek Jun 5, 2025
7df046f
RQ FS performance optimization in add_requests
vdusek Jun 5, 2025
3555565
RQ FS performance issues in fetch_next_request
vdusek Jun 6, 2025
946d1e2
RQ FS fetch performance for is_empty
vdusek Jun 6, 2025
9f10b95
rm code duplication for open methods
vdusek Jun 6, 2025
0864ff8
Request loaders use async getters for handled/total req cnt
vdusek Jun 9, 2025
af0d129
Add missing_ok when removing files
vdusek Jun 9, 2025
9998a58
Improve is_empty
vdusek Jun 10, 2025
fdee111
Optimize RQ memory storage client
vdusek Jun 10, 2025
79cdfc0
Add upgrading guide and skip problematic test
vdusek Jun 11, 2025
3d2fd73
Merge branch 'master' into new-storage-clients
vdusek Jun 11, 2025
e818585
chore: update `docusaurus-plugin-typedoc-api`, fix failing docs build
barjin Jun 11, 2025
65db9ac
fix docs
vdusek Jun 11, 2025
2b786f7
add retries to atomic write
vdusek Jun 12, 2025
2cb04c5
chore(deps): update dependency pytest-cov to ~=6.2.0 (#1244)
renovate[bot] Jun 12, 2025
0c8c4ec
Fix atomic write on Windows
vdusek Jun 12, 2025
ce1eeb1
resolve write function during import time
vdusek Jun 14, 2025
4c05cee
Merge branch 'master' into new-storage-clients
vdusek Jun 14, 2025
8c80513
Update file utils
vdusek Jun 16, 2025
70bc071
revert unintentional makefile changes
vdusek Jun 16, 2025
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/cloud_run_example.py
@@ -5,24 +5,23 @@
import uvicorn
from litestar import Litestar, get

from crawlee import service_locator
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


@get('/')
async def main() -> str:
"""The crawler entry point that will be called when the HTTP endpoint is accessed."""
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = PlaywrightCrawler(
headless=True,
max_requests_per_crawl=10,
browser_type='firefox',
storage_client=storage_client,
)

@crawler.router.default_handler
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/google_example.py
@@ -6,22 +6,21 @@
import functions_framework
from flask import Request, Response

from crawlee import service_locator
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


async def main() -> str:
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = BeautifulSoupCrawler(
storage_client=storage_client,
max_request_retries=1,
request_handler_timeout=timedelta(seconds=30),
max_requests_per_crawl=10,
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a CSV file.
await crawler.export_data_csv(path='results.csv')
await crawler.export_data(path='results.csv')


if __name__ == '__main__':
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code_examples/parsel_crawler.py
@@ -40,7 +40,7 @@ async def some_hook(context: BasicCrawlingContext) -> None:
await crawler.run(['https://github.com'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':

This file was deleted.

2 changes: 1 addition & 1 deletion docs/guides/code_examples/storages/rq_basic_example.py
@@ -12,7 +12,7 @@ async def main() -> None:
await request_queue.add_request('https://apify.com/')

# Add multiple requests as a batch.
await request_queue.add_requests_batched(
await request_queue.add_requests(
['https://crawlee.dev/', 'https://crawlee.dev/python/']
)

@@ -10,9 +10,7 @@ async def main() -> None:
request_queue = await RequestQueue.open(name='my-request-queue')

# Interact with the request queue directly, e.g. add a batch of requests.
await request_queue.add_requests_batched(
['https://apify.com/', 'https://crawlee.dev/']
)
await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])

# Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
# list as request manager to it. It will be managed by the crawler.
4 changes: 2 additions & 2 deletions docs/guides/request_loaders.mdx
@@ -52,12 +52,12 @@ class BaseStorage {

class RequestLoader {
<<abstract>>
+ handled_count
+ total_count
+ fetch_next_request()
+ mark_request_as_handled()
+ is_empty()
+ is_finished()
+ get_handled_count()
+ get_total_count()
+ to_tandem()
}

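For context on this diagram change, a small sketch — assuming the `RequestList` loader from `crawlee.request_loaders` keeps its list-of-URLs constructor — showing that the counts are now awaited instead of read as properties:

```python
import asyncio

from crawlee.request_loaders import RequestList


async def main() -> None:
    request_list = RequestList(['https://crawlee.dev/', 'https://apify.com/'])

    # Previously `handled_count` / `total_count` properties; now async getters.
    print(await request_list.get_total_count())
    print(await request_list.get_handled_count())
    print(await request_list.is_empty())


if __name__ == '__main__':
    asyncio.run(main())
```
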
7 changes: 0 additions & 7 deletions docs/guides/storages.mdx
@@ -24,7 +24,6 @@ import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/stora
import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py';

import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py';
import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py';

Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, help you store and deduplicate URLs, while result-oriented storages, like <ApiLink to="class/Dataset">`Dataset`</ApiLink> and <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs.

@@ -210,12 +209,6 @@ Default storages are purged before the crawler starts, unless explicitly configu

If you do not explicitly interact with storages in your code, the purging will occur automatically when the <ApiLink to="class/BasicCrawler#run">`BasicCrawler.run`</ApiLink> method is invoked.

If you need to purge storages earlier, you can call <ApiLink to="class/MemoryStorageClient#purge_on_start">`MemoryStorageClient.purge_on_start`</ApiLink> directly if you are using the default storage client. This method triggers the purging process for the underlying storage implementation you are currently using.

<RunnableCodeBlock className="language-python" language="python">
{CleaningPurgeExplicitlyExample}
</RunnableCodeBlock>

## Conclusion

This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
123 changes: 123 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
@@ -0,0 +1,123 @@
---
id: upgrading-to-v1
title: Upgrading to v1
---

This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.

## Storage clients

In v1.0, we are introducing a new storage client system. The storage client interface has been completely
reworked, making it much simpler to write your own storage clients and to store your request queues,
key-value stores, and datasets in various destinations.

### New storage clients

Previously, the `MemoryStorageClient` handled both in-memory storage and file system persistence, depending
on configuration. In v1.0, we've split this into two dedicated classes:

- `MemoryStorageClient` - stores all data in memory only.
- `FileSystemStorageClient` - persists data on the file system, with in-memory caching for improved performance.

For details about the new interface, see the `BaseStorageClient` documentation. You can also check out
the [Storage clients guide](https://crawlee.dev/python/docs/guides/) for more information on available
storage clients and instructions on writing your own.

### Memory storage client

Before:

```python
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient

configuration = Configuration(persist_storage=False)
storage_client = MemoryStorageClient.from_config(configuration)
```

Now:

```python
from crawlee.storage_clients import MemoryStorageClient

storage_client = MemoryStorageClient()
```

### File-system storage client

Before:

```python
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient

configuration = Configuration(persist_storage=True)
storage_client = MemoryStorageClient.from_config(configuration)
```

Now:

```python
from crawlee.storage_clients import FileSystemStorageClient

storage_client = FileSystemStorageClient()
```

The way you register storage clients remains the same:

```python
from crawlee import service_locator
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient

storage_client = MemoryStorageClient()

# Either via the service locator:
service_locator.set_storage_client(storage_client)

# Or provide it directly to the crawler:
crawler = ParselCrawler(storage_client=storage_client)
```

### Breaking changes
Collaborator: It'd be fair to mention that when you call, for example, `StorageClient.open_dataset_client()` or `DatasetClient.open()` twice with the same arguments, you may get different instances of `DatasetClient`. It's not a frequent use case by any means, but let's strive to maintain order.


The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
Persistence is now determined solely by the storage client class you use.

### Writing custom storage clients

The storage client interface has been fully reworked. Collection storage clients have been removed - now there is
one storage client class per storage type (`RequestQueue`, `KeyValueStore`, and `Dataset`). Writing your own storage
clients is now much simpler, allowing you to store your request queues, key-value stores, and datasets in any
destination you choose.
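
As a purely structural sketch of what that can look like — the class names below are hypothetical, `open_dataset_client` is borrowed from the review comment earlier on this page, and the real abstract base classes and signatures must be taken from the `BaseStorageClient` documentation:

```python
from __future__ import annotations

# Hypothetical example only: a storage client with one dedicated client class
# per storage type. It could then be registered via
# `service_locator.set_storage_client()` or passed to a crawler as
# `storage_client=`. Method names and signatures here are assumptions, not the
# actual Crawlee interface.


class MyDatasetClient:
    """Per-storage-type client; push items to any backend you like."""

    def __init__(self) -> None:
        self._items: list[dict] = []

    async def push_data(self, item: dict) -> None:
        self._items.append(item)  # e.g. replace with an INSERT into a database


class MyStorageClient:
    """Factory object handing out the per-storage-type clients."""

    async def open_dataset_client(self, *, name: str | None = None) -> MyDatasetClient:
        # Name borrowed from the review discussion; the real signature may differ.
        return MyDatasetClient()
```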

## Dataset

- There are two new methods:
- `purge`
- `list_items`
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
- The `write_to_json` and `write_to_csv` methods have been removed - use `export_to` instead.
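
A minimal sketch tying the list above together — it assumes the `crawlee.storages` import path and the existing `push_data` helper, and only exercises the methods named in this guide:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    # `from_storage_object` is gone: open by `name` (or `id`) instead.
    dataset = await Dataset.open(name='my-dataset')

    # `metadata` replaces the old `get_info` / `storage_object` accessors.
    print(dataset.metadata)

    await dataset.push_data({'url': 'https://crawlee.dev/'})

    # New in v1.0: list the stored items and purge the dataset.
    print(await dataset.list_items())
    await dataset.purge()


if __name__ == '__main__':
    asyncio.run(main())
```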

## Key-value store

- There are three new methods:
- `purge`
- `delete_value`
- `list_keys`
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
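
A similar hedged sketch for the key-value store, again assuming the `crawlee.storages` import path and the existing `set_value`/`get_value` helpers:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open by `name` (or `id`); `from_storage_object` has been removed.
    kvs = await KeyValueStore.open(name='my-key-value-store')

    # `metadata` replaces `get_info` / `storage_object`.
    print(kvs.metadata)

    await kvs.set_value('state', {'page': 1})
    print(await kvs.get_value('state'))

    # New in v1.0: enumerate keys, delete a single value, purge the store.
    print(await kvs.list_keys())
    await kvs.delete_value('state')
    await kvs.purge()


if __name__ == '__main__':
    asyncio.run(main())
```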

## Request queue

- There are two new methods:
- `purge`
- `add_requests` (renamed from `add_requests_batched`)
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
- The `resource_directory` field has been removed from `RequestQueueMetadata` – use the `path_to_...` property instead.
- The `RequestQueueHead` model has been replaced with `RequestQueueHeadWithLocks`.
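
And the same kind of sketch for the request queue, reusing the `add_requests` call from the docs examples in this diff and the `fetch_next_request`/`mark_request_as_handled` methods from the request loader diagram; the rest is assumed:

```python
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    # Open by `name` (or `id`); `from_storage_object` has been removed.
    rq = await RequestQueue.open(name='my-request-queue')

    # `metadata` replaces `get_info` / `storage_object`.
    print(rq.metadata)

    # `add_requests` is the new name of `add_requests_batched`.
    await rq.add_requests(['https://crawlee.dev/', 'https://crawlee.dev/python/'])

    request = await rq.fetch_next_request()
    if request is not None:
        await rq.mark_request_as_handled(request)

    # New in v1.0: drop everything that is still in the queue.
    await rq.purge()


if __name__ == '__main__':
    asyncio.run(main())
```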
6 changes: 2 additions & 4 deletions pyproject.toml
@@ -144,9 +144,9 @@ ignore = [
"ISC001", # This rule may cause conflicts when used with the formatter
"FIX", # flake8-fixme
"PLR0911", # Too many return statements
"PLR0912", # Too many branches
"PLR0913", # Too many arguments in function definition
"PLR0915", # Too many statements
"PTH", # flake8-use-pathlib
"PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
"PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
"S102", # Use of `exec` detected
@@ -168,6 +168,7 @@ indent-style = "space"
"F401", # Unused imports
]
"**/{tests}/*" = [
"ASYNC230", # Async functions should not open files with blocking methods like `open`
"D", # Everything from the pydocstyle
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
@@ -205,9 +206,6 @@ builtins-ignorelist = ["id"]
[tool.ruff.lint.isort]
known-first-party = ["crawlee"]

[tool.ruff.lint.pylint]
max-branches = 18

[tool.pytest.ini_options]
addopts = "-ra"
asyncio_default_fixture_loop_scope = "function"
2 changes: 1 addition & 1 deletion src/crawlee/_cli.py
@@ -22,7 +22,7 @@
cli = typer.Typer(no_args_is_help=True)

template_directory = importlib.resources.files('crawlee') / 'project_template'
with open(str(template_directory / 'cookiecutter.json')) as f:
with (template_directory / 'cookiecutter.json').open() as f:
cookiecutter_json = json.load(f)

crawler_choices = cookiecutter_json['crawler_type']
1 change: 1 addition & 0 deletions src/crawlee/_consts.py
@@ -1,3 +1,4 @@
from __future__ import annotations

METADATA_FILENAME = '__metadata__.json'
"""The name of the metadata file for storage clients."""
39 changes: 22 additions & 17 deletions src/crawlee/_request.py
@@ -158,7 +158,23 @@ class Request(BaseModel):
```
"""

model_config = ConfigDict(populate_by_name=True)
model_config = ConfigDict(populate_by_name=True, extra='allow')
Collaborator: Why the extra='allow'?

Author: Because of the persistence of the __sequence and __forefront flags.

Collaborator: Wouldn't a subclass of the model be more robust?


id: str
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior
and specify which URLs shall be considered equal.
"""

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
@@ -207,22 +223,6 @@ class Request(BaseModel):
handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
"""Timestamp when the request was handled."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
"""

id: str
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

@classmethod
def from_url(
cls,
@@ -398,6 +398,11 @@ def forefront(self) -> bool:
def forefront(self, new_value: bool) -> None:
self.crawlee_data.forefront = new_value

@property
def was_already_handled(self) -> bool:
"""Indicates whether the request was handled."""
return self.handled_at is not None


class RequestWithLock(Request):
"""A crawling request with information about locks."""