
refactor!: Introduce new storage client system #1194


Open · wants to merge 26 commits into base: master

26 commits
f285707
refactor!: Introduce new storage client system
vdusek May 10, 2025
dd9be6e
Cleanup
vdusek May 10, 2025
89bfa5b
Address feedback
vdusek May 15, 2025
4050c75
Add purge_if_needed method and improve some typing based on Pylance
vdusek May 16, 2025
26f46e2
Address more feedback
vdusek May 20, 2025
c83a36a
RQ FS client improvements
vdusek Jun 4, 2025
c967fe5
Add caching to RQ FS client
vdusek Jun 5, 2025
7df046f
RQ FS performance optimization in add_requests
vdusek Jun 5, 2025
3555565
RQ FS performance issues in fetch_next_request
vdusek Jun 6, 2025
946d1e2
RQ FS fetch performance for is_empty
vdusek Jun 6, 2025
9f10b95
rm code duplication for open methods
vdusek Jun 6, 2025
0864ff8
Request loaders use async getters for handled/total req cnt
vdusek Jun 9, 2025
af0d129
Add missing_ok when removing files
vdusek Jun 9, 2025
9998a58
Improve is_empty
vdusek Jun 10, 2025
fdee111
Optimize RQ memory storage client
vdusek Jun 10, 2025
79cdfc0
Add upgrading guide and skip problematic test
vdusek Jun 11, 2025
3d2fd73
Merge branch 'master' into new-storage-clients
vdusek Jun 11, 2025
e818585
chore: update `docusaurus-plugin-typedoc-api`, fix failing docs build
barjin Jun 11, 2025
65db9ac
fix docs
vdusek Jun 11, 2025
2b786f7
add retries to atomic write
vdusek Jun 12, 2025
2cb04c5
chore(deps): update dependency pytest-cov to ~=6.2.0 (#1244)
renovate[bot] Jun 12, 2025
0c8c4ec
Fix atomic write on Windows
vdusek Jun 12, 2025
ce1eeb1
resolve write function during import time
vdusek Jun 14, 2025
4c05cee
Merge branch 'master' into new-storage-clients
vdusek Jun 14, 2025
8c80513
Update file utils
vdusek Jun 16, 2025
70bc071
revert unintentional makefile changes
vdusek Jun 16, 2025
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/cloud_run_example.py
@@ -5,24 +5,23 @@
import uvicorn
from litestar import Litestar, get

from crawlee import service_locator
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


@get('/')
async def main() -> str:
"""The crawler entry point that will be called when the HTTP endpoint is accessed."""
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = PlaywrightCrawler(
headless=True,
max_requests_per_crawl=10,
browser_type='firefox',
storage_client=storage_client,
)

@crawler.router.default_handler
15 changes: 7 additions & 8 deletions docs/deployment/code_examples/google/google_example.py
@@ -6,22 +6,21 @@
import functions_framework
from flask import Request, Response

from crawlee import service_locator
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end
from crawlee.storage_clients import MemoryStorageClient


async def main() -> str:
# highlight-start
# Disable writing storage data to the file system
storage_client = MemoryStorageClient()
# highlight-end

crawler = BeautifulSoupCrawler(
storage_client=storage_client,
max_request_retries=1,
request_handler_timeout=timedelta(seconds=30),
max_requests_per_crawl=10,
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a CSV file.
await crawler.export_data_csv(path='results.csv')
await crawler.export_data(path='results.csv')


if __name__ == '__main__':
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code_examples/parsel_crawler.py
@@ -40,7 +40,7 @@ async def some_hook(context: BasicCrawlingContext) -> None:
await crawler.run(['https://github.com'])

# Export the entire dataset to a JSON file.
await crawler.export_data_json(path='results.json')
await crawler.export_data(path='results.json')


if __name__ == '__main__':

This file was deleted.

2 changes: 1 addition & 1 deletion docs/guides/code_examples/storages/rq_basic_example.py
@@ -12,7 +12,7 @@ async def main() -> None:
await request_queue.add_request('https://apify.com/')

# Add multiple requests as a batch.
await request_queue.add_requests_batched(
await request_queue.add_requests(
['https://crawlee.dev/', 'https://crawlee.dev/python/']
)

@@ -10,9 +10,7 @@ async def main() -> None:
request_queue = await RequestQueue.open(name='my-request-queue')

# Interact with the request queue directly, e.g. add a batch of requests.
await request_queue.add_requests_batched(
['https://apify.com/', 'https://crawlee.dev/']
)
await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])

# Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
# list as request manager to it. It will be managed by the crawler.
4 changes: 2 additions & 2 deletions docs/guides/request_loaders.mdx
@@ -52,12 +52,12 @@ class BaseStorage {

class RequestLoader {
<<abstract>>
+ handled_count
+ total_count
+ fetch_next_request()
+ mark_request_as_handled()
+ is_empty()
+ is_finished()
+ get_handled_count()
+ get_total_count()
+ to_tandem()
}

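For context on this diagram change, a small sketch — assuming the `RequestList` loader from `crawlee.request_loaders` keeps its list-of-URLs constructor — showing that the counts are now awaited instead of read as properties:

```python
import asyncio

from crawlee.request_loaders import RequestList


async def main() -> None:
    request_list = RequestList(['https://crawlee.dev/', 'https://apify.com/'])

    # Previously `handled_count` / `total_count` properties; now async getters.
    print(await request_list.get_total_count())
    print(await request_list.get_handled_count())
    print(await request_list.is_empty())


if __name__ == '__main__':
    asyncio.run(main())
```
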
7 changes: 0 additions & 7 deletions docs/guides/storages.mdx
@@ -24,7 +24,6 @@ import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/stora
import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py';

import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py';
import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py';

Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, help you store and deduplicate URLs, while result-oriented storages, like <ApiLink to="class/Dataset">`Dataset`</ApiLink> and <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs.

@@ -210,12 +209,6 @@ Default storages are purged before the crawler starts, unless explicitly configu

If you do not explicitly interact with storages in your code, the purging will occur automatically when the <ApiLink to="class/BasicCrawler#run">`BasicCrawler.run`</ApiLink> method is invoked.

If you need to purge storages earlier, you can call <ApiLink to="class/MemoryStorageClient#purge_on_start">`MemoryStorageClient.purge_on_start`</ApiLink> directly if you are using the default storage client. This method triggers the purging process for the underlying storage implementation you are currently using.

<RunnableCodeBlock className="language-python" language="python">
{CleaningPurgeExplicitlyExample}
</RunnableCodeBlock>

## Conclusion

This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
123 changes: 123 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
@@ -0,0 +1,123 @@
---
id: upgrading-to-v1
title: Upgrading to v1
---

This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.

## Storage clients

In v1.0, we are introducing a new storage client system. The storage client interface has been completely
reworked, making it much simpler to write your own storage clients and to store your request queues,
key-value stores, and datasets in various destinations.

### New storage clients

Previously, the `MemoryStorageClient` handled both in-memory storage and file system persistence, depending
on configuration. In v1.0, we've split this into two dedicated classes:

- `MemoryStorageClient` - stores all data in memory only.
- `FileSystemStorageClient` - persists data on the file system, with in-memory caching for improved performance.

For details about the new interface, see the `BaseStorageClient` documentation. You can also check out
the [Storage clients guide](https://crawlee.dev/python/docs/guides/) for more information on available
storage clients and instructions on writing your own.

### Memory storage client

Before:

```python
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient

configuration = Configuration(persist_storage=False)
storage_client = MemoryStorageClient.from_config(configuration)
```

Now:

```python
from crawlee.storage_clients import MemoryStorageClient

storage_client = MemoryStorageClient()
```

### File-system storage client

Before:

```python
from crawlee.configuration import Configuration
from crawlee.storage_clients import MemoryStorageClient

configuration = Configuration(persist_storage=True)
storage_client = MemoryStorageClient.from_config(configuration)
```

Now:

```python
from crawlee.storage_clients import FileSystemStorageClient

storage_client = FileSystemStorageClient()
```

The way you register storage clients remains the same:

```python
from crawlee import service_locator
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import MemoryStorageClient

storage_client = MemoryStorageClient()

# Either via the service locator:
service_locator.set_storage_client(storage_client)

# Or provide it directly to the crawler:
crawler = ParselCrawler(storage_client=storage_client)
```

### Breaking changes
Collaborator: It'd be fair to mention that when you call, for example, `StorageClient.open_dataset_client()` or `DatasetClient.open()` twice with the same arguments, you may get different instances of `DatasetClient`. It's not a frequent use case by any means, but let's strive to maintain order.


The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
Persistence is now determined solely by the storage client class you use.

### Writing custom storage clients

The storage client interface has been fully reworked. Collection storage clients have been removed - now there is
one storage client class per storage type (`RequestQueue`, `KeyValueStore`, and `Dataset`). Writing your own storage
clients is now much simpler, allowing you to store your request queues, key-value stores, and datasets in any
destination you choose.
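
As a purely structural sketch of what that can look like — the class names below are hypothetical, `open_dataset_client` is borrowed from the review comment earlier on this page, and the real abstract base classes and signatures must be taken from the `BaseStorageClient` documentation:

```python
from __future__ import annotations

# Hypothetical example only: a storage client with one dedicated client class
# per storage type. It could then be registered via
# `service_locator.set_storage_client()` or passed to a crawler as
# `storage_client=`. Method names and signatures here are assumptions, not the
# actual Crawlee interface.


class MyDatasetClient:
    """Per-storage-type client; push items to any backend you like."""

    def __init__(self) -> None:
        self._items: list[dict] = []

    async def push_data(self, item: dict) -> None:
        self._items.append(item)  # e.g. replace with an INSERT into a database


class MyStorageClient:
    """Factory object handing out the per-storage-type clients."""

    async def open_dataset_client(self, *, name: str | None = None) -> MyDatasetClient:
        # Name borrowed from the review discussion; the real signature may differ.
        return MyDatasetClient()
```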

## Dataset

- There are two new methods:
- `purge`
- `list_items`
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
- The `write_to_json` and `write_to_csv` methods have been removed - use `export_to` instead.
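
A minimal sketch tying the list above together — it assumes the `crawlee.storages` import path and the existing `push_data` helper, and only exercises the methods named in this guide:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    # `from_storage_object` is gone: open by `name` (or `id`) instead.
    dataset = await Dataset.open(name='my-dataset')

    # `metadata` replaces the old `get_info` / `storage_object` accessors.
    print(dataset.metadata)

    await dataset.push_data({'url': 'https://crawlee.dev/'})

    # New in v1.0: list the stored items and purge the dataset.
    print(await dataset.list_items())
    await dataset.purge()


if __name__ == '__main__':
    asyncio.run(main())
```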

## Key-value store

- There are three new methods:
- `purge`
- `delete_value`
- `list_keys`
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
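
A similar hedged sketch for the key-value store, again assuming the `crawlee.storages` import path and the existing `set_value`/`get_value` helpers:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open by `name` (or `id`); `from_storage_object` has been removed.
    kvs = await KeyValueStore.open(name='my-key-value-store')

    # `metadata` replaces `get_info` / `storage_object`.
    print(kvs.metadata)

    await kvs.set_value('state', {'page': 1})
    print(await kvs.get_value('state'))

    # New in v1.0: enumerate keys, delete a single value, purge the store.
    print(await kvs.list_keys())
    await kvs.delete_value('state')
    await kvs.purge()


if __name__ == '__main__':
    asyncio.run(main())
```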

## Request queue

- There are two new methods:
- `purge`
- `add_requests` (renamed from `add_requests_batched`)
- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead.
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property.
- The `set_metadata` method has been removed.
- The `resource_directory` field has been removed from `RequestQueueMetadata` – use the `path_to_...` property instead.
- The `RequestQueueHead` model has been replaced with `RequestQueueHeadWithLocks`.
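
And the same kind of sketch for the request queue, reusing the `add_requests` call from the docs examples in this diff and the `fetch_next_request`/`mark_request_as_handled` methods from the request loader diagram; the rest is assumed:

```python
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    # Open by `name` (or `id`); `from_storage_object` has been removed.
    rq = await RequestQueue.open(name='my-request-queue')

    # `metadata` replaces `get_info` / `storage_object`.
    print(rq.metadata)

    # `add_requests` is the new name of `add_requests_batched`.
    await rq.add_requests(['https://crawlee.dev/', 'https://crawlee.dev/python/'])

    request = await rq.fetch_next_request()
    if request is not None:
        await rq.mark_request_as_handled(request)

    # New in v1.0: drop everything that is still in the queue.
    await rq.purge()


if __name__ == '__main__':
    asyncio.run(main())
```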
6 changes: 2 additions & 4 deletions pyproject.toml
@@ -144,9 +144,9 @@ ignore = [
"ISC001", # This rule may cause conflicts when used with the formatter
"FIX", # flake8-fixme
"PLR0911", # Too many return statements
"PLR0912", # Too many branches
"PLR0913", # Too many arguments in function definition
"PLR0915", # Too many statements
"PTH", # flake8-use-pathlib
"PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
"PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
"S102", # Use of `exec` detected
@@ -168,6 +168,7 @@ indent-style = "space"
"F401", # Unused imports
]
"**/{tests}/*" = [
"ASYNC230", # Async functions should not open files with blocking methods like `open`
"D", # Everything from the pydocstyle
"INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
@@ -205,9 +206,6 @@ builtins-ignorelist = ["id"]
[tool.ruff.lint.isort]
known-first-party = ["crawlee"]

[tool.ruff.lint.pylint]
max-branches = 18

[tool.pytest.ini_options]
addopts = "-ra"
asyncio_default_fixture_loop_scope = "function"
2 changes: 1 addition & 1 deletion src/crawlee/_cli.py
@@ -22,7 +22,7 @@
cli = typer.Typer(no_args_is_help=True)

template_directory = importlib.resources.files('crawlee') / 'project_template'
with open(str(template_directory / 'cookiecutter.json')) as f:
with (template_directory / 'cookiecutter.json').open() as f:
cookiecutter_json = json.load(f)

crawler_choices = cookiecutter_json['crawler_type']
1 change: 1 addition & 0 deletions src/crawlee/_consts.py
@@ -1,3 +1,4 @@
from __future__ import annotations

METADATA_FILENAME = '__metadata__.json'
"""The name of the metadata file for storage clients."""
39 changes: 22 additions & 17 deletions src/crawlee/_request.py
@@ -158,7 +158,23 @@ class Request(BaseModel):
```
"""

model_config = ConfigDict(populate_by_name=True)
model_config = ConfigDict(populate_by_name=True, extra='allow')
Collaborator: Why the extra='allow'?

Author: Because of the persistence of the __sequence and __forefront flags.

Collaborator: Wouldn't a subclass of the model be more robust?


id: str
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior
and specify which URLs shall be considered equal.
"""

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
@@ -207,22 +223,6 @@ class Request(BaseModel):
handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
"""Timestamp when the request was handled."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
"""

id: str
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

@classmethod
def from_url(
cls,
@@ -398,6 +398,11 @@ def forefront(self) -> bool:
def forefront(self, new_value: bool) -> None:
self.crawlee_data.forefront = new_value

@property
def was_already_handled(self) -> bool:
"""Indicates whether the request was handled."""
return self.handled_at is not None


class RequestWithLock(Request):
"""A crawling request with information about locks."""