
Commit be2232b

refactor!: Introduce new storage client system
1 parent bd4b940 commit be2232b

91 files changed (+7475, -6402 lines)
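At a high level, the commit replaces the old pattern of toggling persistence flags on the global Configuration with explicit storage client objects that are passed to crawlers (or registered as the process-wide default). A minimal sketch of the new usage, assuming the import paths and the storage_client= crawler argument shown in the diffs below:

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Keep all storage in memory instead of writing it to the file system
    # (previously done via Configuration.persist_storage / write_metadata).
    storage_client = MemoryStorageClient()

    crawler = BeautifulSoupCrawler(
        storage_client=storage_client,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())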


docs/deployment/code_examples/google/cloud_run_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,24 +5,23 @@
 import uvicorn
 from litestar import Litestar, get
 
-from crawlee import service_locator
 from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
-
-# highlight-start
-# Disable writing storage data to the file system
-configuration = service_locator.get_configuration()
-configuration.persist_storage = False
-configuration.write_metadata = False
-# highlight-end
+from crawlee.storage_clients import MemoryStorageClient
 
 
 @get('/')
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
+    # highlight-start
+    # Disable writing storage data to the file system
+    storage_client = MemoryStorageClient()
+    # highlight-end
+
     crawler = PlaywrightCrawler(
         headless=True,
         max_requests_per_crawl=10,
         browser_type='firefox',
+        storage_client=storage_client,
     )
 
     @crawler.router.default_handler

docs/deployment/code_examples/google/google_example.py

Lines changed: 7 additions & 8 deletions
@@ -6,22 +6,21 @@
 import functions_framework
 from flask import Request, Response
 
-from crawlee import service_locator
 from crawlee.crawlers import (
     BeautifulSoupCrawler,
     BeautifulSoupCrawlingContext,
 )
-
-# highlight-start
-# Disable writing storage data to the file system
-configuration = service_locator.get_configuration()
-configuration.persist_storage = False
-configuration.write_metadata = False
-# highlight-end
+from crawlee.storage_clients import MemoryStorageClient
 
 
 async def main() -> str:
+    # highlight-start
+    # Disable writing storage data to the file system
+    storage_client = MemoryStorageClient()
+    # highlight-end
+
     crawler = BeautifulSoupCrawler(
+        storage_client=storage_client,
         max_request_retries=1,
         request_handler_timeout=timedelta(seconds=30),
         max_requests_per_crawl=10,

docs/examples/code_examples/export_entire_dataset_to_file_csv.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])
 
     # Export the entire dataset to a CSV file.
-    await crawler.export_data_csv(path='results.csv')
+    await crawler.export_data(path='results.csv')
 
 
 if __name__ == '__main__':

docs/examples/code_examples/export_entire_dataset_to_file_json.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])
 
     # Export the entire dataset to a JSON file.
-    await crawler.export_data_json(path='results.json')
+    await crawler.export_data(path='results.json')
 
 
 if __name__ == '__main__':

docs/examples/code_examples/parsel_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ async def some_hook(context: BasicCrawlingContext) -> None:
     await crawler.run(['https://github.com'])
 
     # Export the entire dataset to a JSON file.
-    await crawler.export_data_json(path='results.json')
+    await crawler.export_data(path='results.json')
 
 
 if __name__ == '__main__':
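The three changes above fold the format-specific export_data_csv/export_data_json helpers into a single export_data call. A short sketch of the consolidated API, under the assumption that the output format is picked from the file extension:

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # One helper for both formats; the extension presumably selects CSV vs. JSON.
    await crawler.export_data(path='results.csv')
    await crawler.export_data(path='results.json')


if __name__ == '__main__':
    asyncio.run(main())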

docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

docs/guides/code_examples/storages/rq_basic_example.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ async def main() -> None:
     await request_queue.add_request('https://apify.com/')
 
     # Add multiple requests as a batch.
-    await request_queue.add_requests_batched(
+    await request_queue.add_requests(
         ['https://crawlee.dev/', 'https://crawlee.dev/python/']
     )
 

docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py

Lines changed: 1 addition & 3 deletions
@@ -10,9 +10,7 @@ async def main() -> None:
     request_queue = await RequestQueue.open(name='my-request-queue')
 
     # Interact with the request queue directly, e.g. add a batch of requests.
-    await request_queue.add_requests_batched(
-        ['https://apify.com/', 'https://crawlee.dev/']
-    )
+    await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])
 
     # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
     # list as request manager to it. It will be managed by the crawler.
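Both request-queue examples rename add_requests_batched to add_requests. A rough end-to-end sketch of the renamed call, assuming the request_manager= argument is how an explicitly opened queue is handed to a crawler (as the trailing comment above suggests):

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open (or create) a named queue and seed it with a batch of requests.
    request_queue = await RequestQueue.open(name='my-request-queue')
    await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/'])

    # Hand the queue over to the crawler, which manages it from here on.
    crawler = HttpCrawler(request_manager=request_queue)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())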

docs/guides/request_loaders.mdx

Lines changed: 2 additions & 2 deletions
@@ -52,12 +52,12 @@ class BaseStorage {
 
 class RequestLoader {
     <<abstract>>
+    + handled_count
+    + total_count
     + fetch_next_request()
     + mark_request_as_handled()
     + is_empty()
     + is_finished()
-    + get_handled_count()
-    + get_total_count()
     + to_tandem()
 }
 
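In the class diagram, get_handled_count() and get_total_count() give way to handled_count and total_count, listed as plain members rather than methods. Read that way, consumer code might inspect progress like this (the exact property types are an assumption here):

from crawlee.storages import RequestQueue


async def report_progress(queue: RequestQueue) -> None:
    # Previously: await queue.get_handled_count() / await queue.get_total_count().
    # The updated diagram lists these as attributes on RequestLoader implementations.
    print(f'{queue.handled_count} of {queue.total_count} requests handled')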

docs/guides/storages.mdx

Lines changed: 0 additions & 7 deletions
@@ -24,7 +24,6 @@ import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/stora
 import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py';
 
 import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py';
-import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py';
 
 Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, help you store and deduplicate URLs, while result-oriented storages, like <ApiLink to="class/Dataset">`Dataset`</ApiLink> and <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs.
 
@@ -210,12 +209,6 @@ Default storages are purged before the crawler starts, unless explicitly configu
 
 If you do not explicitly interact with storages in your code, the purging will occur automatically when the <ApiLink to="class/BasicCrawler#run">`BasicCrawler.run`</ApiLink> method is invoked.
 
-If you need to purge storages earlier, you can call <ApiLink to="class/MemoryStorageClient#purge_on_start">`MemoryStorageClient.purge_on_start`</ApiLink> directly if you are using the default storage client. This method triggers the purging process for the underlying storage implementation you are currently using.
-
-<RunnableCodeBlock className="language-python" language="python">
-    {CleaningPurgeExplicitlyExample}
-</RunnableCodeBlock>
-
 ## Conclusion
 
 This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
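With the explicit-purge section gone, the do-not-purge example imported above remains the documented way to influence purging. A sketch of that approach, assuming a purge_on_start field on Configuration and a configuration= crawler argument:

import asyncio

from crawlee.configuration import Configuration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # Keep data from previous runs instead of purging default storages on start.
    configuration = Configuration(purge_on_start=False)

    crawler = HttpCrawler(configuration=configuration)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())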

pyproject.toml

Lines changed: 3 additions & 5 deletions
@@ -94,7 +94,7 @@ crawlee = "crawlee._cli:cli"
 
 [dependency-groups]
 dev = [
-    "apify_client", # For e2e tests.
+    "apify-client", # For e2e tests.
     "build~=1.2.2", # For e2e tests.
     "mypy~=1.15.0",
     "pre-commit~=4.2.0",
@@ -143,9 +143,9 @@ ignore = [
     "ISC001", # This rule may cause conflicts when used with the formatter
     "FIX", # flake8-fixme
     "PLR0911", # Too many return statements
+    "PLR0912", # Too many branches
     "PLR0913", # Too many arguments in function definition
     "PLR0915", # Too many statements
-    "PTH", # flake8-use-pathlib
     "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime
     "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None`
     "S102", # Use of `exec` detected
@@ -167,6 +167,7 @@ indent-style = "space"
     "F401", # Unused imports
 ]
 "**/{tests}/*" = [
+    "ASYNC230", # Async functions should not open files with blocking methods like `open`
     "D", # Everything from the pydocstyle
     "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py
     "PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
@@ -204,9 +205,6 @@ builtins-ignorelist = ["id"]
 [tool.ruff.lint.isort]
 known-first-party = ["crawlee"]
 
-[tool.ruff.lint.pylint]
-max-branches = 18
-
 [tool.pytest.ini_options]
 addopts = "-ra"
 asyncio_default_fixture_loop_scope = "function"

src/crawlee/_autoscaling/autoscaled_pool.py

Lines changed: 2 additions & 1 deletion
@@ -142,7 +142,8 @@ async def run(self) -> None:
 
             logger.info('Waiting for remaining tasks to finish')
 
-            for task in run.worker_tasks:
+            tasks_to_wait = list(run.worker_tasks)
+            for task in tasks_to_wait:
                 if not task.done():
                     with suppress(BaseException):
                         await task
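The shutdown loop now snapshots run.worker_tasks into a list before awaiting, presumably so the iteration is unaffected if the underlying collection is mutated while control is yielded on await. The same pattern in isolation:

import asyncio
from contextlib import suppress


async def wait_for_remaining(worker_tasks: list[asyncio.Task]) -> None:
    # Copy first: each `await` yields control, and other coroutines may add or
    # remove entries in the original collection in the meantime.
    tasks_to_wait = list(worker_tasks)
    for task in tasks_to_wait:
        if not task.done():
            with suppress(BaseException):
                await task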

src/crawlee/_cli.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 cli = typer.Typer(no_args_is_help=True)
 
 template_directory = importlib.resources.files('crawlee') / 'project_template'
-with open(str(template_directory / 'cookiecutter.json')) as f:
+with (template_directory / 'cookiecutter.json').open() as f:
     cookiecutter_json = json.load(f)
 
 crawler_choices = cookiecutter_json['crawler_type']

src/crawlee/_service_locator.py

Lines changed: 3 additions & 11 deletions
@@ -3,8 +3,8 @@
 from crawlee._utils.docs import docs_group
 from crawlee.configuration import Configuration
 from crawlee.errors import ServiceConflictError
-from crawlee.events import EventManager
-from crawlee.storage_clients import StorageClient
+from crawlee.events import EventManager, LocalEventManager
+from crawlee.storage_clients import FileSystemStorageClient, StorageClient
 
 
 @docs_group('Classes')
@@ -49,8 +49,6 @@ def set_configuration(self, configuration: Configuration) -> None:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            from crawlee.events import LocalEventManager
-
             self._event_manager = (
                 LocalEventManager().from_config(config=self._configuration)
                 if self._configuration
@@ -77,13 +75,7 @@ def set_event_manager(self, event_manager: EventManager) -> None:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            from crawlee.storage_clients import MemoryStorageClient
-
-            self._storage_client = (
-                MemoryStorageClient.from_config(config=self._configuration)
-                if self._configuration
-                else MemoryStorageClient.from_config()
-            )
+            self._storage_client = FileSystemStorageClient()
 
         self._storage_client_was_retrieved = True
         return self._storage_client
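This makes FileSystemStorageClient the lazily created default whenever no storage client has been registered, and drops the deferred imports. To use a different default process-wide, the locator's setter counterpart could be called before any storage is opened (set_storage_client is assumed here by symmetry with get_storage_client and the setters visible in the hunk context):

from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient

# Register an in-memory client as the global default before the first
# get_storage_client() call; changing it after retrieval would presumably
# raise the ServiceConflictError imported at the top of the module.
service_locator.set_storage_client(MemoryStorageClient())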

src/crawlee/_types.py

Lines changed: 4 additions & 8 deletions
@@ -23,7 +23,7 @@
     from crawlee.sessions import Session
     from crawlee.storage_clients.models import DatasetItemsListPage
     from crawlee.storages import KeyValueStore
-    from crawlee.storages._dataset import ExportToKwargs, GetDataKwargs
+    from crawlee.storages._types import ExportToKwargs, GetDataKwargs
 
 # Workaround for https://github.com/pydantic/pydantic/issues/9445
 J = TypeVar('J', bound='JsonSerializable')
@@ -190,7 +190,7 @@
 
 
 class PushDataFunctionCall(PushDataKwargs):
-    data: JsonSerializable
+    data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
 
@@ -271,16 +271,12 @@ async def add_requests(
 
     async def push_data(
         self,
-        data: JsonSerializable,
+        data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
-        from crawlee.storages._dataset import Dataset
-
-        await Dataset.check_and_serialize(data)
-
         self.push_data_calls.append(
             PushDataFunctionCall(
                 data=data,
@@ -520,7 +516,7 @@ class PushDataFunction(Protocol):
 
     def __call__(
         self,
-        data: JsonSerializable,
+        data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
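Alongside the import path change, the push_data helpers narrow data from the broad JsonSerializable alias to a list of dicts or a single dict, and the eager Dataset.check_and_serialize call is dropped from the tracking helper. Under that signature, handler code passes dicts directly; a minimal sketch:

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def handler(context: HttpCrawlingContext) -> None:
    # A single dict or a list of dicts matches the annotated type directly.
    await context.push_data({'url': context.request.url})
    await context.push_data([{'label': 'a'}, {'label': 'b'}])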
