Skip to content

Commit 6a4cc28

Browse files
committed
Init of request queue and its clients
1 parent f499df7 commit 6a4cc28

23 files changed

+568
-477
lines changed

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,10 @@
77

88
if TYPE_CHECKING:
99
from collections.abc import AsyncIterator
10-
from datetime import datetime
1110
from pathlib import Path
1211
from typing import Any
1312

14-
from crawlee.storage_clients.models import DatasetItemsListPage
13+
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
1514

1615

1716
@docs_group('Abstract classes')
@@ -30,33 +29,8 @@ class DatasetClient(ABC):
3029

3130
@property
3231
@abstractmethod
33-
def id(self) -> str:
34-
"""The ID of the dataet, a unique identifier, typically a UUID or similar value."""
35-
36-
@property
37-
@abstractmethod
38-
def name(self) -> str | None:
39-
"""The optional human-readable name of the dataset."""
40-
41-
@property
42-
@abstractmethod
43-
def created_at(self) -> datetime:
44-
"""Timestamp when the dataset was first created, remains unchanged."""
45-
46-
@property
47-
@abstractmethod
48-
def accessed_at(self) -> datetime:
49-
"""Timestamp of last access to the dataset, updated on read or write operations."""
50-
51-
@property
52-
@abstractmethod
53-
def modified_at(self) -> datetime:
54-
"""Timestamp of last modification of the dataset, updated when new data are added."""
55-
56-
@property
57-
@abstractmethod
58-
def item_count(self) -> int:
59-
"""Total count of data items stored in the dataset."""
32+
def metadata(self) -> DatasetMetadata:
33+
"""The metadata of the dataset."""
6034

6135
@classmethod
6236
@abstractmethod

src/crawlee/storage_clients/_base/_key_value_store_client.py

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@
77

88
if TYPE_CHECKING:
99
from collections.abc import AsyncIterator
10-
from datetime import datetime
1110
from pathlib import Path
1211

13-
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
12+
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
1413

1514

1615
@docs_group('Abstract classes')
@@ -29,28 +28,8 @@ class KeyValueStoreClient(ABC):
2928

3029
@property
3130
@abstractmethod
32-
def id(self) -> str:
33-
"""The unique identifier of the key-value store (typically a UUID)."""
34-
35-
@property
36-
@abstractmethod
37-
def name(self) -> str | None:
38-
"""The optional human-readable name for the KVS."""
39-
40-
@property
41-
@abstractmethod
42-
def created_at(self) -> datetime:
43-
"""Timestamp when the KVS was first created, remains unchanged."""
44-
45-
@property
46-
@abstractmethod
47-
def accessed_at(self) -> datetime:
48-
"""Timestamp of last access to the KVS, updated on read or write operations."""
49-
50-
@property
51-
@abstractmethod
52-
def modified_at(self) -> datetime:
53-
"""Timestamp of last modification of the KVS, updated when new data are added, updated or deleted."""
31+
def metadata(self) -> KeyValueStoreMetadata:
32+
"""The metadata of the key-value store."""
5433

5534
@classmethod
5635
@abstractmethod

src/crawlee/storage_clients/_base/_request_queue_client.py

Lines changed: 25 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,21 @@
11
from __future__ import annotations
22

33
from abc import ABC, abstractmethod
4-
from datetime import datetime
54
from typing import TYPE_CHECKING
65

76
from crawlee._utils.docs import docs_group
87

98
if TYPE_CHECKING:
109
from collections.abc import Sequence
11-
from datetime import datetime
10+
from pathlib import Path
1211

1312
from crawlee.storage_clients.models import (
1413
BatchRequestsOperationResponse,
1514
ProcessedRequest,
1615
ProlongRequestLockResponse,
1716
Request,
18-
RequestQueueHead,
1917
RequestQueueHeadWithLocks,
18+
RequestQueueMetadata,
2019
)
2120

2221

@@ -30,58 +29,29 @@ class RequestQueueClient(ABC):
3029

3130
@property
3231
@abstractmethod
33-
def id(self) -> str:
34-
"""The ID of the dataset."""
32+
def metadata(self) -> RequestQueueMetadata:
33+
"""The metadata of the request queue."""
3534

36-
@property
37-
@abstractmethod
38-
def name(self) -> str | None:
39-
"""The name of the dataset."""
40-
41-
@property
42-
@abstractmethod
43-
def created_at(self) -> datetime:
44-
"""The time at which the dataset was created."""
45-
46-
@property
47-
@abstractmethod
48-
def accessed_at(self) -> datetime:
49-
"""The time at which the dataset was last accessed."""
50-
51-
@property
52-
@abstractmethod
53-
def modified_at(self) -> datetime:
54-
"""The time at which the dataset was last modified."""
55-
56-
@property
35+
@classmethod
5736
@abstractmethod
58-
def had_multiple_clients(self) -> bool:
59-
"""TODO."""
60-
61-
@property
62-
@abstractmethod
63-
def handled_request_count(self) -> int:
64-
"""TODO."""
65-
66-
@property
67-
@abstractmethod
68-
def pending_request_count(self) -> int:
69-
"""TODO."""
70-
71-
@property
72-
@abstractmethod
73-
def stats(self) -> dict:
74-
"""TODO."""
37+
async def open(
38+
cls,
39+
*,
40+
id: str | None = None,
41+
name: str | None = None,
42+
storage_dir: Path | None = None,
43+
) -> RequestQueueClient:
44+
"""Open a request queue client.
7545
76-
@property
77-
@abstractmethod
78-
def total_request_count(self) -> int:
79-
"""TODO."""
46+
Args:
47+
id: ID of the queue to open. If not provided, a new queue will be created with a random ID.
48+
name: Name of the queue to open. If not provided, the queue will be unnamed.
49+
purge_on_start: If True, the queue will be purged before opening.
50+
storage_dir: Directory to store the queue data in. If not provided, uses the default storage directory.
8051
81-
@property
82-
@abstractmethod
83-
def resource_directory(self) -> str:
84-
"""TODO."""
52+
Returns:
53+
A request queue client.
54+
"""
8555

8656
@abstractmethod
8757
async def drop(self) -> None:
@@ -90,17 +60,6 @@ async def drop(self) -> None:
9060
The backend method for the `RequestQueue.drop` call.
9161
"""
9262

93-
@abstractmethod
94-
async def list_head(self, *, limit: int | None = None) -> RequestQueueHead:
95-
"""Retrieve a given number of requests from the beginning of the queue.
96-
97-
Args:
98-
limit: How many requests to retrieve.
99-
100-
Returns:
101-
The desired number of requests from the beginning of the queue.
102-
"""
103-
10463
@abstractmethod
10564
async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks:
10665
"""Fetch and lock a specified number of requests from the start of the queue.
@@ -117,33 +76,16 @@ async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None)
11776
"""
11877

11978
@abstractmethod
120-
async def add_request(
121-
self,
122-
request: Request,
123-
*,
124-
forefront: bool = False,
125-
) -> ProcessedRequest:
126-
"""Add a request to the queue.
127-
128-
Args:
129-
request: The request to add to the queue.
130-
forefront: Whether to add the request to the head or the end of the queue.
131-
132-
Returns:
133-
Request queue operation information.
134-
"""
135-
136-
@abstractmethod
137-
async def batch_add_requests(
79+
async def add_requests_batch(
13880
self,
13981
requests: Sequence[Request],
14082
*,
14183
forefront: bool = False,
14284
) -> BatchRequestsOperationResponse:
143-
"""Add a batch of requests to the queue.
85+
"""Add a requests to the queue in batches.
14486
14587
Args:
146-
requests: The requests to add to the queue.
88+
requests: The batch of requests to add to the queue.
14789
forefront: Whether to add the requests to the head or the end of the queue.
14890
14991
Returns:
@@ -187,7 +129,7 @@ async def delete_request(self, request_id: str) -> None:
187129
"""
188130

189131
@abstractmethod
190-
async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse:
132+
async def delete_requests_batch(self, requests: list[Request]) -> BatchRequestsOperationResponse:
191133
"""Delete given requests from the queue.
192134
193135
Args:

src/crawlee/storage_clients/_base/_storage_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ async def open_dataset_client(
2323
purge_on_start: bool = True,
2424
storage_dir: Path | None = None,
2525
) -> DatasetClient:
26-
"""Open the dataset client."""
26+
"""Open a dataset client."""
2727

2828
@abstractmethod
2929
async def open_key_value_store_client(
@@ -34,7 +34,7 @@ async def open_key_value_store_client(
3434
purge_on_start: bool = True,
3535
storage_dir: Path | None = None,
3636
) -> KeyValueStoreClient:
37-
"""Open the key-value store client."""
37+
"""Open a key-value store client."""
3838

3939
@abstractmethod
4040
async def open_request_queue_client(
@@ -45,4 +45,4 @@ async def open_request_queue_client(
4545
purge_on_start: bool = True,
4646
storage_dir: Path | None = None,
4747
) -> RequestQueueClient:
48-
"""Open the request queue client."""
48+
"""Open a request queue client."""
Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
from ._dataset_client import FileSystemDatasetClient
2+
from ._key_value_store_client import FileSystemKeyValueStoreClient
3+
from ._request_queue_client import FileSystemRequestQueueClient
14
from ._storage_client import FileSystemStorageClient
25

3-
__all__ = ['FileSystemStorageClient']
6+
__all__ = [
7+
'FileSystemDatasetClient',
8+
'FileSystemKeyValueStoreClient',
9+
'FileSystemRequestQueueClient',
10+
'FileSystemStorageClient',
11+
]

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -76,38 +76,13 @@ def __init__(
7676

7777
@override
7878
@property
79-
def id(self) -> str:
80-
return self._metadata.id
81-
82-
@override
83-
@property
84-
def name(self) -> str:
85-
return self._metadata.name
86-
87-
@override
88-
@property
89-
def created_at(self) -> datetime:
90-
return self._metadata.created_at
91-
92-
@override
93-
@property
94-
def accessed_at(self) -> datetime:
95-
return self._metadata.accessed_at
96-
97-
@override
98-
@property
99-
def modified_at(self) -> datetime:
100-
return self._metadata.modified_at
101-
102-
@override
103-
@property
104-
def item_count(self) -> int:
105-
return self._metadata.item_count
79+
def metadata(self) -> DatasetMetadata:
80+
return self._metadata
10681

10782
@property
10883
def path_to_dataset(self) -> Path:
10984
"""The full path to the dataset directory."""
110-
return self._storage_dir / self._STORAGE_SUBDIR / self.name
85+
return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name
11186

11287
@property
11388
def path_to_metadata(self) -> Path:
@@ -170,12 +145,13 @@ async def open(
170145

171146
# Otherwise, create a new dataset client.
172147
else:
148+
now = datetime.now(timezone.utc)
173149
client = cls(
174150
id=crypto_random_object_id(),
175151
name=name,
176-
created_at=datetime.now(timezone.utc),
177-
accessed_at=datetime.now(timezone.utc),
178-
modified_at=datetime.now(timezone.utc),
152+
created_at=now,
153+
accessed_at=now,
154+
modified_at=now,
179155
item_count=0,
180156
storage_dir=storage_dir,
181157
)
@@ -194,12 +170,12 @@ async def drop(self) -> None:
194170
await asyncio.to_thread(shutil.rmtree, self.path_to_dataset)
195171

196172
# Remove the client from the cache.
197-
if self.name in self.__class__._cache_by_name: # noqa: SLF001
198-
del self.__class__._cache_by_name[self.name] # noqa: SLF001
173+
if self.metadata.name in self.__class__._cache_by_name: # noqa: SLF001
174+
del self.__class__._cache_by_name[self.metadata.name] # noqa: SLF001
199175

200176
@override
201177
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
202-
new_item_count = self.item_count
178+
new_item_count = self.metadata.item_count
203179

204180
# If data is a list, push each item individually.
205181
if isinstance(data, list):

0 commit comments

Comments
 (0)