Skip to content

Commit 761ed16

Browse files
committed
Utilize pathlib and use Config in constructors
1 parent 6a4cc28 commit 761ed16

26 files changed

+437
-471
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -660,14 +660,12 @@ async def _use_state(
660660
self,
661661
default_value: dict[str, JsonSerializable] | None = None,
662662
) -> dict[str, JsonSerializable]:
663-
kvs = await self.get_key_value_store()
664-
# TODO:
665-
# return some kvs value
663+
# TODO: implement
664+
return {}
666665

667666
async def _save_crawler_state(self) -> None:
668-
kvs = await self.get_key_value_store()
669-
# TODO:
670-
# some kvs call
667+
pass
668+
# TODO: implement
671669

672670
async def get_data(
673671
self,
@@ -697,16 +695,16 @@ async def export_data(
697695
dataset_id: str | None = None,
698696
dataset_name: str | None = None,
699697
) -> None:
700-
"""Export data from a `Dataset`.
698+
"""Export all items from a Dataset to a JSON or CSV file.
701699
702-
This helper method simplifies the process of exporting data from a `Dataset`. It opens the specified
703-
one and then exports the data based on the provided parameters. If you need to pass options
704-
specific to the output format, use the `export_data_csv` or `export_data_json` method instead.
700+
This method simplifies the process of exporting data collected during crawling. It automatically
701+
determines the export format based on the file extension (`.json` or `.csv`) and handles
702+
the conversion of `Dataset` items to the appropriate format.
705703
706704
Args:
707-
path: The destination path.
708-
dataset_id: The ID of the `Dataset`.
709-
dataset_name: The name of the `Dataset`.
705+
path: The destination file path. Must end with '.json' or '.csv'.
706+
dataset_id: The ID of the Dataset to export from. If None, the `dataset_name` parameter is used instead.
707+
dataset_name: The name of the Dataset to export from. If None, the `dataset_id` parameter is used instead.
710708
"""
711709
dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
712710

src/crawlee/project_template/hooks/post_gen_project.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import subprocess
33
from pathlib import Path
44

5-
65
# % if cookiecutter.package_manager in ['poetry', 'uv']
76
Path('requirements.txt').unlink()
87

@@ -32,8 +31,9 @@
3231

3332
# Install requirements and generate requirements.txt as an impromptu lockfile
3433
subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
35-
with open('requirements.txt', 'w') as requirements_txt:
36-
subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)
34+
Path('requirements.txt').write_text(
35+
subprocess.check_output([str(path / 'pip'), 'freeze']).decode()
36+
)
3737

3838
# % if cookiecutter.crawler_type == 'playwright'
3939
subprocess.check_call([str(path / 'playwright'), 'install'])

src/crawlee/storage_clients/_base/_dataset_client.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
if TYPE_CHECKING:
99
from collections.abc import AsyncIterator
10-
from pathlib import Path
1110
from typing import Any
1211

12+
from crawlee.configuration import Configuration
1313
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
1414

1515

@@ -37,9 +37,9 @@ def metadata(self) -> DatasetMetadata:
3737
async def open(
3838
cls,
3939
*,
40-
id: str | None = None,
41-
name: str | None = None,
42-
storage_dir: Path | None = None,
40+
id: str | None,
41+
name: str | None,
42+
configuration: Configuration,
4343
) -> DatasetClient:
4444
"""Open existing or create a new dataset client.
4545
@@ -51,8 +51,7 @@ async def open(
5151
Args:
5252
id: The ID of the dataset. If not provided, an ID may be generated.
5353
name: The name of the dataset. If not provided a default name may be used.
54-
storage_dir: The path to the storage directory. If the client persists data,
55-
it should use this directory. May be ignored by non-persistent implementations.
54+
configuration: The configuration object.
5655
5756
Returns:
5857
A dataset client instance.

src/crawlee/storage_clients/_base/_key_value_store_client.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
if TYPE_CHECKING:
99
from collections.abc import AsyncIterator
10-
from pathlib import Path
1110

11+
from crawlee.configuration import Configuration
1212
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
1313

1414

@@ -36,9 +36,9 @@ def metadata(self) -> KeyValueStoreMetadata:
3636
async def open(
3737
cls,
3838
*,
39-
id: str | None = None,
40-
name: str | None = None,
41-
storage_dir: Path | None = None,
39+
id: str | None,
40+
name: str | None,
41+
configuration: Configuration,
4242
) -> KeyValueStoreClient:
4343
"""Open existing or create a new key-value store client.
4444
@@ -51,8 +51,7 @@ async def open(
5151
Args:
5252
id: The ID of the key-value store. If not provided, an ID may be generated.
5353
name: The name of the key-value store. If not provided a default name may be used.
54-
storage_dir: The path to the storage directory. If the client persists data,
55-
it should use this directory. May be ignored by non-persistent implementations.
54+
configuration: The configuration object.
5655
5756
Returns:
5857
A key-value store client instance.

src/crawlee/storage_clients/_base/_request_queue_client.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
if TYPE_CHECKING:
99
from collections.abc import Sequence
10-
from pathlib import Path
1110

11+
from crawlee.configuration import Configuration
1212
from crawlee.storage_clients.models import (
1313
BatchRequestsOperationResponse,
1414
ProcessedRequest,
@@ -37,17 +37,16 @@ def metadata(self) -> RequestQueueMetadata:
3737
async def open(
3838
cls,
3939
*,
40-
id: str | None = None,
41-
name: str | None = None,
42-
storage_dir: Path | None = None,
40+
id: str | None,
41+
name: str | None,
42+
configuration: Configuration,
4343
) -> RequestQueueClient:
4444
"""Open a request queue client.
4545
4646
Args:
4747
id: ID of the queue to open. If not provided, a new queue will be created with a random ID.
4848
name: Name of the queue to open. If not provided, the queue will be unnamed.
49-
purge_on_start: If True, the queue will be purged before opening.
50-
storage_dir: Directory to store the queue data in. If not provided, uses the default storage directory.
49+
configuration: The configuration object.
5150
5251
Returns:
5352
A request queue client.

src/crawlee/storage_clients/_base/_storage_client.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7-
from pathlib import Path
7+
from crawlee.configuration import Configuration
88

99
from ._dataset_client import DatasetClient
1010
from ._key_value_store_client import KeyValueStoreClient
@@ -20,8 +20,7 @@ async def open_dataset_client(
2020
*,
2121
id: str | None = None,
2222
name: str | None = None,
23-
purge_on_start: bool = True,
24-
storage_dir: Path | None = None,
23+
configuration: Configuration | None = None,
2524
) -> DatasetClient:
2625
"""Open a dataset client."""
2726

@@ -31,8 +30,7 @@ async def open_key_value_store_client(
3130
*,
3231
id: str | None = None,
3332
name: str | None = None,
34-
purge_on_start: bool = True,
35-
storage_dir: Path | None = None,
33+
configuration: Configuration | None = None,
3634
) -> KeyValueStoreClient:
3735
"""Open a key-value store client."""
3836

@@ -42,7 +40,6 @@ async def open_request_queue_client(
4240
*,
4341
id: str | None = None,
4442
name: str | None = None,
45-
purge_on_start: bool = True,
46-
storage_dir: Path | None = None,
43+
configuration: Configuration | None = None,
4744
) -> RequestQueueClient:
4845
"""Open a request queue client."""

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
from collections.abc import AsyncIterator
2222
from typing import Any
2323

24+
from crawlee.configuration import Configuration
25+
2426
logger = getLogger(__name__)
2527

2628

@@ -32,14 +34,11 @@ class FileSystemDatasetClient(DatasetClient):
3234
filename, allowing for easy ordering and pagination.
3335
"""
3436

35-
_DEFAULT_NAME = 'default'
36-
"""The default name for the dataset when no name is provided."""
37-
3837
_STORAGE_SUBDIR = 'datasets'
3938
"""The name of the subdirectory where datasets are stored."""
4039

41-
_LOCAL_ENTRY_NAME_DIGITS = 9
42-
"""Number of digits used for the file names (e.g., 000000019.json)."""
40+
_ITEM_FILENAME_DIGITS = 9
41+
"""Number of digits used for the dataset item file names (e.g., 000000019.json)."""
4342

4443
_cache_by_name: ClassVar[dict[str, FileSystemDatasetClient]] = {}
4544
"""A dictionary to cache clients by their names."""
@@ -72,7 +71,7 @@ def __init__(
7271

7372
# Internal attributes
7473
self._lock = asyncio.Lock()
75-
"""A lock to ensure that only one file operation is performed at a time."""
74+
"""A lock to ensure that only one operation is performed at a time."""
7675

7776
@override
7877
@property
@@ -94,24 +93,24 @@ def path_to_metadata(self) -> Path:
9493
async def open(
9594
cls,
9695
*,
97-
id: str | None = None,
98-
name: str | None = None,
99-
storage_dir: Path | None = None,
96+
id: str | None,
97+
name: str | None,
98+
configuration: Configuration,
10099
) -> FileSystemDatasetClient:
101100
if id:
102101
raise ValueError(
103102
'Opening a dataset by "id" is not supported for file system storage client, use "name" instead.'
104103
)
105104

106-
name = name or cls._DEFAULT_NAME
105+
name = name or configuration.default_dataset_id
107106

108107
# Check if the client is already cached by name.
109108
if name in cls._cache_by_name:
110109
client = cls._cache_by_name[name]
111110
await client._update_metadata(update_accessed_at=True) # noqa: SLF001
112111
return client
113112

114-
storage_dir = storage_dir or Path.cwd()
113+
storage_dir = Path(configuration.storage_dir)
115114
dataset_path = storage_dir / cls._STORAGE_SUBDIR / name
116115
metadata_path = dataset_path / METADATA_FILENAME
117116

@@ -386,7 +385,7 @@ async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
386385
# Acquire the lock to perform file operations safely.
387386
async with self._lock:
388387
# Generate the filename for the new item using zero-padded numbering.
389-
filename = f'{str(item_id).zfill(self._LOCAL_ENTRY_NAME_DIGITS)}.json'
388+
filename = f'{str(item_id).zfill(self._ITEM_FILENAME_DIGITS)}.json'
390389
file_path = self.path_to_dataset / filename
391390

392391
# Ensure the dataset directory exists.

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
if TYPE_CHECKING:
2222
from collections.abc import AsyncIterator
2323

24+
from crawlee.configuration import Configuration
25+
2426

2527
logger = getLogger(__name__)
2628

@@ -33,9 +35,6 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
3335
in an accompanying file.
3436
"""
3537

36-
_DEFAULT_NAME = 'default'
37-
"""The default name for the unnamed key-value store."""
38-
3938
_STORAGE_SUBDIR = 'key_value_stores'
4039
"""The name of the subdirectory where key-value stores are stored."""
4140

@@ -68,7 +67,7 @@ def __init__(
6867

6968
# Internal attributes
7069
self._lock = asyncio.Lock()
71-
"""A lock to ensure that only one file operation is performed at a time."""
70+
"""A lock to ensure that only one operation is performed at a time."""
7271

7372
@override
7473
@property
@@ -90,22 +89,24 @@ def path_to_metadata(self) -> Path:
9089
async def open(
9190
cls,
9291
*,
93-
id: str | None = None,
94-
name: str | None = None,
95-
storage_dir: Path | None = None,
92+
id: str | None,
93+
name: str | None,
94+
configuration: Configuration,
9695
) -> FileSystemKeyValueStoreClient:
9796
if id:
9897
raise ValueError(
9998
'Opening a key-value store by "id" is not supported for file system storage client, use "name" instead.'
10099
)
101100

102-
name = name or cls._DEFAULT_NAME
101+
name = name or configuration.default_dataset_id
103102

104103
# Check if the client is already cached by name.
105104
if name in cls._cache_by_name:
106-
return cls._cache_by_name[name]
105+
client = cls._cache_by_name[name]
106+
await client._update_metadata(update_accessed_at=True) # noqa: SLF001
107+
return client
107108

108-
storage_dir = storage_dir or Path.cwd()
109+
storage_dir = Path(configuration.storage_dir)
109110
kvs_path = storage_dir / cls._STORAGE_SUBDIR / name
110111
metadata_path = kvs_path / METADATA_FILENAME
111112

0 commit comments

Comments
 (0)