
Commit 6b7b8bd: Cleanup
1 parent: be2232b

20 files changed: +343 / -791 lines

src/crawlee/_consts.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 from __future__ import annotations
 
 METADATA_FILENAME = '__metadata__.json'
+"""The name of the metadata file for storage clients."""

src/crawlee/_types.py

Lines changed: 150 additions & 76 deletions
@@ -3,27 +3,41 @@
 import dataclasses
 from collections.abc import Iterator, Mapping
 from dataclasses import dataclass
-from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Callable,
+    Literal,
+    Optional,
+    Protocol,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
-from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
 
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    import json
     import logging
     import re
-    from collections.abc import Coroutine, Sequence
+    from collections.abc import Callable, Coroutine, Sequence
+
+    from typing_extensions import NotRequired, Required, TypeAlias, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
+    from crawlee.configuration import Configuration
     from crawlee.http_clients import HttpResponse
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
-    from crawlee.storage_clients.models import DatasetItemsListPage
+    from crawlee.storage_clients import StorageClient
     from crawlee.storages import KeyValueStore
-    from crawlee.storages._types import ExportToKwargs, GetDataKwargs
 
 # Workaround for https://github.com/pydantic/pydantic/issues/9445
 J = TypeVar('J', bound='JsonSerializable')
@@ -138,15 +152,6 @@ def __init__(
         self.max_tasks_per_minute = max_tasks_per_minute
 
 
-@docs_group('Data structures')
-class StorageTypes(str, Enum):
-    """Possible Crawlee storage types."""
-
-    DATASET = 'Dataset'
-    KEY_VALUE_STORE = 'Key-value store'
-    REQUEST_QUEUE = 'Request queue'
-
-
 class EnqueueLinksKwargs(TypedDict):
     """Keyword arguments for the `enqueue_links` methods."""
 
@@ -416,55 +421,6 @@ def __call__(
         """
 
 
-@docs_group('Functions')
-class ExportToFunction(Protocol):
-    """A function for exporting data from a `Dataset`.
-
-    It simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports
-    its content to a `KeyValueStore`.
-    """
-
-    def __call__(
-        self,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[ExportToKwargs],
-    ) -> Coroutine[None, None, None]:
-        """Call dunder method.
-
-        Args:
-            dataset_id: The ID of the `Dataset` to export data from.
-            dataset_name: The name of the `Dataset` to export data from.
-            **kwargs: Additional keyword arguments.
-        """
-
-
-@docs_group('Functions')
-class GetDataFunction(Protocol):
-    """A function for retrieving data from a `Dataset`.
-
-    It simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves
-    data based on the provided parameters. It allows filtering and pagination.
-    """
-
-    def __call__(
-        self,
-        dataset_id: str | None = None,
-        dataset_name: str | None = None,
-        **kwargs: Unpack[GetDataKwargs],
-    ) -> Coroutine[None, None, DatasetItemsListPage]:
-        """Call dunder method.
-
-        Args:
-            dataset_id: ID of the `Dataset` to get data from.
-            dataset_name: Name of the `Dataset` to get data from.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            A page of retrieved items.
-        """
-
-
 @docs_group('Functions')
 class GetKeyValueStoreFunction(Protocol):
     """A function for accessing a `KeyValueStore`.
@@ -573,18 +529,6 @@ def __bool__(self) -> bool:
         return bool(self.screenshot or self.html)
 
 
-@docs_group('Functions')
-class GetPageSnapshot(Protocol):
-    """A function for getting snapshot of a page."""
-
-    def __call__(self) -> Coroutine[None, None, PageSnapshot]:
-        """Get page snapshot.
-
-        Returns:
-            Snapshot of a page.
-        """
-
-
 @docs_group('Functions')
 class UseStateFunction(Protocol):
     """A function for managing state within the crawling context.
@@ -652,3 +596,133 @@ async def get_snapshot(self) -> PageSnapshot:
     def __hash__(self) -> int:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
+
+
+class GetDataKwargs(TypedDict):
+    """Keyword arguments for dataset's `get_data` method."""
+
+    offset: NotRequired[int]
+    """Skips the specified number of items at the start."""
+
+    limit: NotRequired[int | None]
+    """The maximum number of items to retrieve. Unlimited if None."""
+
+    clean: NotRequired[bool]
+    """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty."""
+
+    desc: NotRequired[bool]
+    """Set to True to sort results in descending order."""
+
+    fields: NotRequired[list[str]]
+    """Fields to include in each item. Sorts fields as specified if provided."""
+
+    omit: NotRequired[list[str]]
+    """Fields to exclude from each item."""
+
+    unwind: NotRequired[str]
+    """Unwinds items by a specified array field, turning each element into a separate item."""
+
+    skip_empty: NotRequired[bool]
+    """Excludes empty items from the results if True."""
+
+    skip_hidden: NotRequired[bool]
+    """Excludes fields starting with '#' if True."""
+
+    flatten: NotRequired[list[str]]
+    """Fields to be flattened in returned items."""
+
+    view: NotRequired[str]
+    """Specifies the dataset view to be used."""
+
+
+class ExportToKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_to` method."""
+
+    key: Required[str]
+    """The key under which to save the data."""
+
+    content_type: NotRequired[Literal['json', 'csv']]
+    """The format in which to export the data. Either 'json' or 'csv'."""
+
+    to_kvs_id: NotRequired[str]
+    """ID of the key-value store to save the exported file."""
+
+    to_kvs_name: NotRequired[str]
+    """Name of the key-value store to save the exported file."""
+
+    to_kvs_storage_client: NotRequired[StorageClient]
+    """The storage client to use for saving the exported file."""
+
+    to_kvs_configuration: NotRequired[Configuration]
+    """The configuration to use for saving the exported file."""
+
+
+class ExportDataJsonKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_json` method."""
+
+    skipkeys: NotRequired[bool]
+    """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped
+    instead of raising a `TypeError`."""
+
+    ensure_ascii: NotRequired[bool]
+    """Determines if non-ASCII characters should be escaped in the output JSON string."""
+
+    check_circular: NotRequired[bool]
+    """If False (default: True), skips the circular reference check for container types. A circular reference will
+    result in a `RecursionError` or worse if unchecked."""
+
+    allow_nan: NotRequired[bool]
+    """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply
+    with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity)."""
+
+    cls: NotRequired[type[json.JSONEncoder]]
+    """Allows specifying a custom JSON encoder."""
+
+    indent: NotRequired[int]
+    """Specifies the number of spaces for indentation in the pretty-printed JSON output."""
+
+    separators: NotRequired[tuple[str, str]]
+    """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ')
+    otherwise."""
+
+    default: NotRequired[Callable]
+    """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version
+    of the object or raise a `TypeError`."""
+
+    sort_keys: NotRequired[bool]
+    """Specifies whether the output JSON object should have keys sorted alphabetically."""
+
+
+class ExportDataCsvKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_csv` method."""
+
+    dialect: NotRequired[str]
+    """Specifies a dialect to be used in CSV parsing and writing."""
+
+    delimiter: NotRequired[str]
+    """A one-character string used to separate fields. Defaults to ','."""
+
+    doublequote: NotRequired[bool]
+    """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled;
+    when False, the `escapechar` is used as a prefix. Defaults to True."""
+
+    escapechar: NotRequired[str]
+    """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar`
+    if `doublequote` is False. Defaults to None, disabling escaping."""
+
+    lineterminator: NotRequired[str]
+    """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'."""
+
+    quotechar: NotRequired[str]
+    """A one-character string used to quote fields containing special characters, like the delimiter or quotechar,
+    or fields containing new-line characters. Defaults to '\"'."""
+
+    quoting: NotRequired[int]
+    """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of
+    the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`."""
+
+    skipinitialspace: NotRequired[bool]
+    """When True, spaces immediately following the delimiter are ignored. Defaults to False."""
+
+    strict: NotRequired[bool]
+    """When True, raises an exception on bad CSV input. Defaults to False."""

src/crawlee/_utils/data_processing.py

Lines changed: 0 additions & 41 deletions
This file was deleted.
