|
3 | 3 | import dataclasses
|
4 | 4 | from collections.abc import Iterator, Mapping
|
5 | 5 | from dataclasses import dataclass
|
6 |
| -from enum import Enum |
7 |
| -from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload |
| 6 | +from typing import ( |
| 7 | + TYPE_CHECKING, |
| 8 | + Annotated, |
| 9 | + Any, |
| 10 | + Callable, |
| 11 | + Literal, |
| 12 | + Optional, |
| 13 | + Protocol, |
| 14 | + TypedDict, |
| 15 | + TypeVar, |
| 16 | + Union, |
| 17 | + cast, |
| 18 | + overload, |
| 19 | +) |
8 | 20 |
|
9 | 21 | from pydantic import ConfigDict, Field, PlainValidator, RootModel
|
10 |
| -from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack |
11 | 22 |
|
12 | 23 | from crawlee._utils.docs import docs_group
|
13 | 24 |
|
14 | 25 | if TYPE_CHECKING:
|
| 26 | + import json |
15 | 27 | import logging
|
16 | 28 | import re
|
17 |
| - from collections.abc import Coroutine, Sequence |
| 29 | + from collections.abc import Callable, Coroutine, Sequence |
| 30 | + |
| 31 | + from typing_extensions import NotRequired, Required, TypeAlias, Unpack |
18 | 32 |
|
19 | 33 | from crawlee import Glob, Request
|
20 | 34 | from crawlee._request import RequestOptions
|
| 35 | + from crawlee.configuration import Configuration |
21 | 36 | from crawlee.http_clients import HttpResponse
|
22 | 37 | from crawlee.proxy_configuration import ProxyInfo
|
23 | 38 | from crawlee.sessions import Session
|
24 |
| - from crawlee.storage_clients.models import DatasetItemsListPage |
| 39 | + from crawlee.storage_clients import StorageClient |
25 | 40 | from crawlee.storages import KeyValueStore
|
26 |
| - from crawlee.storages._types import ExportToKwargs, GetDataKwargs |
27 | 41 |
|
28 | 42 | # Workaround for https://github.com/pydantic/pydantic/issues/9445
|
29 | 43 | J = TypeVar('J', bound='JsonSerializable')
|
@@ -138,15 +152,6 @@ def __init__(
|
138 | 152 | self.max_tasks_per_minute = max_tasks_per_minute
|
139 | 153 |
|
140 | 154 |
|
141 |
| -@docs_group('Data structures') |
142 |
| -class StorageTypes(str, Enum): |
143 |
| - """Possible Crawlee storage types.""" |
144 |
| - |
145 |
| - DATASET = 'Dataset' |
146 |
| - KEY_VALUE_STORE = 'Key-value store' |
147 |
| - REQUEST_QUEUE = 'Request queue' |
148 |
| - |
149 |
| - |
150 | 155 | class EnqueueLinksKwargs(TypedDict):
|
151 | 156 | """Keyword arguments for the `enqueue_links` methods."""
|
152 | 157 |
|
@@ -416,55 +421,6 @@ def __call__(
|
416 | 421 | """
|
417 | 422 |
|
418 | 423 |
|
419 |
| -@docs_group('Functions') |
420 |
| -class ExportToFunction(Protocol): |
421 |
| - """A function for exporting data from a `Dataset`. |
422 |
| -
|
423 |
| - It simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports |
424 |
| - its content to a `KeyValueStore`. |
425 |
| - """ |
426 |
| - |
427 |
| - def __call__( |
428 |
| - self, |
429 |
| - dataset_id: str | None = None, |
430 |
| - dataset_name: str | None = None, |
431 |
| - **kwargs: Unpack[ExportToKwargs], |
432 |
| - ) -> Coroutine[None, None, None]: |
433 |
| - """Call dunder method. |
434 |
| -
|
435 |
| - Args: |
436 |
| - dataset_id: The ID of the `Dataset` to export data from. |
437 |
| - dataset_name: The name of the `Dataset` to export data from. |
438 |
| - **kwargs: Additional keyword arguments. |
439 |
| - """ |
440 |
| - |
441 |
| - |
442 |
| -@docs_group('Functions') |
443 |
| -class GetDataFunction(Protocol): |
444 |
| - """A function for retrieving data from a `Dataset`. |
445 |
| -
|
446 |
| - It simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves |
447 |
| - data based on the provided parameters. It allows filtering and pagination. |
448 |
| - """ |
449 |
| - |
450 |
| - def __call__( |
451 |
| - self, |
452 |
| - dataset_id: str | None = None, |
453 |
| - dataset_name: str | None = None, |
454 |
| - **kwargs: Unpack[GetDataKwargs], |
455 |
| - ) -> Coroutine[None, None, DatasetItemsListPage]: |
456 |
| - """Call dunder method. |
457 |
| -
|
458 |
| - Args: |
459 |
| - dataset_id: ID of the `Dataset` to get data from. |
460 |
| - dataset_name: Name of the `Dataset` to get data from. |
461 |
| - **kwargs: Additional keyword arguments. |
462 |
| -
|
463 |
| - Returns: |
464 |
| - A page of retrieved items. |
465 |
| - """ |
466 |
| - |
467 |
| - |
468 | 424 | @docs_group('Functions')
|
469 | 425 | class GetKeyValueStoreFunction(Protocol):
|
470 | 426 | """A function for accessing a `KeyValueStore`.
|
@@ -573,18 +529,6 @@ def __bool__(self) -> bool:
|
573 | 529 | return bool(self.screenshot or self.html)
|
574 | 530 |
|
575 | 531 |
|
576 |
| -@docs_group('Functions') |
577 |
| -class GetPageSnapshot(Protocol): |
578 |
| - """A function for getting snapshot of a page.""" |
579 |
| - |
580 |
| - def __call__(self) -> Coroutine[None, None, PageSnapshot]: |
581 |
| - """Get page snapshot. |
582 |
| -
|
583 |
| - Returns: |
584 |
| - Snapshot of a page. |
585 |
| - """ |
586 |
| - |
587 |
| - |
588 | 532 | @docs_group('Functions')
|
589 | 533 | class UseStateFunction(Protocol):
|
590 | 534 | """A function for managing state within the crawling context.
|
@@ -652,3 +596,133 @@ async def get_snapshot(self) -> PageSnapshot:
|
652 | 596 | def __hash__(self) -> int:
|
653 | 597 | """Return hash of the context. Each context is considered unique."""
|
654 | 598 | return id(self)
|
| 599 | + |
| 600 | + |
class GetDataKwargs(TypedDict):
    """Optional keyword arguments accepted by a dataset's `get_data` method."""

    offset: NotRequired[int]
    """Number of items to skip from the start of the results."""

    limit: NotRequired[int | None]
    """Upper bound on the number of items to retrieve. `None` means no limit."""

    clean: NotRequired[bool]
    """Returns only non-empty items and excludes hidden fields. Shortcut for `skip_hidden` and `skip_empty`."""

    desc: NotRequired[bool]
    """When True, results are sorted in descending order."""

    fields: NotRequired[list[str]]
    """Names of the fields to include in each item. When provided, fields are sorted as specified."""

    omit: NotRequired[list[str]]
    """Names of the fields to exclude from each item."""

    unwind: NotRequired[str]
    """Name of an array field to unwind; each of its elements becomes a separate item."""

    skip_empty: NotRequired[bool]
    """When True, empty items are excluded from the results."""

    skip_hidden: NotRequired[bool]
    """When True, fields starting with '#' are excluded."""

    flatten: NotRequired[list[str]]
    """Names of the fields to be flattened in the returned items."""

    view: NotRequired[str]
    """The dataset view to be used."""
| 636 | + |
| 637 | + |
class ExportToKwargs(TypedDict):
    """Keyword arguments accepted by a dataset's `export_to` method."""

    key: Required[str]
    """Key under which the exported data is saved. This is the only mandatory entry."""

    content_type: NotRequired[Literal['json', 'csv']]
    """Export format of the data: either 'json' or 'csv'."""

    to_kvs_id: NotRequired[str]
    """ID of the key-value store in which the exported file is saved."""

    to_kvs_name: NotRequired[str]
    """Name of the key-value store in which the exported file is saved."""

    to_kvs_storage_client: NotRequired[StorageClient]
    """Storage client used for saving the exported file."""

    to_kvs_configuration: NotRequired[Configuration]
    """Configuration used for saving the exported file."""
| 658 | + |
| 659 | + |
class ExportDataJsonKwargs(TypedDict):
    """Optional keyword arguments accepted by a dataset's `export_data_json` method."""

    skipkeys: NotRequired[bool]
    """When True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) are
    skipped rather than causing a `TypeError`."""

    ensure_ascii: NotRequired[bool]
    """Whether non-ASCII characters are escaped in the output JSON string."""

    check_circular: NotRequired[bool]
    """When False (default: True), the circular reference check for container types is skipped. An unchecked
    circular reference results in a `RecursionError` or worse."""

    allow_nan: NotRequired[bool]
    """When False (default: True), out-of-range float values (nan, inf, -inf) raise a ValueError, in strict
    compliance with the JSON specification. When True, their JavaScript equivalents (NaN, Infinity, -Infinity)
    are used."""

    cls: NotRequired[type[json.JSONEncoder]]
    """Custom JSON encoder class to use."""

    indent: NotRequired[int]
    """Number of spaces used for indentation in the pretty-printed JSON output."""

    separators: NotRequired[tuple[str, str]]
    """An (item_separator, key_separator) tuple. Defaults to (', ', ': ') if indent is None and to (',', ': ')
    otherwise."""

    default: NotRequired[Callable]
    """Function called for objects that cannot be serialized otherwise. It should either return a JSON-encodable
    version of the object or raise a `TypeError`."""

    sort_keys: NotRequired[bool]
    """Whether the keys of the output JSON object are sorted alphabetically."""
| 694 | + |
| 695 | + |
class ExportDataCsvKwargs(TypedDict):
    """Optional keyword arguments accepted by a dataset's `export_data_csv` method."""

    dialect: NotRequired[str]
    """The dialect to be used in CSV parsing and writing."""

    delimiter: NotRequired[str]
    """One-character string that separates fields. Defaults to ','."""

    doublequote: NotRequired[bool]
    """How instances of `quotechar` appearing inside a field are quoted. When True, the character is doubled;
    when False, the `escapechar` is used as a prefix. Defaults to True."""

    escapechar: NotRequired[str]
    """One-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE`, and the `quotechar`
    if `doublequote` is False. Defaults to None, which disables escaping."""

    lineterminator: NotRequired[str]
    """String that terminates the lines produced by the writer. Defaults to '\\r\\n'."""

    quotechar: NotRequired[str]
    """One-character string used to quote fields that contain special characters, such as the delimiter or
    quotechar, or that contain new-line characters. Defaults to '\"'."""

    quoting: NotRequired[int]
    """When quotes are generated by the writer and recognized by the reader. Accepts any of the `QUOTE_*`
    constants; the default is `QUOTE_MINIMAL`."""

    skipinitialspace: NotRequired[bool]
    """When True, spaces immediately following the delimiter are ignored. Defaults to False."""

    strict: NotRequired[bool]
    """When True, an exception is raised on bad CSV input. Defaults to False."""
0 commit comments