Skip to content

Commit 88154fe

Browse files
committed
reduce warnings (pytest beautifulsoup)
1 parent 4d6d143 commit 88154fe

File tree

9 files changed

+1072
-3
lines changed

9 files changed

+1072
-3
lines changed

src/crawlee/crawlers/_beautifulsoup/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def html_to_text(source: str | Tag) -> str:
2727
Newline separated plain text without tags.
2828
"""
2929
if isinstance(source, str):
30-
soup = BeautifulSoup(source)
30+
soup = BeautifulSoup(source, features='lxml')
3131
elif isinstance(source, BeautifulSoup):
3232
soup = source
3333
else:
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from ._dataset_client import ApifyDatasetClient
2+
from ._key_value_store_client import ApifyKeyValueStoreClient
3+
from ._request_queue_client import ApifyRequestQueueClient
4+
from ._storage_client import ApifyStorageClient
5+
6+
__all__ = [
7+
'ApifyDatasetClient',
8+
'ApifyKeyValueStoreClient',
9+
'ApifyRequestQueueClient',
10+
'ApifyStorageClient',
11+
]
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
from logging import getLogger
5+
from typing import TYPE_CHECKING, Any
6+
7+
from apify_client import ApifyClientAsync
8+
from typing_extensions import override
9+
10+
from crawlee.storage_clients._base import DatasetClient
11+
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
12+
13+
if TYPE_CHECKING:
14+
from collections.abc import AsyncIterator
15+
from datetime import datetime
16+
17+
from apify import Configuration
18+
from apify_client.clients import DatasetClientAsync
19+
20+
logger = getLogger(__name__)
21+
22+
23+
class ApifyDatasetClient(DatasetClient):
24+
"""An Apify platform implementation of the dataset client."""
25+
26+
def __init__(
27+
self,
28+
*,
29+
id: str,
30+
name: str | None,
31+
created_at: datetime,
32+
accessed_at: datetime,
33+
modified_at: datetime,
34+
item_count: int,
35+
api_client: DatasetClientAsync,
36+
) -> None:
37+
"""Initialize a new instance.
38+
39+
Preferably use the `ApifyDatasetClient.open` class method to create a new instance.
40+
"""
41+
self._metadata = DatasetMetadata(
42+
id=id,
43+
name=name,
44+
created_at=created_at,
45+
accessed_at=accessed_at,
46+
modified_at=modified_at,
47+
item_count=item_count,
48+
)
49+
50+
self._api_client = api_client
51+
"""The Apify dataset client for API operations."""
52+
53+
self._lock = asyncio.Lock()
54+
"""A lock to ensure that only one operation is performed at a time."""
55+
56+
@override
57+
@property
58+
def metadata(self) -> DatasetMetadata:
59+
return self._metadata
60+
61+
@override
62+
@classmethod
63+
async def open(
64+
cls,
65+
*,
66+
id: str | None,
67+
name: str | None,
68+
configuration: Configuration,
69+
) -> ApifyDatasetClient:
70+
token = configuration.token
71+
api_url = configuration.api_base_url
72+
73+
# Otherwise, create a new one.
74+
apify_client_async = ApifyClientAsync(
75+
token=token,
76+
api_url=api_url,
77+
max_retries=8,
78+
min_delay_between_retries_millis=500,
79+
timeout_secs=360,
80+
)
81+
82+
apify_datasets_client = apify_client_async.datasets()
83+
84+
metadata = DatasetMetadata.model_validate(
85+
await apify_datasets_client.get_or_create(name=id if id is not None else name),
86+
)
87+
88+
apify_dataset_client = apify_client_async.dataset(dataset_id=metadata.id)
89+
90+
return cls(
91+
id=metadata.id,
92+
name=metadata.name,
93+
created_at=metadata.created_at,
94+
accessed_at=metadata.accessed_at,
95+
modified_at=metadata.modified_at,
96+
item_count=metadata.item_count,
97+
api_client=apify_dataset_client,
98+
)
99+
100+
@override
101+
async def drop(self) -> None:
102+
async with self._lock:
103+
await self._api_client.delete()
104+
105+
@override
106+
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
107+
async with self._lock:
108+
await self._api_client.push_items(items=data)
109+
await self._update_metadata()
110+
111+
@override
112+
async def get_data(
113+
self,
114+
*,
115+
offset: int = 0,
116+
limit: int | None = 999_999_999_999,
117+
clean: bool = False,
118+
desc: bool = False,
119+
fields: list[str] | None = None,
120+
omit: list[str] | None = None,
121+
unwind: str | None = None,
122+
skip_empty: bool = False,
123+
skip_hidden: bool = False,
124+
flatten: list[str] | None = None,
125+
view: str | None = None,
126+
) -> DatasetItemsListPage:
127+
response = await self._api_client.list_items(
128+
offset=offset,
129+
limit=limit,
130+
clean=clean,
131+
desc=desc,
132+
fields=fields,
133+
omit=omit,
134+
unwind=unwind,
135+
skip_empty=skip_empty,
136+
skip_hidden=skip_hidden,
137+
flatten=flatten,
138+
view=view,
139+
)
140+
result = DatasetItemsListPage.model_validate(vars(response))
141+
await self._update_metadata()
142+
return result
143+
144+
@override
145+
async def iterate_items(
146+
self,
147+
*,
148+
offset: int = 0,
149+
limit: int | None = None,
150+
clean: bool = False,
151+
desc: bool = False,
152+
fields: list[str] | None = None,
153+
omit: list[str] | None = None,
154+
unwind: str | None = None,
155+
skip_empty: bool = False,
156+
skip_hidden: bool = False,
157+
) -> AsyncIterator[dict]:
158+
async for item in self._api_client.iterate_items(
159+
offset=offset,
160+
limit=limit,
161+
clean=clean,
162+
desc=desc,
163+
fields=fields,
164+
omit=omit,
165+
unwind=unwind,
166+
skip_empty=skip_empty,
167+
skip_hidden=skip_hidden,
168+
):
169+
yield item
170+
171+
await self._update_metadata()
172+
173+
async def _update_metadata(self) -> None:
174+
"""Update the dataset metadata file with current information."""
175+
metadata = await self._api_client.get()
176+
self._metadata = DatasetMetadata.model_validate(metadata)
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
from logging import getLogger
5+
from typing import TYPE_CHECKING, Any
6+
7+
from apify._crypto import create_hmac_signature
8+
from apify_client import ApifyClientAsync
9+
from typing_extensions import override
10+
from yarl import URL
11+
12+
from crawlee.storage_clients._base import KeyValueStoreClient
13+
from crawlee.storage_clients.models import (
14+
KeyValueStoreListKeysPage,
15+
KeyValueStoreMetadata,
16+
KeyValueStoreRecord,
17+
KeyValueStoreRecordMetadata,
18+
)
19+
20+
if TYPE_CHECKING:
21+
from collections.abc import AsyncIterator
22+
from datetime import datetime
23+
24+
from apify import Configuration
25+
from apify_client.clients import KeyValueStoreClientAsync
26+
27+
logger = getLogger(__name__)
28+
29+
30+
class ApifyKeyValueStoreClient(KeyValueStoreClient):
31+
"""An Apify platform implementation of the key-value store client."""
32+
33+
def __init__(
34+
self,
35+
*,
36+
id: str,
37+
name: str | None,
38+
created_at: datetime,
39+
accessed_at: datetime,
40+
modified_at: datetime,
41+
api_client: KeyValueStoreClientAsync,
42+
) -> None:
43+
"""Initialize a new instance.
44+
45+
Preferably use the `ApifyKeyValueStoreClient.open` class method to create a new instance.
46+
"""
47+
self._metadata = KeyValueStoreMetadata(
48+
id=id,
49+
name=name,
50+
created_at=created_at,
51+
accessed_at=accessed_at,
52+
modified_at=modified_at,
53+
)
54+
55+
self._api_client = api_client
56+
"""The Apify key-value store client for API operations."""
57+
58+
self._lock = asyncio.Lock()
59+
"""A lock to ensure that only one operation is performed at a time."""
60+
61+
@override
62+
@property
63+
def metadata(self) -> KeyValueStoreMetadata:
64+
return self._metadata
65+
66+
@override
67+
@classmethod
68+
async def open(
69+
cls,
70+
*,
71+
id: str | None,
72+
name: str | None,
73+
configuration: Configuration,
74+
) -> ApifyKeyValueStoreClient:
75+
token = configuration.token
76+
api_url = configuration.api_base_url
77+
78+
# Otherwise, create a new one.
79+
apify_client_async = ApifyClientAsync(
80+
token=token,
81+
api_url=api_url,
82+
max_retries=8,
83+
min_delay_between_retries_millis=500,
84+
timeout_secs=360,
85+
)
86+
87+
apify_kvss_client = apify_client_async.key_value_stores()
88+
89+
metadata = KeyValueStoreMetadata.model_validate(
90+
await apify_kvss_client.get_or_create(name=id if id is not None else name),
91+
)
92+
93+
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=metadata.id)
94+
95+
return cls(
96+
id=metadata.id,
97+
name=metadata.name,
98+
created_at=metadata.created_at,
99+
accessed_at=metadata.accessed_at,
100+
modified_at=metadata.modified_at,
101+
api_client=apify_kvs_client,
102+
)
103+
104+
@override
105+
async def purge(self) -> None:
106+
async with self._lock:
107+
await self._api_client.delete()
108+
109+
@override
110+
async def drop(self) -> None:
111+
async with self._lock:
112+
await self._api_client.delete()
113+
114+
@override
115+
async def get_value(self, key: str) -> KeyValueStoreRecord | None:
116+
response = await self._api_client.get_record(key)
117+
record = KeyValueStoreRecord.model_validate(response) if response else None
118+
await self._update_metadata()
119+
return record
120+
121+
@override
122+
async def set_value(self, key: str, value: Any, content_type: str | None = None) -> None:
123+
async with self._lock:
124+
await self._api_client.set_record(
125+
key=key,
126+
value=value,
127+
content_type=content_type,
128+
)
129+
await self._update_metadata()
130+
131+
@override
132+
async def delete_value(self, key: str) -> None:
133+
async with self._lock:
134+
await self._api_client.delete_record(key=key)
135+
await self._update_metadata()
136+
137+
@override
138+
async def iterate_keys(
139+
self,
140+
*,
141+
exclusive_start_key: str | None = None,
142+
limit: int | None = None,
143+
) -> AsyncIterator[KeyValueStoreRecordMetadata]:
144+
count = 0
145+
146+
while True:
147+
response = await self._api_client.list_keys(exclusive_start_key=exclusive_start_key)
148+
list_key_page = KeyValueStoreListKeysPage.model_validate(response)
149+
150+
for item in list_key_page.items:
151+
yield item
152+
count += 1
153+
154+
# If we've reached the limit, stop yielding
155+
if limit and count >= limit:
156+
break
157+
158+
# If we've reached the limit or there are no more pages, exit the loop
159+
if (limit and count >= limit) or not list_key_page.is_truncated:
160+
break
161+
162+
exclusive_start_key = list_key_page.next_exclusive_start_key
163+
164+
await self._update_metadata()
165+
166+
async def get_public_url(self, key: str) -> str:
167+
"""Get a URL for the given key that may be used to publicly access the value in the remote key-value store.
168+
169+
Args:
170+
key: The key for which the URL should be generated.
171+
"""
172+
if self._api_client.resource_id is None:
173+
raise ValueError('resource_id cannot be None when generating a public URL')
174+
175+
public_url = (
176+
URL(self._api_client.base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key
177+
)
178+
179+
key_value_store = self.metadata
180+
181+
if key_value_store and key_value_store.model_extra:
182+
url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey')
183+
if url_signing_secret_key:
184+
public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key))
185+
186+
return str(public_url)
187+
188+
async def _update_metadata(self) -> None:
189+
"""Update the key-value store metadata with current information."""
190+
metadata = await self._api_client.get()
191+
self._metadata = KeyValueStoreMetadata.model_validate(metadata)

0 commit comments

Comments
 (0)