Commit e84b978

Fix race condition in FS RQ and reduce pytest warnings (BS)
1 parent 4d6d143 commit e84b978

File tree

4 files changed, +34 -29 lines changed

src/crawlee/crawlers/_beautifulsoup/_utils.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def html_to_text(source: str | Tag) -> str:
         Newline separated plain text without tags.
     """
     if isinstance(source, str):
-        soup = BeautifulSoup(source)
+        soup = BeautifulSoup(source, features='lxml')
     elif isinstance(source, BeautifulSoup):
         soup = source
     else:
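Note on the parser change: when BeautifulSoup is constructed without a features argument, bs4 guesses a parser and emits GuessedAtParserWarning, which pytest then surfaces in its warnings summary. A minimal sketch of the before/after behaviour, assuming bs4 >= 4.9 (which exposes GuessedAtParserWarning) and an installed lxml:

import warnings

from bs4 import BeautifulSoup, GuessedAtParserWarning

html = '<p>Hello <b>world</b></p>'

# Without an explicit parser, bs4 guesses one and warns; pytest reports it.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup(html)
assert any(issubclass(w.category, GuessedAtParserWarning) for w in caught)

# Naming the parser produces the same soup and stays silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup(html, features='lxml')
assert not caught

This is why both the utility above and its test below now pass features='lxml' explicitly instead of silencing the warning in pytest configuration.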

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 22 additions & 21 deletions
@@ -688,31 +688,32 @@ async def is_empty(self) -> bool:
         Returns:
             True if the queue is empty, False otherwise.
         """
-        # Update accessed timestamp when checking if queue is empty
-        await self._update_metadata(update_accessed_at=True)
+        async with self._lock:
+            # Update accessed timestamp when checking if queue is empty
+            await self._update_metadata(update_accessed_at=True)

-        # Create the requests directory if it doesn't exist
-        await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True)
+            # Create the requests directory if it doesn't exist
+            await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True)

-        # List all request files
-        request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json'))
+            # List all request files
+            request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json'))

-        # Check each file to see if there are any unhandled requests
-        for request_file in request_files:
-            # Skip metadata file
-            if request_file.name == METADATA_FILENAME:
-                continue
+            # Check each file to see if there are any unhandled requests
+            for request_file in request_files:
+                # Skip metadata file
+                if request_file.name == METADATA_FILENAME:
+                    continue

-            file = await asyncio.to_thread(open, request_file)
-            try:
-                file_content = json.load(file)
-                # If any request is not handled, the queue is not empty
-                if file_content.get('handled_at') is None:
-                    return False
-            except (json.JSONDecodeError, ValidationError):
-                logger.warning(f'Failed to parse request file: {request_file}')
-            finally:
-                await asyncio.to_thread(file.close)
+                file = await asyncio.to_thread(open, request_file)
+                try:
+                    file_content = json.load(file)
+                    # If any request is not handled, the queue is not empty
+                    if file_content.get('handled_at') is None:
+                        return False
+                except (json.JSONDecodeError, ValidationError):
+                    logger.warning(f'Failed to parse request file: {request_file}')
+                finally:
+                    await asyncio.to_thread(file.close)

         # If we got here, all requests are handled or there are no requests
         return True
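The race being addressed: is_empty() previously scanned the request directory without holding the client's lock, so it could presumably overlap with a concurrent writer and read a request file mid-write or observe an inconsistent directory listing. Wrapping the whole scan in async with self._lock serializes it with those writers. A toy sketch of the pattern with hypothetical names (TinyFileQueue, add), not the real FileSystemRequestQueueClient:

import asyncio
import json
from pathlib import Path


class TinyFileQueue:
    """Toy illustration of the locking pattern, not the real client."""

    def __init__(self, root: Path) -> None:
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()  # shared by every coroutine touching the directory

    async def add(self, request_id: str) -> None:
        async with self._lock:
            path = self.root / f'{request_id}.json'
            # Writing happens under the lock, so a concurrent is_empty() can never
            # observe the file in a partially written state.
            await asyncio.to_thread(path.write_text, json.dumps({'handled_at': None}))

    async def is_empty(self) -> bool:
        async with self._lock:
            for file in await asyncio.to_thread(list, self.root.glob('*.json')):
                content = json.loads(await asyncio.to_thread(file.read_text))
                if content.get('handled_at') is None:
                    return False
            return True


async def demo() -> None:
    queue = TinyFileQueue(Path('/tmp/tiny-rq'))
    await queue.add('req-1')
    print(await queue.is_empty())  # False until req-1 gets a handled_at timestamp


if __name__ == '__main__':
    asyncio.run(demo())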

tests/unit/_utils/test_html_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -196,4 +196,4 @@ def test_html_to_text_parsel() -> None:


 def test_html_to_text_beautifulsoup() -> None:
-    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML)) == _EXPECTED_TEXT
+    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML, features='lxml')) == _EXPECTED_TEXT

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 10 additions & 6 deletions
@@ -465,8 +465,8 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert visited == set(test_input.expected_urls)


-async def test_session_rotation() -> None:
-    track_session_usage = Mock()
+async def test_session_rotation(server_url: URL) -> None:
+    session_ids: list[str | None] = []

     crawler = BasicCrawler(
         max_session_rotations=7,
@@ -475,16 +475,20 @@ async def test_session_rotation() -> None:

     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
-        track_session_usage(context.session.id if context.session else None)
+        session_ids.append(context.session.id if context.session else None)
         raise SessionError('Test error')

-    await crawler.run([Request.from_url('https://someplace.com/', label='start')])
-    assert track_session_usage.call_count == 7
+    await crawler.run([str(server_url)])

-    session_ids = {call[0][0] for call in track_session_usage.call_args_list}
+    # exactly 7 handler calls happened
     assert len(session_ids) == 7
+
+    # all session ids are not None
     assert None not in session_ids

+    # and each was a different session
+    assert len(set(session_ids)) == 7
+

 async def test_final_statistics() -> None:
     crawler = BasicCrawler(max_request_retries=3)
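For context, the rewritten test records session ids in a plain list and runs against the suite's local server_url fixture instead of an external domain. A rough standalone sketch of the same scenario follows; the crawlee import paths and the throwaway stdlib HTTP server are assumptions (the real test relies on the fixture), so treat it as an illustration rather than the project's test:

import asyncio
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

# Import paths may differ across crawlee versions; these match recent releases.
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.errors import SessionError


class _OkHandler(BaseHTTPRequestHandler):
    def do_GET(self) -> None:  # noqa: N802
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b'ok')

    def log_message(self, *args: object) -> None:
        pass  # keep output quiet


async def main() -> None:
    # Stand-in for the pytest server_url fixture: a throwaway local HTTP server.
    server = HTTPServer(('127.0.0.1', 0), _OkHandler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    url = f'http://127.0.0.1:{server.server_port}/'

    session_ids: list[str | None] = []
    crawler = BasicCrawler(max_session_rotations=7)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        session_ids.append(context.session.id if context.session else None)
        raise SessionError('force a session rotation')

    await crawler.run([url])
    server.shutdown()

    # Seven rotations, each with a distinct, non-None session.
    assert len(session_ids) == 7
    assert None not in session_ids
    assert len(set(session_ids)) == 7


if __name__ == '__main__':
    asyncio.run(main())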
