Commit e84b978

Fix race condition in FS RQ and reduce pytest warnings (BS)
1 parent 4d6d143 commit e84b978

File tree

4 files changed, +34 -29 lines changed

src/crawlee/crawlers/_beautifulsoup/_utils.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def html_to_text(source: str | Tag) -> str:
         Newline separated plain text without tags.
     """
     if isinstance(source, str):
-        soup = BeautifulSoup(source)
+        soup = BeautifulSoup(source, features='lxml')
     elif isinstance(source, BeautifulSoup):
         soup = source
     else:
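Note on the parser change: when BeautifulSoup is constructed without a features argument, bs4 guesses a parser and emits GuessedAtParserWarning, which pytest then surfaces in its warnings summary. A minimal sketch of the before/after behaviour, assuming bs4 >= 4.9 (which exposes GuessedAtParserWarning) and an installed lxml:

import warnings

from bs4 import BeautifulSoup, GuessedAtParserWarning

html = '<p>Hello <b>world</b></p>'

# Without an explicit parser, bs4 guesses one and warns; pytest reports it.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup(html)
assert any(issubclass(w.category, GuessedAtParserWarning) for w in caught)

# Naming the parser produces the same soup and stays silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    BeautifulSoup(html, features='lxml')
assert not caught

This is why both the utility above and its test below now pass features='lxml' explicitly instead of silencing the warning in pytest configuration.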

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 22 additions & 21 deletions
@@ -688,31 +688,32 @@ async def is_empty(self) -> bool:
         Returns:
             True if the queue is empty, False otherwise.
         """
-        # Update accessed timestamp when checking if queue is empty
-        await self._update_metadata(update_accessed_at=True)
+        async with self._lock:
+            # Update accessed timestamp when checking if queue is empty
+            await self._update_metadata(update_accessed_at=True)

-        # Create the requests directory if it doesn't exist
-        await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True)
+            # Create the requests directory if it doesn't exist
+            await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True)

-        # List all request files
-        request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json'))
+            # List all request files
+            request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json'))

-        # Check each file to see if there are any unhandled requests
-        for request_file in request_files:
-            # Skip metadata file
-            if request_file.name == METADATA_FILENAME:
-                continue
+            # Check each file to see if there are any unhandled requests
+            for request_file in request_files:
+                # Skip metadata file
+                if request_file.name == METADATA_FILENAME:
+                    continue

-            file = await asyncio.to_thread(open, request_file)
-            try:
-                file_content = json.load(file)
-                # If any request is not handled, the queue is not empty
-                if file_content.get('handled_at') is None:
-                    return False
-            except (json.JSONDecodeError, ValidationError):
-                logger.warning(f'Failed to parse request file: {request_file}')
-            finally:
-                await asyncio.to_thread(file.close)
+                file = await asyncio.to_thread(open, request_file)
+                try:
+                    file_content = json.load(file)
+                    # If any request is not handled, the queue is not empty
+                    if file_content.get('handled_at') is None:
+                        return False
+                except (json.JSONDecodeError, ValidationError):
+                    logger.warning(f'Failed to parse request file: {request_file}')
+                finally:
+                    await asyncio.to_thread(file.close)

         # If we got here, all requests are handled or there are no requests
         return True
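The race being addressed: is_empty() previously scanned the request directory without holding the client's lock, so it could presumably overlap with a concurrent writer and read a request file mid-write or observe an inconsistent directory listing. Wrapping the whole scan in async with self._lock serializes it with those writers. A toy sketch of the pattern with hypothetical names (TinyFileQueue, add), not the real FileSystemRequestQueueClient:

import asyncio
import json
from pathlib import Path


class TinyFileQueue:
    """Toy illustration of the locking pattern, not the real client."""

    def __init__(self, root: Path) -> None:
        self.root = root
        self.root.mkdir(parents=True, exist_ok=True)
        self._lock = asyncio.Lock()  # shared by every coroutine touching the directory

    async def add(self, request_id: str) -> None:
        async with self._lock:
            path = self.root / f'{request_id}.json'
            # Writing happens under the lock, so a concurrent is_empty() can never
            # observe the file in a partially written state.
            await asyncio.to_thread(path.write_text, json.dumps({'handled_at': None}))

    async def is_empty(self) -> bool:
        async with self._lock:
            for file in await asyncio.to_thread(list, self.root.glob('*.json')):
                content = json.loads(await asyncio.to_thread(file.read_text))
                if content.get('handled_at') is None:
                    return False
            return True


async def demo() -> None:
    queue = TinyFileQueue(Path('/tmp/tiny-rq'))
    await queue.add('req-1')
    print(await queue.is_empty())  # False until req-1 gets a handled_at timestamp


if __name__ == '__main__':
    asyncio.run(demo())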

tests/unit/_utils/test_html_to_text.py

Lines changed: 1 addition & 1 deletion
@@ -196,4 +196,4 @@ def test_html_to_text_parsel() -> None:


 def test_html_to_text_beautifulsoup() -> None:
-    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML)) == _EXPECTED_TEXT
+    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML, features='lxml')) == _EXPECTED_TEXT

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 10 additions & 6 deletions
@@ -465,8 +465,8 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert visited == set(test_input.expected_urls)


-async def test_session_rotation() -> None:
-    track_session_usage = Mock()
+async def test_session_rotation(server_url: URL) -> None:
+    session_ids: list[str | None] = []

     crawler = BasicCrawler(
         max_session_rotations=7,
@@ -475,16 +475,20 @@ async def test_session_rotation() -> None:

     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
-        track_session_usage(context.session.id if context.session else None)
+        session_ids.append(context.session.id if context.session else None)
         raise SessionError('Test error')

-    await crawler.run([Request.from_url('https://someplace.com/', label='start')])
-    assert track_session_usage.call_count == 7
+    await crawler.run([str(server_url)])

-    session_ids = {call[0][0] for call in track_session_usage.call_args_list}
+    # exactly 7 handler calls happened
     assert len(session_ids) == 7
+
+    # all session ids are not None
     assert None not in session_ids

+    # and each was a different session
+    assert len(set(session_ids)) == 7
+

 async def test_final_statistics() -> None:
     crawler = BasicCrawler(max_request_retries=3)
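For context, the rewritten test records session ids in a plain list and runs against the suite's local server_url fixture instead of an external domain. A rough standalone sketch of the same scenario follows; the crawlee import paths and the throwaway stdlib HTTP server are assumptions (the real test relies on the fixture), so treat it as an illustration rather than the project's test:

import asyncio
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

# Import paths may differ across crawlee versions; these match recent releases.
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.errors import SessionError


class _OkHandler(BaseHTTPRequestHandler):
    def do_GET(self) -> None:  # noqa: N802
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b'ok')

    def log_message(self, *args: object) -> None:
        pass  # keep output quiet


async def main() -> None:
    # Stand-in for the pytest server_url fixture: a throwaway local HTTP server.
    server = HTTPServer(('127.0.0.1', 0), _OkHandler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    url = f'http://127.0.0.1:{server.server_port}/'

    session_ids: list[str | None] = []
    crawler = BasicCrawler(max_session_rotations=7)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        session_ids.append(context.session.id if context.session else None)
        raise SessionError('force a session rotation')

    await crawler.run([url])
    server.shutdown()

    # Seven rotations, each with a distinct, non-None session.
    assert len(session_ids) == 7
    assert None not in session_ids
    assert len(set(session_ids)) == 7


if __name__ == '__main__':
    asyncio.run(main())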
