Skip to content

Commit 6bbd977

Browse files
committed
RQ FS performance issues in fetch_next_request
1 parent 52a5a79 commit 6bbd977

File tree

1 file changed

+9
-13
lines changed

1 file changed

+9
-13
lines changed

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
5757
_STORAGE_SUBSUBDIR_DEFAULT = 'default'
5858
"""The name of the subdirectory for the default request queue."""
5959

60+
_MAX_REQUESTS_IN_CACHE = 100_000
61+
"""Maximum number of requests to keep in cache for faster access."""
62+
6063
def __init__(
6164
self,
6265
*,
@@ -112,9 +115,6 @@ def __init__(
112115
self._in_progress = set[str]()
113116
"""A set of request IDs that are currently being processed."""
114117

115-
self._cache_size = 50
116-
"""Maximum number of requests to keep in cache."""
117-
118118
self._request_cache = deque[Request]()
119119
"""Cache for requests: forefront requests at the beginning, regular requests at the end."""
120120

@@ -463,10 +463,6 @@ async def fetch_next_request(self) -> Request | None:
463463
if candidate.id not in self._in_progress:
464464
next_request = candidate
465465

466-
# If cache is getting low, mark for refresh on next call.
467-
if len(self._request_cache) < self._cache_size // 4:
468-
self._cache_needs_refresh = True
469-
470466
if next_request is not None:
471467
self._in_progress.add(next_request.id)
472468

@@ -678,15 +674,15 @@ async def _update_metadata(
678674
async def _refresh_cache(self) -> None:
679675
"""Refresh the request cache from filesystem.
680676
681-
This method loads up to _cache_size requests from the filesystem,
677+
This method loads up to _MAX_REQUESTS_IN_CACHE requests from the filesystem,
682678
prioritizing forefront requests and maintaining proper ordering.
683679
"""
684680
self._request_cache.clear()
685681

686-
request_files = await self._get_request_files(self.path_to_rq)
682+
forefront_requests = list[Request]()
683+
regular_requests = list[Request]()
687684

688-
forefront_requests = []
689-
regular_requests = []
685+
request_files = await self._get_request_files(self.path_to_rq)
690686

691687
for request_file in request_files:
692688
request = await self._parse_request_file(request_file)
@@ -726,13 +722,13 @@ async def _refresh_cache(self) -> None:
726722
# Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted
727723
# by sequence (newest first), we need to add them in reverse order to maintain correct priority.
728724
for request in reversed(forefront_requests):
729-
if len(self._request_cache) >= self._cache_size:
725+
if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:
730726
break
731727
self._request_cache.appendleft(request)
732728

733729
# Add regular requests to the end of the cache (right side).
734730
for request in regular_requests:
735-
if len(self._request_cache) >= self._cache_size:
731+
if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE:
736732
break
737733
self._request_cache.append(request)
738734

0 commit comments

Comments
 (0)