From 6f20be530e4eac7bdec3bc111e22b97f0c038860 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E7=9A=AE=E5=B9=BC=E9=B8=9F?= <2960474346@qq.com>
Date: Tue, 19 May 2026 20:35:15 +0800
Subject: [PATCH] fix: cache worker IPC state for prealloc thread

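Avoid calling the Python worker IPC callback from the C++ prealloc
thread. On single-GPU startup, that callback can block on the GIL before
the first page mapping call, which hangs background preallocation and
keeps the server from becoming ready.

Instead, cache the callback's result when the callback is registered and
whenever a non-prealloc thread queries it; the prealloc thread reads
only the cached value.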
---
 csrc/inc/page_allocator.hpp |  1 +
 csrc/page_allocator.cpp     | 27 +++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/csrc/inc/page_allocator.hpp b/csrc/inc/page_allocator.hpp
index 6538bde5..43403d1b 100644
--- a/csrc/inc/page_allocator.hpp
+++ b/csrc/inc/page_allocator.hpp
@@ -155,6 +155,7 @@ class PageAllocator {
   BroadcastMapCallback broadcast_map_callback_;
   BroadcastUnmapCallback broadcast_unmap_callback_;
   ShouldUseWorkerIpcCallback should_use_worker_ipc_callback_;
+  mutable std::atomic<bool> should_use_worker_ipc_cached_{false};
 };
 
 } // namespace kvcached
diff --git a/csrc/page_allocator.cpp b/csrc/page_allocator.cpp
index f387abce..9c77ffe7 100644
--- a/csrc/page_allocator.cpp
+++ b/csrc/page_allocator.cpp
@@ -507,8 +507,11 @@ void PageAllocator::set_broadcast_unmap_callback(
 
 void PageAllocator::set_should_use_worker_ipc_callback(
     ShouldUseWorkerIpcCallback callback) {
+  bool use_worker_ipc = callback ? callback() : false;
   std::lock_guard<std::mutex> lock(lock_);
-  should_use_worker_ipc_callback_ = callback;
+  should_use_worker_ipc_callback_ = std::move(callback);
+  should_use_worker_ipc_cached_.store(use_worker_ipc,
+                                      std::memory_order_release);
   LOGGER(INFO, "Should-use-worker-ipc callback set for PageAllocator");
 }
 
@@ -731,10 +734,26 @@ void PageAllocator::stop_prealloc_thread_internal() {
 }
 
 bool PageAllocator::should_use_worker_ipc() const {
-  if (should_use_worker_ipc_callback_) {
-    return should_use_worker_ipc_callback_();
+  ShouldUseWorkerIpcCallback callback;
+  {
+    std::lock_guard<std::mutex> lock(lock_);
+    if (prealloc_thread_ &&
+        prealloc_thread_->get_id() == std::this_thread::get_id()) {
+      // The background prealloc thread must not re-enter Python here. It only
+      // consumes the cached decision, which non-prealloc callers refresh.
+      return should_use_worker_ipc_cached_.load(std::memory_order_acquire);
+    }
+    callback = should_use_worker_ipc_callback_;
   }
-  return false;
+
+  if (callback) {
+    bool use_worker_ipc = callback();
+    should_use_worker_ipc_cached_.store(use_worker_ipc,
+                                        std::memory_order_release);
+    return use_worker_ipc;
+  }
+
+  return should_use_worker_ipc_cached_.load(std::memory_order_acquire);
 }
 
 } // namespace kvcached
\ No newline at end of file