From be3fb3fd5593647e7357176abc70ab318dc16a8d Mon Sep 17 00:00:00 2001 From: Aaron Oneal Date: Wed, 17 Jun 2026 22:10:02 -0700 Subject: [PATCH 1/4] fix: properly detect zombie backend processes in is_running() When a backend process (e.g. llama-server) is OOM-killed by the kernel, it becomes a zombie () until reaped by its parent. The existing is_running() fallback used kill(pid, 0) which returns success for zombies since the PID still exists in the kernel process table. This caused the watchdog to incorrectly report the zombie as running, preventing automatic restart. Fix: Add a waitpid(pid, &status, WNOHANG) call before the kill(pid, 0) fallback to reap any zombie process. If waitpid returns > 0, the process has exited (including zombie state), so return false (not running). Only fall through to kill(pid, 0) if the process is truly alive. --- src/cpp/server/utils/platform/process_unix.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cpp/server/utils/platform/process_unix.cpp b/src/cpp/server/utils/platform/process_unix.cpp index 81e3b4250..62bfbc3e5 100644 --- a/src/cpp/server/utils/platform/process_unix.cpp +++ b/src/cpp/server/utils/platform/process_unix.cpp @@ -370,6 +370,16 @@ bool UnixProcessPlatform::is_running(ProcessHandle handle) { } #endif + // Reap any zombie before falling back to kill(). kill(pid,0) returns + // success for zombies (PID still in kernel table) so we must try to wait + // on the child first. + int status = 0; + pid_t result = waitpid(handle.pid, &status, WNOHANG); + if (result > 0) { + // Process has exited (including zombie) — reap it and report dead. + return false; + } + errno = 0; return ::kill(handle.pid, 0) == 0 || errno == EPERM; } From ac327f849a7039135e822771fb9cf29cbbfc7bd0 Mon Sep 17 00:00:00 2001 From: Aaron Oneal Date: Thu, 18 Jun 2026 22:13:51 -0700 Subject: [PATCH 2/4] fix(process-manager): make is_running() non-mutating on Linux Use /proc//stat to detect zombie processes instead of waitpid(), preventing unintended reaping. Add CMake test target and unit tests to verify the non-mutating contract is preserved. --- CMakeLists.txt | 31 +++ .../server/utils/platform/process_unix.cpp | 52 ++++- test/cpp/test_process_manager.cpp | 218 ++++++++++++++++++ 3 files changed, 294 insertions(+), 7 deletions(-) create mode 100644 test/cpp/test_process_manager.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e125642f..6c93ba2a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1720,3 +1720,34 @@ if(EXISTS "${_GGUF_CAPS_TEST_SRC}") include(CTest) add_test(NAME GgufCapabilitiesTest COMMAND test_gguf_capabilities) endif() + +# ProcessManager non-mutating is_running() test (POSIX). +set(_PM_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_process_manager.cpp" +) +set(_PM_TEST_LIBS + "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/process_manager.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/platform/process_unix.cpp" +) + +list(GET _PM_TEST_LIBS 0 _pm_lib0) +list(GET _PM_TEST_LIBS 1 _pm_lib1) +if(EXISTS "${_PM_TEST_SRC}" AND EXISTS "${_pm_lib0}" AND EXISTS "${_pm_lib1}") + add_executable(test_process_manager + test/cpp/test_process_manager.cpp + src/cpp/server/utils/process_manager.cpp + src/cpp/server/utils/platform/process_unix.cpp + ) + target_include_directories(test_process_manager PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/include + ${CMAKE_CURRENT_BINARY_DIR}/include + ) + if(UNIX) + target_link_libraries(test_process_manager PRIVATE pthread) + target_link_options(test_process_manager PRIVATE -pthread) + endif() + + # Enable testing and register the test with CTest + include(CTest) + add_test(NAME ProcessManagerTest COMMAND test_process_manager) +endif() diff --git a/src/cpp/server/utils/platform/process_unix.cpp b/src/cpp/server/utils/platform/process_unix.cpp index 62bfbc3e5..11497605f 100644 --- a/src/cpp/server/utils/platform/process_unix.cpp +++ b/src/cpp/server/utils/platform/process_unix.cpp @@ -21,6 +21,9 @@ #ifdef __linux__ #include +#include +#include +#include #endif #ifdef HAVE_LIBCAP @@ -82,6 +85,45 @@ static void preserve_capabilities_for_exec() { } #endif +#ifdef __linux__ +// Check whether a process is a zombie by reading /proc//stat. +// Returns true if the process exists and is in zombie ('Z') state, +// false otherwise (including if the proc entry doesn't exist). +// This is non-mutating — it does not reap the child. +static bool is_zombie_by_proc(pid_t pid) { + char path[64]; + std::snprintf(path, sizeof(path), "/proc/%d/stat", pid); + std::ifstream f(path); + if (!f.is_open()) { + return false; + } + std::string line; + if (!std::getline(f, line)) { + return false; + } + // /proc//stat format: pid (comm) field3 field4 ... + // The comm field is enclosed in parentheses and may contain spaces, + // so we find both delimiters to correctly skip to the field list. + auto open_paren = line.find('('); + if (open_paren == std::string::npos) { + return false; + } + auto close_paren = line.rfind(')'); + if (close_paren == std::string::npos || close_paren <= open_paren) { + return false; + } + // Everything after the closing ')' is the remaining fields. + // Field 1 after ')' is the process state character. + std::string rest = line.substr(close_paren + 1); + std::istringstream iss(rest); + char state; + if (!(iss >> state)) { + return false; + } + return state == 'Z'; +} +#endif + class UnixProcessPlatform : public ProcessPlatform { public: ProcessHandle spawn( @@ -370,15 +412,11 @@ bool UnixProcessPlatform::is_running(ProcessHandle handle) { } #endif - // Reap any zombie before falling back to kill(). kill(pid,0) returns - // success for zombies (PID still in kernel table) so we must try to wait - // on the child first. - int status = 0; - pid_t result = waitpid(handle.pid, &status, WNOHANG); - if (result > 0) { - // Process has exited (including zombie) — reap it and report dead. +#ifdef __linux__ + if (is_zombie_by_proc(handle.pid)) { return false; } +#endif errno = 0; return ::kill(handle.pid, 0) == 0 || errno == EPERM; diff --git a/test/cpp/test_process_manager.cpp b/test/cpp/test_process_manager.cpp new file mode 100644 index 000000000..6cc384ad5 --- /dev/null +++ b/test/cpp/test_process_manager.cpp @@ -0,0 +1,218 @@ +// Standalone test for ProcessManager::is_running() non-mutating contract on POSIX. +// +// Verifies: +// 1. is_running() returns true for a running process +// 2. is_running() returns false for an exited process WITHOUT reaping it +// 3. reap_process() retrieves the real exit code after is_running() returned false +// +// Compile: +// g++ -std=c++17 -I src/cpp/include test/cpp/test_process_manager.cpp \ +// src/cpp/server/utils/process_manager.cpp \ +// src/cpp/server/utils/platform/process_unix.cpp \ +// -pthread -o test_process_manager +// +// Run: +// ./test_process_manager + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#include +#endif + +using lemon::utils::ProcessHandle; +using lemon::utils::ProcessManager; + +static int g_failures = 0; + +static void check(const char* name, bool ok) { + std::printf("[%s] %s\n", ok ? "PASS" : "FAIL", name); + if (!ok) ++g_failures; +} + +// Spawn a child via fork() that exits with a known code. +// Returns the child PID (or -1 on failure). +static pid_t spawn_child(int exit_code) { + pid_t pid = fork(); + if (pid < 0) { + return -1; + } + if (pid == 0) { + _exit(exit_code); + } + return pid; +} + +// Create a ProcessHandle from a raw PID. +static ProcessHandle make_handle(pid_t pid) { + ProcessHandle h; + h.handle = nullptr; + h.pid = static_cast(pid); + return h; +} + +// Wait for a child to exit without reaping it. +// Uses waitid(WNOWAIT) on platforms that support it, or /proc//stat on Linux. +// Returns the child PID when it has exited (is a zombie), or -1 on timeout. +static pid_t wait_for_exit_no_reap(pid_t child, int timeout_ms = 5000) { + auto start = std::chrono::steady_clock::now(); + while (true) { +#ifdef WNOWAIT + siginfo_t info; + std::memset(&info, 0, sizeof(info)); + if (waitid(P_PID, static_cast(child), &info, WEXITED | WNOHANG | WNOWAIT) == 0) { + if (info.si_pid != 0) { + return child; + } + } +#endif +#ifdef __linux__ + { + char path[64]; + std::snprintf(path, sizeof(path), "/proc/%d/stat", child); + std::ifstream f(path); + if (f.is_open()) { + std::string line; + if (std::getline(f, line)) { + auto open_paren = line.find('('); + if (open_paren != std::string::npos) { + auto close_paren = line.rfind(')'); + if (close_paren != std::string::npos && close_paren > open_paren) { + std::string rest = line.substr(close_paren + 1); + std::istringstream iss(rest); + char state; + if (iss >> state && state == 'Z') { + return child; + } + } + } + } + } + } +#endif + // Fallback: try waitpid with WNOHANG. If the child has already + // been reaped by something else (or we lost the race), return -1 + // to indicate we couldn't verify the exit without consuming it. + int status = 0; + pid_t r = waitpid(child, &status, WNOHANG); + if (r == child) { + return -1; + } + auto now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - start).count() > timeout_ms) { + return -1; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +int main() { + // --- Test: is_running() returns false for an exited child --- + { + pid_t child = spawn_child(42); + check("spawn_child returns valid PID", child > 0); + + // Wait for the child to exit without reaping it. + pid_t exited = wait_for_exit_no_reap(child); + check("child exited (detected without reap)", exited > 0); + + if (exited > 0) { + ProcessHandle h = make_handle(exited); + + // is_running() must NOT reap the child — it should just report + // that the process is no longer running. + bool running = ProcessManager::is_running(h); + check("is_running() returns false for exited child", !running); + + // reap_process() should retrieve the real exit code (42). + int exit_code = ProcessManager::reap_process(h); + check("reap_process() returns real exit code 42", exit_code == 42); + } else { + // wait_for_exit_no_reap timed out — child already exited, so it's + // a zombie. Clean it up to avoid leaking zombies. + int status = 0; + waitpid(child, &status, 0); + } + } + + // --- Test: is_running() returns false for PID 0 --- + { + ProcessHandle h = make_handle(0); + check("is_running() returns false for PID 0", !ProcessManager::is_running(h)); + } + + // --- Test: is_running() returns false for negative PID --- + { + ProcessHandle h = make_handle(-1); + check("is_running() returns false for negative PID", !ProcessManager::is_running(h)); + } + + // --- Test: is_running() returns false for a non-existent PID --- + { + // Use a PID that almost certainly doesn't exist. + ProcessHandle h = make_handle(999999); + check("is_running() returns false for non-existent PID", !ProcessManager::is_running(h)); + } + + // --- Test: is_running() returns true for a running process --- + { + pid_t child = fork(); + check("fork succeeds", child >= 0); + + if (child > 0) { + ProcessHandle h = make_handle(child); + check("is_running() returns true for running child", ProcessManager::is_running(h)); + + // Clean up: kill the child and reap. + kill(child, SIGKILL); + int status = 0; + waitpid(child, &status, 0); + } else if (child == 0) { + // Parent will SIGKILL us — just sleep forever. + while (true) { + sleep(1); + } + } + } + + // --- Test: reap_process() returns -1 for a still-running process --- + { + pid_t child = fork(); + check("fork for reap test", child >= 0); + + if (child > 0) { + ProcessHandle h = make_handle(child); + // Give the child a moment to start. + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + int rc = ProcessManager::reap_process(h); + check("reap_process() returns -1 for running process", rc == -1); + + // Clean up. + kill(child, SIGKILL); + int status = 0; + waitpid(child, &status, 0); + } else if (child == 0) { + while (true) { + sleep(1); + } + } + } + + if (g_failures == 0) { + std::printf("\nAll process_manager tests passed\n"); + return 0; + } + std::printf("\n%d process_manager test(s) FAILED\n", g_failures); + return 1; +} From f200f7034760149efcd219bd376a471ee16cab4d Mon Sep 17 00:00:00 2001 From: Aaron Oneal Date: Fri, 19 Jun 2026 08:34:58 -0700 Subject: [PATCH 3/4] build(cmake): restrict ProcessManager test to Linux --- CMakeLists.txt | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c93ba2a0..318ed4b7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1721,33 +1721,33 @@ if(EXISTS "${_GGUF_CAPS_TEST_SRC}") add_test(NAME GgufCapabilitiesTest COMMAND test_gguf_capabilities) endif() -# ProcessManager non-mutating is_running() test (POSIX). -set(_PM_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_process_manager.cpp" -) -set(_PM_TEST_LIBS - "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/process_manager.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/platform/process_unix.cpp" -) - -list(GET _PM_TEST_LIBS 0 _pm_lib0) -list(GET _PM_TEST_LIBS 1 _pm_lib1) -if(EXISTS "${_PM_TEST_SRC}" AND EXISTS "${_pm_lib0}" AND EXISTS "${_pm_lib1}") - add_executable(test_process_manager - test/cpp/test_process_manager.cpp - src/cpp/server/utils/process_manager.cpp - src/cpp/server/utils/platform/process_unix.cpp +# ProcessManager non-mutating is_running() test (Linux /proc-specific). +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(_PM_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/test/cpp/test_process_manager.cpp" ) - target_include_directories(test_process_manager PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/include - ${CMAKE_CURRENT_BINARY_DIR}/include + set(_PM_TEST_LIBS + "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/process_manager.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/utils/platform/process_unix.cpp" ) - if(UNIX) + + list(GET _PM_TEST_LIBS 0 _pm_lib0) + list(GET _PM_TEST_LIBS 1 _pm_lib1) + if(EXISTS "${_PM_TEST_SRC}" AND EXISTS "${_pm_lib0}" AND EXISTS "${_pm_lib1}") + add_executable(test_process_manager + test/cpp/test_process_manager.cpp + src/cpp/server/utils/process_manager.cpp + src/cpp/server/utils/platform/process_unix.cpp + ) + target_include_directories(test_process_manager PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/include + ${CMAKE_CURRENT_BINARY_DIR}/include + ) target_link_libraries(test_process_manager PRIVATE pthread) target_link_options(test_process_manager PRIVATE -pthread) - endif() - # Enable testing and register the test with CTest - include(CTest) - add_test(NAME ProcessManagerTest COMMAND test_process_manager) + # Enable testing and register the test with CTest + include(CTest) + add_test(NAME ProcessManagerTest COMMAND test_process_manager) + endif() endif() From 5f9b37ff04f05ed29e942556eb885f47c6097f7a Mon Sep 17 00:00:00 2001 From: Aaron Oneal Date: Sat, 27 Jun 2026 09:18:08 -0700 Subject: [PATCH 4/4] chore(process): remove zombie detection comments --- src/cpp/server/utils/platform/process_unix.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/cpp/server/utils/platform/process_unix.cpp b/src/cpp/server/utils/platform/process_unix.cpp index 11497605f..c4da2d6b7 100644 --- a/src/cpp/server/utils/platform/process_unix.cpp +++ b/src/cpp/server/utils/platform/process_unix.cpp @@ -86,10 +86,6 @@ static void preserve_capabilities_for_exec() { #endif #ifdef __linux__ -// Check whether a process is a zombie by reading /proc//stat. -// Returns true if the process exists and is in zombie ('Z') state, -// false otherwise (including if the proc entry doesn't exist). -// This is non-mutating — it does not reap the child. static bool is_zombie_by_proc(pid_t pid) { char path[64]; std::snprintf(path, sizeof(path), "/proc/%d/stat", pid);