From 8c149d4b4be0acec659cb66900d5902e99b4ebe4 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 2 Oct 2025 18:47:36 +0000 Subject: [PATCH 1/3] Fail fast on non-running container states Co-authored-by: victorgelias --- ai/worker/docker.go | 25 +++++++++++++++++- ai/worker/docker_test.go | 57 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/ai/worker/docker.go b/ai/worker/docker.go index 4e16d5d0b..2977121f3 100644 --- a/ai/worker/docker.go +++ b/ai/worker/docker.go @@ -780,9 +780,32 @@ tickerLoop: return err } - if json.State.Running { + // If the container is running, we're done. + if json.State.Running { break tickerLoop } + + // Detect terminal/non-recoverable states early and fail fast instead of + // waiting for the full timeout. This covers cases where the container + // was stopped or killed immediately after startup (e.g., by another + // orchestrator cleaning up), leaving it in a non-running state that + // will never transition to running. + if json.State != nil { + status := strings.ToLower(json.State.Status) + // Docker statuses can be: "created", "restarting", "running", + // "removing", "paused", "exited", or "dead". + // Treat exited/dead/removing as terminal. If not running and not + // restarting, and we have a non-zero exit code or an error, also fail. + if status == "exited" || status == "dead" || status == "removing" { + return fmt.Errorf("container entered terminal state before running: %s (exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.ExitCode != 0 { + return fmt.Errorf("container exited before running (status=%s, exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.Error != "" { + return fmt.Errorf("container error before running: %s", json.State.Error) + } + } } } diff --git a/ai/worker/docker_test.go b/ai/worker/docker_test.go index adb3a33fd..920e530a9 100644 --- a/ai/worker/docker_test.go +++ b/ai/worker/docker_test.go @@ -1165,6 +1165,63 @@ func TestDockerWaitUntilRunning(t *testing.T) { require.Contains(t, err.Error(), "timed out waiting for managed container") mockDockerClient.AssertExpectations(t) }) + + t.Run("FailFastOnExited", func(t *testing.T) { + // If the container is immediately exited, we should fail fast instead of waiting. + mockDockerClient := new(MockDockerClient) + // Always return non-running, exited state + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "exited", + Running: false, + ExitCode: 137, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnDead", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "dead", + Running: false, + Error: "killed", + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "container entered terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnExitCodeNonZeroWithoutRestarting", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "created", + Running: false, + Restarting: false, + ExitCode: 1, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "exited before running") + mockDockerClient.AssertExpectations(t) + }) } func TestHwGPU(t *testing.T) { From 07110e28a824aa3d267bf41ba6275c6673349549 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Thu, 2 Oct 2025 18:55:51 +0000 Subject: [PATCH 2/3] Refactor container state checking to fail fast Co-authored-by: victorgelias --- ai/worker/docker.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/ai/worker/docker.go b/ai/worker/docker.go index 2977121f3..794fdced9 100644 --- a/ai/worker/docker.go +++ b/ai/worker/docker.go @@ -785,18 +785,12 @@ tickerLoop: break tickerLoop } - // Detect terminal/non-recoverable states early and fail fast instead of - // waiting for the full timeout. This covers cases where the container - // was stopped or killed immediately after startup (e.g., by another - // orchestrator cleaning up), leaving it in a non-running state that - // will never transition to running. + // Fail fast on states that won't become running after startup. if json.State != nil { status := strings.ToLower(json.State.Status) - // Docker statuses can be: "created", "restarting", "running", - // "removing", "paused", "exited", or "dead". - // Treat exited/dead/removing as terminal. If not running and not - // restarting, and we have a non-zero exit code or an error, also fail. - if status == "exited" || status == "dead" || status == "removing" { + // Consider exited/dead as terminal. "removing" will surface via + // inspect error or transition to exited/dead shortly. + if status == "exited" || status == "dead" { return fmt.Errorf("container entered terminal state before running: %s (exitCode=%d)", json.State.Status, json.State.ExitCode) } if !json.State.Restarting && json.State.ExitCode != 0 { From df1704f3f2581e533d49dd1b50ba5b401cc33a64 Mon Sep 17 00:00:00 2001 From: Victor Elias Date: Thu, 2 Oct 2025 16:10:15 -0300 Subject: [PATCH 3/3] why can't ai lint --- ai/worker/docker.go | 34 ++++++------ ai/worker/docker_test.go | 112 +++++++++++++++++++-------------------- 2 files changed, 73 insertions(+), 73 deletions(-) diff --git a/ai/worker/docker.go b/ai/worker/docker.go index 794fdced9..2756ab6fc 100644 --- a/ai/worker/docker.go +++ b/ai/worker/docker.go @@ -780,26 +780,26 @@ tickerLoop: return err } - // If the container is running, we're done. - if json.State.Running { + // If the container is running, we're done. + if json.State.Running { break tickerLoop } - // Fail fast on states that won't become running after startup. - if json.State != nil { - status := strings.ToLower(json.State.Status) - // Consider exited/dead as terminal. "removing" will surface via - // inspect error or transition to exited/dead shortly. - if status == "exited" || status == "dead" { - return fmt.Errorf("container entered terminal state before running: %s (exitCode=%d)", json.State.Status, json.State.ExitCode) - } - if !json.State.Restarting && json.State.ExitCode != 0 { - return fmt.Errorf("container exited before running (status=%s, exitCode=%d)", json.State.Status, json.State.ExitCode) - } - if !json.State.Restarting && json.State.Error != "" { - return fmt.Errorf("container error before running: %s", json.State.Error) - } - } + // Fail fast on states that won't become running after startup. + if json.State != nil { + status := strings.ToLower(json.State.Status) + // Consider exited/dead as terminal. "removing" will surface via + // inspect error or transition to exited/dead shortly. + if status == "exited" || status == "dead" { + return fmt.Errorf("container entered terminal state before running: %s (exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.ExitCode != 0 { + return fmt.Errorf("container exited before running (status=%s, exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.Error != "" { + return fmt.Errorf("container error before running: %s", json.State.Error) + } + } } } diff --git a/ai/worker/docker_test.go b/ai/worker/docker_test.go index 920e530a9..ea9d2f975 100644 --- a/ai/worker/docker_test.go +++ b/ai/worker/docker_test.go @@ -1166,62 +1166,62 @@ func TestDockerWaitUntilRunning(t *testing.T) { mockDockerClient.AssertExpectations(t) }) - t.Run("FailFastOnExited", func(t *testing.T) { - // If the container is immediately exited, we should fail fast instead of waiting. - mockDockerClient := new(MockDockerClient) - // Always return non-running, exited state - mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ - ContainerJSONBase: &types.ContainerJSONBase{ - State: &types.ContainerState{ - Status: "exited", - Running: false, - ExitCode: 137, - }, - }, - }, nil) - - err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) - require.Error(t, err) - require.Contains(t, err.Error(), "terminal state") - mockDockerClient.AssertExpectations(t) - }) - - t.Run("FailFastOnDead", func(t *testing.T) { - mockDockerClient := new(MockDockerClient) - mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ - ContainerJSONBase: &types.ContainerJSONBase{ - State: &types.ContainerState{ - Status: "dead", - Running: false, - Error: "killed", - }, - }, - }, nil) - - err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) - require.Error(t, err) - require.Contains(t, err.Error(), "container entered terminal state") - mockDockerClient.AssertExpectations(t) - }) - - t.Run("FailFastOnExitCodeNonZeroWithoutRestarting", func(t *testing.T) { - mockDockerClient := new(MockDockerClient) - mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ - ContainerJSONBase: &types.ContainerJSONBase{ - State: &types.ContainerState{ - Status: "created", - Running: false, - Restarting: false, - ExitCode: 1, - }, - }, - }, nil) - - err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) - require.Error(t, err) - require.Contains(t, err.Error(), "exited before running") - mockDockerClient.AssertExpectations(t) - }) + t.Run("FailFastOnExited", func(t *testing.T) { + // If the container is immediately exited, we should fail fast instead of waiting. + mockDockerClient := new(MockDockerClient) + // Always return non-running, exited state + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "exited", + Running: false, + ExitCode: 137, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnDead", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "dead", + Running: false, + Error: "killed", + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "container entered terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnExitCodeNonZeroWithoutRestarting", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "created", + Running: false, + Restarting: false, + ExitCode: 1, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "exited before running") + mockDockerClient.AssertExpectations(t) + }) } func TestHwGPU(t *testing.T) {