diff --git a/ai/worker/docker.go b/ai/worker/docker.go index 4e16d5d0b..2756ab6fc 100644 --- a/ai/worker/docker.go +++ b/ai/worker/docker.go @@ -780,9 +780,26 @@ tickerLoop: return err } + // If the container is running, we're done. if json.State.Running { break tickerLoop } + + // Fail fast on states that won't become running after startup. + if json.State != nil { + status := strings.ToLower(json.State.Status) + // Consider exited/dead as terminal. "removing" will surface via + // inspect error or transition to exited/dead shortly. + if status == "exited" || status == "dead" { + return fmt.Errorf("container entered terminal state before running: %s (exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.ExitCode != 0 { + return fmt.Errorf("container exited before running (status=%s, exitCode=%d)", json.State.Status, json.State.ExitCode) + } + if !json.State.Restarting && json.State.Error != "" { + return fmt.Errorf("container error before running: %s", json.State.Error) + } + } } } diff --git a/ai/worker/docker_test.go b/ai/worker/docker_test.go index adb3a33fd..ea9d2f975 100644 --- a/ai/worker/docker_test.go +++ b/ai/worker/docker_test.go @@ -1165,6 +1165,63 @@ func TestDockerWaitUntilRunning(t *testing.T) { require.Contains(t, err.Error(), "timed out waiting for managed container") mockDockerClient.AssertExpectations(t) }) + + t.Run("FailFastOnExited", func(t *testing.T) { + // If the container is immediately exited, we should fail fast instead of waiting. + mockDockerClient := new(MockDockerClient) + // Always return non-running, exited state + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "exited", + Running: false, + ExitCode: 137, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnDead", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "dead", + Running: false, + Error: "killed", + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "container entered terminal state") + mockDockerClient.AssertExpectations(t) + }) + + t.Run("FailFastOnExitCodeNonZeroWithoutRestarting", func(t *testing.T) { + mockDockerClient := new(MockDockerClient) + mockDockerClient.On("ContainerInspect", mock.Anything, containerID).Return(types.ContainerJSON{ + ContainerJSONBase: &types.ContainerJSONBase{ + State: &types.ContainerState{ + Status: "created", + Running: false, + Restarting: false, + ExitCode: 1, + }, + }, + }, nil) + + err := dockerWaitUntilRunning(ctx, mockDockerClient, containerID, pollingInterval) + require.Error(t, err) + require.Contains(t, err.Error(), "exited before running") + mockDockerClient.AssertExpectations(t) + }) } func TestHwGPU(t *testing.T) {