volcano-sh · Aman-Cool · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026 · gemini-code-assist
diff --git a/pkg/workloadmanager/handlers.go b/pkg/workloadmanager/handlers.go
@@ -133,7 +133,7 @@
 	response, err := s.createSandbox(c.Request.Context(), dynamicClient, sandbox, sandboxClaim, sandboxEntry, resultChan)
 	if err != nil {
 		klog.Errorf("create sandbox failed %s/%s: %v", sandbox.Namespace, sandbox.Name, err)
-		respondError(c, http.StatusInternalServerError, "internal server error")
+		respondError(c, http.StatusInternalServerError, err.Error())
 		return
 	}
 
@@ -141,7 +141,7 @@
 }

 // createSandbox performs sandbox creation and returns the response payload or an error with an HTTP status code.
 func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interface, sandbox *sandboxv1alpha1.Sandbox, sandboxClaim *extensionsv1alpha1.SandboxClaim, sandboxEntry *sandboxEntry, resultChan <-chan SandboxStatusUpdate) (*types.CreateSandboxResponse, error) {
 	// Store placeholder before creating, make sandbox/sandboxClaim GarbageCollection possible
 	sandboxStorePlaceHolder := buildSandboxPlaceHolder(sandbox, sandboxEntry)
 	if err := s.storeClient.StoreSandbox(ctx, sandboxStorePlaceHolder); err != nil {
@@ -197,12 +197,25 @@
 		sandboxRollbackFunc()
 	}()
 
+	// Use NewTimer so we can stop it explicitly when another branch wins,
+	// preventing the runtime from retaining the timer until it fires.
+	timer := time.NewTimer(2 * time.Minute) // consistent with router settings
+
 	var createdSandbox *sandboxv1alpha1.Sandbox
 	select {
 	case result := <-resultChan:
+		timer.Stop()
+		if result.Err != nil {
+			klog.Warningf("sandbox %s/%s failed: %v", sandbox.Namespace, sandbox.Name, result.Err)
+			return nil, result.Err
+		}
 		createdSandbox = result.Sandbox
 		klog.V(2).Infof("sandbox %s/%s running", createdSandbox.Namespace, createdSandbox.Name)
-	case <-time.After(2 * time.Minute): // consistent with router settings
+	case <-ctx.Done():
+		timer.Stop()
+		klog.Warningf("sandbox %s/%s wait canceled: %v", sandbox.Namespace, sandbox.Name, ctx.Err())
+		return nil, fmt.Errorf("sandbox creation canceled: %w", ctx.Err())
+	case <-timer.C:
 		klog.Warningf("sandbox %s/%s create timed out", sandbox.Namespace, sandbox.Name)
 		return nil, fmt.Errorf("sandbox creation timed out")
 	}

diff --git a/pkg/workloadmanager/handlers_test.go b/pkg/workloadmanager/handlers_test.go
@@ -324,7 +324,7 @@ func TestHandleSandboxCreate(t *testing.T) {
 			body:              `{"name":"workload","namespace":"ns"}`,
 			createErr:         errors.New("create failed"),
 			expectStatus:      http.StatusInternalServerError,
-			expectMessage:     "internal server error",
+			expectMessage:     "create failed",
 			expectCreateCalls: 1,
 		},
 		{

diff --git a/pkg/workloadmanager/sandbox_controller.go b/pkg/workloadmanager/sandbox_controller.go
@@ -18,6 +18,7 @@ package workloadmanager
 
 import (
 	"context"
+	"fmt"
 	"sync"
 
 	"k8s.io/apimachinery/pkg/runtime"
@@ -40,6 +41,7 @@ type SandboxReconciler struct {
 
 type SandboxStatusUpdate struct {
 	Sandbox *sandboxv1alpha1.Sandbox
+	Err     error
 }
 
 func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
@@ -48,30 +50,46 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
 
-	status := getSandboxStatus(sandbox)
-
-	// Check for pending requests with proper locking
-	if status == "running" {
-		klog.V(2).Infof("Sandbox %s/%s is running, sending notification", sandbox.Namespace, sandbox.Name)
-		r.mu.Lock()
-		resultChan, exists := r.watchers[req.NamespacedName]
-		if exists {
-			klog.V(2).Infof("Found %d pending requests for sandbox %s/%s", len(r.watchers), sandbox.Namespace, sandbox.Name)
-			// Remove from map before sending to avoid memory leak
-			delete(r.watchers, req.NamespacedName)
-		} else {
-			klog.V(2).Infof("No pending requests found for sandbox %s/%s", sandbox.Namespace, sandbox.Name)
+	status, failMsg := getSandboxStatus(sandbox)
+
+	// Only notify the waiter on a terminal state (running or failed).
+	// "unknown" means the sandbox is still being scheduled/started; stay quiet.
+	var (
+		update  SandboxStatusUpdate
+		hasWork bool
+	)
+	switch status {
+	case "running":
+		klog.V(2).Infof("Sandbox %s/%s is running, notifying waiter", sandbox.Namespace, sandbox.Name)
+		update = SandboxStatusUpdate{Sandbox: sandbox}
+		hasWork = true
+	case "failed":
+		klog.Warningf("Sandbox %s/%s entered a terminal failure state: %s", sandbox.Namespace, sandbox.Name, failMsg)
+		update = SandboxStatusUpdate{
+			Sandbox: sandbox,
+			Err:     fmt.Errorf("sandbox %s/%s failed: %s", sandbox.Namespace, sandbox.Name, failMsg),
 		}
-		r.mu.Unlock()
-
-		if exists {
-			// Send notification outside the lock to avoid deadlock
-			select {
-			case resultChan <- SandboxStatusUpdate{Sandbox: sandbox}:
-				klog.V(2).Infof("Notified waiter about sandbox %s/%s reaching Running state", sandbox.Namespace, sandbox.Name)
-			default:
-				klog.Warningf("Failed to notify watcher for sandbox %s/%s: channel buffer full or not receiving", sandbox.Namespace, sandbox.Name)
-			}
+		hasWork = true
+	default:
+		return ctrl.Result{}, nil
+	}
+
+	r.mu.Lock()
+	resultChan, exists := r.watchers[req.NamespacedName]
+	if exists {
+		delete(r.watchers, req.NamespacedName)
+	}
+	r.mu.Unlock()
+
+	if exists && hasWork {
+		// Send notification outside the lock to avoid deadlock.
+		// The channel has buffer 1 so this never blocks when the map entry
+		// is deleted before the send (only one sender per key is possible).
+		select {
+		case resultChan <- update:
+			klog.V(2).Infof("Notified waiter about sandbox %s/%s (status: %s)", sandbox.Namespace, sandbox.Name, status)
+		default:
+			klog.Warningf("Could not notify waiter for sandbox %s/%s: channel unexpectedly full", sandbox.Namespace, sandbox.Name)
 		}
 	}
 

diff --git a/pkg/workloadmanager/sandbox_helper.go b/pkg/workloadmanager/sandbox_helper.go
@@ -60,17 +60,35 @@ func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *san
 		SessionID:        entry.SessionID,
 		CreatedAt:        createdAt,
 		ExpiresAt:        expiresAt,
-		Status:           getSandboxStatus(sandbox),
+		Status:           sandboxStatusString(sandbox),
 	}
 }
 
-// getSandboxStatus extracts status from Sandbox CRD conditions
-func getSandboxStatus(sandbox *sandboxv1alpha1.Sandbox) string {
-	// Check conditions for Ready status
+// sandboxStatusString returns only the status string (discards the failure message).
+// Use this where only the status label is needed (e.g. store metadata).
+func sandboxStatusString(sandbox *sandboxv1alpha1.Sandbox) string {
+	s, _ := getSandboxStatus(sandbox)
+	return s
+}
+
+// getSandboxStatus extracts status from Sandbox CRD conditions.
+// Returns "running", "failed", or "unknown".
+func getSandboxStatus(sandbox *sandboxv1alpha1.Sandbox) (string, string) {
 	for _, condition := range sandbox.Status.Conditions {
-		if condition.Type == string(sandboxv1alpha1.SandboxConditionReady) && condition.Status == metav1.ConditionTrue {
-			return "running"
+		if condition.Type != string(sandboxv1alpha1.SandboxConditionReady) {
+			continue
+		}
+		if condition.Status == metav1.ConditionTrue {
+			return "running", ""
+		}
+		// ConditionFalse with a reason indicates a terminal failure, not transient pending.
+		if condition.Status == metav1.ConditionFalse && condition.Reason != "" {
+			msg := condition.Message
+			if msg == "" {
+				msg = condition.Reason
+			}
+			return "failed", msg
 		}
 	}
-	return "unknown"
+	return "unknown", ""
 }
diff --git a/pkg/workloadmanager/sandbox_helper_test.go b/pkg/workloadmanager/sandbox_helper_test.go
@@ -207,9 +207,10 @@ func TestBuildSandboxInfo_TableDriven(t *testing.T) {
 
 func TestGetSandboxStatus_TableDriven(t *testing.T) {
 	tests := []struct {
-		name     string
-		sandbox  *sandboxv1alpha1.Sandbox
-		expected string
+		name        string
+		sandbox     *sandboxv1alpha1.Sandbox
+		expected    string
+		expectedMsg string
 	}{
 		{
 			name: "ready condition true",
@@ -223,10 +224,11 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					},
 				},
 			},
-			expected: "running",
+			expected:    "running",
+			expectedMsg: "",
 		},
 		{
-			name: "ready condition false",
+			name: "ready condition false without reason",
 			sandbox: &sandboxv1alpha1.Sandbox{
 				Status: sandboxv1alpha1.SandboxStatus{
 					Conditions: []metav1.Condition{
@@ -237,7 +239,41 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					},
 				},
 			},
-			expected: "unknown",
+			expected:    "unknown",
+			expectedMsg: "",
+		},
+		{
+			name: "ready condition false with reason indicates terminal failure",
+			sandbox: &sandboxv1alpha1.Sandbox{
+				Status: sandboxv1alpha1.SandboxStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:    string(sandboxv1alpha1.SandboxConditionReady),
+							Status:  metav1.ConditionFalse,
+							Reason:  "ErrImagePull",
+							Message: "Back-off pulling image",
+						},
+					},
+				},
+			},
+			expected:    "failed",
+			expectedMsg: "Back-off pulling image",
+		},
+		{
+			name: "ready condition false with reason but no message falls back to reason",
+			sandbox: &sandboxv1alpha1.Sandbox{
+				Status: sandboxv1alpha1.SandboxStatus{
+					Conditions: []metav1.Condition{
+						{
+							Type:   string(sandboxv1alpha1.SandboxConditionReady),
+							Status: metav1.ConditionFalse,
+							Reason: "OOMKilled",
+						},
+					},
+				},
+			},
+			expected:    "failed",
+			expectedMsg: "OOMKilled",
 		},
 		{
 			name: "ready condition unknown",
@@ -251,7 +287,8 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					},
 				},
 			},
-			expected: "unknown",
+			expected:    "unknown",
+			expectedMsg: "",
 		},
 		{
 			name: "no conditions",
@@ -260,7 +297,8 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					Conditions: []metav1.Condition{},
 				},
 			},
-			expected: "unknown",
+			expected:    "unknown",
+			expectedMsg: "",
 		},
 		{
 			name: "nil conditions",
@@ -269,7 +307,8 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					Conditions: nil,
 				},
 			},
-			expected: "unknown",
+			expected:    "unknown",
+			expectedMsg: "",
 		},
 		{
 			name: "other condition type",
@@ -283,7 +322,8 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					},
 				},
 			},
-			expected: "unknown",
+			expected:    "unknown",
+			expectedMsg: "",
 		},
 		{
 			name: "multiple conditions with ready true",
@@ -301,14 +341,16 @@ func TestGetSandboxStatus_TableDriven(t *testing.T) {
 					},
 				},
 			},
-			expected: "running",
+			expected:    "running",
+			expectedMsg: "",
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result := getSandboxStatus(tt.sandbox)
+			result, msg := getSandboxStatus(tt.sandbox)
 			assert.Equal(t, tt.expected, result)
+			assert.Equal(t, tt.expectedMsg, msg)
 		})
 	}
 }