diff --git a/api/v1alpha1/sandbox_types.go b/api/v1alpha1/sandbox_types.go index 3dc16ee39..991ec4e43 100644 --- a/api/v1alpha1/sandbox_types.go +++ b/api/v1alpha1/sandbox_types.go @@ -25,11 +25,38 @@ type ConditionType string func (c ConditionType) String() string { return string(c) } const ( + // SandboxConditionInitialized represents the first-time setup of the sandbox dependencies + SandboxConditionInitialized ConditionType = "Initialized" + // SandboxReasonInitializing indicates the sandbox dependencies are being provisioned + SandboxReasonInitializing = "SandboxInitializing" + // SandboxReasonInitialized indicates the first-time setup of the sandbox is complete. + SandboxReasonInitialized = "SandboxInitialized" + + // SandboxConditionSuspended indicates the sandbox is administratively suspended + SandboxConditionSuspended ConditionType = "Suspended" + // SandboxReasonPendingEvaluation indicates the reason hasn't been evaluated yet by the controller. + SandboxReasonPendingEvaluation = "PendingEvaluation" + // SandboxReasonSuspended indicates the sandbox has been suspended and is not operational + SandboxReasonSuspendedNotOperational = "Suspended" + // SandboxReasonNotSuspended indicates the sandbox is operational and not suspended + SandboxReasonNotSuspended = "NotSuspended" + // SandboxConditionReady indicates readiness for Sandbox SandboxConditionReady ConditionType = "Ready" - - // SandboxReasonExpired indicates expired state for Sandbox + // SandboxReasonReady indicates the sandbox is fully operational + SandboxReasonReady = "SandboxReady" + // SandboxReasonPodInitializing indicates the sandbox pod is being created or starting + SandboxReasonPodInitializing = "SandboxPodInitializing" + // SandboxReasonPodNotReady indicates the sandbox pod is not ready for application. + SandboxReasonPodNotReady = "SandboxPodNotReady" + // SandboxReasonSuspended indicates the sandbox is suspended and not ready + SandboxReasonSuspended = "SandboxSuspended" + // SandboxReasonUnresponsive indicates the sandbox pod is in an unknown state + SandboxReasonUnresponsive = "SandboxUnresponsive" + // SandboxReasonExpired indicates the sandbox is being terminated due to expiration. SandboxReasonExpired = "SandboxExpired" + // SandboxReasonDeleting indicates the sandbox is being terminated due to deletion. + SandboxReasonDeleting = "SandboxDeleting" // SandboxPodNameAnnotation is the annotation used to track the pod name adopted from a warm pool. SandboxPodNameAnnotation = "agents.x-k8s.io/pod-name" diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/models.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/models.py index e0a45ff05..d1f3fb333 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/models.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/models.py @@ -15,6 +15,29 @@ from typing import Literal, Union from pydantic import BaseModel +class SandboxCondition(BaseModel): + """Represents a status condition of the Sandbox.""" + type: str + status: str + reason: str | None = None + message: str | None = None + +class SandboxStatus(BaseModel): + """Represents the status of the Sandbox with parsed conditions.""" + conditions: list[SandboxCondition] = [] + + @property + def initialized(self) -> str: + return next((c.status for c in self.conditions if c.type == "Initialized"), "Unknown") + + @property + def suspended(self) -> str: + return next((c.status for c in self.conditions if c.type == "Suspended"), "Unknown") + + @property + def ready(self) -> str: + return next((c.status for c in self.conditions if c.type == "Ready"), "Unknown") + class ExecutionResult(BaseModel): """A structured object for holding the result of a command execution.""" stdout: str = "" # Standard output from the command. diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/sandbox.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/sandbox.py index 41c98d352..5fa8f260b 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/sandbox.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/sandbox.py @@ -21,7 +21,8 @@ from .models import ( SandboxConnectionConfig, SandboxLocalTunnelConnectionConfig, - SandboxTracerConfig + SandboxTracerConfig, + SandboxStatus, ) from .k8s_helper import K8sHelper from .connector import SandboxConnector @@ -125,6 +126,12 @@ def _close_connection(self): self._is_closed = True logging.info(f"Connection to sandbox claim '{self.claim_name}' has been closed.") + def status(self) -> SandboxStatus: + """Fetches the current status of the Sandbox custom resource, reflecting initialized, suspended, and ready conditions.""" + sandbox_object = self.k8s_helper.get_sandbox(self.sandbox_id, self.namespace) or {} + status_dict = sandbox_object.get('status', {}) + return SandboxStatus(**status_dict) + def terminate(self): """Permanent deletion of all server side infrastructure and client side connection.""" # Close the client side connection and trace manager lifecycle diff --git a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/test/unit/test_sandbox.py b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/test/unit/test_sandbox.py index 16d0a6189..32fbd0ec0 100644 --- a/clients/python/agentic-sandbox-client/k8s_agent_sandbox/test/unit/test_sandbox.py +++ b/clients/python/agentic-sandbox-client/k8s_agent_sandbox/test/unit/test_sandbox.py @@ -16,7 +16,7 @@ from unittest.mock import MagicMock, patch from k8s_agent_sandbox.sandbox import Sandbox -from k8s_agent_sandbox.models import SandboxLocalTunnelConnectionConfig, SandboxTracerConfig +from k8s_agent_sandbox.models import SandboxLocalTunnelConnectionConfig, SandboxTracerConfig, SandboxStatus class TestSandbox(unittest.TestCase): @@ -167,5 +167,25 @@ def test_terminate(self): self.mock_k8s_helper.delete_sandbox_claim.assert_called_once_with(self.claim_name, self.namespace) + def test_status(self): + """Tests that the status method correctly parses sandbox conditions.""" + self.mock_k8s_helper.get_sandbox.return_value = { + "status": { + "conditions": [ + {"type": "Initialized", "status": "True", "reason": "SandboxInitialized"}, + {"type": "Suspended", "status": "False", "reason": "NotSuspended"}, + {"type": "Ready", "status": "True", "reason": "SandboxReady"} + ] + } + } + + status = self.sandbox.status() + + self.assertIsInstance(status, SandboxStatus) + self.assertEqual(status.initialized, "True") + self.assertEqual(status.suspended, "False") + self.assertEqual(status.ready, "True") + self.mock_k8s_helper.get_sandbox.assert_called_with(self.sandbox_id, self.namespace) + if __name__ == '__main__': unittest.main() diff --git a/clients/python/agentic-sandbox-client/test_client.py b/clients/python/agentic-sandbox-client/test_client.py index 3bd37fc06..764cf5432 100644 --- a/clients/python/agentic-sandbox-client/test_client.py +++ b/clients/python/agentic-sandbox-client/test_client.py @@ -32,6 +32,14 @@ def run_sandbox_tests(sandbox: Sandbox): """Tests methods on the Sandbox object (execution, files, etc).""" + print("\n--- Testing Sandbox Status ---") + status = sandbox.status() + print(f"Sandbox status conditions: {status.conditions}") + assert status.initialized == "True", f"Expected initialized='True', got {status.initialized}" + assert status.suspended == "False", f"Expected suspended='False', got {status.suspended}" + assert status.ready == "True", f"Expected ready='True', got {status.ready}" + print("--- Sandbox Status Test Passed! ---") + print("\n--- Testing Command Execution ---") command_to_run = "echo 'Hello from the sandbox shruti!'" print(f"Executing command: '{command_to_run}'") diff --git a/controllers/sandbox_controller.go b/controllers/sandbox_controller.go index e788be924..172747e10 100644 --- a/controllers/sandbox_controller.go +++ b/controllers/sandbox_controller.go @@ -104,6 +104,19 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // If the sandbox is being deleted, do nothing if !sandbox.DeletionTimestamp.IsZero() { log.Info("Sandbox is being deleted") + + oldStatus := sandbox.Status.DeepCopy() + meta.SetStatusCondition(&sandbox.Status.Conditions, metav1.Condition{ + Type: string(sandboxv1alpha1.SandboxConditionReady), + ObservedGeneration: sandbox.Generation, + Status: metav1.ConditionFalse, + Reason: sandboxv1alpha1.SandboxReasonDeleting, + Message: "Sandbox is terminating", + }) + if err := r.updateStatus(ctx, oldStatus, sandbox); err != nil { + log.Error(err, "Failed to update status for terminating sandbox") + } + return ctrl.Result{}, nil } @@ -167,8 +180,9 @@ func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox var allErrors error // Reconcile PVCs - err := r.reconcilePVCs(ctx, sandbox) - allErrors = errors.Join(allErrors, err) + pvcErr := r.reconcilePVCs(ctx, sandbox) + allErrors = errors.Join(allErrors, pvcErr) + pvcsProvisioned := pvcErr == nil // Reconcile Pod pod, err := r.reconcilePod(ctx, sandbox, nameHash) @@ -182,73 +196,112 @@ func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox } // Reconcile Service - svc, err := r.reconcileService(ctx, sandbox, nameHash) + _, err = r.reconcileService(ctx, sandbox, nameHash) allErrors = errors.Join(allErrors, err) + svcsProvisioned := err == nil - // compute and set overall Ready condition - readyCondition := r.computeReadyCondition(sandbox, allErrors, svc, pod) - meta.SetStatusCondition(&sandbox.Status.Conditions, readyCondition) + // compute and set overall conditions + conditions := r.computeConditions(sandbox, svcsProvisioned, pod, pvcsProvisioned) + for _, condition := range conditions { + meta.SetStatusCondition(&sandbox.Status.Conditions, condition) + } return allErrors } -func (r *SandboxReconciler) computeReadyCondition(sandbox *sandboxv1alpha1.Sandbox, err error, svc *corev1.Service, pod *corev1.Pod) metav1.Condition { - readyCondition := metav1.Condition{ +func (r *SandboxReconciler) computeConditions(sandbox *sandboxv1alpha1.Sandbox, svcsProvisioned bool, pod *corev1.Pod, pvcsProvisioned bool) []metav1.Condition { + var conditions []metav1.Condition + gen := sandbox.Generation + + // 1. Initialized Condition + initialized := metav1.Condition{ + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + ObservedGeneration: gen, + Status: metav1.ConditionFalse, + Reason: sandboxv1alpha1.SandboxReasonInitializing, + Message: "Provisioning dependencies", + } + if svcsProvisioned && pvcsProvisioned { + initialized.Status = metav1.ConditionTrue + initialized.Reason = sandboxv1alpha1.SandboxReasonInitialized + initialized.Message = "Service and PVCs are provisioned" + } + conditions = append(conditions, initialized) + + // 2. Suspended Condition + suspended := metav1.Condition{ + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + ObservedGeneration: gen, + Status: metav1.ConditionUnknown, + Reason: sandboxv1alpha1.SandboxReasonPendingEvaluation, + Message: "The suspension status has not yet been determined.", + } + isSuspended := sandbox.Spec.Replicas != nil && *sandbox.Spec.Replicas == 0 + if isSuspended { + suspended.Status = metav1.ConditionTrue + suspended.Reason = sandboxv1alpha1.SandboxReasonSuspendedNotOperational + suspended.Message = "Sandbox has been suspended and is not operational" + } else if pod != nil { + suspended.Status = metav1.ConditionFalse + suspended.Reason = sandboxv1alpha1.SandboxReasonNotSuspended + suspended.Message = "Sandbox is operational and not suspended" + } + conditions = append(conditions, suspended) + + // 3. Ready Condition + ready := metav1.Condition{ Type: string(sandboxv1alpha1.SandboxConditionReady), - ObservedGeneration: sandbox.Generation, - Message: "", + ObservedGeneration: gen, Status: metav1.ConditionFalse, - Reason: "DependenciesNotReady", - } - - if err != nil { - readyCondition.Reason = "ReconcilerError" - readyCondition.Message = "Error seen: " + err.Error() - return readyCondition - } - - message := "" - podReady := false - if pod != nil { - message = "Pod exists with phase: " + string(pod.Status.Phase) - // Check if pod Ready condition is true - if pod.Status.Phase == corev1.PodRunning { - message = "Pod is Running but not Ready" + Reason: sandboxv1alpha1.SandboxReasonInitializing, + Message: "Sandbox is initializing", + } + + if initialized.Status == metav1.ConditionFalse { + ready.Status = metav1.ConditionFalse + ready.Reason = sandboxv1alpha1.SandboxReasonInitializing + ready.Message = "Waiting for Sandbox to be provisioned" + } else if isSuspended { + ready.Status = metav1.ConditionFalse + ready.Reason = sandboxv1alpha1.SandboxReasonSuspended + ready.Message = "Sandbox is suspended" + } else if pod == nil { + ready.Status = metav1.ConditionFalse + ready.Reason = sandboxv1alpha1.SandboxReasonPodInitializing + ready.Message = "Pod is initializing" + } else { + // Pod exists + switch pod.Status.Phase { + case corev1.PodRunning: + podIsReady := false for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - if condition.Status == corev1.ConditionTrue { - message = "Pod is Ready" - podReady = true - } + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { + podIsReady = true break } } + if podIsReady { + ready.Status = metav1.ConditionTrue + ready.Reason = sandboxv1alpha1.SandboxReasonReady + ready.Message = "Sandbox is operational" + } else { + ready.Status = metav1.ConditionFalse + ready.Reason = sandboxv1alpha1.SandboxReasonPodNotReady + ready.Message = "Pod is not Ready" + } + case corev1.PodUnknown: + ready.Status = metav1.ConditionUnknown + ready.Reason = sandboxv1alpha1.SandboxReasonUnresponsive + ready.Message = "Pod status is unknown" + default: + ready.Status = metav1.ConditionFalse + ready.Reason = sandboxv1alpha1.SandboxReasonPodNotReady + ready.Message = "Pod is in phase: " + string(pod.Status.Phase) } - } else { - if sandbox.Spec.Replicas != nil && *sandbox.Spec.Replicas == 0 { - message = "Pod does not exist, replicas is 0" - // This is intended behaviour. So marking it ready. - podReady = true - } else { - message = "Pod does not exist" - } - } - - svcReady := false - if svc != nil { - message += "; Service Exists" - svcReady = true - } else { - message += "; Service does not exist" - } - - readyCondition.Message = message - if podReady && svcReady { - readyCondition.Status = metav1.ConditionTrue - readyCondition.Reason = "DependenciesReady" } + conditions = append(conditions, ready) - return readyCondition + return conditions } func (r *SandboxReconciler) updateStatus(ctx context.Context, oldStatus *sandboxv1alpha1.SandboxStatus, sandbox *sandboxv1alpha1.Sandbox) error { @@ -624,7 +677,7 @@ func checkSandboxExpiry(sandbox *sandboxv1alpha1.Sandbox) (bool, time.Duration) // sandboxMarkedExpired checks if the sandbox is already marked as expired func sandboxMarkedExpired(sandbox *sandboxv1alpha1.Sandbox) bool { cond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady)) - return cond != nil && cond.Reason == sandboxv1alpha1.SandboxReasonExpired + return cond != nil && (cond.Reason == sandboxv1alpha1.SandboxReasonExpired) } // SetupWithManager sets up the controller with the Manager. diff --git a/controllers/sandbox_controller_test.go b/controllers/sandbox_controller_test.go index 522872070..311977d64 100644 --- a/controllers/sandbox_controller_test.go +++ b/controllers/sandbox_controller_test.go @@ -15,7 +15,6 @@ package controllers import ( - "errors" "testing" "time" @@ -55,150 +54,201 @@ func sandboxControllerRef(name string) metav1.OwnerReference { } } -func TestComputeReadyCondition(t *testing.T) { +func TestComputeConditions(t *testing.T) { r := &SandboxReconciler{} + gen := int64(1) + sbWithRepl := func(replicas int32) *sandboxv1alpha1.Sandbox { + return &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{Generation: gen}, + Spec: sandboxv1alpha1.SandboxSpec{Replicas: ptr.To(replicas)}, + } + } + testCases := []struct { - name string - sandbox *sandboxv1alpha1.Sandbox - err error - svc *corev1.Service - pod *corev1.Pod - expectedStatus metav1.ConditionStatus - expectedReason string + name string + sandbox *sandboxv1alpha1.Sandbox + svcsProvisioned bool + pvcsProvisioned bool + pod *corev1.Pod + expectedConditions []metav1.Condition }{ { - name: "all ready", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, + name: "1. Provisioning - No dependencies", + sandbox: sbWithRepl(1), + svcsProvisioned: false, + pvcsProvisioned: false, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Provisioning dependencies"}, + {Type: "Suspended", Status: "Unknown", ObservedGeneration: gen, Reason: "PendingEvaluation", Message: "The suspension status has not yet been determined."}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Waiting for Sandbox to be provisioned"}, + }, + }, + { + name: "2. Provisioning - Partial dependencies (missing service)", + sandbox: sbWithRepl(1), + svcsProvisioned: false, + pvcsProvisioned: true, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Provisioning dependencies"}, + {Type: "Suspended", Status: "Unknown", ObservedGeneration: gen, Reason: "PendingEvaluation", Message: "The suspension status has not yet been determined."}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Waiting for Sandbox to be provisioned"}, + }, + }, + { + name: "3. Dependencies provisioned, Pod missing", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "Unknown", ObservedGeneration: gen, Reason: "PendingEvaluation", Message: "The suspension status has not yet been determined."}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxPodInitializing", Message: "Pod is initializing"}, }, - err: nil, - svc: &corev1.Service{}, + }, + { + name: "4. Pod Pending", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "False", ObservedGeneration: gen, Reason: "NotSuspended", Message: "Sandbox is operational and not suspended"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxPodNotReady", Message: "Pod is in phase: Pending"}, + }, + }, + { + name: "5. Pod Running but not Ready", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, pod: &corev1.Pod{ Status: corev1.PodStatus{ Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, }, }, }, - expectedStatus: metav1.ConditionTrue, - expectedReason: "DependenciesReady", - }, - { - name: "error", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "False", ObservedGeneration: gen, Reason: "NotSuspended", Message: "Sandbox is operational and not suspended"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxPodNotReady", Message: "Pod is not Ready"}, }, - err: errors.New("test error"), - svc: &corev1.Service{}, - pod: &corev1.Pod{}, - expectedStatus: metav1.ConditionFalse, - expectedReason: "ReconcilerError", }, { - name: "pod not ready", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, - }, - err: nil, - svc: &corev1.Service{}, + name: "6. Operational Sandbox - Fully Ready", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, pod: &corev1.Pod{ Status: corev1.PodStatus{ Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionFalse, - }, + {Type: corev1.PodReady, Status: corev1.ConditionTrue}, }, }, }, - expectedStatus: metav1.ConditionFalse, - expectedReason: "DependenciesNotReady", + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "False", ObservedGeneration: gen, Reason: "NotSuspended", Message: "Sandbox is operational and not suspended"}, + {Type: "Ready", Status: "True", ObservedGeneration: gen, Reason: "SandboxReady", Message: "Sandbox is operational"}, + }, }, { - name: "pod running but not ready", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, - }, - err: nil, - svc: &corev1.Service{}, + name: "7. Suspended by user - Pod still terminating", + sandbox: sbWithRepl(0), + svcsProvisioned: true, + pvcsProvisioned: true, pod: &corev1.Pod{ Status: corev1.PodStatus{ Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionTrue}, + }, }, }, - expectedStatus: metav1.ConditionFalse, - expectedReason: "DependenciesNotReady", + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "True", ObservedGeneration: gen, Reason: "Suspended", Message: "Sandbox has been suspended and is not operational"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxSuspended", Message: "Sandbox is suspended"}, + }, }, { - name: "pod pending", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, + name: "8. Fully suspended - Pod deleted", + sandbox: sbWithRepl(0), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "True", ObservedGeneration: gen, Reason: "Suspended", Message: "Sandbox has been suspended and is not operational"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxSuspended", Message: "Sandbox is suspended"}, }, - err: nil, - svc: &corev1.Service{}, - pod: &corev1.Pod{ - Status: corev1.PodStatus{ - Phase: corev1.PodPending, - }, + }, + { + name: "9. Resuming - Pod missing", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "Unknown", ObservedGeneration: gen, Reason: "PendingEvaluation", Message: "The suspension status has not yet been determined."}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxPodInitializing", Message: "Pod is initializing"}, }, - expectedStatus: metav1.ConditionFalse, - expectedReason: "DependenciesNotReady", }, { - name: "service not ready", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, + name: "10. Unresponsive - Pod Status Unknown", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodUnknown}}, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "False", ObservedGeneration: gen, Reason: "NotSuspended", Message: "Sandbox is operational and not suspended"}, + {Type: "Ready", Status: "Unknown", ObservedGeneration: gen, Reason: "SandboxUnresponsive", Message: "Pod status is unknown"}, }, - err: nil, - svc: nil, - pod: &corev1.Pod{ - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - }, + }, + { + name: "11. Pod Failed - Crash Loop", + sandbox: sbWithRepl(1), + svcsProvisioned: true, + pvcsProvisioned: true, + pod: &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodFailed}}, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "True", ObservedGeneration: gen, Reason: "SandboxInitialized", Message: "Service and PVCs are provisioned"}, + {Type: "Suspended", Status: "False", ObservedGeneration: gen, Reason: "NotSuspended", Message: "Sandbox is operational and not suspended"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxPodNotReady", Message: "Pod is in phase: Failed"}, }, - expectedStatus: metav1.ConditionFalse, - expectedReason: "DependenciesNotReady", }, { - name: "all not ready", - sandbox: &sandboxv1alpha1.Sandbox{ - ObjectMeta: metav1.ObjectMeta{ - Generation: 1, - }, + name: "12. Suspended but missing dependencies", + sandbox: sbWithRepl(0), + svcsProvisioned: false, + pvcsProvisioned: false, + pod: nil, + expectedConditions: []metav1.Condition{ + {Type: "Initialized", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Provisioning dependencies"}, + {Type: "Suspended", Status: "True", ObservedGeneration: gen, Reason: "Suspended", Message: "Sandbox has been suspended and is not operational"}, + {Type: "Ready", Status: "False", ObservedGeneration: gen, Reason: "SandboxInitializing", Message: "Waiting for Sandbox to be provisioned"}, }, - err: nil, - svc: nil, - pod: nil, - expectedStatus: metav1.ConditionFalse, - expectedReason: "DependenciesNotReady", }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - condition := r.computeReadyCondition(tc.sandbox, tc.err, tc.svc, tc.pod) - require.Equal(t, sandboxv1alpha1.SandboxConditionReady.String(), condition.Type) - require.Equal(t, tc.sandbox.Generation, condition.ObservedGeneration) - require.Equal(t, tc.expectedStatus, condition.Status) - require.Equal(t, tc.expectedReason, condition.Reason) + conditions := r.computeConditions(tc.sandbox, tc.svcsProvisioned, tc.pod, tc.pvcsProvisioned) + opts := []cmp.Option{ + cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"), + } + if diff := cmp.Diff(tc.expectedConditions, conditions, opts...); diff != "" { + t.Fatalf("unexpected conditions (-want,+got):\n%s", diff) + } }) } } @@ -210,6 +260,7 @@ func TestReconcile(t *testing.T) { name string initialObjs []runtime.Object sandboxSpec sandboxv1alpha1.SandboxSpec + deletionTimestamp *metav1.Time wantStatus sandboxv1alpha1.SandboxStatus wantObjs []client.Object wantDeletedObjs []client.Object @@ -237,11 +288,25 @@ func TestReconcile(t *testing.T) { LabelSelector: "agents.x-k8s.io/sandbox-name-hash=ab179450", // Pre-computed hash of "sandbox-name" Conditions: []metav1.Condition{ { - Type: "Ready", - Status: "False", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonNotSuspended, + Message: "Sandbox is operational and not suspended", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionReady), + Status: metav1.ConditionFalse, ObservedGeneration: 1, - Reason: "DependenciesNotReady", - Message: "Pod exists with phase: ; Service Exists", + Reason: sandboxv1alpha1.SandboxReasonPodNotReady, + Message: "Pod is in phase: ", }, }, }, @@ -330,11 +395,25 @@ func TestReconcile(t *testing.T) { LabelSelector: "agents.x-k8s.io/sandbox-name-hash=ab179450", // Pre-computed hash of "sandbox-name" Conditions: []metav1.Condition{ { - Type: "Ready", - Status: "False", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionFalse, ObservedGeneration: 1, - Reason: "DependenciesNotReady", - Message: "Pod exists with phase: ; Service Exists", + Reason: sandboxv1alpha1.SandboxReasonNotSuspended, + Message: "Sandbox is operational and not suspended", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionReady), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonPodNotReady, + Message: "Pod is in phase: ", }, }, }, @@ -444,10 +523,10 @@ func TestReconcile(t *testing.T) { wantStatus: sandboxv1alpha1.SandboxStatus{ Conditions: []metav1.Condition{ { - Type: "Ready", + Type: string(sandboxv1alpha1.SandboxConditionReady), Status: "False", ObservedGeneration: 1, - Reason: "SandboxExpired", + Reason: sandboxv1alpha1.SandboxReasonExpired, Message: "Sandbox has expired", }, }, @@ -495,6 +574,32 @@ func TestReconcile(t *testing.T) { }, expectSandboxDeleted: true, }, + { + name: "Sandbox deleting (DeletionTimestamp set)", + deletionTimestamp: ptr.To(metav1.Now()), + sandboxSpec: sandboxv1alpha1.SandboxSpec{ + PodTemplate: sandboxv1alpha1.PodTemplate{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + }, + }, + }, + }, + }, + wantStatus: sandboxv1alpha1.SandboxStatus{ + Conditions: []metav1.Condition{ + { + Type: string(sandboxv1alpha1.SandboxConditionReady), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonDeleting, + Message: "Sandbox is terminating", + }, + }, + }, + }, } for _, tc := range testCases { @@ -503,6 +608,10 @@ func TestReconcile(t *testing.T) { sb.Name = sandboxName sb.Namespace = sandboxNs sb.Generation = 1 + if tc.deletionTimestamp != nil { + sb.DeletionTimestamp = tc.deletionTimestamp + sb.Finalizers = []string{"test-finalizer"} + } sb.Spec = tc.sandboxSpec r := SandboxReconciler{ Client: newFakeClient(append(tc.initialObjs, sb)...), diff --git a/docs/keps/119-sandbox-state/README.md b/docs/keps/119-sandbox-state/README.md new file mode 100644 index 000000000..114e65306 --- /dev/null +++ b/docs/keps/119-sandbox-state/README.md @@ -0,0 +1,92 @@ +# Sandbox State Management with Conditions + +We need to expose `status` method for Sandboxes in the Python SDK: https://github.com/kubernetes-sigs/agent-sandbox/pull/280. The current outstanding implementation checks the `Pod` status instead of `Sandbox` and transforms the Pod status into the Sandbox status on the client side. This is not an ideal implementation. We should expose `status` of the Sandbox on the controller side as a first class field. + +To align with Kubernetes API standards and address the previous limitations of using `phase` for Sandbox as discussed in https://github.com/kubernetes-sigs/agent-sandbox/pull/121, this proposal uses a `status.conditions` model instead of adding the deprecated `status.phase` field. This model establishes three conditions: `Initialized`, `Ready` and `Suspended`. + +## Motivation + +We currently expose a single `Ready` condition for Sandboxes. Because Sandbox acts as an "aggregation" object, a common convention is that `Ready` should be `True` when all child objects (Pod, Service, PVC) are applied to the cluster and are themselves `Ready`. + +However, relying purely on the `Ready` condition makes it harder to observe certain lifecycle transitions—specifically, when a Sandbox is in the process of scaling down (suspending). While a controller or user can observe that a Sandbox should be suspended from `spec` and verify `status.observedGeneration` to know the controller has acted on the spec, they lack a clear signal indicating whether the scale-down process is actively happening or if it has fully completed without deeply inspecting the child objects. + +Adding the `Suspended` condition explicitly solves this visibility gap for scale-down. Additionally, adding `Initialized` makes the first-time setup of persistent infrastructure observable separately from the Pod. + +## Condition Hierarchy + +The Sandbox state is determined by three distinct layers. + +#### 1. `Initialized` +This represents the **First-Time Setup** of the sandbox. Once the persistent environment is established, this condition becomes `True` and remains `True` for the remainder of the sandbox's lifecycle. +* **Scope:** Successful creation of the **Service** and **PersistentVolumeClaim (PVC)** (if configured). +* **Persistence:** This remains `True` during suspension, confirming that the network identity and persistent storage are preserved even when the Pod is deleted. + +#### 2. `Suspended` +This condition explicitly tracks the scale-down process of the Sandbox. +* **Behavior:** When `True`, the **Pod** has been successfully terminated to conserve cluster resources, meaning the scale-down is complete. When `False`, it implies the Sandbox is either fully operational or actively in the process of scaling down. +* **Ready Impact:** Similar to a Deployment of size 0, a fully suspended Sandbox is not intrinsically "broken." If convention dictates that `Ready` means all child objects are successfully in their desired state, a suspended Sandbox (where desired Pods=0) is considered to have `Ready` equal to `True`. + +#### 3. `Ready` (Root Condition) +The overarching signal for whether all child objects are successfully applied to the cluster and are themselves `Ready`. + +--- + +## Condition Dependency Matrix + +The controller evaluates the hierarchy top-down. The "Gap" between `Initialized` and `Ready` represents the time taken to schedule and start the agent Pod. + +| Scenario | `Initialized` | `Suspended` | Pod | **`Ready` (Root)** | Ready Reason | +| :--- | :--- | :--- | :--- | :--- | :--- | +| **Provisioning** | `False` | `Unknown` | None | **`False`** | `SandboxInitializing` | +| **Pod Starting** | `True` | `False` | Pending | **`False`** | `SandboxPodInitializing` | +| **Operational** | `True` | `False` | Running & Ready | **`True`** | `SandboxReady` | +| **Suspended** | `True` | `True` | None | **`True`** | `SandboxSuspended` | +| **Unresponsive** | `True` | `False` | Unknown | **`Unknown`** | `SandboxUnresponsive` | +| **Expired** | `True` | `Any` | Any | **`False`** | `SandboxExpired` | +| **Terminating** | `True` | `Any` | Any | **`False`** | `SandboxDeleting` | + +#### Why "Initialized" matters +By isolating the one-time setup of Service into the `Initialized` condition, we provide a convenient top-level summary of the infrastructure state. While machines typically only care if an object is "ready" or "broken", humans can rely on the `Message` field for context, and advanced client-side tooling can traverse `ownerRefs` to find specific component failures. Surfacing `Initialized` explicitly acts as an optimization, saving clients from having to build that traversal logic just to verify if the persistent environment has been successfully established. + +#### Terminal States: Expired & Terminating +`Expired` and `Terminating` are treated as terminal overrides. Once a sandbox reaches its TTL or a deletion request is received, the overarching `Ready` condition transitions to `False` regardless of the sub-condition statuses. Please note `Expired` and `Terminating` aren't capabilities; they are the end of the object's life which is why they are **not** represented as **explicit conditions**. + +## Usage Examples + +Standard Kubernetes tooling can now interact with the sandbox state natively: + +```bash +# Block a CI/CD pipeline until the sandbox is fully ready +kubectl wait --for=condition=Ready sandbox + +# Verify if infrastructure is provisioned before using Sandbox +kubectl wait --for=condition=Initialized sandbox + +# Determine if a sandbox is down due to a crash or a suspend +kubectl get sandbox my-env -o custom-columns=READY:.status.conditions[?(@.type=="Ready")].reason +``` + +## Consumer Compatibility + +To prevent breaking external consumers (CLI tools, CI scripts, or monitoring): + +1. **Status Contract:** The `Status` field of the `Ready` condition remains the primary API contract for functional logic. Any consumer relying on `Status: True/False` will experience zero disruption. +2. **Reason and Message Field Usage:** The `Reason` field provides machine-readable strings intended for programmatic consumption (e.g., `SandboxSuspended`, `SandboxExpired`). The `Message` field provides human-readable diagnostic details. +3. **Migration Path:** If existing automation relies on specific `Reason` strings to infer state, it is recommended to migrate that logic to observe the `Status` field or the specific sub-conditions (`Initialized`, `Suspended`) introduced in this version. + +## Alternatives Considered + +#### 1. Retaining the Legacy `status.phase` Field +One option considered was to continue using the single-string `status.phase` field (e.g., `Pending`, `Running`, `Suspended`). + +* **Cons:** + * **Reduced Visibility:** A single string cannot represent concurrent states. It is cumbersome to distinguish between a sandbox that is both "Successfully Initialized" and "Administratively Suspended" simultaneously. + * **API Standards:** The Kubernetes API conventions explicitly deprecate the `Phase` pattern for new projects. Adhering to `Conditions` ensures compatibility with modern ecosystem tools like `kubectl wait`. + * **Logic Complexity:** As the sandbox evolves, the state machine would require an exponential number of strings to represent every possible combination of infrastructure and application health. + +#### 2. Utilizing a Single "Ready" Condition +We considered using only the `Ready` condition and overloading the `Reason` field to communicate the state of the infrastructure and the Pod. + +* **Cons:** + * **Brittle Client Logic:** While the `Reason` field is intended for machine consumption, forcing clients to handle a complex list of enum-like strings (e.g., `SandboxSuspended` vs `PodProvisioning`) within a single condition's `Reason` recreates the issues of the deprecated `Phase` field. + * **Ambiguity in Suspension:** If only a `Ready` condition exists, setting it to `False` during suspension provides no programmatic signal that the persistent data (PVC) and network identity (Service) are still safely intact. diff --git a/docs/keps/119-sandbox-state/kep.yaml b/docs/keps/119-sandbox-state/kep.yaml new file mode 100644 index 000000000..83d14ca4a --- /dev/null +++ b/docs/keps/119-sandbox-state/kep.yaml @@ -0,0 +1,10 @@ +title: Agent Sandbox Status Field +kep-number: 2 +authors: + - "@SHRUTI6991" +status: implementable +creation-date: 2026-03-16 +reviewers: + - "@barney-s" +approvers: + - "@janetkuo" \ No newline at end of file diff --git a/test/e2e/basic_test.go b/test/e2e/basic_test.go index 02c1cfc62..8c3f2c0ba 100644 --- a/test/e2e/basic_test.go +++ b/test/e2e/basic_test.go @@ -83,11 +83,25 @@ func TestSimpleSandbox(t *testing.T) { LabelSelector: "agents.x-k8s.io/sandbox-name-hash=" + nameHash, Conditions: []metav1.Condition{ { - Message: "Pod is Ready; Service Exists", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, ObservedGeneration: 1, - Reason: "DependenciesReady", - Status: "True", + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonNotSuspended, + Message: "Sandbox is operational and not suspended", + }, + { Type: "Ready", + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonReady, + Message: "Sandbox is operational", }, }, }), diff --git a/test/e2e/replicas_test.go b/test/e2e/replicas_test.go index 8cdca80a6..95dfb35f4 100644 --- a/test/e2e/replicas_test.go +++ b/test/e2e/replicas_test.go @@ -48,11 +48,25 @@ func TestSandboxReplicas(t *testing.T) { LabelSelector: "agents.x-k8s.io/sandbox-name-hash=" + nameHash, Conditions: []metav1.Condition{ { - Message: "Pod is Ready; Service Exists", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, ObservedGeneration: 1, - Reason: "DependenciesReady", - Status: "True", + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonNotSuspended, + Message: "Sandbox is operational and not suspended", + }, + { Type: "Ready", + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonReady, + Message: "Sandbox is operational", }, }, }), @@ -83,11 +97,25 @@ func TestSandboxReplicas(t *testing.T) { LabelSelector: "", Conditions: []metav1.Condition{ { - Message: "Pod does not exist, replicas is 0; Service Exists", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, ObservedGeneration: 2, - Reason: "DependenciesReady", - Status: "True", + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionTrue, + ObservedGeneration: 2, + Reason: sandboxv1alpha1.SandboxReasonSuspendedNotOperational, + Message: "Sandbox has been suspended and is not operational", + }, + { Type: "Ready", + Status: metav1.ConditionFalse, + ObservedGeneration: 2, + Reason: sandboxv1alpha1.SandboxReasonSuspended, + Message: "Sandbox is suspended", }, }, }), diff --git a/test/e2e/shutdown_test.go b/test/e2e/shutdown_test.go index b86b755dd..a16ab85e2 100644 --- a/test/e2e/shutdown_test.go +++ b/test/e2e/shutdown_test.go @@ -48,11 +48,25 @@ func TestSandboxShutdownTime(t *testing.T) { LabelSelector: "agents.x-k8s.io/sandbox-name-hash=" + nameHash, Conditions: []metav1.Condition{ { - Message: "Pod is Ready; Service Exists", + Type: string(sandboxv1alpha1.SandboxConditionInitialized), + Status: metav1.ConditionTrue, ObservedGeneration: 1, - Reason: "DependenciesReady", - Status: "True", + Reason: sandboxv1alpha1.SandboxReasonInitialized, + Message: "Service and PVCs are provisioned", + }, + { + Type: string(sandboxv1alpha1.SandboxConditionSuspended), + Status: metav1.ConditionFalse, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonNotSuspended, + Message: "Sandbox is operational and not suspended", + }, + { Type: "Ready", + Status: metav1.ConditionTrue, + ObservedGeneration: 1, + Reason: sandboxv1alpha1.SandboxReasonReady, + Message: "Sandbox is operational", }, }, }), @@ -84,11 +98,11 @@ func TestSandboxShutdownTime(t *testing.T) { Replicas: 0, Conditions: []metav1.Condition{ { - Message: "Sandbox has expired", + Type: string(sandboxv1alpha1.SandboxConditionReady), + Status: metav1.ConditionFalse, ObservedGeneration: 2, - Reason: "SandboxExpired", - Status: "False", - Type: "Ready", + Reason: sandboxv1alpha1.SandboxReasonExpired, + Message: "Sandbox has expired", }, }, }),