diff --git a/api/v1alpha1/sandbox_types.go b/api/v1alpha1/sandbox_types.go index 0866fdcb9..dac262470 100644 --- a/api/v1alpha1/sandbox_types.go +++ b/api/v1alpha1/sandbox_types.go @@ -19,6 +19,24 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// SandboxPhase is a simple, high-level summary of where the Sandbox is in its lifecycle. +type SandboxPhase string + +const ( + // SandboxPhasePending means the sandbox is being created + SandboxPhasePending SandboxPhase = "Pending" + // SandboxPhaseRunning means the sandbox is running + SandboxPhaseRunning SandboxPhase = "Running" + // SandboxPhasePaused means the sandbox is paused + SandboxPhasePaused SandboxPhase = "Paused" + // SandboxPhaseTerminating means the sandbox is terminating + SandboxPhaseTerminating SandboxPhase = "Terminating" + // SandboxPhaseFailed means the sandbox has failed + SandboxPhaseFailed SandboxPhase = "Failed" + // SandboxPhaseExpired means the sandbox has expired. This is a terminal phase that can only be set when the ShutdownPolicy is Retain. + SandboxPhaseExpired SandboxPhase = "Expired" +) + // ConditionType is a type of condition for a resource. type ConditionType string @@ -149,6 +167,23 @@ type Lifecycle struct { // SandboxStatus defines the observed state of Sandbox. type SandboxStatus struct { + // The phase of a Sandbox is a simple, high-level summary of where the Sandbox is in its lifecycle. + // The conditions array, the reason and message fields, and the individual container status arrays contain + // more detail about the Sandbox's status. + // There are six possible phase values. Expired is documented on SandboxPhaseExpired; the other five are: + // + // Pending: The Sandbox has been accepted by the Kubernetes system, but one or more of the Pod + // startup steps is not yet complete. + // Running: The Sandbox has been bound to a node, and all of the Pods have been created. At least + // one Pod is still running, or is in the process of starting or restarting. + // Paused: The Sandbox has been paused. 
+ // Failed: All Pods in the Sandbox have terminated, and at least one Pod has terminated in a failure + // (exited with a non-zero exit code or was terminated by the system). + // Terminating: The Sandbox is terminating. + // + // +optional + Phase SandboxPhase `json:"phase,omitempty"` + // FQDN that is valid for default cluster settings // Limitation: Hardcoded to the domain .cluster.local // e.g. sandbox-example.default.svc.cluster.local diff --git a/controllers/sandbox_controller.go b/controllers/sandbox_controller.go index 50ad32a84..d30489ce9 100644 --- a/controllers/sandbox_controller.go +++ b/controllers/sandbox_controller.go @@ -102,16 +102,25 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // If the sandbox is being deleted, do nothing if !sandbox.ObjectMeta.DeletionTimestamp.IsZero() { log.Info("Sandbox is being deleted") + //sandbox.Status.Phase = sandboxv1alpha1.SandboxPhaseTerminating return ctrl.Result{}, nil } + // Set a default phase + if sandbox.Status.Phase == "" { + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhasePending + } + // Check if already marked as expired to avoid repeated operations, including cleanups if sandboxMarkedExpired(sandbox) { log.Info("Sandbox is already marked as expired") + oldStatus := sandbox.Status.DeepCopy() + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhaseExpired + err := r.updateStatus(ctx, oldStatus, sandbox) // Note: The sandbox won't be deleted if shutdown policy is changed to delete after expiration. // To delete an expired sandbox, the user should delete the sandbox instead of updating it. // This keeps the controller code simple. 
- return ctrl.Result{}, nil + return ctrl.Result{}, err } // Initialize trace ID for active resources missing an ID @@ -176,9 +185,20 @@ func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox if pod == nil { sandbox.Status.Replicas = 0 sandbox.Status.LabelSelector = "" + if sandbox.Spec.Replicas != nil && *sandbox.Spec.Replicas == 0 { + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhasePaused + } } else { sandbox.Status.Replicas = 1 sandbox.Status.LabelSelector = fmt.Sprintf("%s=%s", sandboxLabel, NameHash(sandbox.Name)) + switch pod.Status.Phase { + case corev1.PodRunning: + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhaseRunning + case corev1.PodPending: + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhasePending + case corev1.PodFailed: + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhaseFailed + } } // Reconcile Service @@ -225,9 +245,9 @@ func (r *SandboxReconciler) computeReadyCondition(sandbox *sandboxv1alpha1.Sandb } } } else { - if sandbox.Spec.Replicas != nil && *sandbox.Spec.Replicas == 0 { - message = "Pod does not exist, replicas is 0" + if sandbox.Status.Phase == sandboxv1alpha1.SandboxPhasePaused { // This is intended behaviour. So marking it ready. 
+ message = "Sandbox is paused" podReady = true } else { message = "Pod does not exist" @@ -554,6 +574,7 @@ func (r *SandboxReconciler) handleSandboxExpiry(ctx context.Context, sandbox *sa if err := r.Delete(ctx, sandbox); err != nil && !k8serrors.IsNotFound(err) { allErrors = errors.Join(allErrors, fmt.Errorf("failed to delete sandbox: %w", err)) } else { + sandbox.Status.Phase = sandboxv1alpha1.SandboxPhaseTerminating return true, nil } } diff --git a/controllers/sandbox_controller_test.go b/controllers/sandbox_controller_test.go index 525672f3b..babf82bb1 100644 --- a/controllers/sandbox_controller_test.go +++ b/controllers/sandbox_controller_test.go @@ -177,6 +177,22 @@ func TestComputeReadyCondition(t *testing.T) { expectedStatus: metav1.ConditionFalse, expectedReason: "DependenciesNotReady", }, + { + name: "sandbox paused", + sandbox: &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{ + Generation: 1, + }, + Status: sandboxv1alpha1.SandboxStatus{ + Phase: sandboxv1alpha1.SandboxPhasePaused, + }, + }, + err: nil, + svc: &corev1.Service{}, + pod: nil, + expectedStatus: metav1.ConditionTrue, + expectedReason: "DependenciesReady", + }, { name: "all not ready", sandbox: &sandboxv1alpha1.Sandbox{ @@ -231,6 +247,7 @@ func TestReconcile(t *testing.T) { }, // Verify Sandbox status wantStatus: sandboxv1alpha1.SandboxStatus{ + Phase: sandboxv1alpha1.SandboxPhasePending, Service: sandboxName, ServiceFQDN: "sandbox-name.sandbox-ns.svc.cluster.local", Replicas: 1, @@ -324,6 +341,7 @@ func TestReconcile(t *testing.T) { }, // Verify Sandbox status wantStatus: sandboxv1alpha1.SandboxStatus{ + Phase: sandboxv1alpha1.SandboxPhasePending, Service: sandboxName, ServiceFQDN: "sandbox-name.sandbox-ns.svc.cluster.local", Replicas: 1, @@ -410,6 +428,28 @@ func TestReconcile(t *testing.T) { }, }, }, + { + name: "paused sandbox", + sandboxSpec: sandboxv1alpha1.SandboxSpec{ + Replicas: ptr.To(int32(0)), + }, + // Verify Sandbox status + wantStatus: sandboxv1alpha1.SandboxStatus{ 
+ Phase: sandboxv1alpha1.SandboxPhasePaused, + Replicas: 0, + ServiceFQDN: "sandbox-name.sandbox-ns.svc.cluster.local", + Service: "sandbox-name", + Conditions: []metav1.Condition{ + { + Type: "Ready", + Status: "True", + ObservedGeneration: 1, + Reason: "DependenciesReady", + Message: "Sandbox is paused; Service Exists", + }, + }, + }, + }, { name: "sandbox expired with retain policy", initialObjs: []runtime.Object{ diff --git a/k8s/crds/agents.x-k8s.io_sandboxes.yaml b/k8s/crds/agents.x-k8s.io_sandboxes.yaml index 10765ca51..6c150f0d1 100644 --- a/k8s/crds/agents.x-k8s.io_sandboxes.yaml +++ b/k8s/crds/agents.x-k8s.io_sandboxes.yaml @@ -3977,6 +3977,8 @@ spec: - type type: object type: array + phase: + type: string replicas: format: int32 type: integer diff --git a/test/e2e/basic_test.go b/test/e2e/basic_test.go index 02c1cfc62..9ae2965e1 100644 --- a/test/e2e/basic_test.go +++ b/test/e2e/basic_test.go @@ -77,6 +77,7 @@ func TestSimpleSandbox(t *testing.T) { // Assert Sandbox object status reconciles as expected p := []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{ + Phase: "Running", Service: "my-sandbox", ServiceFQDN: fmt.Sprintf("my-sandbox.%s.svc.cluster.local", ns.Name), Replicas: 1, diff --git a/test/e2e/replicas_test.go b/test/e2e/replicas_test.go index c030e3cde..a932a743d 100644 --- a/test/e2e/replicas_test.go +++ b/test/e2e/replicas_test.go @@ -42,6 +42,7 @@ func TestSandboxReplicas(t *testing.T) { // Assert Sandbox object status reconciles as expected p := []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{ + Phase: "Running", Service: "my-sandbox", ServiceFQDN: "my-sandbox.my-sandbox-ns.svc.cluster.local", Replicas: 1, @@ -75,13 +76,14 @@ func TestSandboxReplicas(t *testing.T) { // Wait for sandbox status to reflect new state p = []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{ + Phase: "Paused", Service: "my-sandbox", ServiceFQDN: 
"my-sandbox.my-sandbox-ns.svc.cluster.local", Replicas: 0, LabelSelector: "", Conditions: []metav1.Condition{ { - Message: "Pod does not exist, replicas is 0; Service Exists", + Message: "Sandbox is paused; Service Exists", ObservedGeneration: 2, Reason: "DependenciesReady", Status: "True", diff --git a/test/e2e/shutdown_test.go b/test/e2e/shutdown_test.go index 15291972f..92af99f6b 100644 --- a/test/e2e/shutdown_test.go +++ b/test/e2e/shutdown_test.go @@ -42,6 +42,7 @@ func TestSandboxShutdownTime(t *testing.T) { // Assert Sandbox object status reconciles as expected p := []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{ + Phase: "Running", Service: "my-sandbox", ServiceFQDN: fmt.Sprintf("my-sandbox.%s.svc.cluster.local", ns.Name), Replicas: 1, @@ -76,6 +77,7 @@ func TestSandboxShutdownTime(t *testing.T) { p = []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{ // Service/ServiceFQDN should be cleared from status when the Service is deleted + Phase: "Terminating", Service: "", ServiceFQDN: "", Replicas: 0,