diff --git a/api/v1alpha1/sandbox_types.go b/api/v1alpha1/sandbox_types.go index 0866fdcb9..417842334 100644 --- a/api/v1alpha1/sandbox_types.go +++ b/api/v1alpha1/sandbox_types.go @@ -25,9 +25,16 @@ type ConditionType string func (c ConditionType) String() string { return string(c) } const ( + // DefaultProgressDeadlineSeconds is the default maximum time (10 minutes) for a Sandbox to reach + // the Ready state. + DefaultProgressDeadlineSeconds int32 = 600 + // SandboxConditionReady indicates readiness for Sandbox SandboxConditionReady ConditionType = "Ready" + // SandboxReasonDeadlineExceeded indicates the sandbox failed to become ready within the deadline. + SandboxReasonDeadlineExceeded = "ProgressDeadlineExceeded" + // SandboxReasonExpired indicates expired state for Sandbox SandboxReasonExpired = "SandboxExpired" ) @@ -135,6 +142,14 @@ const ( // Lifecycle defines the lifecycle management for the Sandbox. type Lifecycle struct { + + // ProgressDeadlineSeconds is the maximum time in seconds for a Sandbox to become ready. + // Defaults to 600 seconds. + // +kubebuilder:default=600 + // +kubebuilder:validation:Minimum=0 + // +optional + ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"` + // ShutdownTime is the absolute time when the sandbox expires. // +kubebuilder:validation:Format="date-time" // +optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index aaf49af0c..852a8d6fc 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -41,6 +41,11 @@ func (in *EmbeddedObjectMetadata) DeepCopy() *EmbeddedObjectMetadata { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Lifecycle) DeepCopyInto(out *Lifecycle) { *out = *in + if in.ProgressDeadlineSeconds != nil { + in, out := &in.ProgressDeadlineSeconds, &out.ProgressDeadlineSeconds + *out = new(int32) + **out = **in + } if in.ShutdownTime != nil { in, out := &in.ShutdownTime, &out.ShutdownTime *out = (*in).DeepCopy() diff --git a/controllers/sandbox_controller.go b/controllers/sandbox_controller.go index 50ad32a84..06a3afa04 100644 --- a/controllers/sandbox_controller.go +++ b/controllers/sandbox_controller.go @@ -105,13 +105,11 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } - // Check if already marked as expired to avoid repeated operations, including cleanups - if sandboxMarkedExpired(sandbox) { - log.Info("Sandbox is already marked as expired") - // Note: The sandbox won't be deleted if shutdown policy is changed to delete after expiration. - // To delete an expired sandbox, the user should delete the sandbox instead of updating it. - // This keeps the controller code simple. - return ctrl.Result{}, nil + // This stops reconciliation for resources that have already hit a deadline or expired. + // TODO: Use sandbox phase "Failed" check instead of these helper functions PR#121 + if sandboxMarkedExpired(sandbox) || sandboxStalled(sandbox) { + log.Info("Sandbox is in a terminal state (Expired or Stalled). Stopping reconciliation.") + return ctrl.Result{}, nil // stop trying to reconcile the resource } // Initialize trace ID for active resources missing an ID @@ -139,24 +137,44 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct var err error sandboxDeleted := false - expired, requeueAfter := checkSandboxExpiry(sandbox) + // Calculate lifecycle and timeout states + expired, expiryRequeue := checkSandboxExpiry(sandbox) + + deadlineHit := false + var deadlineRequeue time.Duration + // We only check the deadline if the sandbox is not yet in a "Ready" state and hasn't expired. + // TODO: Only check if the Sandbox is in a "Pending" status PR#121 + if !expired && !isSandboxReady(sandbox) { + deadlineHit, deadlineRequeue = checkProgressDeadline(sandbox) + } - // Check if sandbox has expired + // Handle state transitions + // Expiry takes precedence as it triggers a cleanup/deletion event. if expired { log.Info("Sandbox has expired, deleting child resources and checking shutdown policy") sandboxDeleted, err = r.handleSandboxExpiry(ctx, sandbox) + } else if deadlineHit { + log.Info("Sandbox progress deadline exceeded. Marking as stalled.") + return ctrl.Result{}, r.handleProgressDeadline(ctx, oldStatus, sandbox) // stop trying to reconcile the resource } else { + // Standard reconciliation for active, non-stalled resources. err = r.reconcileChildResources(ctx, sandbox) } + // Final Status update for non-deleted resources if !sandboxDeleted { - // Update status if statusUpdateErr := r.updateStatus(ctx, oldStatus, sandbox); statusUpdateErr != nil { - // Surface update error err = errors.Join(err, statusUpdateErr) } } - // return errors seen + + // Select the earliest requeue time to ensure the controller wakes up for the next event. + requeueAfter := deadlineRequeue + if expiryRequeue > 0 && (requeueAfter == 0 || expiryRequeue < requeueAfter) { + requeueAfter = expiryRequeue + } + + // Return with the calculated minimum requeue duration return ctrl.Result{RequeueAfter: requeueAfter}, err } @@ -550,7 +568,8 @@ func (r *SandboxReconciler) handleSandboxExpiry(ctx context.Context, sandbox *sa allErrors = errors.Join(allErrors, fmt.Errorf("failed to delete service: %w", err)) } - if sandbox.Spec.ShutdownPolicy != nil && *sandbox.Spec.ShutdownPolicy == sandboxv1alpha1.ShutdownPolicyDelete { + if sandbox.Spec.ShutdownPolicy != nil && + *sandbox.Spec.ShutdownPolicy == sandboxv1alpha1.ShutdownPolicyDelete { if err := r.Delete(ctx, sandbox); err != nil && !k8serrors.IsNotFound(err) { allErrors = errors.Join(allErrors, fmt.Errorf("failed to delete sandbox: %w", err)) } else { @@ -576,10 +595,37 @@ func (r *SandboxReconciler) handleSandboxExpiry(ctx context.Context, sandbox *sa return false, allErrors } +// handleProgressDeadline updates the Sandbox status on ProgressDeadlineExceeded +func (r *SandboxReconciler) handleProgressDeadline(ctx context.Context, + oldStatus *sandboxv1alpha1.SandboxStatus, sandbox *sandboxv1alpha1.Sandbox) error { + + if r.Tracer.IsRecording(ctx) { + r.Tracer.AddEvent(ctx, "SandboxProgressDeadlineExceeded", map[string]string{ + "sandbox.Name": sandbox.Name, + }) + } + + // TODO: update Sandbox phase to "Failed" PR#121 + + meta.SetStatusCondition(&sandbox.Status.Conditions, metav1.Condition{ + Type: string(sandboxv1alpha1.SandboxConditionReady), + Status: metav1.ConditionFalse, + ObservedGeneration: sandbox.Generation, + Reason: sandboxv1alpha1.SandboxReasonDeadlineExceeded, + Message: "Sandbox failed to reach Ready state within the allocated deadline.", + }) + + if updateErr := r.updateStatus(ctx, oldStatus, sandbox); updateErr != nil { + return updateErr + } + return nil +} + // checks if the sandbox has expired // returns true if expired, false otherwise // if not expired, also returns the duration to requeue after func checkSandboxExpiry(sandbox *sandboxv1alpha1.Sandbox) (bool, time.Duration) { + // If ShutdownTime is not set, the sandbox never expires. if sandbox.Spec.ShutdownTime == nil { return false, 0 } @@ -626,3 +672,35 @@ func (r *SandboxReconciler) SetupWithManager(mgr ctrl.Manager) error { Owns(&corev1.Service{}, builder.WithPredicates(labelSelectorPredicate)). Complete(r) } + +// checkProgressDeadline calculates if the sandbox has timed out. +func checkProgressDeadline(sandbox *sandboxv1alpha1.Sandbox) (bool, time.Duration) { + + deadline := sandboxv1alpha1.DefaultProgressDeadlineSeconds + if sandbox.Spec.ProgressDeadlineSeconds != nil { + deadline = *sandbox.Spec.ProgressDeadlineSeconds + } + + // TODO: This logic will need to be updated when Sandbox pause / resume is implemented. Issue #36. + elapsed := time.Since(sandbox.CreationTimestamp.Time) + deadlineDuration := time.Duration(deadline) * time.Second + + if elapsed >= deadlineDuration { + return true, 0 + } + + // Schedule the next reconciliation to trigger precisely when the deadline expires. + return false, deadlineDuration - elapsed +} + +// sandboxStalled checks if the sandbox is already marked with a deadline failure. +func sandboxStalled(sandbox *sandboxv1alpha1.Sandbox) bool { + cond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady)) + return cond != nil && cond.Reason == sandboxv1alpha1.SandboxReasonDeadlineExceeded +} + +// isSandboxReady returns true if the Ready condition is currently True. +func isSandboxReady(sandbox *sandboxv1alpha1.Sandbox) bool { + cond := meta.FindStatusCondition(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady)) + return cond != nil && cond.Status == metav1.ConditionTrue +} diff --git a/controllers/sandbox_controller_test.go b/controllers/sandbox_controller_test.go index 525672f3b..d70f57e1e 100644 --- a/controllers/sandbox_controller_test.go +++ b/controllers/sandbox_controller_test.go @@ -214,6 +214,7 @@ func TestReconcile(t *testing.T) { wantObjs []client.Object wantDeletedObjs []client.Object expectSandboxDeleted bool + creationTime *time.Time }{ { name: "minimal sandbox spec with Pod and Service", @@ -495,17 +496,147 @@ func TestReconcile(t *testing.T) { }, expectSandboxDeleted: true, }, + { + name: "sandbox progress deadline hit surfaces failure", + sandboxSpec: sandboxv1alpha1.SandboxSpec{ + PodTemplate: sandboxv1alpha1.PodTemplate{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "test"}}, + }, + }, + }, + // Creation happened 15 mins ago, exceeding the 10 min default + creationTime: ptr.To(time.Now().Add(-15 * time.Minute)), + wantStatus: sandboxv1alpha1.SandboxStatus{ + Conditions: []metav1.Condition{ + { + Type: "Ready", + Status: "False", + ObservedGeneration: 1, + Reason: "ProgressDeadlineExceeded", + Message: "Sandbox failed to reach Ready state within the allocated deadline.", + }, + }, + // Confirms logic stopped reconciling child resources + Replicas: 0, + LabelSelector: "", + }, + }, + { + name: "already stalled sandbox stops reconciliation early", + initialObjs: []runtime.Object{ + // Object already has the failure condition + &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{Name: sandboxName, Namespace: sandboxNs}, + Status: sandboxv1alpha1.SandboxStatus{ + Conditions: []metav1.Condition{ + { + Type: "Ready", + Status: "False", + Reason: "ProgressDeadlineExceeded", + }, + }, + }, + }, + }, + sandboxSpec: sandboxv1alpha1.SandboxSpec{ + PodTemplate: sandboxv1alpha1.PodTemplate{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "test"}}, + }, + }, + }, + wantStatus: sandboxv1alpha1.SandboxStatus{ + Conditions: []metav1.Condition{ + { + Type: "Ready", + Status: "False", + Reason: "ProgressDeadlineExceeded", + }, + }, + }, + // DO NOT create the Pod if stalled + wantDeletedObjs: []client.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: sandboxName, Namespace: sandboxNs}}, + }, + }, + { + name: "sandbox expired with retain policy", + initialObjs: []runtime.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: sandboxName, + Namespace: sandboxNs, + }, + }, + &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: sandboxName, + Namespace: sandboxNs, + }, + }, + }, + sandboxSpec: sandboxv1alpha1.SandboxSpec{ + PodTemplate: sandboxv1alpha1.PodTemplate{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + }, + }, + }, + }, + Lifecycle: sandboxv1alpha1.Lifecycle{ + ShutdownTime: ptr.To(metav1.NewTime(time.Now().Add(-1 * time.Hour))), + ShutdownPolicy: ptr.To(sandboxv1alpha1.ShutdownPolicyRetain), + }, + }, + wantStatus: sandboxv1alpha1.SandboxStatus{ + Conditions: []metav1.Condition{ + { + Type: "Ready", + Status: "False", + ObservedGeneration: 1, + Reason: "SandboxExpired", + Message: "Sandbox has expired", + }, + }, + }, + wantDeletedObjs: []client.Object{ + &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: sandboxName, Namespace: sandboxNs}}, + &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: sandboxName, Namespace: sandboxNs}}, + }, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - sb := &sandboxv1alpha1.Sandbox{} - sb.Name = sandboxName - sb.Namespace = sandboxNs - sb.Generation = 1 + // Use sandbox if found in initialObjs or create default sandbox + var sb *sandboxv1alpha1.Sandbox + for _, obj := range tc.initialObjs { + if s, ok := obj.(*sandboxv1alpha1.Sandbox); ok && s.Name == sandboxName { + sb = s + break + } + } + + if sb == nil { + sb = &sandboxv1alpha1.Sandbox{} + sb.Name = sandboxName + sb.Namespace = sandboxNs + sb.Generation = 1 + tc.initialObjs = append(tc.initialObjs, sb) + } + + // Test sandboxes do not have a creation timestamp, create a default + if tc.creationTime != nil { + sb.CreationTimestamp = metav1.NewTime(*tc.creationTime) + } else if sb.CreationTimestamp.IsZero() { + sb.CreationTimestamp = metav1.Now() + } sb.Spec = tc.sandboxSpec r := SandboxReconciler{ - Client: newFakeClient(append(tc.initialObjs, sb)...), + Client: newFakeClient(tc.initialObjs...), Scheme: Scheme, Tracer: asmetrics.NewNoOp(), } @@ -944,7 +1075,11 @@ func TestSandboxExpiry(t *testing.T) { } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - sandbox := &sandboxv1alpha1.Sandbox{} + sandbox := &sandboxv1alpha1.Sandbox{ + Spec: sandboxv1alpha1.SandboxSpec{ + Lifecycle: sandboxv1alpha1.Lifecycle{}, + }, + } sandbox.Spec.ShutdownTime = tc.shutdownTime if tc.deletionPolicy != "" { sandbox.Spec.ShutdownPolicy = ptr.To(tc.deletionPolicy) @@ -959,3 +1094,66 @@ func TestSandboxExpiry(t *testing.T) { }) } } + +func TestCheckProgressDeadline(t *testing.T) { + now := time.Now() + testCases := []struct { + name string + creationTime time.Time + deadline *int32 + wantHit bool + wantRequeue bool + }{ + { + name: "under deadline - default", + creationTime: now.Add(-5 * time.Minute), + deadline: nil, + wantHit: false, + wantRequeue: true, + }, + { + name: "over deadline - default", + creationTime: now.Add(-11 * time.Minute), + deadline: nil, + wantHit: true, + wantRequeue: false, + }, + { + name: "under deadline - custom", + creationTime: now.Add(-1 * time.Minute), + deadline: ptr.To(int32(120)), // 2 minutes + wantHit: false, + wantRequeue: true, + }, + { + name: "over deadline - custom", + creationTime: now.Add(-3 * time.Minute), + deadline: ptr.To(int32(120)), + wantHit: true, + wantRequeue: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + sandbox := &sandboxv1alpha1.Sandbox{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(tc.creationTime), + }, + Spec: sandboxv1alpha1.SandboxSpec{ + Lifecycle: sandboxv1alpha1.Lifecycle{ + ProgressDeadlineSeconds: tc.deadline, + }, + }, + } + hit, requeueAfter := checkProgressDeadline(sandbox) + require.Equal(t, tc.wantHit, hit) + if tc.wantRequeue { + // Verify we schedule reconciliation exactly when the timer expires + require.Greater(t, requeueAfter, time.Duration(0)) + } else { + require.Equal(t, time.Duration(0), requeueAfter) + } + }) + } +} diff --git a/k8s/crds/agents.x-k8s.io_sandboxes.yaml b/k8s/crds/agents.x-k8s.io_sandboxes.yaml index 10765ca51..8d47293e1 100644 --- a/k8s/crds/agents.x-k8s.io_sandboxes.yaml +++ b/k8s/crds/agents.x-k8s.io_sandboxes.yaml @@ -3814,6 +3814,11 @@ spec: required: - spec type: object + progressDeadlineSeconds: + default: 600 + format: int32 + minimum: 0 + type: integer replicas: format: int32 maximum: 1 diff --git a/test/e2e/shutdown_test.go b/test/e2e/shutdown_test.go index d5cc46754..15291972f 100644 --- a/test/e2e/shutdown_test.go +++ b/test/e2e/shutdown_test.go @@ -70,10 +70,8 @@ func TestSandboxShutdownTime(t *testing.T) { // Set a shutdown time that ends shortly shutdown := metav1.NewTime(time.Now().Add(10 * time.Second)) - framework.MustUpdateObject(tc.ClusterClient, sandboxObj, func(obj *sandboxv1alpha1.Sandbox) { - obj.Spec.ShutdownTime = &shutdown - }) - + sandboxObj.Spec.ShutdownTime = &shutdown + require.NoError(t, tc.Update(t.Context(), sandboxObj)) // Wait for sandbox status to reflect new state p = []predicates.ObjectPredicate{ predicates.SandboxHasStatus(sandboxv1alpha1.SandboxStatus{