Skip to content

Commit 9a033ff

Browse files
vamsi-resolve and claude committed
fix: recover from warm pool pod deletion instead of permanent error loop
Cherry-picks two upstream fixes:

1. kubernetes-sigs#521 — When an adopted warm pool pod is deleted (node failure, drain, eviction), the controller returned a hard error because the agents.x-k8s.io/pod-name annotation pointed to a non-existent pod. This left the Sandbox stuck in a permanent reconcile error loop. Now the controller clears the stale annotation and falls through to create a replacement pod (which remounts the existing PVC).

2. kubernetes-sigs#469 — During warm pool adoption, ensure the pod-name annotation is correct before the sandbox can be observed as Ready. This prevents stale annotations from being set in the first place.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9801457 commit 9a033ff

3 files changed

Lines changed: 39 additions & 19 deletions

File tree

controllers/sandbox_controller.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,12 @@ func (r *SandboxReconciler) reconcilePod(ctx context.Context, sandbox *sandboxv1
477477
return nil, fmt.Errorf("pod get failed: %w", err)
478478
}
479479
if podNameAnnotationExists {
480-
log.Error(err, "Pod not found")
481-
return nil, fmt.Errorf("pod in annotation get failed: %w", err)
480+
log.Info("Tracked pod not found, clearing stale annotation", "podName", podName)
481+
patch := client.MergeFrom(sandbox.DeepCopy())
482+
delete(sandbox.Annotations, sandboxv1alpha1.SandboxPodNameAnnotation)
483+
if patchErr := r.Patch(ctx, sandbox, patch); patchErr != nil {
484+
return nil, fmt.Errorf("failed to clear stale pod name annotation: %w", patchErr)
485+
}
482486
}
483487
pod = nil
484488
}

controllers/sandbox_controller_test.go

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1130,31 +1130,40 @@ func TestReconcilePod(t *testing.T) {
11301130
expectErr: true,
11311131
},
11321132
{
1133-
name: "error when annotated pod does not exist",
1134-
initialObjs: []runtime.Object{},
1135-
sandbox: &sandboxv1alpha1.Sandbox{
1133+
name: "clears stale annotation and creates replacement pod when annotated pod does not exist",
1134+
sandbox: func() *sandboxv1alpha1.Sandbox {
1135+
s := sandboxObj.DeepCopy()
1136+
s.Annotations = map[string]string{
1137+
sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod",
1138+
}
1139+
return s
1140+
}(),
1141+
wantPod: &corev1.Pod{
11361142
ObjectMeta: metav1.ObjectMeta{
1137-
Name: sandboxName,
1138-
Namespace: sandboxNs,
1143+
Name: sandboxName,
1144+
Namespace: sandboxNs,
1145+
ResourceVersion: "1",
1146+
Labels: map[string]string{
1147+
"agents.x-k8s.io/sandbox-name-hash": nameHash,
1148+
"custom-label": "label-val",
1149+
},
11391150
Annotations: map[string]string{
1140-
sandboxv1alpha1.SandboxPodNameAnnotation: "non-existent-pod",
1151+
"custom-annotation": "anno-val",
11411152
},
1153+
OwnerReferences: []metav1.OwnerReference{sandboxControllerRef(sandboxName)},
11421154
},
1143-
Spec: sandboxv1alpha1.SandboxSpec{
1144-
Replicas: ptr.To(int32(1)),
1145-
PodTemplate: sandboxv1alpha1.PodTemplate{
1146-
Spec: corev1.PodSpec{
1147-
Containers: []corev1.Container{
1148-
{
1149-
Name: "test-container",
1150-
},
1151-
},
1155+
Spec: corev1.PodSpec{
1156+
Containers: []corev1.Container{
1157+
{
1158+
Name: "test-container",
11521159
},
11531160
},
11541161
},
11551162
},
1156-
wantPod: nil,
1157-
expectErr: true,
1163+
expectErr: false,
1164+
wantSandboxAnnotations: map[string]string{
1165+
sandboxv1alpha1.SandboxPodNameAnnotation: sandboxName,
1166+
},
11581167
},
11591168
{
11601169
name: "refuses to delete annotated pod owned by a different controller",

extensions/controllers/sandboxclaim_controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,13 @@ func (r *SandboxClaimReconciler) adoptSandboxFromCandidates(ctx context.Context,
442442
if adopted.Annotations == nil {
443443
adopted.Annotations = make(map[string]string)
444444
}
445+
// Ensure the adopted sandbox records its pod name before it can be observed Ready.
446+
if podName := adopted.Annotations[v1alpha1.SandboxPodNameAnnotation]; podName != adopted.Name {
447+
if podName != "" {
448+
logger.Info("Correcting adopted sandbox pod-name annotation", "sandbox", adopted.Name, "oldPodName", podName, "newPodName", adopted.Name)
449+
}
450+
adopted.Annotations[v1alpha1.SandboxPodNameAnnotation] = adopted.Name
451+
}
445452
if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
446453
adopted.Annotations[asmetrics.TraceContextAnnotation] = tc
447454
}

0 commit comments

Comments
 (0)