Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions extensions/controllers/sandboxclaim_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,13 @@ func (r *SandboxClaimReconciler) adoptSandboxFromCandidates(ctx context.Context,
if adopted.Annotations == nil {
adopted.Annotations = make(map[string]string)
}
// Ensure the adopted sandbox records its pod name before it can be observed Ready.
if podName := adopted.Annotations[v1alpha1.SandboxPodNameAnnotation]; podName != adopted.Name {
if podName != "" {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It appears log.Info is being used here directly. If log is a package-level or globally scoped logger, it is recommended to retrieve the context-aware logger using logger := log.FromContext(ctx). This ensures trace IDs and other contextual request data are injected into the logs.

log.Info("Correcting adopted sandbox pod-name annotation", "sandbox", adopted.Name, "oldPodName", podName, "newPodName", adopted.Name)
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assigning adopted.Annotations[v1alpha1.SandboxPodNameAnnotation] = adopted.Name mutates the object. While controller-runtime clients (Get/List) return deep copies by default making this safe, it is crucial to ensure adopted was not retrieved directly from a raw informer cache elsewhere in the chain. Direct mutation of cached objects causes data races and cache corruption.

adopted.Annotations[v1alpha1.SandboxPodNameAnnotation] = adopted.Name
}
if tc, ok := claim.Annotations[asmetrics.TraceContextAnnotation]; ok {
adopted.Annotations[asmetrics.TraceContextAnnotation] = tc
}
Expand Down
52 changes: 52 additions & 0 deletions extensions/controllers/sandboxclaim_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ func TestSandboxClaimSandboxAdoption(t *testing.T) {
existingObjects []client.Object
expectSandboxAdoption bool
expectedAdoptedSandbox string
expectedAnnotations map[string]string
expectNewSandboxCreated bool
simulateConflicts int
}{
Expand Down Expand Up @@ -927,6 +928,46 @@ func TestSandboxClaimSandboxAdoption(t *testing.T) {
expectedAdoptedSandbox: "not-ready-1",
expectNewSandboxCreated: false,
},
{
name: "corrects stale pod-name annotation when adopting sandbox",
existingObjects: []client.Object{
template,
claim,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By assigning a completely new map to sb.Annotations, you overwrite any default annotations that createWarmPoolSandbox might have initialized. It's safer to check if sb.Annotations is nil, initialize if so, and assign your specific key-value pair instead of overwriting the whole map.

func() client.Object {
sb := createWarmPoolSandbox("pool-sb-1", metav1.Time{Time: metav1.Now().Add(-1 * time.Hour)}, true)
sb.Annotations = map[string]string{
sandboxv1alpha1.SandboxPodNameAnnotation: "stale-pod-name",
}
return sb
}(),
createWarmPoolSandbox("pool-sb-2", metav1.Time{Time: metav1.Now().Add(-30 * time.Minute)}, true),
},
expectSandboxAdoption: true,
expectedAdoptedSandbox: "pool-sb-1",
expectNewSandboxCreated: false,
},
{
name: "accepts existing correct pod-name annotation when adopting sandbox",
existingObjects: []client.Object{
template,
claim,
func() client.Object {
sb := createWarmPoolSandbox("pool-sb-1", metav1.Time{Time: metav1.Now().Add(-1 * time.Hour)}, true)
sb.Annotations = map[string]string{
sandboxv1alpha1.SandboxPodNameAnnotation: "pool-sb-1",
"test.annotation/preserved": "true",
}
return sb
}(),
createWarmPoolSandbox("pool-sb-2", metav1.Time{Time: metav1.Now().Add(-30 * time.Minute)}, true),
},
expectSandboxAdoption: true,
expectedAdoptedSandbox: "pool-sb-1",
expectedAnnotations: map[string]string{
"test.annotation/preserved": "true",
},
expectNewSandboxCreated: false,
},
{
name: "retries on conflict when adopting sandbox",
existingObjects: []client.Object{
Expand Down Expand Up @@ -1009,6 +1050,17 @@ func TestSandboxClaimSandboxAdoption(t *testing.T) {
t.Errorf("expected adopted sandbox to be controlled by claim, got %v", controllerRef)
}

// 4. Verify the adopted sandbox records the adopted pod name
if val := adoptedSandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; val != adoptedSandbox.Name {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If adoptedSandbox.Annotations could theoretically be nil in other test cases, this manual check won't panic (as map lookup on nil is safe in Go), but using testing helpers like require.Equal(t, adoptedSandbox.Name, adoptedSandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]) provides much more readable diffs on test failure.

t.Errorf("expected adopted sandbox to have %q annotation %q, got %q; annotations=%v", sandboxv1alpha1.SandboxPodNameAnnotation, adoptedSandbox.Name, val, adoptedSandbox.Annotations)
}

for key, expected := range tc.expectedAnnotations {
if val := adoptedSandbox.Annotations[key]; val != expected {
t.Errorf("expected adopted sandbox to preserve annotation %q=%q, got %q; annotations=%v", key, expected, val, adoptedSandbox.Annotations)
}
}

} else if tc.expectNewSandboxCreated {
// Verify a new sandbox was created with the claim's name
var sandbox sandboxv1alpha1.Sandbox
Expand Down
194 changes: 149 additions & 45 deletions test/e2e/extensions/warmpool_sandbox_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,28 @@
package extensions

import (
"context"
"fmt"
"testing"
"time"

"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/watch"
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
extensionsv1alpha1 "sigs.k8s.io/agent-sandbox/extensions/api/v1alpha1"
"sigs.k8s.io/agent-sandbox/test/e2e/framework"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func TestWarmPoolSandboxWatcher(t *testing.T) {
tc := framework.NewTestContext(t)

ns := &corev1.Namespace{}
ns.Name = fmt.Sprintf("warmpool-watcher-test-%d", time.Now().UnixNano())
require.NoError(t, tc.CreateWithCleanup(t.Context(), ns))

// Create a SandboxTemplate
func newWarmPoolTemplate(namespace string) *extensionsv1alpha1.SandboxTemplate {
template := &extensionsv1alpha1.SandboxTemplate{}
template.Name = "test-template"
template.Namespace = ns.Name
template.Namespace = namespace
template.Spec.PodTemplate = sandboxv1alpha1.PodTemplate{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
Expand All @@ -50,39 +47,128 @@ func TestWarmPoolSandboxWatcher(t *testing.T) {
},
},
}
require.NoError(t, tc.CreateWithCleanup(t.Context(), template))
return template
}

// Create a SandboxWarmPool
warmPool := &extensionsv1alpha1.SandboxWarmPool{}
warmPool.Name = "test-warmpool"
warmPool.Namespace = ns.Name
warmPool.Spec.TemplateRef.Name = template.Name
warmPool.Spec.Replicas = 1
require.NoError(t, tc.CreateWithCleanup(t.Context(), warmPool))
func waitForWarmPoolSandboxReady(t *testing.T, tc *framework.TestContext, namespace string, warmPool *extensionsv1alpha1.SandboxWarmPool) {
t.Helper()

// Wait for warm pool Sandbox to become ready
var poolSandboxName string
require.Eventually(t, func() bool {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Go versions prior to 1.22, taking the address of a loop variable (&sb) captures the same memory address across iterations, leading to unpredictable behavior if isSandboxReady or metav1.IsControlledBy persist the pointer. Consider passing by value or using &sandboxList.Items[i] for pointer safety.

sandboxList := &sandboxv1alpha1.SandboxList{}
if err := tc.List(t.Context(), sandboxList, client.InNamespace(ns.Name)); err != nil {
if err := tc.List(t.Context(), sandboxList, client.InNamespace(namespace)); err != nil {
return false
}
for _, sb := range sandboxList.Items {
if sb.DeletionTimestamp.IsZero() && metav1.IsControlledBy(&sb, warmPool) {
for _, cond := range sb.Status.Conditions {
if cond.Type == string(sandboxv1alpha1.SandboxConditionReady) && cond.Status == metav1.ConditionTrue {
poolSandboxName = sb.Name
return true
}
}
if sb.DeletionTimestamp.IsZero() && metav1.IsControlledBy(&sb, warmPool) && isSandboxReady(&sb) {
return true
}
}
return false
}, 60*time.Second, 2*time.Second, "warm pool sandbox should become ready")
}

// Find the pod name from the pool sandbox
poolSandbox := &sandboxv1alpha1.Sandbox{}
require.NoError(t, tc.Get(t.Context(), types.NamespacedName{Name: poolSandboxName, Namespace: ns.Name}, poolSandbox))
func waitForClaimReady(t *testing.T, tc *framework.TestContext, claim *extensionsv1alpha1.SandboxClaim) {
t.Helper()

require.Eventually(t, func() bool {
if err := tc.Get(t.Context(), types.NamespacedName{Name: claim.Name, Namespace: claim.Namespace}, claim); err != nil {
return false
}
return claim.Status.SandboxStatus.Name != "" && isClaimReady(claim)
}, 30*time.Second, 1*time.Second, "claim should become ready")
}

func isSandboxReady(sb *sandboxv1alpha1.Sandbox) bool {
for _, cond := range sb.Status.Conditions {
if cond.Type == string(sandboxv1alpha1.SandboxConditionReady) && cond.Status == metav1.ConditionTrue {
return true
}
}
return false
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function isClaimReady checks for the constant sandboxv1alpha1.SandboxConditionReady. Ensure that SandboxClaim shares the exact same constant for its readiness condition type as Sandbox. If SandboxClaim has its own distinct condition type, using the Sandbox constant could lead to false negatives.

}

func isClaimReady(claim *extensionsv1alpha1.SandboxClaim) bool {
for _, cond := range claim.Status.Conditions {
if cond.Type == string(sandboxv1alpha1.SandboxConditionReady) && cond.Status == metav1.ConditionTrue {
return true
}
}
return false
}

func requirePodNameAnnotationWhenReady(
t *testing.T,
tc *framework.TestContext,
namespace string,
claim *extensionsv1alpha1.SandboxClaim,
) {
t.Helper()

ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second)
defer cancel()
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using metav1.ListOptions{} without specifying a ResourceVersion means the watch starts from the current cluster state. While Watch establishment is synchronous, in highly loaded clusters there's a micro-race where the Sandbox might be updated right before the watch fully connects. Consider listing with a ResourceVersion and watching from that specific version to guarantee no missed events.


// Use a direct API watch to avoid the async subscription race in framework.Watch
sandboxWatcher, err := tc.DynamicClient().Resource(
sandboxv1alpha1.GroupVersion.WithResource("sandboxes"),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The helper function requirePodNameAnnotationWhenReady implies it strictly performs assertions based on the name, but it actively creates the claim object via tc.CreateWithCleanup. Consider renaming it to something like createClaimAndVerifyPodNameAnnotationBeforeReady so the side-effect of resource creation is clear to readers.

).Namespace(namespace).Watch(ctx, metav1.ListOptions{})
require.NoError(t, err)
defer sandboxWatcher.Stop()

require.NoError(t, tc.CreateWithCleanup(t.Context(), claim))

for {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line require.True(t, ok, ...) immediately fails the test if the watch channel closes. Kubernetes API server watches can close naturally (e.g. from timeouts or API server restarts). For more robust E2E tests, handle channel closure by gracefully re-establishing the watch instead of a hard failure.

select {
case <-ctx.Done():
t.Fatalf("timed out watching adopted sandbox readiness: %v", ctx.Err())
case event, ok := <-sandboxWatcher.ResultChan():
require.True(t, ok, "sandbox watch closed before observing adopted sandbox readiness")
require.NotEqual(t, watch.Error, event.Type, "received error event while watching sandboxes")

if event.Type == watch.Deleted {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The watch event might occasionally yield a *metav1.Status object (especially during watch errors or timeouts from the APIServer). Assuming it is always *unstructured.Unstructured and failing via require.True will cause flaky tests. It's better to explicitly check for *metav1.Status and log/retry, or safely ignore it.

continue
}

u, ok := event.Object.(*unstructured.Unstructured)
require.True(t, ok, "unexpected sandbox watch event object type: %T", event.Object)

sb := &sandboxv1alpha1.Sandbox{}
require.NoError(t, runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, sb))

controllerRef := metav1.GetControllerOf(sb)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If isSandboxReady(sb) returns false, the loop silently continues. If the test hangs because readiness is never reached, it will simply timeout after 60s with no clues. Adding a t.Logf("Observed sandbox %s controlled by claim, but not ready yet", sb.Name) right before the continue will greatly improve debuggability for flaky CI runs.

if controllerRef == nil || controllerRef.Kind != "SandboxClaim" || controllerRef.Name != claim.Name {
continue
}
if !isSandboxReady(sb) {
continue
}
if sb.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation] == "" {
t.Fatalf("observed adopted sandbox %s Ready=True without %s annotation", sb.Name, sandboxv1alpha1.SandboxPodNameAnnotation)
}
return
}
}
}

func TestWarmPoolSandboxWatcher(t *testing.T) {
tc := framework.NewTestContext(t)

ns := &corev1.Namespace{}
ns.Name = fmt.Sprintf("warmpool-watcher-test-%d", time.Now().UnixNano())
require.NoError(t, tc.CreateWithCleanup(t.Context(), ns))

template := newWarmPoolTemplate(ns.Name)
require.NoError(t, tc.CreateWithCleanup(t.Context(), template))

// Create a SandboxWarmPool
warmPool := &extensionsv1alpha1.SandboxWarmPool{}
warmPool.Name = "test-warmpool"
warmPool.Namespace = ns.Name
warmPool.Spec.TemplateRef.Name = template.Name
warmPool.Spec.Replicas = 1
require.NoError(t, tc.CreateWithCleanup(t.Context(), warmPool))

// Wait for warm pool Sandbox to become ready
waitForWarmPoolSandboxReady(t, tc, ns.Name, warmPool)

// Create a SandboxClaim to adopt the warm pool sandbox
claim := &extensionsv1alpha1.SandboxClaim{}
Expand All @@ -92,20 +178,7 @@ func TestWarmPoolSandboxWatcher(t *testing.T) {
require.NoError(t, tc.CreateWithCleanup(t.Context(), claim))

// Wait for claim to be ready with sandbox name in status
require.Eventually(t, func() bool {
if err := tc.Get(t.Context(), types.NamespacedName{Name: claim.Name, Namespace: ns.Name}, claim); err != nil {
return false
}
if claim.Status.SandboxStatus.Name == "" {
return false
}
for _, cond := range claim.Status.Conditions {
if cond.Type == string(sandboxv1alpha1.SandboxConditionReady) && cond.Status == metav1.ConditionTrue {
return true
}
}
return false
}, 30*time.Second, 1*time.Second, "claim should become ready")
waitForClaimReady(t, tc, claim)

// Verify the adopted sandbox is now owned by the claim
adoptedSandbox := &sandboxv1alpha1.Sandbox{}
Expand All @@ -117,7 +190,7 @@ func TestWarmPoolSandboxWatcher(t *testing.T) {

// Find the pod belonging to the adopted sandbox
podName := adoptedSandbox.Name
if ann, ok := adoptedSandbox.Annotations["agents.x-k8s.io/pod-name"]; ok && ann != "" {
if ann, ok := adoptedSandbox.Annotations[sandboxv1alpha1.SandboxPodNameAnnotation]; ok && ann != "" {
podName = ann
}
adoptedPod := &corev1.Pod{}
Expand All @@ -138,3 +211,34 @@ func TestWarmPoolSandboxWatcher(t *testing.T) {
return false
}, 15*time.Second, 500*time.Millisecond, "sandbox should become not-ready after pod deletion")
}

// TestWarmPoolPodNameAnnotationBeforeReady verifies that a warm-pool sandbox
// records its pod-name annotation before adoption can be observed as Ready.
func TestWarmPoolPodNameAnnotationBeforeReady(t *testing.T) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for this PR. This PR also resolves:

  1. Metric calculation:
    // Existence of the SandboxPodNameAnnotation implies the pod was adopted from a warm pool.
  2. Also Client side dependency for pod name annotation.

Can we simplify this test? It's long and hard to read. Can you add some comments and add helper methods wherever needed?

tc := framework.NewTestContext(t)

ns := &corev1.Namespace{}
ns.Name = fmt.Sprintf("warmpool-ready-annotation-test-%d", time.Now().UnixNano())
require.NoError(t, tc.CreateWithCleanup(t.Context(), ns))

template := newWarmPoolTemplate(ns.Name)
require.NoError(t, tc.CreateWithCleanup(t.Context(), template))

warmPool := &extensionsv1alpha1.SandboxWarmPool{}
warmPool.Name = "test-warmpool"
warmPool.Namespace = ns.Name
warmPool.Spec.TemplateRef.Name = template.Name
warmPool.Spec.Replicas = 1
require.NoError(t, tc.CreateWithCleanup(t.Context(), warmPool))

// Start from a Ready warm-pool Sandbox so the claim reconcile path must adopt it
waitForWarmPoolSandboxReady(t, tc, ns.Name, warmPool)

claim := &extensionsv1alpha1.SandboxClaim{}
claim.Name = "test-claim"
claim.Namespace = ns.Name
claim.Spec.TemplateRef.Name = template.Name

// Creating the claim should not observe Ready before the pod-name annotation is set
requirePodNameAnnotationWhenReady(t, tc, ns.Name, claim)
}
5 changes: 5 additions & 0 deletions test/e2e/framework/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ func (cl *ClusterClient) WatchSet() *WatchSet {
return cl.watchSet
}

// DynamicClient returns the dynamic client backing the test framework.
func (cl *ClusterClient) DynamicClient() dynamic.Interface {
return cl.dynamicClient
}

// List retrieves a list of objects matching the provided options.
func (cl *ClusterClient) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
cl.Helper()
Expand Down