Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions kubernetes/internal/controller/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,13 +467,19 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
return nil, err
}

// Build a set of live pod names for stale-allocation detection.
livePodSet := make(map[string]struct{}, len(spec.Pods))
for _, pod := range spec.Pods {
livePodSet[pod.Name] = struct{}{}
}

// GC + per-sandbox allocation requests: build SandboxRequests for all existing sandboxes and
// append orphan entries for pods whose sandbox no longer exists (e.g. force-deleted).
// Orphan entries carry PodSupplement=0 and ToRelease=orphan pods so the normal recycle path
// handles them without any special-casing outside this function.
// Terminating sandboxes are handled inside getSandboxRequest: they receive no new supplement and
// all unreleased pods are queued for release.
allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation)
allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation, livePodSet)
if err != nil {
return nil, err
}
Expand All @@ -494,13 +500,15 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec
// orphan entries for pods in podAllocation whose sandbox is no longer in the sandboxes list
// (e.g. force-deleted). Orphan entries carry PodSupplement=0 and ToRelease set to the orphan
// pods so the normal recycle path handles them without special-casing in the caller.
func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string) ([]*algorithm.SandboxRequest, error) {
// livePodSet is the set of pod names currently known to the pool; it is used to detect stale
// annotation entries (pods that were deleted externally) so that supplement is recomputed correctly.
func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string, livePodSet map[string]struct{}) ([]*algorithm.SandboxRequest, error) {
log := logf.FromContext(ctx)
existingSandboxes := make(map[string]struct{}, len(sandboxes))
allRequest := make([]*algorithm.SandboxRequest, 0, len(sandboxes))
for _, sandbox := range sandboxes {
existingSandboxes[sandbox.Name] = struct{}{}
request, err := allocator.getSandboxRequest(ctx, sandbox)
request, err := allocator.getSandboxRequest(ctx, sandbox, livePodSet)
if err != nil {
return nil, err
}
Expand All @@ -523,7 +531,11 @@ func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes
return allRequest, nil
}

func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox) (*algorithm.SandboxRequest, error) {
// getSandboxRequest computes the allocation request for a single sandbox.
// livePodSet is the set of pod names currently known to the pool; any name in the sandbox's
// alloc-status annotation that is absent from this set is treated as deleted and excluded from
// the effective-allocation count, causing supplement to increase so a replacement is scheduled.
func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, livePodSet map[string]struct{}) (*algorithm.SandboxRequest, error) {
log := logf.FromContext(ctx)
allocated, err := allocator.GetSandboxAllocation(ctx, sandbox)
if err != nil {
Expand Down Expand Up @@ -577,9 +589,23 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo
replica = *sandbox.Spec.Replicas
}

// Count only pods that still exist in the live pool pod set.
// Pods deleted externally (eviction, OOM kill, manual delete) remain in the alloc-status
// annotation but are absent from livePodSet; excluding them makes supplement > 0 so the
// pool schedules a replacement (fixes issue #954).
effectiveAllocated := int32(0)
for _, p := range allocated {
if _, alive := livePodSet[p]; alive {
effectiveAllocated++
} else {
log.Info("Detected stale allocation entry: pod no longer exists, excluding from effective count",
"sandbox", sandbox.Name, "pod", p)
}
}

supplement := int32(0)
if replica-int32(len(allocated)) > 0 {
supplement = replica - int32(len(allocated))
if replica-effectiveAllocated > 0 {
supplement = replica - effectiveAllocated

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Drop stale pods before allocating replacements

When an allocated pod disappears and a replacement is scheduled, this path only lowers the supplement count; it still returns the original allocated slice, so getLatestAllocated appends the new pod to the stale annotation and SyncSandboxAllocation writes both the deleted pod and the replacement back into the in-memory pool allocation. In the inspected pool reconcile flow, LatestAllocation is then used for allocatedCnt and status, so a one-replica sandbox whose old pod was deleted can permanently look like it has two allocated pods and can over-scale/retain extra idle pods. The stale names should be removed from the allocation being persisted or released from the store when they are detected.

Useful? React with 👍 / 👎.

}
Comment on lines +592 to 612

return &algorithm.SandboxRequest{
Expand Down
29 changes: 29 additions & 0 deletions kubernetes/internal/controller/allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,35 @@ func TestSchedule(t *testing.T) {
PodSupplement: 0,
},
},
{
// Regression test for issue #954: when a pod recorded in alloc-status is no longer
// present in the pool's live pod set (e.g. manual delete, node eviction, OOM kill),
// getSandboxRequest must exclude it from the effective-allocation count so that
// supplement > 0 and the pool schedules a replacement.
name: "allocated pod deleted externally - supplement triggers replacement",
spec: &AllocSpec{
// pool has one new ready pod; the previously-allocated pod1 is gone
Pods: []*corev1.Pod{
{ObjectMeta: metav1.ObjectMeta{Name: "pod2"}, Status: corev1.PodStatus{Phase: corev1.PodRunning, Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}},
},
Pool: &sandboxv1alpha1.Pool{ObjectMeta: metav1.ObjectMeta{Name: "pool1"}},
Sandboxes: []*sandboxv1alpha1.BatchSandbox{
{ObjectMeta: metav1.ObjectMeta{Name: "sbx1"}, Spec: sandboxv1alpha1.BatchSandboxSpec{Replicas: &replica1}},
},
},
// pool store still shows pod1 allocated to sbx1 (not yet cleaned up)
poolAlloc: &PoolAllocation{PodAllocation: map[string]string{"pod1": "sbx1"}},
// annotation still records pod1 as allocated
sandboxAllocs: map[string]*SandboxAllocation{"sbx1": {Pods: []string{"pod1"}}},
releases: map[string]*AllocationRelease{"sbx1": {Pods: []string{}}},
released: map[string]*AllocationReleased{"sbx1": {Pods: []string{}}},
// pod1 absent from live pod set → effectiveAllocated=0 → supplement=1 → pod2 allocated
wantAction: &algorithm.AllocAction{
ToAllocate: map[string][]string{"sbx1": {"pod2"}},
ToRelease: map[string][]string{},
PodSupplement: 0,
},
},
{
name: "orphan sandbox - pods in store but sandbox no longer in spec",
spec: &AllocSpec{
Expand Down
Loading