From f3d169633fd2385c4997de2648b1cffaaf622a0c Mon Sep 17 00:00:00 2001 From: l1b0k Date: Fri, 17 Jan 2025 15:34:47 +0800 Subject: [PATCH] fix: eni may detached in race condition Signed-off-by: l1b0k --- pkg/eni/local.go | 28 +++++++++++++++++++++++++--- pkg/eni/local_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/pkg/eni/local.go b/pkg/eni/local.go index dc00c67b..bb2b8536 100644 --- a/pkg/eni/local.go +++ b/pkg/eni/local.go @@ -824,8 +824,7 @@ func (l *Local) Dispose(n int) int { // 1. check if can dispose the eni if n >= max(len(l.ipv4), len(l.ipv6)) { - eniType := strings.ToLower(l.eniType) - if eniType != "trunk" && eniType != "erdma" && !l.eni.Trunk && len(l.ipv4.InUse()) == 0 && len(l.ipv6.InUse()) == 0 { + if l.canDispose() { log.Info("dispose eni") l.status = statusDeleting return max(len(l.ipv4), len(l.ipv6)) @@ -902,7 +901,11 @@ func (l *Local) factoryDisposeWorker(ctx context.Context) { if l.status == statusDeleting { // remove the eni - + if !l.canDispose() { + l.cond.Wait() + // this can't happen + continue + } l.cond.L.Unlock() err := l.rateLimitEni.Wait(ctx) @@ -1082,6 +1085,25 @@ func (l *Local) commit(ctx context.Context, respCh chan *AllocResp, ipv4, ipv6 * } } +// canDispose will check current eni status , and make sure no pending jobs ,and no pod is using this eni +func (l *Local) canDispose() bool { + if l.eni == nil { + return true + } + eniType := strings.ToLower(l.eniType) + switch eniType { + case "trunk", "erdma": + return false + } + if l.eni.Trunk { + return false + } + return len(l.ipv4.InUse()) == 0 && + len(l.ipv6.InUse()) == 0 && + l.allocatingV4.Len() == 0 && + l.allocatingV6.Len() == 0 +} + // syncIPLocked will mark ip as invalid , if not found in remote func syncIPLocked(lo Set, remote []netip.Addr) { s := sets.New[netip.Addr](remote...) diff --git a/pkg/eni/local_test.go b/pkg/eni/local_test.go index b7cf1428..d8cb34e6 100644 --- a/pkg/eni/local_test.go +++ b/pkg/eni/local_test.go @@ -186,6 +186,25 @@ func TestLocal_AllocWorker_ParentCancelContext(t *testing.T) { assert.False(t, ok) } +func TestLocal_AllocWorker_UpdateCache(t *testing.T) { + local := NewLocalTest(&daemon.ENI{ID: "eni-1"}, nil, &types.PoolConfig{ + EnableIPv4: true, + }, "") + cni := &daemon.CNI{PodID: "pod-1"} + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + respCh := make(chan *AllocResp) + + req := NewLocalIPRequest() + req.NoCache = true + go local.allocWorker(ctx, cni, req, respCh) + req.cancel() + result, ok := <-respCh + assert.True(t, ok) + assert.NotNil(t, result) +} + func TestLocal_Dispose(t *testing.T) { local := NewLocalTest(&daemon.ENI{ID: "eni-1"}, nil, &types.PoolConfig{}, "") local.status = statusInUse @@ -229,6 +248,19 @@ func TestLocal_Allocate_NoCache(t *testing.T) { assert.Equal(t, 1, len(resp)) } +func TestLocal_DisposeFailWhenAllocatingIsNotEmpty(t *testing.T) { + local := NewLocalTest(&daemon.ENI{ID: "eni-1"}, nil, &types.PoolConfig{}, "") + local.status = statusInUse + local.ipv4.Add(NewValidIP(netip.MustParseAddr("192.0.2.1"), true)) + local.ipv6.Add(NewValidIP(netip.MustParseAddr("fd00:46dd:e::1"), false)) + + local.allocatingV4 = append(local.allocatingV4, NewLocalIPRequest()) + n := local.Dispose(1) + + assert.Equal(t, 1, n) + assert.Equal(t, statusInUse, local.status) +} + func TestLocal_Allocate_NoCache_AllocSuccess(t *testing.T) { local := NewLocalTest(&daemon.ENI{ID: "eni-1"}, nil, &types.PoolConfig{ MaxIPPerENI: 10, EnableIPv4: true, EnableIPv6: true}, "")