
Commit 1879980

Merge pull request #2227 from haircommander/cherry-pick-2200-to-release-4.15
[release-4.15] OCPBUGS-52183: UPSTREAM: <carry>: kubelet/cm: fix bug where kubelet restarts from missing cpuset cgroup
2 parents f383677 + b88a595 commit 1879980

File tree

7 files changed: +111, -23 lines

pkg/kubelet/cm/cgroup_manager_linux.go

Lines changed: 3 additions & 0 deletions

@@ -384,6 +384,9 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
     if resourceConfig.PidsLimit != nil {
         resources.PidsLimit = *resourceConfig.PidsLimit
     }
+    if !resourceConfig.CPUSet.IsEmpty() {
+        resources.CpusetCpus = resourceConfig.CPUSet.String()
+    }
 
     m.maybeSetHugetlb(resourceConfig, resources)
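
The new branch in toResources means that any ResourceConfig carrying a non-empty CPUSet is now forwarded to libcontainer's cpuset controller in its canonical list form. A minimal sketch of that translation; resourceConfig and libcontainerResources below are illustrative stand-ins (not the kubelet's or libcontainer's real types), and only the k8s.io/utils/cpuset calls are the real API:

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// resourceConfig and libcontainerResources stand in for the kubelet's
// ResourceConfig and libcontainer's configs.Resources in this sketch.
type resourceConfig struct {
	CPUSet cpuset.CPUSet
}

type libcontainerResources struct {
	CpusetCpus string
}

// toResources mirrors the added check: only a non-empty set is propagated,
// using the canonical range form (e.g. "0-3,6") that ends up in cpuset.cpus.
func toResources(rc *resourceConfig) *libcontainerResources {
	res := &libcontainerResources{}
	if !rc.CPUSet.IsEmpty() {
		res.CpusetCpus = rc.CPUSet.String()
	}
	return res
}

func main() {
	rc := &resourceConfig{CPUSet: cpuset.New(0, 1, 2, 3, 6)}
	fmt.Println(toResources(rc).CpusetCpus) // prints "0-3,6"
}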

pkg/kubelet/cm/cpumanager/cpu_manager.go

Lines changed: 19 additions & 5 deletions

@@ -94,6 +94,10 @@ type Manager interface {
     // GetCPUAffinity returns cpuset which includes cpus from shared pools
     // as well as exclusively allocated cpus
     GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
+
+    // GetAllCPUs returns all the CPUs known by cpumanager, as reported by the
+    // hardware discovery. Maps to the CPU capacity.
+    GetAllCPUs() cpuset.CPUSet
 }
 
 type manager struct {
@@ -137,7 +141,11 @@ type manager struct {
     // stateFileDirectory holds the directory where the state file for checkpoints is held.
     stateFileDirectory string
 
-    // allocatableCPUs is the set of online CPUs as reported by the system
+    // allCPUs is the set of online CPUs as reported by the system
+    allCPUs cpuset.CPUSet
+
+    // allocatableCPUs is the set of online CPUs as reported by the system,
+    // and available for allocation, minus the reserved set
     allocatableCPUs cpuset.CPUSet
 
     // pendingAdmissionPod contain the pod during the admission phase
@@ -157,6 +165,11 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
     var policy Policy
     var err error
 
+    topo, err = topology.Discover(machineInfo)
+    if err != nil {
+        return nil, err
+    }
+
     switch policyName(cpuPolicyName) {
 
     case PolicyNone:
@@ -166,10 +179,6 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
     }
 
     case PolicyStatic:
-        topo, err = topology.Discover(machineInfo)
-        if err != nil {
-            return nil, err
-        }
         klog.InfoS("Detected CPU topology", "topology", topo)
 
         reservedCPUs, ok := nodeAllocatableReservation[v1.ResourceCPU]
@@ -206,6 +215,7 @@ func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconc
         topology: topo,
         nodeAllocatableReservation: nodeAllocatableReservation,
         stateFileDirectory: stateFileDirectory,
+        allCPUs: topo.CPUDetails.CPUs(),
     }
     manager.sourcesReady = &sourcesReadyStub{}
     return manager, nil
@@ -340,6 +350,10 @@ func (m *manager) GetAllocatableCPUs() cpuset.CPUSet {
     return m.allocatableCPUs.Clone()
 }
 
+func (m *manager) GetAllCPUs() cpuset.CPUSet {
+    return m.allCPUs.Clone()
+}
+
 type reconciledContainer struct {
     podName string
     containerName string
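
With topology discovery now done for every policy, the Manager interface distinguishes the full discovered capacity (GetAllCPUs) from what remains available for exclusive allocation (GetAllocatableCPUs, which is empty under the none policy). A hedged usage sketch; the cpuManager interface and nonePolicyManager type below are illustrative stand-ins, not the kubelet's actual cpumanager types:

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// cpuManager mirrors only the two Manager methods relevant to this change.
type cpuManager interface {
	GetAllCPUs() cpuset.CPUSet
	GetAllocatableCPUs() cpuset.CPUSet
}

// nonePolicyManager imitates the "none" policy: nothing is exclusively
// allocatable, but the full discovered capacity is still reported.
type nonePolicyManager struct {
	all cpuset.CPUSet
}

func (m nonePolicyManager) GetAllCPUs() cpuset.CPUSet         { return m.all.Clone() }
func (m nonePolicyManager) GetAllocatableCPUs() cpuset.CPUSet { return cpuset.CPUSet{} }

func main() {
	var mgr cpuManager = nonePolicyManager{all: cpuset.New(0, 1, 2, 3)}

	// This is the distinction getCgroupConfig (further below) relies on: an
	// empty allocatable set signals the none policy, while GetAllCPUs still
	// yields the full capacity to write into the cpuset cgroup.
	fmt.Println("allocatable empty:", mgr.GetAllocatableCPUs().IsEmpty()) // true
	fmt.Println("all CPUs:", mgr.GetAllCPUs().String())                   // "0-3"
}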

pkg/kubelet/cm/cpumanager/cpu_manager_test.go

Lines changed: 2 additions & 9 deletions

@@ -645,15 +645,8 @@ func TestCPUManagerGenerate(t *testing.T) {
             if rawMgr.policy.Name() != testCase.expectedPolicy {
                 t.Errorf("Unexpected policy name. Have: %q wants %q", rawMgr.policy.Name(), testCase.expectedPolicy)
             }
-            if rawMgr.policy.Name() == string(PolicyNone) {
-                if rawMgr.topology != nil {
-                    t.Errorf("Expected topology to be nil for 'none' policy. Have: %q", rawMgr.topology)
-                }
-            }
-            if rawMgr.policy.Name() != string(PolicyNone) {
-                if rawMgr.topology == nil {
-                    t.Errorf("Expected topology to be non-nil for policy '%v'. Have: %q", rawMgr.policy.Name(), rawMgr.topology)
-                }
+            if rawMgr.topology == nil {
+                t.Errorf("Expected topology to be non-nil for policy '%v'. Have: %q", rawMgr.policy.Name(), rawMgr.topology)
             }
         }
     })

pkg/kubelet/cm/cpumanager/fake_cpu_manager.go

Lines changed: 5 additions & 0 deletions

@@ -85,6 +85,11 @@ func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
     return cpuset.CPUSet{}
 }
 
+func (m *fakeManager) GetAllCPUs() cpuset.CPUSet {
+    klog.InfoS("GetAllCPUs")
+    return cpuset.CPUSet{}
+}
+
 // NewFakeManager creates empty/fake cpu manager
 func NewFakeManager() Manager {
     return &fakeManager{

pkg/kubelet/cm/node_container_manager_linux.go

Lines changed: 22 additions & 9 deletions

@@ -52,7 +52,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
     cgroupConfig := &CgroupConfig{
         Name: cm.cgroupRoot,
         // The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-        ResourceParameters: getCgroupConfig(nodeAllocatable),
+        ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
     }
     if cm.cgroupManager.Exists(cgroupConfig.Name) {
         return nil
@@ -80,7 +80,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
     cgroupConfig := &CgroupConfig{
         Name: cm.cgroupRoot,
-        ResourceParameters: getCgroupConfig(nodeAllocatable),
+        ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
     }
 
     // Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
@@ -114,7 +114,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
     // Now apply kube reserved and system reserved limits if required.
     if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
         klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-        if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
+        if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
             message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
             cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
             return fmt.Errorf(message)
@@ -123,7 +123,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
     }
     if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
         klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-        if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
+        if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
             message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
             cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
             return fmt.Errorf(message)
@@ -134,8 +134,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 }
 
 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
-    rp := getCgroupConfig(rl)
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList) error {
+    cName := cm.cgroupManager.CgroupName(cNameStr)
+    rp := cm.getCgroupConfig(rl)
     if rp == nil {
         return fmt.Errorf("%q cgroup is not configured properly", cName)
     }
@@ -156,17 +157,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
         ResourceParameters: rp,
     }
     klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-    if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+    if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
         return err
     }
-    if err := cgroupManager.Update(cgroupConfig); err != nil {
+    if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
         return err
     }
     return nil
 }
 
 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
     // TODO(vishh): Set CPU Quota if necessary.
     if rl == nil {
         return nil
@@ -188,6 +189,18 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
     }
     rc.HugePageLimit = HugePageLimits(rl)
 
+    // In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+    // By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
+    // However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
+    // doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
+    // An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+    // and this is sufficient.
+    // Only do so on None policy, as Static policy will do its own updating of the cpuset.
+    // Please see the comment on policy none's GetAllocatableCPUs
+    if cm.cpuManager.GetAllocatableCPUs().IsEmpty() {
+        rc.CPUSet = cm.cpuManager.GetAllCPUs()
+    }
+
     return &rc
 }
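
One way to observe the effect of the new fallback: on a cgroup v2 node using the systemd driver with the none CPU manager policy, the node allocatable cgroup's cpuset.cpus should now end up populated with the full CPU set instead of being left for systemd to discard. A rough verification sketch; the kubepods.slice path is an assumption based on the default cgroup root under the systemd driver and a standard /sys/fs/cgroup mount, so adjust it for your node:

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Read the cpuset assigned to the node allocatable cgroup (the path is an
	// assumption based on the default systemd-driver layout).
	data, err := os.ReadFile("/sys/fs/cgroup/kubepods.slice/cpuset.cpus")
	if err != nil {
		fmt.Println("could not read cpuset.cpus:", err)
		return
	}
	cpus := strings.TrimSpace(string(data))
	if cpus == "" {
		fmt.Println("cpuset.cpus is empty; this is the state the fix avoids")
		return
	}
	fmt.Println("cpuset.cpus is populated:", cpus)
}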

pkg/kubelet/cm/types.go

Lines changed: 3 additions & 0 deletions

@@ -19,12 +19,15 @@ package cm
 import (
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/utils/cpuset"
 )
 
 // ResourceConfig holds information about all the supported cgroup resource parameters.
 type ResourceConfig struct {
     // Memory limit (in bytes).
     Memory *int64
+    // CPU set (number of cpus the cgroup has access to).
+    CPUSet cpuset.CPUSet
     // CPU shares (relative weight vs. other containers).
     CPUShares *uint64
     // CPU hardcap limit (in usecs). Allowed cpu time in a given period.

test/e2e_node/node_container_manager_test.go

Lines changed: 57 additions & 0 deletions

@@ -71,6 +71,63 @@ var _ = SIGDescribe("Node Container Manager [Serial]", func() {
             framework.ExpectNoError(runTest(ctx, f))
         })
     })
+    ginkgo.Context("Validate CGroup management", func() {
+        // Regression test for https://issues.k8s.io/125923
+        // In this issue there's a race involved with systemd which seems to manifest most likely, or perhaps only
+        // (data gathered so far seems inconclusive) on the very first boot of the machine, so restarting the kubelet
+        // seems not sufficient. OTOH, the exact reproducer seems to require a dedicate lane with only this test, or
+        // to reboot the machine before to run this test. Both are practically unrealistic in CI.
+        // The closest approximation is this test in this current form, using a kubelet restart. This at least
+        // acts as non regression testing, so it still brings value.
+        ginkgo.It("should correctly start with cpumanager none policy in use with systemd", func(ctx context.Context) {
+            if !IsCgroup2UnifiedMode() {
+                ginkgo.Skip("this test requires cgroups v2")
+            }
+
+            var err error
+            var oldCfg *kubeletconfig.KubeletConfiguration
+            // Get current kubelet configuration
+            oldCfg, err = getCurrentKubeletConfig(ctx)
+            framework.ExpectNoError(err)
+
+            ginkgo.DeferCleanup(func(ctx context.Context) {
+                if oldCfg != nil {
+                    // Update the Kubelet configuration.
+                    framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(oldCfg))
+
+                    ginkgo.By("Restarting the kubelet")
+                    restartKubelet(true)
+
+                    // wait until the kubelet health check will succeed
+                    gomega.Eventually(ctx, func(ctx context.Context) bool {
+                        return kubeletHealthCheck(kubeletHealthCheckURL)
+                    }).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrue())
+                    ginkgo.By("Started the kubelet")
+                }
+            })
+
+            newCfg := oldCfg.DeepCopy()
+            // Change existing kubelet configuration
+            newCfg.CPUManagerPolicy = "none"
+            newCfg.CgroupDriver = "systemd"
+
+            // Update the Kubelet configuration.
+            framework.ExpectNoError(e2enodekubelet.WriteKubeletConfigFile(newCfg))
+
+            ginkgo.By("Restarting the kubelet")
+            restartKubelet(true)
+
+            // wait until the kubelet health check will succeed
+            gomega.Eventually(ctx, func(ctx context.Context) bool {
+                return kubeletHealthCheck(kubeletHealthCheckURL)
+            }).WithTimeout(2 * time.Minute).WithPolling(5 * time.Second).Should(gomega.BeTrue())
+            ginkgo.By("Started the kubelet")
+
+            gomega.Consistently(ctx, func(ctx context.Context) bool {
+                return getNodeReadyStatus(ctx, f) && kubeletHealthCheck(kubeletHealthCheckURL)
+            }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(gomega.BeTrue())
+        })
+    })
 })
 
 func expectFileValToEqual(filePath string, expectedValue, delta int64) error {
