@@ -52,7 +52,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
 		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-		ResourceParameters: getCgroupConfig(nodeAllocatable),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
 	}
 	if cm.cgroupManager.Exists(cgroupConfig.Name) {
 		return nil
@@ -80,7 +80,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
 	cgroupConfig := &CgroupConfig{
 		Name:               cm.cgroupRoot,
-		ResourceParameters: getCgroupConfig(nodeAllocatable),
+		ResourceParameters: cm.getCgroupConfig(nodeAllocatable),
 	}
 
 	// Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
@@ -114,7 +114,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	// Now apply kube reserved and system reserved limits if required.
 	if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.SystemReservedCgroupName), nc.SystemReserved); err != nil {
+		if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved); err != nil {
 			message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return fmt.Errorf(message)
@@ -123,7 +123,7 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 	}
 	if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
 		klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
-		if err := enforceExistingCgroup(cm.cgroupManager, cm.cgroupManager.CgroupName(nc.KubeReservedCgroupName), nc.KubeReserved); err != nil {
+		if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved); err != nil {
 			message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
 			cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
 			return fmt.Errorf(message)
@@ -134,8 +134,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 }
 
 // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
-func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
-	rp := getCgroupConfig(rl)
+func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList) error {
+	cName := cm.cgroupManager.CgroupName(cNameStr)
+	rp := cm.getCgroupConfig(rl)
 	if rp == nil {
 		return fmt.Errorf("%q cgroup is not configured properly", cName)
 	}
@@ -156,17 +157,17 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 		ResourceParameters: rp,
 	}
 	klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
-	if err := cgroupManager.Validate(cgroupConfig.Name); err != nil {
+	if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
 		return err
 	}
-	if err := cgroupManager.Update(cgroupConfig); err != nil {
+	if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
 		return err
 	}
 	return nil
 }
 
 // getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
-func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
+func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 	// TODO(vishh): Set CPU Quota if necessary.
 	if rl == nil {
 		return nil
@@ -188,6 +189,18 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 	}
 	rc.HugePageLimit = HugePageLimits(rl)
 
+	// In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
+	// By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
+	// However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
+	// doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
+	// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
+	// and this is sufficient.
+	// Only do so on None policy, as Static policy will do its own updating of the cpuset.
+	// Please see the comment on policy none's GetAllocatableCPUs
+	if cm.cpuManager.GetAllocatableCPUs().IsEmpty() {
+		rc.CPUSet = cm.cpuManager.GetAllCPUs()
+	}
+
 	return &rc
 }
 
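For context on the new branch at the end of getCgroupConfig: it only takes effect under the None CPU manager policy, since (per the comment referenced in the diff) that policy's GetAllocatableCPUs reports an empty set while GetAllCPUs still returns every online CPU. Converting enforceExistingCgroup and getCgroupConfig from free functions into methods on containerManagerImpl is what gives them access to cm.cpuManager for this check. Below is a minimal, self-contained sketch of that relationship; the cpuSet and noneCPUManager types are simplified stand-ins for illustration only, not the kubelet's real cpuset or CPU manager implementations.

package main

import "fmt"

// cpuSet is a simplified stand-in for the kubelet's cpuset type.
type cpuSet map[int]struct{}

func (s cpuSet) IsEmpty() bool { return len(s) == 0 }
func (s cpuSet) Size() int     { return len(s) }

// noneCPUManager mimics the behaviour the new branch relies on: under the
// None policy no CPUs are exclusively allocatable, so GetAllocatableCPUs
// returns an empty set, while GetAllCPUs still reports every online CPU.
type noneCPUManager struct {
	online cpuSet
}

func (m noneCPUManager) GetAllocatableCPUs() cpuSet { return cpuSet{} }
func (m noneCPUManager) GetAllCPUs() cpuSet         { return m.online }

func main() {
	cpu := noneCPUManager{online: cpuSet{0: {}, 1: {}, 2: {}, 3: {}}}

	// Mirrors the added check: with the None policy the allocatable set is
	// empty, so the node-allocatable ResourceConfig gets an explicit cpuset
	// covering all CPUs, keeping systemd's view of the cpuset cgroup in sync
	// with what exists in cgroupfs.
	if cpu.GetAllocatableCPUs().IsEmpty() {
		fmt.Printf("rc.CPUSet would be set to all %d online CPUs\n", cpu.GetAllCPUs().Size())
	}
}

With the Static policy the allocatable set is non-empty, the branch is skipped, and the CPU manager keeps updating the cpuset itself, as the in-code comment notes.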