Skip to content

Commit cdb4bf3

Browse files
committed
gpu: add support for additional temperature limits
Use the existing "temp-limit" as the global limit, and introduce GPU and memory thresholds.

Signed-off-by: Tuomas Katila <[email protected]>
1 parent 3c7c5f5 commit cdb4bf3

File tree

3 files changed

+40
-17
lines changed

3 files changed

+40
-17
lines changed

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ type cliOptions struct {
7070
allowIDs string
7171
denyIDs string
7272
sharedDevNum int
73-
temperatureLimit int
73+
globalTempLimit int
74+
memoryTempLimit int
75+
gpuTempLimit int
7476
enableMonitoring bool
7577
wslScan bool
7678
healthManagement bool
@@ -402,13 +404,13 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
402404
return health
403405
}
404406

405-
limit := float64(dp.options.temperatureLimit)
406-
407407
// Temperatures for different areas
408-
klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC",
408+
klog.V(4).Infof("Temperatures: Memory=%dC, GPU=%dC, Global=%dC",
409409
deviceTemps.Memory, deviceTemps.GPU, deviceTemps.Global)
410410

411-
if deviceTemps.GPU > limit || deviceTemps.Global > limit || deviceTemps.Memory > limit {
411+
if deviceTemps.GPU > dp.options.gpuTempLimit ||
412+
deviceTemps.Global > dp.options.globalTempLimit ||
413+
deviceTemps.Memory > dp.options.memoryTempLimit {
412414
health = pluginapi.Unhealthy
413415
}
414416

@@ -784,7 +786,9 @@ func main() {
784786
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
785787
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
786788
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
787-
flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy")
789+
flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy")
790+
flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", 100, "GPU temperature limit at which device is marked unhealthy")
791+
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", 100, "Memory temperature limit at which device is marked unhealthy")
788792
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
789793
flag.StringVar(&opts.allowIDs, "allow-ids", "", "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)")
790794
flag.StringVar(&opts.denyIDs, "deny-ids", "", "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)")

cmd/gpu_plugin/gpu_plugin_test.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,11 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) {
5858
}
5959

6060
type mockL0Service struct {
61-
indices []uint32
62-
memSize uint64
63-
healthy bool
64-
fail bool
61+
indices []uint32
62+
memSize uint64
63+
healthy bool
64+
failTemp bool
65+
fail bool
6566
}
6667

6768
func (m *mockL0Service) Run(keep bool) {
@@ -83,7 +84,7 @@ func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.Dev
8384
return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil
8485
}
8586
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
86-
if m.fail {
87+
if m.fail || m.failTemp {
8788
return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error")
8889
}
8990

@@ -608,6 +609,24 @@ func TestScanWithHealth(t *testing.T) {
608609
healthy: true,
609610
},
610611
},
612+
{
613+
name: "one device with failure on temp reading",
614+
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
615+
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
616+
sysfsfiles: map[string][]byte{
617+
"card0/device/vendor": []byte("0x8086"),
618+
},
619+
devfsdirs: []string{
620+
"card0",
621+
"by-path/pci-0000:00:00.0-card",
622+
"by-path/pci-0000:00:00.0-render",
623+
},
624+
expectedI915Devs: 1,
625+
l0mock: &mockL0Service{
626+
healthy: true,
627+
failTemp: true,
628+
},
629+
},
611630
{
612631
name: "one unhealthy device with proper symlink",
613632
pciAddresses: map[string]string{"0000:00:00.0": "card0"},

cmd/gpu_plugin/levelzeroservice/levelzero_service.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ type DeviceHealth struct {
3939
}
4040

4141
type DeviceTemperature struct {
42-
Global float64
43-
GPU float64
44-
Memory float64
42+
Global int
43+
GPU int
44+
Memory int
4545
}
4646

4747
type clientNotReadyErr struct{}
@@ -175,9 +175,9 @@ func (l *levelzero) GetDeviceTemperature(bdfAddress string) (DeviceTemperature,
175175
}
176176

177177
return DeviceTemperature{
178-
Global: temps.Global,
179-
GPU: temps.Gpu,
180-
Memory: temps.Memory,
178+
Global: int(temps.Global),
179+
GPU: int(temps.Gpu),
180+
Memory: int(temps.Memory),
181181
}, nil
182182
}
183183

0 commit comments

Comments
 (0)