From da0a03919cb79808058cf3ffc3bde7795ac55ad3 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Fri, 26 Sep 2025 10:12:08 +0300 Subject: [PATCH 1/2] [thermalctld] Change thermal monitor from sub-process to a thread --- sonic-thermalctld/scripts/thermalctld | 4 +- sonic-thermalctld/tests/test_thermalctld.py | 60 ++++++++++----------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 67fcf05d6..bf00ab211 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -14,7 +14,7 @@ from datetime import datetime import sonic_platform from sonic_py_common import daemon_base, logger -from sonic_py_common.task_base import ProcessTaskBase +from sonic_py_common.task_base import ThreadTaskBase from swsscommon import swsscommon @@ -741,7 +741,7 @@ class TemperatureUpdater(logger.Logger): self.chassis_table._del(name) -class ThermalMonitor(ProcessTaskBase): +class ThermalMonitor(ThreadTaskBase): # Initial update interval INITIAL_INTERVAL = 5 diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index 097fe1be0..bd5ffc702 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -1,6 +1,6 @@ import os import sys -import multiprocessing +import threading from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2 # TODO: Clean this up once we no longer need to support Python 2 @@ -131,7 +131,7 @@ class TestFanUpdater(object): @mock.patch('thermalctld.update_entity_info', mock.MagicMock()) def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self): # Test case where fan_drawer.get_name is not implemented - fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(MockChassis(), threading.Event()) mock_fan_drawer = mock.MagicMock() fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1) assert thermalctld.update_entity_info.call_count == 0 @@ -145,7 +145,7 @@ def test_update_fan_with_exception(self): fan.make_over_speed() chassis.get_all_fans().append(fan) - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED assert fan_updater.log_warning.call_count == 1 @@ -162,7 +162,7 @@ def test_set_fan_led_exception(self): mock_fan = MockFan() mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError) - fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(MockChassis(), threading.Event()) fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status) assert fan_updater.log_warning.call_count == 1 fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented') @@ -170,7 +170,7 @@ def test_set_fan_led_exception(self): def test_fan_absent(self): chassis = MockChassis() chassis.make_absent_fan() - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -194,7 +194,7 @@ def test_fan_absent(self): def test_fan_faulty(self): chassis = MockChassis() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -218,7 +218,7 @@ def test_fan_faulty(self): def test_fan_under_speed(self): chassis = MockChassis() chassis.make_under_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -234,7 +234,7 @@ def test_fan_under_speed(self): def test_fan_over_speed(self): chassis = MockChassis() chassis.make_over_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -253,7 +253,7 @@ def test_update_psu_fans(self): mock_fan = MockFan() psu._fan_list.append(mock_fan) chassis._psu_list.append(psu) - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 0 @@ -274,7 +274,7 @@ def test_update_module_fans(self): chassis.set_modular_chassis(True) module._fan_list.append(mock_fan) chassis._module_list.append(module) - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 0 @@ -321,7 +321,7 @@ def test_insufficient_fan_number(): chassis = MockChassis() chassis.make_absent_fan() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) + fan_updater = thermalctld.FanUpdater(chassis, threading.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 3 expected_calls = [ @@ -405,7 +405,7 @@ class TestTemperatureUpdater(object): """ def test_deinit(self): chassis = MockChassis() - temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temp_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'} temp_updater.table = Table("STATE_DB", "xtable") temp_updater.table._del = mock.MagicMock() @@ -427,7 +427,7 @@ def test_deinit(self): def test_deinit_exception(self): chassis = MockChassis() - temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temp_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'} temp_updater.table = Table("STATE_DB", "xtable") temp_updater.table._del = mock.MagicMock() @@ -451,7 +451,7 @@ def test_deinit_exception(self): def test_over_temper(self): chassis = MockChassis() chassis.make_over_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -465,7 +465,7 @@ def test_over_temper(self): def test_under_temper(self): chassis = MockChassis() chassis.make_under_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -482,7 +482,7 @@ def test_update_psu_thermals(self): mock_thermal = MockThermal() psu._thermal_list.append(mock_thermal) chassis._psu_list.append(psu) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -502,7 +502,7 @@ def test_update_sfp_thermals(self): mock_thermal = MockThermal() sfp._thermal_list.append(mock_thermal) chassis._sfp_list.append(sfp) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -523,7 +523,7 @@ def test_update_thermal_with_exception(self): thermal.make_over_temper() chassis.get_all_thermals().append(thermal) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 2 @@ -544,7 +544,7 @@ def test_update_module_thermals(self): chassis = MockChassis() chassis.make_module_thermal() chassis.set_modular_chassis(True) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert len(temperature_updater.all_thermals) == 3 @@ -559,21 +559,21 @@ def test_dpu_chassis_thermals(): # Modular chassis (Not a dpu chassis) No Change in TemperatureUpdater Behaviour chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table # DPU chassis TemperatureUpdater without is_smartswitch False return - No update to CHASSIS_STATE_DB chassis.set_modular_chassis(False) chassis.set_dpu(True) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert not temperature_updater.chassis_table # DPU chassis TemperatureUpdater without get_dpu_id implmenetation- No update to CHASSIS_STATE_DB chassis.set_smartswitch(True) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert not temperature_updater.chassis_table # DPU chassis TemperatureUpdater with get_dpu_id implemented - Update data to CHASSIS_STATE_DB dpu_id = 1 chassis.set_dpu_id(dpu_id) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table # Table name in chassis state db = TEMPERATURE_INFO_0 for dpu_id 0 assert temperature_updater.chassis_table.table_name == f"{TEMPER_INFO_TABLE_NAME}_{dpu_id}" @@ -588,7 +588,7 @@ def test_dpu_chassis_state_deinit(): chassis.set_modular_chassis(False) chassis.set_dpu(True) chassis.set_dpu_id(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table temperature_updater.table = Table("STATE_DB", "xtable") temperature_updater.phy_entity_table = None @@ -611,7 +611,7 @@ def test_updater_dpu_thermal_check_chassis_table(): chassis.set_dpu(True) chassis.set_smartswitch(True) chassis.set_dpu_id(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals() @@ -628,17 +628,17 @@ def test_updater_thermal_check_modular_chassis(): chassis = MockChassis() assert chassis.is_modular_chassis() == False - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table == None chassis.set_modular_chassis(True) chassis.set_my_slot(-1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table == None my_slot = 1 chassis.set_my_slot(my_slot) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) assert temperature_updater.chassis_table != None assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot)) @@ -651,7 +651,7 @@ def test_updater_thermal_check_chassis_table(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals() @@ -670,7 +670,7 @@ def test_updater_thermal_check_min_max(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) + temperature_updater = thermalctld.TemperatureUpdater(chassis, threading.Event()) temperature_updater.update() slot_dict = temperature_updater.chassis_table.get(thermal.get_name()) From 4d0d34957a49ecb680feec6eaeb9aaad37a1c7d1 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox Date: Fri, 31 Oct 2025 05:35:44 +0200 Subject: [PATCH 2/2] Fix UT issue --- sonic-thermalctld/tests/test_thermalctld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index 5273131bb..fbfe5ea09 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -813,7 +813,7 @@ def test_get_chassis_exception(self): # ThermalControlDaemon should raise SystemExit with CHASSIS_GET_ERROR code when chassis initialization fails with pytest.raises(SystemExit) as exc_info: - daemon_thermalctld = thermalctld.ThermalControlDaemon() + daemon_thermalctld = thermalctld.ThermalControlDaemon(5, 60, 30) # Verify it exits with the correct error code assert exc_info.value.code == thermalctld.CHASSIS_GET_ERROR @@ -836,7 +836,7 @@ def test_get_chassis_success(self): mock_platform_instance.get_chassis.return_value = mock_chassis mock_platform_class.return_value = mock_platform_instance - daemon_thermalctld = thermalctld.ThermalControlDaemon() + daemon_thermalctld = thermalctld.ThermalControlDaemon(5, 60, 30) # Verify chassis was set correctly assert daemon_thermalctld.chassis is mock_chassis