Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

from enum import Enum, auto
import os
import signal
import sys
import threading
Expand All @@ -31,6 +32,8 @@ INVALID_SLOT_OR_DPU = -1

ERR_UNKNOWN = 1

PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"

# Thermal control daemon is designed to never exit, it must always
# return non-zero exit code when exiting and so that supervisord will
# restart it automatically.
Expand Down Expand Up @@ -742,23 +745,27 @@ class TemperatureUpdater(logger.Logger):


class ThermalMonitor(ProcessTaskBase):
# Initial update interval
INITIAL_INTERVAL = 5

# Update interval value
UPDATE_INTERVAL = 60

# Update elapse threshold. If update used time is larger than the value, generate a warning log.
UPDATE_ELAPSED_THRESHOLD = 30

def __init__(self, chassis):
def __init__(
self,
chassis,
initial_interval=5,
update_interval=60,
update_elapsed_threshold=30,
):
"""
Initializer for ThermalMonitor
:param chassis: Object representing a platform chassis
"""
super(ThermalMonitor, self).__init__()

self.wait_time = self.INITIAL_INTERVAL
# Initial update interval
self.initial_interval = initial_interval
# Update interval value
self.update_interval = update_interval
# Update elapse threshold. If update used time is larger than the value, generate a warning log.
self.update_elapsed_threshold = update_elapsed_threshold

self.wait_time = self.initial_interval

# TODO: Refactor to eliminate the need for this Logger instance
self.logger = logger.Logger(SYSLOG_IDENTIFIER)
Expand All @@ -774,12 +781,12 @@ class ThermalMonitor(ProcessTaskBase):
self.fan_updater.update()
self.temperature_updater.update()
elapsed = time.time() - begin
if elapsed < self.UPDATE_INTERVAL:
self.wait_time = self.UPDATE_INTERVAL - elapsed
if elapsed < self.update_interval:
self.wait_time = self.update_interval - elapsed
else:
self.wait_time = self.INITIAL_INTERVAL
self.wait_time = self.initial_interval

if elapsed > self.UPDATE_ELAPSED_THRESHOLD:
if elapsed > self.update_elapsed_threshold:
self.logger.log_warning('Update fan and temperature status took {} seconds, '
'there might be performance risk'.format(elapsed))

Expand Down Expand Up @@ -808,9 +815,11 @@ class ThermalControlDaemon(daemon_base.DaemonBase):

POLICY_FILE = '/usr/share/sonic/platform/thermal_policy.json'

def __init__(self):
def __init__(self, platform_env_conf_file=PLATFORM_ENV_CONF_FILE):
"""
Initializer of ThermalControlDaemon
:param platform_env_conf_file: Filepath to platform env config.
Used for dependency injection in unit tests.
"""
super(ThermalControlDaemon, self).__init__(SYSLOG_IDENTIFIER)

Expand All @@ -823,7 +832,24 @@ class ThermalControlDaemon(daemon_base.DaemonBase):

self.chassis = sonic_platform.platform.Platform().get_chassis()

self.thermal_monitor = ThermalMonitor(self.chassis)
# Replace ThermalMonitor timing-related config with values
# from platform_env.conf if defined.
thermal_monitor_extra_args = {}
if os.path.isfile(platform_env_conf_file):
with open(platform_env_conf_file, 'r') as file:
for line in file:
if not line or '=' not in line:
continue
k, v = line.split('=', 1)
k = k.strip()
v = v.strip()
if k == 'THERMALCTLD_THERMAL_MONITOR_INITIAL_INTERVAL':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can these be wrapped in a try/except? Not sure if you want these to fallback to defaults or take the daemon down on an incorrect config file.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for raising this up! I think it's better to fallback to the default values and let the daemon continue.

I added try/except and updated the unit tests as well. Thanks, Gregory!

thermal_monitor_extra_args['initial_interval'] = int(v)
elif k == 'THERMALCTLD_THERMAL_MONITOR_UPDATE_INTERVAL':
thermal_monitor_extra_args['update_interval'] = int(v)
elif k == 'THERMALCTLD_THERMAL_MONITOR_UPDATE_ELAPSED_THRESHOLD':
thermal_monitor_extra_args['update_elapsed_threshold'] = int(v)
self.thermal_monitor = ThermalMonitor(self.chassis, **thermal_monitor_extra_args)
self.thermal_monitor.task_run()

self.thermal_manager = None
Expand Down
100 changes: 100 additions & 0 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import mock

import pytest
import tempfile
tests_path = os.path.dirname(os.path.abspath(__file__))

# Add mocked_libs path so that the file under test can load mocked modules from there
Expand Down Expand Up @@ -302,6 +303,60 @@ def test_main(self):
assert thermal_monitor.fan_updater.update.call_count == 1
assert thermal_monitor.temperature_updater.update.call_count == 1

def test_init_with_default_timing_config(self):
mock_chassis = MockChassis()
thermal_monitor = thermalctld.ThermalMonitor(mock_chassis)
assert thermal_monitor.initial_interval == 5
assert thermal_monitor.update_interval == 60
assert thermal_monitor.update_elapsed_threshold == 30

def test_init_with_custom_config(self):
mock_chassis = MockChassis()
thermal_monitor = thermalctld.ThermalMonitor(
mock_chassis,
initial_interval=5,
update_interval=10,
update_elapsed_threshold=15,
)
assert thermal_monitor.initial_interval == 5
assert thermal_monitor.update_interval == 10
assert thermal_monitor.update_elapsed_threshold == 15

@mock.patch('thermalctld.time.time')
def test_main_with_custom_timing_config(self, mock_time):
# Given
mock_chassis = MockChassis()
thermal_monitor = thermalctld.ThermalMonitor(
mock_chassis,
initial_interval=5,
update_interval=10,
update_elapsed_threshold=15,
)
thermal_monitor.logger.log_warning = mock.MagicMock()
# First tick should match initial_interval
assert thermal_monitor.wait_time == 5

# When - mock elapsed time (begin=0, end=7)
mock_time.side_effect = [0, 7]
thermal_monitor.main()
# Then - next tick should be what is left from update_interval: 10 - 7 = 3
assert thermal_monitor.wait_time == 3

# When - mock elapsed time larger than update_interval
mock_time.side_effect = [10, 21]
thermal_monitor.main()
# Then - next tick should become initial_interval
assert thermal_monitor.wait_time == 5

# When - mock elapsed time larger than update_elapsed_threshold
mock_time.side_effect = [26, 42]
thermal_monitor.main()
# Then - a warning should be logged
assert thermal_monitor.logger.log_warning.call_count == 1
thermal_monitor.logger.log_warning.assert_called_with(
'Update fan and temperature status took 16 seconds, there might be performance risk'
)


def test_insufficient_fan_number():
fan_status1 = thermalctld.FanStatus()
Expand Down Expand Up @@ -759,6 +814,51 @@ def test_daemon_run():
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert ret is True

def _create_platform_env_conf_file(content):
"""
Creates a platform_env.conf, under a temporary directory, with the given content
:param content: Content to write to the file
:return: Path to the created file
"""
root = tempfile.mkdtemp()
filepath = os.path.join(root, 'platform_env.conf')
with open(filepath, 'w+') as f:
f.write(content)
return filepath


def test_daemon_init_with_default_thermal_monitor_config():
filepath = _create_platform_env_conf_file("")
daemon_thermalctld = thermalctld.ThermalControlDaemon(
platform_env_conf_file=filepath
)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert (
daemon_thermalctld.thermal_monitor.initial_interval
== thermalctld.ThermalMonitor.INITIAL_INTERVAL
)
assert (
daemon_thermalctld.thermal_monitor.update_interval
== thermalctld.ThermalMonitor.UPDATE_INTERVAL
)
assert (
daemon_thermalctld.thermal_monitor.update_elapsed_threshold
== thermalctld.ThermalMonitor.UPDATE_ELAPSED_THRESHOLD
)


def test_daemon_init_with_custom_thermal_monitor_config():
filepath = _create_platform_env_conf_file("""\
THERMALCTLD_THERMAL_MONITOR_INITIAL_INTERVAL=5
THERMALCTLD_THERMAL_MONITOR_UPDATE_INTERVAL=10
THERMALCTLD_THERMAL_MONITOR_UPDATE_ELAPSED_THRESHOLD=15
""")
daemon_thermalctld = thermalctld.ThermalControlDaemon(platform_env_conf_file=filepath)
daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert
assert daemon_thermalctld.thermal_monitor.initial_interval == 5
assert daemon_thermalctld.thermal_monitor.update_interval == 10
assert daemon_thermalctld.thermal_monitor.update_elapsed_threshold == 15


def test_try_get():
def good_callback():
Expand Down
Loading