Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ecd68c9
Refactored for graceful shutdown
rameshraghupathy Aug 24, 2025
b022801
Refactored for graceful shutdown
rameshraghupathy Aug 25, 2025
ef2f282
Refactored for graceful shutdown, fixing UT
rameshraghupathy Aug 26, 2025
98c2146
Refactored for graceful shutdown, fixing UT
rameshraghupathy Sep 1, 2025
b959da9
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 8, 2025
5b2c0f6
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
b02d32e
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
fde2fd4
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
3aee661
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
983992d
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
ad81510
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
e74e00e
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
72f7eba
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
b920d0f
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
6d3bb42
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
468f2ee
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
7a28e87
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
36d0df1
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
32a4ee6
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
8f7e3ae
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
9f406f5
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
b0dafa2
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
e56983a
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
9f81fac
Addressed review comments after the refactoring
rameshraghupathy Sep 12, 2025
33c503e
Addressed review comments after the refactoring
rameshraghupathy Sep 12, 2025
08faf9d
Fixing ut
rameshraghupathy Sep 12, 2025
b612f80
Fixing ut
rameshraghupathy Sep 12, 2025
1987132
Fixing ut
rameshraghupathy Sep 12, 2025
3169b9d
Addressed review comments related to refactoring
rameshraghupathy Oct 2, 2025
500f3d3
Fixing test failures
rameshraghupathy Oct 2, 2025
8762717
Improving coverage
rameshraghupathy Oct 2, 2025
ddf58cc
Improving coverage
rameshraghupathy Oct 2, 2025
9223003
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Oct 2, 2025
8cd0c59
Improving coverage
rameshraghupathy Oct 2, 2025
75e0d47
Improving coverage
rameshraghupathy Oct 2, 2025
b424cb7
Improving coverage
rameshraghupathy Oct 2, 2025
663c496
Fixing test failures
rameshraghupathy Oct 2, 2025
8332565
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Oct 21, 2025
6148a5e
Aliging with the module_base.py changes such as common API, timezone,…
rameshraghupathy Oct 21, 2025
b44cd97
Merge branch 'graceful-shutdown' of https://github.com/rameshraghupat…
rameshraghupathy Oct 21, 2025
e1b2dc5
Fixing test failures
rameshraghupathy Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 77 additions & 68 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -226,12 +226,16 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
"""
super(SmartSwitchModuleConfigUpdater, self).__init__(log_identifier)
self.chassis = chassis
self._state_db_connector = swsscommon.SonicV2Connector(use_unix_socket_path=True)
self._state_db_connector.connect(self._state_db_connector.STATE_DB)

def deinit(self):
"""
Destructor of SmartSwitchModuleConfigUpdater
:return:
"""
if self._state_db_connector:
self._state_db_connector.close()

def module_config_update(self, key, admin_state):
if not key.startswith(ModuleBase.MODULE_TYPE_DPU):
Expand All @@ -254,16 +258,37 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
self.log_warning("Invalid admin_state value: {}".format(admin_state))

def submit_callback(self, module_index, admin_state, key):
module = self.chassis.get_module(module_index)

if admin_state == MODULE_ADMIN_DOWN:
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_pre_shutdown, default=False)
try_get(self.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
if admin_state == MODULE_ADMIN_UP:
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_post_startup, default=False)
pass
# Pre-shutdown (if implemented), then mark shutdown transition and drive admin down
try_get(module.module_pre_shutdown, default=False)
try:
result = module.set_module_state_transition(self._state_db_connector, key, "shutdown")
if not result:
self.log_warning(f"Failed to set shutdown transition for {key}")
except Exception as e:
self.log_error(f"Failed to set shutdown transition for {key}: {e}")
result = try_get(module.set_admin_state, admin_state, default=False)
if not result:
self.log_error(f"Failed to set admin state for {key}")

elif admin_state == MODULE_ADMIN_UP:
# Mark startup transition before bring-up
try:
result = module.set_module_state_transition(self._state_db_connector, key, "startup")
if not result:
self.log_warning(f"Failed to set startup transition for {key}")
except Exception as e:
self.log_error(f"Failed to set startup transition for {key}: {e}")
result = try_get(module.set_admin_state, admin_state, default=False)
if not result:
self.log_error(f"Failed to set admin state for {key}")
# Optional post-startup hook (if implemented)
try_get(module.module_post_startup, default=False)

else:
self.log_warning(f"Invalid admin_state value: {admin_state}")

#
# Module Updater ==============================================================
Expand Down Expand Up @@ -723,7 +748,10 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None
self.module_transition_flag_helper = ModuleTransitionFlagHelper()

# Centralized transition API: reuse one STATE_DB connector
self._state_v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
self._state_v2.connect(self._state_v2.STATE_DB)

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
Expand Down Expand Up @@ -812,12 +840,8 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
if prev_status != ModuleBase.MODULE_STATUS_EMPTY and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE):
self.log_notice("{} operational status transitioning to offline".format(key))

# Persist dpu down time
# Persist dpu down time & capture cause
self.persist_dpu_reboot_time(key)
# persist reboot cause
# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)

reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
self.persist_dpu_reboot_cause(reboot_cause, key)
# publish reboot cause to db
Expand Down Expand Up @@ -852,9 +876,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)

# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)

def _get_module_info(self, module_index):
"""
Retrieves module info of this module
Expand Down Expand Up @@ -1336,33 +1357,6 @@ class DpuStateUpdater(logger.Logger):
self._update_dp_dpu_state('down')
self._update_cp_dpu_state('down')

class ModuleTransitionFlagHelper(logger.Logger):
def __init__(self, log_identifier = SYSLOG_IDENTIFIER):
super(ModuleTransitionFlagHelper, self).__init__(log_identifier)
# Use new connector to avoid redis failures
"""Create a helper function to get the module table,
since multiple threads updating with the same connector will cause redis failures"""
state_db = daemon_base.db_connect("STATE_DB")
self.module_table = swsscommon.Table(state_db, CHASSIS_MODULE_INFO_TABLE)

def set_transition_flag(self, module_name):
try:
self.module_table.hset(module_name, 'state_transition_in_progress', 'True')
self.module_table.hset(module_name, 'transition_start_time', datetime.now(timezone.utc).replace(tzinfo=None).isoformat())
except Exception as e:
self.log_error(f"Error setting transition flag for {module_name}: {e}")

def clear_transition_flag(self, module_name):
try:
self.log_info(f"Clearing transition flag for {module_name}")
self.module_table.hdel(module_name, 'state_transition_in_progress')
self.module_table.hdel(module_name, 'transition_start_time')
except Exception as e:
self.log_error(f"Error clearing transition flag for {module_name}: {e}")

def clear_all_transition_flags(self):
for module_name in self.module_table.getKeys():
self.clear_transition_flag(module_name)

#
# Daemon =======================================================================
Expand All @@ -1380,6 +1374,8 @@ class ChassisdDaemon(daemon_base.DaemonBase):
self.stop = threading.Event()

self.platform_chassis = chassis
self._state_db_connector = swsscommon.SonicV2Connector(use_unix_socket_path=True)
self._state_db_connector.connect(self._state_db_connector.STATE_DB)

for signum in self.FATAL_SIGNALS + self.NONFATAL_SIGNALS:
try:
Expand All @@ -1401,22 +1397,45 @@ class ChassisdDaemon(daemon_base.DaemonBase):
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))

def submit_dpu_callback(self, module_index, admin_state, module_name):
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
# Set admin_state change in progress using the centralized method
"""
Preserve original behavior:
- Always attempt module_pre_shutdown (no-op if not implemented)
- If admin_state == MODULE_ADMIN_DOWN:
* mark transition (shutdown) via ModuleBase centralized API
* then call set_admin_state(down)
- For any other admin_state (e.g., MODULE_PRE_SHUTDOWN), only run module_pre_shutdown
and do nothing else.
"""
module = self.module_updater.chassis.get_module(module_index)

# Always attempt pre-shutdown
try_get(module.module_pre_shutdown, default=False)

if admin_state == MODULE_ADMIN_DOWN:
ModuleTransitionFlagHelper().set_transition_flag(module_name)
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
# Use centralized ModuleBase API to mark shutdown transition
try:
result = module.set_module_state_transition(self._state_db_connector, module_name, "shutdown")
if not result:
self.log_warning(f"Failed to set transition flag (shutdown) for {module_name}")
except Exception as e:
self.log_error(f"Failed to set transition flag (shutdown) for {module_name}: {e}")

# Drive admin down using graceful shutdown if available
result = try_get(module.set_admin_state, admin_state, default=False)
if not result:
self.log_error(f"Failed to set admin state for {module_name}")
else:
pass

def set_initial_dpu_admin_state(self):
"""Send admin_state trigger once to modules those are powered up"""
"""Send admin_state trigger once to modules those are powered up,
and mark centralized 'startup' for DPUs intended UP but not ONLINE."""
threads = []
for module_index in range(0, self.module_updater.num_modules):
op = None
# Get operational state of DPU
module_name = self.platform_chassis.get_module(module_index).get_name()
operational_state = self.platform_chassis.get_module(module_index).get_oper_status()
module = self.platform_chassis.get_module(module_index)
module_name = module.get_name()
operational_state = module.get_oper_status()

try:
# Get admin state of DPU
Expand All @@ -1429,10 +1448,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Initialize DPU_STATE DB table on bootup
dpu_state_key = "DPU_STATE|" + module_name
if operational_state == ModuleBase.MODULE_STATUS_ONLINE:
op_state = 'up'
else:
op_state = 'down'
op_state = 'up' if operational_state == ModuleBase.MODULE_STATUS_ONLINE else 'down'
self.module_updater.update_dpu_state(dpu_state_key, op_state)

if op is not None:
Expand Down Expand Up @@ -1486,16 +1502,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Set the initial DPU admin state for SmartSwitch
if self.smartswitch:
# Clear all stale transition flags for SmartSwitch on startup
ModuleTransitionFlagHelper().clear_all_transition_flags()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper = ModuleTransitionFlagHelper()
# Clear all stale transition flags for SmartSwitch on startup
module_transition_flag_helper.clear_all_transition_flags()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper.clear_all_transition_flags()

while not self.stop.wait(CHASSIS_INFO_UPDATE_PERIOD_SECS):
self.module_updater.module_db_update()
Expand All @@ -1509,6 +1516,8 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Delete all the information from DB and then exit
self.module_updater.deinit()
if self._state_db_connector:
self._state_db_connector.close()

self.log_info("Shutting down...")

Expand Down
62 changes: 62 additions & 0 deletions sonic-chassisd/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# tests/conftest.py
import os
import sys
from imp import load_source

# 1) Ensure the daemon script is importable and loaded exactly once
_REPO_ROOT = os.path.dirname(os.path.dirname(__file__))
_SCRIPTS_DIR = os.path.join(_REPO_ROOT, "scripts")
_base = os.path.join(_SCRIPTS_DIR, "chassisd")
_chassisd_path = _base if os.path.exists(_base) else _base + ".py"

# Keep environment light for unit tests (many tests set this too; setting early helps)
os.environ.setdefault("CHASSISD_UNIT_TESTING", "1")

if "chassisd" not in sys.modules:
load_source("chassisd", _chassisd_path)

# 2) Provide a very small, memory-light stub for SonicV2Connector used by chassisd
# (Prevents lots of real redis connectors from being created across tests.)
class _FakeRedis:
__slots__ = ("_h",) # keep memory footprint tiny
def __init__(self): self._h = {}
def hgetall(self, key): return dict(self._h.get(key, {}))
def hset(self, key, *args, **kwargs):
if "mapping" in kwargs:
self._h.setdefault(key, {}).update(kwargs["mapping"])
return 1
if len(args) == 2:
field, value = args
self._h.setdefault(key, {})[field] = value
return 1
return 0

class _DummyV2:
# match what production code uses
STATE_DB = 6
CHASSIS_STATE_DB = 15 # harmless constant if referenced
def __init__(self, *a, **k): self._client = _FakeRedis()
def connect(self, _dbid): return None
def close(self): return None
def get_redis_client(self, _dbid): return self._client

# 3) Patch both the module-under-test’s swsscommon and the global swsscommon, if present
chassisd = sys.modules["chassisd"]

try:
import swsscommon as _sc
except Exception:
_sc = None

# Patch the symbol used by chassisd
if hasattr(chassisd, "swsscommon"):
chassisd.swsscommon.SonicV2Connector = _DummyV2
# ensure constants exist if code references them
if not hasattr(chassisd.swsscommon.SonicV2Connector, "STATE_DB"):
chassisd.swsscommon.SonicV2Connector.STATE_DB = 6

# Also patch the top-level swsscommon for tests that import it directly
if _sc is not None:
_sc.SonicV2Connector = _DummyV2
if not hasattr(_sc.SonicV2Connector, "STATE_DB"):
_sc.SonicV2Connector.STATE_DB = 6
8 changes: 8 additions & 0 deletions sonic-chassisd/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ def set_model(self, model):
def set_presence(self, presence):
self.presence = presence

def set_admin_state_using_graceful_shutdown(self, admin_state):
"""Mock implementation of graceful shutdown method"""
return self.set_admin_state(admin_state)

def set_module_state_transition(self, v2_connector, module_name, transition_type):
"""Mock implementation of state transition method"""
return True

class MockChassis:
def __init__(self):
self.module_list = []
Expand Down
Loading
Loading