From c68a1a686f2d2d2ec2cc79a4b3a2521d15ac2333 Mon Sep 17 00:00:00 2001
From: Arvindsrinivasan Lakshmi Narasimhan
Date: Mon, 6 Oct 2025 17:10:28 +0000
Subject: [PATCH 1/2] add new lag id checker for voq chassis

Signed-off-by: Arvindsrinivasan Lakshmi Narasimhan
---
 scripts/chassis_lag_id_checker | 172 +++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 scripts/chassis_lag_id_checker

diff --git a/scripts/chassis_lag_id_checker b/scripts/chassis_lag_id_checker
new file mode 100644
index 0000000000..31daa72c48
--- /dev/null
+++ b/scripts/chassis_lag_id_checker
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+
+"""
+chassis_lag_id_checker
+
+This script checks for synchronization of LAG (Link Aggregation Group) IDs
+between the chassis_db and asic_db on VOQ chassis Linecard.
+This script is intended to be run by Monit.
+It will write an alerting message into syslog if it finds any mismatches in
+LAG IDs between the chassis_db and asic_db.
+
+It performs the following steps:
+- Retrieves LAG IDs from the ASIC DBs (per namespace).
+- Retrieves the SYSTEM_LAG_ID_TABLE from the chassis DB.
+- Compares the LAG IDs in the chassis DB and ASIC DBs to identify mismatches.
+- Reports any mismatched LAG keys per ASIC namespace.
+- Exits with status 1 on mismatches, or 2 if SYSTEM_LAG_ID_TABLE is missing/empty.
+
+Intended to be run on line cards (not on the supervisor) of a VOQ chassis
+device.
+Usage:
+    python3 chassis_lag_id_checker [--log-level LEVEL]
+
+Arguments:
+    --log-level LEVEL   Set the logging level (DEBUG, INFO, WARNING, ERROR,
+                        CRITICAL). Default is WARNING.
+
+"""
+
+import subprocess
+import json
+import logging
+import sys
+import argparse
+import sonic_py_common.multi_asic as multi_asic
+import sonic_py_common.device_info as device_info
+from utilities_common import cli
+
+
+def run_redis_dump(cmd_args):
+    """Run redis-dump with given command arguments and return parsed JSON output."""
+    try:
+        #result = subprocess.run(cmd_args, capture_output=True, text=True)
+        result = cli.run_command(cmd_args)
+        if result.returncode != 0:
+            logging.error(f"Command failed: {result.stderr}")
+            raise RuntimeError(f"Command failed: {result.stderr}")
+        return json.loads(result.stdout)
+    except Exception as e:
+        logging.error(f"Error running redis-dump: {e}")
+        return {}
+
+
+def extract_lag_ids_from_asic_db(db_output, key_pattern, lag_id_field):
+    """Extract LAG IDs from redis-dump output based on key pattern and field name."""
+    lag_ids = set()
+    for key, info in db_output.items():
+        if key_pattern in key:
+            lag_id = info.get('value', {}).get(lag_id_field, None)
+            if lag_id is None:
+                logging.error(f"{key} has bad lag_id")
+                # Skip the entry so None never ends up in the returned ID set.
+                continue
+            lag_ids.add(lag_id)
+    return lag_ids
+
+
+def extract_table_ids_from_chassis_db(table_output):
+    """Extract IDs from a table output (dict of key: id)."""
+    return set(table_output.values())
+
+
+def get_lag_ids_asic_namespace(asic_netns):
+    """Get LAG IDs from a specific ASIC namespace."""
+    if asic_netns == multi_asic.DEFAULT_NAMESPACE:
+        asic_cmd = ["redis-dump", "-d", "1", "-k", "*SAI_OBJECT_TYPE_LAG:*", "-y"]
+    else:
+        asic_cmd = [
+            "sudo", "ip", "netns", "exec", asic_netns,
+            "redis-dump", "-d", "1", "-k", "*SAI_OBJECT_TYPE_LAG:*", "-y"
+        ]
+    # NOTE(review): run_redis_dump returns {} on failure, so a failed dump looks
+    # like an ASIC with no LAGs and every chassis_db LAG is reported as a mismatch.
+    asic_db_output = run_redis_dump(asic_cmd)
+    return extract_lag_ids_from_asic_db(
+        asic_db_output, "SAI_OBJECT_TYPE_LAG", "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID"
+    )
+
+
+def get_chassis_lag_db_table():
+    """Fetch and return the SYSTEM_LAG_ID_TABLE from chassis_db.
+
+    Exits the process with status 2 if the table is missing or empty.
+    """
+    chassis_db_cmd = [
+        "redis-dump",
+        "-H", "redis_chassis.server",
+        "-p", "6380",
+        "-d", "12",
+        "-k", "SYSTEM_LAG_ID_TABLE",
+        "-y"
+    ]
+    chassis_db_raw = run_redis_dump(chassis_db_cmd)
+    chassis_db_table = chassis_db_raw.get('SYSTEM_LAG_ID_TABLE', {}).get('value', {})
+    if not chassis_db_table:
+        logging.error("No SYSTEM_LAG_ID_TABLE found in chassis_db")
+        sys.exit(2)
+    return chassis_db_table
+
+
+def get_lag_key_mismatches(chassis_db_table, diff):
+    """Return list of keys from chassis_db_table whose values are in diff."""
+    return [key for key, value in chassis_db_table.items() if value in diff]
+
+
+def compare_lag_ids(lag_ids_in_chassis_db, asic):
+    """Return the chassis_db LAG IDs that are absent from the given ASIC's asic_db."""
+    return lag_ids_in_chassis_db - get_lag_ids_asic_namespace(asic)
+
+
+def check_lag_id_sync():
+    """Check if LAG IDs in chassis_db and asic_db are in sync.
+
+    Returns a dict mapping each ASIC name ("localhost" for the default
+    namespace) to the list of chassis_db LAG keys missing from that ASIC.
+    """
+    chassis_db_lag_table = get_chassis_lag_db_table()
+    lag_ids_in_chassis_db = extract_table_ids_from_chassis_db(chassis_db_lag_table)
+
+    asic_namespaces = multi_asic.get_namespace_list()
+    diff_summary = {}
+    for asic_namespace in asic_namespaces:
+        diff = compare_lag_ids(lag_ids_in_chassis_db, asic_namespace)
+        asic_name = "localhost" if asic_namespace == multi_asic.DEFAULT_NAMESPACE else asic_namespace
+        diff_summary[asic_name] = get_lag_key_mismatches(chassis_db_lag_table, diff)
+
+    return diff_summary
+
+
+def main():
+    """Parse arguments, skip non-VOQ and supervisor devices, and exit 1 on any mismatch."""
+    parser = argparse.ArgumentParser(description="Check LAG ID sync between chassis_db and asic_db")
+    parser.add_argument('--log-level', default='WARNING', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+                        help='Set the logging level')
+    args = parser.parse_args()
+    logging.basicConfig(level=getattr(logging, args.log_level))
+
+    if not device_info.is_voq_chassis():
+        logging.info("Not a voq chassis device. Exiting.....")
+        return
+
+    if device_info.is_supervisor():
+        logging.info("Not supported on supervisor. Exiting....")
+        return
+
+    diff_summary = check_lag_id_sync()
+    mismatches_found = False
+    for asic, mismatches in diff_summary.items():
+        if mismatches:
+            logging.critical(f"Mismatched LAG keys in {asic}: {mismatches}")
+            mismatches_found = True
+
+    if mismatches_found:
+        logging.critical("Summary of mismatches:\n%s", json.dumps(diff_summary, indent=4))
+        sys.exit(1)
+    else:
+        logging.info("All ASICs are in sync with chassis_db")
+
+
+if __name__ == "__main__":
+    main()
+    sys.exit(0)
From b3638b423e5f192880e0c7bf6df79868bbc83f38 Mon Sep 17 00:00:00 2001
From: Arvindsrinivasan Lakshmi Narasimhan
Date: Tue, 7 Oct 2025 00:26:23 +0000
Subject: [PATCH 2/2] update run command

Signed-off-by: Arvindsrinivasan Lakshmi Narasimhan
---
 scripts/chassis_lag_id_checker | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/chassis_lag_id_checker b/scripts/chassis_lag_id_checker
index 31daa72c48..d15ff9134a 100644
--- a/scripts/chassis_lag_id_checker
+++ b/scripts/chassis_lag_id_checker
@@ -34,14 +34,12 @@ import sys
 import argparse
 import sonic_py_common.multi_asic as multi_asic
 import sonic_py_common.device_info as device_info
-from utilities_common import cli
 
 
 def run_redis_dump(cmd_args):
     """Run redis-dump with given command arguments and return parsed JSON output."""
     try:
-        #result = subprocess.run(cmd_args, capture_output=True, text=True)
-        result = cli.run_command(cmd_args)
+        result = subprocess.run(cmd_args, capture_output=True, text=True)
         if result.returncode != 0:
             logging.error(f"Command failed: {result.stderr}")
             raise RuntimeError(f"Command failed: {result.stderr}")