Skip to content

Commit 65b75f9

Browse files
Sureshkumar-sithaianssithaia-ebay
authored and committed
22454:Support for syslog alert on temperature change
1 parent a86dd89 commit 65b75f9

File tree

2 files changed

+123
-0
lines changed

2 files changed

+123
-0
lines changed

sonic-xcvrd/tests/test_xcvrd.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,57 @@ def test_update_port_transceiver_status_table_hw(self):
775775
dom_info_update.update_port_transceiver_status_table_hw(logical_port_name, port_mapping, status_tbl, stop_event)
776776
assert status_tbl.get_size_for_key(logical_port_name) == 5
777777

778+
@patch('xcvrd.xcvrd.get_physical_port_name_dict', MagicMock(return_value={0: 'Ethernet0'}))
@patch('xcvrd.xcvrd._wrapper_get_presence', MagicMock(return_value=True))
def test_update_transceiver_temperature_status(self):
    """Check the status code recorded for each temperature/threshold relation.

    The port lookup and presence check are patched so that logical port
    'Ethernet0' maps to physical port 0 and is always reported present.
    Each case starts from an empty ``temperature_status`` dict so the result
    depends only on the temperature being classified.
    """
    # Status codes as defined inside update_transceiver_temperature_status().
    TEMP_NORMAL = 0
    TEMP_HIGH_ALARM = 1
    TEMP_LOW_ALARM = 2
    TEMP_HIGH_WARNING = 3
    TEMP_LOW_WARNING = 4

    port_mapping = PortMapping()
    stop_event = threading.Event()
    mock_cmis_manager = MagicMock()
    mock_sfp_obj_dict = MagicMock()
    task = DomInfoUpdateTask(DEFAULT_NAMESPACE, port_mapping, mock_sfp_obj_dict, stop_event, mock_cmis_manager, helper_logger)
    logical_port_name = 'Ethernet0'
    physical_port = 0

    # The thresholds are the same for every case; only the reading changes.
    dom_th_info_cache = {
        physical_port: {
            'temphighalarm': '90',
            'templowalarm': '10',
            'temphighwarning': '80',
            'templowwarning': '20',
        }
    }

    cases = [
        ('95', TEMP_HIGH_ALARM),    # above the high alarm threshold
        ('0', TEMP_LOW_ALARM),      # below the low alarm threshold
        ('85', TEMP_HIGH_WARNING),  # between high warning and high alarm
        ('15', TEMP_LOW_WARNING),   # between low alarm and low warning
        ('50', TEMP_NORMAL),        # inside the normal band
        ('N/A', TEMP_NORMAL),       # reading unavailable: no alert raised
    ]

    for temperature, expected_status in cases:
        dom_info_cache = {physical_port: {'temperature': temperature}}
        temperature_status = {}

        task.update_transceiver_temperature_status(logical_port_name, port_mapping, dom_info_cache, dom_th_info_cache, temperature_status)

        assert temperature_status[physical_port] == expected_status, \
            "temperature {} expected status {}, got {}".format(
                temperature, expected_status, temperature_status[physical_port])
828+
778829
@patch('xcvrd.xcvrd.get_physical_port_name_dict', MagicMock(return_value={0: 'Ethernet0'}))
779830
def test_delete_port_from_status_table_hw(self):
780831
logical_port_name = "Ethernet0"

sonic-xcvrd/xcvrd/dom/dom_mgr.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __init__(self, namespaces, port_mapping, sfp_obj_dict, main_thread_stop_even
3434
threading.Thread.__init__(self)
3535
self.name = "DomInfoUpdateTask"
3636
self.exc = None
37+
self.dom_th_info_cache = {}
3738
self.task_stopping_event = threading.Event()
3839
self.main_thread_stop_event = main_thread_stop_event
3940
self.helper_logger = helper_logger
@@ -265,21 +266,25 @@ def task_worker(self):
265266
self.log_notice("Start DOM monitoring loop")
266267
firmware_info_cache = {}
267268
dom_info_cache = {}
269+
dom_th_info_cache = {}
268270
transceiver_status_cache = {}
269271
vdm_real_value_cache = {}
270272
vdm_flag_cache = {}
271273
pm_info_cache = {}
274+
temperature_status = {}
272275
sel, asic_context = port_event_helper.subscribe_port_config_change(self.namespaces)
273276

274277
# Start loop to update dom info in DB periodically
275278
while not self.task_stopping_event.wait(self.DOM_INFO_UPDATE_PERIOD_SECS):
276279
# Clear the cache at the begin of the loop to make sure it will be clear each time
277280
firmware_info_cache.clear()
278281
dom_info_cache.clear()
282+
dom_th_info_cache.clear()
279283
transceiver_status_cache.clear()
280284
vdm_real_value_cache.clear()
281285
vdm_flag_cache.clear()
282286
pm_info_cache.clear()
287+
temperature_status.clear()
283288

284289
# Handle port change event from main thread
285290
port_event_helper.handle_port_config_change(sel, asic_context, self.task_stopping_event, self.port_mapping, self.helper_logger, self.on_port_config_change)
@@ -310,6 +315,7 @@ def task_worker(self):
310315

311316
try:
312317
self.post_port_sfp_firmware_info_to_db(logical_port_name, self.port_mapping, self.xcvr_table_helper.get_firmware_info_tbl(asic_index), self.task_stopping_event, firmware_info_cache=firmware_info_cache)
318+
xcvrd.post_port_dom_threshold_info_to_db(logical_port_name, self.port_mapping, self.xcvr_table_helper.get_dom_threshold_tbl(asic_index), self.task_stopping_event, dom_th_info_cache=self.dom_th_info_cache)
313319
except (KeyError, TypeError) as e:
314320
# Continue to process the next port, since the exception could be raised due to port reset or transceiver removal
315321
self.log_warning("Got exception {} while processing firmware info for port {}, ignored".format(repr(e), logical_port_name))
@@ -326,6 +332,12 @@ def task_worker(self):
326332
self.xcvr_table_helper.get_status_tbl(asic_index),
327333
self.task_stopping_event,
328334
transceiver_status_cache=transceiver_status_cache)
335+
336+
self.update_transceiver_temperature_status(logical_port_name,
337+
self.port_mapping,
338+
dom_info_cache,
339+
self.dom_th_info_cache,
340+
temperature_status)
329341
except (KeyError, TypeError) as e:
330342
# Continue to process the next port, since the exception could be raised due to port reset or transceiver removal
331343
self.log_warning("Got exception {} while processing transceiver status hw for port {}, ignored".format(repr(e), logical_port_name))
@@ -360,6 +372,66 @@ def task_worker(self):
360372

361373
self.log_notice("Stop DOM monitoring loop")
362374

375+
def update_transceiver_temperature_status(self, logical_port_name, port_mapping, dom_info_cache, dom_th_info_cache, temperature_status):
    """Classify each transceiver's temperature and syslog any alert.

    For every physical port behind ``logical_port_name`` that is present, the
    cached temperature is compared against the cached alarm/warning
    thresholds and the resulting status code is stored in
    ``temperature_status``. A notice is logged when the status differs from
    the previously stored one, or when a non-normal status persists.

    Args:
        logical_port_name: Logical port whose physical ports are inspected.
        port_mapping: Port mapping passed to the physical-port lookup.
        dom_info_cache: Dict keyed by physical port; each value is a dict
            read for its "temperature" entry.
        dom_th_info_cache: Dict keyed by physical port; each value is a dict
            read for "temphighalarm", "templowalarm", "temphighwarning" and
            "templowwarning".
        temperature_status: Dict keyed by physical port holding the last
            status code (0 normal, 1 high alarm, 2 low alarm, 3 high warning,
            4 low warning). Updated in place.

    Returns:
        None.

    NOTE(review): the visible task_worker hunk clears ``temperature_status``
    at the start of every polling cycle, so the "previous" status seen here
    is always normal -- confirm whether it is meant to persist across cycles.
    """
    TEMP_NORMAL = 0
    TEMP_HIGH_ALARM = 1
    TEMP_LOW_ALARM = 2
    TEMP_HIGH_WARNING = 3
    TEMP_LOW_WARNING = 4

    TEMP_ERROR_TO_DESCRIPTION_DICT = {
        TEMP_NORMAL: "normal",
        TEMP_HIGH_ALARM: "temperature high alarm",
        TEMP_LOW_ALARM: "temperature low alarm",
        TEMP_HIGH_WARNING: "temperature high warning",
        TEMP_LOW_WARNING: "temperature low warning",
    }

    def _to_float(value):
        # A missing key (None), the 'N/A' placeholder, or any other
        # non-numeric text means the reading is unavailable.
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    for physical_port, physical_port_name in xcvrd.get_physical_port_name_dict(logical_port_name, port_mapping).items():
        if self.task_stopping_event.is_set():
            break

        if not xcvrd._wrapper_get_presence(physical_port):
            continue

        # First sighting of a port: record it as normal so that a later
        # change can be reported against a known baseline.
        ori_temp_status = temperature_status.get(physical_port)
        if ori_temp_status is None:
            ori_temp_status = TEMP_NORMAL
            temperature_status[physical_port] = ori_temp_status
        new_temp_status = TEMP_NORMAL

        dom_info_dict = dom_info_cache.get(physical_port)
        dom_th_info_dict = dom_th_info_cache.get(physical_port)
        if dom_info_dict is not None and dom_th_info_dict is not None:
            # Convert each value once; None marks an unavailable reading.
            temperature = _to_float(dom_info_dict.get("temperature"))
            temphighalarm = _to_float(dom_th_info_dict.get("temphighalarm"))
            templowalarm = _to_float(dom_th_info_dict.get("templowalarm"))
            temphighwarning = _to_float(dom_th_info_dict.get("temphighwarning"))
            templowwarning = _to_float(dom_th_info_dict.get("templowwarning"))

            readings = (temperature, temphighalarm, templowalarm, temphighwarning, templowwarning)
            # Classify only when the temperature and all four thresholds are
            # usable; otherwise the status stays normal, as it did for 'N/A'.
            if all(value is not None for value in readings):
                # Alarms are checked before warnings on the high side, and
                # the high side before the low side.
                if temperature > temphighalarm:
                    new_temp_status = TEMP_HIGH_ALARM
                elif temperature > temphighwarning:
                    new_temp_status = TEMP_HIGH_WARNING
                elif temperature < templowalarm:
                    new_temp_status = TEMP_LOW_ALARM
                elif temperature < templowwarning:
                    new_temp_status = TEMP_LOW_WARNING

        # Add syslog for temperature
        if ori_temp_status != new_temp_status:
            temperature_status[physical_port] = new_temp_status
            # Log through the logger injected into this task instance.
            self.helper_logger.log_notice("{}: temperature status change from {} to {}".format(
                physical_port_name,
                TEMP_ERROR_TO_DESCRIPTION_DICT[ori_temp_status],
                TEMP_ERROR_TO_DESCRIPTION_DICT[new_temp_status]))
        elif new_temp_status > 0:
            # Status unchanged but still abnormal: keep reporting it.
            self.helper_logger.log_notice("{}: {}".format(physical_port_name, TEMP_ERROR_TO_DESCRIPTION_DICT[new_temp_status]))
433+
434+
363435
def run(self):
364436
if self.task_stopping_event.is_set():
365437
return

0 commit comments

Comments
 (0)