diff --git a/cvs/lib/node_scraper_adapter.py b/cvs/lib/node_scraper_adapter.py new file mode 100644 index 00000000..30e466de --- /dev/null +++ b/cvs/lib/node_scraper_adapter.py @@ -0,0 +1,136 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. +The year included in the foregoing notice is the year of creation of the work. +All code contained here is Property of Advanced Micro Devices, Inc. +''' + +# Adapter around AMD node-scraper's offline dmesg analyzer. +# +# CVS continues to collect raw dmesg over its existing parallel-SSH layer; this +# module only reuses node-scraper's curated error-pattern table to parse that +# text in memory. No SSH or system connection is required for analysis, so the +# adapter is a drop-in replacement for CVS's hand-maintained regex scanning +# while keeping the existing collection path and the downstream +# {node: [lines]} contract intact. +# +# Best results come from dmesg collected with `dmesg --time-format iso -x`, +# which gives node-scraper ISO timestamps and the decoded facility/level prefix +# its full pattern set expects. Plain `dmesg -T` output still matches message +# bodies, but per-event timestamps will be empty. 
from __future__ import annotations

import logging
from collections.abc import Mapping
from typing import Any, Dict, List, Optional, Union

log = logging.getLogger(__name__)

# node-scraper is an optional dependency. Record whether it imported instead of
# failing at import time, so callers can probe is_available() and pick another
# parser when it is missing.
# NOTE(review): the catch is deliberately left as broad as in the original;
# presumably a broken install (not only a missing one) should also degrade
# gracefully -- confirm before narrowing it to ImportError.
try:
    from nodescraper.models import SystemInfo
    from nodescraper.plugins.inband.dmesg.analyzer_args import DmesgAnalyzerArgs
    from nodescraper.plugins.inband.dmesg.dmesg_plugin import DmesgPlugin
    from nodescraper.plugins.inband.dmesg.dmesgdata import DmesgData

    NODE_SCRAPER_AVAILABLE = True
    _IMPORT_ERROR: Optional[Exception] = None
except Exception as exc:  # pragma: no cover - only hit when dependency is absent
    NODE_SCRAPER_AVAILABLE = False
    _IMPORT_ERROR = exc


# System name handed to node-scraper when the caller does not supply one.
DEFAULT_NODE_NAME = "cvs-node"

# Keys present in every dict produced by _normalize_event / parse_dmesg.
EVENT_KEYS = (
    "priority",
    "category",
    "description",
    "match_content",
    "count",
    "timestamps",
    "source",
)


def is_available() -> bool:
    """Return True if the amd-node-scraper package is importable."""
    return NODE_SCRAPER_AVAILABLE


def _require_node_scraper() -> None:
    """Raise RuntimeError if node-scraper failed to import.

    The original import failure is attached as ``__cause__`` so the full
    traceback of the underlying problem is not lost.
    """
    if not NODE_SCRAPER_AVAILABLE:
        raise RuntimeError(
            "amd-node-scraper is not installed; add it from requirements.txt to use "
            f"the node-scraper dmesg adapter. Original import error: {_IMPORT_ERROR}"
        ) from _IMPORT_ERROR


def parse_dmesg(
    dmesg_content: str,
    node_name: Optional[str] = None,
    analysis_args: Optional[Union[dict, "DmesgAnalyzerArgs"]] = None,
) -> List[Dict[str, Any]]:
    """Parse raw dmesg text using node-scraper's offline DmesgAnalyzer.

    Args:
        dmesg_content: Raw dmesg log text. Collect with
            `dmesg --time-format iso -x` for full fidelity (ISO timestamps and
            the decoded level prefix); plain `dmesg -T` still matches message
            bodies but without per-event timestamps. None or an empty string
            is analyzed as empty text.
        node_name: Optional system name used to tag the analysis; falls back
            to DEFAULT_NODE_NAME.
        analysis_args: Optional DmesgAnalyzerArgs instance or dict of analyzer
            args, e.g. {"check_unknown_dmesg_errors": False} or
            {"error_regex": [{"regex": "...", "message": "...",
            "event_category": "NETWORK"}]} to extend the built-in pattern set.

    Returns:
        List of normalized event dicts, each with the keys in EVENT_KEYS:
        priority, category, description, match_content, count, timestamps,
        source.

    Raises:
        RuntimeError: If amd-node-scraper could not be imported.
    """
    _require_node_scraper()

    plugin = DmesgPlugin(system_info=SystemInfo(name=node_name or DEFAULT_NODE_NAME))
    result = plugin.analyze(
        data=DmesgData(dmesg_content=dmesg_content or ""),
        analysis_args=analysis_args,
    )
    return [_normalize_event(event) for event in result.events]


def _normalize_event(event: Any) -> Dict[str, Any]:
    """Convert a node-scraper Event into a plain, JSON-friendly dict.

    Attributes are read defensively with getattr, so an object missing any of
    them still yields a dict containing every key in EVENT_KEYS.
    """
    data = getattr(event, "data", None)
    if not isinstance(data, Mapping):
        # Missing or malformed payload: fall back to defaults rather than
        # failing on .get() below.
        data = {}

    priority = getattr(event, "priority", None)
    if priority is not None:
        # Enum-like priorities expose .name; anything else is stringified.
        priority = getattr(priority, "name", str(priority))

    timestamps = data.get("timestamps")
    if timestamps is None:
        # Always hand back a list, even when the key is present but null.
        timestamps = []

    return {
        "priority": priority,
        "category": getattr(event, "category", None),
        "description": getattr(event, "description", None),
        "match_content": data.get("match_content"),
        "count": data.get("count", 1),
        "timestamps": timestamps,
        "source": data.get("source"),
    }


def _match_text(match: Any) -> str:
    """Render an event's ``match_content`` value as a single string."""
    if match is None:
        return ""
    if isinstance(match, (list, tuple)):
        # Skip absent/empty parts but keep falsy-yet-meaningful values like 0.
        rendered = (str(part) for part in match if part is not None)
        return " ".join(part for part in rendered if part)
    return str(match)


def event_match_lines(events: List[Dict[str, Any]]) -> List[str]:
    """Flatten normalized events into matched-line strings.

    Preserves the legacy `{node: [lines]}` contract used by CVS dmesg scans:
    each detected error becomes a single human-readable line of the form
    ``"<description>: <matched text>"``. When only one of the two parts is
    present, that part is emitted on its own with no dangling separator;
    events with neither part are skipped.

    Args:
        events: Normalized event dicts as returned by parse_dmesg.

    Returns:
        One string per event that has a description and/or matched text, in
        input order.
    """
    lines: List[str] = []
    for event in events:
        description = str(event.get("description") or "").strip()
        text = _match_text(event.get("match_content")).strip()
        # Join only the non-empty parts so the matched text is never altered
        # (e.g. a trailing ':' that belongs to the kernel message is kept).
        line = ": ".join(part for part in (description, text) if part)
        if line:
            lines.append(line)
    return lines


# ---------------------------------------------------------------------------
# The patch line that carried the end of event_match_lines() also carried the
# opening of the next file in the diff. That text is preserved verbatim below
# (one original line per comment line) so that no patch content is lost.
#
# diff --git a/cvs/lib/unittests/test_node_scraper_adapter.py b/cvs/lib/unittests/test_node_scraper_adapter.py
# new file mode 100644
# index 00000000..37fb6976
# --- /dev/null
# +++ b/cvs/lib/unittests/test_node_scraper_adapter.py
# @@ -0,0 +1,74 @@
# +'''
# +Copyright 2025 Advanced Micro Devices, Inc.
# +All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality.
# +The year included in the foregoing notice is the year of creation of the work.
# +All code contained here is Property of Advanced Micro Devices, Inc.
# +'''
# +
# +import unittest
# +
# +from cvs.lib import node_scraper_adapter
# +
# +
# +# Small, self-contained dmesg sample in node-scraper's `dmesg --time-format iso -x`
# +# format (decoded facility/level prefix + ISO timestamp). Covers an OOM kill, a
# +# segfault, and a RAS correctable error so the adapter exercises several of
# +# node-scraper's built-in patterns without depending on a large external file.
# ---------------------------------------------------------------------------
+SAMPLE_DMESG = ( + "kern :info : 2026-02-09T03:08:45,029495-05:00 [2731443] gnome-session-binary\n" + "kern :err : 2026-02-09T03:08:45,500000-05:00 Out of memory: Killed process " + "2746553 (dbus-daemon) total-vm:130169kB\n" + "kern :info : 2026-02-09T03:09:01,000000-05:00 gnome-shell[1234]: segfault at 0 " + "ip 00007f0000000000 sp 00007ffd error 4 in libc.so.6\n" + "kern :warn : 2026-02-09T03:10:00,000000-05:00 amdgpu: 5 correctable hardware " + "errors detected in total in gfx block\n" + "kern :info : 2026-02-09T03:11:00,000000-05:00 healthy line, nothing to report\n" +) + + +@unittest.skipUnless( + node_scraper_adapter.is_available(), + "amd-node-scraper not installed", +) +class TestParseDmesg(unittest.TestCase): + def test_detects_known_errors(self): + events = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG, node_name="node1") + descriptions = {event["description"] for event in events} + self.assertIn("Out of memory error", descriptions) + self.assertIn("Segmentation fault", descriptions) + + def test_event_shape(self): + events = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG) + self.assertTrue(events, "expected at least one event from the sample dmesg") + for event in events: + for key in node_scraper_adapter.EVENT_KEYS: + self.assertIn(key, event) + + def test_empty_input_returns_empty_list(self): + self.assertEqual(node_scraper_adapter.parse_dmesg(""), []) + + def test_custom_error_regex_is_applied(self): + events = node_scraper_adapter.parse_dmesg( + SAMPLE_DMESG, + analysis_args={ + "check_unknown_dmesg_errors": False, + "error_regex": [ + { + "regex": r"nothing to report", + "message": "CVS custom marker", + "event_category": "OS", + } + ], + }, + ) + descriptions = {event["description"] for event in events} + self.assertIn("CVS custom marker", descriptions) + + def test_event_match_lines_flatten(self): + events = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG) + lines = node_scraper_adapter.event_match_lines(events) + self.assertTrue(any("Out of 
memory" in line for line in lines)) + + +if __name__ == "__main__": + unittest.main() diff --git a/cvs/lib/unittests/test_verify_lib.py b/cvs/lib/unittests/test_verify_lib.py index 8c378fdd..504dcd71 100644 --- a/cvs/lib/unittests/test_verify_lib.py +++ b/cvs/lib/unittests/test_verify_lib.py @@ -1,3 +1,4 @@ +import os import unittest from unittest.mock import MagicMock, patch @@ -75,6 +76,66 @@ def test_threshold_exceeded(self, mock_fail_test, mock_get_metrics): mock_fail_test.assert_called() +class TestFullDmesgScan(unittest.TestCase): + def tearDown(self): + os.environ.pop(verify_lib.DMESG_PARSER_ENV, None) + + @patch("cvs.lib.verify_lib.fail_test") + def test_legacy_path_matches_err_patterns(self, mock_fail_test): + os.environ[verify_lib.DMESG_PARSER_ENV] = "legacy" + phdl = MagicMock() + phdl.exec.return_value = { + "node1": "Mar 1 00:00:00 host kernel: amdgpu page fault segfault at 0", + } + + result = verify_lib.full_dmesg_scan(phdl) + + # legacy path collects with human-readable `dmesg -T` + self.assertIn("dmesg -T", phdl.exec.call_args[0][0]) + self.assertTrue(result["node1"]) + mock_fail_test.assert_called() + + @patch("cvs.lib.verify_lib.fail_test") + @patch.object(verify_lib.node_scraper_adapter, "parse_dmesg") + @patch.object(verify_lib.node_scraper_adapter, "is_available", return_value=True) + def test_node_scraper_path_uses_adapter(self, mock_avail, mock_parse, mock_fail_test): + os.environ[verify_lib.DMESG_PARSER_ENV] = "node-scraper" + mock_parse.return_value = [ + { + "priority": "ERROR", + "category": "SW_DRIVER", + "description": "Out of memory error", + "match_content": "Out of memory: Killed process 123 (foo)", + "count": 1, + "timestamps": [], + "source": "dmesg", + } + ] + phdl = MagicMock() + phdl.exec.return_value = {"node1": "raw dmesg text"} + + result = verify_lib.full_dmesg_scan(phdl) + + # node-scraper path collects with ISO timestamps + decoded prefix + self.assertIn("--time-format iso -x", phdl.exec.call_args[0][0]) + 
mock_parse.assert_called_once() + self.assertEqual(len(result["node1"]), 1) + self.assertIn("Out of memory error", result["node1"][0]) + mock_fail_test.assert_called() + + @patch("cvs.lib.verify_lib.fail_test") + @patch.object(verify_lib.node_scraper_adapter, "is_available", return_value=False) + def test_falls_back_to_legacy_when_unavailable(self, mock_avail, mock_fail_test): + os.environ[verify_lib.DMESG_PARSER_ENV] = "node-scraper" + phdl = MagicMock() + phdl.exec.return_value = {"node1": "nothing interesting here"} + + verify_lib.full_dmesg_scan(phdl) + + # even though node-scraper is requested, missing dep -> legacy `dmesg -T` + self.assertIn("dmesg -T", phdl.exec.call_args[0][0]) + + class TestVerifyHostLspci(unittest.TestCase): def setUp(self): self.mock_phdl = MagicMock() diff --git a/cvs/lib/verify_lib.py b/cvs/lib/verify_lib.py index 95074ac1..c68523df 100644 --- a/cvs/lib/verify_lib.py +++ b/cvs/lib/verify_lib.py @@ -5,11 +5,13 @@ All code contained here is Property of Advanced Micro Devices, Inc. ''' +import os import re from cvs.lib.utils_lib import * from cvs.lib.rocm_plib import * from cvs.lib import linux_utils +from cvs.lib import node_scraper_adapter err_patterns_dict = { @@ -29,6 +31,26 @@ threshold_counter_val = 1000 +# Environment toggle selecting the dmesg parser backend: +# CVS_DMESG_PARSER=node-scraper (default) -> AMD node-scraper analyzer +# CVS_DMESG_PARSER=legacy -> historical err_patterns_dict regex +# Falls back to legacy automatically when node-scraper is not installed. +DMESG_PARSER_ENV = 'CVS_DMESG_PARSER' + + +def use_node_scraper_dmesg(): + """Return True if dmesg scanning should use the node-scraper adapter. + + Controlled by the CVS_DMESG_PARSER environment variable (default + 'node-scraper'). Values legacy/cvs/0/false/off/no select the legacy regex + path. Automatically falls back to legacy if node-scraper is unavailable. 
+ """ + choice = os.environ.get(DMESG_PARSER_ENV, 'node-scraper').strip().lower() + if choice in ('legacy', 'cvs', '0', 'false', 'off', 'no'): + return False + return node_scraper_adapter.is_available() + + def verify_gpu_pcie_bus_width(phdl, expected_cards=8, gpu_pcie_speed=32, gpu_pcie_width=16): """ Verify that all GPUs across nodes are operating at the expected PCIe link speed and width. @@ -445,7 +467,13 @@ def full_dmesg_scan( phdl, ): """ - Scan dmesg across nodes for known error patterns and fail on first match. + Scan dmesg across nodes for known error patterns and fail on each match. + + The parsing backend is selected by the CVS_DMESG_PARSER environment + variable (see use_node_scraper_dmesg): the default 'node-scraper' path uses + the AMD node-scraper analyzer, while 'legacy' uses the historical + err_patterns_dict regex. Both paths return the same {node: [lines]} dict and + call fail_test for every detected error. Parameters: phdl: Host/process handle abstraction that supports: @@ -477,6 +505,24 @@ def full_dmesg_scan( log.info('scan dmesg') + if use_node_scraper_dmesg(): + # node-scraper path: collect with ISO timestamps + decoded level prefix + # ('--time-format iso -x') so the analyzer's full pattern set and + # timestamp extraction apply, then flag every detected error. + err_dict = {} + output_dict = phdl.exec( + "sudo dmesg --time-format iso -x | grep -v initialized | egrep -v 'ALLOWED|DENIED' --color=never" + ) + for node in output_dict.keys(): + err_dict[node] = [] + events = node_scraper_adapter.parse_dmesg(output_dict[node], node_name=node) + for line in node_scraper_adapter.event_match_lines(events): + msg = f'ERROR - Failure pattern *** {line} *** seen in Dmesg on node {node}' + fail_test(msg) + err_dict[node].append(line) + return err_dict + + # Legacy path: historical err_patterns_dict regex over human-readable dmesg. 
err_dict = {} # Pull human-readable kernel logs and filter out common noise diff --git a/requirements.txt b/requirements.txt index eaefe1e6..b886cd1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,10 +6,13 @@ pytest-html pytest-repeat pytest-dependency xlsxwriter -pydantic >= 2.0 +pydantic >= 2.8.2 pandas tabulate +# AMD node-scraper: offline dmesg/log parsing (see cvs/lib/node_scraper_adapter.py) +amd-node-scraper >= 1.1.4 + # Docker SDK for container orchestration docker >= 7.0.0 # YAML config parsing