-
Notifications
You must be signed in to change notification settings - Fork 9
AIMVT-139: node-scraper offline dmesg adapter + full_dmesg_scan toggle #221
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| ''' | ||
| Copyright 2025 Advanced Micro Devices, Inc. | ||
| All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. | ||
| The year included in the foregoing notice is the year of creation of the work. | ||
| All code contained here is Property of Advanced Micro Devices, Inc. | ||
| ''' | ||
|
|
||
| # Adapter around AMD node-scraper's offline dmesg analyzer. | ||
| # | ||
| # CVS continues to collect raw dmesg over its existing parallel-SSH layer; this | ||
| # module only reuses node-scraper's curated error-pattern table to parse that | ||
| # text in memory. No SSH or system connection is required for analysis, so the | ||
| # adapter is a drop-in replacement for CVS's hand-maintained regex scanning | ||
| # while keeping the existing collection path and the downstream | ||
| # {node: [lines]} contract intact. | ||
| # | ||
| # Best results come from dmesg collected with `dmesg --time-format iso -x`, | ||
| # which gives node-scraper ISO timestamps and the decoded facility/level prefix | ||
| # its full pattern set expects. Plain `dmesg -T` output still matches message | ||
| # bodies, but per-event timestamps will be empty. | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from typing import Any, Dict, List, Optional, Union | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
| try: | ||
| from nodescraper.models import SystemInfo | ||
| from nodescraper.plugins.inband.dmesg.analyzer_args import DmesgAnalyzerArgs | ||
| from nodescraper.plugins.inband.dmesg.dmesg_plugin import DmesgPlugin | ||
| from nodescraper.plugins.inband.dmesg.dmesgdata import DmesgData | ||
|
|
||
| NODE_SCRAPER_AVAILABLE = True | ||
| _IMPORT_ERROR: Optional[Exception] = None | ||
| except Exception as exc: # pragma: no cover - only hit when dependency is absent | ||
| NODE_SCRAPER_AVAILABLE = False | ||
| _IMPORT_ERROR = exc | ||
|
|
||
|
|
||
| DEFAULT_NODE_NAME = "cvs-node" | ||
|
|
||
| EVENT_KEYS = ( | ||
| "priority", | ||
| "category", | ||
| "description", | ||
| "match_content", | ||
| "count", | ||
| "timestamps", | ||
| "source", | ||
| ) | ||
|
|
||
|
|
||
def is_available() -> bool:
    """Report whether the node-scraper imports at module load succeeded.

    Returns:
        The module-level NODE_SCRAPER_AVAILABLE flag, which is set to True
        when the nodescraper imports worked and False when they raised.
    """
    available = NODE_SCRAPER_AVAILABLE
    return available
|
|
||
|
|
||
| def _require_node_scraper() -> None: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can remove this, because node-scraper will be a mandatory requirement for CVS. |
||
| if not NODE_SCRAPER_AVAILABLE: | ||
| raise RuntimeError( | ||
| "amd-node-scraper is not installed; add it from requirements.txt to use " | ||
| f"the node-scraper dmesg adapter. Original import error: {_IMPORT_ERROR}" | ||
| ) | ||
|
|
||
|
|
||
def parse_dmesg(
    dmesg_content: str,
    node_name: Optional[str] = None,
    analysis_args: Optional[Union[dict, "DmesgAnalyzerArgs"]] = None,
) -> List[Dict[str, Any]]:
    """Run node-scraper's dmesg analysis over already-collected dmesg text.

    Args:
        dmesg_content: Raw dmesg log text. Output of
            `dmesg --time-format iso -x` gives full fidelity (ISO timestamps
            and the decoded level prefix); plain `dmesg -T` output still
            matches message bodies but yields no per-event timestamps. A
            falsy value is analyzed as an empty string.
        node_name: Optional system name used to tag the analysis; falls back
            to DEFAULT_NODE_NAME when falsy.
        analysis_args: Optional DmesgAnalyzerArgs instance or dict of analyzer
            args, e.g. {"check_unknown_dmesg_errors": False} or
            {"error_regex": [{"regex": "...", "message": "...",
            "event_category": "NETWORK"}]} to extend the built-in pattern set.
            Passed through to the plugin unchanged.

    Returns:
        One normalized dict per event reported by the analysis, each carrying
        the keys listed in EVENT_KEYS: priority, category, description,
        match_content, count, timestamps, source.

    Raises:
        RuntimeError: If the node-scraper package could not be imported.
    """
    _require_node_scraper()

    # Same construction order as before: system info, plugin, then the data.
    system = SystemInfo(name=node_name or DEFAULT_NODE_NAME)
    plugin = DmesgPlugin(system_info=system)
    payload = DmesgData(dmesg_content=dmesg_content or "")
    outcome = plugin.analyze(data=payload, analysis_args=analysis_args)
    return list(map(_normalize_event, outcome.events))
|
Comment on lines
+93
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI you can also call plugin.run(collection=False, data=dmesg.log, analysis_args=analysis_args) this way the plugin returns the common DataPluginResult which you can later parse similarly for NicPlugin and other plugins you might be interested in. |
||
|
|
||
|
|
||
| def _normalize_event(event: Any) -> Dict[str, Any]: | ||
| """Convert a node-scraper Event into a plain, JSON-friendly dict.""" | ||
| data = getattr(event, "data", {}) or {} | ||
| priority = getattr(event, "priority", None) | ||
| return { | ||
| "priority": getattr(priority, "name", str(priority)), | ||
| "category": getattr(event, "category", None), | ||
| "description": getattr(event, "description", None), | ||
| "match_content": data.get("match_content"), | ||
| "count": data.get("count", 1), | ||
| "timestamps": data.get("timestamps", []), | ||
| "source": data.get("source"), | ||
| } | ||
|
|
||
|
|
||
def event_match_lines(events: List[Dict[str, Any]]) -> List[str]:
    """Flatten normalized events into matched-line strings.

    Preserves the legacy `{node: [lines]}` contract used by CVS dmesg scans:
    each detected error becomes a single human-readable line of the form
    "<description>: <matched text>".

    Only the non-empty parts are joined, so an event without a description
    yields just the matched text (no stray leading ": "), an event without
    matched text yields just the description, and colons that belong to the
    matched text itself are kept.

    Args:
        events: Normalized event dicts; the "description" and "match_content"
            keys are read. "match_content" may be a string, a list/tuple of
            parts (falsy parts are skipped, the rest space-joined), or None.

    Returns:
        One line per event that has a description or matched text; events
        with neither are omitted.
    """
    lines: List[str] = []
    for event in events:
        match = event.get("match_content")
        if isinstance(match, (list, tuple)):
            text = " ".join(str(part) for part in match if part)
        elif match is None:
            text = ""
        else:
            text = str(match)
        description = str(event.get("description") or "").strip()
        # Join only what is present instead of formatting "desc: text" and
        # trimming colons afterwards, which mangled either end of the line.
        line = ": ".join(part for part in (description, text.strip()) if part)
        if line:
            lines.append(line)
    return lines
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
| ''' | ||
| Copyright 2025 Advanced Micro Devices, Inc. | ||
| All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. | ||
| The year included in the foregoing notice is the year of creation of the work. | ||
| All code contained here is Property of Advanced Micro Devices, Inc. | ||
| ''' | ||
|
|
||
| import unittest | ||
|
|
||
| from cvs.lib import node_scraper_adapter | ||
|
|
||
|
|
||
| # Small, self-contained dmesg sample in node-scraper's `dmesg --time-format iso -x` | ||
| # format (decoded facility/level prefix + ISO timestamp). Covers an OOM kill, a | ||
| # segfault, and a RAS correctable error so the adapter exercises several of | ||
| # node-scraper's built-in patterns without depending on a large external file. | ||
| SAMPLE_DMESG = ( | ||
| "kern :info : 2026-02-09T03:08:45,029495-05:00 [2731443] gnome-session-binary\n" | ||
| "kern :err : 2026-02-09T03:08:45,500000-05:00 Out of memory: Killed process " | ||
| "2746553 (dbus-daemon) total-vm:130169kB\n" | ||
| "kern :info : 2026-02-09T03:09:01,000000-05:00 gnome-shell[1234]: segfault at 0 " | ||
| "ip 00007f0000000000 sp 00007ffd error 4 in libc.so.6\n" | ||
| "kern :warn : 2026-02-09T03:10:00,000000-05:00 amdgpu: 5 correctable hardware " | ||
| "errors detected in total in gfx block\n" | ||
| "kern :info : 2026-02-09T03:11:00,000000-05:00 healthy line, nothing to report\n" | ||
| ) | ||
|
|
||
|
|
||
@unittest.skipUnless(
    node_scraper_adapter.is_available(),
    "amd-node-scraper not installed",
)
class TestParseDmesg(unittest.TestCase):
    """Exercise the node-scraper dmesg adapter against SAMPLE_DMESG.

    The whole class is skipped when the adapter reports that node-scraper
    is not importable.
    """

    def test_detects_known_errors(self):
        """The sample's OOM kill and segfault are both reported."""
        parsed = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG, node_name="node1")
        seen = {item["description"] for item in parsed}
        for expected in ("Out of memory error", "Segmentation fault"):
            self.assertIn(expected, seen)

    def test_event_shape(self):
        """Every returned event carries each key listed in EVENT_KEYS."""
        parsed = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG)
        self.assertTrue(parsed, "expected at least one event from the sample dmesg")
        for item in parsed:
            for field in node_scraper_adapter.EVENT_KEYS:
                self.assertIn(field, item)

    def test_empty_input_returns_empty_list(self):
        """Empty dmesg text produces no events."""
        result = node_scraper_adapter.parse_dmesg("")
        self.assertEqual(result, [])

    def test_custom_error_regex_is_applied(self):
        """A caller-supplied error_regex entry shows up in the results."""
        custom_pattern = {
            "regex": r"nothing to report",
            "message": "CVS custom marker",
            "event_category": "OS",
        }
        parsed = node_scraper_adapter.parse_dmesg(
            SAMPLE_DMESG,
            analysis_args={
                "check_unknown_dmesg_errors": False,
                "error_regex": [custom_pattern],
            },
        )
        seen = {item["description"] for item in parsed}
        self.assertIn("CVS custom marker", seen)

    def test_event_match_lines_flatten(self):
        """Flattened lines include the out-of-memory match."""
        parsed = node_scraper_adapter.parse_dmesg(SAMPLE_DMESG)
        flattened = node_scraper_adapter.event_match_lines(parsed)
        self.assertTrue(any("Out of memory" in entry for entry in flattened))
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| unittest.main() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,11 +5,13 @@ | |
| All code contained here is Property of Advanced Micro Devices, Inc. | ||
| ''' | ||
|
|
||
| import os | ||
| import re | ||
|
|
||
| from cvs.lib.utils_lib import * | ||
| from cvs.lib.rocm_plib import * | ||
| from cvs.lib import linux_utils | ||
| from cvs.lib import node_scraper_adapter | ||
|
|
||
|
|
||
| err_patterns_dict = { | ||
|
|
@@ -29,6 +31,26 @@ | |
| threshold_counter_val = 1000 | ||
|
|
||
|
|
||
| # Environment toggle selecting the dmesg parser backend: | ||
| # CVS_DMESG_PARSER=node-scraper (default) -> AMD node-scraper analyzer | ||
| # CVS_DMESG_PARSER=legacy -> historical err_patterns_dict regex | ||
| # Falls back to legacy automatically when node-scraper is not installed. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a fallback mechanism? If we add node-scraper to requirements, all CVS installations will have it. |
||
| DMESG_PARSER_ENV = 'CVS_DMESG_PARSER' | ||
|
|
||
|
|
||
def use_node_scraper_dmesg():
    """Decide whether dmesg scanning should go through the node-scraper adapter.

    The choice is read from the environment variable named by
    DMESG_PARSER_ENV (default value 'node-scraper'), compared after trimming
    whitespace and lower-casing. The values legacy/cvs/0/false/off/no select
    the legacy regex path. Any other value selects node-scraper, but only if
    the adapter reports the package as available; otherwise the legacy path
    is used.

    Returns:
        True to use the node-scraper adapter, False to use the legacy regex
        path.
    """
    legacy_values = ('legacy', 'cvs', '0', 'false', 'off', 'no')
    raw_value = os.environ.get(DMESG_PARSER_ENV, 'node-scraper')
    selected = raw_value.strip().lower()
    if selected not in legacy_values:
        # Not explicitly opted out: use node-scraper only when importable.
        return node_scraper_adapter.is_available()
    return False
|
|
||
|
|
||
| def verify_gpu_pcie_bus_width(phdl, expected_cards=8, gpu_pcie_speed=32, gpu_pcie_width=16): | ||
| """ | ||
| Verify that all GPUs across nodes are operating at the expected PCIe link speed and width. | ||
|
|
@@ -445,7 +467,13 @@ def full_dmesg_scan( | |
| phdl, | ||
| ): | ||
| """ | ||
| Scan dmesg across nodes for known error patterns and fail on first match. | ||
| Scan dmesg across nodes for known error patterns and fail on each match. | ||
|
|
||
| The parsing backend is selected by the CVS_DMESG_PARSER environment | ||
| variable (see use_node_scraper_dmesg): the default 'node-scraper' path uses | ||
| the AMD node-scraper analyzer, while 'legacy' uses the historical | ||
| err_patterns_dict regex. Both paths return the same {node: [lines]} dict and | ||
| call fail_test for every detected error. | ||
|
|
||
| Parameters: | ||
| phdl: Host/process handle abstraction that supports: | ||
|
|
@@ -477,6 +505,24 @@ def full_dmesg_scan( | |
|
|
||
| log.info('scan dmesg') | ||
|
|
||
| if use_node_scraper_dmesg(): | ||
| # node-scraper path: collect with ISO timestamps + decoded level prefix | ||
| # ('--time-format iso -x') so the analyzer's full pattern set and | ||
| # timestamp extraction apply, then flag every detected error. | ||
| err_dict = {} | ||
| output_dict = phdl.exec( | ||
| "sudo dmesg --time-format iso -x | grep -v initialized | egrep -v 'ALLOWED|DENIED' --color=never" | ||
| ) | ||
| for node in output_dict.keys(): | ||
| err_dict[node] = [] | ||
| events = node_scraper_adapter.parse_dmesg(output_dict[node], node_name=node) | ||
| for line in node_scraper_adapter.event_match_lines(events): | ||
| msg = f'ERROR - Failure pattern *** {line} *** seen in Dmesg on node {node}' | ||
| fail_test(msg) | ||
| err_dict[node].append(line) | ||
| return err_dict | ||
|
|
||
| # Legacy path: historical err_patterns_dict regex over human-readable dmesg. | ||
| err_dict = {} | ||
|
|
||
| # Pull human-readable kernel logs and filter out common noise | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,10 +6,13 @@ pytest-html | |
| pytest-repeat | ||
| pytest-dependency | ||
| xlsxwriter | ||
| pydantic >= 2.0 | ||
| pydantic >= 2.8.2 | ||
| pandas | ||
| tabulate | ||
|
|
||
| # AMD node-scraper: offline dmesg/log parsing (see cvs/lib/node_scraper_adapter.py) | ||
| amd-node-scraper >= 1.1.4 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 1.1.7 is the latest version available There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason to hardcode the version? we are at 1.1.8 as of yesterday |
||
|
|
||
| # Docker SDK for container orchestration | ||
| docker >= 7.0.0 | ||
| # YAML config parsing | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can remove this, because node-scraper will be a mandatory requirement for CVS.