ai-dynamo · FrankD412 · May 21, 2026 · May 27, 2026 · coderabbitai · May 21, 2026
diff --git a/docs/reference/json-export-schema.md b/docs/reference/json-export-schema.md
@@ -31,7 +31,7 @@ A run with 20 requests against a streaming chat endpoint produces entries shaped
 
 ```json
 {
-  "schema_version": "1.3",
+  "schema_version": "1.4",
   "request_latency": {
     "unit": "ms",
     "avg": 2620.71,
@@ -72,6 +72,35 @@ In addition to the per-metric stats blocks, `profile_export_aiperf.json` include
 | `telemetry_data` | object | GPU telemetry summaries when telemetry collection was active. |
 | `error_summary` | array | Per-error counts collected during the run. |
 
+### `telemetry_data`
+
+Schema 1.4 adds a `platform` field to each GPU summary and prefixes GPU
+telemetry metric names with the vendor. NVIDIA metrics collected through DCGM or
+pynvml use `nvidia_*` names, and AMD metrics collected through amdsmi use
+`amd_*` names. Metric semantics are platform-specific, so validate the workload,
+collector behavior, and metric definitions before comparing across platforms.
+
+```json
+"telemetry_data": {
+  "endpoints": {
+    "localhost:9400": {
+      "gpus": {
+        "gpu_0": {
+          "gpu_index": 0,
+          "gpu_name": "NVIDIA H100",
+          "gpu_uuid": "GPU-...",
+          "platform": "nvidia",
+          "metrics": {
+            "nvidia_power_usage": {"unit": "W", "avg": 310.0},
+            "nvidia_gpu_utilization": {"unit": "%", "avg": 85.0}
+          }
+        }
+      }
+    }
+  }
+}
+```
+
 ### `run_info`
 
 Schema 1.2 introduced `run_info` to surface the seed and sweep coordinates needed to reproduce a run from the JSON file alone, without consulting the internal `run_config.json` handoff file. Schema 1.3 extends it with identifiers and the redacted CLI command.
@@ -114,6 +143,7 @@ The current schema version is exported as the top-level `schema_version` field o
 | `1.1` | Added `count` and `sum` to per-metric stats blocks. Backward-compatible for readers that ignore unknown fields; the new fields are present only on record-type metrics, omitted on derived/aggregate. |
 | `1.2` | Added top-level `run_info` block (`random_seed`, `trial`, `run_label`, `variation_label`, `variation_index`, `variation_values`). Backward-compatible: readers that don't need reproducibility can ignore the field. |
 | `1.3` | Added `benchmark_id`, `sweep_id`, and `cli_command` to `run_info`. `benchmark_id` duplicates the top-level field so `run_info` is self-contained; `sweep_id` (UUID4 of the outer sweep) lets readers join all per-run exports from one plan without consulting the parent multi-run artifact directory; `cli_command` records the redacted command line when available. Backward-compatible: nullable fields default to `null` when unavailable. |
+| `1.4` | Added a `platform` field to each GPU summary in `telemetry_data` and renamed the built-in NVIDIA GPU telemetry metrics to `nvidia_*` (for example, `gpu_power_usage` is now `nvidia_power_usage`). AMD telemetry remains under `amd_*`. This is a breaking metric-name change for consumers of `telemetry_data`. |
 
 ### Other JSON exports use independent schema versions
 

diff --git a/docs/tutorials/gpu-telemetry.md b/docs/tutorials/gpu-telemetry.md
diff --git a/src/aiperf/common/models/export_models.py b/src/aiperf/common/models/export_models.py
@@ -89,6 +89,10 @@ class GpuSummary(AIPerfBaseModel):
     gpu_index: int
     gpu_name: str
     gpu_uuid: str
+    platform: str = Field(
+        default="unknown",
+        description="GPU telemetry platform namespace, e.g. 'nvidia', 'amd', or 'unknown'",
+    )
     hostname: str | None
     namespace: str | None = None
     pod_name: str | None = None
@@ -238,7 +242,7 @@ class JsonExportData(AIPerfBaseModel):
     model_config = ConfigDict(extra="allow")
 
     # Increment on breaking changes to the export structure
-    SCHEMA_VERSION: ClassVar[str] = "1.3"
+    SCHEMA_VERSION: ClassVar[str] = "1.4"
 
     schema_version: str | None = Field(
         default=None,

diff --git a/src/aiperf/common/models/telemetry_models.py b/src/aiperf/common/models/telemetry_models.py
@@ -3,13 +3,14 @@
 
 import numpy as np
 from numpy.typing import NDArray
-from pydantic import ConfigDict, Field
+from pydantic import AliasChoices, ConfigDict, Field
 
 from aiperf.common.exceptions import NoMetricValue
 from aiperf.common.models.base_models import AIPerfBaseModel
 from aiperf.common.models.export_models import TelemetryExportData
 from aiperf.common.models.record_models import MetricResult
 from aiperf.common.models.server_metrics_models import TimeRangeFilter
+from aiperf.gpu_telemetry.constants import UNKNOWN_GPU_TELEMETRY_PLATFORM
 
 
 class TelemetryMetrics(AIPerfBaseModel):
@@ -23,47 +24,73 @@ class TelemetryMetrics(AIPerfBaseModel):
 
     model_config = ConfigDict(extra="allow")
 
-    gpu_power_usage: float | None = Field(
-        default=None, description="Current GPU power usage in W"
+    nvidia_power_usage: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices("nvidia_power_usage", "gpu_power_usage"),
+        description="Current NVIDIA GPU power usage in W",
     )
-    energy_consumption: float | None = Field(
-        default=None, description="Cumulative energy consumption in MJ"
+    nvidia_energy_consumption: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "nvidia_energy_consumption", "energy_consumption"
+        ),
+        description="NVIDIA GPU cumulative energy consumption in MJ",
     )
-    gpu_utilization: float | None = Field(
+    nvidia_gpu_utilization: float | None = Field(
         default=None,
-        description="GPU utilization percentage (0-100). "
+        validation_alias=AliasChoices("nvidia_gpu_utilization", "gpu_utilization"),
+        description="NVIDIA GPU utilization percentage (0-100). "
         "Percent of time over the past sample period during which one or more kernels was executing on the GPU.",
     )
-    gpu_memory_used: float | None = Field(
-        default=None, description="GPU memory used in GB"
+    nvidia_memory_utilization: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices("nvidia_memory_utilization", "mem_utilization"),
+        description="NVIDIA memory bandwidth utilization percentage (0-100). "
+        "Percent of time over the past sample period during which global (device) memory was being read or written.",
     )
-    gpu_temperature: float | None = Field(
-        default=None, description="GPU temperature in °C"
+    nvidia_memory_used: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices("nvidia_memory_used", "gpu_memory_used"),
+        description="NVIDIA GPU memory used in GB",
     )
-    mem_utilization: float | None = Field(
+    nvidia_temperature: float | None = Field(
         default=None,
-        description="Memory bandwidth utilization percentage (0-100). "
-        "Percent of time over the past sample period during which global (device) memory was being read or written.",
+        validation_alias=AliasChoices("nvidia_temperature", "gpu_temperature"),
+        description="NVIDIA GPU temperature in °C",
     )
-    sm_utilization: float | None = Field(
+    nvidia_sm_utilization: float | None = Field(
         default=None,
-        description="Streaming multiprocessor utilization percentage (0-100)",
+        validation_alias=AliasChoices("nvidia_sm_utilization", "sm_utilization"),
+        description="NVIDIA streaming multiprocessor utilization percentage (0-100)",
     )
-    decoder_utilization: float | None = Field(
-        default=None, description="Video decoder (NVDEC) utilization percentage (0-100)"
+    nvidia_decoder_utilization: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "nvidia_decoder_utilization", "decoder_utilization"
+        ),
+        description="NVIDIA video decoder (NVDEC) utilization percentage (0-100)",
     )
-    encoder_utilization: float | None = Field(
-        default=None, description="Video encoder (NVENC) utilization percentage (0-100)"
+    nvidia_encoder_utilization: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices(
+            "nvidia_encoder_utilization", "encoder_utilization"
+        ),
+        description="NVIDIA video encoder (NVENC) utilization percentage (0-100)",
     )
-    jpg_utilization: float | None = Field(
-        default=None, description="JPEG decoder utilization percentage (0-100)"
+    nvidia_jpg_utilization: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices("nvidia_jpg_utilization", "jpg_utilization"),
+        description="NVIDIA JPEG decoder utilization percentage (0-100)",
     )
-    xid_errors: float | None = Field(
-        default=None, description="Value of the last XID error encountered"
+    nvidia_xid_errors: float | None = Field(
+        default=None,
+        validation_alias=AliasChoices("nvidia_xid_errors", "xid_errors"),
+        description="Value of the last NVIDIA XID error encountered",
     )
-    power_violation: float | None = Field(
+    nvidia_power_violation: float | None = Field(
         default=None,
-        description="Throttling duration due to power constraints in microseconds",
+        validation_alias=AliasChoices("nvidia_power_violation", "power_violation"),
+        description="NVIDIA throttling duration due to power constraints in microseconds",
     )
 
     # AMD ROCm telemetry (collected by AMDSMITelemetryCollector). These mirror
@@ -141,6 +168,10 @@ class GpuMetadata(AIPerfBaseModel):
     pod_name: str | None = Field(
         default=None, description="Pod name where the GPU is located (kubernetes only)"
     )
+    platform: str = Field(
+        default=UNKNOWN_GPU_TELEMETRY_PLATFORM,
+        description="GPU telemetry platform namespace, e.g. 'nvidia', 'amd', or 'unknown'",
+    )
 
 
 class TelemetryRecord(GpuMetadata):
@@ -649,6 +680,7 @@ def add_record(self, record: TelemetryRecord) -> None:
                     hostname=record.hostname,
                     namespace=record.namespace,
                     pod_name=record.pod_name,
+                    platform=record.platform,
                 ),
             )
 

diff --git a/src/aiperf/exporters/gpu_telemetry_console_exporter.py b/src/aiperf/exporters/gpu_telemetry_console_exporter.py
@@ -71,6 +71,7 @@ def get_renderable(self) -> RenderableType:
             RenderableType: Rich Group containing multiple Tables, or Text message if no data
         """
         renderables = []
+        renderables.append(self._create_platform_disclaimer())
         first_table = True
 
         # TelemetryExportData uses: endpoints[endpoint_display] -> EndpointData.gpus[gpu_key] -> GpuSummary
@@ -98,11 +99,29 @@ def get_renderable(self) -> RenderableType:
                 )
                 renderables.append(metrics_table)
 
-        if not renderables:
+        if len(renderables) == 1:
             return self._create_no_data_message()
 
         return Group(*renderables)
 
+    def _create_platform_disclaimer(self) -> Text:
+        """Build the yellow notice listing the GPU platforms in the results.
+
+        Returns:
+            Text naming each distinct platform found across all endpoints'
+            GPU summaries ("unknown" when there are none), followed by a
+            caution about comparing metrics across platforms.
+        """
+        endpoints = self._telemetry_results.endpoints.values()
+        seen = {gpu.platform for ep in endpoints for gpu in ep.gpus.values()}
+        names = ", ".join(sorted(seen)) if seen else "unknown"
+        message = (
+            f"GPU telemetry platform: {names}. "
+            "Metric semantics are platform-specific; cross-platform comparisons "
+            "require workload and collector validation."
+        )
+        return Text(message, style="yellow")
+
     def _create_summary_header(self, table_title_base: str) -> str:
         """Create the summary header with endpoint reachability status.
 
@@ -112,7 +131,7 @@ def _create_summary_header(self, table_title_base: str) -> str:
         Returns:
             Formatted title string with endpoint status
         """
-        title_lines = ["NVIDIA AIPerf | GPU Telemetry Summary"]
+        title_lines = ["AIPerf | GPU Telemetry Summary"]
 
         endpoints_configured = self._telemetry_results.summary.endpoints_configured
         endpoints_successful = self._telemetry_results.summary.endpoints_successful
@@ -122,15 +141,15 @@ def _create_summary_header(self, table_title_base: str) -> str:
 
         if failed_count == 0:
             title_lines.append(
-                f"[bold green]{successful_count}/{total_count} DCGM endpoints reachable[/bold green]"
+                f"[bold green]{successful_count}/{total_count} telemetry sources reachable[/bold green]"
             )
         elif successful_count == 0:
             title_lines.append(
-                f"[bold red]{successful_count}/{total_count} DCGM endpoints reachable[/bold red]"
+                f"[bold red]{successful_count}/{total_count} telemetry sources reachable[/bold red]"
             )
         else:
             title_lines.append(
-                f"[bold yellow]{successful_count}/{total_count} DCGM endpoints reachable[/bold yellow]"
+                f"[bold yellow]{successful_count}/{total_count} telemetry sources reachable[/bold yellow]"
             )
 
         for endpoint in endpoints_configured:

diff --git a/src/aiperf/exporters/metrics_csv_exporter.py b/src/aiperf/exporters/metrics_csv_exporter.py
@@ -28,7 +28,9 @@ def __init__(self, exporter_config: ExporterConfig, **kwargs) -> None:
         self._percentile_keys = _percentile_keys_from(STAT_KEYS)
         self.trace_or_debug(
             lambda: f"Initializing MetricsCsvExporter with config: {exporter_config}",
-            lambda: f"Initializing MetricsCsvExporter with file path: {self._file_path}",
+            lambda: (
+                f"Initializing MetricsCsvExporter with file path: {self._file_path}"
+            ),
         )
 
     def get_export_info(self) -> FileExportInfo:
@@ -203,6 +205,7 @@ def _write_telemetry_section(self, writer: csv.writer) -> None:
             "GPU_Index",
             "GPU_Name",
             "GPU_UUID",
+            "Platform",
         ]
         optional_headers, optional_fields = self._get_optional_headers_and_fields(
             "Hostname", "Namespace", "Pod Name"
@@ -259,8 +262,8 @@ def _write_gpu_metric_row_from_summary(
             endpoint_display: Display name of the endpoint
             gpu_summary: GpuSummary with pre-computed metrics (from TelemetryExportData)
             optional_fields: List of optional fields to write to the row
-            metric_key: Internal metric name (e.g., "gpu_power_usage")
-            metric_display: Display name for the metric (e.g., "GPU Power Usage")
+            metric_key: Internal metric name (e.g., "nvidia_power_usage")
+            metric_display: Display name for the metric (e.g., "NVIDIA GPU Power Usage")
             unit: Unit of measurement (e.g., "W", "GB", "%")
         """
         try:
@@ -274,6 +277,7 @@ def _write_gpu_metric_row_from_summary(
                 str(gpu_summary.gpu_index),
                 gpu_summary.gpu_name,
                 gpu_summary.gpu_uuid,
+                gpu_summary.platform,
                 metric_with_unit,
             ]
 

diff --git a/src/aiperf/gpu_telemetry/accumulator.py b/src/aiperf/gpu_telemetry/accumulator.py
@@ -288,6 +288,7 @@ def export_results(
                         gpu_index=gpu_data.metadata.gpu_index,
                         gpu_name=gpu_data.metadata.gpu_model_name,
                         gpu_uuid=gpu_uuid,
+                        platform=gpu_data.metadata.platform,
                         hostname=gpu_data.metadata.hostname,
                         namespace=gpu_data.metadata.namespace,
                         pod_name=gpu_data.metadata.pod_name,

diff --git a/src/aiperf/gpu_telemetry/amdsmi_collector.py b/src/aiperf/gpu_telemetry/amdsmi_collector.py
@@ -42,7 +42,10 @@
     TelemetryMetrics,
     TelemetryRecord,
 )
-from aiperf.gpu_telemetry.constants import AMDSMI_SOURCE_IDENTIFIER
+from aiperf.gpu_telemetry.constants import (
+    AMD_GPU_TELEMETRY_PLATFORM,
+    AMDSMI_SOURCE_IDENTIFIER,
+)
 from aiperf.gpu_telemetry.protocols import TErrorCallback, TRecordCallback
 
 __all__ = ["AMDSMITelemetryCollector"]
@@ -285,6 +288,7 @@ def _build_gpu_state(self, index: int, handle: Any) -> _AMDGpuDeviceState | None
                 pci_bus_id=pci_bus_id,
                 device=f"amd{index}",
                 hostname="localhost",
+                platform=AMD_GPU_TELEMETRY_PLATFORM,
             ),
         )