diff --git a/docs/reference/json-export-schema.md b/docs/reference/json-export-schema.md index 62d825c13..7e201ceac 100644 --- a/docs/reference/json-export-schema.md +++ b/docs/reference/json-export-schema.md @@ -31,7 +31,7 @@ A run with 20 requests against a streaming chat endpoint produces entries shaped ```json { - "schema_version": "1.3", + "schema_version": "1.4", "request_latency": { "unit": "ms", "avg": 2620.71, @@ -72,6 +72,35 @@ In addition to the per-metric stats blocks, `profile_export_aiperf.json` include | `telemetry_data` | object | GPU telemetry summaries when telemetry collection was active. | | `error_summary` | array | Per-error counts collected during the run. | +### `telemetry_data` + +Schema 1.4 adds a `platform` field to each GPU summary and vendor-scopes GPU +telemetry metric names. NVIDIA metrics collected through DCGM or pynvml use +`nvidia_*` names, and AMD metrics collected through amdsmi use `amd_*` names. +Metric semantics are platform-specific; cross-platform comparisons require +validation of the workload, collector behavior, and metric definitions. + +```json +"telemetry_data": { + "endpoints": { + "localhost:9400": { + "gpus": { + "gpu_0": { + "gpu_index": 0, + "gpu_name": "NVIDIA H100", + "gpu_uuid": "GPU-...", + "platform": "nvidia", + "metrics": { + "nvidia_power_usage": {"unit": "W", "avg": 310.0}, + "nvidia_gpu_utilization": {"unit": "%", "avg": 85.0} + } + } + } + } + } +} +``` + ### `run_info` Schema 1.2 introduced `run_info` to surface the seed and sweep coordinates needed to reproduce a run from the JSON file alone, without consulting the internal `run_config.json` handoff file. Schema 1.3 extends it with identifiers and the redacted CLI command. @@ -114,6 +143,7 @@ The current schema version is exported as the top-level `schema_version` field o | `1.1` | Added `count` and `sum` to per-metric stats blocks. Backward-compatible for readers that ignore unknown fields; the new fields are present only on record-type metrics, omitted on derived/aggregate. | | `1.2` | Added top-level `run_info` block (`random_seed`, `trial`, `run_label`, `variation_label`, `variation_index`, `variation_values`). Backward-compatible: readers that don't need reproducibility can ignore the field. | | `1.3` | Added `benchmark_id`, `sweep_id`, and `cli_command` to `run_info`. `benchmark_id` duplicates the top-level field so `run_info` is self-contained; `sweep_id` (UUID4 of the outer sweep) lets readers join all per-run exports from one plan without consulting the parent multi-run artifact directory; `cli_command` records the redacted command line when available. Backward-compatible: nullable fields default to `null` when unavailable. | +| `1.4` | Added per-GPU telemetry `platform` and renamed built-in NVIDIA GPU telemetry metrics to `nvidia_*`. AMD telemetry remains under `amd_*`. This is a telemetry metric-name breaking change for consumers of `telemetry_data`. | ### Other JSON exports use independent schema versions diff --git a/docs/tutorials/gpu-telemetry.md b/docs/tutorials/gpu-telemetry.md index caf5a2b96..dc9768632 100644 --- a/docs/tutorials/gpu-telemetry.md +++ b/docs/tutorials/gpu-telemetry.md @@ -53,6 +53,8 @@ AIPerf provides GPU telemetry collection with the `--gpu-telemetry` flag. Here's > > **pynvml mode:** When using `--gpu-telemetry pynvml`, DCGM endpoints are NOT used. Metrics are collected directly from local GPUs via the nvidia-ml-py library. > +> **Platform naming:** NVIDIA metrics from DCGM and pynvml are emitted under `nvidia_*` field names. AMD metrics from amdsmi are emitted under `amd_*` field names. Final GPU summaries include a `platform` field. Metric semantics are platform-specific; do not compare telemetry across NVIDIA and AMD platforms without validating the workload, collector behavior, and metric definitions. +> > **amdsmi mode:** When using `--gpu-telemetry amdsmi`, DCGM endpoints are NOT used. Metrics are collected directly from local AMD GPUs via the amdsmi library and emitted under vendor-namespaced `amd_*` field names (`amd_power`, `amd_gfx_activity`, `amd_temperature`, etc.) rather than NVML-shaped names. On Instinct datacenter parts `amd_mm_activity` is generally absent (sensor returns `'N/A'`); `amd_throttle_status` is a 0.0/1.0 snapshot per scrape (amdsmi exposes a boolean state, not a duration counter). > > To completely disable GPU telemetry collection, use `--no-gpu-telemetry`. @@ -175,20 +177,22 @@ Profiling: 64/64 |████████████████████ INFO Benchmark completed successfully - NVIDIA AIPerf | GPU Telemetry Summary - 1/1 DCGM endpoints reachable +GPU telemetry platform: nvidia. Metric semantics are platform-specific; cross-platform comparisons require workload and collector validation. + + AIPerf | GPU Telemetry Summary + 1/1 telemetry sources reachable • localhost:9401 ✔ localhost:9401 | GPU 0 | NVIDIA H100 80GB HBM3 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┓ ┃ Metric ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p50 ┃ std ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━┩ -│ GPU Power Usage (W) │ 348.69 │ 120.57 │ 386.02 │ 386.02 │ 386.02 │ 378.34 │ 85.97 │ -│ Energy Consumption (MJ) │ 0.24 │ 0.23 │ 0.25 │ 0.25 │ 0.25 │ 0.23 │ 0.01 │ -│ GPU Utilization (%) │ 45.82 │ 0.00 │ 66.00 │ 66.00 │ 66.00 │ 66.00 │ 24.52 │ -│ Memory Copy Utilization (%) │ 21.10 │ 0.00 │ 29.00 │ 29.00 │ 29.00 │ 29.00 │ 10.11 │ -│ GPU Memory Used (GB) │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 0.00 │ -│ GPU Memory Free (GB) │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 0.00 │ +│ NVIDIA GPU Power Usage (W) │ 348.69 │ 120.57 │ 386.02 │ 386.02 │ 386.02 │ 378.34 │ 85.97 │ +│ NVIDIA Energy Consumption (MJ) │ 0.24 │ 0.23 │ 0.25 │ 0.25 │ 0.25 │ 0.23 │ 0.01 │ +│ NVIDIA GPU Utilization (%) │ 45.82 │ 0.00 │ 66.00 │ 66.00 │ 66.00 │ 66.00 │ 24.52 │ +│ NVIDIA Memory Utilization (%) │ 21.10 │ 0.00 │ 29.00 │ 29.00 │ 29.00 │ 29.00 │ 10.11 │ +│ NVIDIA GPU Memory Used (GB) │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 0.00 │ +│ Framebuffer Memory Free (MB) │ 9,387.26 │ 9,385.80 │ 9,387.90 │ 9,387.90 │ 9,387.90 │ 9,387.90 │ 0.97 │ │ SM Clock Frequency (MHz) │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 0.00 │ │ Memory Clock Frequency (MHz) │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 0.00 │ │ Memory Temperature (°C) │ 45.99 │ 41.00 │ 48.00 │ 48.00 │ 48.00 │ 46.00 │ 2.08 │ @@ -410,17 +414,17 @@ The nvidia-ml-py library (pynvml) collects the following metrics directly from t | Metric | Description | Unit | |--------|-------------|------| -| GPU Power Usage | Current power draw | W | -| Energy Consumption | Total energy since boot | MJ | -| GPU Utilization | GPU compute utilization | % | -| Memory Utilization | Memory controller utilization | % | -| GPU Memory Used | Framebuffer memory in use | GB | -| GPU Temperature | GPU die temperature | °C | -| SM Utilization | Streaming multiprocessor utilization | % | -| Decoder Utilization | Video decoder utilization | % | -| Encoder Utilization | Video encoder utilization | % | -| JPEG Utilization | JPEG decoder utilization | % | -| Power Violation | Throttling duration due to power limits | µs | +| `nvidia_power_usage` | Current power draw | W | +| `nvidia_energy_consumption` | Total energy since boot | MJ | +| `nvidia_gpu_utilization` | GPU compute utilization | % | +| `nvidia_memory_utilization` | Memory controller utilization | % | +| `nvidia_memory_used` | Framebuffer memory in use | GB | +| `nvidia_temperature` | GPU die temperature | °C | +| `nvidia_sm_utilization` | Streaming multiprocessor utilization | % | +| `nvidia_decoder_utilization` | Video decoder utilization | % | +| `nvidia_encoder_utilization` | Video encoder utilization | % | +| `nvidia_jpg_utilization` | JPEG decoder utilization | % | +| `nvidia_power_violation` | Throttling duration due to power limits | µs | > [!NOTE] > Not all metrics are available on all GPU models. AIPerf gracefully handles missing metrics and reports only what the hardware supports. @@ -480,7 +484,7 @@ AMD signals are emitted under their own vendor-namespaced field names (not alias | Hardware | NVIDIA | NVIDIA | AMD ROCm | | Setup complexity | Requires container/service | `pip install nvidia-ml-py` | Ships with ROCm; install wheel from `/opt/rocm/share/amd_smi/` if missing | | Multi-node support | Yes (HTTP) | No (local) | No (local) | -| Field naming | `gpu_*` (NVML-shaped) | `gpu_*` (NVML-shaped) | `amd_*` (vendor-namespaced) | +| Field naming | `nvidia_*` | `nvidia_*` | `amd_*` | | Encoder/decoder util | Yes | Yes | No (Instinct GPUs report `'N/A'`) | | Error reporting | XID errors | (none) | ECC uncorrectable count (`amd_ecc_uncorrectable`) | | SM-level utilization | Yes (DCGM_FI_PROF_SM_ACTIVE) | Yes (GPM API) | Aliased to `gfx_activity` | @@ -539,14 +543,18 @@ aiperf profile --model MODEL ... --gpu-telemetry localhost:9400 dashboard custom The CSV format is identical to DCGM exporter configuration. See the **vLLM setup section above** (Step 1: Create a custom metrics configuration) for the complete CSV format example with all available DCGM fields. -**Behavior**: Custom metrics **extend** (not replace) the 7 core default metrics: -- GPU Power Usage -- Energy Consumption -- GPU Utilization -- GPU Memory Used -- GPU Temperature -- XID Errors -- Power Violation +**Behavior**: Custom metrics **extend** (not replace) the default NVIDIA metrics: +- `nvidia_power_usage` +- `nvidia_energy_consumption` +- `nvidia_gpu_utilization` +- `nvidia_memory_utilization` +- `nvidia_memory_used` +- `nvidia_temperature` +- `nvidia_encoder_utilization` +- `nvidia_decoder_utilization` +- `nvidia_sm_utilization` +- `nvidia_xid_errors` +- `nvidia_power_violation` > [!NOTE] > The file path can be absolute or relative. Use `.csv` extension so AIPerf can distinguish it from DCGM endpoint URLs. @@ -557,20 +565,22 @@ The CSV format is identical to DCGM exporter configuration. See the **vLLM setup ## Example Console Display: ``` - NVIDIA AIPerf | GPU Telemetry Summary - 1/1 DCGM endpoints reachable +GPU telemetry platform: nvidia. Metric semantics are platform-specific; cross-platform comparisons require workload and collector validation. + + AIPerf | GPU Telemetry Summary + 1/1 telemetry sources reachable • localhost:9401 ✔ localhost:9401 | GPU 0 | NVIDIA H100 80GB HBM3 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┓ ┃ Metric ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p50 ┃ std ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━┩ -│ GPU Power Usage (W) │ 348.69 │ 120.57 │ 386.02 │ 386.02 │ 386.02 │ 378.34 │ 85.97 │ -│ Energy Consumption (MJ) │ 0.24 │ 0.23 │ 0.25 │ 0.25 │ 0.25 │ 0.23 │ 0.01 │ -│ GPU Utilization (%) │ 45.82 │ 0.00 │ 66.00 │ 66.00 │ 66.00 │ 66.00 │ 24.52 │ -│ Memory Copy Utilization (%) │ 21.10 │ 0.00 │ 29.00 │ 29.00 │ 29.00 │ 29.00 │ 10.11 │ -│ GPU Memory Used (GB) │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 0.00 │ -│ GPU Memory Free (GB) │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 9.39 │ 0.00 │ +│ NVIDIA GPU Power Usage (W) │ 348.69 │ 120.57 │ 386.02 │ 386.02 │ 386.02 │ 378.34 │ 85.97 │ +│ NVIDIA Energy Consumption (MJ) │ 0.24 │ 0.23 │ 0.25 │ 0.25 │ 0.25 │ 0.23 │ 0.01 │ +│ NVIDIA GPU Utilization (%) │ 45.82 │ 0.00 │ 66.00 │ 66.00 │ 66.00 │ 66.00 │ 24.52 │ +│ NVIDIA Memory Utilization (%) │ 21.10 │ 0.00 │ 29.00 │ 29.00 │ 29.00 │ 29.00 │ 10.11 │ +│ NVIDIA GPU Memory Used (GB) │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 92.70 │ 0.00 │ +│ Framebuffer Memory Free (MB) │ 9,387.26 │ 9,385.80 │ 9,387.90 │ 9,387.90 │ 9,387.90 │ 9,387.90 │ 0.97 │ │ SM Clock Frequency (MHz) │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 1,980.00 │ 0.00 │ │ Memory Clock Frequency (MHz) │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 2,619.00 │ 0.00 │ │ Memory Temperature (°C) │ 45.99 │ 41.00 │ 48.00 │ 48.00 │ 48.00 │ 46.00 │ 2.08 │ @@ -582,18 +592,18 @@ The CSV format is identical to DCGM exporter configuration. See the **vLLM setup ## Example CSV Export ``` -Endpoint,GPU_Index,GPU_Name,GPU_UUID,Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,GPU Power Usage (W),348.69,120.57,386.02,120.57,120.57,,378.34,378.34,386.02,386.02,386.02,386.02,85.97 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,Energy Consumption (MJ),0.24,0.23,0.25,0.23,0.23,,0.23,0.23,0.25,0.25,0.25,0.25,0.01 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,GPU Utilization (%),45.82,0.00,66.00,0.00,0.00,,27.00,66.00,66.00,66.00,66.00,66.00,24.52 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,Memory Copy Utilization (%),21.10,0.00,29.00,0.00,0.00,,15.00,29.00,29.00,29.00,29.00,29.00,10.11 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,GPU Memory Used (GB),92.70,92.70,92.70,92.70,92.70,,92.70,92.70,92.70,92.70,92.70,92.70,0.00 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,GPU Memory Free (GB),9.39,9.39,9.39,9.39,9.39,,9.39,9.39,9.39,9.39,9.39,9.39,0.00 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,SM Clock Frequency (MHz),1980.00,1980.00,1980.00,1980.00,1980.00,,1980.00,1980.00,1980.00,1980.00,1980.00,1980.00,0.00 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,Memory Clock Frequency (MHz),2619.00,2619.00,2619.00,2619.00,2619.00,,2619.00,2619.00,2619.00,2619.00,2619.00,2619.00,0.00 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,Memory Temperature (°C),45.99,41.00,48.00,41.00,41.00,,46.00,46.00,48.00,48.00,48.00,48.00,2.08 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,GPU Temperature (°C),38.87,33.00,41.00,33.00,33.00,,39.00,39.00,41.00,41.00,41.00,41.00,2.38 -localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,XID Errors (count),0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00 +Endpoint,GPU_Index,GPU_Name,GPU_UUID,Platform,Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA GPU Power Usage (W),348.69,120.57,386.02,120.57,120.57,,378.34,378.34,386.02,386.02,386.02,386.02,85.97 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA Energy Consumption (MJ),0.24,0.23,0.25,0.23,0.23,,0.23,0.23,0.25,0.25,0.25,0.25,0.01 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA GPU Utilization (%),45.82,0.00,66.00,0.00,0.00,,27.00,66.00,66.00,66.00,66.00,66.00,24.52 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA Memory Utilization (%),21.10,0.00,29.00,0.00,0.00,,15.00,29.00,29.00,29.00,29.00,29.00,10.11 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA GPU Memory Used (GB),92.70,92.70,92.70,92.70,92.70,,92.70,92.70,92.70,92.70,92.70,92.70,0.00 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,Framebuffer Memory Free (MB),9387.26,9385.80,9387.90,9385.80,9385.80,,9385.80,9387.90,9387.90,9387.90,9387.90,9387.90,0.97 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,SM Clock Frequency (MHz),1980.00,1980.00,1980.00,1980.00,1980.00,,1980.00,1980.00,1980.00,1980.00,1980.00,1980.00,0.00 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,Memory Clock Frequency (MHz),2619.00,2619.00,2619.00,2619.00,2619.00,,2619.00,2619.00,2619.00,2619.00,2619.00,2619.00,0.00 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,Memory Temperature (°C),45.99,41.00,48.00,41.00,41.00,,46.00,46.00,48.00,48.00,48.00,48.00,2.08 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA GPU Temperature (°C),38.87,33.00,41.00,33.00,33.00,,39.00,39.00,41.00,41.00,41.00,41.00,2.38 +localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa,nvidia,NVIDIA XID Errors (count),0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00 ``` ## Example JSON Export @@ -617,9 +627,10 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "gpu_index": 0, "gpu_name": "NVIDIA H100 80GB HBM3", "gpu_uuid": "GPU-afc3c15a-48a5-d669-0634-191c629f95fa", + "platform": "nvidia", "hostname": "69450c620e4d", "metrics": { - "gpu_power_usage": { + "nvidia_power_usage": { "avg": 348.6908823529412, "min": 120.57, "max": 386.022, @@ -636,7 +647,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "W" }, - "energy_consumption": { + "nvidia_energy_consumption": { "avg": 0.23782271866013072, "min": 0.229901671, "max": 0.246497393, @@ -653,7 +664,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "MJ" }, - "gpu_utilization": { + "nvidia_gpu_utilization": { "avg": 45.8235294117647, "min": 0.0, "max": 66.0, @@ -670,7 +681,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "%" }, - "memory_copy_utilization": { + "nvidia_memory_utilization": { "avg": 21.098039215686274, "min": 0.0, "max": 29.0, @@ -687,7 +698,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "%" }, - "gpu_memory_used": { + "nvidia_memory_used": { "avg": 92.69685977516342, "min": 92.69621555200001, "max": 92.698312704, @@ -704,24 +715,24 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "GB" }, - "gpu_memory_free": { - "avg": 9.387256704836602, - "min": 9.385803776000001, - "max": 9.387900928, - "p1": 9.385803776000001, - "p5": 9.385803776000001, + "nvidia_fb_free": { + "avg": 9387.256704836602, + "min": 9385.803776000001, + "max": 9387.900928, + "p1": 9385.803776000001, + "p5": 9385.803776000001, "p10": null, - "p25": 9.385803776000001, - "p50": 9.387900928, - "p75": 9.387900928, - "p90": 9.387900928, - "p95": 9.387900928, - "p99": 9.387900928, - "std": 0.0009674763104633748, + "p25": 9385.803776000001, + "p50": 9387.900928, + "p75": 9387.900928, + "p90": 9387.900928, + "p95": 9387.900928, + "p99": 9387.900928, + "std": 0.9674763104633748, "count": 153, - "unit": "GB" + "unit": "MB" }, - "sm_clock_frequency": { + "nvidia_sm_clock": { "avg": 1980.0, "min": 1980.0, "max": 1980.0, @@ -738,7 +749,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "MHz" }, - "memory_clock_frequency": { + "nvidia_mem_clock": { "avg": 2619.0, "min": 2619.0, "max": 2619.0, @@ -755,7 +766,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "MHz" }, - "memory_temperature": { + "nvidia_memory_temp": { "avg": 45.99346405228758, "min": 41.0, "max": 48.0, @@ -772,7 +783,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "°C" }, - "gpu_temperature": { + "nvidia_temperature": { "avg": 38.869281045751634, "min": 33.0, "max": 41.0, @@ -789,7 +800,7 @@ localhost:9401,0,NVIDIA H100 80GB HBM3,GPU-afc3c15a-48a5-d669-0634-191c629f95fa, "count": 153, "unit": "°C" }, - "xid_errors": { + "nvidia_xid_errors": { "avg": 0.0, "min": 0.0, "max": 0.0, diff --git a/src/aiperf/common/models/export_models.py b/src/aiperf/common/models/export_models.py index 5ad503693..7b8b6c030 100644 --- a/src/aiperf/common/models/export_models.py +++ b/src/aiperf/common/models/export_models.py @@ -94,6 +94,10 @@ class GpuSummary(AIPerfBaseModel): gpu_index: int gpu_name: str gpu_uuid: str + platform: str = Field( + default="unknown", + description="GPU telemetry platform namespace, e.g. 'nvidia', 'amd', or 'unknown'", + ) hostname: str | None namespace: str | None = None pod_name: str | None = None @@ -266,7 +270,7 @@ class JsonExportData(AIPerfBaseModel): model_config = ConfigDict(extra="allow") # Increment on breaking changes to the export structure - SCHEMA_VERSION: ClassVar[str] = "1.3" + SCHEMA_VERSION: ClassVar[str] = "1.4" schema_version: str | None = Field( default=None, diff --git a/src/aiperf/common/models/telemetry_models.py b/src/aiperf/common/models/telemetry_models.py index 425e20276..12828b6c8 100644 --- a/src/aiperf/common/models/telemetry_models.py +++ b/src/aiperf/common/models/telemetry_models.py @@ -3,13 +3,14 @@ import numpy as np from numpy.typing import NDArray -from pydantic import ConfigDict, Field +from pydantic import AliasChoices, ConfigDict, Field from aiperf.common.exceptions import NoMetricValue from aiperf.common.models.base_models import AIPerfBaseModel from aiperf.common.models.export_models import TelemetryExportData from aiperf.common.models.record_models import MetricResult from aiperf.common.models.server_metrics_models import TimeRangeFilter +from aiperf.gpu_telemetry.constants import UNKNOWN_GPU_TELEMETRY_PLATFORM class TelemetryMetrics(AIPerfBaseModel): @@ -23,47 +24,73 @@ class TelemetryMetrics(AIPerfBaseModel): model_config = ConfigDict(extra="allow") - gpu_power_usage: float | None = Field( - default=None, description="Current GPU power usage in W" + nvidia_power_usage: float | None = Field( + default=None, + validation_alias=AliasChoices("nvidia_power_usage", "gpu_power_usage"), + description="Current NVIDIA GPU power usage in W", ) - energy_consumption: float | None = Field( - default=None, description="Cumulative energy consumption in MJ" + nvidia_energy_consumption: float | None = Field( + default=None, + validation_alias=AliasChoices( + "nvidia_energy_consumption", "energy_consumption" + ), + description="NVIDIA GPU cumulative energy consumption in MJ", ) - gpu_utilization: float | None = Field( + nvidia_gpu_utilization: float | None = Field( default=None, - description="GPU utilization percentage (0-100). " + validation_alias=AliasChoices("nvidia_gpu_utilization", "gpu_utilization"), + description="NVIDIA GPU utilization percentage (0-100). " "Percent of time over the past sample period during which one or more kernels was executing on the GPU.", ) - gpu_memory_used: float | None = Field( - default=None, description="GPU memory used in GB" + nvidia_memory_utilization: float | None = Field( + default=None, + validation_alias=AliasChoices("nvidia_memory_utilization", "mem_utilization"), + description="NVIDIA memory bandwidth utilization percentage (0-100). " + "Percent of time over the past sample period during which global (device) memory was being read or written.", ) - gpu_temperature: float | None = Field( - default=None, description="GPU temperature in °C" + nvidia_memory_used: float | None = Field( + default=None, + validation_alias=AliasChoices("nvidia_memory_used", "gpu_memory_used"), + description="NVIDIA GPU memory used in GB", ) - mem_utilization: float | None = Field( + nvidia_temperature: float | None = Field( default=None, - description="Memory bandwidth utilization percentage (0-100). " - "Percent of time over the past sample period during which global (device) memory was being read or written.", + validation_alias=AliasChoices("nvidia_temperature", "gpu_temperature"), + description="NVIDIA GPU temperature in °C", ) - sm_utilization: float | None = Field( + nvidia_sm_utilization: float | None = Field( default=None, - description="Streaming multiprocessor utilization percentage (0-100)", + validation_alias=AliasChoices("nvidia_sm_utilization", "sm_utilization"), + description="NVIDIA streaming multiprocessor utilization percentage (0-100)", ) - decoder_utilization: float | None = Field( - default=None, description="Video decoder (NVDEC) utilization percentage (0-100)" + nvidia_decoder_utilization: float | None = Field( + default=None, + validation_alias=AliasChoices( + "nvidia_decoder_utilization", "decoder_utilization" + ), + description="NVIDIA video decoder (NVDEC) utilization percentage (0-100)", ) - encoder_utilization: float | None = Field( - default=None, description="Video encoder (NVENC) utilization percentage (0-100)" + nvidia_encoder_utilization: float | None = Field( + default=None, + validation_alias=AliasChoices( + "nvidia_encoder_utilization", "encoder_utilization" + ), + description="NVIDIA video encoder (NVENC) utilization percentage (0-100)", ) - jpg_utilization: float | None = Field( - default=None, description="JPEG decoder utilization percentage (0-100)" + nvidia_jpg_utilization: float | None = Field( + default=None, + validation_alias=AliasChoices("nvidia_jpg_utilization", "jpg_utilization"), + description="NVIDIA JPEG decoder utilization percentage (0-100)", ) - xid_errors: float | None = Field( - default=None, description="Value of the last XID error encountered" + nvidia_xid_errors: float | None = Field( + default=None, + validation_alias=AliasChoices("nvidia_xid_errors", "xid_errors"), + description="Value of the last NVIDIA XID error encountered", ) - power_violation: float | None = Field( + nvidia_power_violation: float | None = Field( default=None, - description="Throttling duration due to power constraints in microseconds", + validation_alias=AliasChoices("nvidia_power_violation", "power_violation"), + description="NVIDIA throttling duration due to power constraints in microseconds", ) # AMD ROCm telemetry (collected by AMDSMITelemetryCollector). These mirror @@ -141,6 +168,10 @@ class GpuMetadata(AIPerfBaseModel): pod_name: str | None = Field( default=None, description="Pod name where the GPU is located (kubernetes only)" ) + platform: str = Field( + default=UNKNOWN_GPU_TELEMETRY_PLATFORM, + description="GPU telemetry platform namespace, e.g. 'nvidia', 'amd', or 'unknown'", + ) class TelemetryRecord(GpuMetadata): @@ -649,6 +680,7 @@ def add_record(self, record: TelemetryRecord) -> None: hostname=record.hostname, namespace=record.namespace, pod_name=record.pod_name, + platform=record.platform, ), ) diff --git a/src/aiperf/exporters/gpu_telemetry_console_exporter.py b/src/aiperf/exporters/gpu_telemetry_console_exporter.py index 816c5c2aa..972b3d6c6 100644 --- a/src/aiperf/exporters/gpu_telemetry_console_exporter.py +++ b/src/aiperf/exporters/gpu_telemetry_console_exporter.py @@ -71,6 +71,7 @@ def get_renderable(self) -> RenderableType: RenderableType: Rich Group containing multiple Tables, or Text message if no data """ renderables = [] + renderables.append(self._create_platform_disclaimer()) first_table = True # TelemetryExportData uses: endpoints[endpoint_display] -> EndpointData.gpus[gpu_key] -> GpuSummary @@ -98,11 +99,29 @@ def get_renderable(self) -> RenderableType: ) renderables.append(metrics_table) - if not renderables: + if len(renderables) == 1: return self._create_no_data_message() return Group(*renderables) + def _create_platform_disclaimer(self) -> Text: + """Create platform-specific comparability warning for telemetry summaries.""" + platforms = sorted( + { + gpu_summary.platform + for endpoint_data in self._telemetry_results.endpoints.values() + for gpu_summary in endpoint_data.gpus.values() + } + ) + platform_text = ", ".join(platforms) if platforms else "unknown" + return Text( + "GPU telemetry platform: " + f"{platform_text}. " + "Metric semantics are platform-specific; cross-platform comparisons " + "require workload and collector validation.", + style="yellow", + ) + def _create_summary_header(self, table_title_base: str) -> str: """Create the summary header with endpoint reachability status. @@ -112,7 +131,7 @@ def _create_summary_header(self, table_title_base: str) -> str: Returns: Formatted title string with endpoint status """ - title_lines = ["NVIDIA AIPerf | GPU Telemetry Summary"] + title_lines = ["AIPerf | GPU Telemetry Summary"] endpoints_configured = self._telemetry_results.summary.endpoints_configured endpoints_successful = self._telemetry_results.summary.endpoints_successful @@ -122,15 +141,15 @@ def _create_summary_header(self, table_title_base: str) -> str: if failed_count == 0: title_lines.append( - f"[bold green]{successful_count}/{total_count} DCGM endpoints reachable[/bold green]" + f"[bold green]{successful_count}/{total_count} telemetry sources reachable[/bold green]" ) elif successful_count == 0: title_lines.append( - f"[bold red]{successful_count}/{total_count} DCGM endpoints reachable[/bold red]" + f"[bold red]{successful_count}/{total_count} telemetry sources reachable[/bold red]" ) else: title_lines.append( - f"[bold yellow]{successful_count}/{total_count} DCGM endpoints reachable[/bold yellow]" + f"[bold yellow]{successful_count}/{total_count} telemetry sources reachable[/bold yellow]" ) for endpoint in endpoints_configured: diff --git a/src/aiperf/exporters/metrics_csv_exporter.py b/src/aiperf/exporters/metrics_csv_exporter.py index f51913362..a7669f1f3 100644 --- a/src/aiperf/exporters/metrics_csv_exporter.py +++ b/src/aiperf/exporters/metrics_csv_exporter.py @@ -28,7 +28,9 @@ def __init__(self, exporter_config: ExporterConfig, **kwargs) -> None: self._percentile_keys = _percentile_keys_from(STAT_KEYS) self.trace_or_debug( lambda: f"Initializing MetricsCsvExporter with config: {exporter_config}", - lambda: f"Initializing MetricsCsvExporter with file path: {self._file_path}", + lambda: ( + f"Initializing MetricsCsvExporter with file path: {self._file_path}" + ), ) def get_export_info(self) -> FileExportInfo: @@ -203,6 +205,7 @@ def _write_telemetry_section(self, writer: csv.writer) -> None: "GPU_Index", "GPU_Name", "GPU_UUID", + "Platform", ] optional_headers, optional_fields = self._get_optional_headers_and_fields( "Hostname", "Namespace", "Pod Name" @@ -259,8 +262,8 @@ def _write_gpu_metric_row_from_summary( endpoint_display: Display name of the endpoint gpu_summary: GpuSummary with pre-computed metrics (from TelemetryExportData) optional_fields: List of optional fields to write to the row - metric_key: Internal metric name (e.g., "gpu_power_usage") - metric_display: Display name for the metric (e.g., "GPU Power Usage") + metric_key: Internal metric name (e.g., "nvidia_power_usage") + metric_display: Display name for the metric (e.g., "NVIDIA GPU Power Usage") unit: Unit of measurement (e.g., "W", "GB", "%") """ try: @@ -274,6 +277,7 @@ def _write_gpu_metric_row_from_summary( str(gpu_summary.gpu_index), gpu_summary.gpu_name, gpu_summary.gpu_uuid, + gpu_summary.platform, metric_with_unit, ] diff --git a/src/aiperf/gpu_telemetry/accumulator.py b/src/aiperf/gpu_telemetry/accumulator.py index 929c060da..1c03d8c87 100644 --- a/src/aiperf/gpu_telemetry/accumulator.py +++ b/src/aiperf/gpu_telemetry/accumulator.py @@ -288,6 +288,7 @@ def export_results( gpu_index=gpu_data.metadata.gpu_index, gpu_name=gpu_data.metadata.gpu_model_name, gpu_uuid=gpu_uuid, + platform=gpu_data.metadata.platform, hostname=gpu_data.metadata.hostname, namespace=gpu_data.metadata.namespace, pod_name=gpu_data.metadata.pod_name, diff --git a/src/aiperf/gpu_telemetry/amdsmi_collector.py b/src/aiperf/gpu_telemetry/amdsmi_collector.py index e74674b1d..8369ebee4 100644 --- a/src/aiperf/gpu_telemetry/amdsmi_collector.py +++ b/src/aiperf/gpu_telemetry/amdsmi_collector.py @@ -42,7 +42,10 @@ TelemetryMetrics, TelemetryRecord, ) -from aiperf.gpu_telemetry.constants import AMDSMI_SOURCE_IDENTIFIER +from aiperf.gpu_telemetry.constants import ( + AMD_GPU_TELEMETRY_PLATFORM, + AMDSMI_SOURCE_IDENTIFIER, +) from aiperf.gpu_telemetry.protocols import TErrorCallback, TRecordCallback __all__ = ["AMDSMITelemetryCollector"] @@ -285,6 +288,7 @@ def _build_gpu_state(self, index: int, handle: Any) -> _AMDGpuDeviceState | None pci_bus_id=pci_bus_id, device=f"amd{index}", hostname="localhost", + platform=AMD_GPU_TELEMETRY_PLATFORM, ), ) diff --git a/src/aiperf/gpu_telemetry/constants.py b/src/aiperf/gpu_telemetry/constants.py index ee9730e7b..b7640d5ef 100644 --- a/src/aiperf/gpu_telemetry/constants.py +++ b/src/aiperf/gpu_telemetry/constants.py @@ -19,19 +19,38 @@ # Source identifier for amdsmi collector (used in TelemetryRecord.dcgm_url field) AMDSMI_SOURCE_IDENTIFIER = "amdsmi://localhost" +NVIDIA_GPU_TELEMETRY_PLATFORM = "nvidia" +AMD_GPU_TELEMETRY_PLATFORM = "amd" +UNKNOWN_GPU_TELEMETRY_PLATFORM = "unknown" + +NVIDIA_TELEMETRY_FIELD_ALIASES = { + "gpu_power_usage": "nvidia_power_usage", + "energy_consumption": "nvidia_energy_consumption", + "gpu_utilization": "nvidia_gpu_utilization", + "mem_utilization": "nvidia_memory_utilization", + "gpu_memory_used": "nvidia_memory_used", + "gpu_temperature": "nvidia_temperature", + "decoder_utilization": "nvidia_decoder_utilization", + "encoder_utilization": "nvidia_encoder_utilization", + "jpg_utilization": "nvidia_jpg_utilization", + "sm_utilization": "nvidia_sm_utilization", + "xid_errors": "nvidia_xid_errors", + "power_violation": "nvidia_power_violation", +} + # DCGM field mapping to telemetry record fields DCGM_TO_FIELD_MAPPING = { - "DCGM_FI_DEV_POWER_USAGE": "gpu_power_usage", - "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": "energy_consumption", - "DCGM_FI_DEV_GPU_UTIL": "gpu_utilization", - "DCGM_FI_DEV_MEM_COPY_UTIL": "mem_utilization", - "DCGM_FI_DEV_FB_USED": "gpu_memory_used", - "DCGM_FI_DEV_GPU_TEMP": "gpu_temperature", - "DCGM_FI_DEV_ENC_UTIL": "encoder_utilization", - "DCGM_FI_DEV_DEC_UTIL": "decoder_utilization", - "DCGM_FI_PROF_SM_ACTIVE": "sm_utilization", - "DCGM_FI_DEV_XID_ERRORS": "xid_errors", - "DCGM_FI_DEV_POWER_VIOLATION": "power_violation", + "DCGM_FI_DEV_POWER_USAGE": "nvidia_power_usage", + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": "nvidia_energy_consumption", + "DCGM_FI_DEV_GPU_UTIL": "nvidia_gpu_utilization", + "DCGM_FI_DEV_MEM_COPY_UTIL": "nvidia_memory_utilization", + "DCGM_FI_DEV_FB_USED": "nvidia_memory_used", + "DCGM_FI_DEV_GPU_TEMP": "nvidia_temperature", + "DCGM_FI_DEV_ENC_UTIL": "nvidia_encoder_utilization", + "DCGM_FI_DEV_DEC_UTIL": "nvidia_decoder_utilization", + "DCGM_FI_PROF_SM_ACTIVE": "nvidia_sm_utilization", + "DCGM_FI_DEV_XID_ERRORS": "nvidia_xid_errors", + "DCGM_FI_DEV_POWER_VIOLATION": "nvidia_power_violation", } # GPU Telemetry Metrics Configuration @@ -40,18 +59,46 @@ # - field_name: Corresponds to TelemetryMetrics model field name # - unit_enum: MetricUnitT enum (use .value in exporters to get string) GPU_TELEMETRY_METRICS_CONFIG: list[tuple[str, str, MetricUnitT]] = [ - ("GPU Power Usage", "gpu_power_usage", PowerMetricUnit.WATT), - ("Energy Consumption", "energy_consumption", EnergyMetricUnit.MEGAJOULE), - ("GPU Utilization", "gpu_utilization", GenericMetricUnit.PERCENT), - ("GPU Memory Used", "gpu_memory_used", MetricSizeUnit.GIGABYTES), - ("GPU Temperature", "gpu_temperature", TemperatureMetricUnit.CELSIUS), - ("Memory Utilization", "mem_utilization", GenericMetricUnit.PERCENT), - ("SM Utilization", "sm_utilization", GenericMetricUnit.PERCENT), - ("Decoder Utilization", "decoder_utilization", GenericMetricUnit.PERCENT), - ("Encoder Utilization", "encoder_utilization", GenericMetricUnit.PERCENT), - ("JPEG Utilization", "jpg_utilization", GenericMetricUnit.PERCENT), - ("XID Errors", "xid_errors", GenericMetricUnit.COUNT), - ("Power Violation", "power_violation", MetricTimeUnit.MICROSECONDS), + ("NVIDIA GPU Power Usage", "nvidia_power_usage", PowerMetricUnit.WATT), + ( + "NVIDIA Energy Consumption", + "nvidia_energy_consumption", + EnergyMetricUnit.MEGAJOULE, + ), + ("NVIDIA GPU Utilization", "nvidia_gpu_utilization", GenericMetricUnit.PERCENT), + ( + "NVIDIA Memory Utilization", + "nvidia_memory_utilization", + GenericMetricUnit.PERCENT, + ), + ("NVIDIA GPU Memory Used", "nvidia_memory_used", MetricSizeUnit.GIGABYTES), + ("NVIDIA GPU Temperature", "nvidia_temperature", TemperatureMetricUnit.CELSIUS), + ( + "NVIDIA SM Utilization", + "nvidia_sm_utilization", + GenericMetricUnit.PERCENT, + ), + ( + "NVIDIA Decoder Utilization", + "nvidia_decoder_utilization", + GenericMetricUnit.PERCENT, + ), + ( + "NVIDIA Encoder Utilization", + "nvidia_encoder_utilization", + GenericMetricUnit.PERCENT, + ), + ( + "NVIDIA JPEG Utilization", + "nvidia_jpg_utilization", + GenericMetricUnit.PERCENT, + ), + ("NVIDIA XID Errors", "nvidia_xid_errors", GenericMetricUnit.COUNT), + ( + "NVIDIA Power Violation", + "nvidia_power_violation", + MetricTimeUnit.MICROSECONDS, + ), # AMD ROCm telemetry (collected by AMDSMITelemetryCollector). These mirror # the amdsmi field names rather than NVML semantics, since the underlying # signals do not always measure the same physical quantity. Registered here @@ -71,9 +118,9 @@ # These metrics accumulate over time (e.g., total energy consumed since boot), # so we compute the delta between baseline and final values rather than statistics. GPU_TELEMETRY_COUNTER_METRICS: set[str] = { - "energy_consumption", - "xid_errors", - "power_violation", + "nvidia_energy_consumption", + "nvidia_xid_errors", + "nvidia_power_violation", "amd_energy_consumption", "amd_ecc_uncorrectable", } diff --git a/src/aiperf/gpu_telemetry/dcgm_collector.py b/src/aiperf/gpu_telemetry/dcgm_collector.py index abf805c8e..b16434298 100644 --- a/src/aiperf/gpu_telemetry/dcgm_collector.py +++ b/src/aiperf/gpu_telemetry/dcgm_collector.py @@ -12,16 +12,19 @@ TRecordCallback, ) from aiperf.common.models import GpuMetadata, TelemetryMetrics, TelemetryRecord -from aiperf.gpu_telemetry.constants import DCGM_TO_FIELD_MAPPING +from aiperf.gpu_telemetry.constants import ( + DCGM_TO_FIELD_MAPPING, + NVIDIA_GPU_TELEMETRY_PLATFORM, +) __all__ = ["DCGMTelemetryCollector"] # Unit conversion scaling factors for DCGM metrics SCALING_FACTORS = { - "energy_consumption": 1e-9, # mJ -> MJ - "gpu_memory_used": 1.048576e-3, # MiB -> GB - "sm_utilization": 100, # ratio (0-1) -> percentage (0-100) - "power_violation": 1e-3, # ns -> µs + "nvidia_energy_consumption": 1e-9, # mJ -> MJ + "nvidia_memory_used": 1.048576e-3, # MiB -> GB + "nvidia_sm_utilization": 100, # ratio (0-1) -> percentage (0-100) + "nvidia_power_violation": 1e-3, # ns -> µs } @@ -154,6 +157,7 @@ def _parse_metrics_to_records(self, metrics_data: str) -> list[TelemetryRecord]: hostname=labels.get("Hostname"), namespace=labels.get("namespace"), pod_name=labels.get("pod"), + platform=NVIDIA_GPU_TELEMETRY_PLATFORM, ) base_metric_name = metric_name.removesuffix("_total") diff --git a/src/aiperf/gpu_telemetry/manager.py b/src/aiperf/gpu_telemetry/manager.py index 21cdaebfc..d524d11ff 100644 --- a/src/aiperf/gpu_telemetry/manager.py +++ b/src/aiperf/gpu_telemetry/manager.py @@ -313,7 +313,7 @@ async def _send_configure_status( reason = failure_reason or ( f"{self._collector_type} not available or no GPUs found" if is_local - else "no DCGM endpoints reachable" + else "no telemetry sources reachable" ) await self._send_telemetry_status( enabled=False, @@ -387,7 +387,7 @@ async def _handle_profile_complete_command( Ensures GPU telemetry captures final state for accurate counter deltas. This final scrape provides the end-point values needed for metrics like - energy_consumption which are computed as (final - baseline). + nvidia_energy_consumption which are computed as (final - baseline). Args: message: Profile complete command from SystemController @@ -520,9 +520,9 @@ async def _send_telemetry_status( Args: enabled: Whether telemetry collection is enabled/available - reason: Optional human-readable reason for status (e.g., "no DCGM endpoints reachable") - endpoints_configured: List of DCGM endpoint URLs in configured scope for display - endpoints_reachable: List of DCGM endpoint URLs that are accessible + reason: Optional human-readable reason for status + endpoints_configured: Telemetry source URLs in configured scope for display + endpoints_reachable: Telemetry source URLs that are accessible """ try: status_message = TelemetryStatusMessage( diff --git a/src/aiperf/gpu_telemetry/metrics_config.py b/src/aiperf/gpu_telemetry/metrics_config.py index 891704001..1e8562c2e 100644 --- a/src/aiperf/gpu_telemetry/metrics_config.py +++ b/src/aiperf/gpu_telemetry/metrics_config.py @@ -151,6 +151,9 @@ def _infer_unit_from_help(self, help_msg: str) -> MetricUnitT: "gb": MetricSizeUnit.GIGABYTES, "mb": MetricSizeUnit.MEGABYTES, "kb": MetricSizeUnit.KILOBYTES, + "gib": MetricSizeUnit.GIGABYTES, + "mib": MetricSizeUnit.MEGABYTES, + "kib": MetricSizeUnit.KILOBYTES, "mhz": FrequencyMetricUnit.MEGAHERTZ, "ghz": FrequencyMetricUnit.GIGAHERTZ, "c": TemperatureMetricUnit.CELSIUS, @@ -217,7 +220,13 @@ def build_custom_metrics_from_csv( ) continue - internal_name = dcgm_field.replace("DCGM_FI_DEV_", "").lower() + if dcgm_field.startswith("DCGM_FI_DEV_"): + dcgm_suffix = dcgm_field.removeprefix("DCGM_FI_DEV_") + elif dcgm_field.startswith("DCGM_FI_PROF_"): + dcgm_suffix = dcgm_field.removeprefix("DCGM_FI_PROF_") + else: + dcgm_suffix = dcgm_field + internal_name = f"nvidia_{dcgm_suffix.lower()}" display_name = help_msg.split("(")[0].strip() if not display_name: diff --git a/src/aiperf/gpu_telemetry/pynvml_collector.py b/src/aiperf/gpu_telemetry/pynvml_collector.py index 8c281221e..c94e92455 100644 --- a/src/aiperf/gpu_telemetry/pynvml_collector.py +++ b/src/aiperf/gpu_telemetry/pynvml_collector.py @@ -36,6 +36,7 @@ TelemetryRecord, ) from aiperf.gpu_telemetry.constants import ( + NVIDIA_GPU_TELEMETRY_PLATFORM, PYNVML_SOURCE_IDENTIFIER, ) from aiperf.gpu_telemetry.protocols import TErrorCallback, TRecordCallback @@ -47,10 +48,10 @@ class ScalingFactors: """Unit conversion scaling factors for NVML metrics.""" - gpu_power_usage = 1e-3 # mW -> W - energy_consumption = 1e-9 # mJ -> MJ - gpu_memory_used = 1e-9 # bytes -> GB - power_violation = 1e-3 # ns -> µs + nvidia_power_usage = 1e-3 # mW -> W + nvidia_energy_consumption = 1e-9 # mJ -> MJ + nvidia_memory_used = 1e-9 # bytes -> GB + nvidia_power_violation = 1e-3 # ns -> µs @dataclass(slots=True) @@ -250,6 +251,7 @@ def _create_gpu_for_device_index(self, index: int) -> GpuDeviceState | None: pci_bus_id=pci_bus_id, device=f"nvidia{index}", hostname="localhost", + platform=NVIDIA_GPU_TELEMETRY_PLATFORM, ), ) @@ -392,28 +394,28 @@ def _collect_gpu_metrics(self) -> list[TelemetryRecord]: # Power usage (milliwatts -> watts) with contextlib.suppress(NVMLError): power_mw = pynvml.nvmlDeviceGetPowerUsage(handle) - telemetry_data.gpu_power_usage = ( - power_mw * ScalingFactors.gpu_power_usage + telemetry_data.nvidia_power_usage = ( + power_mw * ScalingFactors.nvidia_power_usage ) # Total energy consumption (millijoules -> megajoules) with contextlib.suppress(NVMLError): energy_mj = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle) - telemetry_data.energy_consumption = ( - energy_mj * ScalingFactors.energy_consumption + telemetry_data.nvidia_energy_consumption = ( + energy_mj * ScalingFactors.nvidia_energy_consumption ) # GPU and memory utilization (percent) with contextlib.suppress(NVMLError): util = pynvml.nvmlDeviceGetUtilizationRates(handle) - telemetry_data.gpu_utilization = float(util.gpu) - telemetry_data.mem_utilization = float(util.memory) + telemetry_data.nvidia_gpu_utilization = float(util.gpu) + telemetry_data.nvidia_memory_utilization = float(util.memory) # Memory used (bytes -> gigabytes) with contextlib.suppress(NVMLError): mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - telemetry_data.gpu_memory_used = ( - mem_info.used * ScalingFactors.gpu_memory_used + telemetry_data.nvidia_memory_used = ( + mem_info.used * ScalingFactors.nvidia_memory_used ) # Temperature (Celsius) @@ -421,22 +423,22 @@ def _collect_gpu_metrics(self) -> list[TelemetryRecord]: temp = pynvml.nvmlDeviceGetTemperature( handle, pynvml.NVML_TEMPERATURE_GPU ) - telemetry_data.gpu_temperature = float(temp) + telemetry_data.nvidia_temperature = float(temp) # Video decoder utilization (percent) with contextlib.suppress(NVMLError): dec_util, _ = pynvml.nvmlDeviceGetDecoderUtilization(handle) - telemetry_data.decoder_utilization = float(dec_util) + telemetry_data.nvidia_decoder_utilization = float(dec_util) # Video encoder utilization (percent) with contextlib.suppress(NVMLError): enc_util, _ = pynvml.nvmlDeviceGetEncoderUtilization(handle) - telemetry_data.encoder_utilization = float(enc_util) + telemetry_data.nvidia_encoder_utilization = float(enc_util) # JPEG decoder utilization (percent) with contextlib.suppress(NVMLError): jpg_util, _ = pynvml.nvmlDeviceGetJpgUtilization(handle) - telemetry_data.jpg_utilization = float(jpg_util) + telemetry_data.nvidia_jpg_utilization = float(jpg_util) # SM utilization: prefer GPM (device-level) over process enumeration sm_util: float | None = None @@ -456,15 +458,15 @@ def _collect_gpu_metrics(self) -> list[TelemetryRecord]: ) if sm_util is not None: - telemetry_data.sm_utilization = min(float(sm_util), 100.0) + telemetry_data.nvidia_sm_utilization = min(float(sm_util), 100.0) # Power violation / throttling duration (nanoseconds -> microseconds) with contextlib.suppress(NVMLError): violation = pynvml.nvmlDeviceGetViolationStatus( handle, pynvml.NVML_PERF_POLICY_POWER ) - telemetry_data.power_violation = ( - violation.violationTime * ScalingFactors.power_violation + telemetry_data.nvidia_power_violation = ( + violation.violationTime * ScalingFactors.nvidia_power_violation ) # Create record if any metrics were collected diff --git a/src/aiperf/plot/core/data_loader.py b/src/aiperf/plot/core/data_loader.py index 7f4c2ef62..0aaf7892a 100644 --- a/src/aiperf/plot/core/data_loader.py +++ b/src/aiperf/plot/core/data_loader.py @@ -29,6 +29,7 @@ HistogramTimeslice, ServerMetricsExportData, ) +from aiperf.gpu_telemetry.constants import NVIDIA_TELEMETRY_FIELD_ALIASES from aiperf.plot.constants import ( NON_METRIC_KEYS, PROFILE_EXPORT_AIPERF_AGGREGATE_JSON, @@ -1086,6 +1087,9 @@ def parse_line(line: str) -> dict: data = orjson.loads(line.encode("utf-8")) telemetry_data = data.pop("telemetry_data", {}) + for legacy_name, nvidia_name in NVIDIA_TELEMETRY_FIELD_ALIASES.items(): + if legacy_name in telemetry_data and nvidia_name not in telemetry_data: + telemetry_data[nvidia_name] = telemetry_data[legacy_name] flat_record = {**data, **telemetry_data} if "timestamp_ns" in flat_record: diff --git a/src/aiperf/plot/core/data_preparation.py b/src/aiperf/plot/core/data_preparation.py index 42eeb0007..67b7044f3 100644 --- a/src/aiperf/plot/core/data_preparation.py +++ b/src/aiperf/plot/core/data_preparation.py @@ -319,27 +319,43 @@ def prepare_timeslice_metrics( return plot_df, unit -def aggregate_gpu_telemetry(run: RunData) -> pd.DataFrame: +def aggregate_gpu_telemetry( + run: RunData, output_col: str = "nvidia_gpu_utilization" +) -> pd.DataFrame: """ Aggregate GPU telemetry data by averaging across GPUs at each timestamp. Args: run: RunData object with gpu_telemetry DataFrame + output_col: Column name to use in the returned DataFrame. Plot specs that + still reference the legacy ``gpu_utilization`` name pass that through + so downstream lookups (``df[y2_metric]``) keyed on the spec name match. Returns: - DataFrame with timestamp_s and averaged gpu_utilization + DataFrame with timestamp_s and averaged GPU utilization under ``output_col`` """ if run.gpu_telemetry is None or run.gpu_telemetry.empty: return pd.DataFrame() gpu_df = run.gpu_telemetry.copy() + utilization_col = ( + "nvidia_gpu_utilization" + if "nvidia_gpu_utilization" in gpu_df.columns + else "gpu_utilization" + ) + if utilization_col not in gpu_df.columns: + return pd.DataFrame() + # If gpu_index column exists, group by timestamp and average if "gpu_index" in gpu_df.columns: gpu_df = ( - gpu_df.groupby("timestamp_s").agg({"gpu_utilization": "mean"}).reset_index() + gpu_df.groupby("timestamp_s").agg({utilization_col: "mean"}).reset_index() ) + if output_col != utilization_col: + gpu_df = gpu_df.rename(columns={utilization_col: output_col}) + return gpu_df diff --git a/src/aiperf/plot/default_plot_config.yaml b/src/aiperf/plot/default_plot_config.yaml index bcd0e6f88..c4244dc5c 100644 --- a/src/aiperf/plot/default_plot_config.yaml +++ b/src/aiperf/plot/default_plot_config.yaml @@ -197,7 +197,7 @@ visualization: description: "Token throughput overlaid with GPU utilization to correlate performance" x: timestamp_s y: throughput_tokens_per_sec - y2: gpu_utilization + y2: nvidia_gpu_utilization title: "Output Token Throughput with GPU Utilization" primary_style: mode: lines diff --git a/src/aiperf/plot/handlers/single_run_handlers.py b/src/aiperf/plot/handlers/single_run_handlers.py index bf7898be8..dfc05dd31 100644 --- a/src/aiperf/plot/handlers/single_run_handlers.py +++ b/src/aiperf/plot/handlers/single_run_handlers.py @@ -1036,7 +1036,12 @@ class DualAxisHandler(BaseSingleRunHandler): "throughput_tokens_per_sec": lambda self, data: calculate_throughput_events( prepare_request_timeseries(data) ), - "gpu_utilization": lambda self, data: aggregate_gpu_telemetry(data), + "nvidia_gpu_utilization": lambda self, data: aggregate_gpu_telemetry( + data, "nvidia_gpu_utilization" + ), + "gpu_utilization": lambda self, data: aggregate_gpu_telemetry( + data, "gpu_utilization" + ), } def can_handle(self, spec: PlotSpec, data: RunData) -> bool: diff --git a/src/aiperf/plot/metric_names.py b/src/aiperf/plot/metric_names.py index b0ad1bf6b..b284b06a9 100644 --- a/src/aiperf/plot/metric_names.py +++ b/src/aiperf/plot/metric_names.py @@ -12,9 +12,17 @@ from collections.abc import Mapping from aiperf.common.enums import MetricFlags, MetricType -from aiperf.gpu_telemetry.constants import GPU_TELEMETRY_METRICS_CONFIG +from aiperf.gpu_telemetry.constants import ( + GPU_TELEMETRY_METRICS_CONFIG, + NVIDIA_TELEMETRY_FIELD_ALIASES, +) from aiperf.metrics.metric_registry import MetricRegistry +_GPU_DISPLAY_NAMES: dict[str, str] = { + field_name: display_name + for display_name, field_name, _ in GPU_TELEMETRY_METRICS_CONFIG +} + # Pre-compute all metric display names at module load time _ALL_METRIC_NAMES: dict[str, str] = { # Standard metrics from MetricRegistry @@ -24,9 +32,11 @@ if metric_class.header }, # GPU telemetry metrics + **_GPU_DISPLAY_NAMES, **{ - field_name: display_name - for display_name, field_name, _ in GPU_TELEMETRY_METRICS_CONFIG + legacy_name: _GPU_DISPLAY_NAMES[nvidia_name] + for legacy_name, nvidia_name in NVIDIA_TELEMETRY_FIELD_ALIASES.items() + if nvidia_name in _GPU_DISPLAY_NAMES }, # Derived metrics calculated during data processing "output_token_throughput_per_gpu": "Output Token Throughput Per GPU", # nosec @@ -73,6 +83,13 @@ field_name: unit_enum.info.tag if hasattr(unit_enum, "info") else str(unit_enum) for _, field_name, unit_enum in GPU_TELEMETRY_METRICS_CONFIG } +_GPU_METRIC_UNITS.update( + { + legacy_name: _GPU_METRIC_UNITS[nvidia_name] + for legacy_name, nvidia_name in NVIDIA_TELEMETRY_FIELD_ALIASES.items() + if nvidia_name in _GPU_METRIC_UNITS + } +) def get_all_metric_display_names() -> Mapping[str, str]: @@ -86,8 +103,8 @@ def get_all_metric_display_names() -> Mapping[str, str]: >>> names = get_all_metric_display_names() >>> names["time_to_first_token"] 'Time to First Token' - >>> names["gpu_power_usage"] - 'GPU Power Usage' + >>> names["nvidia_power_usage"] + 'NVIDIA GPU Power Usage' >>> names["output_token_throughput_per_gpu"] 'Output Token Throughput Per GPU' """ @@ -185,9 +202,9 @@ def get_gpu_metrics() -> list[str]: Examples: >>> metrics = get_gpu_metrics() - >>> 'gpu_utilization' in metrics + >>> 'nvidia_gpu_utilization' in metrics True - >>> 'gpu_memory_used' in metrics + >>> 'nvidia_memory_used' in metrics True """ return _GPU_METRICS @@ -198,15 +215,15 @@ def get_gpu_metric_unit(metric_name: str) -> str | None: Get the unit string for a GPU telemetry metric. Args: - metric_name: The GPU metric field name (e.g., "gpu_utilization") + metric_name: The GPU metric field name (e.g., "nvidia_gpu_utilization") Returns: Unit string (e.g., "%", "W", "°C") or None if not a GPU metric Examples: - >>> get_gpu_metric_unit("gpu_utilization") + >>> get_gpu_metric_unit("nvidia_gpu_utilization") '%' - >>> get_gpu_metric_unit("gpu_power_usage") + >>> get_gpu_metric_unit("nvidia_power_usage") 'W' >>> get_gpu_metric_unit("unknown_metric") None @@ -219,16 +236,16 @@ def get_metric_display_name_with_unit(metric_name: str) -> str: Get display name for a metric with unit suffix if available. Args: - metric_name: The metric identifier (e.g., "gpu_utilization") + metric_name: The metric identifier (e.g., "nvidia_gpu_utilization") Returns: - Human-readable display name with unit (e.g., "GPU Utilization (%)") + Human-readable display name with unit (e.g., "NVIDIA GPU Utilization (%)") Examples: - >>> get_metric_display_name_with_unit("gpu_utilization") - 'GPU Utilization (%)' - >>> get_metric_display_name_with_unit("memory_copy_utilization") - 'Memory Copy Utilization (%)' + >>> get_metric_display_name_with_unit("nvidia_gpu_utilization") + 'NVIDIA GPU Utilization (%)' + >>> get_metric_display_name_with_unit("nvidia_memory_utilization") + 'NVIDIA Memory Utilization (%)' >>> get_metric_display_name_with_unit("request_latency") 'Request Latency' """ diff --git a/tests/component_integration/gpu_telemetry/test_gpu_telemetry.py b/tests/component_integration/gpu_telemetry/test_gpu_telemetry.py index 12db0e42d..c8f1ab510 100644 --- a/tests/component_integration/gpu_telemetry/test_gpu_telemetry.py +++ b/tests/component_integration/gpu_telemetry/test_gpu_telemetry.py @@ -59,7 +59,11 @@ def test_dcgm_endpoints(self, cli, mock_dcgm_endpoints): # Verify each GPU has valid metrics with all required fields # Counter metrics only have avg (delta), not min/max - counter_metrics = {"energy_consumption", "xid_errors", "power_violation"} + counter_metrics = { + "nvidia_energy_consumption", + "nvidia_xid_errors", + "nvidia_power_violation", + } for gpu_id, gpu_data in endpoint_data.gpus.items(): assert gpu_data.metrics, f"GPU {gpu_id}: no metrics collected" for metric_name, metric_value in gpu_data.metrics.items(): diff --git a/tests/integration/test_custom_gpu_metrics.py b/tests/integration/test_custom_gpu_metrics.py index f4efe9a96..93bc0e221 100644 --- a/tests/integration/test_custom_gpu_metrics.py +++ b/tests/integration/test_custom_gpu_metrics.py @@ -15,7 +15,7 @@ from tests.harness.utils import AIPerfCLI, AIPerfMockServer # DCGMFaker provides 8 of the 12 default metrics defined in GPU_TELEMETRY_METRICS_CONFIG. -# Missing from DCGMFaker: encoder_utilization, decoder_utilization, sm_utilization, jpg_utilization +# Missing from DCGMFaker: nvidia_encoder_utilization, nvidia_decoder_utilization, nvidia_sm_utilization, nvidia_jpg_utilization DCGM_FAKER_DEFAULT_METRIC_COUNT = 8 @@ -45,7 +45,7 @@ def custom_gpu_metrics_csv(self, tmp_path: Path) -> Path: # Custom temperature metrics (DCGMFaker returns this) DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in °C) -# This is already a default metric (maps to mem_utilization), included to test deduplication +# This is already a default metric (maps to nvidia_memory_utilization), included to test deduplication DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory copy utilization (in %) """ csv_path.write_text(csv_content) @@ -119,8 +119,8 @@ async def test_custom_metrics_csv_loading_basic( for gpu_data in endpoint_data.gpus.values(): assert gpu_data.metrics is not None - # 8 defaults from DCGMFaker + 3 custom (sm_clock, mem_clock, memory_temp) - # Note: DCGM_FI_DEV_MEM_COPY_UTIL maps to default "mem_utilization", not added as custom + # 8 defaults from DCGMFaker + 3 custom (nvidia_sm_clock, nvidia_mem_clock, nvidia_memory_temp) + # Note: DCGM_FI_DEV_MEM_COPY_UTIL maps to default "nvidia_memory_utilization", not added as custom expected_min_metrics = DCGM_FAKER_DEFAULT_METRIC_COUNT + 3 assert len(gpu_data.metrics) >= expected_min_metrics, ( @@ -128,11 +128,11 @@ async def test_custom_metrics_csv_loading_basic( f"got {len(gpu_data.metrics)}" ) - # These are the actual custom metrics added (mem_copy_util is a default as mem_utilization) + # These are the actual custom metrics added (mem_copy_util is a default as nvidia_memory_utilization) custom_metric_names = [ - "sm_clock", - "mem_clock", - "memory_temp", + "nvidia_sm_clock", + "nvidia_mem_clock", + "nvidia_memory_temp", ] for metric_name in custom_metric_names: assert metric_name in gpu_data.metrics, ( @@ -148,29 +148,29 @@ async def test_custom_metrics_csv_loading_basic( ) assert ( - gpu_data.metrics["sm_clock"].unit + gpu_data.metrics["nvidia_sm_clock"].unit == FrequencyMetricUnit.MEGAHERTZ.value ), ( - f"sm_clock unit is {gpu_data.metrics['sm_clock'].unit}, expected {FrequencyMetricUnit.MEGAHERTZ.value}" + f"nvidia_sm_clock unit is {gpu_data.metrics['nvidia_sm_clock'].unit}, expected {FrequencyMetricUnit.MEGAHERTZ.value}" ) assert ( - gpu_data.metrics["mem_clock"].unit + gpu_data.metrics["nvidia_mem_clock"].unit == FrequencyMetricUnit.MEGAHERTZ.value ), ( - f"mem_clock unit is {gpu_data.metrics['mem_clock'].unit}, expected {FrequencyMetricUnit.MEGAHERTZ.value}" + f"nvidia_mem_clock unit is {gpu_data.metrics['nvidia_mem_clock'].unit}, expected {FrequencyMetricUnit.MEGAHERTZ.value}" ) assert ( - gpu_data.metrics["memory_temp"].unit + gpu_data.metrics["nvidia_memory_temp"].unit == TemperatureMetricUnit.CELSIUS.value ), ( - f"memory_temp unit is {gpu_data.metrics['memory_temp'].unit}, expected {TemperatureMetricUnit.CELSIUS.value}" + f"nvidia_memory_temp unit is {gpu_data.metrics['nvidia_memory_temp'].unit}, expected {TemperatureMetricUnit.CELSIUS.value}" ) - # DCGM_FI_DEV_MEM_COPY_UTIL maps to default "mem_utilization" (not "mem_copy_util") + # DCGM_FI_DEV_MEM_COPY_UTIL maps to default "nvidia_memory_utilization" (not "mem_copy_util") assert ( - gpu_data.metrics["mem_utilization"].unit + gpu_data.metrics["nvidia_memory_utilization"].unit == GenericMetricUnit.PERCENT.value ), ( - f"mem_utilization unit is {gpu_data.metrics['mem_utilization'].unit}, expected {GenericMetricUnit.PERCENT.value}" + f"nvidia_memory_utilization unit is {gpu_data.metrics['nvidia_memory_utilization'].unit}, expected {GenericMetricUnit.PERCENT.value}" ) async def test_custom_metrics_deduplication( @@ -207,13 +207,13 @@ async def test_custom_metrics_deduplication( f"Found duplicate metrics. Metrics list: {metric_names}" ) - assert "gpu_utilization" in gpu_data.metrics - assert "gpu_power_usage" in gpu_data.metrics + assert "nvidia_gpu_utilization" in gpu_data.metrics + assert "nvidia_power_usage" in gpu_data.metrics - assert "sm_clock" in gpu_data.metrics - assert "mem_clock" in gpu_data.metrics + assert "nvidia_sm_clock" in gpu_data.metrics + assert "nvidia_mem_clock" in gpu_data.metrics - # 8 defaults from DCGMFaker + 2 custom (sm_clock, mem_clock) + # 8 defaults from DCGMFaker + 2 custom (nvidia_sm_clock, nvidia_mem_clock) # GPU_UTIL and POWER_USAGE from CSV are already defaults, so not added as custom expected_min_metrics = DCGM_FAKER_DEFAULT_METRIC_COUNT + 2 @@ -246,9 +246,9 @@ async def test_invalid_csv_fallback_to_defaults( for dcgm_url in result.json.telemetry_data.endpoints: endpoint_data = result.json.telemetry_data.endpoints[dcgm_url] for gpu_data in endpoint_data.gpus.values(): - assert "sm_clock" in gpu_data.metrics + assert "nvidia_sm_clock" in gpu_data.metrics - # 8 defaults from DCGMFaker + 1 valid custom (sm_clock) + # 8 defaults from DCGMFaker + 1 valid custom (nvidia_sm_clock) expected_min_metrics = DCGM_FAKER_DEFAULT_METRIC_COUNT + 1 assert len(gpu_data.metrics) >= expected_min_metrics diff --git a/tests/integration/test_dcgm_faker.py b/tests/integration/test_dcgm_faker.py index 0e36aa5ff..ab0677847 100644 --- a/tests/integration/test_dcgm_faker.py +++ b/tests/integration/test_dcgm_faker.py @@ -43,22 +43,22 @@ def test_faker_output_parsed_by_real_telemetry_collector(self, gpu_name): # Verify TelemetryMetrics are correctly scaled telemetry = record.telemetry_data assert telemetry is not None - assert telemetry.gpu_power_usage == approx(gpu.power, abs=0.01) - assert telemetry.gpu_utilization == approx(gpu.util, abs=0.01) - assert telemetry.gpu_temperature == approx(gpu.temp, abs=0.01) - assert telemetry.energy_consumption == approx( + assert telemetry.nvidia_power_usage == approx(gpu.power, abs=0.01) + assert telemetry.nvidia_gpu_utilization == approx(gpu.util, abs=0.01) + assert telemetry.nvidia_temperature == approx(gpu.temp, abs=0.01) + assert telemetry.nvidia_energy_consumption == approx( gpu.energy * 1e-9, abs=0.01 ) # mJ to MJ - assert telemetry.gpu_memory_used == approx( + assert telemetry.nvidia_memory_used == approx( gpu.mem_used * 1.048576 * 1e-3, abs=0.01 ) # MiB to GB - assert telemetry.xid_errors == approx(gpu.xid, abs=0.01) - assert telemetry.power_violation == approx(gpu.power_viol, abs=0.01) + assert telemetry.nvidia_xid_errors == approx(gpu.xid, abs=0.01) + assert telemetry.nvidia_power_violation == approx(gpu.power_viol, abs=0.01) # Verify values are in reasonable ranges - assert 0 <= telemetry.gpu_utilization <= 100 - assert 0 < telemetry.gpu_power_usage <= gpu.cfg.max_power_w - assert 0 < telemetry.gpu_temperature <= 100 + assert 0 <= telemetry.nvidia_gpu_utilization <= 100 + assert 0 < telemetry.nvidia_power_usage <= gpu.cfg.max_power_w + assert 0 < telemetry.nvidia_temperature <= 100 def test_load_affects_telemetry_records(self): """Test that load changes affect TelemetryRecords when parsed by real collector.""" @@ -78,7 +78,9 @@ def test_load_affects_telemetry_records(self): high_telemetry = high_records[0].telemetry_data # High load should produce higher values - assert high_telemetry.gpu_power_usage > low_telemetry.gpu_power_usage - assert high_telemetry.gpu_temperature > low_telemetry.gpu_temperature - assert high_telemetry.gpu_utilization > low_telemetry.gpu_utilization - assert high_telemetry.gpu_memory_used > low_telemetry.gpu_memory_used + assert high_telemetry.nvidia_power_usage > low_telemetry.nvidia_power_usage + assert high_telemetry.nvidia_temperature > low_telemetry.nvidia_temperature + assert ( + high_telemetry.nvidia_gpu_utilization > low_telemetry.nvidia_gpu_utilization + ) + assert high_telemetry.nvidia_memory_used > low_telemetry.nvidia_memory_used diff --git a/tests/integration/test_gpu_telemetry.py b/tests/integration/test_gpu_telemetry.py index d8d0d5eb8..3c6ebbc34 100644 --- a/tests/integration/test_gpu_telemetry.py +++ b/tests/integration/test_gpu_telemetry.py @@ -56,9 +56,9 @@ async def test_gpu_telemetry( # Counter metrics only have avg (delta), not min/max counter_metrics = { - "energy_consumption", - "xid_errors", - "power_violation", + "nvidia_energy_consumption", + "nvidia_xid_errors", + "nvidia_power_violation", } for metric_name, metric_value in gpu_data.metrics.items(): assert metric_value is not None diff --git a/tests/unit/common/models/test_record_models.py b/tests/unit/common/models/test_record_models.py index 88be1a3c1..271fc1107 100644 --- a/tests/unit/common/models/test_record_models.py +++ b/tests/unit/common/models/test_record_models.py @@ -253,8 +253,8 @@ def test_aggregate_metric_omits_count(self) -> None: def test_unknown_tag_keeps_count(self) -> None: """Tags from other registries (e.g. GPU telemetry) keep count as-is.""" result = MetricResult( - tag="gpu_power_usage", - header="GPU Power Usage", + tag="nvidia_power_usage", + header="NVIDIA GPU Power Usage", unit="W", avg=250.0, count=42, diff --git a/tests/unit/exporters/conftest.py b/tests/unit/exporters/conftest.py index 027ec7cd5..5be79a2cc 100644 --- a/tests/unit/exporters/conftest.py +++ b/tests/unit/exporters/conftest.py @@ -83,17 +83,18 @@ def sample_telemetry_record(): gpu_index=0, gpu_model_name="NVIDIA H100", gpu_uuid="GPU-12345678-1234-1234-1234-123456789abc", + platform="nvidia", pci_bus_id="00000000:01:00.0", device="nvidia0", hostname="test-node-01", telemetry_data=TelemetryMetrics( - gpu_power_usage=300.0, - energy_consumption=1000.5, - gpu_utilization=85.0, - gpu_memory_used=72.5, - gpu_temperature=70.0, - xid_errors=0.0, - power_violation=0.0, + nvidia_power_usage=300.0, + nvidia_energy_consumption=1000.5, + nvidia_gpu_utilization=85.0, + nvidia_memory_used=72.5, + nvidia_temperature=70.0, + nvidia_xid_errors=0.0, + nvidia_power_violation=0.0, ), ) @@ -105,7 +106,7 @@ def sample_telemetry_results(): # Create JsonMetricResults for each GPU metric def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): return { - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=base_power, min=base_power - 20, @@ -115,7 +116,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=base_power + 18, std=5.0, ), - "energy_consumption": JsonMetricResult( + "nvidia_energy_consumption": JsonMetricResult( unit="J", avg=base_energy, min=base_energy - 100, @@ -125,7 +126,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=base_energy + 380, std=100.0, ), - "gpu_utilization": JsonMetricResult( + "nvidia_gpu_utilization": JsonMetricResult( unit="%", avg=base_util, min=base_util, @@ -135,7 +136,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=base_util + 8, std=2.0, ), - "gpu_memory_used": JsonMetricResult( + "nvidia_memory_used": JsonMetricResult( unit="GB", avg=base_mem, min=base_mem, @@ -145,7 +146,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=base_mem + 4, std=1.0, ), - "gpu_temperature": JsonMetricResult( + "nvidia_temperature": JsonMetricResult( unit="°C", avg=base_temp, min=base_temp, @@ -155,7 +156,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=base_temp + 7, std=2.0, ), - "xid_errors": JsonMetricResult( + "nvidia_xid_errors": JsonMetricResult( unit="count", avg=0.0, min=0.0, @@ -165,7 +166,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): p99=0.0, std=0.0, ), - "power_violation": JsonMetricResult( + "nvidia_power_violation": JsonMetricResult( unit="ms", avg=120.0, min=100.0, @@ -197,6 +198,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): gpu_index=0, gpu_name="NVIDIA H100", gpu_uuid="GPU-12345678-1234-1234-1234-123456780000", + platform="nvidia", hostname="test-node-01", metrics=make_gpu_metrics(290.0, 1200.0, 84.0, 72.0, 69.0), ), @@ -204,6 +206,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): gpu_index=1, gpu_name="NVIDIA H100", gpu_uuid="GPU-12345678-1234-1234-1234-123456780001", + platform="nvidia", hostname="test-node-01", metrics=make_gpu_metrics(310.0, 1200.0, 84.0, 77.0, 69.0), ), @@ -215,6 +218,7 @@ def make_gpu_metrics(base_power, base_energy, base_util, base_mem, base_temp): gpu_index=0, gpu_name="NVIDIA A100", gpu_uuid="GPU-abcdef01-2345-6789-abcd-ef0123456789", + platform="nvidia", hostname="test-node-02", metrics=make_gpu_metrics(270.0, 1120.0, 81.0, 64.0, 69.0), ), @@ -245,18 +249,19 @@ def sample_telemetry_results_with_failures(): gpu_index=0, gpu_name="NVIDIA H100", gpu_uuid="GPU-12345678-1234-1234-1234-123456789abc", + platform="nvidia", hostname="test-node-01", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=310.0, min=300.0, max=320.0, std=10.0 ), - "gpu_utilization": JsonMetricResult( + "nvidia_gpu_utilization": JsonMetricResult( unit="%", avg=85.0, min=85.0, max=85.0, std=0.0 ), - "gpu_memory_used": JsonMetricResult( + "nvidia_memory_used": JsonMetricResult( unit="GB", avg=72.5, min=72.5, max=72.5, std=0.0 ), - "gpu_temperature": JsonMetricResult( + "nvidia_temperature": JsonMetricResult( unit="°C", avg=70.0, min=70.0, max=70.0, std=0.0 ), }, diff --git a/tests/unit/exporters/test_gpu_telemetry_console_exporter.py b/tests/unit/exporters/test_gpu_telemetry_console_exporter.py index cc8fcd057..f8d4512ec 100644 --- a/tests/unit/exporters/test_gpu_telemetry_console_exporter.py +++ b/tests/unit/exporters/test_gpu_telemetry_console_exporter.py @@ -121,7 +121,9 @@ async def test_export_with_telemetry_data( output = capsys.readouterr().out assert "GPU Telemetry Summary" in output - assert "DCGM endpoints reachable" in output + assert "telemetry sources reachable" in output + assert "GPU telemetry platform: nvidia" in output + assert "cross-platform comparisons require" in output assert "H100" in output or "A100" in output assert "Power" in output and "Usage" in output @@ -143,7 +145,7 @@ async def test_export_displays_all_endpoints( output = capsys.readouterr().out assert "localhost:9400" in output assert "remote-node:9400" in output - assert "2/2 DCGM endpoints reachable" in output + assert "2/2 telemetry sources reachable" in output @pytest.mark.asyncio async def test_export_shows_failed_endpoints( @@ -165,7 +167,7 @@ async def test_export_shows_failed_endpoints( await exporter.export(console) output = capsys.readouterr().out - assert "1/3 DCGM endpoints reachable" in output + assert "1/3 telemetry sources reachable" in output assert "localhost:9400" in output assert "unreachable-node:9400" in output or "unreachable" in output assert "❌" in output or "unreachable" in output @@ -324,7 +326,7 @@ async def test_export_handles_missing_metrics( hostname="test-node", metrics={ # Only include one metric, others are missing - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=90.0, max=110.0 ), }, @@ -384,7 +386,7 @@ async def test_export_all_endpoints_failed( output = capsys.readouterr().out assert "No GPU telemetry data collected" in output assert ( - "0/3 DCGM endpoints reachable" in output + "0/3 telemetry sources reachable" in output or "Unreachable endpoints" in output ) assert "node1:9400" in output @@ -485,7 +487,7 @@ async def test_export_with_mixed_successful_failed_endpoints( gpu_uuid="GPU-123", hostname="test-node", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=90.0, max=110.0 ), }, @@ -506,8 +508,8 @@ async def test_export_with_mixed_successful_failed_endpoints( await exporter.export(console) output = capsys.readouterr().out - # Should show 1/2 endpoints reachable - assert "1/2 DCGM endpoints reachable" in output + # Should show 1/2 telemetry sources reachable + assert "1/2 telemetry sources reachable" in output # Should show both endpoints with status assert "node1:9400" in output assert "node2:9400" in output diff --git a/tests/unit/exporters/test_metrics_csv_exporter.py b/tests/unit/exporters/test_metrics_csv_exporter.py index b0f79306d..1229b4cb6 100644 --- a/tests/unit/exporters/test_metrics_csv_exporter.py +++ b/tests/unit/exporters/test_metrics_csv_exporter.py @@ -404,8 +404,9 @@ async def test_csv_export_with_telemetry_data( # Check for telemetry section with structured table format assert "Endpoint" in content assert "GPU_Index" in content - assert "GPU Power Usage (W)" in content or "GPU Power Usage" in content - assert "GPU Utilization (%)" in content or "GPU Utilization" in content + assert "Platform" in content + assert "NVIDIA GPU Power Usage (W)" in content + assert "NVIDIA GPU Utilization (%)" in content @pytest.mark.asyncio async def test_csv_export_without_telemetry_data(self, mock_cfg): @@ -547,14 +548,14 @@ async def test_csv_gpu_summary_metrics_check(self, mock_cfg): gpu_uuid="GPU-123", hostname="test-node", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=90.0, max=110.0 ), }, ) # Metric check is now a simple dict lookup - assert "gpu_power_usage" in gpu_summary_with_metric.metrics + assert "nvidia_power_usage" in gpu_summary_with_metric.metrics assert "invalid_metric" not in gpu_summary_with_metric.metrics # GpuSummary without metrics @@ -566,7 +567,7 @@ async def test_csv_gpu_summary_metrics_check(self, mock_cfg): metrics={}, ) - assert "gpu_power_usage" not in gpu_summary_without_metric.metrics + assert "nvidia_power_usage" not in gpu_summary_without_metric.metrics @pytest.mark.asyncio async def test_csv_export_telemetry_multi_endpoint(self, mock_cfg): @@ -609,7 +610,7 @@ async def test_csv_export_telemetry_multi_endpoint(self, mock_cfg): gpu_uuid="GPU-111", hostname="node1", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=105.0, min=100.0, @@ -628,7 +629,7 @@ async def test_csv_export_telemetry_multi_endpoint(self, mock_cfg): gpu_uuid="GPU-222", hostname="node2", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=205.0, min=200.0, @@ -797,7 +798,7 @@ def _make_telemetry( namespace=namespace, pod_name=pod_name, metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=300.0, min=280.0, max=320.0 ) }, diff --git a/tests/unit/exporters/test_metrics_json_exporter.py b/tests/unit/exporters/test_metrics_json_exporter.py index 6d1c14de3..fb63db848 100644 --- a/tests/unit/exporters/test_metrics_json_exporter.py +++ b/tests/unit/exporters/test_metrics_json_exporter.py @@ -193,7 +193,7 @@ def error_summary(self): # Schema bump landed assert raw["schema_version"] == JsonExportData.SCHEMA_VERSION - assert JsonExportData.SCHEMA_VERSION == "1.3" + assert JsonExportData.SCHEMA_VERSION == "1.4" # Record metric: count and sum are present assert raw["request_latency"]["count"] == 100 @@ -247,7 +247,7 @@ async def test_run_info_populated_when_run_provided(self, mock_results, mock_cfg expected_file = output_dir / OutputDefaults.PROFILE_EXPORT_AIPERF_JSON_FILE data = JsonExportData.model_validate_json(expected_file.read_text()) - assert data.schema_version == "1.3" + assert data.schema_version == "1.4" assert data.run_info is not None assert data.run_info.benchmark_id == "abc123" assert data.run_info.sweep_id == "sweep-uuid-xyz" @@ -520,6 +520,7 @@ async def test_json_export_telemetry_structure( assert "gpu_index" in first_gpu assert "gpu_name" in first_gpu assert "gpu_uuid" in first_gpu + assert first_gpu["platform"] == "nvidia" # Verify metrics structure assert "metrics" in first_gpu @@ -630,7 +631,7 @@ async def test_json_export_telemetry_with_none_values(self, mock_results, mock_c hostname="test-host", metrics={ # Metric with None values for percentiles - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=None, @@ -743,7 +744,7 @@ async def test_json_export_telemetry_endpoint_normalization( gpu_uuid="GPU-123", hostname="node1", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=100.0, @@ -814,7 +815,7 @@ async def test_json_export_telemetry_multi_endpoint(self, mock_results, mock_cfg gpu_uuid="GPU-111", hostname="node1", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=105.0, min=100.0, @@ -833,7 +834,7 @@ async def test_json_export_telemetry_multi_endpoint(self, mock_results, mock_cfg gpu_uuid="GPU-222", hostname="node2", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=205.0, min=200.0, @@ -902,7 +903,7 @@ async def test_json_export_with_hostname_metadata(self, mock_results, mock_cfg): gpu_uuid="GPU-123", hostname="test-hostname", metrics={ - "gpu_power_usage": JsonMetricResult( + "nvidia_power_usage": JsonMetricResult( unit="W", avg=100.0, min=100.0, diff --git a/tests/unit/gpu_telemetry/conftest.py b/tests/unit/gpu_telemetry/conftest.py index c15fec719..75428a477 100644 --- a/tests/unit/gpu_telemetry/conftest.py +++ b/tests/unit/gpu_telemetry/conftest.py @@ -84,10 +84,10 @@ def sample_telemetry_records(): device="nvidia0", hostname="ed7e7a5e585f", telemetry_data=TelemetryMetrics( - gpu_power_usage=22.582, - energy_consumption=955.287014, - gpu_utilization=1.0, - gpu_memory_used=45.521, # 46614 MiB / 1024 ≈ 45.521 GB + nvidia_power_usage=22.582, + nvidia_energy_consumption=955.287014, + nvidia_gpu_utilization=1.0, + nvidia_memory_used=45.521, # 46614 MiB / 1024 ≈ 45.521 GB ), ), ] @@ -115,11 +115,11 @@ def multi_gpu_telemetry_records(): device="nvidia0", hostname="ed7e7a5e585f", telemetry_data=TelemetryMetrics( - gpu_power_usage=70.0 + (i % 30), # Varying power 70-99W - energy_consumption=(280000000 + (i * 2000000)) + nvidia_power_usage=70.0 + (i % 30), # Varying power 70-99W + nvidia_energy_consumption=(280000000 + (i * 2000000)) / 1e6, # Increasing energy - gpu_utilization=float(80 + (i % 20)), # 80-99% - gpu_memory_used=15.0 + (i % 5), # 15-19 GB + nvidia_gpu_utilization=float(80 + (i % 20)), # 80-99% + nvidia_memory_used=15.0 + (i % 5), # 15-19 GB ), ) ) @@ -136,11 +136,11 @@ def multi_gpu_telemetry_records(): device="nvidia1", hostname="ed7e7a5e585f", telemetry_data=TelemetryMetrics( - gpu_power_usage=42.0 + (i % 3), # Idle power 42-44W - energy_consumption=(230000000 + (i * 500000)) + nvidia_power_usage=42.0 + (i % 3), # Idle power 42-44W + nvidia_energy_consumption=(230000000 + (i * 500000)) / 1e6, # Slower energy growth - gpu_utilization=0.0, - gpu_memory_used=0.0, + nvidia_gpu_utilization=0.0, + nvidia_memory_used=0.0, ), ) ) @@ -157,10 +157,10 @@ def multi_gpu_telemetry_records(): device="nvidia2", hostname="ed7e7a5e585f", telemetry_data=TelemetryMetrics( - gpu_power_usage=200.0 + (i % 50), # Higher power 200-249W - energy_consumption=(250000000 + (i * 3000000)) / 1e6, - gpu_utilization=float(50 + (i % 30)), # 50-79% - gpu_memory_used=40.0 + (i % 10), # 40-49 GB + nvidia_power_usage=200.0 + (i % 50), # Higher power 200-249W + nvidia_energy_consumption=(250000000 + (i * 3000000)) / 1e6, + nvidia_gpu_utilization=float(50 + (i % 30)), # 50-79% + nvidia_memory_used=40.0 + (i % 10), # 40-49 GB ), ) ) diff --git a/tests/unit/gpu_telemetry/test_accumulator.py b/tests/unit/gpu_telemetry/test_accumulator.py index e9e5abdb3..0bae711d7 100644 --- a/tests/unit/gpu_telemetry/test_accumulator.py +++ b/tests/unit/gpu_telemetry/test_accumulator.py @@ -65,13 +65,13 @@ def sample_telemetry_record() -> TelemetryRecord: pci_bus_id="00000000:02:00.0", device="nvidia0", hostname="node1", - gpu_power_usage=75.5, - energy_consumption=1000.0, - gpu_utilization=85.0, - gpu_memory_used=15.26, - gpu_temperature=70.0, - xid_errors=0.0, - power_violation=120.0, + nvidia_power_usage=75.5, + nvidia_energy_consumption=1000.0, + nvidia_gpu_utilization=85.0, + nvidia_memory_used=15.26, + nvidia_temperature=70.0, + nvidia_xid_errors=0.0, + nvidia_power_violation=120.0, ) @@ -158,10 +158,10 @@ async def test_summarize_with_valid_data( gpu_index=sample_telemetry_record.gpu_index, gpu_uuid=sample_telemetry_record.gpu_uuid, gpu_model_name=sample_telemetry_record.gpu_model_name, - gpu_power_usage=75.0 + i, - energy_consumption=1000.0 + i * 10, - gpu_utilization=80.0 + i, - gpu_memory_used=15.0 + i * 0.1, + nvidia_power_usage=75.0 + i, + nvidia_energy_consumption=1000.0 + i * 10, + nvidia_gpu_utilization=80.0 + i, + nvidia_memory_used=15.0 + i * 0.1, ) await processor.process_telemetry_record(record) @@ -173,8 +173,8 @@ async def test_summarize_with_valid_data( # Check that metrics are properly tagged result_tags = [r.tag for r in results] - assert any("gpu_power_usage" in tag for tag in result_tags) - assert any("energy_consumption" in tag for tag in result_tags) + assert any("nvidia_power_usage" in tag for tag in result_tags) + assert any("nvidia_energy_consumption" in tag for tag in result_tags) @pytest.mark.asyncio async def test_summarize_handles_no_metric_value( @@ -333,18 +333,18 @@ async def test_summarize_generates_correct_tags( timestamp_ns=1000000000 + i * 1000000, gpu_uuid="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc", gpu_model_name="NVIDIA RTX 6000", - gpu_power_usage=75.0 + i, + nvidia_power_usage=75.0 + i, ) await processor.process_telemetry_record(record) results = await processor.summarize() # Check tag format: metric_name_dcgm_TAG_gpuINDEX_UUID - power_results = [r for r in results if "gpu_power_usage" in r.tag] + power_results = [r for r in results if "nvidia_power_usage" in r.tag] assert len(power_results) > 0 tag = power_results[0].tag - assert "gpu_power_usage" in tag + assert "nvidia_power_usage" in tag assert "dcgm_http" in tag # URL gets sanitized assert "node1" in tag assert "gpu0" in tag @@ -369,7 +369,7 @@ async def test_summarize_multiple_gpus( gpu_index=gpu_index, gpu_uuid=f"GPU-0000000{gpu_index}-0000-0000-0000-000000000000", gpu_model_name="NVIDIA RTX 6000", - gpu_power_usage=75.0 + gpu_index * 10 + i, + nvidia_power_usage=75.0 + gpu_index * 10 + i, ) await processor.process_telemetry_record(record) diff --git a/tests/unit/gpu_telemetry/test_amdsmi_collector.py b/tests/unit/gpu_telemetry/test_amdsmi_collector.py index ec0454155..fe1c9da62 100644 --- a/tests/unit/gpu_telemetry/test_amdsmi_collector.py +++ b/tests/unit/gpu_telemetry/test_amdsmi_collector.py @@ -375,12 +375,12 @@ async def test_collect_has_all_expected_fields(self, initialized_collector): assert td0.amd_mm_activity is None # NVML-named fields must NOT be set by the AMD collector. - assert td0.gpu_utilization is None - assert td0.sm_utilization is None - assert td0.mem_utilization is None - assert td0.encoder_utilization is None - assert td0.decoder_utilization is None - assert td0.jpg_utilization is None + assert td0.nvidia_gpu_utilization is None + assert td0.nvidia_sm_utilization is None + assert td0.nvidia_memory_utilization is None + assert td0.nvidia_encoder_utilization is None + assert td0.nvidia_decoder_utilization is None + assert td0.nvidia_jpg_utilization is None # VRAM: 183_678_435_328 bytes -> ~183.68 GB assert td0.amd_memory_used == pytest.approx(183.68, rel=1e-3) @@ -430,8 +430,8 @@ async def test_throttle_status_is_snapshot_not_accumulation( # GPU 1 not throttling -> 0.0. assert records1[1].telemetry_data.amd_throttle_status == 0.0 assert records2[1].telemetry_data.amd_throttle_status == 0.0 - # The synthesized power_violation field is no longer populated. - assert records2[0].telemetry_data.power_violation is None + # The synthesized nvidia_power_violation field is no longer populated. + assert records2[0].telemetry_data.nvidia_power_violation is None @pytest.mark.asyncio async def test_temperature_normalized_when_returned_in_millidegrees( @@ -559,9 +559,9 @@ async def test_ecc_uncorrectable_emitted_under_amd_namespace( records = await initialized_collector._loop_to_thread_collect() assert records[0].telemetry_data.amd_ecc_uncorrectable == 0.0 assert records[1].telemetry_data.amd_ecc_uncorrectable == 2.0 - # The synthesized xid_errors alias is no longer populated. - assert records[0].telemetry_data.xid_errors is None - assert records[1].telemetry_data.xid_errors is None + # The synthesized nvidia_xid_errors alias is no longer populated. + assert records[0].telemetry_data.nvidia_xid_errors is None + assert records[1].telemetry_data.nvidia_xid_errors is None # --------------------------------------------------------------------------- diff --git a/tests/unit/gpu_telemetry/test_jsonl_writer.py b/tests/unit/gpu_telemetry/test_jsonl_writer.py index c25073684..9b17d48fc 100644 --- a/tests/unit/gpu_telemetry/test_jsonl_writer.py +++ b/tests/unit/gpu_telemetry/test_jsonl_writer.py @@ -50,13 +50,13 @@ def sample_telemetry_record() -> TelemetryRecord: pci_bus_id="00000000:02:00.0", device="nvidia0", hostname="node1", - gpu_power_usage=75.5, - energy_consumption=1000.0, - gpu_utilization=85.0, - gpu_memory_used=15.26, - gpu_temperature=70.0, - xid_errors=None, - power_violation=0.0, + nvidia_power_usage=75.5, + nvidia_energy_consumption=1000.0, + nvidia_gpu_utilization=85.0, + nvidia_memory_used=15.26, + nvidia_temperature=70.0, + nvidia_xid_errors=None, + nvidia_power_violation=0.0, ) @@ -72,13 +72,13 @@ def sample_telemetry_record_partial() -> TelemetryRecord: pci_bus_id=None, device=None, hostname="node2", - gpu_power_usage=150.0, - energy_consumption=None, - gpu_utilization=95.0, - gpu_memory_used=70.0, - gpu_temperature=85.0, - xid_errors=None, - power_violation=None, + nvidia_power_usage=150.0, + nvidia_energy_consumption=None, + nvidia_gpu_utilization=95.0, + nvidia_memory_used=70.0, + nvidia_temperature=85.0, + nvidia_xid_errors=None, + nvidia_power_violation=None, ) @@ -223,8 +223,8 @@ async def test_process_telemetry_record_with_complete_data( assert record.gpu_index == 0 assert record.gpu_uuid == "GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc" assert record.gpu_model_name == "NVIDIA RTX 6000 Ada Generation" - assert record.telemetry_data.gpu_power_usage == 75.5 - assert record.telemetry_data.gpu_utilization == 85.0 + assert record.telemetry_data.nvidia_power_usage == 75.5 + assert record.telemetry_data.nvidia_gpu_utilization == 85.0 @pytest.mark.asyncio async def test_process_telemetry_record_with_partial_data( @@ -249,8 +249,8 @@ async def test_process_telemetry_record_with_partial_data( assert record.timestamp_ns == 2_000_000_000 assert record.pci_bus_id is None assert record.device is None - assert record.telemetry_data.energy_consumption is None - assert record.telemetry_data.power_violation is None + assert record.telemetry_data.nvidia_energy_consumption is None + assert record.telemetry_data.nvidia_power_violation is None @pytest.mark.asyncio async def test_process_multiple_telemetry_records( @@ -268,8 +268,8 @@ async def test_process_multiple_telemetry_records( make_telemetry_record( timestamp_ns=1_000_000_000 + i * 1_000_000, gpu_uuid="GPU-test-uuid", - gpu_power_usage=100.0 + i, - gpu_utilization=80.0 + i, + nvidia_power_usage=100.0 + i, + nvidia_gpu_utilization=80.0 + i, ) for i in range(5) ] @@ -327,7 +327,7 @@ async def test_buffer_auto_flush_at_batch_size( for i in range(batch_size * 2): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -352,7 +352,7 @@ async def test_multiple_gpus_same_endpoint( gpu_index=i, gpu_uuid=f"GPU-{i}", gpu_model_name=f"GPU {i}", - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) for i in range(4) ] @@ -418,7 +418,7 @@ async def test_records_written_count( for i in range(10): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -516,16 +516,16 @@ async def test_preserves_all_telemetry_fields( # Check telemetry data fields assert ( - record.telemetry_data.gpu_power_usage - == sample_telemetry_record.telemetry_data.gpu_power_usage + record.telemetry_data.nvidia_power_usage + == sample_telemetry_record.telemetry_data.nvidia_power_usage ) assert ( - record.telemetry_data.gpu_utilization - == sample_telemetry_record.telemetry_data.gpu_utilization + record.telemetry_data.nvidia_gpu_utilization + == sample_telemetry_record.telemetry_data.nvidia_gpu_utilization ) assert ( - record.telemetry_data.gpu_memory_used - == sample_telemetry_record.telemetry_data.gpu_memory_used + record.telemetry_data.nvidia_memory_used + == sample_telemetry_record.telemetry_data.nvidia_memory_used ) @pytest.mark.asyncio @@ -551,7 +551,7 @@ async def test_handles_none_values( # Verify None values are not present in the serialized dict assert "pci_bus_id" not in record_dict assert "device" not in record_dict - assert "energy_consumption" not in record_dict["telemetry_data"] + assert "nvidia_energy_consumption" not in record_dict["telemetry_data"] @pytest.mark.asyncio async def test_timestamp_precision( @@ -695,8 +695,8 @@ async def test_lifecycle_with_mock_aiofiles( for i in range(Environment.RECORD.EXPORT_BATCH_SIZE * 2): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, - gpu_utilization=80.0, + nvidia_power_usage=100.0 + i, + nvidia_gpu_utilization=80.0, ) await processor.process_telemetry_record(record) @@ -715,7 +715,7 @@ async def test_lifecycle_with_mock_aiofiles( record = TelemetryRecord.model_validate_json(line) assert record.timestamp_ns == 1_000_000_000 + i assert record.gpu_uuid == "GPU-test" - assert record.telemetry_data.gpu_power_usage == 100.0 + i + assert record.telemetry_data.nvidia_power_usage == 100.0 + i @pytest.mark.asyncio async def test_file_handle_lifecycle( @@ -761,7 +761,7 @@ async def test_flush_on_shutdown( for i in range(num_records): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -785,7 +785,7 @@ async def test_wait_for_async_tasks( for i in range(processor._batch_size * 3): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -813,7 +813,7 @@ async def test_statistics_logged_on_shutdown( for i in range(5): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -882,8 +882,8 @@ async def test_concurrent_writes( gpu_index=i % 4, gpu_uuid=f"GPU-{i}", hostname=f"node{i % 3}", - gpu_power_usage=100.0 + i, - gpu_utilization=80.0, + nvidia_power_usage=100.0 + i, + nvidia_gpu_utilization=80.0, ) await processor.process_telemetry_record(record) @@ -912,7 +912,7 @@ async def test_large_batch_processing( for i in range(total_records): record = make_telemetry_record( timestamp_ns=1_000_000_000 + i, - gpu_power_usage=100.0 + i, + nvidia_power_usage=100.0 + i, ) await processor.process_telemetry_record(record) @@ -943,8 +943,8 @@ async def test_interleaved_gpu_records( gpu_index=gpu_idx, gpu_uuid=f"GPU-{gpu_idx}", gpu_model_name=f"GPU {gpu_idx}", - gpu_power_usage=100.0 + gpu_idx, - gpu_utilization=80.0 + cycle, + nvidia_power_usage=100.0 + gpu_idx, + nvidia_gpu_utilization=80.0 + cycle, ) await processor.process_telemetry_record(record) diff --git a/tests/unit/gpu_telemetry/test_metrics_config.py b/tests/unit/gpu_telemetry/test_metrics_config.py index be886f6e2..813bf598c 100644 --- a/tests/unit/gpu_telemetry/test_metrics_config.py +++ b/tests/unit/gpu_telemetry/test_metrics_config.py @@ -144,6 +144,9 @@ def test_infer_unit_from_help_message(self): assert ( loader._infer_unit_from_help("Memory (in MB)") == MetricSizeUnit.MEGABYTES ) + assert ( + loader._infer_unit_from_help("Memory (in MiB)") == MetricSizeUnit.MEGABYTES + ) assert ( loader._infer_unit_from_help("Memory (in KB)") == MetricSizeUnit.KILOBYTES ) @@ -196,30 +199,32 @@ def test_build_custom_metrics_deduplication(self): ) # Should return 2 metrics: - # - DCGM_FI_DEV_GPU_TEMP → gpu_temperature (in 7 defaults, SKIPPED) - # - DCGM_FI_DEV_POWER_USAGE → gpu_power_usage (in 7 defaults, SKIPPED) - # - DCGM_FI_DEV_SM_CLOCK → sm_clock (NOT in 7 defaults, ADDED with auto-generated name) - # - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL → nvlink_bandwidth_total (new field, ADDED) + # - DCGM_FI_DEV_GPU_TEMP → nvidia_temperature (in 7 defaults, SKIPPED) + # - DCGM_FI_DEV_POWER_USAGE → nvidia_power_usage (in 7 defaults, SKIPPED) + # - DCGM_FI_DEV_SM_CLOCK → nvidia_sm_clock (NOT in 7 defaults, ADDED with auto-generated name) + # - DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL → nvidia_nvlink_bandwidth_total (new field, ADDED) assert len(custom_metrics) == 2 custom_field_names = {m[1] for m in custom_metrics} - assert "sm_clock" in custom_field_names - assert "nvlink_bandwidth_total" in custom_field_names + assert "nvidia_sm_clock" in custom_field_names + assert "nvidia_nvlink_bandwidth_total" in custom_field_names # Verify the new DCGM mappings were returned assert "DCGM_FI_DEV_SM_CLOCK" in new_dcgm_mappings - assert new_dcgm_mappings["DCGM_FI_DEV_SM_CLOCK"] == "sm_clock" + assert new_dcgm_mappings["DCGM_FI_DEV_SM_CLOCK"] == "nvidia_sm_clock" assert "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL" in new_dcgm_mappings assert ( new_dcgm_mappings["DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL"] - == "nvlink_bandwidth_total" + == "nvidia_nvlink_bandwidth_total" ) # Apply mappings (simulating what cli_runner does) DCGM_TO_FIELD_MAPPING.update(new_dcgm_mappings) # Verify existing mappings were NOT changed - assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_GPU_TEMP"] == "gpu_temperature" - assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_POWER_USAGE"] == "gpu_power_usage" + assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_GPU_TEMP"] == "nvidia_temperature" + assert ( + DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_POWER_USAGE"] == "nvidia_power_usage" + ) finally: csv_path.unlink() @@ -241,8 +246,8 @@ def test_build_custom_metrics_field_name_generation(self): # Verify field names assert len(custom_metrics) == 2 - assert custom_metrics[0][1] == "nvlink_bandwidth_total" - assert custom_metrics[1][1] == "dcgm_fi_prof_pipe_tensor_active" + assert custom_metrics[0][1] == "nvidia_nvlink_bandwidth_total" + assert custom_metrics[1][1] == "nvidia_pipe_tensor_active" # Verify display names extracted from help messages (title cased with acronyms) assert custom_metrics[0][0] == "NVLINK Bandwidth" @@ -275,10 +280,10 @@ def test_metrics_removed_from_defaults_are_added_from_csv(self): # These fields were removed from DCGM mapping and are not in the 7 display defaults # When added via CSV, they'll get auto-generated field names - assert "sm_clock" not in existing_field_names - assert "mem_clock" not in existing_field_names - assert "memory_temp" not in existing_field_names - assert "power_mgmt_limit" not in existing_field_names + assert "nvidia_sm_clock" not in existing_field_names + assert "nvidia_mem_clock" not in existing_field_names + assert "nvidia_memory_temp" not in existing_field_names + assert "nvidia_power_mgmt_limit" not in existing_field_names custom_metrics, new_dcgm_mappings = loader.build_custom_metrics_from_csv( custom_csv_path=csv_path @@ -288,19 +293,19 @@ def test_metrics_removed_from_defaults_are_added_from_csv(self): assert len(custom_metrics) == 4 custom_field_names = {m[1] for m in custom_metrics} assert custom_field_names == { - "sm_clock", - "mem_clock", - "memory_temp", - "power_mgmt_limit", + "nvidia_sm_clock", + "nvidia_mem_clock", + "nvidia_memory_temp", + "nvidia_power_mgmt_limit", } # Verify new DCGM mappings (only if they didn't already exist from previous tests) # Note: Tests may run in any order, so some may already be in mapping for dcgm_field, field_name in [ - ("DCGM_FI_DEV_SM_CLOCK", "sm_clock"), - ("DCGM_FI_DEV_MEM_CLOCK", "mem_clock"), - ("DCGM_FI_DEV_MEMORY_TEMP", "memory_temp"), - ("DCGM_FI_DEV_POWER_MGMT_LIMIT", "power_mgmt_limit"), + ("DCGM_FI_DEV_SM_CLOCK", "nvidia_sm_clock"), + ("DCGM_FI_DEV_MEM_CLOCK", "nvidia_mem_clock"), + ("DCGM_FI_DEV_MEMORY_TEMP", "nvidia_memory_temp"), + ("DCGM_FI_DEV_POWER_MGMT_LIMIT", "nvidia_power_mgmt_limit"), ]: if dcgm_field in new_dcgm_mappings: assert new_dcgm_mappings[dcgm_field] == field_name @@ -309,22 +314,24 @@ def test_metrics_removed_from_defaults_are_added_from_csv(self): DCGM_TO_FIELD_MAPPING.update(new_dcgm_mappings) # After applying, all should be in the global mapping - assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_SM_CLOCK"] == "sm_clock" - assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_MEM_CLOCK"] == "mem_clock" - assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_MEMORY_TEMP"] == "memory_temp" + assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_SM_CLOCK"] == "nvidia_sm_clock" + assert DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_MEM_CLOCK"] == "nvidia_mem_clock" + assert ( + DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_MEMORY_TEMP"] == "nvidia_memory_temp" + ) assert ( DCGM_TO_FIELD_MAPPING["DCGM_FI_DEV_POWER_MGMT_LIMIT"] - == "power_mgmt_limit" + == "nvidia_power_mgmt_limit" ) # Verify the auto-generated field names are correct for metric in custom_metrics: display_name, field_name, unit = metric assert field_name in [ - "sm_clock", - "mem_clock", - "memory_temp", - "power_mgmt_limit", + "nvidia_sm_clock", + "nvidia_mem_clock", + "nvidia_memory_temp", + "nvidia_power_mgmt_limit", ] finally: csv_path.unlink() diff --git a/tests/unit/gpu_telemetry/test_pynvml_collector.py b/tests/unit/gpu_telemetry/test_pynvml_collector.py index d803905d4..c1c0180ab 100644 --- a/tests/unit/gpu_telemetry/test_pynvml_collector.py +++ b/tests/unit/gpu_telemetry/test_pynvml_collector.py @@ -361,27 +361,27 @@ async def test_collect_gpu_metrics(self, initialized_collector): assert gpu0.dcgm_url == PYNVML_SOURCE_IDENTIFIER assert gpu0.gpu_uuid == "GPU-abc123" assert gpu0.gpu_model_name == "NVIDIA GeForce RTX 4090" - assert gpu0.telemetry_data.gpu_power_usage == pytest.approx(350.0, rel=0.01) - assert gpu0.telemetry_data.gpu_utilization == 95.0 - assert gpu0.telemetry_data.mem_utilization == 45.0 - assert gpu0.telemetry_data.gpu_temperature == 72.0 - assert gpu0.telemetry_data.gpu_memory_used == pytest.approx(20.0, rel=0.1) - assert gpu0.telemetry_data.encoder_utilization == 30.0 - assert gpu0.telemetry_data.decoder_utilization == 25.0 - assert gpu0.telemetry_data.jpg_utilization == 10.0 - assert gpu0.telemetry_data.sm_utilization == 85.0 - assert gpu0.telemetry_data.power_violation == 5000.0 + assert gpu0.telemetry_data.nvidia_power_usage == pytest.approx(350.0, rel=0.01) + assert gpu0.telemetry_data.nvidia_gpu_utilization == 95.0 + assert gpu0.telemetry_data.nvidia_memory_utilization == 45.0 + assert gpu0.telemetry_data.nvidia_temperature == 72.0 + assert gpu0.telemetry_data.nvidia_memory_used == pytest.approx(20.0, rel=0.1) + assert gpu0.telemetry_data.nvidia_encoder_utilization == 30.0 + assert gpu0.telemetry_data.nvidia_decoder_utilization == 25.0 + assert gpu0.telemetry_data.nvidia_jpg_utilization == 10.0 + assert gpu0.telemetry_data.nvidia_sm_utilization == 85.0 + assert gpu0.telemetry_data.nvidia_power_violation == 5000.0 # GPU 1 verification assert gpu1.gpu_uuid == "GPU-def456" - assert gpu1.telemetry_data.gpu_power_usage == pytest.approx(280.0, rel=0.01) - assert gpu1.telemetry_data.gpu_utilization == 75.0 - assert gpu1.telemetry_data.mem_utilization == 35.0 - assert gpu1.telemetry_data.encoder_utilization == 20.0 - assert gpu1.telemetry_data.decoder_utilization == 15.0 - assert gpu1.telemetry_data.jpg_utilization == 5.0 - assert gpu1.telemetry_data.sm_utilization == 65.0 - assert gpu1.telemetry_data.power_violation == 2000.0 + assert gpu1.telemetry_data.nvidia_power_usage == pytest.approx(280.0, rel=0.01) + assert gpu1.telemetry_data.nvidia_gpu_utilization == 75.0 + assert gpu1.telemetry_data.nvidia_memory_utilization == 35.0 + assert gpu1.telemetry_data.nvidia_encoder_utilization == 20.0 + assert gpu1.telemetry_data.nvidia_decoder_utilization == 15.0 + assert gpu1.telemetry_data.nvidia_jpg_utilization == 5.0 + assert gpu1.telemetry_data.nvidia_sm_utilization == 65.0 + assert gpu1.telemetry_data.nvidia_power_violation == 2000.0 @pytest.mark.asyncio async def test_collect_handles_nvml_errors(self, patch_pynvml): @@ -400,9 +400,9 @@ async def test_collect_handles_nvml_errors(self, patch_pynvml): # Should still get records with other metrics assert len(records) == 2 for r in records: - assert r.telemetry_data.gpu_power_usage is None - assert r.telemetry_data.gpu_utilization is not None - assert r.telemetry_data.gpu_temperature is not None + assert r.telemetry_data.nvidia_power_usage is None + assert r.telemetry_data.nvidia_gpu_utilization is not None + assert r.telemetry_data.nvidia_temperature is not None await collector.stop() @@ -498,9 +498,9 @@ class TestPyNVMLScalingFactors: @pytest.mark.parametrize( "field,factor,raw_value,expected", [ - param("gpu_power_usage", 1e-3, 350000, 350.0, id="power_mW_to_W"), - param("energy_consumption", 1e-9, 1e9, 1.0, id="energy_mJ_to_MJ"), - param("gpu_memory_used", 1e-9, 20e9, 20.0, id="memory_bytes_to_GB"), + param("nvidia_power_usage", 1e-3, 350000, 350.0, id="power_mW_to_W"), + param("nvidia_energy_consumption", 1e-9, 1e9, 1.0, id="energy_mJ_to_MJ"), + param("nvidia_memory_used", 1e-9, 20e9, 20.0, id="memory_bytes_to_GB"), ], ) def test_scaling_factor(self, field, factor, raw_value, expected): @@ -556,16 +556,18 @@ async def test_handles_bytes_values( await collector.stop() @pytest.mark.asyncio - async def test_energy_consumption_collected(self, initialized_collector): + async def test_nvidia_energy_consumption_collected(self, initialized_collector): """Test energy consumption metric is collected and scaled correctly.""" records = initialized_collector._collect_gpu_metrics() gpu0 = next(r for r in records if r.gpu_index == 0) # 1000000000 mJ * 1e-9 = 1.0 MJ - assert gpu0.telemetry_data.energy_consumption == pytest.approx(1.0, rel=0.01) + assert gpu0.telemetry_data.nvidia_energy_consumption == pytest.approx( + 1.0, rel=0.01 + ) @pytest.mark.asyncio - async def test_sm_utilization_sums_multiple_processes(self, patch_pynvml): + async def test_nvidia_sm_utilization_sums_multiple_processes(self, patch_pynvml): """Test SM utilization sums across multiple processes on same GPU.""" mock_pynvml, PyNVMLTelemetryCollector = patch_pynvml @@ -583,12 +585,12 @@ async def test_sm_utilization_sums_multiple_processes(self, patch_pynvml): gpu0 = next(r for r in records if r.gpu_index == 0) # Should sum: 40 + 35 = 75 - assert gpu0.telemetry_data.sm_utilization == 75.0 + assert gpu0.telemetry_data.nvidia_sm_utilization == 75.0 await collector.stop() @pytest.mark.asyncio - async def test_empty_process_list_zero_sm_utilization(self, patch_pynvml): + async def test_empty_process_list_zero_nvidia_sm_utilization(self, patch_pynvml): """Test SM utilization is 0.0 when no processes are running.""" mock_pynvml, PyNVMLTelemetryCollector = patch_pynvml @@ -602,12 +604,12 @@ async def test_empty_process_list_zero_sm_utilization(self, patch_pynvml): records = collector._collect_gpu_metrics() for r in records: - assert r.telemetry_data.sm_utilization == 0.0 + assert r.telemetry_data.nvidia_sm_utilization == 0.0 await collector.stop() @pytest.mark.asyncio - async def test_sm_utilization_capped_at_100(self, patch_pynvml): + async def test_nvidia_sm_utilization_capped_at_100(self, patch_pynvml): """Test SM utilization is capped at 100% when sum exceeds it.""" mock_pynvml, PyNVMLTelemetryCollector = patch_pynvml @@ -625,7 +627,7 @@ async def test_sm_utilization_capped_at_100(self, patch_pynvml): gpu0 = next(r for r in records if r.gpu_index == 0) # Sum would be 60 + 55 = 115, but should be capped at 100.0 - assert gpu0.telemetry_data.sm_utilization == 100.0 + assert gpu0.telemetry_data.nvidia_sm_utilization == 100.0 await collector.stop() @@ -686,7 +688,7 @@ async def test_gpm_not_supported_uses_process_api(self, patch_pynvml): # Should still collect SM utilization via process API records = collector._collect_gpu_metrics() - assert all(r.telemetry_data.sm_utilization is not None for r in records) + assert all(r.telemetry_data.nvidia_sm_utilization is not None for r in records) await collector.stop() @@ -761,7 +763,7 @@ def mock_gpm_metrics_get(metrics_get): # SM utilization should come from GPM gpu0 = next(r for r in records if r.gpu_index == 0) - assert gpu0.telemetry_data.sm_utilization == 88.5 + assert gpu0.telemetry_data.nvidia_sm_utilization == 88.5 await collector.stop() @@ -783,7 +785,7 @@ async def test_gpm_query_support_failure_disables_gpm(self, patch_pynvml): # Should still work via process API records = collector._collect_gpu_metrics() - assert all(r.telemetry_data.sm_utilization is not None for r in records) + assert all(r.telemetry_data.nvidia_sm_utilization is not None for r in records) await collector.stop() @@ -833,7 +835,7 @@ async def test_gpm_metrics_get_failure_falls_back_to_process_api( records = collector._collect_gpu_metrics() # Should still get SM utilization from process API fallback - assert all(r.telemetry_data.sm_utilization is not None for r in records) + assert all(r.telemetry_data.nvidia_sm_utilization is not None for r in records) # Process API should have been called mock_pynvml.nvmlDeviceGetProcessesUtilizationInfo.assert_called() diff --git a/tests/unit/gpu_telemetry/test_telemetry_data_collector.py b/tests/unit/gpu_telemetry/test_telemetry_data_collector.py index 42a9dd722..5a5c5ce79 100644 --- a/tests/unit/gpu_telemetry/test_telemetry_data_collector.py +++ b/tests/unit/gpu_telemetry/test_telemetry_data_collector.py @@ -89,10 +89,10 @@ def test_complete_parsing_single_gpu(self, sample_dcgm_data): assert record.hostname == "ed7e7a5e585f" # Verify telemetry data has reasonable values from DCGMFaker - assert record.telemetry_data.gpu_power_usage is not None - assert 0 < record.telemetry_data.gpu_power_usage < 400 - assert record.telemetry_data.energy_consumption is not None - assert record.telemetry_data.gpu_memory_used is not None + assert record.telemetry_data.nvidia_power_usage is not None + assert 0 < record.telemetry_data.nvidia_power_usage < 400 + assert record.telemetry_data.nvidia_energy_consumption is not None + assert record.telemetry_data.nvidia_memory_used is not None def test_complete_parsing_multi_gpu(self, multi_gpu_dcgm_data): """Test parsing complete DCGM response for multiple GPUs. @@ -463,17 +463,19 @@ def test_unit_scaling_accuracy(self): collector = DCGMTelemetryCollector("http://localhost:9401/metrics") test_metrics = { - "gpu_power_usage": 100.0, # Should remain unchanged (W) - "energy_consumption": 1000.0, # mJ -> MJ (divide by 1e9) - "gpu_memory_used": 1024.0, # MiB -> GB (divide by 953.674...) + "nvidia_power_usage": 100.0, # Should remain unchanged (W) + "nvidia_energy_consumption": 1000.0, # mJ -> MJ (divide by 1e9) + "nvidia_memory_used": 1024.0, # MiB -> GB (divide by 953.674...) } scaled = collector._apply_scaling_factors(test_metrics) - assert scaled["gpu_power_usage"] == 100.0 - assert abs(scaled["energy_consumption"] - 1e-6) < 1e-10 # 1000mJ = 1e-6 MJ + assert scaled["nvidia_power_usage"] == 100.0 assert ( - abs(scaled["gpu_memory_used"] - 1.073741824) < 1e-6 + abs(scaled["nvidia_energy_consumption"] - 1e-6) < 1e-10 + ) # 1000mJ = 1e-6 MJ + assert ( + abs(scaled["nvidia_memory_used"] - 1.073741824) < 1e-6 ) # 1024 MiB ≈ 1.073 GB def test_temporal_consistency_in_batches(self, sample_dcgm_data): @@ -545,11 +547,11 @@ def test_nan_inf_values_filtering(self): # Should only include the valid metric assert len(records) == 1 # NaN, Inf, -Inf should be filtered out - assert records[0].telemetry_data.gpu_power_usage is None - assert records[0].telemetry_data.gpu_utilization is None - assert records[0].telemetry_data.gpu_temperature is None + assert records[0].telemetry_data.nvidia_power_usage is None + assert records[0].telemetry_data.nvidia_gpu_utilization is None + assert records[0].telemetry_data.nvidia_temperature is None # Valid value should be present - assert records[0].telemetry_data.gpu_memory_used is not None + assert records[0].telemetry_data.nvidia_memory_used is not None def test_invalid_gpu_index_handling(self): """Test handling of non-numeric GPU index values.""" @@ -639,25 +641,25 @@ def test_scaling_factors_with_none_values(self): collector = DCGMTelemetryCollector("http://localhost:9401/metrics") metrics_with_none = { - "gpu_power_usage": None, - "energy_consumption": 1000.0, - "gpu_memory_used": None, + "nvidia_power_usage": None, + "nvidia_energy_consumption": 1000.0, + "nvidia_memory_used": None, } scaled = collector._apply_scaling_factors(metrics_with_none) # None values should remain None - assert scaled["gpu_power_usage"] is None - assert scaled["gpu_memory_used"] is None + assert scaled["nvidia_power_usage"] is None + assert scaled["nvidia_memory_used"] is None # Non-None values should be scaled - assert abs(scaled["energy_consumption"] - 1e-6) < 1e-10 + assert abs(scaled["nvidia_energy_consumption"] - 1e-6) < 1e-10 def test_scaling_factors_preserves_unscaled_metrics(self): """Test that metrics without scaling factors are preserved as-is.""" collector = DCGMTelemetryCollector("http://localhost:9401/metrics") metrics = { - "gpu_power_usage": 100.0, + "nvidia_power_usage": 100.0, "unscaled_metric": 999.0, } @@ -666,4 +668,4 @@ def test_scaling_factors_preserves_unscaled_metrics(self): # Unscaled metric should remain unchanged assert scaled["unscaled_metric"] == 999.0 # Scaled metric should remain unchanged (power has factor 1.0) - assert scaled["gpu_power_usage"] == 100.0 + assert scaled["nvidia_power_usage"] == 100.0 diff --git a/tests/unit/gpu_telemetry/test_telemetry_integration.py b/tests/unit/gpu_telemetry/test_telemetry_integration.py index b379c0ac3..68f29d4a5 100644 --- a/tests/unit/gpu_telemetry/test_telemetry_integration.py +++ b/tests/unit/gpu_telemetry/test_telemetry_integration.py @@ -219,9 +219,13 @@ def mock_aiohttp_get(url, **kwargs): assert len(metric_results) > 0 - power_metrics = [r for r in metric_results if "gpu_power_usage" in r.tag] - util_metrics = [r for r in metric_results if "gpu_utilization" in r.tag] - memory_metrics = [r for r in metric_results if "gpu_memory_used" in r.tag] + power_metrics = [r for r in metric_results if "nvidia_power_usage" in r.tag] + util_metrics = [ + r for r in metric_results if "nvidia_gpu_utilization" in r.tag + ] + memory_metrics = [ + r for r in metric_results if "nvidia_memory_used" in r.tag + ] assert len(power_metrics) > 0 assert len(util_metrics) > 0 @@ -424,9 +428,11 @@ def mock_aiohttp_get_scaling(url, **kwargs): await processor.process_telemetry_record(record) metric_results = await processor.summarize() - memory_metrics = [r for r in metric_results if "gpu_memory_used" in r.tag] + memory_metrics = [ + r for r in metric_results if "nvidia_memory_used" in r.tag + ] energy_metrics = [ - r for r in metric_results if "energy_consumption" in r.tag + r for r in metric_results if "nvidia_energy_consumption" in r.tag ] assert len(memory_metrics) > 0 diff --git a/tests/unit/gpu_telemetry/test_telemetry_manager.py b/tests/unit/gpu_telemetry/test_telemetry_manager.py index e3ef14f69..ea04ff70c 100644 --- a/tests/unit/gpu_telemetry/test_telemetry_manager.py +++ b/tests/unit/gpu_telemetry/test_telemetry_manager.py @@ -349,7 +349,7 @@ async def test_send_telemetry_status_disabled_with_reason(self): # Mock publish method manager.publish = AsyncMock() - reason = "no DCGM endpoints reachable" + reason = "no telemetry sources reachable" endpoints_tested = ["http://node1:9401/metrics"] await manager._send_telemetry_status( @@ -666,7 +666,7 @@ async def test_configure_no_shutdown_when_no_endpoints_reachable(self): manager.publish.assert_called_once() call_args = manager.publish.call_args[0][0] assert call_args.enabled is False - assert "no DCGM endpoints reachable" in call_args.reason + assert "no telemetry sources reachable" in call_args.reason # Should NOT have collectors assert len(manager._collectors) == 0 diff --git a/tests/unit/gpu_telemetry/test_telemetry_models.py b/tests/unit/gpu_telemetry/test_telemetry_models.py index bf1995320..f3eae9a51 100644 --- a/tests/unit/gpu_telemetry/test_telemetry_models.py +++ b/tests/unit/gpu_telemetry/test_telemetry_models.py @@ -70,6 +70,56 @@ def counter_time_series() -> GpuMetricTimeSeries: return ts +class TestTelemetryMetrics: + """Test TelemetryMetrics field naming and compatibility behavior.""" + + def test_legacy_nvidia_fields_validate_to_nvidia_field_names(self) -> None: + """Legacy NVIDIA input aliases are accepted but not re-emitted.""" + metrics = TelemetryMetrics( + gpu_power_usage=75.5, + energy_consumption=1.25, + gpu_utilization=85.0, + mem_utilization=45.0, + gpu_memory_used=15.26, + gpu_temperature=67.0, + sm_utilization=91.0, + decoder_utilization=25.0, + encoder_utilization=30.0, + jpg_utilization=10.0, + xid_errors=0.0, + power_violation=5000.0, + ) + + assert metrics.nvidia_power_usage == 75.5 + assert metrics.nvidia_energy_consumption == 1.25 + assert metrics.nvidia_gpu_utilization == 85.0 + assert metrics.nvidia_memory_utilization == 45.0 + assert metrics.nvidia_memory_used == 15.26 + assert metrics.nvidia_temperature == 67.0 + assert metrics.nvidia_sm_utilization == 91.0 + assert metrics.nvidia_decoder_utilization == 25.0 + assert metrics.nvidia_encoder_utilization == 30.0 + assert metrics.nvidia_jpg_utilization == 10.0 + assert metrics.nvidia_xid_errors == 0.0 + assert metrics.nvidia_power_violation == 5000.0 + + dumped = metrics.model_dump(exclude_none=True) + assert dumped == { + "nvidia_power_usage": 75.5, + "nvidia_energy_consumption": 1.25, + "nvidia_gpu_utilization": 85.0, + "nvidia_memory_utilization": 45.0, + "nvidia_memory_used": 15.26, + "nvidia_temperature": 67.0, + "nvidia_sm_utilization": 91.0, + "nvidia_decoder_utilization": 25.0, + "nvidia_encoder_utilization": 30.0, + "nvidia_jpg_utilization": 10.0, + "nvidia_xid_errors": 0.0, + "nvidia_power_violation": 5000.0, + } + + class TestTelemetryRecord: """Test TelemetryRecord model validation and data structure integrity. @@ -96,10 +146,10 @@ def test_telemetry_record_complete_creation(self): device="nvidia0", hostname="ed7e7a5e585f", telemetry_data=TelemetryMetrics( - gpu_power_usage=75.5, - energy_consumption=1000000000, - gpu_utilization=85.0, - gpu_memory_used=15.26, + nvidia_power_usage=75.5, + nvidia_energy_consumption=1000000000, + nvidia_gpu_utilization=85.0, + nvidia_memory_used=15.26, ), ) @@ -113,10 +163,10 @@ def test_telemetry_record_complete_creation(self): assert record.device == "nvidia0" assert record.hostname == "ed7e7a5e585f" - assert record.telemetry_data.gpu_power_usage == 75.5 - assert record.telemetry_data.energy_consumption == 1000000000 - assert record.telemetry_data.gpu_utilization == 85.0 - assert record.telemetry_data.gpu_memory_used == 15.26 + assert record.telemetry_data.nvidia_power_usage == 75.5 + assert record.telemetry_data.nvidia_energy_consumption == 1000000000 + assert record.telemetry_data.nvidia_gpu_utilization == 85.0 + assert record.telemetry_data.nvidia_memory_used == 15.26 def test_telemetry_record_minimal_creation(self): """Test creating a TelemetryRecord with only required fields. @@ -145,10 +195,10 @@ def test_telemetry_record_minimal_creation(self): assert record.pci_bus_id is None assert record.device is None assert record.hostname is None - assert record.telemetry_data.gpu_power_usage is None - assert record.telemetry_data.energy_consumption is None - assert record.telemetry_data.gpu_utilization is None - assert record.telemetry_data.gpu_memory_used is None + assert record.telemetry_data.nvidia_power_usage is None + assert record.telemetry_data.nvidia_energy_consumption is None + assert record.telemetry_data.nvidia_gpu_utilization is None + assert record.telemetry_data.nvidia_memory_used is None def test_telemetry_record_field_validation(self): """Test Pydantic validation of required fields. @@ -217,17 +267,17 @@ def test_snapshot_creation_with_metrics(self): snapshot = GpuTelemetrySnapshot( timestamp_ns=1000000000, metrics={ - "gpu_power_usage": 75.5, - "gpu_utilization": 85.0, - "gpu_memory_used": 15.26, + "nvidia_power_usage": 75.5, + "nvidia_gpu_utilization": 85.0, + "nvidia_memory_used": 15.26, }, ) assert snapshot.timestamp_ns == 1000000000 assert len(snapshot.metrics) == 3 - assert snapshot.metrics["gpu_power_usage"] == 75.5 - assert snapshot.metrics["gpu_utilization"] == 85.0 - assert snapshot.metrics["gpu_memory_used"] == 15.26 + assert snapshot.metrics["nvidia_power_usage"] == 75.5 + assert snapshot.metrics["nvidia_gpu_utilization"] == 85.0 + assert snapshot.metrics["nvidia_memory_used"] == 15.26 def test_snapshot_empty_metrics(self): """Test creating a snapshot with no metrics.""" @@ -806,44 +856,44 @@ def test_add_record_grouped(self, gpu_telemetry_data: GpuTelemetryData): """Test adding TelemetryRecord creates grouped snapshots.""" record = _make_record( 1_000_000_000, - gpu_power_usage=100.0, - gpu_utilization=80.0, - gpu_memory_used=15.0, + nvidia_power_usage=100.0, + nvidia_gpu_utilization=80.0, + nvidia_memory_used=15.0, ) gpu_telemetry_data.add_record(record) ts = gpu_telemetry_data.time_series assert len(ts) == 1 assert ts.timestamps[0] == 1_000_000_000 - assert ts.get_metric_array("gpu_power_usage")[0] == 100.0 - assert ts.get_metric_array("gpu_utilization")[0] == 80.0 - assert ts.get_metric_array("gpu_memory_used")[0] == 15.0 + assert ts.get_metric_array("nvidia_power_usage")[0] == 100.0 + assert ts.get_metric_array("nvidia_gpu_utilization")[0] == 80.0 + assert ts.get_metric_array("nvidia_memory_used")[0] == 15.0 def test_add_record_filters_none_values(self, gpu_telemetry_data: GpuTelemetryData): """Test that None metric values are filtered out.""" record = _make_record( 1_000_000_000, - gpu_power_usage=100.0, - gpu_memory_used=15.0, - # gpu_utilization intentionally omitted (will be None) + nvidia_power_usage=100.0, + nvidia_memory_used=15.0, + # nvidia_gpu_utilization intentionally omitted (will be None) ) gpu_telemetry_data.add_record(record) ts = gpu_telemetry_data.time_series assert len(ts) == 1 - assert ts.get_metric_array("gpu_power_usage") is not None - assert ts.get_metric_array("gpu_memory_used") is not None - assert ts.get_metric_array("gpu_utilization") is None + assert ts.get_metric_array("nvidia_power_usage") is not None + assert ts.get_metric_array("nvidia_memory_used") is not None + assert ts.get_metric_array("nvidia_gpu_utilization") is None def test_get_metric_result(self, gpu_telemetry_data: GpuTelemetryData): """Test getting MetricResult for a specific metric.""" for i, power in enumerate([100.0, 120.0, 80.0]): gpu_telemetry_data.add_record( - _make_record(1_000_000_000 + i * 1_000_000, gpu_power_usage=power) + _make_record(1_000_000_000 + i * 1_000_000, nvidia_power_usage=power) ) result = gpu_telemetry_data.get_metric_result( - "gpu_power_usage", "power_tag", "GPU Power", "W" + "nvidia_power_usage", "power_tag", "GPU Power", "W" ) assert result.tag == "power_tag" @@ -860,13 +910,13 @@ def test_get_metric_result_with_time_filter( # Add records: warmup + profiling for ts, power in [(1, 50.0), (2, 100.0), (3, 120.0), (4, 80.0)]: gpu_telemetry_data.add_record( - _make_record(ts * 1_000_000_000, gpu_power_usage=power) + _make_record(ts * 1_000_000_000, nvidia_power_usage=power) ) # Exclude warmup at 1s time_filter = TimeRangeFilter(start_ns=2_000_000_000, end_ns=5_000_000_000) result = gpu_telemetry_data.get_metric_result( - "gpu_power_usage", "power_tag", "GPU Power", "W", time_filter=time_filter + "nvidia_power_usage", "power_tag", "GPU Power", "W", time_filter=time_filter ) # Stats should exclude warmup value of 50.0 @@ -882,12 +932,12 @@ def test_get_metric_result_counter_with_time_filter( # Add records: baseline + profiling for ts, energy in [(1, 1000.0), (2, 1200.0), (3, 1500.0), (4, 1800.0)]: gpu_telemetry_data.add_record( - _make_record(ts * 1_000_000_000, energy_consumption=energy) + _make_record(ts * 1_000_000_000, nvidia_energy_consumption=energy) ) time_filter = TimeRangeFilter(start_ns=2_000_000_000, end_ns=5_000_000_000) result = gpu_telemetry_data.get_metric_result( - "energy_consumption", + "nvidia_energy_consumption", "energy_tag", "Energy", "MJ", diff --git a/tests/unit/metrics/test_metrics_registry.py b/tests/unit/metrics/test_metrics_registry.py index 7418104a6..cd3a17044 100644 --- a/tests/unit/metrics/test_metrics_registry.py +++ b/tests/unit/metrics/test_metrics_registry.py @@ -88,7 +88,7 @@ def test_get_class_or_none_returns_none_for_unknown_tag(self): registries (e.g. GPU telemetry) must not raise here. """ assert MetricRegistry.get_class_or_none("definitely_not_a_real_metric") is None - assert MetricRegistry.get_class_or_none("gpu_power_usage") is None + assert MetricRegistry.get_class_or_none("nvidia_power_usage") is None def test_get_class_or_none_matches_get_class_for_known_tags(self): """For registered tags, both lookups return the same class.""" diff --git a/tests/unit/plot/test_data_loader.py b/tests/unit/plot/test_data_loader.py index 150c2962c..733bfd68d 100644 --- a/tests/unit/plot/test_data_loader.py +++ b/tests/unit/plot/test_data_loader.py @@ -1121,10 +1121,10 @@ def test_load_run_includes_gpu_telemetry(self, single_run_dir: Path) -> None: # Verify rich telemetry fields from real data expected_fields = [ "gpu_index", - "gpu_utilization", - "gpu_power_usage", - "gpu_memory_used", - "gpu_temperature", + "nvidia_gpu_utilization", + "nvidia_power_usage", + "nvidia_memory_used", + "nvidia_temperature", "sm_clock_frequency", "memory_clock_frequency", "dcgm_url", @@ -1152,10 +1152,10 @@ def test_load_gpu_telemetry_with_relative_timestamps( assert df is not None assert len(df) > 0 assert "timestamp_s" in df.columns - assert "gpu_utilization" in df.columns - assert "gpu_power_usage" in df.columns - assert "gpu_memory_used" in df.columns - assert "gpu_temperature" in df.columns + assert "nvidia_gpu_utilization" in df.columns + assert "nvidia_power_usage" in df.columns + assert "nvidia_memory_used" in df.columns + assert "nvidia_temperature" in df.columns # Check relative timestamp conversion # Note: First timestamp can be negative if telemetry started before first request @@ -1275,10 +1275,10 @@ def test_load_gpu_telemetry_flattens_nested_data( assert "hostname" in df.columns # Verify telemetry_data fields are flattened to top level - assert "gpu_utilization" in df.columns - assert "gpu_power_usage" in df.columns - assert "gpu_memory_used" in df.columns - assert "gpu_temperature" in df.columns + assert "nvidia_gpu_utilization" in df.columns + assert "nvidia_power_usage" in df.columns + assert "nvidia_memory_used" in df.columns + assert "nvidia_temperature" in df.columns assert "sm_clock_frequency" in df.columns assert "memory_clock_frequency" in df.columns diff --git a/tests/unit/plot/test_metric_names.py b/tests/unit/plot/test_metric_names.py index b926b1477..14280eec4 100644 --- a/tests/unit/plot/test_metric_names.py +++ b/tests/unit/plot/test_metric_names.py @@ -260,7 +260,7 @@ def test_adds_percentage_for_utilization_metrics(self): def test_handles_gpu_utilization_metric(self): """Test that GPU utilization metrics get percentage unit.""" - result = get_metric_display_name_with_unit("gpu_utilization") + result = get_metric_display_name_with_unit("nvidia_gpu_utilization") assert "(%)" in result or result.endswith("%)") def test_unit_in_parentheses_format(self): diff --git a/tests/unit/plot/test_png_exporter.py b/tests/unit/plot/test_png_exporter.py index 7e587237d..e5c81ede8 100644 --- a/tests/unit/plot/test_png_exporter.py +++ b/tests/unit/plot/test_png_exporter.py @@ -2154,11 +2154,15 @@ def test_empty_primary_data_raises_error( def test_metric_prep_registry(self, dual_axis_handler): """Test that the metric preparation registry contains expected functions.""" assert "throughput_tokens_per_sec" in dual_axis_handler.METRIC_PREP_FUNCTIONS + assert "nvidia_gpu_utilization" in dual_axis_handler.METRIC_PREP_FUNCTIONS assert "gpu_utilization" in dual_axis_handler.METRIC_PREP_FUNCTIONS assert callable( dual_axis_handler.METRIC_PREP_FUNCTIONS["throughput_tokens_per_sec"] ) + assert callable( + dual_axis_handler.METRIC_PREP_FUNCTIONS["nvidia_gpu_utilization"] + ) assert callable(dual_axis_handler.METRIC_PREP_FUNCTIONS["gpu_utilization"]) def test_axis_labels_from_available_metrics( diff --git a/tests/unit/plot/test_single_run_handlers.py b/tests/unit/plot/test_single_run_handlers.py index 392c8596d..54bd61163 100644 --- a/tests/unit/plot/test_single_run_handlers.py +++ b/tests/unit/plot/test_single_run_handlers.py @@ -417,7 +417,9 @@ def sample_spec(self): name="throughput_tokens_per_sec", axis="y", source=DataSource.REQUESTS ), MetricSpec( - name="gpu_utilization", axis="y2", source=DataSource.GPU_TELEMETRY + name="nvidia_gpu_utilization", + axis="y2", + source=DataSource.GPU_TELEMETRY, ), ] spec.primary_style = "area" @@ -431,7 +433,7 @@ def test_can_handle_returns_true_with_valid_gpu_telemetry( """Test can_handle returns True when GPU telemetry is available.""" mock_run = MagicMock() mock_run.gpu_telemetry = pd.DataFrame( - {"timestamp_s": [1, 2], "gpu_utilization": [80, 85]} + {"timestamp_s": [1, 2], "nvidia_gpu_utilization": [80, 85]} ) result = handler.can_handle(sample_spec, mock_run) diff --git a/tests/unit/post_processors/conftest.py b/tests/unit/post_processors/conftest.py index b7ddee0ae..e8c020dbb 100644 --- a/tests/unit/post_processors/conftest.py +++ b/tests/unit/post_processors/conftest.py @@ -828,16 +828,17 @@ def make_telemetry_record( gpu_index: int = 0, gpu_uuid: str = "GPU-test", gpu_model_name: str = "Test GPU", + platform: str = "nvidia", hostname: str = "node1", pci_bus_id: str | None = None, device: str | None = None, - gpu_power_usage: float | None = 100.0, - gpu_utilization: float | None = None, - energy_consumption: float | None = None, - gpu_memory_used: float | None = None, - gpu_temperature: float | None = None, - xid_errors: float | None = None, - power_violation: float | None = None, + nvidia_power_usage: float | None = 100.0, + nvidia_gpu_utilization: float | None = None, + nvidia_energy_consumption: float | None = None, + nvidia_memory_used: float | None = None, + nvidia_temperature: float | None = None, + nvidia_xid_errors: float | None = None, + nvidia_power_violation: float | None = None, ) -> TelemetryRecord: """Factory for creating TelemetryRecord instances with sensible defaults.""" return TelemetryRecord( @@ -846,16 +847,17 @@ def make_telemetry_record( gpu_index=gpu_index, gpu_uuid=gpu_uuid, gpu_model_name=gpu_model_name, + platform=platform, hostname=hostname, pci_bus_id=pci_bus_id, device=device, telemetry_data=TelemetryMetrics( - gpu_power_usage=gpu_power_usage, - gpu_utilization=gpu_utilization, - energy_consumption=energy_consumption, - gpu_memory_used=gpu_memory_used, - gpu_temperature=gpu_temperature, - xid_errors=xid_errors, - power_violation=power_violation, + nvidia_power_usage=nvidia_power_usage, + nvidia_gpu_utilization=nvidia_gpu_utilization, + nvidia_energy_consumption=nvidia_energy_consumption, + nvidia_memory_used=nvidia_memory_used, + nvidia_temperature=nvidia_temperature, + nvidia_xid_errors=nvidia_xid_errors, + nvidia_power_violation=nvidia_power_violation, ), ) diff --git a/tests/unit/property/_numeric_bounds_baseline.txt b/tests/unit/property/_numeric_bounds_baseline.txt index 88c2acb41..86b6fd962 100644 --- a/tests/unit/property/_numeric_bounds_baseline.txt +++ b/tests/unit/property/_numeric_bounds_baseline.txt @@ -381,18 +381,18 @@ TelemetryMetrics.amd_power: numeric field has no ge/gt/le/lt bound and is not Fi TelemetryMetrics.amd_temperature: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. TelemetryMetrics.amd_throttle_status: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. TelemetryMetrics.amd_umc_activity: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.decoder_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.encoder_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.energy_consumption: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.gpu_memory_used: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.gpu_power_usage: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.gpu_temperature: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.gpu_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.jpg_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.mem_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.power_violation: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.sm_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. -TelemetryMetrics.xid_errors: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_decoder_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_encoder_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_energy_consumption: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_memory_used: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_power_usage: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_temperature: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_gpu_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_jpg_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_memory_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_power_violation: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_sm_utilization: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. +TelemetryMetrics.nvidia_xid_errors: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. TelemetryRecord.gpu_index: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. TelemetryRecordsMessage.request_ns: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. TelemetryRecord.timestamp_ns: numeric field has no ge/gt/le/lt bound and is not FiniteFloat. Add a Pydantic numeric constraint, annotate as FiniteFloat, or add to NUMERIC_BOUNDS_WHITELIST. diff --git a/tests/unit/records/test_records_manager.py b/tests/unit/records/test_records_manager.py index 9afad6b63..6b07b800e 100644 --- a/tests/unit/records/test_records_manager.py +++ b/tests/unit/records/test_records_manager.py @@ -95,7 +95,7 @@ async def test_on_telemetry_records_valid(self): gpu_uuid="GPU-123", gpu_model_name="Test GPU", telemetry_data=TelemetryMetrics( - gpu_power_usage=100.0, + nvidia_power_usage=100.0, ), ) ] @@ -212,7 +212,7 @@ def test_telemetry_hierarchy_add_record(self): gpu_uuid="GPU-123", gpu_model_name="Test GPU", telemetry_data=TelemetryMetrics( - gpu_power_usage=100.0, + nvidia_power_usage=100.0, ), ) diff --git a/tests/unit/server/test_dcgm_faker.py b/tests/unit/server/test_dcgm_faker.py index 120e0103f..659d01d44 100644 --- a/tests/unit/server/test_dcgm_faker.py +++ b/tests/unit/server/test_dcgm_faker.py @@ -219,22 +219,22 @@ def test_faker_output_parsed_by_real_telemetry_collector(self, gpu_name): # Verify TelemetryMetrics are correctly scaled telemetry = record.telemetry_data assert telemetry is not None - assert telemetry.gpu_power_usage == approx(gpu.power, abs=0.01) - assert telemetry.gpu_utilization == approx(gpu.util, abs=0.01) - assert telemetry.gpu_temperature == approx(gpu.temp, abs=0.01) - assert telemetry.energy_consumption == approx( + assert telemetry.nvidia_power_usage == approx(gpu.power, abs=0.01) + assert telemetry.nvidia_gpu_utilization == approx(gpu.util, abs=0.01) + assert telemetry.nvidia_temperature == approx(gpu.temp, abs=0.01) + assert telemetry.nvidia_energy_consumption == approx( gpu.energy * 1e-9, abs=0.01 ) # mJ to MJ - assert telemetry.gpu_memory_used == approx( + assert telemetry.nvidia_memory_used == approx( gpu.mem_used * 1.048576 * 1e-3, abs=0.01 ) # MiB to GB - assert telemetry.xid_errors == approx(gpu.xid, abs=0.01) - assert telemetry.power_violation == approx(gpu.power_viol, abs=0.01) + assert telemetry.nvidia_xid_errors == approx(gpu.xid, abs=0.01) + assert telemetry.nvidia_power_violation == approx(gpu.power_viol, abs=0.01) # Verify values are in reasonable ranges - assert 0 <= telemetry.gpu_utilization <= 100 - assert 0 < telemetry.gpu_power_usage <= gpu.cfg.max_power_w - assert 0 < telemetry.gpu_temperature <= 100 + assert 0 <= telemetry.nvidia_gpu_utilization <= 100 + assert 0 < telemetry.nvidia_power_usage <= gpu.cfg.max_power_w + assert 0 < telemetry.nvidia_temperature <= 100 def test_load_affects_telemetry_records(self): """Test that load changes affect TelemetryRecords when parsed by real collector.""" @@ -254,10 +254,12 @@ def test_load_affects_telemetry_records(self): high_telemetry = high_records[0].telemetry_data # High load should produce higher values - assert high_telemetry.gpu_power_usage > low_telemetry.gpu_power_usage - assert high_telemetry.gpu_temperature > low_telemetry.gpu_temperature - assert high_telemetry.gpu_utilization > low_telemetry.gpu_utilization - assert high_telemetry.gpu_memory_used > low_telemetry.gpu_memory_used + assert high_telemetry.nvidia_power_usage > low_telemetry.nvidia_power_usage + assert high_telemetry.nvidia_temperature > low_telemetry.nvidia_temperature + assert ( + high_telemetry.nvidia_gpu_utilization > low_telemetry.nvidia_gpu_utilization + ) + assert high_telemetry.nvidia_memory_used > low_telemetry.nvidia_memory_used def test_metrics_clamped_to_bounds(self): """Test that all metrics are clamped to [0, max] bounds.""" @@ -274,11 +276,11 @@ def test_metrics_clamped_to_bounds(self): t = record.telemetry_data # All metrics should be non-negative - assert t.gpu_utilization >= 0 - assert t.gpu_power_usage >= 0 - assert t.gpu_temperature >= 0 - assert t.gpu_memory_used >= 0 + assert t.nvidia_gpu_utilization >= 0 + assert t.nvidia_power_usage >= 0 + assert t.nvidia_temperature >= 0 + assert t.nvidia_memory_used >= 0 # All metrics should not exceed their max values - assert t.gpu_utilization <= 100 - assert t.gpu_temperature <= 100 + assert t.nvidia_gpu_utilization <= 100 + assert t.nvidia_temperature <= 100