Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 48 additions & 8 deletions deployment/console-plugin-nvidia-gpu/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,54 @@ metadata:
{{- include "console-plugin-nvidia-gpu.labels" . | nindent 4 }}
data:
dcgm-metrics.csv: |
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
# === Added by the console plugin ===
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, gauge, power mgmt limit.
DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
DCGM_FI_DEV_SM_CLOCK, gauge, sm clock.
DCGM_FI_DEV_MAX_SM_CLOCK, gauge, max sm clock.
DCGM_FI_DEV_MEM_CLOCK, gauge, mem clock.
DCGM_FI_DEV_MAX_MEM_CLOCK, gauge, max mem clock.

# === Available by default ===
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

# Power
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).

# PCIE
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).

# NVLink
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.

# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# DCP metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data.
DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.
11 changes: 5 additions & 6 deletions src/components/GPUDashboard/Cards/GPUDashboardGraphs.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import {
humanizeDegrees,
humanizeHertz,
humanizePercentage,
humanizeRatio,
humanizeWatts,
} from '../../../utils/units';
import { useTranslation } from '../../../i18n';
Expand All @@ -17,7 +16,7 @@ import { useTranslation } from '../../../i18n';
//

/*
these are ok:
these are ok:
DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
DCGM_FI_DEV_POWER_USAGE, gauge, power usage.
DCGM_FI_DEV_GPU_TEMP, gauge, gpu temp.
Expand All @@ -32,7 +31,7 @@ import { useTranslation } from '../../../i18n';
*/

/* Used metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, gpu utilization.
Copy link
Member

@mresvanis mresvanis May 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC/IIRC (if I understand/recall correctly), we chose DCGM_FI_PROF_GR_ENGINE_ACTIVE because it is also available when using MIG, while DCGM_FI_DEV_GPU_UTIL is not.

If we would like to drop MIG support in favor of supporting other NVIDIA GPU models, then no objection on my part (whatever makes more sense business-wise).

DCGM_FI_DEV_GPU_UTIL, gauge, gpu utilization.
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, mem utilization.
DCGM_FI_DEV_ENC_UTIL, gauge, enc utilization.
DCGM_FI_DEV_DEC_UTIL, gauge, dec utilization.
Expand All @@ -57,9 +56,9 @@ export const GPUDashboardGraphs: React.FC = () => {
ariaTitle={t('Donut GPU utilization')}
ariaRangeTitle={t('GPU utilization over time')}
ariaDesc={t('Sparkline GPU utilization')}
query={`sum(DCGM_FI_PROF_GR_ENGINE_ACTIVE{UUID="${selectedGPU?.uuid}"})`}
maxDomain={1}
humanize={humanizeRatio}
query={`sum(DCGM_FI_DEV_GPU_UTIL{UUID="${selectedGPU?.uuid}"})`}
maxDomain={100}
humanize={humanizePercentage}
/>
</GridItem>
<GridItem span={6} lg={3}>
Expand Down
2 changes: 1 addition & 1 deletion src/components/GPUDashboard/Cards/WorkloadsCard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ const WorkloadsCard: React.FC = () => {
const [gpuMemoryMetrics, gpuMetricsLoaded, gpuMetricsError] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY_RANGE,
query:
'sum (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)',
'sum (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"}) by (exported_namespace, exported_pod, UUID)',
timespan: ONE_DAY,
});

Expand Down
2 changes: 1 addition & 1 deletion src/hooks/use-gpus-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export type GPUInfo = {
export const useGPUsInfo = (): [GPUInfo[], /* loaded */ boolean, /* error */ unknown] => {
const [result, loaded, error] = usePrometheusPoll({
endpoint: PrometheusEndpoint.QUERY,
query: 'DCGM_FI_PROF_GR_ENGINE_ACTIVE',
query: 'DCGM_FI_DEV_GPU_UTIL',
});

const gpus = useDeepCompareMemoize(
Expand Down
4 changes: 2 additions & 2 deletions src/utils/cluster-overview.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { GetQuery } from '@openshift-console/dynamic-plugin-sdk';

export const getGPUUtilizationQuery: GetQuery = () =>
'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"})) or vector(0)';
'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+"})) or vector(0)';
export const getGPUTotalUtilizationQuery: GetQuery = () =>
'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE)) or vector(0)';
'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_GPU_UTIL)) or vector(0)';

export const getPowerUsageUtilizationQuery: GetQuery = () =>
'sum(max by (UUID) (DCGM_FI_DEV_POWER_USAGE))';
Expand Down
2 changes: 1 addition & 1 deletion src/utils/project-overview.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export const getGPUPodsQuery = (project: string) =>
`count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`;
`count((kube_pod_status_phase > 0) * on(pod) group_left(gpu,device,instance,modelName) label_replace(DCGM_FI_DEV_GPU_UTIL{exported_pod=~".+", exported_namespace="${project}"}, "pod", "$1", "exported_pod", "(.*)"))`;
Loading