From e496da8e6bc9c3958e37431cadcf14e74263e25d Mon Sep 17 00:00:00 2001 From: Konstantin Shalygin Date: Mon, 28 Apr 2025 14:07:55 +0300 Subject: [PATCH] feat: add support for mining NVMe Volatile Memory Backup status This PR adds a boolean metric that reports when NVMe PLP has failed. Further use of such a drive is unsafe: if the power fails, data may be lost. Healthy device: ```console [root@host]# smartctl --json --info --capabilities --health --attributes --tolerance=verypermissive --nocheck=standby --format=brief --log=error --device=nvme /dev/nvme1 | jq .smart_status { "passed": true, "nvme": { "value": 0 } } ``` PLP failed: ```console [root@host]# smartctl --json --info --capabilities --health --attributes --tolerance=verypermissive --nocheck=standby --format=brief --log=error --device=nvme /dev/nvme0 | jq .smart_status { "passed": false, "nvme": { "value": 16, "spare_below_threshold": false, "temperature_above_or_below_threshold": false, "reliability_degraded": false, "media_read_only": false, "volatile_memory_backup_failed": true, "persistent_memory_region_unreliable": false, "other": 0 } } ``` Signed-off-by: Konstantin Shalygin --- metrics.go | 8 ++++++++ smartctl.go | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/metrics.go b/metrics.go index 0ac083f..e9c98e6 100644 --- a/metrics.go +++ b/metrics.go @@ -194,6 +194,14 @@ var ( }, nil, ) + metricDeviceVolatileMemoryBackupFailed = prometheus.NewDesc( + "smartctl_device_volatile_memory_backup_failed", + "Indicates that Volatile Memory Backup (NVMe PLP) is failed", + []string{ + "device", + }, + nil, + ) metricDeviceBytesRead = prometheus.NewDesc( "smartctl_device_bytes_read", "", diff --git a/smartctl.go b/smartctl.go index b136b8b..30b4fa5 100644 --- a/smartctl.go +++ b/smartctl.go @@ -109,6 +109,7 @@ func (smart *SMARTctl) Collect() { smart.mineNvmeCriticalWarning() smart.mineNvmeMediaErrors() smart.mineNvmeNumErrLogEntries() + smart.mineNvmeVolatileMemoryBackupFailed() smart.mineNvmeBytesRead() 
smart.mineNvmeBytesWritten() } @@ -382,6 +383,23 @@ func (smart *SMARTctl) mineNvmeNumErrLogEntries() { ) } +// mineNvmeVolatileMemoryBackupFailed exports smart_status.nvme.volatile_memory_backup_failed when smartctl reports it. +// The flag is a state that can change in either direction, so it is exposed as a gauge rather than a counter. +func (smart *SMARTctl) mineNvmeVolatileMemoryBackupFailed() { + nvmeStatus := smart.json.Get("smart_status.nvme") + if nvmeStatus.Exists() { + volatileMemoryBackupFailed := nvmeStatus.Get("volatile_memory_backup_failed") + if volatileMemoryBackupFailed.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceVolatileMemoryBackupFailed, + prometheus.GaugeValue, + volatileMemoryBackupFailed.Float(), + smart.device.device, + ) + } + } +} + // https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0d-2023.12.28-Ratified.pdf // 4.1.4.2 SMART / Health Information (02h) // The SMART / Health Information log page is as defined in the NVM Express Base Specification. For the