Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: Supraseal healthpage output #325

Merged
merged 5 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion extern/supra_seal
75 changes: 75 additions & 0 deletions lib/supraffi/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package supraffi

import "time"

// HealthInfo is the Go-side view of one NVMe controller's health report.
// Raw counters are kept as integers; time-like values are exposed as
// time.Duration.
type HealthInfo struct {
	// CriticalWarning is a bit set; test it with HasWarning and the
	// Warning* constants declared below.
	CriticalWarning byte

	// Temperature readings, documented by the original author as Celsius.
	Temperature        float64
	TemperatureSensors []float64
	WarningTempTime    time.Duration
	CriticalTempTime   time.Duration

	// Spare capacity and wear indicators.
	AvailableSpare          uint8
	AvailableSpareThreshold uint8
	PercentageUsed          uint8

	// I/O counters. The data-unit fields are described as 512-byte units.
	// NOTE(review): the NVMe SMART log reports data units in thousands of
	// 512-byte units; confirm which form the C layer delivers.
	DataUnitsRead      uint64
	DataUnitsWritten   uint64
	HostReadCommands   uint64
	HostWriteCommands  uint64
	ControllerBusyTime time.Duration

	// Power history and error counters.
	PowerCycles     uint64
	PowerOnHours    time.Duration
	UnsafeShutdowns uint64
	MediaErrors     uint64
	ErrorLogEntries uint64
}

// Bit masks for HealthInfo.CriticalWarning, one bit per condition starting
// at bit 0.
const (
	WarningSpareSpace = 1 << iota
	WarningTemperature
	WarningReliability
	WarningReadOnly
	WarningVolatileMemory
	WarningPersistentMemory
)

// HasWarning reports whether any bit of flag is set in h.CriticalWarning.
func (h *HealthInfo) HasWarning(flag byte) bool {
	return h.CriticalWarning&flag != 0
}

// GetWarnings returns a human-readable description for every warning bit
// that is currently set, ordered from the lowest bit to the highest. The
// result is nil when no known warning bit is set.
func (h *HealthInfo) GetWarnings() []string {
	descriptions := [...]struct {
		bit  byte
		text string
	}{
		{WarningSpareSpace, "available spare space has fallen below threshold"},
		{WarningTemperature, "temperature is above critical threshold"},
		{WarningReliability, "device reliability has been degraded"},
		{WarningReadOnly, "media has been placed in read only mode"},
		{WarningVolatileMemory, "volatile memory backup device has failed"},
		{WarningPersistentMemory, "persistent memory region has become read-only"},
	}

	var active []string
	for _, d := range descriptions {
		if h.HasWarning(d.bit) {
			active = append(active, d.text)
		}
	}
	return active
}
4 changes: 4 additions & 0 deletions lib/supraffi/no_supraseal.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ func GenerateMultiString(paths []Path) (string, error) {
return buffer.String(), nil
}

// GetHealthInfo is the placeholder compiled into this file's variant of the
// package. It never returns: it always panics with a message stating that
// the supraseal build tag is not enabled.
func GetHealthInfo() ([]HealthInfo, error) {
	const reason = "GetHealthInfo: supraseal build tag not enabled"
	panic(reason)
}

// Pc2 performs the pc2 operation.
func Pc2(blockOffset uint64, numSectors int, outputDir string, sectorSize uint64) int {
panic("Pc2: supraseal build tag not enabled")
Expand Down
81 changes: 81 additions & 0 deletions lib/supraffi/seal.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,37 @@ package supraffi
#include <stdbool.h>
#include "supra_seal.h"
#include <stdlib.h>

// Per-controller NVMe health record filled in by get_nvme_health_info and
// converted to the Go HealthInfo type by GetHealthInfo.
// NOTE(review): this struct is re-declared locally in the cgo preamble;
// presumably its field order and types must match the definition used by
// the supra_seal implementation - confirm against the C sources.
// Only line comments are used here because this preamble itself lives
// inside a Go block comment.
typedef struct nvme_health_info {
// Bit set of critical warnings; copied verbatim into HealthInfo.CriticalWarning.
uint8_t critical_warning;
// Composite temperature; the Go side converts it to float64 unchanged
// (unit not established here - the Go code marks it "celsius??").
int16_t temperature;
uint8_t available_spare;
uint8_t available_spare_threshold;
uint8_t percentage_used;
uint64_t data_units_read;
uint64_t data_units_written;
uint64_t host_read_commands;
uint64_t host_write_commands;
// Interpreted as minutes by the Go side.
uint64_t controller_busy_time;
uint64_t power_cycles;
// Interpreted as hours by the Go side.
uint64_t power_on_hours;
uint64_t unsafe_shutdowns;
uint64_t media_errors;
uint64_t num_error_info_log_entries;
// Both temperature-time counters are interpreted as minutes by the Go side.
uint32_t warning_temp_time;
uint32_t critical_temp_time;
// Up to eight sensor readings; the Go side drops entries equal to 0.
int16_t temp_sensors[8];
} nvme_health_info_t;

// Writes health records into health_infos, which the Go caller sizes to
// max_controllers entries. The Go caller treats the return value as the
// number of records filled in, with 0 meaning no controllers were found.
size_t get_nvme_health_info(nvme_health_info_t* health_infos, size_t max_controllers);

*/
import "C"
import (
"bytes"
"encoding/binary"
"fmt"
"time"
"unsafe"
)

Expand Down Expand Up @@ -137,6 +163,61 @@ func SupraSealInit(sectorSize uint64, configFile string) {
C.supra_seal_init(C.size_t(sectorSize), cConfigFile)
}

// GetHealthInfo retrieves health information for all NVMe devices
func GetHealthInfo() ([]HealthInfo, error) {
// Allocate space for raw C struct
const maxControllers = 64
rawInfos := make([]C.nvme_health_info_t, maxControllers)

// Get health info from C
count := C.get_nvme_health_info(
(*C.nvme_health_info_t)(unsafe.Pointer(&rawInfos[0])),
C.size_t(maxControllers),
)

if count == 0 {
return nil, fmt.Errorf("no NVMe controllers found")
}

// Convert C structs to Go structs
healthInfos := make([]HealthInfo, count)
for i := 0; i < int(count); i++ {
raw := &rawInfos[i]

// Convert temperature sensors, filtering out unused ones
sensors := make([]float64, 0, 8)
for _, temp := range raw.temp_sensors {
if temp != 0 {
sensors = append(sensors, float64(temp))
}
}

// todo likely not entirely correct
magik6k marked this conversation as resolved.
Show resolved Hide resolved
healthInfos[i] = HealthInfo{
CriticalWarning: byte(raw.critical_warning),
Temperature: float64(raw.temperature), // celsius??
TemperatureSensors: sensors,
WarningTempTime: time.Duration(raw.warning_temp_time) * time.Minute,
CriticalTempTime: time.Duration(raw.critical_temp_time) * time.Minute,
AvailableSpare: uint8(raw.available_spare),
AvailableSpareThreshold: uint8(raw.available_spare_threshold),
PercentageUsed: uint8(raw.percentage_used),
DataUnitsRead: uint64(raw.data_units_read),
DataUnitsWritten: uint64(raw.data_units_written),
HostReadCommands: uint64(raw.host_read_commands),
HostWriteCommands: uint64(raw.host_write_commands),
ControllerBusyTime: time.Duration(raw.controller_busy_time) * time.Minute,
PowerCycles: uint64(raw.power_cycles),
PowerOnHours: time.Duration(raw.power_on_hours) * time.Hour,
UnsafeShutdowns: uint64(raw.unsafe_shutdowns),
MediaErrors: uint64(raw.media_errors),
ErrorLogEntries: uint64(raw.num_error_info_log_entries),
}
}

return healthInfos, nil
}

// Pc1 performs the pc1 operation.
func Pc1(blockOffset uint64, replicaIDs [][32]byte, parentsFilename string, sectorSize uint64) int {
flatReplicaIDs := make([]byte, len(replicaIDs)*32)
Expand Down
113 changes: 111 additions & 2 deletions tasks/sealsupra/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,52 @@ import (
)

var (
phaseKey, _ = tag.NewKey("phase")
pre = "sealsupra_"
phaseKey, _ = tag.NewKey("phase")
nvmeDeviceKey, _ = tag.NewKey("nvme_device")
pre = "sealsupra_"
)

// SupraSealMeasures groups all SupraSeal metrics.
//
// The value is a single anonymous struct holding one opencensus measure per
// metric. Every measure name is built as pre + "<suffix>", so the exported
// metric names all share the same prefix.
var SupraSealMeasures = struct {
// Phase lock measures.
PhaseLockCount *stats.Int64Measure
PhaseWaitingCount *stats.Int64Measure
PhaseAvgDuration *stats.Float64Measure

// NVMe Health measures
NVMeTemperature *stats.Float64Measure
NVMeAvailableSpare *stats.Int64Measure
NVMePercentageUsed *stats.Int64Measure
NVMePowerCycles *stats.Int64Measure
NVMePowerOnHours *stats.Float64Measure
NVMeUnsafeShutdowns *stats.Int64Measure
NVMeMediaErrors *stats.Int64Measure
NVMeErrorLogEntries *stats.Int64Measure
NVMeCriticalWarning *stats.Int64Measure

// NVMe I/O measures: byte totals and command counts.
NVMeBytesRead *stats.Int64Measure
NVMeBytesWritten *stats.Int64Measure
NVMeReadIO *stats.Int64Measure
NVMeWriteIO *stats.Int64Measure
}{
PhaseLockCount: stats.Int64(pre+"phase_lock_count", "Number of active locks in each phase", stats.UnitDimensionless),
PhaseWaitingCount: stats.Int64(pre+"phase_waiting_count", "Number of goroutines waiting for a phase lock", stats.UnitDimensionless),
PhaseAvgDuration: stats.Float64(pre+"phase_avg_duration", "Average duration of each phase in seconds", stats.UnitSeconds),

// NVMe Health measures
// Note: temperature and power-on hours are declared dimensionless; their
// units are conveyed only by the measure name and description.
NVMeTemperature: stats.Float64(pre+"nvme_temperature_celsius", "NVMe Temperature in Celsius", stats.UnitDimensionless),
NVMeAvailableSpare: stats.Int64(pre+"nvme_available_spare", "NVMe Available Spare", stats.UnitDimensionless),
NVMePercentageUsed: stats.Int64(pre+"nvme_percentage_used", "NVMe Percentage Used", stats.UnitDimensionless),
NVMePowerCycles: stats.Int64(pre+"nvme_power_cycles", "NVMe Power Cycles", stats.UnitDimensionless),
NVMePowerOnHours: stats.Float64(pre+"nvme_power_on_hours", "NVMe Power On Hours", stats.UnitDimensionless),
NVMeUnsafeShutdowns: stats.Int64(pre+"nvme_unsafe_shutdowns", "NVMe Unsafe Shutdowns", stats.UnitDimensionless),
NVMeMediaErrors: stats.Int64(pre+"nvme_media_errors", "NVMe Media Errors", stats.UnitDimensionless),
NVMeErrorLogEntries: stats.Int64(pre+"nvme_error_log_entries", "NVMe Error Log Entries", stats.UnitDimensionless),
NVMeCriticalWarning: stats.Int64(pre+"nvme_critical_warning", "NVMe Critical Warning Flags", stats.UnitDimensionless),

// NVMe I/O measures; the byte counters use stats.UnitBytes.
NVMeBytesRead: stats.Int64(pre+"nvme_bytes_read", "NVMe Bytes Read", stats.UnitBytes),
NVMeBytesWritten: stats.Int64(pre+"nvme_bytes_written", "NVMe Bytes Written", stats.UnitBytes),
NVMeReadIO: stats.Int64(pre+"nvme_read_io", "NVMe Read IOs", stats.UnitDimensionless),
NVMeWriteIO: stats.Int64(pre+"nvme_write_io", "NVMe Write IOs", stats.UnitDimensionless),
}

// init registers the views for SupraSeal metrics.
Expand All @@ -40,6 +73,82 @@ func init() {
Aggregation: view.LastValue(),
TagKeys: []tag.Key{phaseKey},
},
// NVMe Health views
&view.View{
Measure: SupraSealMeasures.NVMeTemperature,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeAvailableSpare,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePercentageUsed,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerCycles,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerOnHours,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeUnsafeShutdowns,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeMediaErrors,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeErrorLogEntries,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeCriticalWarning,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesRead,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesWritten,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
)
if err != nil {
panic(err)
Expand Down
Loading