From 5303e0a6a7171a2172db950e817669f12b5c3687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Magiera?= Date: Fri, 15 Nov 2024 18:37:58 +0100 Subject: [PATCH] wip: Supraseal healthpage output (#325) * supraffi: healthpage api * wire in health page getter * batchseal: Report nvme metrics to prometheus * make gen * batchseal: Fix data units --- extern/supra_seal | 2 +- lib/supraffi/common.go | 75 ++++++++++++++++++++ lib/supraffi/no_supraseal.go | 4 ++ lib/supraffi/seal.go | 81 +++++++++++++++++++++ tasks/sealsupra/metrics.go | 113 +++++++++++++++++++++++++++++- tasks/sealsupra/task_supraseal.go | 73 +++++++++++++++++++ 6 files changed, 345 insertions(+), 3 deletions(-) create mode 100644 lib/supraffi/common.go diff --git a/extern/supra_seal b/extern/supra_seal index d7591837c..4b5641401 160000 --- a/extern/supra_seal +++ b/extern/supra_seal @@ -1 +1 @@ -Subproject commit d7591837c662bb3996218ca1290e915b8012aa9e +Subproject commit 4b5641401318d37906e56e2dbc61b3ec9d0a9257 diff --git a/lib/supraffi/common.go b/lib/supraffi/common.go new file mode 100644 index 000000000..6b952171f --- /dev/null +++ b/lib/supraffi/common.go @@ -0,0 +1,75 @@ +package supraffi + +import "time" + +// HealthInfo represents NVMe device health information in a more Go-friendly format +type HealthInfo struct { + // Critical warning flags + CriticalWarning byte + + // Temperature information in Celsius + Temperature float64 + TemperatureSensors []float64 + WarningTempTime time.Duration + CriticalTempTime time.Duration + + // Reliability metrics + AvailableSpare uint8 + AvailableSpareThreshold uint8 + PercentageUsed uint8 + + // Usage statistics + DataUnitsRead uint64 // in thousands of 512-byte units (1 unit = 512,000 bytes) + DataUnitsWritten uint64 // in thousands of 512-byte units (1 unit = 512,000 bytes) + HostReadCommands uint64 + HostWriteCommands uint64 + ControllerBusyTime time.Duration + + // Power and error statistics + PowerCycles uint64 + PowerOnHours time.Duration + UnsafeShutdowns uint64 + MediaErrors uint64 + ErrorLogEntries uint64 +} + +// Helper 
methods for interpreting critical warning flags +const ( + WarningSpareSpace = 1 << 0 + WarningTemperature = 1 << 1 + WarningReliability = 1 << 2 + WarningReadOnly = 1 << 3 + WarningVolatileMemory = 1 << 4 + WarningPersistentMemory = 1 << 5 +) + +// HasWarning checks if a specific warning flag is set +func (h *HealthInfo) HasWarning(flag byte) bool { + return (h.CriticalWarning & flag) != 0 +} + +// GetWarnings returns a slice of active warning descriptions +func (h *HealthInfo) GetWarnings() []string { + var warnings []string + + if h.HasWarning(WarningSpareSpace) { + warnings = append(warnings, "available spare space has fallen below threshold") + } + if h.HasWarning(WarningTemperature) { + warnings = append(warnings, "temperature is above critical threshold") + } + if h.HasWarning(WarningReliability) { + warnings = append(warnings, "device reliability has been degraded") + } + if h.HasWarning(WarningReadOnly) { + warnings = append(warnings, "media has been placed in read only mode") + } + if h.HasWarning(WarningVolatileMemory) { + warnings = append(warnings, "volatile memory backup device has failed") + } + if h.HasWarning(WarningPersistentMemory) { + warnings = append(warnings, "persistent memory region has become read-only") + } + + return warnings +} diff --git a/lib/supraffi/no_supraseal.go b/lib/supraffi/no_supraseal.go index 5e2cfd293..f87749e3e 100644 --- a/lib/supraffi/no_supraseal.go +++ b/lib/supraffi/no_supraseal.go @@ -47,6 +47,10 @@ func GenerateMultiString(paths []Path) (string, error) { return buffer.String(), nil } +func GetHealthInfo() ([]HealthInfo, error) { + panic("GetHealthInfo: supraseal build tag not enabled") +} + // Pc2 performs the pc2 operation. 
func Pc2(blockOffset uint64, numSectors int, outputDir string, sectorSize uint64) int { panic("Pc2: supraseal build tag not enabled") diff --git a/lib/supraffi/seal.go b/lib/supraffi/seal.go index 2af840d22..81abf5cce 100644 --- a/lib/supraffi/seal.go +++ b/lib/supraffi/seal.go @@ -9,11 +9,37 @@ package supraffi #include #include "supra_seal.h" #include + +typedef struct nvme_health_info { + uint8_t critical_warning; + int16_t temperature; + uint8_t available_spare; + uint8_t available_spare_threshold; + uint8_t percentage_used; + uint64_t data_units_read; + uint64_t data_units_written; + uint64_t host_read_commands; + uint64_t host_write_commands; + uint64_t controller_busy_time; + uint64_t power_cycles; + uint64_t power_on_hours; + uint64_t unsafe_shutdowns; + uint64_t media_errors; + uint64_t num_error_info_log_entries; + uint32_t warning_temp_time; + uint32_t critical_temp_time; + int16_t temp_sensors[8]; + } nvme_health_info_t; + +size_t get_nvme_health_info(nvme_health_info_t* health_infos, size_t max_controllers); + */ import "C" import ( "bytes" "encoding/binary" + "fmt" + "time" "unsafe" ) @@ -137,6 +163,61 @@ func SupraSealInit(sectorSize uint64, configFile string) { C.supra_seal_init(C.size_t(sectorSize), cConfigFile) } +// GetHealthInfo retrieves health information for all NVMe devices +func GetHealthInfo() ([]HealthInfo, error) { + // Allocate space for raw C struct + const maxControllers = 64 + rawInfos := make([]C.nvme_health_info_t, maxControllers) + + // Get health info from C + count := C.get_nvme_health_info( + (*C.nvme_health_info_t)(unsafe.Pointer(&rawInfos[0])), + C.size_t(maxControllers), + ) + + if count == 0 { + return nil, fmt.Errorf("no NVMe controllers found") + } + + // Convert C structs to Go structs + healthInfos := make([]HealthInfo, count) + for i := 0; i < int(count); i++ { + raw := &rawInfos[i] + + // Convert temperature sensors, filtering out unused ones + sensors := make([]float64, 0, 8) + for _, temp := range raw.temp_sensors { 
+ if temp != 0 { + sensors = append(sensors, float64(temp)) + } + } + + // todo likely not entirely correct + healthInfos[i] = HealthInfo{ + CriticalWarning: byte(raw.critical_warning), + Temperature: float64(raw.temperature), // celsius?? + TemperatureSensors: sensors, + WarningTempTime: time.Duration(raw.warning_temp_time) * time.Minute, + CriticalTempTime: time.Duration(raw.critical_temp_time) * time.Minute, + AvailableSpare: uint8(raw.available_spare), + AvailableSpareThreshold: uint8(raw.available_spare_threshold), + PercentageUsed: uint8(raw.percentage_used), + DataUnitsRead: uint64(raw.data_units_read), + DataUnitsWritten: uint64(raw.data_units_written), + HostReadCommands: uint64(raw.host_read_commands), + HostWriteCommands: uint64(raw.host_write_commands), + ControllerBusyTime: time.Duration(raw.controller_busy_time) * time.Minute, + PowerCycles: uint64(raw.power_cycles), + PowerOnHours: time.Duration(raw.power_on_hours) * time.Hour, + UnsafeShutdowns: uint64(raw.unsafe_shutdowns), + MediaErrors: uint64(raw.media_errors), + ErrorLogEntries: uint64(raw.num_error_info_log_entries), + } + } + + return healthInfos, nil +} + // Pc1 performs the pc1 operation. func Pc1(blockOffset uint64, replicaIDs [][32]byte, parentsFilename string, sectorSize uint64) int { flatReplicaIDs := make([]byte, len(replicaIDs)*32) diff --git a/tasks/sealsupra/metrics.go b/tasks/sealsupra/metrics.go index 967ee5bbb..bf3fb93a1 100644 --- a/tasks/sealsupra/metrics.go +++ b/tasks/sealsupra/metrics.go @@ -7,8 +7,9 @@ import ( ) var ( - phaseKey, _ = tag.NewKey("phase") - pre = "sealsupra_" + phaseKey, _ = tag.NewKey("phase") + nvmeDeviceKey, _ = tag.NewKey("nvme_device") + pre = "sealsupra_" ) // SupraSealMeasures groups all SupraSeal metrics. 
@@ -16,10 +17,42 @@ var SupraSealMeasures = struct { PhaseLockCount *stats.Int64Measure PhaseWaitingCount *stats.Int64Measure PhaseAvgDuration *stats.Float64Measure + + // NVMe Health measures + NVMeTemperature *stats.Float64Measure + NVMeAvailableSpare *stats.Int64Measure + NVMePercentageUsed *stats.Int64Measure + NVMePowerCycles *stats.Int64Measure + NVMePowerOnHours *stats.Float64Measure + NVMeUnsafeShutdowns *stats.Int64Measure + NVMeMediaErrors *stats.Int64Measure + NVMeErrorLogEntries *stats.Int64Measure + NVMeCriticalWarning *stats.Int64Measure + + NVMeBytesRead *stats.Int64Measure + NVMeBytesWritten *stats.Int64Measure + NVMeReadIO *stats.Int64Measure + NVMeWriteIO *stats.Int64Measure }{ PhaseLockCount: stats.Int64(pre+"phase_lock_count", "Number of active locks in each phase", stats.UnitDimensionless), PhaseWaitingCount: stats.Int64(pre+"phase_waiting_count", "Number of goroutines waiting for a phase lock", stats.UnitDimensionless), PhaseAvgDuration: stats.Float64(pre+"phase_avg_duration", "Average duration of each phase in seconds", stats.UnitSeconds), + + // NVMe Health measures + NVMeTemperature: stats.Float64(pre+"nvme_temperature_celsius", "NVMe Temperature in Celsius", stats.UnitDimensionless), + NVMeAvailableSpare: stats.Int64(pre+"nvme_available_spare", "NVMe Available Spare", stats.UnitDimensionless), + NVMePercentageUsed: stats.Int64(pre+"nvme_percentage_used", "NVMe Percentage Used", stats.UnitDimensionless), + NVMePowerCycles: stats.Int64(pre+"nvme_power_cycles", "NVMe Power Cycles", stats.UnitDimensionless), + NVMePowerOnHours: stats.Float64(pre+"nvme_power_on_hours", "NVMe Power On Hours", stats.UnitDimensionless), + NVMeUnsafeShutdowns: stats.Int64(pre+"nvme_unsafe_shutdowns", "NVMe Unsafe Shutdowns", stats.UnitDimensionless), + NVMeMediaErrors: stats.Int64(pre+"nvme_media_errors", "NVMe Media Errors", stats.UnitDimensionless), + NVMeErrorLogEntries: stats.Int64(pre+"nvme_error_log_entries", "NVMe Error Log Entries", 
stats.UnitDimensionless), + NVMeCriticalWarning: stats.Int64(pre+"nvme_critical_warning", "NVMe Critical Warning Flags", stats.UnitDimensionless), + + NVMeBytesRead: stats.Int64(pre+"nvme_bytes_read", "NVMe Bytes Read", stats.UnitBytes), + NVMeBytesWritten: stats.Int64(pre+"nvme_bytes_written", "NVMe Bytes Written", stats.UnitBytes), + NVMeReadIO: stats.Int64(pre+"nvme_read_io", "NVMe Read IOs", stats.UnitDimensionless), + NVMeWriteIO: stats.Int64(pre+"nvme_write_io", "NVMe Write IOs", stats.UnitDimensionless), } // init registers the views for SupraSeal metrics. @@ -40,6 +73,72 @@ func init() { Aggregation: view.LastValue(), TagKeys: []tag.Key{phaseKey}, }, + // NVMe Health views + &view.View{ + Measure: SupraSealMeasures.NVMeTemperature, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeAvailableSpare, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMePercentageUsed, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMePowerCycles, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMePowerOnHours, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeUnsafeShutdowns, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeMediaErrors, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeErrorLogEntries, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeCriticalWarning, + Aggregation: view.LastValue(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeBytesRead, + Aggregation: 
view.Sum(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeBytesWritten, + Aggregation: view.Sum(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeReadIO, + Aggregation: view.Sum(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, + &view.View{ + Measure: SupraSealMeasures.NVMeWriteIO, + Aggregation: view.Sum(), + TagKeys: []tag.Key{nvmeDeviceKey}, + }, ) if err != nil { panic(err) diff --git a/tasks/sealsupra/task_supraseal.go b/tasks/sealsupra/task_supraseal.go index 3db7e1791..157eff756 100644 --- a/tasks/sealsupra/task_supraseal.go +++ b/tasks/sealsupra/task_supraseal.go @@ -11,6 +11,8 @@ import ( logging "github.com/ipfs/go-log/v2" "github.com/snadrus/must" + "go.opencensus.io/stats" + "go.opencensus.io/tag" "golang.org/x/xerrors" "github.com/filecoin-project/go-address" @@ -104,6 +106,77 @@ func NewSupraSeal(sectorSize string, batchSize, pipelines int, dualHashers bool, supraffi.SupraSealInit(uint64(ssize), configFile) log.Infow("supraseal init done") + { + hp, err := supraffi.GetHealthInfo() + if err != nil { + return nil, xerrors.Errorf("get health page: %w", err) + } + + log.Infow("nvme health page", "hp", hp) + } + + // Initialize previous health infos slice + prevHealthInfos := make([]supraffi.HealthInfo, len(nvmeDevices)) + + go func() { + const intervalSeconds = 30 + ticker := time.NewTicker(time.Duration(intervalSeconds) * time.Second) + defer ticker.Stop() + + for range ticker.C { + healthInfos, err := supraffi.GetHealthInfo() + if err != nil { + log.Errorw("health page get error", "error", err) + continue + } + + for i, hi := range healthInfos { + if i >= len(nvmeDevices) { + log.Warnw("More health info entries than nvme 
devices", "index", i) + break + } + deviceName := nvmeDevices[i] + + ctx, err := tag.New( + context.Background(), + tag.Insert(nvmeDeviceKey, deviceName), + ) + if err != nil { + log.Errorw("Failed to create context with tags", "error", err) + continue + } + + // Record the metrics + stats.Record(ctx, SupraSealMeasures.NVMeTemperature.M(hi.Temperature)) + stats.Record(ctx, SupraSealMeasures.NVMeAvailableSpare.M(int64(hi.AvailableSpare))) + stats.Record(ctx, SupraSealMeasures.NVMePercentageUsed.M(int64(hi.PercentageUsed))) + stats.Record(ctx, SupraSealMeasures.NVMePowerCycles.M(int64(hi.PowerCycles))) + stats.Record(ctx, SupraSealMeasures.NVMePowerOnHours.M(hi.PowerOnHours.Hours())) + stats.Record(ctx, SupraSealMeasures.NVMeUnsafeShutdowns.M(int64(hi.UnsafeShutdowns))) + stats.Record(ctx, SupraSealMeasures.NVMeMediaErrors.M(int64(hi.MediaErrors))) + stats.Record(ctx, SupraSealMeasures.NVMeErrorLogEntries.M(int64(hi.ErrorLogEntries))) + stats.Record(ctx, SupraSealMeasures.NVMeCriticalWarning.M(int64(hi.CriticalWarning))) + + // For counters, record the delta since the previous sample; skip the first sample and any sample where a counter went backwards, because the unsigned subtraction below would wrap and corrupt the summed metric + if prev := prevHealthInfos[i]; prev.DataUnitsRead != 0 && hi.DataUnitsRead >= prev.DataUnitsRead && hi.DataUnitsWritten >= prev.DataUnitsWritten && hi.HostReadCommands >= prev.HostReadCommands && hi.HostWriteCommands >= prev.HostWriteCommands { + dataUnitsReadBytes := int64((hi.DataUnitsRead - prevHealthInfos[i].DataUnitsRead) * 512_000) + dataUnitsWrittenBytes := int64((hi.DataUnitsWritten - prevHealthInfos[i].DataUnitsWritten) * 512_000) + hostReadCommands := int64(hi.HostReadCommands - prevHealthInfos[i].HostReadCommands) + hostWriteCommands := int64(hi.HostWriteCommands - prevHealthInfos[i].HostWriteCommands) + + // Record the diffs and computed metrics + stats.Record(ctx, SupraSealMeasures.NVMeBytesRead.M(dataUnitsReadBytes)) + stats.Record(ctx, SupraSealMeasures.NVMeBytesWritten.M(dataUnitsWrittenBytes)) + stats.Record(ctx, SupraSealMeasures.NVMeReadIO.M(hostReadCommands)) + stats.Record(ctx, SupraSealMeasures.NVMeWriteIO.M(hostWriteCommands)) + } + + // Update previous health info + prevHealthInfos[i] = hi + } + } + }() + // Get maximum block offset (essentially the number of 
pages in the smallest nvme device) space := supraffi.GetMaxBlockOffset(uint64(ssize))