From 428a6072291664ceed36da7ecde34a051bf227f2 Mon Sep 17 00:00:00 2001 From: karencfv Date: Mon, 5 Jan 2026 21:30:56 +1300 Subject: [PATCH 01/12] [inventory] Add svcs in maintenance to DB --- Cargo.lock | 1 + illumos-utils/src/svcs.rs | 5 +- nexus/db-model/Cargo.toml | 1 + nexus/db-model/src/inventory.rs | 46 ++++++++++++++++- nexus/db-model/src/schema_versions.rs | 3 +- .../db-queries/src/db/datastore/inventory.rs | 51 +++++++++++++++++++ nexus/db-schema/src/schema.rs | 12 +++++ schema/crdb/dbinit.sql | 25 +++++++++ .../up01.sql | 24 +++++++++ 9 files changed, 163 insertions(+), 5 deletions(-) create mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up01.sql diff --git a/Cargo.lock b/Cargo.lock index e76aed1376c..8b306e311f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6764,6 +6764,7 @@ dependencies = [ "expectorate", "hex", "iddqd", + "illumos-utils", "ipnetwork", "itertools 0.14.0", "macaddr", diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 17b68412379..42abf8b4bf5 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -195,12 +195,13 @@ impl From for SvcState { } } +// TODO-K: Ugh, I think this might need to be versioned and moved out of here? #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running pub struct SvcInMaintenance { - fmri: String, - zone: String, + pub fmri: String, + pub zone: String, } impl SvcInMaintenance { diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index 9762bccbe76..1c30231bf53 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -21,6 +21,7 @@ derive-where.workspace = true diesel = { workspace = true, features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } hex.workspace = true iddqd.workspace = true +illumos-utils.workspace = true ipnetwork.workspace = true itertools.workspace = true macaddr.workspace = true diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 8caf04aadcf..bc6af948e03 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -27,14 +27,16 @@ use diesel::pg::Pg; use diesel::serialize::ToSql; use diesel::{serialize, sql_types}; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcInMaintenance; use ipnetwork::IpNetwork; use nexus_db_schema::schema::inv_zone_manifest_non_boot; use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, - inv_internal_dns, inv_last_reconciliation_dataset_result, + inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot, + inv_host_phase_1_flash_hash, inv_internal_dns, + inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot, @@ -1012,6 +1014,46 @@ impl_enum_type!( Idle => b"idle" ); +// TODO-K: add docs and move type elsewhere? 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_health_monitor_svc_in_maintenance)]
+pub struct InvSvcInMaintenance {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_id: DbTypedUuid<SledKind>,
+    pub fmri: Option<String>,
+    pub zone: Option<String>,
+    pub error_messages: Vec<String>,
+    // TODO-K: Check if this needs to be an option
+    pub time_of_status: Option<DateTime<Utc>>,
+}
+
+impl InvSvcInMaintenance {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        svc: Option<SvcInMaintenance>,
+        svc_errors: Vec<String>,
+        time_of_status: Option<DateTime<Utc>>,
+        // TODO-K: Does this need to be here? or is it OK to bunch up all the
+        // errors in one place?
+        //svcs_cmd_error: Option<String>,
+    ) -> Self {
+        let (fmri, zone) = match svc {
+            Some(svc) => (Some(svc.fmri), Some(svc.zone)),
+            None => (None, None),
+        };
+
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_id: sled_id.into(),
+            fmri,
+            zone,
+            error_messages: svc_errors,
+            time_of_status,
+        }
+    }
+}
+
 /// See [`sled_agent_types::inventory::ConfigReconcilerInventory`].
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
 #[diesel(table_name = inv_sled_config_reconciler)]
 pub struct InvSledConfigReconciler {
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 293b6086c6b..4c9434e0c7a 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(217, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(218, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     // | leaving the first copy as an example for the next person.
    // v
    // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(218, "health-monitor-svcs-in-maintenance"),
     KnownVersion::new(217, "multiple-default-ip-pools-per-silo"),
     KnownVersion::new(216, "add-trust-quorum"),
     KnownVersion::new(215, "support-up-to-12-disks"),
diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs
index ccf453c5cd7..137662d22ad 100644
--- a/nexus/db-queries/src/db/datastore/inventory.rs
+++ b/nexus/db-queries/src/db/datastore/inventory.rs
@@ -60,6 +60,7 @@ use nexus_db_model::InvServiceProcessor;
 use nexus_db_model::InvSledAgent;
 use nexus_db_model::InvSledBootPartition;
 use nexus_db_model::InvSledConfigReconciler;
+use nexus_db_model::InvSvcInMaintenance;
 use nexus_db_model::InvZpool;
 use nexus_db_model::RotImageError;
 use nexus_db_model::SledRole;
@@ -206,6 +207,33 @@ impl DataStore {
             }
         }
 
+        // TODO-K: Clean up
+        // Pull services in maintenance out of all sled agents
+        let mut svcs_in_maintenance = vec![];
+
+        for sled_agent in &collection.sled_agents {
+            match &sled_agent.health_monitor.smf_services_in_maintenance {
+                Ok(svcs) => {
+                    for svc in &svcs.services {
+                        svcs_in_maintenance.push(InvSvcInMaintenance::new(
+                            collection_id,
+                            sled_agent.sled_id,
+                            Some(svc.clone()),
+                            svcs.errors.clone(),
+                            svcs.time_of_status,
+                        ));
+                    }
+                }
+                Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new(
+                    collection_id,
+                    sled_agent.sled_id,
+                    None,
+                    vec![e.to_string()],
+                    None,
+                )),
+            }
+        }
+
         // Pull disks out of all sled agents
         let disks: Vec<_> = collection
             .sled_agents
@@ -1394,6 +1422,25 @@ impl DataStore {
             }
         }
 
+        // Insert rows for all the unhealthy services we found
+        {
+            use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut svcs_in_maintenance = svcs_in_maintenance.into_iter();
+            loop {
+                let some_svcs_in_maintenance =
+                    svcs_in_maintenance.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_svcs_in_maintenance.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance)
+                    .values(some_svcs_in_maintenance)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
         // Insert rows for the sled agents that we found. In practice, we'd
         // expect these to all have baseboards (if using Oxide hardware) or
         // none have baseboards (if not).
@@ -2223,6 +2270,8 @@
             .await?
         };
 
+        // TODO-K: Remove rows for health monitor
+
         Ok(NumRowsDeleted {
             ncollections,
             nsps,
@@ -2299,6 +2348,7 @@
             "ncockroach_status" => ncockroach_status,
             "nntp_timesync" => nntp_timesync,
             "ninternal_dns" => ninternal_dns,
+            // TODO-K: add health monitor rows here too
         );
 
         Ok(())
@@ -2587,6 +2637,7 @@
             disk_firmware
         };
 
+        // TODO-K: Take inspiration here
         // Mapping of "Sled ID" -> "All disks reported by that sled"
         let physical_disks: BTreeMap<
             SledUuid,
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs
index de626e5c64f..c471c6e8981 100644
--- a/nexus/db-schema/src/schema.rs
+++ b/nexus/db-schema/src/schema.rs
@@ -1716,6 +1716,18 @@ table! {
     }
 }
 
+table! {
+    inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id) {
+        inv_collection_id -> Uuid,
+        sled_id -> Uuid,
+        fmri -> Nullable<Text>,
+        zone -> Nullable<Text>,
+
+        error_messages -> Nullable<Array<Text>>,
+        time_of_status -> Nullable<Timestamptz>,
+    }
+}
+
 table!
{ inv_sled_boot_partition (inv_collection_id, sled_id, boot_disk_slot) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 29efbfcfb3b..31eaa0330be 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4021,6 +4021,31 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + -- any error messages found when retrieving the SMF services in maintenance + error_messages TEXT ARRAY, + + -- time when the status was checked if applicable + valid_until TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id) +) + -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql new file mode 100644 index 00000000000..71680604482 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -0,0 +1,24 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + -- any error messages found when retrieving the SMF services in maintenance + error_messages TEXT ARRAY, + + -- time when the status was checked if applicable + valid_until TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id) +) \ No newline at end of file From 64852eaa5831a0a827581fa94c499960d03c01ad Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 14:54:36 +1300 Subject: [PATCH 02/12] insert rows --- nexus/db-model/src/inventory.rs | 17 +++++-- .../db-queries/src/db/datastore/inventory.rs | 45 ++++++++++++++++++- nexus/db-schema/src/schema.rs | 4 +- schema/crdb/dbinit.sql | 16 ++++--- .../up01.sql | 14 ++++-- sled-agent/src/long_running_tasks.rs | 3 +- sled-agent/src/sim/sled_agent.rs | 4 +- uuid-kinds/src/lib.rs | 3 ++ 8 files changed, 90 insertions(+), 16 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index bc6af948e03..e3e2045b256 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -9,6 +9,7 @@ use crate::Generation; use crate::PhysicalDiskKind; use crate::omicron_zone_config::{self, OmicronZoneNic}; use crate::sled_cpu_family::SledCpuFamily; +use crate::to_db_typed_uuid; use crate::typed_uuid::DbTypedUuid; use crate::{ ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32, @@ -63,6 +64,7 @@ use 
omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -72,6 +74,8 @@ use omicron_uuid_kinds::OmicronSledConfigUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledKind; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::SvcInMaintenanceKind; +use omicron_uuid_kinds::SvcInMaintenanceUuid; use omicron_uuid_kinds::ZpoolKind; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind}; use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; @@ -1020,9 +1024,11 @@ impl_enum_type!( pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, + pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, pub error_messages: Vec, + pub svcs_cmd_error: Option, // TODO-K: Check if this needs to be an option pub time_of_status: Option>, } @@ -1033,22 +1039,27 @@ impl InvSvcInMaintenance { sled_id: SledUuid, svc: Option, svc_errors: Vec, + svcs_cmd_error: Option, time_of_status: Option>, - // TODO-K: Does this need to be here? or is it OK to bunch up all the - // errors in one place? - //svcs_cmd_error: Option, ) -> Self { let (fmri, zone) = match svc { Some(svc) => (Some(svc.fmri), Some(svc.zone)), None => (None, None), }; + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + Self { inv_collection_id: inv_collection_id.into(), sled_id: sled_id.into(), + id, fmri, zone, error_messages: svc_errors, + svcs_cmd_error, time_of_status, } } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 137662d22ad..63869ffe42c 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -220,6 +220,7 @@ impl DataStore { sled_agent.sled_id, Some(svc.clone()), svcs.errors.clone(), + None, svcs.time_of_status, )); } @@ -228,7 +229,8 @@ impl DataStore { collection_id, sled_agent.sled_id, None, - vec![e.to_string()], + vec![], + Some(e.to_string()), None, )), } @@ -2692,6 +2694,7 @@ impl DataStore { disks }; + // TODO-K: get inspiration ID here // Mapping of "Sled ID" -> "All zpools reported by that sled" let zpools: BTreeMap> = { use nexus_db_schema::schema::inv_zpool::dsl; @@ -2762,6 +2765,46 @@ impl DataStore { datasets }; + // TODO-K: fix +// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by +// // that sled" +// let svcs_in_maintenance: BTreeMap< +// Uuid, +// Vec, +// > = { +// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; +// +// let mut svcs = +// BTreeMap::>::new(); +// let mut paginator = Paginator::new( +// batch_size, +// dropshot::PaginationOrder::Ascending, +// ); +// while let Some(p) = paginator.next() { +// let batch = paginated_multicolumn( +// dsl::inv_health_monitor_svc_in_maintenance, +// (dsl::sled_id, dsl::id), +// &p.current_pagparams(), +// ) +// .filter(dsl::inv_collection_id.eq(db_id)) +// .select(InvDataset::as_select()) +// .load_async(&*conn) +// .await +// .map_err(|e| { +// public_error_from_diesel(e, ErrorHandler::Server) +// })?; +// paginator = p.found_batch(&batch, &|row| { +// (row.sled_id, row.name.clone()) +// }); +// for svc in batch { +// 
svcs.entry(svc.sled_id.into_untyped_uuid()) +// .or_default() +// .push(svc.into()); +// } +// } +// svcs +// }; + // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. let baseboard_id_ids: BTreeSet<_> = sps diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index c471c6e8981..8eab5de820f 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1717,13 +1717,15 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id) { + inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, sled_id -> Uuid, + id -> Uuid, fmri -> Nullable, zone -> Nullable, error_messages -> Nullable>, + svcs_cmd_error -> Nullable, time_of_status -> Nullable, } } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 31eaa0330be..08b9f3005ab 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4031,6 +4031,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, + -- unique id for each row + id UUID NOT NULL, + -- FMRI of the SMF service in maintenance fmri TEXT, @@ -4038,13 +4041,16 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance zone TEXT, -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY, + error_messages TEXT ARRAY NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, -- time when the status was checked if applicable - valid_until TIMESTAMPTZ, + time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id) -) + PRIMARY KEY (inv_collection_id, sled_id, id) +); -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. 
@@ -7680,7 +7686,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '217.0.0', NULL) + (TRUE, NOW(), NOW(), '218.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index 71680604482..bf0b956d23e 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -8,6 +8,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, + -- unique id for each row + id UUID NOT NULL, + -- FMRI of the SMF service in maintenance fmri TEXT, @@ -15,10 +18,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance zone TEXT, -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY, + error_messages TEXT ARRAY NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, -- time when the status was checked if applicable - valid_until TIMESTAMPTZ, + time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id) -) \ No newline at end of file + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 700d4a08f4b..d27aaa8595e 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,7 +275,8 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +// TODO-K: Remove pub +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 075dc655a0f..42d4861d4b4 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,7 +168,9 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + // TODO-K: Uncomment + // let health_monitor = HealthMonitorHandle::stub(); + let health_monitor = crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index abc8690806e..6253b460886 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -83,6 +83,9 @@ impl_typed_uuid_kinds! { Sled = {}, SpUpdate = {}, SupportBundle = {}, + // `SvcInMaintenance`s do not contain IDs themselves. These IDs exist + // for the same reason as those in `OmicronSledConfig`. 
+ SvcInMaintenance = {}, TufArtifact = {}, TufRepo = {}, TufTrustRoot = {}, From 52f78497b27285ce0f5619392bd6ee3374958436 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 19:32:49 +1300 Subject: [PATCH 03/12] display works --- nexus/db-model/src/inventory.rs | 1 + .../db-queries/src/db/datastore/inventory.rs | 151 +++++++++++++----- nexus/db-schema/src/schema.rs | 2 +- nexus/types/src/inventory/display.rs | 115 +++++++++---- sled-agent/src/sim/sled_agent.rs | 3 +- 5 files changed, 197 insertions(+), 75 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index e3e2045b256..a6c59ee9e8e 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1024,6 +1024,7 @@ impl_enum_type!( pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, + // TODO-K: Is this ID necessary? pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 63869ffe42c..bba42438f2f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -27,6 +27,8 @@ use diesel::sql_types::Nullable; use futures::FutureExt; use futures::future::BoxFuture; use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; @@ -2766,44 +2768,73 @@ impl DataStore { }; // TODO-K: fix -// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by -// // that sled" -// let svcs_in_maintenance: BTreeMap< -// Uuid, -// Vec, -// > = { -// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; -// -// let mut svcs = -// BTreeMap::>::new(); -// let mut paginator = Paginator::new( -// batch_size, -// dropshot::PaginationOrder::Ascending, -// ); -// while let Some(p) = paginator.next() { -// let batch = paginated_multicolumn( -// dsl::inv_health_monitor_svc_in_maintenance, -// (dsl::sled_id, dsl::id), -// &p.current_pagparams(), -// ) -// .filter(dsl::inv_collection_id.eq(db_id)) -// .select(InvDataset::as_select()) -// .load_async(&*conn) -// .await -// .map_err(|e| { -// public_error_from_diesel(e, ErrorHandler::Server) -// })?; -// paginator = p.found_batch(&batch, &|row| { -// (row.sled_id, row.name.clone()) -// }); -// for svc in batch { -// svcs.entry(svc.sled_id.into_untyped_uuid()) -// .or_default() -// .push(svc.into()); -// } -// } -// svcs -// }; + // Mapping of "Sled ID" -> "All SMF services in maintenance reported by + // that sled" + let mut svcs_in_maintenance_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + + let mut svcs = BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + // TODO-K: Do I actually need paginated multicolumn? 
+ let batch: Vec = paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenance::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = + p.found_batch(&batch, &|row| (row.sled_id, row.id.clone())); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc.into()); + } + } + svcs + }; + // + // TODO-K: This is wrong. We want a vector of services, not just one + // let mut svcs_in_maintenance_by_sled = { + // use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + // + // let mut results: BTreeMap = BTreeMap::new(); + // + // let mut paginator = Paginator::new( + // batch_size, + // dropshot::PaginationOrder::Ascending, + // ); + // while let Some(p) = paginator.next() { + // let batch = paginated( + // dsl::inv_health_monitor_svc_in_maintenance, + // dsl::sled_id, + // &p.current_pagparams(), + // ) + // .filter(dsl::inv_collection_id.eq(db_id)) + // .select(InvSvcInMaintenance::as_select()) + // .load_async(&*conn) + // .await + // .map_err(|e| { + // public_error_from_diesel(e, ErrorHandler::Server) + // })?; + // paginator = p.found_batch(&batch, &|row| row.sled_id); + // + // for row in batch { + // results.insert(row.sled_id.into(), row); + // } + // } + // + // results + // }; // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. @@ -4060,6 +4091,48 @@ impl DataStore { )) })?; + // TODO-K; Clean up + // Convert all health checks into a full `HealthMonitorInventory` + let mut health_monitor = HealthMonitorInventory::new(); + + let svcs_in_maintenance = svcs_in_maintenance_by_sled + .remove(&sled_id.into_untyped_uuid()) + .map(|svcs| { + // TODO-K: Clean up + if let Some(e) = svcs[0].svcs_cmd_error.clone() { + return Err(e); + } + let mut services = vec![]; + for svc in &svcs { + let fmri = if let Some(f) = svc.fmri.clone() { + f + } else { + "".to_string() + }; + let zone = if let Some(z) = svc.zone.clone() { + z + } else { + "".to_string() + }; + + let service = SvcInMaintenance { fmri, zone }; + services.push(service) + } + + Ok(SvcsInMaintenanceResult { + services, + errors: svcs[0].error_messages.clone(), + time_of_status: svcs[0].time_of_status, + }) + }); + + if let Some(svcs) = svcs_in_maintenance { + println!("DEBUG {svcs:?}"); + health_monitor.smf_services_in_maintenance = svcs + }; + + // TODO-K: End of clean up bit + let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, source: s.source, @@ -4098,7 +4171,7 @@ impl DataStore { zone_image_resolver, // TODO-K[omicron#9516]: Actually query the DB when there is // something there - health_monitor: HealthMonitorInventory::new(), + health_monitor, }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8eab5de820f..5954ceb738b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1724,7 +1724,7 @@ table! 
{ fmri -> Nullable, zone -> Nullable, - error_messages -> Nullable>, + error_messages -> Array, svcs_cmd_error -> Nullable, time_of_status -> Nullable, } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 163f8744c79..f21d283cc47 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -14,12 +14,14 @@ use chrono::SecondsFormat; use clap::Subcommand; use gateway_types::component::SpType; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcsInMaintenanceResult; use indent_write::fmt::IndentWriter; use itertools::Itertools; use omicron_common::disk::M2Slot; use omicron_uuid_kinds::{ DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid, }; +use sled_agent_types::inventory::HealthMonitorInventory; use sled_agent_types_versions::latest::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, ConfigReconcilerInventory, ConfigReconcilerInventoryResult, @@ -896,41 +898,8 @@ fn display_sleds( } } - // TODO-K[omicron#9516]: This is temporarily hidden until we add the - // health monitor types to the DB. Once those have been integrated, - // we'll show health monitor status when everything is healthy as well. - if !health_monitor.is_empty() { - writeln!(indented, "HEALTH MONITOR")?; - let mut indent2 = IndentWriter::new(" ", &mut indented); - match &health_monitor.smf_services_in_maintenance { - Ok(svcs) => { - if !svcs.is_empty() { - if let Some(time_of_status) = &svcs.time_of_status { - writeln!( - indent2, - "SMF services in maintenance at {}:", - time_of_status.to_rfc3339_opts( - SecondsFormat::Millis, - /* use_z */ true, - ) - )?; - } - let mut indent3 = IndentWriter::new(" ", &mut indent2); - for svc in &svcs.services { - writeln!(indent3, "{svc}")?; - } - } - } - Err(e) => { - writeln!( - indent2, - "failed to retrieve SMF services in maintenance: {e}" - )?; - } - } - } - f = indented.into_inner(); + display_health_monitor(health_monitor, f)?; } Ok(()) } @@ -1122,6 +1091,84 @@ fn collect_config_reconciler_errors( .collect() } +fn display_health_monitor( + health_monitor: &HealthMonitorInventory, + f: &mut dyn fmt::Write, +) -> fmt::Result { + let HealthMonitorInventory { smf_services_in_maintenance } = health_monitor; + + writeln!(f, "\nHEALTH MONITOR")?; + + let mut indented = IndentWriter::new(" ", f); + + match &smf_services_in_maintenance { + Ok(svcs) => { + if !svcs.is_empty() { + let SvcsInMaintenanceResult { + services, + errors, + time_of_status, + } = svcs; + let time = if let Some(t) = time_of_status { + t.to_rfc3339_opts( + SecondsFormat::Millis, + /* use_z */ true, + ) + } else { + "unknown time".to_string() + }; + + writeln!( + indented, + "{} SMF services in maintenance at {}", + svcs.services.len(), + time + )?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SvcRow { + fmri: String, + zone: String, + } + let rows = services.iter().map(|s| SvcRow { + fmri: s.fmri.clone(), + zone: s.zone.clone(), + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(4, 1, 0, 0)) + .to_string(); + writeln!(indented, "{table}")?; + + if !errors.is_empty() { + writeln!( + indented, + "\nfound errors when retrieving services in maintenance:" + )?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + for e in errors { + writeln!(indent2, "{e}")?; + } + } + } else { + writeln!( + indented, + "no data on SMF services in maintenance has been collected" + )?; + } + } + Err(e) => { + writeln!( 
+ indented, + "failed to retrieve SMF services in maintenance: {e}" + )?; + } + }; + + Ok(()) +} + fn display_sled_config( label: &str, config: &OmicronSledConfig, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 42d4861d4b4..a2feba8796a 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -170,7 +170,8 @@ impl SledAgent { // TODO-K: Uncomment // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + let health_monitor = + crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 3b5e7799b2a7f0fe5b6117049e23de7ee1248ea9 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:03:43 +1300 Subject: [PATCH 04/12] clean up and disable svcs in sim --- .../db-queries/src/db/datastore/inventory.rs | 38 ++----------------- nexus/types/src/inventory/display.rs | 1 + sled-agent/src/sim/sled_agent.rs | 8 ++-- 3 files changed, 8 insertions(+), 39 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index bba42438f2f..feba0be3c7d 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -2793,48 +2793,15 @@ impl DataStore { public_error_from_diesel(e, ErrorHandler::Server) })?; paginator = - p.found_batch(&batch, &|row| (row.sled_id, row.id.clone())); + p.found_batch(&batch, &|row| (row.sled_id, row.id)); for svc in batch { svcs.entry(svc.sled_id.into_untyped_uuid()) .or_default() - .push(svc.into()); + .push(svc); } } svcs }; - // - // TODO-K: This is wrong. We want a vector of services, not just one - // let mut svcs_in_maintenance_by_sled = { - // use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - // - // let mut results: BTreeMap = BTreeMap::new(); - // - // let mut paginator = Paginator::new( - // batch_size, - // dropshot::PaginationOrder::Ascending, - // ); - // while let Some(p) = paginator.next() { - // let batch = paginated( - // dsl::inv_health_monitor_svc_in_maintenance, - // dsl::sled_id, - // &p.current_pagparams(), - // ) - // .filter(dsl::inv_collection_id.eq(db_id)) - // .select(InvSvcInMaintenance::as_select()) - // .load_async(&*conn) - // .await - // .map_err(|e| { - // public_error_from_diesel(e, ErrorHandler::Server) - // })?; - // paginator = p.found_batch(&batch, &|row| row.sled_id); - // - // for row in batch { - // results.insert(row.sled_id.into(), row); - // } - // } - // - // results - // }; // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. @@ -4127,6 +4094,7 @@ impl DataStore { }); if let Some(svcs) = svcs_in_maintenance { + // TODO-K: removeme println!("DEBUG {svcs:?}"); health_monitor.smf_services_in_maintenance = svcs }; diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index f21d283cc47..05b4341599c 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -1152,6 +1152,7 @@ fn display_health_monitor( } } } else { + // TODO-K: Should we record time even if no svcs in maintenance were found? 
writeln!( indented, "no data on SMF services in maintenance has been collected" diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index a2feba8796a..fc0063096e8 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,10 +168,10 @@ impl SledAgent { .await .start(&log, &config.dropshot); - // TODO-K: Uncomment - // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = - crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + // TODO-K: Uncomment and remove long running task + let health_monitor = HealthMonitorHandle::stub(); + //let health_monitor = + // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From c6432017cd2c705b83c4686db7c9419e801c3a03 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:06:26 +1300 Subject: [PATCH 05/12] expectorate --- .../reconfigurator-cli/tests/output/cmds-example-stdout | 9 +++++++++ .../tests/output/cmds-mupdate-update-flow-stdout | 9 +++++++++ .../tests/output/cmds-nexus-generation-autobump-stdout | 9 +++++++++ .../tests/output/cmds-target-release-stdout | 9 +++++++++ .../tests/output/cmds-unsafe-zone-mgs-stdout | 9 +++++++++ 5 files changed, 45 insertions(+) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 83aa46e3d5c..1de518982b0 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1592,6 +1592,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1719,6 +1722,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -1939,6 +1945,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 25242992fcd..f8a9c9a0680 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -314,6 +314,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -432,6 +435,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = 
Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -539,6 +545,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 28743676866..d4f01859add 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -702,6 +702,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -868,6 +871,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1034,6 +1040,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 952056fb802..bfc8a125487 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -689,6 +689,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -855,6 +858,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1021,6 +1027,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index 82562dc16a7..18efc9f2a04 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -673,6 +673,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF 
services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -839,6 +842,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1005,6 +1011,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved From 9b1ac94f87db0a8fb303ca290de12c9fd83e89b3 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:07:30 +1300 Subject: [PATCH 06/12] fmt --- nexus/db-queries/src/db/datastore/inventory.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index feba0be3c7d..7243b7fc57d 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -2792,8 +2792,7 @@ impl DataStore { .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) })?; - paginator = - p.found_batch(&batch, &|row| (row.sled_id, row.id)); + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); for svc in batch { svcs.entry(svc.sled_id.into_untyped_uuid()) .or_default() From aa071e02333cc9117156f3aaeb286739c5dccb03 Mon Sep 17 00:00:00 2001 From: karencfv Date: Wed, 7 Jan 2026 20:42:00 +1300 Subject: [PATCH 07/12] fix display bug --- .../db-queries/src/db/datastore/inventory.rs | 41 ++++++++++++++----- nexus/types/src/inventory/display.rs | 36 ++++++++-------- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 7243b7fc57d..47bb56db94b 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -216,15 +216,30 @@ impl DataStore { for sled_agent in &collection.sled_agents { match &sled_agent.health_monitor.smf_services_in_maintenance { Ok(svcs) => { - for svc in &svcs.services { + // When there are no services in maintenance, we will still + // want to insert a row with the time the health check was + // made and any parsing errors we may have collected. + if svcs.services.is_empty() && svcs.time_of_status.is_some() + { svcs_in_maintenance.push(InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, - Some(svc.clone()), + None, svcs.errors.clone(), None, svcs.time_of_status, )); + } else { + for svc in &svcs.services { + svcs_in_maintenance.push(InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + Some(svc.clone()), + svcs.errors.clone(), + None, + svcs.time_of_status, + )); + } } } Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new( @@ -2779,7 +2794,6 @@ impl DataStore { dropshot::PaginationOrder::Ascending, ); while let Some(p) = paginator.next() { - // TODO-K: Do I actually need paginated multicolumn? 
let batch: Vec = paginated_multicolumn( dsl::inv_health_monitor_svc_in_maintenance, (dsl::sled_id, dsl::id), @@ -4063,13 +4077,21 @@ impl DataStore { let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) - .map(|svcs| { + .map(|rows| { // TODO-K: Clean up - if let Some(e) = svcs[0].svcs_cmd_error.clone() { + if let Some(e) = rows[0].svcs_cmd_error.clone() { return Err(e); } + let mut services = vec![]; - for svc in &svcs { + for svc in &rows { + if svc.fmri.is_none() && svc.zone.is_none() { + continue; + } + + // All rows should have both zone and FMRI populated or + // none at all. Nevertheless, we'll handle the case of a + // partially populated row. let fmri = if let Some(f) = svc.fmri.clone() { f } else { @@ -4081,14 +4103,13 @@ impl DataStore { "".to_string() }; - let service = SvcInMaintenance { fmri, zone }; - services.push(service) + services.push(SvcInMaintenance { fmri, zone }) } Ok(SvcsInMaintenanceResult { services, - errors: svcs[0].error_messages.clone(), - time_of_status: svcs[0].time_of_status, + errors: rows[0].error_messages.clone(), + time_of_status: rows[0].time_of_status, }) }); diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 05b4341599c..0bd2f6f679b 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -1121,26 +1121,27 @@ fn display_health_monitor( writeln!( indented, "{} SMF services in maintenance at {}", - svcs.services.len(), + services.len(), time )?; - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct SvcRow { - fmri: String, - zone: String, - } - let rows = services.iter().map(|s| SvcRow { - fmri: s.fmri.clone(), - zone: s.zone.clone(), - }); - let table = tabled::Table::new(rows) - .with(tabled::settings::Style::empty()) - .with(tabled::settings::Padding::new(4, 1, 0, 0)) - .to_string(); - writeln!(indented, "{table}")?; - + if !services.is_empty() { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SvcRow { + fmri: String, + zone: String, + } + let rows = services.iter().map(|s| SvcRow { + fmri: s.fmri.clone(), + zone: s.zone.clone(), + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(4, 1, 0, 0)) + .to_string(); + writeln!(indented, "{table}")?; + }; if !errors.is_empty() { writeln!( indented, @@ -1152,7 +1153,6 @@ fn display_health_monitor( } } } else { - // TODO-K: Should we record time even if no svcs in maintenance were found? writeln!( indented, "no data on SMF services in maintenance has been collected" From e3f0a1ff31c0ea2de8df0de6518343d1b73d4b9c Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 16:56:55 +1300 Subject: [PATCH 08/12] Clean up --- illumos-utils/src/svcs.rs | 10 -- nexus/db-model/src/inventory.rs | 3 - .../db-queries/src/db/datastore/inventory.rs | 116 +++++++++--------- sled-agent/src/sim/sled_agent.rs | 6 +- 4 files changed, 58 insertions(+), 77 deletions(-) diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 42abf8b4bf5..1ea8eac69f1 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -19,7 +19,6 @@ use serde::Deserialize; use serde::Serialize; use slog::Logger; use slog::{error, info}; -use std::fmt::Display; #[cfg(target_os = "illumos")] use tokio::process::Command; @@ -195,7 +194,6 @@ impl From for SvcState { } } -// TODO-K: Ugh, I think this might need to be versioned and moved out of here? 
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running @@ -211,14 +209,6 @@ impl SvcInMaintenance { } } -impl Display for SvcInMaintenance { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let SvcInMaintenance { fmri, zone } = self; - - writeln!(f, "FMRI: {} zone: {}", fmri, zone) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index a6c59ee9e8e..239de843876 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1018,19 +1018,16 @@ impl_enum_type!( Idle => b"idle" ); -// TODO-K: add docs and move type elsewhere? #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance)] pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, - // TODO-K: Is this ID necessary? pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, pub error_messages: Vec, pub svcs_cmd_error: Option, - // TODO-K: Check if this needs to be an option pub time_of_status: Option>, } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 47bb56db94b..8fd72bc3162 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -209,49 +209,55 @@ impl DataStore { } } - // TODO-K: Clean up // Pull services in maintenance out of all sled agents - let mut svcs_in_maintenance = vec![]; - - for sled_agent in &collection.sled_agents { - match &sled_agent.health_monitor.smf_services_in_maintenance { - Ok(svcs) => { + let svcs_in_maintenance: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { // When there are no services in maintenance, we will still // want to insert a row with the time the health check was // made and any parsing errors we may have collected. 
- if svcs.services.is_empty() && svcs.time_of_status.is_some() + Ok(svcs) + if svcs.services.is_empty() + && svcs.time_of_status.is_some() => { - svcs_in_maintenance.push(InvSvcInMaintenance::new( + vec![InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, None, svcs.errors.clone(), None, svcs.time_of_status, - )); - } else { - for svc in &svcs.services { - svcs_in_maintenance.push(InvSvcInMaintenance::new( + )] + } + Ok(svcs) => svcs + .services + .iter() + .map(|svc| { + InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, Some(svc.clone()), svcs.errors.clone(), None, svcs.time_of_status, - )); - } + ) + }) + .collect(), + Err(e) => { + vec![InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + None, + vec![], + Some(e.to_string()), + None, + )] } } - Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new( - collection_id, - sled_agent.sled_id, - None, - vec![], - Some(e.to_string()), - None, - )), - } - } + }) + .collect(); // Pull disks out of all sled agents let disks: Vec<_> = collection @@ -2656,7 +2662,6 @@ impl DataStore { disk_firmware }; - // TODO-K: Take inspiration here // Mapping of "Sled ID" -> "All disks reported by that sled" let physical_disks: BTreeMap< SledUuid, @@ -2711,7 +2716,6 @@ impl DataStore { disks }; - // TODO-K: get inspiration ID here // Mapping of "Sled ID" -> "All zpools reported by that sled" let zpools: BTreeMap> = { use nexus_db_schema::schema::inv_zpool::dsl; @@ -2782,7 +2786,6 @@ impl DataStore { datasets }; - // TODO-K: fix // Mapping of "Sled ID" -> "All SMF services in maintenance reported by // that sled" let mut svcs_in_maintenance_by_sled = { @@ -4071,56 +4074,49 @@ impl DataStore { )) })?; - // TODO-K; Clean up // Convert all health checks into a full `HealthMonitorInventory` let mut health_monitor = HealthMonitorInventory::new(); let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) .map(|rows| { - // TODO-K: Clean up - if let Some(e) = rows[0].svcs_cmd_error.clone() { - return Err(e); + // Get metadata from the first row. All rows from the same + // collection and sled will share time_of_status, + // svcs_cmd_error and error_messages. + let first_row = + rows.first().expect("rows should not be empty"); + + // Check if the svcs command itself failed first. If so, we + // can safely assume no services in maintenance have been + // reported and return an error. + if let Some(e) = &first_row.svcs_cmd_error { + return Err(e.clone()); } - let mut services = vec![]; - for svc in &rows { - if svc.fmri.is_none() && svc.zone.is_none() { - continue; - } - - // All rows should have both zone and FMRI populated or - // none at all. Nevertheless, we'll handle the case of a - // partially populated row. - let fmri = if let Some(f) = svc.fmri.clone() { - f - } else { - "".to_string() - }; - let zone = if let Some(z) = svc.zone.clone() { - z - } else { - "".to_string() - }; - - services.push(SvcInMaintenance { fmri, zone }) - } + // Convert database rows to service in maintenance entries. + // All rows should have both zone and FMRI populated or none + // at all. Nevertheless, we'll handle the case of a + // partially populated row. 
+ let services: Vec = rows + .iter() + .filter(|svc| svc.fmri.is_some() || svc.zone.is_some()) + .map(|svc| SvcInMaintenance { + fmri: svc.fmri.clone().unwrap_or_default(), + zone: svc.zone.clone().unwrap_or_default(), + }) + .collect(); Ok(SvcsInMaintenanceResult { services, - errors: rows[0].error_messages.clone(), - time_of_status: rows[0].time_of_status, + errors: first_row.error_messages.clone(), + time_of_status: first_row.time_of_status, }) }); if let Some(svcs) = svcs_in_maintenance { - // TODO-K: removeme - println!("DEBUG {svcs:?}"); health_monitor.smf_services_in_maintenance = svcs }; - // TODO-K: End of clean up bit - let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, source: s.source, @@ -4157,8 +4153,6 @@ impl DataStore { reconciler_status, last_reconciliation, zone_image_resolver, - // TODO-K[omicron#9516]: Actually query the DB when there is - // something there health_monitor, }; sled_agents diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index fc0063096e8..d2bb3f3506c 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -169,9 +169,9 @@ impl SledAgent { .start(&log, &config.dropshot); // TODO-K: Uncomment and remove long running task - let health_monitor = HealthMonitorHandle::stub(); - //let health_monitor = - // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + // let health_monitor = HealthMonitorHandle::stub(); + let health_monitor = + crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 4b80284b715b5f6d2a7c196122e52f6303508a6f Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 19:05:44 +1300 Subject: [PATCH 09/12] Ability to remove rows from collection --- .../db-queries/src/db/datastore/inventory.rs | 17 ++++++++-- nexus/inventory/src/examples.rs | 32 ++++++++++++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 8fd72bc3162..a04ff5eb145 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -1950,6 +1950,7 @@ impl DataStore { nmupdate_override_non_boot: usize, nconfig_reconcilers: usize, nboot_partitions: usize, + nhealth_monitor_svc_in_maintenance: usize, nomicron_sled_configs: usize, nomicron_sled_config_disks: usize, nomicron_sled_config_datasets: usize, @@ -1984,6 +1985,7 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2188,6 +2190,16 @@ impl DataStore { .await? }; + // Remove rows associated with the health monitor + let nhealth_monitor_svc_in_maintenance = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + // Remove rows associated with `OmicronSledConfig`s. let nomicron_sled_configs = { use nexus_db_schema::schema::inv_omicron_sled_config::dsl; @@ -2295,8 +2307,6 @@ impl DataStore { .await? 
}; - // TODO-K: Remove rows for health monitor - Ok(NumRowsDeleted { ncollections, nsps, @@ -2318,6 +2328,7 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2362,6 +2373,7 @@ impl DataStore { "nmupdate_override_non_boot" => nmupdate_override_non_boot, "nconfig_reconcilers" => nconfig_reconcilers, "nboot_partitions" => nboot_partitions, + "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance, "nomicron_sled_configs" => nomicron_sled_configs, "nomicron_sled_config_disks" => nomicron_sled_config_disks, "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, @@ -2373,7 +2385,6 @@ impl DataStore { "ncockroach_status" => ncockroach_status, "nntp_timesync" => nntp_timesync, "ninternal_dns" => ninternal_dns, - // TODO-K: add health monitor rows here too ); Ok(()) diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 06ccdf83571..3efa0220d9d 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -7,6 +7,7 @@ use crate::CollectionBuilder; use crate::now_db_precision; use camino::Utf8Path; +use chrono::Utc; use clickhouse_admin_types::keeper::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::keeper::KeeperId; use gateway_client::types::PowerState; @@ -15,6 +16,8 @@ use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use gateway_types::rot::RotSlot; use iddqd::id_ord_map; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::InternalDnsGenerationStatus; use nexus_types::inventory::RotPage; @@ -573,6 +576,7 @@ pub fn representative() -> Representative { deserialized_zone_manifest: true, has_mupdate_override: true, }), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -605,6 +609,7 @@ pub fn representative() -> Representative { deserialized_zone_manifest: false, has_mupdate_override: false, }), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -635,13 +640,14 @@ pub fn representative() -> Representative { zone_image_resolver(ZoneImageResolverExampleKind::Mismatch { has_mupdate_override: true, }), + HealthMonitorInventory::new(), ), ) .unwrap(); // Finally, report a sled with unknown baseboard information. This should // look the same as the PC as far as inventory is concerned but let's verify - // it. + // it. Additionally, this sled will report a few SMF services in maintenance. let sled_agent_id_unknown = "5c5b4cf9-3e13-45fd-871c-f177d6537510".parse().unwrap(); @@ -660,6 +666,24 @@ pub fn representative() -> Representative { None, // Simulate an error here. zone_image_resolver(ZoneImageResolverExampleKind::Error), + HealthMonitorInventory { + smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { + services: vec![ + SvcInMaintenance { + fmri: "svc:/site/fake-service:default" + .to_string(), + zone: "global".to_string(), + }, + SvcInMaintenance { + fmri: "svc:/site/fake-service2:default" + .to_string(), + zone: "global".to_string(), + }, + ], + errors: vec![], + time_of_status: Some(Utc::now()), + }), + }, ), ) .unwrap(); @@ -980,6 +1004,7 @@ pub fn sled_agent( datasets: Vec, ledgered_sled_config: Option, zone_image_resolver: ZoneImageResolverInventory, + health_monitor: HealthMonitorInventory, ) -> Inventory { // Assume the `ledgered_sled_config` was reconciled successfully. 
let last_reconciliation = ledgered_sled_config.clone().map(|config| { @@ -1041,9 +1066,6 @@ pub fn sled_agent( reconciler_status, last_reconciliation, zone_image_resolver, - // TODO-K: We'll want to have the functionality to add some services - // here in a future PR. This will be more useful when we add this - // information to the DB. - health_monitor: HealthMonitorInventory::new(), + health_monitor, } } From ed3ed7fee7a4db0cb6b3c348edd75bb7feb6c558 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 19:19:13 +1300 Subject: [PATCH 10/12] Disbale health monitor on simulated system --- sled-agent/src/long_running_tasks.rs | 3 +-- sled-agent/src/sim/sled_agent.rs | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index d27aaa8595e..700d4a08f4b 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,8 +275,7 @@ async fn spawn_bootstore_tasks( node_handle } -// TODO-K: Remove pub -pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d2bb3f3506c..075dc655a0f 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,10 +168,7 @@ impl SledAgent { .await .start(&log, &config.dropshot); - // TODO-K: Uncomment and remove long running task - // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = - crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + let health_monitor = HealthMonitorHandle::stub(); Arc::new(SledAgent { id, From 4489b986401e9cf76bb98e442d28f59c155c5456 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 9 Jan 2026 10:51:31 +1300 Subject: [PATCH 11/12] fix tests --- nexus/inventory/src/examples.rs | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 3efa0220d9d..bf19668bded 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -7,7 +7,6 @@ use crate::CollectionBuilder; use crate::now_db_precision; use camino::Utf8Path; -use chrono::Utc; use clickhouse_admin_types::keeper::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::keeper::KeeperId; use gateway_client::types::PowerState; @@ -668,20 +667,14 @@ pub fn representative() -> Representative { zone_image_resolver(ZoneImageResolverExampleKind::Error), HealthMonitorInventory { smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { - services: vec![ - SvcInMaintenance { - fmri: "svc:/site/fake-service:default" - .to_string(), - zone: "global".to_string(), - }, - SvcInMaintenance { - fmri: "svc:/site/fake-service2:default" - .to_string(), - zone: "global".to_string(), - }, - ], + services: vec![SvcInMaintenance { + fmri: "svc:/site/fake-service:default".to_string(), + zone: "global".to_string(), + }], errors: vec![], - time_of_status: Some(Utc::now()), + time_of_status: Some( + "2026-01-01T00:00:00Z".parse().unwrap(), + ), }), }, ), From 335832e38854a5101512a9606455ba92bba2328e Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 9 Jan 2026 12:12:19 +1300 Subject: [PATCH 12/12] fmt --- nexus/db-model/src/inventory.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index b54002c8a9c..aaffca12703 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -38,8 +38,8 @@ use nexus_db_schema::schema::{ inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, - inv_last_reconciliation_disk_result, - inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_measurements, + inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements, + inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_measurement_manifest_non_boot, inv_mupdate_override_non_boot, inv_ntp_timesync, inv_nvme_disk_firmware, inv_omicron_sled_config, inv_omicron_sled_config_dataset,