diff --git a/Cargo.lock b/Cargo.lock
index e31c06d5635..7da553a09bf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6764,6 +6764,7 @@ dependencies = [
  "expectorate",
  "hex",
  "iddqd",
+ "illumos-utils",
  "ipnetwork",
  "itertools 0.14.0",
  "macaddr",
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
index 7839ade793e..753dafa69df 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
@@ -1608,6 +1608,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2)
 found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345
@@ -1744,6 +1747,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0)
 found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345
@@ -1973,6 +1979,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 KEEPER MEMBERSHIP
     no membership retrieved
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
index 4ccceeef6ce..b16dd0685fa 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout
@@ -323,6 +323,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0)
 found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345
@@ -450,6 +453,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2)
 found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345
@@ -566,6 +572,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 KEEPER MEMBERSHIP
     no membership retrieved
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout
index 061d0d5a3e4..99f26160f7c 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout
@@ -711,6 +711,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0)
 found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345
@@ -886,6 +889,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2)
 found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345
@@ -1061,6 +1067,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 KEEPER MEMBERSHIP
     no membership retrieved
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
index d55487fa680..a75f65c2727 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout
@@ -698,6 +698,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0)
 found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345
@@ -873,6 +876,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2)
 found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345
@@ -1048,6 +1054,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 KEEPER MEMBERSHIP
     no membership retrieved
diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout
index 2a92774cfdc..e1295c23220 100644
--- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout
+++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout
@@ -682,6 +682,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0)
 found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345
@@ -857,6 +860,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2)
 found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345
@@ -1032,6 +1038,9 @@ LEDGERED SLED CONFIG
     (measurement set is empty)
 reconciler task status: idle (finished at after running for s)
 
+HEALTH MONITOR
+    no data on SMF services in maintenance has been collected
+
 KEEPER MEMBERSHIP
     no membership retrieved
diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs
index 17b68412379..1ea8eac69f1 100644
--- a/illumos-utils/src/svcs.rs
+++ b/illumos-utils/src/svcs.rs
@@ -19,7 +19,6 @@ use serde::Deserialize;
 use serde::Serialize;
 use slog::Logger;
 use slog::{error, info};
-use std::fmt::Display;
 
 #[cfg(target_os = "illumos")]
 use tokio::process::Command;
@@ -199,8 +198,8 @@ impl From for SvcState {
 #[serde(rename_all = "snake_case")]
 /// Information about an SMF service that is enabled but not running
 pub struct SvcInMaintenance {
-    fmri: String,
-    zone: String,
+    pub fmri: String,
+    pub zone: String,
 }
 
 impl SvcInMaintenance {
@@ -210,14 +209,6 @@ impl SvcInMaintenance {
     }
 }
 
-impl Display for SvcInMaintenance {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let SvcInMaintenance { fmri, zone } = self;
-
-        writeln!(f, "FMRI: {} zone: {}", fmri, zone)
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml
index 9762bccbe76..1c30231bf53 100644
--- a/nexus/db-model/Cargo.toml
+++ b/nexus/db-model/Cargo.toml
@@ -21,6 +21,7 @@ derive-where.workspace = true
 diesel = { workspace = true, features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] }
 hex.workspace = true
 iddqd.workspace = true
+illumos-utils.workspace = true
 ipnetwork.workspace = true
 itertools.workspace = true
 macaddr.workspace = true
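Note: with the `Display` impl removed and `fmri`/`zone` made public, callers now format `SvcInMaintenance` themselves (the `display.rs` change below renders a table instead of the old one-line form). A minimal sketch of what a caller could do with just the public fields; `render` is a hypothetical helper, not part of this change:

    use illumos_utils::svcs::SvcInMaintenance;

    // Roughly what the removed `Display` impl used to produce.
    fn render(svc: &SvcInMaintenance) -> String {
        format!("FMRI: {} zone: {}", svc.fmri, svc.zone)
    }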
diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs
index 6f580f1b600..aaffca12703 100644
--- a/nexus/db-model/src/inventory.rs
+++ b/nexus/db-model/src/inventory.rs
@@ -9,6 +9,7 @@ use crate::Generation;
 use crate::PhysicalDiskKind;
 use crate::omicron_zone_config::{self, OmicronZoneNic};
 use crate::sled_cpu_family::SledCpuFamily;
+use crate::to_db_typed_uuid;
 use crate::typed_uuid::DbTypedUuid;
 use crate::{
     ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32,
@@ -27,14 +28,16 @@ use diesel::pg::Pg;
 use diesel::serialize::ToSql;
 use diesel::{serialize, sql_types};
 use iddqd::IdOrdMap;
+use illumos_utils::svcs::SvcInMaintenance;
 use ipnetwork::IpNetwork;
 use nexus_db_schema::schema::inv_zone_manifest_non_boot;
 use nexus_db_schema::schema::inv_zone_manifest_zone;
 use nexus_db_schema::schema::{
     hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership,
     inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset,
-    inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash,
-    inv_internal_dns, inv_last_reconciliation_dataset_result,
+    inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot,
+    inv_host_phase_1_flash_hash, inv_internal_dns,
+    inv_last_reconciliation_dataset_result,
     inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements,
     inv_last_reconciliation_orphaned_dataset,
     inv_last_reconciliation_zone_result, inv_measurement_manifest_non_boot,
@@ -63,6 +66,7 @@ use omicron_common::update::OmicronInstallManifestSource;
 use omicron_common::zpool_name::ZpoolName;
 use omicron_uuid_kinds::DatasetKind;
 use omicron_uuid_kinds::DatasetUuid;
+use omicron_uuid_kinds::GenericUuid;
 use omicron_uuid_kinds::InternalZpoolKind;
 use omicron_uuid_kinds::MupdateKind;
 use omicron_uuid_kinds::MupdateOverrideKind;
@@ -72,6 +76,8 @@ use omicron_uuid_kinds::OmicronSledConfigUuid;
 use omicron_uuid_kinds::PhysicalDiskUuid;
 use omicron_uuid_kinds::SledKind;
 use omicron_uuid_kinds::SledUuid;
+use omicron_uuid_kinds::SvcInMaintenanceKind;
+use omicron_uuid_kinds::SvcInMaintenanceUuid;
 use omicron_uuid_kinds::ZpoolKind;
 use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind};
 use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid};
@@ -1016,6 +1022,51 @@
     Idle => b"idle"
 );
 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_health_monitor_svc_in_maintenance)]
+pub struct InvSvcInMaintenance {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub sled_id: DbTypedUuid<SledKind>,
+    pub id: DbTypedUuid<SvcInMaintenanceKind>,
+    pub fmri: Option<String>,
+    pub zone: Option<String>,
+    pub error_messages: Vec<String>,
+    pub svcs_cmd_error: Option<String>,
+    pub time_of_status: Option<DateTime<Utc>>,
+}
+
+impl InvSvcInMaintenance {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        sled_id: SledUuid,
+        svc: Option<SvcInMaintenance>,
+        svc_errors: Vec<String>,
+        svcs_cmd_error: Option<String>,
+        time_of_status: Option<DateTime<Utc>>,
+    ) -> Self {
+        let (fmri, zone) = match svc {
+            Some(svc) => (Some(svc.fmri), Some(svc.zone)),
+            None => (None, None),
+        };
+
+        // This ID is only used as a primary key; it's fine to generate it here.
+        let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid(
+            Uuid::new_v4(),
+        ));
+
+        Self {
+            inv_collection_id: inv_collection_id.into(),
+            sled_id: sled_id.into(),
+            id,
+            fmri,
+            zone,
+            error_messages: svc_errors,
+            svcs_cmd_error,
+            time_of_status,
+        }
+    }
+}
+
 /// See [`sled_agent_types::inventory::ConfigReconcilerInventory`].
 #[derive(Queryable, Clone, Debug, Selectable, Insertable)]
 #[diesel(table_name = inv_sled_config_reconciler)]
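Note: the constructor above flattens a `SvcsInMaintenanceResult` into one of three row shapes. A sketch of each, assuming `collection_id` and `sled_id` are in scope (values are illustrative only):

    use chrono::Utc;
    use illumos_utils::svcs::SvcInMaintenance;
    use nexus_db_model::InvSvcInMaintenance;

    // 1. One row per service that is actually in maintenance.
    let svc_row = InvSvcInMaintenance::new(
        collection_id,
        sled_id,
        Some(SvcInMaintenance {
            fmri: "svc:/site/fake-service:default".to_string(),
            zone: "global".to_string(),
        }),
        vec![],           // no parsing errors
        None,             // the `svcs` command succeeded
        Some(Utc::now()), // when the check ran
    );

    // 2. No services in maintenance: one row recording only the check time
    //    (and any parsing errors).
    let empty_row = InvSvcInMaintenance::new(
        collection_id, sled_id, None, vec![], None, Some(Utc::now()),
    );

    // 3. The `svcs` command itself failed: one row carrying only the error.
    let cmd_err_row = InvSvcInMaintenance::new(
        collection_id, sled_id, None, vec![],
        Some("svcs: exec failed".to_string()), None,
    );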
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 1f6193e6c76..0d9881d8f4c 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(219, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(220, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     //   | leaving the first copy as an example for the next person.
     //   v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(220, "health-monitor-svcs-in-maintenance"),
     KnownVersion::new(219, "blueprint-sled-last-used-ip"),
     KnownVersion::new(218, "measurements"),
     KnownVersion::new(217, "multiple-default-ip-pools-per-silo"),
diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs
index de7346ae98f..04931f53054 100644
--- a/nexus/db-queries/src/db/datastore/inventory.rs
+++ b/nexus/db-queries/src/db/datastore/inventory.rs
@@ -27,6 +27,8 @@ use diesel::sql_types::Nullable;
 use futures::FutureExt;
 use futures::future::BoxFuture;
 use iddqd::{IdOrdItem, IdOrdMap, id_upcast};
+use illumos_utils::svcs::SvcInMaintenance;
+use illumos_utils::svcs::SvcsInMaintenanceResult;
 use nexus_db_errors::ErrorHandler;
 use nexus_db_errors::public_error_from_diesel;
 use nexus_db_errors::public_error_from_diesel_lookup;
@@ -62,6 +64,7 @@ use nexus_db_model::InvServiceProcessor;
 use nexus_db_model::InvSledAgent;
 use nexus_db_model::InvSledBootPartition;
 use nexus_db_model::InvSledConfigReconciler;
+use nexus_db_model::InvSvcInMaintenance;
 use nexus_db_model::InvZpool;
 use nexus_db_model::RotImageError;
 use nexus_db_model::SledRole;
@@ -210,6 +213,56 @@ impl DataStore {
             }
         }
 
+        // Pull services in maintenance out of all sled agents
+        let svcs_in_maintenance: Vec<_> = collection
+            .sled_agents
+            .iter()
+            .flat_map(|sled_agent| {
+                match &sled_agent.health_monitor.smf_services_in_maintenance {
+                    // When there are no services in maintenance, we will still
+                    // want to insert a row with the time the health check was
+                    // made and any parsing errors we may have collected.
+                    Ok(svcs)
+                        if svcs.services.is_empty()
+                            && svcs.time_of_status.is_some() =>
+                    {
+                        vec![InvSvcInMaintenance::new(
+                            collection_id,
+                            sled_agent.sled_id,
+                            None,
+                            svcs.errors.clone(),
+                            None,
+                            svcs.time_of_status,
+                        )]
+                    }
+                    Ok(svcs) => svcs
+                        .services
+                        .iter()
+                        .map(|svc| {
+                            InvSvcInMaintenance::new(
+                                collection_id,
+                                sled_agent.sled_id,
+                                Some(svc.clone()),
+                                svcs.errors.clone(),
+                                None,
+                                svcs.time_of_status,
+                            )
+                        })
+                        .collect(),
+                    Err(e) => {
+                        vec![InvSvcInMaintenance::new(
+                            collection_id,
+                            sled_agent.sled_id,
+                            None,
+                            vec![],
+                            Some(e.to_string()),
+                            None,
+                        )]
+                    }
+                }
+            })
+            .collect();
+
         // Pull disks out of all sled agents
         let disks: Vec<_> = collection
             .sled_agents
@@ -1507,6 +1560,25 @@
             }
         }
 
+        // Insert rows for all the unhealthy services we found
+        {
+            use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl;
+
+            let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap();
+            let mut svcs_in_maintenance = svcs_in_maintenance.into_iter();
+            loop {
+                let some_svcs_in_maintenance =
+                    svcs_in_maintenance.by_ref().take(batch_size).collect::<Vec<_>>();
+                if some_svcs_in_maintenance.is_empty() {
+                    break;
+                }
+                let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance)
+                    .values(some_svcs_in_maintenance)
+                    .execute_async(&conn)
+                    .await?;
+            }
+        }
+
         // Insert rows for the sled agents that we found. In practice, we'd
         // expect these to all have baseboards (if using Oxide hardware) or
         // none have baseboards (if not).
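Note: the insert loop above uses a plain-iterator batching idiom rather than a chunking helper. In isolation, the same pattern looks like this (a self-contained sketch, not code from this diff):

    // Drain an iterator in fixed-size batches; `by_ref()` lets each loop
    // iteration resume where the previous `take()` stopped.
    fn batches<T>(items: Vec<T>, batch_size: usize) -> Vec<Vec<T>> {
        let mut iter = items.into_iter();
        let mut out = Vec::new();
        loop {
            let batch: Vec<T> = iter.by_ref().take(batch_size).collect();
            if batch.is_empty() {
                break;
            }
            out.push(batch);
        }
        out
    }

This keeps each INSERT under the `SQL_BATCH_SIZE` row limit without allocating all batches up front.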
@@ -2011,6 +2083,7 @@
             nmupdate_override_non_boot: usize,
             nconfig_reconcilers: usize,
             nboot_partitions: usize,
+            nhealth_monitor_svc_in_maintenance: usize,
             nomicron_sled_configs: usize,
             nomicron_sled_config_disks: usize,
             nomicron_sled_config_datasets: usize,
@@ -2048,6 +2121,7 @@
             nmupdate_override_non_boot,
             nconfig_reconcilers,
             nboot_partitions,
+            nhealth_monitor_svc_in_maintenance,
             nomicron_sled_configs,
             nomicron_sled_config_disks,
             nomicron_sled_config_datasets,
@@ -2278,6 +2352,16 @@
                 .execute_async(&conn)
                 .await?
             };
 
+            // Remove rows associated with the health monitor
+            let nhealth_monitor_svc_in_maintenance = {
+                use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl;
+                diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter(
+                    dsl::inv_collection_id.eq(db_collection_id),
+                ))
+                .execute_async(&conn)
+                .await?
+            };
+
             // Remove rows associated with `OmicronSledConfig`s.
             let nomicron_sled_configs = {
                 use nexus_db_schema::schema::inv_omicron_sled_config::dsl;
@@ -2409,6 +2493,7 @@
                 nmupdate_override_non_boot,
                 nconfig_reconcilers,
                 nboot_partitions,
+                nhealth_monitor_svc_in_maintenance,
                 nomicron_sled_configs,
                 nomicron_sled_config_disks,
                 nomicron_sled_config_datasets,
@@ -2457,6 +2542,7 @@
             "nmupdate_override_non_boot" => nmupdate_override_non_boot,
             "nconfig_reconcilers" => nconfig_reconcilers,
             "nboot_partitions" => nboot_partitions,
+            "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance,
             "nomicron_sled_configs" => nomicron_sled_configs,
             "nomicron_sled_config_disks" => nomicron_sled_config_disks,
             "nomicron_sled_config_datasets" => nomicron_sled_config_datasets,
@@ -2860,6 +2946,39 @@
             datasets
         };
 
+        // Mapping of "Sled ID" -> "All SMF services in maintenance reported
+        // by that sled"
+        let mut svcs_in_maintenance_by_sled = {
+            use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl;
+
+            let mut svcs = BTreeMap::<Uuid, Vec<InvSvcInMaintenance>>::new();
+            let mut paginator = Paginator::new(
+                batch_size,
+                dropshot::PaginationOrder::Ascending,
+            );
+            while let Some(p) = paginator.next() {
+                let batch: Vec<InvSvcInMaintenance> = paginated_multicolumn(
+                    dsl::inv_health_monitor_svc_in_maintenance,
+                    (dsl::sled_id, dsl::id),
+                    &p.current_pagparams(),
+                )
+                .filter(dsl::inv_collection_id.eq(db_id))
+                .select(InvSvcInMaintenance::as_select())
+                .load_async(&*conn)
+                .await
+                .map_err(|e| {
+                    public_error_from_diesel(e, ErrorHandler::Server)
+                })?;
+                paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id));
+                for svc in batch {
+                    svcs.entry(svc.sled_id.into_untyped_uuid())
+                        .or_default()
+                        .push(svc);
+                }
+            }
+            svcs
+        };
+
         // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled
         // Agents.
         let baseboard_id_ids: BTreeSet<_> = sps
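Note: taken together, the insert, delete, and read-back paths above are exercised by the usual inventory round-trip property. A sketch of the shape of that check (method names assumed from the existing inventory datastore tests, not added by this diff):

    // Build a representative collection (the examples.rs change below seeds
    // one sled with a service in maintenance), write it, read it back, and
    // expect an exact match -- including the new health monitor data.
    let collection = representative().builder.build();
    datastore.inventory_insert_collection(&opctx, &collection).await.unwrap();
    let read_back = datastore
        .inventory_collection_read(&opctx, collection.id())
        .await
        .unwrap();
    assert_eq!(collection, read_back);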
@@ -4248,6 +4367,49 @@
                 ))
             })?;
 
+            // Convert all health checks into a full `HealthMonitorInventory`
+            let mut health_monitor = HealthMonitorInventory::new();
+
+            let svcs_in_maintenance = svcs_in_maintenance_by_sled
+                .remove(&sled_id.into_untyped_uuid())
+                .map(|rows| {
+                    // Get metadata from the first row. All rows from the same
+                    // collection and sled will share time_of_status,
+                    // svcs_cmd_error and error_messages.
+                    let first_row =
+                        rows.first().expect("rows should not be empty");
+
+                    // First, check whether the svcs command itself failed. If
+                    // so, we can safely assume no services in maintenance
+                    // have been reported and return an error.
+                    if let Some(e) = &first_row.svcs_cmd_error {
+                        return Err(e.clone());
+                    }
+
+                    // Convert database rows to service-in-maintenance entries.
+                    // All rows should have either both zone and FMRI
+                    // populated or neither. Nevertheless, we'll handle the
+                    // case of a partially populated row.
+                    let services: Vec<SvcInMaintenance> = rows
+                        .iter()
+                        .filter(|svc| svc.fmri.is_some() || svc.zone.is_some())
+                        .map(|svc| SvcInMaintenance {
+                            fmri: svc.fmri.clone().unwrap_or_default(),
+                            zone: svc.zone.clone().unwrap_or_default(),
+                        })
+                        .collect();
+
+                    Ok(SvcsInMaintenanceResult {
+                        services,
+                        errors: first_row.error_messages.clone(),
+                        time_of_status: first_row.time_of_status,
+                    })
+                });
+
+            if let Some(svcs) = svcs_in_maintenance {
+                health_monitor.smf_services_in_maintenance = svcs
+            };
+
             let sled_agent = nexus_types::inventory::SledAgent {
                 time_collected: s.time_collected,
                 source: s.source,
@@ -4284,9 +4446,7 @@
                 reconciler_status,
                 last_reconciliation,
                 file_source_resolver,
-                // TODO-K[omicron#9516]: Actually query the DB when there is
-                // something there
-                health_monitor: HealthMonitorInventory::new(),
+                health_monitor,
             };
             sled_agents
                 .insert_unique(sled_agent)
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs
index dc4af105407..1ae68a4f12c 100644
--- a/nexus/db-schema/src/schema.rs
+++ b/nexus/db-schema/src/schema.rs
@@ -1721,6 +1721,20 @@ table! {
     }
 }
 
+table! {
+    inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) {
+        inv_collection_id -> Uuid,
+        sled_id -> Uuid,
+        id -> Uuid,
+        fmri -> Nullable<Text>,
+        zone -> Nullable<Text>,
+
+        error_messages -> Array<Text>,
+        svcs_cmd_error -> Nullable<Text>,
+        time_of_status -> Nullable<Timestamptz>,
+    }
+}
+
 table! {
     inv_sled_boot_partition (inv_collection_id, sled_id, boot_disk_slot) {
         inv_collection_id -> Uuid,
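Note: with the `table!` mapping in place, ad-hoc reads against the new table follow the same diesel pattern used in the datastore above. A minimal un-paginated sketch, assuming an async connection `conn` and a collection id `db_id` in scope:

    use async_bb8_diesel::AsyncRunQueryDsl;
    use diesel::prelude::*;
    use nexus_db_model::InvSvcInMaintenance;
    use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl;

    // Load every maintenance row recorded for one inventory collection.
    let rows: Vec<InvSvcInMaintenance> = dsl::inv_health_monitor_svc_in_maintenance
        .filter(dsl::inv_collection_id.eq(db_id))
        .select(InvSvcInMaintenance::as_select())
        .load_async(&*conn)
        .await?;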
diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs
index 5c0ca382e40..51de5fa8ba1 100644
--- a/nexus/inventory/src/examples.rs
+++ b/nexus/inventory/src/examples.rs
@@ -16,6 +16,8 @@ use gateway_client::types::SpComponentCaboose;
 use gateway_client::types::SpState;
 use gateway_types::rot::RotSlot;
 use iddqd::id_ord_map;
+use illumos_utils::svcs::SvcInMaintenance;
+use illumos_utils::svcs::SvcsInMaintenanceResult;
 use nexus_types::inventory::CabooseWhich;
 use nexus_types::inventory::InternalDnsGenerationStatus;
 use nexus_types::inventory::RotPage;
@@ -581,6 +583,7 @@
                     has_mupdate_override: true,
                 },
             ),
+            HealthMonitorInventory::new(),
         ),
     )
     .unwrap();
@@ -615,6 +618,7 @@
                     has_mupdate_override: false,
                 },
             ),
+            HealthMonitorInventory::new(),
         ),
     )
     .unwrap();
@@ -647,13 +651,14 @@
                     has_mupdate_override: true,
                 },
             ),
+            HealthMonitorInventory::new(),
         ),
     )
     .unwrap();
 
     // Finally, report a sled with unknown baseboard information. This should
     // look the same as the PC as far as inventory is concerned but let's verify
-    // it.
+    // it. Additionally, this sled will report an SMF service in maintenance.
 
     let sled_agent_id_unknown =
         "5c5b4cf9-3e13-45fd-871c-f177d6537510".parse().unwrap();
@@ -674,6 +679,18 @@
             file_source_resolver(
                 OmicronFileSourceResolverExampleKind::Error,
             ),
+            HealthMonitorInventory {
+                smf_services_in_maintenance: Ok(SvcsInMaintenanceResult {
+                    services: vec![SvcInMaintenance {
+                        fmri: "svc:/site/fake-service:default".to_string(),
+                        zone: "global".to_string(),
+                    }],
+                    errors: vec![],
+                    time_of_status: Some(
+                        "2026-01-01T00:00:00Z".parse().unwrap(),
+                    ),
+                }),
+            },
         ),
     )
     .unwrap();
@@ -1015,6 +1032,7 @@ pub fn sled_agent(
     datasets: Vec<InventoryDataset>,
     ledgered_sled_config: Option<OmicronSledConfig>,
     file_source_resolver: OmicronFileSourceResolverInventory,
+    health_monitor: HealthMonitorInventory,
 ) -> Inventory {
     // Assume the `ledgered_sled_config` was reconciled successfully.
     let last_reconciliation = ledgered_sled_config.clone().map(|config| {
@@ -1087,9 +1105,6 @@
         reconciler_status,
         last_reconciliation,
         file_source_resolver,
-        // TODO-K: We'll want to have the functionality to add some services
-        // here in a future PR. This will be more useful when we add this
-        // information to the DB.
-        health_monitor: HealthMonitorInventory::new(),
+        health_monitor,
     }
 }
diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs
index 9416d4c9f8d..71e84beffcb 100644
--- a/nexus/types/src/inventory/display.rs
+++ b/nexus/types/src/inventory/display.rs
@@ -14,12 +14,14 @@ use chrono::SecondsFormat;
 use clap::Subcommand;
 use gateway_types::component::SpType;
 use iddqd::IdOrdMap;
+use illumos_utils::svcs::SvcsInMaintenanceResult;
 use indent_write::fmt::IndentWriter;
 use itertools::Itertools;
 use omicron_common::disk::M2Slot;
 use omicron_uuid_kinds::{
     DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid,
 };
+use sled_agent_types::inventory::HealthMonitorInventory;
 use sled_agent_types_versions::latest::inventory::{
     BootImageHeader, BootPartitionContents, BootPartitionDetails,
     ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
@@ -907,41 +909,8 @@ fn display_sleds(
             }
         }
 
-        // TODO-K[omicron#9516]: This is temporarily hidden until we add the
-        // health monitor types to the DB. Once those have been integrated,
-        // we'll show health monitor status when everything is healthy as well.
-        if !health_monitor.is_empty() {
-            writeln!(indented, "HEALTH MONITOR")?;
-            let mut indent2 = IndentWriter::new("    ", &mut indented);
-            match &health_monitor.smf_services_in_maintenance {
-                Ok(svcs) => {
-                    if !svcs.is_empty() {
-                        if let Some(time_of_status) = &svcs.time_of_status {
-                            writeln!(
-                                indent2,
-                                "SMF services in maintenance at {}:",
-                                time_of_status.to_rfc3339_opts(
-                                    SecondsFormat::Millis,
-                                    /* use_z */ true,
-                                )
-                            )?;
-                        }
-                        let mut indent3 = IndentWriter::new("    ", &mut indent2);
-                        for svc in &svcs.services {
-                            writeln!(indent3, "{svc}")?;
-                        }
-                    }
-                }
-                Err(e) => {
-                    writeln!(
-                        indent2,
-                        "failed to retrieve SMF services in maintenance: {e}"
-                    )?;
-                }
-            }
-        }
-
         f = indented.into_inner();
+        display_health_monitor(health_monitor, f)?;
     }
     Ok(())
 }
@@ -1133,6 +1102,85 @@ fn collect_config_reconciler_errors(
         .collect()
 }
 
+fn display_health_monitor(
+    health_monitor: &HealthMonitorInventory,
+    f: &mut dyn fmt::Write,
+) -> fmt::Result {
+    let HealthMonitorInventory { smf_services_in_maintenance } = health_monitor;
+
+    writeln!(f, "\nHEALTH MONITOR")?;
+
+    let mut indented = IndentWriter::new("    ", f);
+
+    match &smf_services_in_maintenance {
+        Ok(svcs) => {
+            if !svcs.is_empty() {
+                let SvcsInMaintenanceResult {
+                    services,
+                    errors,
+                    time_of_status,
+                } = svcs;
+                let time = if let Some(t) = time_of_status {
+                    t.to_rfc3339_opts(
+                        SecondsFormat::Millis,
+                        /* use_z */ true,
+                    )
+                } else {
+                    "unknown time".to_string()
+                };
+
+                writeln!(
+                    indented,
+                    "{} SMF services in maintenance at {}",
+                    services.len(),
+                    time
+                )?;
+
+                if !services.is_empty() {
+                    #[derive(Tabled)]
+                    #[tabled(rename_all = "SCREAMING_SNAKE_CASE")]
+                    struct SvcRow {
+                        fmri: String,
+                        zone: String,
+                    }
+                    let rows = services.iter().map(|s| SvcRow {
+                        fmri: s.fmri.clone(),
+                        zone: s.zone.clone(),
+                    });
+                    let table = tabled::Table::new(rows)
+                        .with(tabled::settings::Style::empty())
+                        .with(tabled::settings::Padding::new(4, 1, 0, 0))
+                        .to_string();
+                    writeln!(indented, "{table}")?;
+                };
+                if !errors.is_empty() {
+                    writeln!(
+                        indented,
+                        "\nfound errors when retrieving services in maintenance:"
+                    )?;
+                    let mut indent2 = IndentWriter::new("    ", &mut indented);
+                    for e in errors {
+                        writeln!(indent2, "{e}")?;
+                    }
+                }
+            } else {
+                writeln!(
+                    indented,
+                    "no data on SMF services in maintenance has been collected"
+                )?;
+            }
+        }
+        Err(e) => {
+            writeln!(
+                indented,
+                "failed to retrieve SMF services in maintenance: {e}"
+            )?;
+        }
+    };
+
+    Ok(())
+}
+
 fn display_sled_config(
     label: &str,
     config: &OmicronSledConfig,
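Note: for a sled reporting one service in maintenance (as in the examples.rs seed data above), the new function renders roughly the following; exact column widths come from the `tabled` padding settings and are illustrative:

    HEALTH MONITOR
        1 SMF services in maintenance at 2026-01-01T00:00:00.000Z
            FMRI                             ZONE
            svc:/site/fake-service:default   global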
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index fe8d79d803a..14c7d9b1a70 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -4060,6 +4060,37 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent (
     PRIMARY KEY (inv_collection_id, sled_id)
 );
 
+CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance (
+    -- where this observation came from
+    -- (foreign key into `inv_collection` table)
+    inv_collection_id UUID NOT NULL,
+
+    -- unique id for this sled (should be a foreign key into the `sled` table,
+    -- though it's conceivable a sled will report an id that we don't know
+    -- about); guaranteed to match a row in this collection's `inv_sled_agent`
+    sled_id UUID NOT NULL,
+
+    -- unique id for each row
+    id UUID NOT NULL,
+
+    -- FMRI of the SMF service in maintenance
+    fmri TEXT,
+
+    -- zone the SMF service in maintenance is located in
+    zone TEXT,
+
+    -- any error messages found when retrieving the SMF services in maintenance
+    error_messages TEXT ARRAY NOT NULL,
+
+    -- error when calling the svcs command
+    svcs_cmd_error TEXT,
+
+    -- time when the status was checked, if applicable
+    time_of_status TIMESTAMPTZ,
+
+    PRIMARY KEY (inv_collection_id, sled_id, id)
+);
+
 -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in
 -- the future.
 CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success
@@ -7781,7 +7812,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '219.0.0', NULL)
+    (TRUE, NOW(), NOW(), '220.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql
new file mode 100644
index 00000000000..bf0b956d23e
--- /dev/null
+++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql
@@ -0,0 +1,30 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance (
+    -- where this observation came from
+    -- (foreign key into `inv_collection` table)
+    inv_collection_id UUID NOT NULL,
+
+    -- unique id for this sled (should be a foreign key into the `sled` table,
+    -- though it's conceivable a sled will report an id that we don't know
+    -- about); guaranteed to match a row in this collection's `inv_sled_agent`
+    sled_id UUID NOT NULL,
+
+    -- unique id for each row
+    id UUID NOT NULL,
+
+    -- FMRI of the SMF service in maintenance
+    fmri TEXT,
+
+    -- zone the SMF service in maintenance is located in
+    zone TEXT,
+
+    -- any error messages found when retrieving the SMF services in maintenance
+    error_messages TEXT ARRAY NOT NULL,
+
+    -- error when calling the svcs command
+    svcs_cmd_error TEXT,
+
+    -- time when the status was checked, if applicable
+    time_of_status TIMESTAMPTZ,
+
+    PRIMARY KEY (inv_collection_id, sled_id, id)
+);
\ No newline at end of file
diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs
index 2eb483166e9..a5dbedf77be 100644
--- a/uuid-kinds/src/lib.rs
+++ b/uuid-kinds/src/lib.rs
@@ -84,6 +84,9 @@ impl_typed_uuid_kinds! {
     Sled = {},
     SpUpdate = {},
     SupportBundle = {},
+    // `SvcInMaintenance`s do not contain IDs themselves. These IDs exist
+    // for the same reason as those in `OmicronSledConfig`.
+    SvcInMaintenance = {},
     TufArtifact = {},
     TufRepo = {},
     TufTrustRoot = {},
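Note: the new UUID kind behaves like the other typed UUIDs in this list. Mirroring the constructor in `InvSvcInMaintenance::new`, a sketch:

    use omicron_uuid_kinds::{GenericUuid, SvcInMaintenanceUuid};
    use uuid::Uuid;

    // A fresh typed ID, generated the same way the db-model constructor
    // does; it exists only to give each row a primary key.
    let id = SvcInMaintenanceUuid::from_untyped_uuid(Uuid::new_v4());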