diff --git a/Cargo.lock b/Cargo.lock index dff93add700..37688d06ad4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8690,6 +8690,7 @@ dependencies = [ "thiserror 2.0.18", "tokio", "tokio-postgres", + "tokio-test", "tokio-util", "tough", "trust-quorum-types", @@ -15042,6 +15043,17 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "tokio-test" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6d24790a10a7af737693a3e8f1d03faef7e6ca0cc99aae5066f533766de545" +dependencies = [ + "futures-core", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-tungstenite" version = "0.21.0" diff --git a/Cargo.toml b/Cargo.toml index 2dae5076180..d80e7c74919 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -826,6 +826,7 @@ tofino = { git = "https://github.com/oxidecomputer/tofino" } tokio = "1.47.0" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.17" +tokio-test = "0.4.5" tokio-tungstenite = "0.23.1" tokio-util = { version = "0.7.15", features = ["io", "io-util", "time"] } toml = "0.8.23" diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d56f426f915..c5ddaeb7e08 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -59,6 +59,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStats; use nexus_types::internal_api::background::BlueprintRendezvousStatus; use nexus_types::internal_api::background::DatasetsRendezvousStats; use nexus_types::internal_api::background::EreporterStatus; +use nexus_types::internal_api::background::FmAnalysisStatus; use nexus_types::internal_api::background::FmRendezvousStatus; use nexus_types::internal_api::background::InstanceReincarnationStatus; use nexus_types::internal_api::background::InstanceUpdaterStatus; @@ -1334,6 +1335,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "webhook_deliverator" => { 
print_task_webhook_deliverator(details); } + "fm_analysis" => { + print_task_fm_analysis(details); + } "fm_sitrep_loader" => { print_task_fm_sitrep_loader(details); } @@ -3430,6 +3434,82 @@ mod ereporter_status_fields { pub const NUM_WIDTH: usize = 4; } +fn print_task_fm_analysis(details: &serde_json::Value) { + use nexus_types::internal_api::background::fm_analysis::{ + AnalysisOutcome, Outcome, PreparationStatus, + }; + let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } = + match serde_json::from_value::(details.clone()) { + Err(error) => { + eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ); + return; + } + Ok(status) => status, + }; + pub const PARENT_SITREP_ID: &str = "parent sitrep ID:"; + pub const INV_ID: &str = "current inventory collection ID:"; + pub const WIDTH: usize = const_max_len(&[PARENT_SITREP_ID, INV_ID]) + 1; + println!(" {PARENT_SITREP_ID: { + println!( + " analysis was not performed, as the inventory has\n \ + not yet been loaded.\n\ + (i) note: this should only happen if Nexus has just started.", + ); + return; + } + Outcome::PreparationError(error) => { + println!( + "{ERRICON} failed to prepare analysis inputs:\n {error}" + ); + return; + } + Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome), + }; + match analysis_outcome { + AnalysisOutcome::Error(error) => { + println!("{ERRICON} analysis failed: {error}"); + } + AnalysisOutcome::Unchanged => { + println!( + " no changes from the current situation report ({:?})", + parent_sitrep_id + ); + } + AnalysisOutcome::NotCommitted { sitrep_id, error } => { + println!( + " analysis succeeded, but the sitrep was not committed!" 
+ ); + println!(" sitrep ID: {sitrep_id:?}"); + println!(" error: {error}"); + } + AnalysisOutcome::Committed { sitrep_id } => { + println!(" analyzed the situation, and committed a new sitrep!"); + println!(" sitrep ID: {sitrep_id:?}"); + } + } + println!(); + + let PreparationStatus { errors, report } = prep_status; + println!("{}", report.display_multiline(4)); + if !errors.is_empty() { + println!("{ERRICON} errors preparing analysis inputs:"); + for error in errors { + println!(" > {error}") + } + } + + // TODO(eliza): eventually there will also be a detailed analysis report, + // print that here as well... +} + fn print_task_fm_sitrep_loader(details: &serde_json::Value) { match serde_json::from_value::(details.clone()) { Err(error) => eprintln!( diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index ee3e3bd841a..f6457634107 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -112,6 +112,10 @@ task: "external_endpoints" on each one +task: "fm_analysis" + performs fault management analysis and updates the sitrep + + task: "fm_rendezvous" updates externally visible database tables to match the current fault management sitrep @@ -371,6 +375,10 @@ task: "external_endpoints" on each one +task: "fm_analysis" + performs fault management analysis and updates the sitrep + + task: "fm_rendezvous" updates externally visible database tables to match the current fault management sitrep @@ -617,6 +625,10 @@ task: "external_endpoints" on each one +task: "fm_analysis" + performs fault management analysis and updates the sitrep + + task: "fm_rendezvous" updates externally visible database tables to match the current fault management sitrep diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 0fbcfe948aa..1cdbe382a19 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -347,6 +347,10 @@ task: "external_endpoints" on each one +task: "fm_analysis" + 
performs fault management analysis and updates the sitrep + + task: "fm_rendezvous" updates externally visible database tables to match the current fault management sitrep @@ -687,6 +691,24 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_analysis" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + parent sitrep ID: None + current inventory collection ID: Some(..................... (collection)) + FAULT MANAGEMENT ANALYSIS SUMMARY + ===== ========== ======== ======= +/!\ analysis failed: FM analysis is not yet implemented + + fault management analysis inputs + ----- ---------- -------- ------ + parent sitrep: + inventory collection: ..................... + no new ereports since the parent sitrep + no cases copied forward + + task: "fm_rendezvous" configured period: every m last completed activation: , triggered by @@ -1320,6 +1342,24 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_analysis" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + parent sitrep ID: None + current inventory collection ID: Some(..................... (collection)) + FAULT MANAGEMENT ANALYSIS SUMMARY + ===== ========== ======== ======= +/!\ analysis failed: FM analysis is not yet implemented + + fault management analysis inputs + ----- ---------- -------- ------ + parent sitrep: + inventory collection: ..................... 
+ no new ereports since the parent sitrep + no cases copied forward + + task: "fm_rendezvous" configured period: every m last completed activation: , triggered by diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 08b8f0b001d..c877645a239 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -971,6 +971,10 @@ impl Default for MulticastGroupReconcilerConfig { #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct FmTasksConfig { + /// period (in seconds) for periodic activations of the background task that + /// drives fault management analysis. + #[serde_as(as = "DurationSeconds")] + pub analysis_period_secs: Duration, /// period (in seconds) for periodic activations of the background task that /// reads the latest fault management sitrep from the database. #[serde_as(as = "DurationSeconds")] @@ -989,6 +993,10 @@ pub struct FmTasksConfig { impl Default for FmTasksConfig { fn default() -> Self { Self { + // Analysis is generally triggered by changes in the current sitrep, + // inventory, or by the ereport ingester(s), so it need not be + // periodically activated all that frequently. 
+ analysis_period_secs: Duration::from_secs(60), sitrep_load_period_secs: Duration::from_secs(15), // This need not be activated very frequently, as it's triggered any // time the current sitrep changes, and activating it more @@ -1310,6 +1318,7 @@ mod test { probe_distributor.period_secs = 50 multicast_reconciler.period_secs = 60 fm.rendezvous_period_secs = 51 + fm.analysis_period_secs = 52 trust_quorum.period_secs = 60 attached_subnet_manager.period_secs = 60 session_cleanup.period_secs = 300 @@ -1566,6 +1575,7 @@ mod test { disable: false, }, fm: FmTasksConfig { + analysis_period_secs: Duration::from_secs(52), sitrep_load_period_secs: Duration::from_secs(48), sitrep_gc_period_secs: Duration::from_secs(49), rendezvous_period_secs: Duration::from_secs(51), @@ -1702,6 +1712,7 @@ mod test { fm.sitrep_gc_period_secs = 46 probe_distributor.period_secs = 47 fm.rendezvous_period_secs = 48 + fm.analysis_period_secs = 49 multicast_reconciler.period_secs = 60 trust_quorum.period_secs = 60 attached_subnet_manager.period_secs = 60 diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 2ff91e834b5..1e37ca90f48 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -138,6 +138,7 @@ nexus-db-lookup.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true nexus-db-schema.workspace = true +nexus-fm.workspace = true nexus-inventory.workspace = true nexus-metrics-producer-gc.workspace = true nexus-reconfigurator-execution.workspace = true @@ -200,6 +201,7 @@ sp-sim.workspace = true strum.workspace = true subprocess.workspace = true term.workspace = true +tokio-test.workspace = true tufaceous.workspace = true tufaceous-artifact.workspace = true tufaceous-lib.workspace = true diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 91902e95c93..62b9f0c66df 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -53,6 +53,7 @@ pub struct BackgroundTasks 
{ pub task_webhook_deliverator: Activator, pub task_sp_ereport_ingester: Activator, pub task_reconfigurator_config_loader: Activator, + pub task_fm_analysis: Activator, pub task_fm_rendezvous: Activator, pub task_fm_sitrep_loader: Activator, pub task_fm_sitrep_gc: Activator, diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 1922df26a30..db92dc87a08 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -53,10 +53,13 @@ impl CaseMetadata { ) -> Self { let fm::Case { id, - created_sitrep_id, - closed_sitrep_id, - de, - comment, + metadata: + fm::case::Metadata { + created_sitrep_id, + closed_sitrep_id, + de, + comment, + }, alerts_requested: _, support_bundles_requested: _, ereports: _, diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 24f89ebca80..54aa6f6418d 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -319,20 +319,30 @@ impl DataStore { Ok((created, latest)) } - pub async fn ereports_list_unseen( + /// Lists ereports which have not been marked as **definitely seen** + /// (included in a committed sitrep) in the database, paginated by the + /// reporter restart ID and ENA. + /// + /// Note that this filters based only on whether they have been marked in + /// the database. Because marking seen ereports occurs asynchronously from + /// committing sitreps as part of FM rendezvous, ereports returned by this + /// query may have already been seen. These ereports must be filtered out at + /// a higher level based on the contents of the current sitrep when + /// determining which ereports are *actually* new. + pub async fn ereports_list_unmarked( &self, opctx: &OpContext, pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> ListResultVec { // TODO(eliza): ereports should probably have their own resource type someday... 
opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - Self::ereports_list_unseen_query(pagparams) + Self::ereports_list_unmarked_query(pagparams) .load_async(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - fn ereports_list_unseen_query( + fn ereports_list_unmarked_query( pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> impl RunnableQuery + use<> { paginated_multicolumn( @@ -671,23 +681,24 @@ mod tests { } #[tokio::test] - async fn expectorate_ereports_list_unseen() { + async fn expectorate_ereports_list_unmarked() { let pagparams = DataPageParams { marker: None, direction: PaginationOrder::Ascending, limit: NonZeroU32::new(100).unwrap(), }; - let query = DataStore::ereports_list_unseen_query(&pagparams); + let query = DataStore::ereports_list_unmarked_query(&pagparams); expectorate_query_contents( &query, - "tests/output/ereports_list_unseen.sql", + "tests/output/ereports_list_unmarked.sql", ) .await; } #[tokio::test] - async fn explain_ereports_list_unseen_query() { - let logctx = dev::test_setup_log("explain_ereports_list_unseen_query"); + async fn explain_ereports_list_unmarked_query() { + let logctx = + dev::test_setup_log("explain_ereports_list_unmarked_query"); let db = TestDatabase::new_with_pool(&logctx.log).await; let pool = db.pool(); let conn = pool.claim().await.unwrap(); @@ -697,7 +708,7 @@ mod tests { direction: PaginationOrder::Ascending, limit: NonZeroU32::new(100).unwrap(), }; - let query = DataStore::ereports_list_unseen_query(&pagparams); + let query = DataStore::ereports_list_unmarked_query(&pagparams); let explanation = query .explain_async(&conn) .await diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index fffd4df7877..0f5531f72df 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -370,12 +370,14 @@ impl DataStore { 
alert_requests.remove(&id).unwrap_or_default(); fm::Case { id, - created_sitrep_id: created_sitrep_id.into(), - closed_sitrep_id: closed_sitrep_id.map(Into::into), - de: de.into(), - comment, - ereports, + metadata: fm::case::Metadata { + created_sitrep_id: created_sitrep_id.into(), + closed_sitrep_id: closed_sitrep_id.map(Into::into), + de: de.into(), + comment, + }, alerts_requested, + ereports, support_bundles_requested: iddqd::IdOrdMap::new(), } })); @@ -1555,10 +1557,13 @@ mod tests { for case in &that.cases { let fm::Case { id, - created_sitrep_id, - closed_sitrep_id, - comment, - de, + metadata: + fm::case::Metadata { + created_sitrep_id, + closed_sitrep_id, + comment, + de, + }, ereports, alerts_requested, support_bundles_requested: _, @@ -1578,18 +1583,21 @@ mod tests { // :( assert_eq!(&expected.id, id, "while checking case {case_id}"); assert_eq!( - &expected.created_sitrep_id, created_sitrep_id, + &expected.metadata.created_sitrep_id, created_sitrep_id, + "while checking case {case_id}" + ); + assert_eq!( + &expected.metadata.closed_sitrep_id, closed_sitrep_id, "while checking case {case_id}" ); assert_eq!( - &expected.closed_sitrep_id, closed_sitrep_id, + &expected.metadata.comment, comment, "while checking case {case_id}" ); assert_eq!( - &expected.comment, comment, + &expected.metadata.de, de, "while checking case {case_id}" ); - assert_eq!(&expected.de, de, "while checking case {case_id}"); // Now, check that all the ereports are present in both cases. 
assert_eq!(ereports.len(), expected.ereports.len()); @@ -1776,13 +1784,15 @@ mod tests { fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), - created_sitrep_id: sitrep_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "my cool case".to_string(), + }, ereports, alerts_requested, support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "my cool case".to_string(), } }; @@ -1809,13 +1819,15 @@ mod tests { fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), - created_sitrep_id: sitrep_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "break in case of emergency".to_string(), + }, ereports, alerts_requested, support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "break in case of emergency".to_string(), } }; diff --git a/nexus/db-queries/tests/output/ereports_list_unseen.sql b/nexus/db-queries/tests/output/ereports_list_unmarked.sql similarity index 100% rename from nexus/db-queries/tests/output/ereports_list_unseen.sql rename to nexus/db-queries/tests/output/ereports_list_unmarked.sql diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 72ed64e0222..3c1a1a3700a 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -174,6 +174,8 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +# How frequently to run analysis from the current sitrep. +fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, # as it does not impact the responsiveness of the fault management system, and # is activated every time the current sitrep changes. 
Periodic activations are diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index d5cf0b1a918..b4026bfb1de 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -158,6 +158,8 @@ sp_ereport_ingester.period_secs = 30 # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +# How frequently to run analysis from the current sitrep. +fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, # as it does not impact the responsiveness of the fault management system, and # is activated every time the current sitrep changes. Periodic activations are diff --git a/nexus/fm/src/analysis_input.rs b/nexus/fm/src/analysis_input.rs new file mode 100644 index 00000000000..a52e588e9bd --- /dev/null +++ b/nexus/fm/src/analysis_input.rs @@ -0,0 +1,524 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Inputs to fault management analysis. + +use iddqd::IdOrdMap; +use nexus_types::fm::{self, ClosedCaseReport, Sitrep, SitrepVersion}; +use nexus_types::inventory; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; + +pub use nexus_types::fm::AnalysisInputReport as Report; + +/// A complete set of inputs to a fault management analysis phase. +/// +/// This struct bundles together all the inputs to analysis, including: +/// +/// - The [parent sitrep](Input::parent_sitrep) +/// - The current [inventory collection](Input::inventory) +/// - Any [new ereports](Input::new_ereports) which were received since when +/// the parent sitrep was produced +/// - The set of [cases](Input::cases) which must be copied forwards into +/// the next sitrep +/// +/// This type represents the outputs of the analysis preparation phase. 
Once it +/// is constructed, the inputs are immutable and cannot be modified. To +/// construct a new `Input` as part of a preparation phase, use +/// [`Input::builder`]. +pub struct Input { + parent_sitrep: Option>, + inv: Arc, + /// Ereports which are new and should be input to analysis in the next + /// sitrep. + new_ereports: IdOrdMap, + open_cases: IdOrdMap, + closed_cases_copied_forward: IdOrdMap, +} + +impl Input { + pub fn parent_sitrep(&self) -> Option<&Sitrep> { + self.parent_sitrep.as_ref().map(|s| &s.1) + } + + pub fn inventory(&self) -> &inventory::Collection { + &self.inv + } + + pub fn new_ereports(&self) -> &IdOrdMap { + &self.new_ereports + } + + pub fn cases(&self) -> &IdOrdMap { + &self.open_cases + } + + pub(crate) fn closed_cases_copied_forward(&self) -> &IdOrdMap { + &self.closed_cases_copied_forward + } + + /// Returns a [`Builder`] for constructing a new `Input` from the provided + /// `parent_sitrep` and inventory collection. + pub fn builder( + parent_sitrep: Option>, + inv: Arc, + ) -> Builder { + Builder { + parent_sitrep, + inv, + new_ereports: IdOrdMap::default(), + unmarked_seen_ereports: BTreeSet::default(), + } + } +} + +#[must_use] +pub struct Builder { + parent_sitrep: Option>, + inv: Arc, + /// Ereports which are new and should be input to analysis in the next + /// sitrep. + new_ereports: IdOrdMap, + + /// The IDs of any ereports which have been included in the parent sitrep, + /// but which have *not* yet been marked as seen in the database. + /// + /// These must be tracked in order to determine which closed cases must be + /// copied forwards due to containing unmarked ereports. + unmarked_seen_ereports: BTreeSet, +} + +impl Builder { + /// Adds a set of ereports which have not been marked as "seen" in the + /// database to the inputs under construction.
+ /// + /// This will filter out any ereports which are present in the parent sitrep + /// and have not yet been marked in the database, and then add any ereports + /// which remain to the set of ereports which are actually new and should be + /// included in the inputs to the next sitrep. + pub fn add_unmarked_ereports( + &mut self, + ereports: impl IntoIterator, + ) { + let parent_sitrep = self.parent_sitrep.as_ref().map(|s| &s.1); + self.new_ereports.extend(ereports.into_iter().filter_map(|ereport| { + if let Some(sitrep) = parent_sitrep { + let id = ereport.id(); + if sitrep.ereports_by_id.contains_key(&id) { + self.unmarked_seen_ereports.insert(*id); + return None; + } + } + + Some(ereport) + })) + } + + pub fn num_ereports(&self) -> usize { + self.new_ereports.len() + } + + /// Finish constructing the [`Input`] and return it, along with a [`Report`] + /// that provides a human-readable summary of how the inputs were + /// constructed. + pub fn build(self) -> (Input, Report) { + let parent_sitrep = self.parent_sitrep.as_ref().map(|s| &s.1); + let (parent_sitrep_id, parent_inv_id) = match parent_sitrep { + Some(sitrep) => { + let id = sitrep.id(); + let inv_id = sitrep.metadata.inv_collection_id; + (Some(id), Some(inv_id)) + } + None => (None, None), + }; + + let mut report = Report { + parent_sitrep_id, + parent_inv_id, + inv_id: self.inv.id, + new_ereport_ids: self + .new_ereports + .iter() + .map(|e| *e.id()) + .collect(), + open_cases: BTreeMap::new(), + closed_cases_copied_forward: BTreeMap::new(), + }; + + // Determine which cases must be copied forwards into the next sitrep. + // Cases from the parent sitrep should be copied forwards if: + // - The case is still open + let mut open_cases = IdOrdMap::new(); + // - The case has been closed, but it contains an ereport which has not + // yet been marked as "seen" in the database. 
+ let mut closed_cases_copied_forward = IdOrdMap::new(); + for case in parent_sitrep.iter().flat_map(|s| s.cases.iter()) { + if case.is_open() { + report.open_cases.insert(case.id, case.metadata.clone()); + open_cases.insert_unique(case.clone()).expect( + "the case UUID is coming from iterating over another \ + `IdOrdMap`, so it must be unique", + ); + } else { + let unmarked_ereports = case + .ereports + .iter() + .filter_map(|ereport| { + let id = ereport.ereport_id(); + if self.unmarked_seen_ereports.contains(&id) { + Some(*id) + } else { + None + } + }) + .collect::>(); + if !unmarked_ereports.is_empty() { + report.closed_cases_copied_forward.insert( + case.id, + ClosedCaseReport { + metadata: case.metadata.clone(), + unmarked_ereports, + }, + ); + closed_cases_copied_forward.insert_unique(case.clone()).expect( + "the case UUID is coming from iterating over another \ + `IdOrdMap`, so it must be unique", + ); + } + } + } + let input = Input { + parent_sitrep: self.parent_sitrep.clone(), + inv: self.inv.clone(), + new_ereports: self.new_ereports, + open_cases, + closed_cases_copied_forward, + }; + + (input, report) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::SitrepBuilder; + use crate::test_util::FmTest; + use nexus_types::fm; + use nexus_types::fm::case::CaseEreport; + use nexus_types::fm::ereport::Reporter; + use nexus_types::fm::{DiagnosisEngineKind, SitrepVersion}; + use nexus_types::inventory::SpType; + use omicron_uuid_kinds::{ + CaseEreportUuid, CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, + }; + use std::sync::Arc; + + /// Wraps an `fm::Ereport` in a `fm::case::CaseEreport` for insertion into + /// a case's `ereports` map. 
+ fn case_ereport( + ereport: &Arc, + sitrep_id: SitrepUuid, + ) -> CaseEreport { + CaseEreport { + id: CaseEreportUuid::new_v4(), + ereport: ereport.clone(), + assigned_sitrep_id: sitrep_id, + comment: "test assignment".to_string(), + } + } + + /// This test exercises the core filtering and copy-forward logic in + /// [`super::Builder`] and [`crate::builder::SitrepBuilder`]. + /// + /// The scenario is: + /// + /// - The parent sitrep has three cases: + /// 1. An open case, with one ereport. + /// 2. A closed case whose ereport has NOT yet been marked seen in the + /// DB (i.e. it will be passed to `add_unmarked_ereports`). + /// 3. A closed case whose ereport HAS already been marked seen (i.e. it + /// will NOT be passed to `add_unmarked_ereports`). + /// + /// - Three ereports are passed to `add_unmarked_ereports`: + /// 1. The ereport from the open case (already in the parent sitrep). + /// 2. The ereport from the second closed case (already in the parent + /// sitrep). + /// 3. A brand-new ereport that has never appeared in any sitrep. 
+ #[test] + fn test_analysis_input_builder_and_sitrep_builder() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_analysis_input_builder_and_sitrep_builder", + ); + let log = &logctx.log; + + let mut fm_test = + FmTest::new("test_analysis_input_builder_and_sitrep_builder", log); + let (example_system, _blueprint) = fm_test.system_builder.build(); + let inv = Arc::new(example_system.collection); + + let now = chrono::Utc::now(); + let mut reporter = fm_test + .reporters + .reporter(Reporter::Sp { sp_type: SpType::Sled, slot: 0 }); + let ereport_in_open_case = + Arc::new(reporter.mk_ereport(now, Default::default())); + let ereport_in_closed_unmarked = + Arc::new(reporter.mk_ereport(now, Default::default())); + let ereport_in_closed_marked = + Arc::new(reporter.mk_ereport(now, Default::default())); + let ereport_new = + Arc::new(reporter.mk_ereport(now, Default::default())); + + // Make a parent sitrep, with three cases: + // + // 1. an open case + let open_case_id = CaseUuid::new_v4(); + // 2. a closed case with an unmarked ereport in it, + let closed_case_with_unmarked_id = CaseUuid::new_v4(); + // 3. a closed case with only marked ereports. 
+ let closed_case_without_unmarked_id = CaseUuid::new_v4(); + let parent_sitrep_id = SitrepUuid::new_v4(); + let parent_sitrep = { + let open_case = { + let created_sitrep_id = parent_sitrep_id; + fm::Case { + id: open_case_id, + metadata: fm::case::Metadata { + created_sitrep_id, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PowerShelf, + comment: "open case".to_string(), + }, + ereports: [ + case_ereport(&ereport_in_open_case, parent_sitrep_id), + case_ereport( + &ereport_in_closed_unmarked, + created_sitrep_id, + ), + ] + .into_iter() + .collect(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + } + }; + let closed_case_with_unmarked = { + let created_sitrep_id = SitrepUuid::new_v4(); + fm::Case { + id: closed_case_with_unmarked_id, + metadata: fm::case::Metadata { + created_sitrep_id, + closed_sitrep_id: Some(parent_sitrep_id), + de: DiagnosisEngineKind::PowerShelf, + comment: "closed case, has an unmarked ereport" + .to_string(), + }, + ereports: [ + case_ereport( + &ereport_in_closed_unmarked, + created_sitrep_id, + ), + case_ereport( + &ereport_in_closed_marked, + created_sitrep_id, + ), + ] + .into_iter() + .collect(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + } + }; + let closed_case_without_unmarked = { + let created_sitrep_id = SitrepUuid::new_v4(); + fm::Case { + id: closed_case_without_unmarked_id, + metadata: fm::case::Metadata { + created_sitrep_id, + closed_sitrep_id: Some(parent_sitrep_id), + de: DiagnosisEngineKind::PowerShelf, + comment: "closed case, no unmarked ereports" + .to_string(), + }, + ereports: [case_ereport( + &ereport_in_closed_marked, + created_sitrep_id, + )] + .into_iter() + .collect(), + alerts_requested: Default::default(), + support_bundles_requested: Default::default(), + } + }; + + let cases = [ + open_case, + closed_case_with_unmarked, + closed_case_without_unmarked, + ] + .into_iter() + .collect(); + + // ereports_by_id must 
contain all ereports referenced by cases in the + // sitrep — add_unmarked_ereports uses this map to detect which ereports + // have already appeared in the parent sitrep. + let ereports_by_id = [ + ereport_in_open_case.clone(), + ereport_in_closed_unmarked.clone(), + ereport_in_closed_marked.clone(), + ] + .into_iter() + .collect(); + + let sitrep = fm::Sitrep { + metadata: fm::SitrepMetadata { + id: parent_sitrep_id, + parent_sitrep_id: Some(SitrepUuid::new_v4()), + inv_collection_id: CollectionUuid::new_v4(), + creator_id: OmicronZoneUuid::new_v4(), + comment: "parent sitrep for test".to_string(), + time_created: chrono::Utc::now(), + }, + cases, + ereports_by_id, + }; + Arc::new(( + SitrepVersion { + id: parent_sitrep_id, + version: 420, + time_made_current: chrono::Utc::now(), + }, + sitrep, + )) + }; + + // Build analysis input + let (input, report) = { + let mut builder = Input::builder(Some(parent_sitrep), inv); + // Pass in three ereports: + // - one that is in the open case of the parent sitrep + // - one that is in the (to-be-copied-forward) closed case + // - one that is brand-new + // + // Notably, `ereport_in_closed_marked` is NOT passed here, + // simulating that it was already marked seen in the database. + builder.add_unmarked_ereports([ + (*ereport_in_open_case).clone(), + (*ereport_in_closed_unmarked).clone(), + (*ereport_new).clone(), + ]); + builder.build() + }; + dbg!(report); + + // Check the "new ereports" in the constructed input. 
+ assert!( + input.new_ereports().contains_key(ereport_new.id()), + "ereport_new should be in new_ereports (it was not in the parent \ + sitrep)" + ); + assert!( + !input.new_ereports().contains_key(ereport_in_open_case.id()), + "ereport_in_open_case should NOT be in new_ereports (it is \ + already associated with an open case in the parent sitrep)" + ); + assert!( + !input.new_ereports().contains_key(ereport_in_closed_unmarked.id()), + "ereport_in_closed_unmarked should NOT be in new_ereports (it is \ + already associated with a closed case in the parent sitrep)" + ); + + assert_eq!( + input.new_ereports().len(), + 1, + "exactly one new ereport (ereport_new) should be in new_ereports" + ); + + // Check which closed cases should be copied forward. + assert!( + input + .closed_cases_copied_forward() + .contains_key(&closed_case_with_unmarked_id), + "closed_case_with_unmarked should be in closed_cases_copied_forward \ + because it has an ereport that has not yet been marked seen" + ); + assert!( + !input + .closed_cases_copied_forward() + .contains_key(&closed_case_without_unmarked_id), + "closed_case_without_unmarked should NOT be in \ + closed_cases_copied_forward because all its ereports have been \ + marked seen" + ); + assert_eq!( + input.closed_cases_copied_forward().len(), + 1, + "exactly one closed case should be copied forward" + ); + + // Check the contents of open cases. + assert!( + input.cases().contains_key(&open_case_id), + "the open case from the parent sitrep should be in input.cases()" + ); + assert_eq!(input.cases().len(), 1, "exactly one case should be open"); + + // Start building a sitrep... + let mut sitrep_builder = + SitrepBuilder::new_with_rng(log, &input, fm_test.sitrep_rng); + + // The open case from the parent sitrep must be accessible via + // case_mut() so that the diagnosis engine can update it. 
+ assert!( + sitrep_builder.cases.case_mut(&open_case_id).is_some(), + "the open case should be accessible via case_mut()" + ); + assert!( + sitrep_builder + .cases + .case_mut(&closed_case_with_unmarked_id) + .is_none(), + "the closed_case_with_unmarked should NOT be accessible via \ + case_mut() (closed cases are not open for modification)" + ); + assert!( + sitrep_builder + .cases + .case_mut(&closed_case_without_unmarked_id) + .is_none(), + "the closed_case_without_unmarked should NOT be accessible via \ + case_mut() (closed cases are not open for modification)" + ); + + // Build the final sitrep + let output_sitrep = dbg!( + sitrep_builder.build(OmicronZoneUuid::new_v4(), chrono::Utc::now()) + ); + + assert!( + output_sitrep.cases.contains_key(&open_case_id), + "open case should be in the output sitrep's cases" + ); + assert!( + output_sitrep.cases.contains_key(&closed_case_with_unmarked_id), + "closed cases with unmarked ereports should be copied forward \ + into the output sitrep" + ); + assert!( + !output_sitrep.cases.contains_key(&closed_case_without_unmarked_id), + "closed cases WITHOUT unmarked ereports should NOT be copied \ + forward into the output sitrep" + ); + assert_eq!( + output_sitrep.cases.len(), + 2, + "the output sitrep should have exactly 2 cases: the open case and \ + the closed-but-copied-forward case" + ); + + logctx.cleanup_successful(); + } +} diff --git a/nexus/fm/src/builder.rs b/nexus/fm/src/builder.rs index 85c89c797fb..ca1c564e935 100644 --- a/nexus/fm/src/builder.rs +++ b/nexus/fm/src/builder.rs @@ -4,6 +4,8 @@ //! 
Sitrep builder +use crate::analysis_input; +use iddqd::IdOrdMap; use nexus_types::fm; use nexus_types::inventory; use omicron_uuid_kinds::OmicronZoneUuid; @@ -22,29 +24,23 @@ pub struct SitrepBuilder<'a> { pub parent_sitrep: Option<&'a fm::Sitrep>, pub sitrep_id: SitrepUuid, pub cases: case::AllCases, + closed_cases_copied_forward: &'a IdOrdMap, comment: String, } impl<'a> SitrepBuilder<'a> { - pub fn new( - log: &Logger, - inventory: &'a inventory::Collection, - parent_sitrep: Option<&'a fm::Sitrep>, - ) -> Self { - Self::new_with_rng( - log, - inventory, - parent_sitrep, - SitrepBuilderRng::from_entropy(), - ) + pub fn new(log: &Logger, inputs: &'a analysis_input::Input) -> Self { + Self::new_with_rng(log, inputs, SitrepBuilderRng::from_entropy()) } pub fn new_with_rng( log: &Logger, - inventory: &'a inventory::Collection, - parent_sitrep: Option<&'a fm::Sitrep>, + inputs: &'a analysis_input::Input, mut rng: SitrepBuilderRng, ) -> Self { + let parent_sitrep = inputs.parent_sitrep(); + let inventory = inputs.inventory(); + // TODO(eliza): should the RNG also be seeded with the parent sitrep // UUID and/or the Omicron zone UUID? Hmm. 
let sitrep_id = rng.sitrep_id(); @@ -54,13 +50,14 @@ impl<'a> SitrepBuilder<'a> { "inv_collection_id" => format!("{:?}", inventory.id), )); - let cases = - case::AllCases::new(log.clone(), sitrep_id, parent_sitrep, rng); + let cases = case::AllCases::new(log.clone(), sitrep_id, inputs, rng); + let closed_cases_copied_forward = inputs.closed_cases_copied_forward(); slog::info!( &log, - "preparing sitrep {sitrep_id:?}"; + "building sitrep {sitrep_id:?}"; "existing_open_cases" => cases.len(), + "closed_cases_copied_forward" => closed_cases_copied_forward.len(), ); SitrepBuilder { @@ -69,6 +66,7 @@ impl<'a> SitrepBuilder<'a> { inventory, parent_sitrep, comment: String::new(), + closed_cases_copied_forward, cases, } } @@ -91,11 +89,11 @@ impl<'a> SitrepBuilder<'a> { .cases .cases .into_iter() - .map(|case| { - let case = fm::Case::from(case); + .map(fm::Case::from) + .chain(self.closed_cases_copied_forward.iter().cloned()) + .inspect(|case| { ereports_by_id .extend(case.ereports.iter().map(|ce| ce.ereport.clone())); - case }) .collect(); fm::Sitrep { diff --git a/nexus/fm/src/builder/case.rs b/nexus/fm/src/builder/case.rs index f5b6a2d1419..9a502d4d764 100644 --- a/nexus/fm/src/builder/case.rs +++ b/nexus/fm/src/builder/case.rs @@ -31,14 +31,12 @@ impl AllCases { pub(super) fn new( log: slog::Logger, sitrep_id: SitrepUuid, - parent_sitrep: Option<&fm::Sitrep>, + inputs: &crate::analysis_input::Input, mut rng: rng::SitrepBuilderRng, ) -> Self { - // Copy forward any open cases from the parent sitrep. - // If a case was closed in the parent sitrep, skip it. 
- let cases: IdOrdMap<_> = parent_sitrep + let cases = inputs + .cases() .iter() - .flat_map(|s| s.open_cases()) .map(|case| { let rng = rng::CaseBuilderRng::new(case.id, &mut rng); CaseBuilder::new(&log, sitrep_id, case.clone(), rng) @@ -61,10 +59,12 @@ impl AllCases { iddqd::id_ord_map::Entry::Vacant(entry) => { let case = fm::Case { id, - created_sitrep_id: self.sitrep_id, - closed_sitrep_id: None, - de, - comment: String::new(), + metadata: fm::case::Metadata { + created_sitrep_id: self.sitrep_id, + closed_sitrep_id: None, + de, + comment: String::new(), + }, ereports: Default::default(), alerts_requested: Default::default(), support_bundles_requested: Default::default(), @@ -114,8 +114,8 @@ impl CaseBuilder { ) -> Self { let log = log.new(slog::o!( "case_id" => case.id.to_string(), - "de" => case.de.to_string(), - "created_sitrep_id" => case.created_sitrep_id.to_string(), + "de" => case.metadata.de.to_string(), + "created_sitrep_id" => case.metadata.created_sitrep_id.to_string(), )); Self { log, case, sitrep_id, rng } } @@ -149,7 +149,7 @@ impl CaseBuilder { } pub fn close(&mut self) { - self.case.closed_sitrep_id = Some(self.sitrep_id); + self.case.metadata.closed_sitrep_id = Some(self.sitrep_id); slog::info!(&self.log, "case closed"); } @@ -202,7 +202,7 @@ impl CaseBuilder { /// Mutably borrows the case's `comment` field (i.e. to append to it). 
pub fn comment_mut(&mut self) -> &mut String { - &mut self.case.comment + &mut self.case.metadata.comment } } diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index 069c2eb4dde..f5ff7da5b2c 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -6,6 +6,9 @@ pub mod builder; pub use builder::{CaseBuilder, SitrepBuilder}; +pub mod analysis_input; + +pub use nexus_types::fm::*; #[cfg(any(test, feature = "testing"))] pub mod test_util; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 75ae218c63a..822d24e950e 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -107,6 +107,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; +use super::tasks::fm_analysis::FmAnalysis; use super::tasks::fm_rendezvous::FmRendezvous; use super::tasks::fm_sitrep_gc; use super::tasks::fm_sitrep_load; @@ -267,6 +268,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator: Activator::new(), task_sp_ereport_ingester: Activator::new(), task_reconfigurator_config_loader: Activator::new(), + task_fm_analysis: Activator::new(), task_fm_sitrep_loader: Activator::new(), task_fm_sitrep_gc: Activator::new(), task_fm_rendezvous: Activator::new(), @@ -359,6 +361,7 @@ impl BackgroundTasksInitializer { task_webhook_deliverator, task_sp_ereport_ingester, task_reconfigurator_config_loader, + task_fm_analysis, task_fm_sitrep_loader, task_fm_sitrep_gc, task_fm_rendezvous, @@ -1109,6 +1112,7 @@ impl BackgroundTasksInitializer { datastore.clone(), resolver.clone(), nexus_id, + task_fm_analysis.clone(), config.sp_ereport_ingester.disable, )), opctx: opctx.child(BTreeMap::new()), @@ -1133,6 +1137,26 @@ impl BackgroundTasksInitializer { activator: task_fm_sitrep_loader, }); + let fm_analysis = FmAnalysis::new( + datastore.clone(), + sitrep_watcher.clone(), + inventory_load_watcher.clone(), + task_fm_sitrep_loader.clone(), + ); + 
driver.register(TaskDefinition { + name: "fm_analysis", + description: + "performs fault management analysis and updates the sitrep", + period: config.fm.analysis_period_secs, + task_impl: Box::new(fm_analysis), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![ + Box::new(sitrep_watcher.clone()), + Box::new(inventory_load_watcher.clone()), + ], + activator: task_fm_analysis, + }); + driver.register(TaskDefinition { name: "fm_rendezvous", description: diff --git a/nexus/src/app/background/tasks/ereport_ingester.rs b/nexus/src/app/background/tasks/ereport_ingester.rs index f2f03ec4d9f..0ecf759da32 100644 --- a/nexus/src/app/background/tasks/ereport_ingester.rs +++ b/nexus/src/app/background/tasks/ereport_ingester.rs @@ -9,6 +9,7 @@ //! //! [rfd520]: https://rfd.shared.oxide.computer/rfd/520#_determinations +use crate::app::background::Activator; use crate::app::background::BackgroundTask; use chrono::Utc; use ereport_types::Ena; @@ -30,6 +31,7 @@ use std::sync::Arc; pub struct SpEreportIngester { resolver: internal_dns_resolver::Resolver, + fm_analysis: Activator, disabled: bool, inner: Ingester, } @@ -58,9 +60,15 @@ impl SpEreportIngester { datastore: Arc, resolver: internal_dns_resolver::Resolver, nexus_id: OmicronZoneUuid, + fm_analysis: Activator, disabled: bool, ) -> Self { - Self { resolver, inner: Ingester { datastore, nexus_id }, disabled } + Self { + resolver, + inner: Ingester { datastore, nexus_id }, + fm_analysis, + disabled, + } } async fn actually_activate( @@ -143,6 +151,8 @@ impl SpEreportIngester { // TODO(eliza): what seems like an appropriate parallelism? should we // just do 16? 
let mut tasks = ParallelTaskSet::new(); + let mut total_ereports = 0; + let mut total_new_ereports = 0; for gateway_client::types::SpIdentifier { type_, slot } in sps { let sp_result = tasks @@ -163,6 +173,8 @@ impl SpEreportIngester { }) .await; if let Some(Some(sp_status)) = sp_result { + total_ereports += sp_status.status.ereports_received; + total_new_ereports += sp_status.status.new_ereports; status.sps.push(sp_status); } } @@ -170,10 +182,35 @@ impl SpEreportIngester { // Wait for remaining ingestion tasks to come back. while let Some(sp_result) = tasks.join_next().await { if let Some(sp_status) = sp_result { + total_ereports += sp_status.status.ereports_received; + total_new_ereports += sp_status.status.new_ereports; status.sps.push(sp_status); } } + // If any ereports were ingested that were not already in the database, + // trigger a new FM analysis run. + if total_new_ereports > 0 { + slog::info!( + opctx.log, + "ingested {total_ereports} ({total_new_ereports} new) \ + ereports from {} service processors", + status.sps.len(); + "total_ereports" => total_ereports, + "new_ereports" => total_new_ereports, + ); + self.fm_analysis.activate(); + } else { + slog::debug!( + opctx.log, + "ingested {total_ereports} (0 new) \ + ereports from {} service processors", + status.sps.len(); + "total_ereports" => total_ereports, + "new_ereports" => total_new_ereports, + ); + } + // Sort statuses for consistent output in OMDB commands. status.sps.sort_unstable_by_key(|sp| (sp.sp_type, sp.slot)); @@ -411,13 +448,22 @@ mod tests { cptestctx.logctx.log.clone(), datastore.clone(), ); + + let fm_analysis_activator = Activator::new(); + // in order to test that we actually activate the analysis task, we must + // wire this up now so that using it does not panic. 
+ fm_analysis_activator.mark_wired_up().unwrap(); + let mut ingester = SpEreportIngester::new( datastore.clone(), nexus.internal_resolver.clone(), nexus.id(), + fm_analysis_activator.clone(), false, ); + let mut analysis_activated = + tokio_test::task::spawn(fm_analysis_activator.activated()); let activation1 = ingester.actually_activate(&opctx).await; assert!( activation1.errors.is_empty(), @@ -431,6 +477,10 @@ mod tests { "ereports from 4 SPs should be observed: {:?}", activation1.sps, ); + tokio_test::assert_ready!( + analysis_activated.poll(), + "fm analysis task should be activated" + ); for SpEreporterStatus { sp_type, slot, status } in &activation1.sps { assert_eq!( @@ -616,6 +666,8 @@ mod tests { // Activate the task again and assert that no new ereports were // ingested. + let mut analysis_activated = + tokio_test::task::spawn(fm_analysis_activator.activated()); let activation2 = ingester.actually_activate(&opctx).await; assert!( activation2.errors.is_empty(), @@ -623,6 +675,11 @@ mod tests { activation2.errors ); dbg!(&activation2); + tokio_test::assert_pending!( + analysis_activated.poll(), + "fm analysis task should not be activated when no new ereports \ + have been ingested" + ); assert_eq!(activation2.sps, &[], "no new ereports should be observed"); diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs new file mode 100644 index 00000000000..4bd8067cca9 --- /dev/null +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -0,0 +1,213 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use crate::app::background::Activator; +use crate::app::background::BackgroundTask; +use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep; +use anyhow::Context; +use futures::future::BoxFuture; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::pagination::Paginator; +use nexus_fm as fm; +use nexus_types::internal_api::background::FmAnalysisStatus; +use nexus_types::internal_api::background::fm_analysis as status; +use nexus_types::inventory; +use omicron_uuid_kinds::GenericUuid; +use serde_json::json; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::watch; + +#[derive(Clone)] +pub struct FmAnalysis { + datastore: Arc, + sitrep_rx: watch::Receiver>, + inv_rx: watch::Receiver>>, + sitrep_loader: Activator, +} + +impl BackgroundTask for FmAnalysis { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async { + let status = self.actually_activate(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => { + let err = format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ); + json!({ "error": err }) + } + } + }) + } +} + +impl FmAnalysis { + pub fn new( + datastore: Arc, + sitrep_rx: watch::Receiver>, + inv_rx: watch::Receiver>>, + sitrep_loader: Activator, + ) -> Self { + Self { datastore, sitrep_rx, inv_rx, sitrep_loader } + } + + async fn actually_activate( + &mut self, + opctx: &OpContext, + ) -> FmAnalysisStatus { + let parent_sitrep = self.sitrep_rx.borrow_and_update().clone(); + let parent_sitrep_id = parent_sitrep.as_ref().map(|s| s.1.id()); + let Some(inv) = self.inv_rx.borrow_and_update().clone() else { + slog::debug!( + opctx.log, + "fault management analysis waiting for inventory to be loaded" + ); + return FmAnalysisStatus { + parent_sitrep_id, + inv_collection_id: None, + outcome: status::Outcome::WaitingForInventory, + }; + }; + let 
inv_collection_id = inv.id; + let opctx = opctx.child( + [ + ( + "parent_sitrep_id".to_string(), + format!("{parent_sitrep_id:?}"), + ), + ( + "inv_collection_id".to_string(), + inv_collection_id.to_string(), + ), + ] + .into_iter() + .collect(), + ); + + // Prepare analysis inputs. + let (inputs, prep_status) = match self + .prepare_inputs(&opctx, parent_sitrep, inv) + .await + { + Ok(inputs) => inputs, + Err(err) => { + let error = InlineErrorChain::new(&*err); + slog::error!(opctx.log, "preparing analysis inputs failed"; &error); + return FmAnalysisStatus { + parent_sitrep_id, + inv_collection_id: Some(inv_collection_id), + outcome: status::Outcome::PreparationError( + error.to_string(), + ), + }; + } + }; + + // Okay, actually run analysis and generate a new sitrep. + let outcome = self + .analyze(&opctx, inputs) + .await + .unwrap_or_else(|err| { + let error = InlineErrorChain::new(&*err); + slog::error!(opctx.log, "fault management analysis failed!"; &error); + status::AnalysisOutcome::Error(error.to_string()) + }); + + if let status::AnalysisOutcome::Committed { .. } = &outcome { + // If we committed a new sitrep, we ought to go ahead and load it + // now... 
+ self.sitrep_loader.activate(); + } + + FmAnalysisStatus { + parent_sitrep_id, + inv_collection_id: Some(inv_collection_id), + outcome: status::Outcome::RanAnalysis { prep_status, outcome }, + } + } + + async fn prepare_inputs( + &mut self, + opctx: &OpContext, + parent_sitrep: Option, + inv: Arc, + ) -> anyhow::Result<(fm::analysis_input::Input, status::PreparationStatus)> + { + let mut builder = + fm::analysis_input::Input::builder(parent_sitrep, inv); + let mut errors = Vec::new(); + self.load_new_ereports(opctx, &mut builder, &mut errors) + .await + .context("failed to load new ereports")?; + + let (input, report) = builder.build(); + Ok((input, status::PreparationStatus { errors, report })) + } + + async fn load_new_ereports( + &mut self, + opctx: &OpContext, + builder: &mut fm::analysis_input::Builder, + errors: &mut Vec, + ) -> anyhow::Result<()> { + let mut paginator = Paginator::new( + nexus_db_queries::db::datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let prev_total = builder.num_ereports(); + let batch = self + .datastore + .ereports_list_unmarked(opctx, &p.current_pagparams()) + .await?; + paginator = p.found_batch(&batch, &|e| { + (e.restart_id.into_untyped_uuid(), e.ena) + }); + let loaded = batch.len(); + let mut invalid = 0; + builder.add_unmarked_ereports(batch.into_iter().filter_map( + |ereport| { + let ereport = match fm::Ereport::try_from(ereport) { + Ok(ereport) => ereport, + Err(e) => { + invalid += 1; + errors.push(e.to_string()); + return None; + } + }; + + Some(ereport) + }, + )); + + let total = builder.num_ereports(); + let new = total - prev_total; + if invalid > 0 { + slog::warn!( + &opctx.log, + "loaded {loaded} ereports, {new} new, {invalid} invalid" + ); + } else { + slog::debug!(&opctx.log, "loaded {loaded} ereports, {new} new"); + } + } + + Ok(()) + } + + async fn analyze( + &mut self, + _opctx: &OpContext, + _inputs: fm::analysis_input::Input, + ) -> 
anyhow::Result { + anyhow::bail!("FM analysis is not yet implemented") + } +} diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs index 0a0fd1d02eb..9ed8dda9242 100644 --- a/nexus/src/app/background/tasks/fm_rendezvous.rs +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -370,13 +370,15 @@ mod tests { let case1_id = CaseUuid::new_v4(); let mut case1 = fm::Case { id: case1_id, - created_sitrep_id: sitrep1_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "my great case".to_string(), + }, alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "my great case".to_string(), }; case1 .alerts_requested @@ -445,13 +447,15 @@ mod tests { let case2_id = CaseUuid::new_v4(); let mut case2 = fm::Case { id: case2_id, - created_sitrep_id: sitrep1_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "my other great case".to_string(), + }, alerts_requested: iddqd::IdOrdMap::new(), ereports: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "my other great case".to_string(), }; case2 .alerts_requested @@ -557,7 +561,7 @@ mod tests { let mut marker = None; loop { let page = datastore - .ereports_list_unseen( + .ereports_list_unmarked( opctx, &DataPageParams { marker: marker.as_ref(), @@ -748,13 +752,15 @@ mod tests { .unwrap(); fm::Case { id: case1_id, - created_sitrep_id: sitrep1_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: 
fm::DiagnosisEngineKind::PowerShelf, + comment: "case with two ereports".to_string(), + }, ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "case with two ereports".to_string(), } }; @@ -953,13 +959,15 @@ mod tests { .unwrap(); fm::Case { id: CaseUuid::new_v4(), - created_sitrep_id: sitrep1_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "case with ereport 1".to_string(), + }, ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "case with ereport 1".to_string(), } }; @@ -1059,13 +1067,15 @@ mod tests { .unwrap(); fm::Case { id: CaseUuid::new_v4(), - created_sitrep_id: sitrep1_id, - closed_sitrep_id: None, - de: fm::DiagnosisEngineKind::PowerShelf, + metadata: fm::case::Metadata { + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + comment: "case with all three ereports".to_string(), + }, ereports, alerts_requested: iddqd::IdOrdMap::new(), support_bundles_requested: iddqd::IdOrdMap::new(), - comment: "case with all three ereports".to_string(), } }; diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 8fb56d653f2..fdcb45ef8d0 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -21,6 +21,7 @@ pub mod dns_propagation; pub mod dns_servers; pub mod ereport_ingester; pub mod external_endpoints; +pub mod fm_analysis; pub mod fm_rendezvous; pub mod fm_sitrep_gc; pub mod fm_sitrep_load; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index f024b3446dd..76d4015c107 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -195,6 +195,8 @@ sp_ereport_ingester.period_secs = 30 # How frequently to 
check for a new fault management sitrep (made by any Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +# How frequently to run analysis from the current sitrep. +fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, # as it does not impact the responsiveness of the fault management system, and # is activated every time the current sitrep changes. Periodic activations are diff --git a/nexus/types/output/analysis_input_report_empty.out b/nexus/types/output/analysis_input_report_empty.out new file mode 100644 index 00000000000..6e19fc78550 --- /dev/null +++ b/nexus/types/output/analysis_input_report_empty.out @@ -0,0 +1,6 @@ +fault management analysis inputs +----- ---------- -------- ------ +parent sitrep: +inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb +no new ereports since the parent sitrep +no cases copied forward diff --git a/nexus/types/output/analysis_input_report_same_inv.out b/nexus/types/output/analysis_input_report_same_inv.out new file mode 100644 index 00000000000..91a4de738b4 --- /dev/null +++ b/nexus/types/output/analysis_input_report_same_inv.out @@ -0,0 +1,7 @@ +fault management analysis inputs +----- ---------- -------- ------ +parent sitrep: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa +inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb + --> same collection as parent sitrep +no new ereports since the parent sitrep +no cases copied forward diff --git a/nexus/types/output/analysis_input_report_with_cases.out b/nexus/types/output/analysis_input_report_with_cases.out new file mode 100644 index 00000000000..3cac8d5f9ad --- /dev/null +++ b/nexus/types/output/analysis_input_report_with_cases.out @@ -0,0 +1,25 @@ +fault management analysis inputs +----- ---------- -------- ------ +parent sitrep: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa +inventory collection: bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb + --> different from parent sitrep (collection 
cccccccc-cccc-cccc-cccc-cccccccccccc) + +new ereports (2 total): +* ereport dddddddd-dddd-dddd-dddd-dddddddddddd:3 +* ereport dddddddd-dddd-dddd-dddd-dddddddddddd:4 + +cases (2 total): + open cases (1 total): + * case 11111111-1111-1111-1111-111111111111 + diagnosis engine: power_shelf + opened in sitrep: 22222222-2222-2222-2222-222222222222 + comment: PSU 0 faulted + closed cases copied forwards (1 total): + * case 33333333-3333-3333-3333-333333333333 + diagnosis engine: power_shelf + opened in sitrep: 44444444-4444-4444-4444-444444444444 + closed in sitrep: 55555555-5555-5555-5555-555555555555 + comment: PSU 1 replaced + + copied forwards because these ereports haven't been marked seen yet: + * ereport dddddddd-dddd-dddd-dddd-dddddddddddd:2 diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index d32699ad9fb..9decc84764c 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -19,6 +19,9 @@ use omicron_uuid_kinds::{ CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, }; use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::fmt; use std::sync::Arc; /// A fault management situation report, or _sitrep_. @@ -80,7 +83,7 @@ impl Sitrep { &self, ) -> impl Iterator + '_ { self.cases.iter().flat_map(|case| { - let case_id = case.id; + let case_id = *case.id(); case.alerts_requested.iter().map(move |alert| (case_id, alert)) }) } @@ -152,3 +155,291 @@ pub struct SitrepVersion { pub enum DiagnosisEngineKind { PowerShelf, } + +/// Summarizes the inputs to sitrep analysis. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct AnalysisInputReport { + pub parent_sitrep_id: Option, + pub parent_inv_id: Option, + pub inv_id: CollectionUuid, + pub new_ereport_ids: BTreeSet, + /// Cases which were open in the parent sitrep. 
+ pub open_cases: BTreeMap, + /// Cases which have closed, but which have been copied forwards as they + /// contain ereports which have not yet been marked seen. + pub closed_cases_copied_forward: BTreeMap, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct ClosedCaseReport { + pub metadata: case::Metadata, + pub unmarked_ereports: BTreeSet, +} + +impl AnalysisInputReport { + pub fn display_multiline(&self, indent: usize) -> impl fmt::Display + '_ { + InputReportMultilineDisplay { report: self, indent } + } +} + +struct InputReportMultilineDisplay<'report> { + report: &'report AnalysisInputReport, + indent: usize, +} + +impl fmt::Display for InputReportMultilineDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + report: + AnalysisInputReport { + parent_sitrep_id, + parent_inv_id, + inv_id, + new_ereport_ids, + open_cases, + closed_cases_copied_forward, + }, + indent, + } = self; + + writeln!(f, "{:indent$}fault management analysis inputs", "")?; + writeln!(f, "{:indent$}----- ---------- -------- ------", "")?; + if let Some(id) = parent_sitrep_id { + writeln!(f, "{:indent$}parent sitrep: {id}", "",)?; + } else { + writeln!(f, "{:indent$}parent sitrep: ", "")?; + } + + writeln!(f, "{:indent$}inventory collection: {inv_id}", "",)?; + if Some(inv_id) == parent_inv_id.as_ref() { + writeln!(f, "{:indent$} --> same collection as parent sitrep", "",)?; + } else if let Some(parent_inv_id) = parent_inv_id { + writeln!( + f, + "{:indent$} --> different from parent sitrep \ + (collection {parent_inv_id})", + "", + )?; + } + + if !new_ereport_ids.is_empty() { + writeln!( + f, + "\n{:indent$}new ereports ({} total):", + "", + new_ereport_ids.len() + )?; + for ereport_id in new_ereport_ids { + writeln!(f, "{:indent$}* ereport {ereport_id}", "")?; + } + } else { + writeln!( + f, + "{:indent$}no new ereports since the parent sitrep", + "", + )?; + } + + let total_cases = open_cases.len() + 
closed_cases_copied_forward.len(); + if total_cases > 0 { + writeln!(f, "\n{:indent$}cases ({} total):", "", total_cases)?; + let indent = indent + 2; + if open_cases.is_empty() { + writeln!(f, "{:indent$}no open cases", "",)?; + } else { + writeln!( + f, + "{:indent$}open cases ({} total):", + "", + open_cases.len() + )?; + for (case_id, metadata) in open_cases { + writeln!(f, "{:indent$}* case {case_id}", "")?; + metadata.display_multiline(indent + 2, None).fmt(f)?; + } + } + + if closed_cases_copied_forward.is_empty() { + writeln!( + f, + "{:indent$}no closed cases must be copied forwards", + "", + )?; + } else { + writeln!( + f, + "{:indent$}closed cases copied forwards ({} total):", + "", + closed_cases_copied_forward.len() + )?; + for ( + case_id, + ClosedCaseReport { metadata, unmarked_ereports }, + ) in closed_cases_copied_forward + { + writeln!(f, "{:indent$}* case {case_id}", "")?; + let indent = indent + 2; + metadata.display_multiline(indent, None).fmt(f)?; + writeln!( + f, + "\n{:indent$}copied forwards because these ereports \ + haven't been marked seen yet:", + "" + )?; + for ereport_id in unmarked_ereports { + writeln!(f, "{:indent$}* ereport {ereport_id}", "")?; + } + } + } + } else { + writeln!(f, "{:indent$}no cases copied forward", "")?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use ereport_types::{Ena, EreportId}; + use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, EreporterRestartUuid, SitrepUuid, + }; + use std::str::FromStr; + + fn example_report_with_cases() -> AnalysisInputReport { + let parent_sitrep_id = + SitrepUuid::from_str("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + .unwrap(); + let inv_id = + CollectionUuid::from_str("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb") + .unwrap(); + let parent_inv_id = + CollectionUuid::from_str("cccccccc-cccc-cccc-cccc-cccccccccccc") + .unwrap(); + let restart_id = EreporterRestartUuid::from_str( + "dddddddd-dddd-dddd-dddd-dddddddddddd", + ) + .unwrap(); + + let case1_id = + 
CaseUuid::from_str("11111111-1111-1111-1111-111111111111").unwrap(); + let case1_created_sitrep = + SitrepUuid::from_str("22222222-2222-2222-2222-222222222222") + .unwrap(); + + let case2_id = + CaseUuid::from_str("33333333-3333-3333-3333-333333333333").unwrap(); + let case2_created_sitrep = + SitrepUuid::from_str("44444444-4444-4444-4444-444444444444") + .unwrap(); + let case2_closed_sitrep = + SitrepUuid::from_str("55555555-5555-5555-5555-555555555555") + .unwrap(); + + let mut open_cases = BTreeMap::new(); + open_cases.insert( + case1_id, + case::Metadata { + created_sitrep_id: case1_created_sitrep, + closed_sitrep_id: None, + de: DiagnosisEngineKind::PowerShelf, + comment: "PSU 0 faulted".to_string(), + }, + ); + + let mut new_ereport_ids = BTreeSet::new(); + new_ereport_ids.insert(EreportId { restart_id, ena: Ena::from(3) }); + new_ereport_ids.insert(EreportId { restart_id, ena: Ena::from(4) }); + + let mut closed_cases_copied_forward = BTreeMap::new(); + let mut unmarked_ereports = BTreeSet::new(); + unmarked_ereports + .insert(EreportId { restart_id, ena: Ena::from(2u64) }); + closed_cases_copied_forward.insert( + case2_id, + ClosedCaseReport { + metadata: case::Metadata { + created_sitrep_id: case2_created_sitrep, + closed_sitrep_id: Some(case2_closed_sitrep), + de: DiagnosisEngineKind::PowerShelf, + comment: "PSU 1 replaced".to_string(), + }, + unmarked_ereports, + }, + ); + + AnalysisInputReport { + parent_sitrep_id: Some(parent_sitrep_id), + parent_inv_id: Some(parent_inv_id), + inv_id, + new_ereport_ids, + open_cases, + closed_cases_copied_forward, + } + } + + fn example_report_empty() -> AnalysisInputReport { + let inv_id = + CollectionUuid::from_str("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb") + .unwrap(); + + AnalysisInputReport { + parent_sitrep_id: None, + parent_inv_id: None, + inv_id, + new_ereport_ids: BTreeSet::new(), + open_cases: BTreeMap::new(), + closed_cases_copied_forward: BTreeMap::new(), + } + } + + fn example_report_same_inv() -> 
AnalysisInputReport { + let parent_sitrep_id = + SitrepUuid::from_str("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + .unwrap(); + let inv_id = + CollectionUuid::from_str("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb") + .unwrap(); + + AnalysisInputReport { + parent_sitrep_id: Some(parent_sitrep_id), + parent_inv_id: Some(inv_id), + inv_id, + new_ereport_ids: BTreeSet::new(), + open_cases: BTreeMap::new(), + closed_cases_copied_forward: BTreeMap::new(), + } + } + + #[test] + fn test_analysis_input_report_display_with_cases() { + let report = example_report_with_cases(); + let output = format!("{}", report.display_multiline(0)); + expectorate::assert_contents( + "output/analysis_input_report_with_cases.out", + &output, + ); + } + + #[test] + fn test_analysis_input_report_display_empty() { + let report = example_report_empty(); + let output = format!("{}", report.display_multiline(0)); + expectorate::assert_contents( + "output/analysis_input_report_empty.out", + &output, + ); + } + + #[test] + fn test_analysis_input_report_display_same_inv() { + let report = example_report_same_inv(); + let output = format!("{}", report.display_multiline(0)); + expectorate::assert_contents( + "output/analysis_input_report_same_inv.out", + &output, + ); + } +} diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index 7066a848a01..2abb8c05e2a 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -5,6 +5,7 @@ use crate::alert::AlertClass; use crate::fm::DiagnosisEngineKind; use crate::fm::Ereport; +use crate::fm::EreportId; use crate::support_bundle::BundleDataSelection; use iddqd::{IdOrdItem, IdOrdMap}; use omicron_uuid_kinds::{ @@ -17,21 +18,21 @@ use std::sync::Arc; #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct Case { pub id: CaseUuid, - pub created_sitrep_id: SitrepUuid, - pub closed_sitrep_id: Option, - - pub de: DiagnosisEngineKind, + #[serde(flatten)] + pub metadata: Metadata, pub ereports: IdOrdMap, pub alerts_requested: 
IdOrdMap, pub support_bundles_requested: IdOrdMap, - - pub comment: String, } impl Case { + pub fn id(&self) -> &CaseUuid { + &self.id + } + pub fn is_open(&self) -> bool { - self.closed_sitrep_id.is_none() + self.metadata.is_open() } pub fn display_indented( @@ -58,6 +59,81 @@ impl IdOrdItem for Case { iddqd::id_upcast!(); } +/// Metadata about a case. +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct Metadata { + pub created_sitrep_id: SitrepUuid, + pub closed_sitrep_id: Option, + + pub de: DiagnosisEngineKind, + + pub comment: String, +} + +impl Metadata { + pub fn is_open(&self) -> bool { + self.closed_sitrep_id.is_none() + } + + pub fn display_multiline( + &self, + indent: usize, + sitrep: Option, + ) -> impl fmt::Display + '_ { + struct DisplayMetadata<'a> { + meta: &'a Metadata, + indent: usize, + sitrep_id: Option, + } + + impl fmt::Display for DisplayMetadata<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let DisplayMetadata { + meta: + Metadata { + de, + created_sitrep_id, + closed_sitrep_id, + comment, + }, + indent, + sitrep_id, + } = self; + let sitrep_id = sitrep_id.as_ref(); + let this_sitrep = move |s| { + if Some(s) == sitrep_id { " <-- this sitrep" } else { "" } + }; + + const DE: &str = "diagnosis engine:"; + const OPENED_IN: &str = "opened in sitrep:"; + const CLOSED_IN: &str = "closed in sitrep:"; + const WIDTH: usize = const_max_len(&[DE, OPENED_IN, CLOSED_IN]); + writeln!(f, "{:>indent$}{DE:indent$}{OPENED_IN:indent$}{CLOSED_IN:indent$}comment: {comment}", "")?; + + Ok(()) + } + } + + DisplayMetadata { meta: self, indent, sitrep_id: sitrep } + } +} + #[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] pub struct CaseEreport { pub id: CaseEreportUuid, @@ -75,6 +151,12 @@ impl IdOrdItem for CaseEreport { iddqd::id_upcast!(); } +impl CaseEreport { + pub fn ereport_id(&self) -> &EreportId { + self.ereport.id() + } +} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub 
struct AlertRequest { pub id: AlertUuid, @@ -122,28 +204,13 @@ struct DisplayCase<'a> { impl fmt::Display for DisplayCase<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { const BULLET: &str = "* "; - const fn const_max_len(strs: &[&str]) -> usize { - let mut max = 0; - let mut i = 0; - while i < strs.len() { - let len = strs[i].len(); - if len > max { - max = len; - } - i += 1; - } - max - } let &Self { case: Case { id, - created_sitrep_id, - closed_sitrep_id, - de, + metadata, ereports, - comment, alerts_requested, support_bundles_requested, }, @@ -158,35 +225,14 @@ impl fmt::Display for DisplayCase<'_> { writeln!( f, "{:>indent$}case {id}", - if indent > 0 { BULLET } else { "" } + if indent > 0 { BULLET } else { "" }, )?; writeln!( f, "{:>indent$}=========================================", "" )?; - - const DE: &str = "diagnosis engine:"; - const OPENED_IN: &str = "opened in sitrep:"; - const CLOSED_IN: &str = "closed in sitrep:"; - const WIDTH: usize = const_max_len(&[DE, OPENED_IN, CLOSED_IN]); - writeln!(f, "{:>indent$}{DE:indent$}{OPENED_IN:indent$}{CLOSED_IN:indent$}comment: {comment}", "")?; + metadata.display_multiline(indent, sitrep_id).fmt(f)?; if !ereports.is_empty() { writeln!(f, "\n{:>indent$}ereports:", "")?; @@ -293,6 +339,19 @@ impl fmt::Display for DisplayCase<'_> { } } +const fn const_max_len(strs: &[&str]) -> usize { + let mut max = 0; + let mut i = 0; + while i < strs.len() { + let len = strs[i].len(); + if len > max { + max = len; + } + i += 1; + } + max +} + #[cfg(test)] mod tests { use super::*; @@ -438,14 +497,16 @@ mod tests { // Create the case let case = Case { id: case_id, - created_sitrep_id, - closed_sitrep_id: Some(closed_sitrep_id), - de: DiagnosisEngineKind::PowerShelf, + metadata: Metadata { + created_sitrep_id, + closed_sitrep_id: Some(closed_sitrep_id), + de: DiagnosisEngineKind::PowerShelf, + comment: "Power shelf rectifier added and removed here :-)" + .to_string(), + }, ereports, alerts_requested, 
support_bundles_requested, - comment: "Power shelf rectifier added and removed here :-)" - .to_string(), }; eprintln!("example case display:"); diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index d8e42697f45..1868dbb3b82 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -897,6 +897,56 @@ pub struct SitrepGcStatus { pub errors: Vec, } +/// The status of a `fm_analysis` background task activation. +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct FmAnalysisStatus { + pub parent_sitrep_id: Option, + pub inv_collection_id: Option, + pub outcome: fm_analysis::Outcome, +} + +pub mod fm_analysis { + use super::*; + + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] + pub struct PreparationStatus { + pub errors: Vec, + pub report: crate::fm::AnalysisInputReport, + } + + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] + #[allow(clippy::large_enum_variant)] + pub enum Outcome { + /// Fault management analysis was not performed as no inventory + /// collection has been loaded. + WaitingForInventory, + + /// Preparing analysis input failed. + PreparationError(String), + + /// Preparation succeeded and analysis was performed. + RanAnalysis { prep_status: PreparationStatus, outcome: AnalysisOutcome }, + } + + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] + pub enum AnalysisOutcome { + /// An error occurred during analysis. + Error(String), + + /// Analysis produced a sitrep identical to the current sitrep, + /// so we threw it away and did nothing. + Unchanged, + + /// Analysis produced a new sitrep, but we failed to make it + /// the current sitrep. + NotCommitted { sitrep_id: SitrepUuid, error: String }, + + /// Analysis produced a new sitrep, which was saved and made the current + /// sitrep. 
+ Committed { sitrep_id: SitrepUuid }, + } +} + /// The status of a `fm_rendezvous` background task activation. #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct FmRendezvousStatus { diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 092902ae68e..c5e7705d838 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -102,6 +102,8 @@ sp_ereport_ingester.disable = true # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +# How frequently to run analysis from the current sitrep. +fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, # as it does not impact the responsiveness of the fault management system, and # is activated every time the current sitrep changes. Periodic activations are diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index ef229ccd87e..ffc969cd25b 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -102,6 +102,8 @@ sp_ereport_ingester.disable = true # Nexus). # This is cheap, so we should check frequently. fm.sitrep_load_period_secs = 15 +# How frequently to run analysis from the current sitrep. +fm.analysis_period_secs = 120 # Sitrep GC, on the other hand, does not need to be activated very frequently, # as it does not impact the responsiveness of the fault management system, and # is activated every time the current sitrep changes. Periodic activations are