Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,7 @@ tofino = { git = "https://github.com/oxidecomputer/tofino" }
tokio = "1.47.0"
tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] }
tokio-stream = "0.1.17"
tokio-test = "0.4.5"
tokio-tungstenite = "0.23.1"
tokio-util = { version = "0.7.15", features = ["io", "io-util", "time"] }
toml = "0.8.23"
Expand Down
80 changes: 80 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStats;
use nexus_types::internal_api::background::BlueprintRendezvousStatus;
use nexus_types::internal_api::background::DatasetsRendezvousStats;
use nexus_types::internal_api::background::EreporterStatus;
use nexus_types::internal_api::background::FmAnalysisStatus;
use nexus_types::internal_api::background::FmRendezvousStatus;
use nexus_types::internal_api::background::InstanceReincarnationStatus;
use nexus_types::internal_api::background::InstanceUpdaterStatus;
Expand Down Expand Up @@ -1334,6 +1335,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"webhook_deliverator" => {
print_task_webhook_deliverator(details);
}
"fm_analysis" => {
print_task_fm_analysis(details);
}
"fm_sitrep_loader" => {
print_task_fm_sitrep_loader(details);
}
Expand Down Expand Up @@ -3430,6 +3434,82 @@ mod ereporter_status_fields {
pub const NUM_WIDTH: usize = 4;
}

/// Prints the details reported by the `fm_analysis` background task.
///
/// `details` is deserialized into an `FmAnalysisStatus`. If that fails, a
/// warning is written to stderr and nothing else is printed. Otherwise this
/// prints the parent sitrep ID and the inventory collection ID, followed by a
/// summary of the analysis outcome. When analysis actually ran, the
/// preparation report and any preparation errors are printed after the
/// summary.
fn print_task_fm_analysis(details: &serde_json::Value) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if the user runs omdb nexus background-tasks show without selecting a specific task, they'll see the /!\ analysis failed: FM analysis is not yet implemented error. Is that something that's ok to ship, or would you want to suppress that for now?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OMDB is intended for use by Oxide engineering and support, and is not easily accessible to the end user. I'm not super worried about this being a call generator, since it's not in a customer-facing UI...

use nexus_types::internal_api::background::fm_analysis::{
AnalysisOutcome, Outcome, PreparationStatus,
};
// Decode the JSON details into the typed status, destructuring its three
// fields. On a decode failure, warn on stderr and return without printing.
let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } =
match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
Err(error) => {
eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
);
return;
}
Ok(status) => status,
};
// Both labels are left-aligned to the shared WIDTH so the values line up.
// WIDTH is computed by `const_max_len` over the two labels, plus one
// (presumably the longest label length plus a separating space; confirm
// against the helper's definition).
pub const PARENT_SITREP_ID: &str = "parent sitrep ID:";
pub const INV_ID: &str = "current inventory collection ID:";
pub const WIDTH: usize = const_max_len(&[PARENT_SITREP_ID, INV_ID]) + 1;
println!(" {PARENT_SITREP_ID:<WIDTH$}{parent_sitrep_id:?}");
println!(" {INV_ID:<WIDTH$}{inv_collection_id:?}");
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
println!(" ===== ========== ======== =======");
// Only `RanAnalysis` carries a preparation status and an analysis outcome.
// The other two variants print a message and return early.
let (prep_status, analysis_outcome) = match outcome {
Outcome::WaitingForInventory => {
println!(
" analysis was not performed, as the inventory has\n \
not yet been loaded.\n\
(i) note: this should only happen if Nexus has just started.",
);
return;
}
Outcome::PreparationError(error) => {
println!(
"{ERRICON} failed to prepare analysis inputs:\n {error}"
);
return;
}
Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome),
};
// Summarize what the analysis run produced.
match analysis_outcome {
AnalysisOutcome::Error(error) => {
println!("{ERRICON} analysis failed: {error}");
}
AnalysisOutcome::Unchanged => {
println!(
" no changes from the current situation report ({:?})",
parent_sitrep_id
);
}
AnalysisOutcome::NotCommitted { sitrep_id, error } => {
println!(
" analysis succeeded, but the sitrep was not committed!"
);
println!(" sitrep ID: {sitrep_id:?}");
println!(" error: {error}");
}
AnalysisOutcome::Committed { sitrep_id } => {
println!(" analyzed the situation, and committed a new sitrep!");
println!(" sitrep ID: {sitrep_id:?}");
}
}
println!();

// After the outcome summary, print the preparation report, then list any
// errors recorded while preparing the analysis inputs.
let PreparationStatus { errors, report } = prep_status;
println!("{}", report.display_multiline(4));
if !errors.is_empty() {
println!("{ERRICON} errors preparing analysis inputs:");
for error in errors {
println!(" > {error}")
}
}

// TODO(eliza): eventually there will also be a detailed analysis report,
// print that here as well...
}

fn print_task_fm_sitrep_loader(details: &serde_json::Value) {
match serde_json::from_value::<SitrepLoadStatus>(details.clone()) {
Err(error) => eprintln!(
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -371,6 +375,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -617,6 +625,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down
40 changes: 40 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -687,6 +691,24 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_analysis"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
----- ---------- -------- ------
parent sitrep: None
inventory collection: ..........<REDACTED_UUID>........... (collection)

new ereports (0 total):


task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down Expand Up @@ -1320,6 +1342,24 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_analysis"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
----- ---------- -------- ------
parent sitrep: None
inventory collection: ..........<REDACTED_UUID>........... (collection)

new ereports (0 total):


task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down
11 changes: 11 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,10 @@ impl Default for MulticastGroupReconcilerConfig {
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct FmTasksConfig {
/// period (in seconds) for periodic activations of the background task that
/// drives fault management analysis.
#[serde_as(as = "DurationSeconds<u64>")]
pub analysis_period_secs: Duration,
/// period (in seconds) for periodic activations of the background task that
/// reads the latest fault management sitrep from the database.
#[serde_as(as = "DurationSeconds<u64>")]
Expand All @@ -989,6 +993,10 @@ pub struct FmTasksConfig {
impl Default for FmTasksConfig {
fn default() -> Self {
Self {
// Analysis is generally triggered by changes in the current sitrep,
// inventory, or by the ereport ingester(s), so it need not be
// periodically activated all that frequently.
analysis_period_secs: Duration::from_secs(60),
sitrep_load_period_secs: Duration::from_secs(15),
// This need not be activated very frequently, as it's triggered any
// time the current sitrep changes, and activating it more
Expand Down Expand Up @@ -1310,6 +1318,7 @@ mod test {
probe_distributor.period_secs = 50
multicast_reconciler.period_secs = 60
fm.rendezvous_period_secs = 51
fm.analysis_period_secs = 52
trust_quorum.period_secs = 60
attached_subnet_manager.period_secs = 60
session_cleanup.period_secs = 300
Expand Down Expand Up @@ -1566,6 +1575,7 @@ mod test {
disable: false,
},
fm: FmTasksConfig {
analysis_period_secs: Duration::from_secs(52),
sitrep_load_period_secs: Duration::from_secs(48),
sitrep_gc_period_secs: Duration::from_secs(49),
rendezvous_period_secs: Duration::from_secs(51),
Expand Down Expand Up @@ -1702,6 +1712,7 @@ mod test {
fm.sitrep_gc_period_secs = 46
probe_distributor.period_secs = 47
fm.rendezvous_period_secs = 48
fm.analysis_period_secs = 49
multicast_reconciler.period_secs = 60
trust_quorum.period_secs = 60
attached_subnet_manager.period_secs = 60
Expand Down
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ nexus-db-lookup.workspace = true
nexus-db-model.workspace = true
nexus-db-queries.workspace = true
nexus-db-schema.workspace = true
nexus-fm.workspace = true
nexus-inventory.workspace = true
nexus-metrics-producer-gc.workspace = true
nexus-reconfigurator-execution.workspace = true
Expand Down Expand Up @@ -200,6 +201,7 @@ sp-sim.workspace = true
strum.workspace = true
subprocess.workspace = true
term.workspace = true
tokio-test.workspace = true
tufaceous.workspace = true
tufaceous-artifact.workspace = true
tufaceous-lib.workspace = true
Expand Down
1 change: 1 addition & 0 deletions nexus/background-task-interface/src/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pub struct BackgroundTasks {
pub task_webhook_deliverator: Activator,
pub task_sp_ereport_ingester: Activator,
pub task_reconfigurator_config_loader: Activator,
pub task_fm_analysis: Activator,
pub task_fm_rendezvous: Activator,
pub task_fm_sitrep_loader: Activator,
pub task_fm_sitrep_gc: Activator,
Expand Down
2 changes: 2 additions & 0 deletions nexus/examples/config-second.toml
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ sp_ereport_ingester.period_secs = 30
# Nexus).
# This is cheap, so we should check frequently.
fm.sitrep_load_period_secs = 15
# How frequently to run analysis from the current sitrep.
fm.analysis_period_secs = 120
# Sitrep GC, on the other hand, does not need to be activated very frequently,
# as it does not impact the responsiveness of the fault management system, and
# is activated every time the current sitrep changes. Periodic activations are
Expand Down
2 changes: 2 additions & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ sp_ereport_ingester.period_secs = 30
# Nexus).
# This is cheap, so we should check frequently.
fm.sitrep_load_period_secs = 15
# How frequently to run analysis from the current sitrep.
fm.analysis_period_secs = 120
# Sitrep GC, on the other hand, does not need to be activated very frequently,
# as it does not impact the responsiveness of the fault management system, and
# is activated every time the current sitrep changes. Periodic activations are
Expand Down
Loading
Loading