Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,7 @@ tofino = { git = "https://github.com/oxidecomputer/tofino" }
tokio = "1.47.0"
tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] }
tokio-stream = "0.1.17"
tokio-test = "0.4.5"
tokio-tungstenite = "0.23.1"
tokio-util = { version = "0.7.15", features = ["io", "io-util", "time"] }
toml = "0.8.23"
Expand Down
80 changes: 80 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ use nexus_types::internal_api::background::BlueprintRendezvousStats;
use nexus_types::internal_api::background::BlueprintRendezvousStatus;
use nexus_types::internal_api::background::DatasetsRendezvousStats;
use nexus_types::internal_api::background::EreporterStatus;
use nexus_types::internal_api::background::FmAnalysisStatus;
use nexus_types::internal_api::background::FmRendezvousStatus;
use nexus_types::internal_api::background::InstanceReincarnationStatus;
use nexus_types::internal_api::background::InstanceUpdaterStatus;
Expand Down Expand Up @@ -1334,6 +1335,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"webhook_deliverator" => {
print_task_webhook_deliverator(details);
}
"fm_analysis" => {
print_task_fm_analysis(details);
}
"fm_sitrep_loader" => {
print_task_fm_sitrep_loader(details);
}
Expand Down Expand Up @@ -3430,6 +3434,82 @@ mod ereporter_status_fields {
pub const NUM_WIDTH: usize = 4;
}

/// Prints the details reported by the `fm_analysis` background task.
///
/// `details` is deserialized into an [`FmAnalysisStatus`]. If that fails, a
/// warning containing the error and the raw details is written to stderr and
/// the function returns without printing anything else. Otherwise it prints
/// the parent sitrep ID and inventory collection ID, then a summary of the
/// analysis outcome, and (only when analysis actually ran) the preparation
/// report followed by any errors recorded while preparing analysis inputs.
fn print_task_fm_analysis(details: &serde_json::Value) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if the user runs omdb nexus background-tasks show without selecting a specific task, they'll see the /!\ analysis failed: FM analysis is not yet implemented error. Is that something that's ok to ship, or would you want to suppress that for now?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OMDB is intended for use by Oxide engineering and support, and is not easily accessible to the end user. I'm not super worried about this being a call generator, since it's not in a customer-facing UI...

use nexus_types::internal_api::background::fm_analysis::{
AnalysisOutcome, Outcome, PreparationStatus,
};
// Destructure the status up front; if the details cannot be interpreted as
// an `FmAnalysisStatus`, warn on stderr and bail out.
let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } =
match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
Err(error) => {
eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
);
return;
}
Ok(status) => status,
};
// Field labels, left-aligned to `WIDTH` columns so the values line up.
// `WIDTH` is `const_max_len` of the labels plus one (presumably the longest
// label length plus a separating space -- `const_max_len` is defined
// elsewhere in this file).
pub const PARENT_SITREP_ID: &str = "parent sitrep ID:";
pub const INV_ID: &str = "current inventory collection ID:";
pub const WIDTH: usize = const_max_len(&[PARENT_SITREP_ID, INV_ID]) + 1;
println!(" {PARENT_SITREP_ID:<WIDTH$}{parent_sitrep_id:?}");
println!(" {INV_ID:<WIDTH$}{inv_collection_id:?}");
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
println!(" ===== ========== ======== =======");
// Only `Outcome::RanAnalysis` carries a preparation status and an analysis
// outcome; the other variants print an explanation and return early.
let (prep_status, analysis_outcome) = match outcome {
Outcome::WaitingForInventory => {
println!(
" analysis was not performed, as the inventory has\n \
not yet been loaded.\n\
(i) note: this should only happen if Nexus has just started.",
);
return;
}
Outcome::PreparationError(error) => {
println!(
"{ERRICON} failed to prepare analysis inputs:\n {error}"
);
return;
}
Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome),
};
// Summarize what the analysis itself did.
match analysis_outcome {
AnalysisOutcome::Error(error) => {
println!("{ERRICON} analysis failed: {error}");
}
AnalysisOutcome::Unchanged => {
println!(
" no changes from the current situation report ({:?})",
parent_sitrep_id
);
}
AnalysisOutcome::NotCommitted { sitrep_id, error } => {
println!(
" analysis succeeded, but the sitrep was not committed!"
);
println!(" sitrep ID: {sitrep_id:?}");
println!(" error: {error}");
}
AnalysisOutcome::Committed { sitrep_id } => {
println!(" analyzed the situation, and committed a new sitrep!");
println!(" sitrep ID: {sitrep_id:?}");
}
}
println!();

// Reached only when analysis ran: print the preparation report, then any
// errors that were recorded while preparing the analysis inputs.
let PreparationStatus { errors, report } = prep_status;
println!("{}", report.display_multiline(4));
if !errors.is_empty() {
println!("{ERRICON} errors preparing analysis inputs:");
for error in errors {
println!(" > {error}")
}
}

// TODO(eliza): eventually there will also be a detailed analysis report,
// print that here as well...
}

fn print_task_fm_sitrep_loader(details: &serde_json::Value) {
match serde_json::from_value::<SitrepLoadStatus>(details.clone()) {
Err(error) => eprintln!(
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -371,6 +375,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -617,6 +625,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down
40 changes: 40 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ task: "external_endpoints"
on each one


task: "fm_analysis"
performs fault management analysis and updates the sitrep


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep
Expand Down Expand Up @@ -687,6 +691,24 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_analysis"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
----- ---------- -------- ------
parent sitrep: <none>
inventory collection: ..........<REDACTED_UUID>...........
no new ereports since the parent sitrep
no cases copied forward


task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down Expand Up @@ -1320,6 +1342,24 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_analysis"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
----- ---------- -------- ------
parent sitrep: <none>
inventory collection: ..........<REDACTED_UUID>...........
no new ereports since the parent sitrep
no cases copied forward


task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down
11 changes: 11 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,10 @@ impl Default for MulticastGroupReconcilerConfig {
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct FmTasksConfig {
/// period (in seconds) for periodic activations of the background task that
/// drives fault management analysis.
#[serde_as(as = "DurationSeconds<u64>")]
pub analysis_period_secs: Duration,
/// period (in seconds) for periodic activations of the background task that
/// reads the latest fault management sitrep from the database.
#[serde_as(as = "DurationSeconds<u64>")]
Expand All @@ -989,6 +993,10 @@ pub struct FmTasksConfig {
impl Default for FmTasksConfig {
fn default() -> Self {
Self {
// Analysis is generally triggered by changes in the current sitrep,
// inventory, or by the ereport ingester(s), so it need not be
// periodically activated all that frequently.
analysis_period_secs: Duration::from_secs(60),
sitrep_load_period_secs: Duration::from_secs(15),
// This need not be activated very frequently, as it's triggered any
// time the current sitrep changes, and activating it more
Expand Down Expand Up @@ -1310,6 +1318,7 @@ mod test {
probe_distributor.period_secs = 50
multicast_reconciler.period_secs = 60
fm.rendezvous_period_secs = 51
fm.analysis_period_secs = 52
trust_quorum.period_secs = 60
attached_subnet_manager.period_secs = 60
session_cleanup.period_secs = 300
Expand Down Expand Up @@ -1566,6 +1575,7 @@ mod test {
disable: false,
},
fm: FmTasksConfig {
analysis_period_secs: Duration::from_secs(52),
sitrep_load_period_secs: Duration::from_secs(48),
sitrep_gc_period_secs: Duration::from_secs(49),
rendezvous_period_secs: Duration::from_secs(51),
Expand Down Expand Up @@ -1702,6 +1712,7 @@ mod test {
fm.sitrep_gc_period_secs = 46
probe_distributor.period_secs = 47
fm.rendezvous_period_secs = 48
fm.analysis_period_secs = 49
multicast_reconciler.period_secs = 60
trust_quorum.period_secs = 60
attached_subnet_manager.period_secs = 60
Expand Down
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ nexus-db-lookup.workspace = true
nexus-db-model.workspace = true
nexus-db-queries.workspace = true
nexus-db-schema.workspace = true
nexus-fm.workspace = true
nexus-inventory.workspace = true
nexus-metrics-producer-gc.workspace = true
nexus-reconfigurator-execution.workspace = true
Expand Down Expand Up @@ -200,6 +201,7 @@ sp-sim.workspace = true
strum.workspace = true
subprocess.workspace = true
term.workspace = true
tokio-test.workspace = true
tufaceous.workspace = true
tufaceous-artifact.workspace = true
tufaceous-lib.workspace = true
Expand Down
1 change: 1 addition & 0 deletions nexus/background-task-interface/src/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pub struct BackgroundTasks {
pub task_webhook_deliverator: Activator,
pub task_sp_ereport_ingester: Activator,
pub task_reconfigurator_config_loader: Activator,
pub task_fm_analysis: Activator,
pub task_fm_rendezvous: Activator,
pub task_fm_sitrep_loader: Activator,
pub task_fm_sitrep_gc: Activator,
Expand Down
11 changes: 7 additions & 4 deletions nexus/db-model/src/fm/case.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,13 @@ impl CaseMetadata {
) -> Self {
let fm::Case {
id,
created_sitrep_id,
closed_sitrep_id,
de,
comment,
metadata:
fm::case::Metadata {
created_sitrep_id,
closed_sitrep_id,
de,
comment,
},
alerts_requested: _,
support_bundles_requested: _,
ereports: _,
Expand Down
29 changes: 20 additions & 9 deletions nexus/db-queries/src/db/datastore/ereport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,20 +319,30 @@ impl DataStore {
Ok((created, latest))
}

pub async fn ereports_list_unseen(
/// Lists ereports which have not been marked as **definitely seen**
/// (included in a committed sitrep) in the database, paginated by the
/// reporter restart ID and ENA.
///
/// Note that this filters based only on whether they have been marked in
/// the database. Because marking seen ereports occurs asynchronously from
/// committing sitreps as part of FM rendezvous, ereports returned by this
/// query may have already been seen. These ereports must be filtered out at
/// a higher level based on the contents of the current sitrep when
/// determining which ereports are *actually* new.
pub async fn ereports_list_unmarked(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, (Uuid, DbEna)>,
) -> ListResultVec<Ereport> {
// TODO(eliza): ereports should probably have their own resource type someday...
opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
Self::ereports_list_unseen_query(pagparams)
Self::ereports_list_unmarked_query(pagparams)
.load_async(&*self.pool_connection_authorized(opctx).await?)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

fn ereports_list_unseen_query(
fn ereports_list_unmarked_query(
pagparams: &DataPageParams<'_, (Uuid, DbEna)>,
) -> impl RunnableQuery<Ereport> + use<> {
paginated_multicolumn(
Expand Down Expand Up @@ -671,23 +681,24 @@ mod tests {
}

#[tokio::test]
async fn expectorate_ereports_list_unseen() {
async fn expectorate_ereports_list_unmarked() {
let pagparams = DataPageParams {
marker: None,
direction: PaginationOrder::Ascending,
limit: NonZeroU32::new(100).unwrap(),
};
let query = DataStore::ereports_list_unseen_query(&pagparams);
let query = DataStore::ereports_list_unmarked_query(&pagparams);
expectorate_query_contents(
&query,
"tests/output/ereports_list_unseen.sql",
"tests/output/ereports_list_unmarked.sql",
)
.await;
}

#[tokio::test]
async fn explain_ereports_list_unseen_query() {
let logctx = dev::test_setup_log("explain_ereports_list_unseen_query");
async fn explain_ereports_list_unmarked_query() {
Comment thread
hawkw marked this conversation as resolved.
Outdated
let logctx =
dev::test_setup_log("explain_ereports_list_unmarked_query");
let db = TestDatabase::new_with_pool(&logctx.log).await;
let pool = db.pool();
let conn = pool.claim().await.unwrap();
Expand All @@ -697,7 +708,7 @@ mod tests {
direction: PaginationOrder::Ascending,
limit: NonZeroU32::new(100).unwrap(),
};
let query = DataStore::ereports_list_unseen_query(&pagparams);
let query = DataStore::ereports_list_unmarked_query(&pagparams);
let explanation = query
.explain_async(&conn)
.await
Expand Down
Loading
Loading