diff --git a/crates/admin-cli/src/machine/boot_interfaces/args.rs b/crates/admin-cli/src/machine/boot_interfaces/args.rs new file mode 100644 index 0000000000..ce918ae64c --- /dev/null +++ b/crates/admin-cli/src/machine/boot_interfaces/args.rs @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use carbide_uuid::machine::MachineId; +use clap::Parser; + +#[derive(Parser, Debug)] +#[command(after_long_help = "\ +EXAMPLES: + +Show one machine's boot interfaces across every store: + $ nico-admin-cli machine boot-interfaces 12345678-1234-5678-90ab-cdef01234567 + +As JSON or YAML (uses the global --output/--format flag): + $ nico-admin-cli --output json machine boot-interfaces 12345678-1234-5678-90ab-cdef01234567 + $ nico-admin-cli --output yaml machine boot-interfaces 12345678-1234-5678-90ab-cdef01234567 + +")] +pub struct Args { + #[clap(help = "The machine ID whose boot interfaces to gather")] + pub machine: MachineId, +} diff --git a/crates/admin-cli/src/machine/boot_interfaces/cmd.rs b/crates/admin-cli/src/machine/boot_interfaces/cmd.rs new file mode 100644 index 0000000000..8dd278762c --- /dev/null +++ b/crates/admin-cli/src/machine/boot_interfaces/cmd.rs @@ -0,0 +1,452 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//! Render one machine's boot-interface view (the `GetMachineBootInterfaces`
+//! RPC) as an ASCII table, JSON, or YAML. The view gathers the four stores a
+//! host's boot interface can live in -- owned interface rows, predictions, the
+//! explored endpoint default, and the retained post-deletion pairs -- plus the
+//! effective boot interface the system would select and a divergence flag.
+
+use std::fmt::Write as _;
+
+use ::rpc::admin_cli::OutputFormat;
+use ::rpc::forge as forgerpc;
+use carbide_uuid::machine::MachineId;
+use prettytable::{Cell, Row, Table};
+use serde::Serialize;
+
+use super::args::Args;
+use crate::errors::CarbideCliResult;
+use crate::rpc::ApiClient;
+
+/// The admin-cli-side projection of `GetMachineBootInterfacesResponse`, shaped
+/// for clean JSON/YAML and for table rendering. Built straight from the proto
+/// response, whose nullable boot-interface fields already carry absence as
+/// `Option` (proto3 field presence).
+#[derive(Debug, Serialize)]
+struct BootInterfacesReport {
+    machine_id: Option<MachineId>,
+    machine_interfaces: Vec<OwnedRow>,
+    predicted_interfaces: Vec<PredictedRow>,
+    explored_endpoints: Vec<ExploredRow>,
+    retained_interfaces: Vec<RetainedRow>,
+    /// MAC the system would boot from now (`pick_boot_interface` over the owned
+    /// rows). `None` when there is no owned candidate yet.
+    effective_boot_interface_mac: Option<String>,
+    /// The fully-populated effective boot interface id, when captured.
+    effective_boot_interface_id: Option<String>,
+    /// True when the stores disagree about which MAC boots this machine.
+    divergent: bool,
+}
+
+#[derive(Debug, Serialize)]
+struct OwnedRow {
+    mac_address: String,
+    primary_interface: bool,
+    boot_interface_id: Option<String>,
+    network_segment_type: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+struct PredictedRow {
+    mac_address: String,
+    primary_interface: bool,
+    boot_interface_id: Option<String>,
+    network_segment_type: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+struct ExploredRow {
+    address: String,
+    boot_interface_mac: Option<String>,
+    boot_interface_id: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+struct RetainedRow {
+    mac_address: String,
+    boot_interface_id: String,
+    recorded_at: Option<String>,
+}
+
+impl From<forgerpc::GetMachineBootInterfacesResponse> for BootInterfacesReport {
+    fn from(r: forgerpc::GetMachineBootInterfacesResponse) -> Self {
+        BootInterfacesReport {
+            machine_id: r.machine_id,
+            machine_interfaces: r
+                .machine_interfaces
+                .into_iter()
+                .map(|i| OwnedRow {
+                    mac_address: i.mac_address,
+                    primary_interface: i.primary_interface,
+                    boot_interface_id: i.boot_interface_id,
+                    network_segment_type: i.network_segment_type,
+                })
+                .collect(),
+            predicted_interfaces: r
+                .predicted_interfaces
+                .into_iter()
+                .map(|p| PredictedRow {
+                    mac_address: p.mac_address,
+                    primary_interface: p.primary_interface,
+                    boot_interface_id: p.boot_interface_id,
+                    network_segment_type: p.network_segment_type,
+                })
+                .collect(),
+            explored_endpoints: r
+                .explored_endpoints
+                .into_iter()
+                .map(|e| ExploredRow {
+                    address: e.address,
+                    boot_interface_mac: e.boot_interface_mac,
+                    boot_interface_id: e.boot_interface_id,
+                })
+                .collect(),
+            retained_interfaces: r
+                .retained_interfaces
+                .into_iter()
+                .map(|t| RetainedRow {
+                    mac_address: t.mac_address,
+                    boot_interface_id: t.boot_interface_id,
+                    recorded_at: t.recorded_at.map(|ts| ts.to_string()),
+                })
+                .collect(),
+            effective_boot_interface_mac: r.effective_boot_interface_mac,
+            effective_boot_interface_id: r.effective_boot_interface_id,
+            divergent: r.divergent,
+        }
+    }
+}
+
+pub async fn handle_boot_interfaces(
+    args: Args,
+    output_format: OutputFormat,
+    api_client: &ApiClient,
+) -> CarbideCliResult<()> {
+    let response = api_client.get_machine_boot_interfaces(args.machine).await?;
+    let report = BootInterfacesReport::from(response);
+
+    match output_format {
+        OutputFormat::Json => {
+            println!("{}", serde_json::to_string_pretty(&report)?);
+        }
+        OutputFormat::Yaml => {
+            println!("{}", serde_yaml::to_string(&report)?);
+        }
+        // CSV is a poor fit for a multi-section report; AsciiTable is the
+        // human-readable form. Both render the same tables.
+        OutputFormat::AsciiTable | OutputFormat::Csv => {
+            print!("{}", render_tables(&report));
+        }
+    }
+    Ok(())
+}
+
+/// One labeled table per store, then a summary block with the effective boot
+/// interface and the divergence flag.
+fn render_tables(report: &BootInterfacesReport) -> String {
+    let mut out = String::new();
+    let dash = |s: &Option<String>| s.as_deref().unwrap_or("-").to_string();
+
+    let machine_id = report
+        .machine_id
+        .map(|id| id.to_string())
+        .unwrap_or_default();
+    let _ = writeln!(out, "Boot interfaces for machine {machine_id}");
+
+    // Store 1: owned interface rows (authoritative for an owned machine).
+    let _ = writeln!(out, "\nmachine_interfaces (owned rows):");
+    let mut owned = Table::new();
+    owned.set_titles(Row::new(
+        [
+            "MAC Address",
+            "Primary",
+            "Boot Interface Id",
+            "Segment Type",
+        ]
+        .into_iter()
+        .map(Cell::new)
+        .collect(),
+    ));
+    if report.machine_interfaces.is_empty() {
+        owned.add_row(Row::new(vec![Cell::new("(none)")]));
+    } else {
+        for i in &report.machine_interfaces {
+            owned.add_row(Row::new(vec![
+                Cell::new(&i.mac_address),
+                Cell::new(&i.primary_interface.to_string()),
+                Cell::new(&dash(&i.boot_interface_id)),
+                Cell::new(&dash(&i.network_segment_type)),
+            ]));
+        }
+    }
+    let _ = write!(out, "{owned}");
+
+    // Store 2: predictions (pre-first-lease candidates).
+    let _ = writeln!(out, "\npredicted_machine_interfaces:");
+    let mut predicted = Table::new();
+    predicted.set_titles(Row::new(
+        [
+            "MAC Address",
+            "Primary",
+            "Boot Interface Id",
+            "Segment Type",
+        ]
+        .into_iter()
+        .map(Cell::new)
+        .collect(),
+    ));
+    if report.predicted_interfaces.is_empty() {
+        predicted.add_row(Row::new(vec![Cell::new("(none)")]));
+    } else {
+        for p in &report.predicted_interfaces {
+            predicted.add_row(Row::new(vec![
+                Cell::new(&p.mac_address),
+                Cell::new(&p.primary_interface.to_string()),
+                Cell::new(&dash(&p.boot_interface_id)),
+                Cell::new(&dash(&p.network_segment_type)),
+            ]));
+        }
+    }
+    let _ = write!(out, "{predicted}");
+
+    // Store 3: explored endpoint default (machine-less default; shown for the
+    // machine's BMC endpoints).
+    let _ = writeln!(out, "\nexplored_endpoints (default for unowned endpoints):");
+    let mut explored = Table::new();
+    explored.set_titles(Row::new(
+        [
+            "Endpoint Address",
+            "Boot Interface MAC",
+            "Boot Interface Id",
+        ]
+        .into_iter()
+        .map(Cell::new)
+        .collect(),
+    ));
+    if report.explored_endpoints.is_empty() {
+        explored.add_row(Row::new(vec![Cell::new("(none)")]));
+    } else {
+        for e in &report.explored_endpoints {
+            explored.add_row(Row::new(vec![
+                Cell::new(&e.address),
+                Cell::new(&dash(&e.boot_interface_mac)),
+                Cell::new(&dash(&e.boot_interface_id)),
+            ]));
+        }
+    }
+    let _ = write!(out, "{explored}");
+
+    // Store 4: retained post-deletion pairs (raw, including stale records).
+    let _ = writeln!(
+        out,
+        "\nretained_boot_interfaces (post-deletion, incl. stale):"
+    );
+    let mut retained = Table::new();
+    retained.set_titles(Row::new(
+        ["MAC Address", "Boot Interface Id", "Recorded At"]
+            .into_iter()
+            .map(Cell::new)
+            .collect(),
+    ));
+    if report.retained_interfaces.is_empty() {
+        retained.add_row(Row::new(vec![Cell::new("(none)")]));
+    } else {
+        for t in &report.retained_interfaces {
+            retained.add_row(Row::new(vec![
+                Cell::new(&t.mac_address),
+                Cell::new(&t.boot_interface_id),
+                Cell::new(&dash(&t.recorded_at)),
+            ]));
+        }
+    }
+    let _ = write!(out, "{retained}");
+
+    // Summary: the effective pick and the divergence flag.
+    let _ = writeln!(
+        out,
+        "\nEffective boot interface MAC: {}",
+        dash(&report.effective_boot_interface_mac)
+    );
+    let _ = writeln!(
+        out,
+        "Effective boot interface id: {}",
+        dash(&report.effective_boot_interface_id)
+    );
+    let _ = writeln!(out, "Stores diverge on boot MAC: {}", report.divergent);
+
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A fixed report exercising every store, a captured pair, a stale retained
+    /// record, a declared primary, and divergence.
+ fn sample_report() -> BootInterfacesReport { + BootInterfacesReport { + machine_id: None, + machine_interfaces: vec![OwnedRow { + mac_address: "aa:bb:cc:00:00:01".to_string(), + primary_interface: true, + boot_interface_id: Some("NIC.Slot.1-1-1".to_string()), + network_segment_type: Some("HostInband".to_string()), + }], + predicted_interfaces: vec![PredictedRow { + mac_address: "aa:bb:cc:00:00:02".to_string(), + primary_interface: false, + boot_interface_id: None, + network_segment_type: Some("Admin".to_string()), + }], + explored_endpoints: vec![ExploredRow { + address: "10.0.0.5".to_string(), + // A different NIC than the effective owned pick -> divergence. + boot_interface_mac: Some("aa:bb:cc:00:00:09".to_string()), + boot_interface_id: Some("NIC.Slot.9-1-1".to_string()), + }], + retained_interfaces: vec![RetainedRow { + mac_address: "aa:bb:cc:00:00:03".to_string(), + boot_interface_id: "NIC.Old.1-1-1".to_string(), + recorded_at: Some("2026-06-01T00:00:00Z".to_string()), + }], + effective_boot_interface_mac: Some("aa:bb:cc:00:00:01".to_string()), + effective_boot_interface_id: Some("NIC.Slot.1-1-1".to_string()), + divergent: true, + } + } + + #[test] + fn ascii_table_shows_each_store_and_summary() { + let table = render_tables(&sample_report()); + + // Section labels. + assert!(table.contains("machine_interfaces (owned rows):")); + assert!(table.contains("predicted_machine_interfaces:")); + assert!(table.contains("explored_endpoints")); + assert!(table.contains("retained_boot_interfaces")); + + // The boot_interface_id of the owned row. + assert!(table.contains("NIC.Slot.1-1-1")); + // The primary flag. + assert!(table.contains("true")); + // The retained record's recorded_at. + assert!(table.contains("2026-06-01T00:00:00Z")); + // The effective pick and divergence flag. 
+ assert!(table.contains("Effective boot interface MAC: aa:bb:cc:00:00:01")); + assert!(table.contains("Stores diverge on boot MAC: true")); + } + + #[test] + fn json_round_trips_with_every_field() { + let json = serde_json::to_string_pretty(&sample_report()).expect("serialize json"); + + // Field presence. + assert!(json.contains("\"boot_interface_id\"")); + assert!(json.contains("NIC.Slot.1-1-1")); + assert!(json.contains("\"recorded_at\"")); + assert!(json.contains("2026-06-01T00:00:00Z")); + assert!(json.contains("\"primary_interface\": true")); + assert!(json.contains("\"divergent\": true")); + + // Round-trips into a generic JSON value with the expected structure. + let value: serde_json::Value = serde_json::from_str(&json).expect("parse json"); + assert_eq!(value["divergent"], serde_json::Value::Bool(true)); + assert_eq!(value["machine_interfaces"][0]["primary_interface"], true); + assert_eq!( + value["machine_interfaces"][0]["boot_interface_id"], + "NIC.Slot.1-1-1" + ); + assert_eq!( + value["retained_interfaces"][0]["recorded_at"], + "2026-06-01T00:00:00Z" + ); + assert_eq!(value["effective_boot_interface_mac"], "aa:bb:cc:00:00:01"); + } + + #[test] + fn yaml_round_trips_with_every_field() { + let yaml = serde_yaml::to_string(&sample_report()).expect("serialize yaml"); + + assert!(yaml.contains("boot_interface_id:")); + assert!(yaml.contains("NIC.Slot.1-1-1")); + assert!(yaml.contains("recorded_at:")); + assert!(yaml.contains("divergent: true")); + assert!(yaml.contains("primary_interface: true")); + + // Round-trips back into a generic YAML value. 
+ let value: serde_yaml::Value = serde_yaml::from_str(&yaml).expect("parse yaml"); + assert_eq!(value["divergent"], serde_yaml::Value::Bool(true)); + assert_eq!( + value["retained_interfaces"][0]["recorded_at"], + serde_yaml::Value::String("2026-06-01T00:00:00Z".to_string()) + ); + } + + /// The proto -> report conversion: absent fields (proto3 field presence, + /// `None`) stay `None`, present ones pass through, and a `Timestamp` + /// renders as RFC 3339. + #[test] + fn from_proto_response_maps_fields() { + let response = forgerpc::GetMachineBootInterfacesResponse { + machine_id: None, + machine_interfaces: vec![forgerpc::MachineInterfaceBootInterface { + mac_address: "aa:bb:cc:00:00:01".to_string(), + primary_interface: true, + boot_interface_id: Some("NIC.Slot.1-1-1".to_string()), + network_segment_type: Some("HostInband".to_string()), + }], + predicted_interfaces: vec![], + explored_endpoints: vec![forgerpc::ExploredBootInterface { + address: "10.0.0.5".to_string(), + // An absent boot MAC -> `None` in the report. + boot_interface_mac: None, + boot_interface_id: Some("NIC.Slot.9-1-1".to_string()), + }], + retained_interfaces: vec![forgerpc::RetainedBootInterface { + mac_address: "aa:bb:cc:00:00:03".to_string(), + boot_interface_id: "NIC.Old.1-1-1".to_string(), + // The default Timestamp is the unix epoch; Display renders RFC 3339. + recorded_at: Some(Default::default()), + }], + effective_boot_interface_mac: Some("aa:bb:cc:00:00:01".to_string()), + // Absent -> `None`. + effective_boot_interface_id: None, + divergent: false, + }; + + let report = BootInterfacesReport::from(response); + + // Present values pass through; absent ones stay `None`. 
+ assert_eq!( + report.machine_interfaces[0].boot_interface_id.as_deref(), + Some("NIC.Slot.1-1-1") + ); + assert!(report.machine_interfaces[0].primary_interface); + assert_eq!(report.explored_endpoints[0].boot_interface_mac, None); + assert_eq!(report.effective_boot_interface_id, None); + assert_eq!( + report.effective_boot_interface_mac.as_deref(), + Some("aa:bb:cc:00:00:01") + ); + // The Timestamp renders as an RFC 3339 string (epoch default here). + assert_eq!( + report.retained_interfaces[0].recorded_at.as_deref(), + Some("1970-01-01T00:00:00Z") + ); + } +} diff --git a/crates/admin-cli/src/machine/boot_interfaces/mod.rs b/crates/admin-cli/src/machine/boot_interfaces/mod.rs new file mode 100644 index 0000000000..e8176028df --- /dev/null +++ b/crates/admin-cli/src/machine/boot_interfaces/mod.rs @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +pub mod args; +pub mod cmd; + +pub use args::Args; + +use crate::cfg::run::Run; +use crate::cfg::runtime::RuntimeContext; +use crate::errors::CarbideCliResult; + +impl Run for Args { + async fn run(self, ctx: &mut RuntimeContext) -> CarbideCliResult<()> { + cmd::handle_boot_interfaces(self, ctx.config.format, &ctx.api_client).await + } +} diff --git a/crates/admin-cli/src/machine/mod.rs b/crates/admin-cli/src/machine/mod.rs index b49233b023..fcb019562f 100644 --- a/crates/admin-cli/src/machine/mod.rs +++ b/crates/admin-cli/src/machine/mod.rs @@ -16,6 +16,7 @@ */ pub mod auto_update; +pub mod boot_interfaces; pub mod common; pub mod force_delete; pub mod hardware_info; @@ -45,6 +46,16 @@ use crate::cfg::dispatch::Dispatch; pub enum Cmd { #[clap(about = "Display Machine information")] Show(show::Args), + #[clap( + about = "Show a machine's boot interfaces from every store (troubleshooting)", + long_about = "Gather one machine's boot-interface view from all four stores and print \ + them together: the owned `machine_interfaces` rows (authoritative for an owned \ + machine), `predicted_machine_interfaces` (pre-first-lease candidates), the \ + `explored_endpoints` default (for unowned endpoints), and the retained \ + post-deletion pairs (including stale records). Also reports the effective boot \ + interface the system would select and flags when the stores disagree. Read-only." 
+ )] + BootInterfaces(boot_interfaces::Args), #[clap(subcommand, about = "Networking information")] Network(network::Args), #[clap( diff --git a/crates/admin-cli/src/rpc.rs b/crates/admin-cli/src/rpc.rs index 927e7b4ca0..07386ad22f 100644 --- a/crates/admin-cli/src/rpc.rs +++ b/crates/admin-cli/src/rpc.rs @@ -106,6 +106,22 @@ impl ApiClient { Ok(machine_details) } + /// Gather one machine's boot-interface view across all four stores -- the + /// owned interface rows, predictions, the explored endpoint default, and + /// the retained post-deletion pairs -- plus the effective boot interface + /// and a divergence flag. Read-only. + pub async fn get_machine_boot_interfaces( + &self, + id: MachineId, + ) -> CarbideCliResult { + Ok(self + .0 + .get_machine_boot_interfaces(rpc::GetMachineBootInterfacesRequest { + machine_id: Some(id), + }) + .await?) + } + pub async fn get_all_machines( &self, request: rpc::MachineSearchConfig, diff --git a/crates/api-core/src/api.rs b/crates/api-core/src/api.rs index 631aad6aa3..bfd5c673dc 100644 --- a/crates/api-core/src/api.rs +++ b/crates/api-core/src/api.rs @@ -1357,6 +1357,13 @@ impl Forge for Api { crate::handlers::boot_override::clear(self, request).await } + async fn get_machine_boot_interfaces( + &self, + request: Request, + ) -> Result, Status> { + crate::handlers::machine_boot_interfaces::get_machine_boot_interfaces(self, request).await + } + async fn get_network_topology( &self, request: Request, diff --git a/crates/api-core/src/auth/internal_rbac_rules.rs b/crates/api-core/src/auth/internal_rbac_rules.rs index d0b2893563..12ae108501 100644 --- a/crates/api-core/src/auth/internal_rbac_rules.rs +++ b/crates/api-core/src/auth/internal_rbac_rules.rs @@ -305,6 +305,7 @@ impl InternalRBACRules { x.perm("GetMachineBootOverride", vec![ForgeAdminCLI]); x.perm("SetMachineBootOverride", vec![ForgeAdminCLI]); x.perm("ClearMachineBootOverride", vec![ForgeAdminCLI]); + x.perm("GetMachineBootInterfaces", vec![ForgeAdminCLI]); 
x.perm("GetNetworkTopology", vec![ForgeAdminCLI]); x.perm("FindNetworkDevicesByDeviceIds", vec![ForgeAdminCLI]); x.perm("CreateCredential", vec![ForgeAdminCLI]); diff --git a/crates/api-core/src/handlers/machine_boot_interfaces.rs b/crates/api-core/src/handlers/machine_boot_interfaces.rs new file mode 100644 index 0000000000..cdc20c48dd --- /dev/null +++ b/crates/api-core/src/handlers/machine_boot_interfaces.rs @@ -0,0 +1,178 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! One machine's boot-interface view, gathered from every store that records +//! it. This is a read-only troubleshooting projection: it reports the four +//! places a host's boot interface can live -- owned `machine_interfaces` rows, +//! `predicted_machine_interfaces`, the `explored_endpoints` default, and the +//! post-deletion `retained_boot_interfaces` pairs -- alongside the effective +//! boot interface the system would select via `pick_boot_interface`, and a +//! divergence flag for when the stores disagree about which NIC boots. 
+
+use std::collections::BTreeSet;
+
+use ::rpc::forge as rpc;
+use mac_address::MacAddress;
+use tonic::{Request, Response, Status};
+
+use crate::api::{Api, log_request_data};
+use crate::handlers::utils::convert_and_log_machine_id;
+
+/// Gather the boot-interface view for one machine across all four stores.
+///
+/// All four stores are read within a single read transaction. The effective
+/// boot interface is the same `pick_boot_interface` selection every other
+/// flow acts on, applied to the owned `machine_interfaces` rows only --
+/// predictions, explored defaults and retained pairs do not feed the pick here.
+pub(crate) async fn get_machine_boot_interfaces(
+    api: &Api,
+    request: Request<rpc::GetMachineBootInterfacesRequest>,
+) -> Result<Response<rpc::GetMachineBootInterfacesResponse>, Status> {
+    log_request_data(&request);
+    let request = request.into_inner();
+    let machine_id = convert_and_log_machine_id(request.machine_id.as_ref())?;
+
+    let mut txn = api.txn_begin().await?;
+
+    // Store 1: owned interface rows -- the authoritative store for a machine
+    // that exists. `find_by_machine_ids` returns a per-machine map.
+    let owned_interfaces = db::machine_interface::find_by_machine_ids(txn.as_mut(), &[machine_id])
+        .await?
+        .remove(&machine_id)
+        .unwrap_or_default();
+
+    // Store 2: predictions -- the boot candidates a host offers before its
+    // first DHCP lease creates an owned row.
+    let predicted_interfaces =
+        db::predicted_machine_interface::find_by_machine_id(txn.as_mut(), &machine_id).await?;
+
+    // Store 3: the explored endpoint default. The machine's BMC IP(s) map it to
+    // the explored endpoints site-explorer recorded a default against.
+    let bmc_pairs =
+        db::machine_topology::find_machine_bmc_pairs_by_machine_id(txn.as_mut(), vec![machine_id])
+            .await?;
+    let bmc_ips: Vec<std::net::IpAddr> = bmc_pairs
+        .into_iter()
+        .filter_map(|(_, ip)| ip)
+        .filter_map(|ip| ip.parse().ok())
+        .collect();
+    let explored_endpoints = if bmc_ips.is_empty() {
+        Vec::new()
+    } else {
+        // `find_by_ips` takes `impl DbReader`; the wrapping transaction
+        // implements it directly (a bare `&mut PgConnection` would need a
+        // coercion that generic bound can't perform).
+        db::explored_endpoints::find_by_ips(&mut txn, bmc_ips).await?
+    };
+
+    // Store 4: the retained post-deletion pairs. Collect the MACs the machine
+    // knows about and read their raw retained rows -- un-window-filtered, so
+    // stale records show up in the troubleshooting view. Owned (store 1) and
+    // predicted (store 2) MACs, plus each explored endpoint's recorded boot MAC
+    // (store 3): a retained record keyed on the explored boot MAC is surfaced
+    // too, even when no owned/predicted row carries that MAC.
+    let macs: Vec<MacAddress> = owned_interfaces
+        .iter()
+        .map(|i| i.mac_address)
+        .chain(predicted_interfaces.iter().map(|p| p.mac_address))
+        .chain(
+            explored_endpoints
+                .iter()
+                .filter_map(|e| e.boot_interface_mac),
+        )
+        .collect::<BTreeSet<_>>()
+        .into_iter()
+        .collect();
+    let retained_records = if macs.is_empty() {
+        Vec::new()
+    } else {
+        db::retained_boot_interface::find_records_by_macs(&mut txn, &macs).await?
+    };
+
+    txn.commit().await?;
+
+    // The effective boot interface: `pick_boot_interface` over the owned rows
+    // (primary wins, else the lowest-MAC non-underlay NIC). This is what the
+    // controller and admin actions resolve.
+    let effective = model::machine::pick_boot_interface(&owned_interfaces);
+    let effective_mac = effective.map(|i| i.mac_address);
+    let effective_boot_interface = effective.and_then(|i| i.boot_interface());
+
+    // Divergence: do the stores agree on which MAC boots this machine? We
+    // compare the boot-MAC signals each store offers -- the effective owned
+    // pick, every explored endpoint's recorded default, and any predicted NIC
+    // flagged primary -- and flag a disagreement when more than one distinct
+    // MAC turns up. (Retained rows are post-deletion history, shown for context
+    // but not part of the agreement check.) A single signal, or none, is not a
+    // divergence.
+    let mut boot_macs: BTreeSet<MacAddress> = BTreeSet::new();
+    if let Some(mac) = effective_mac {
+        boot_macs.insert(mac);
+    }
+    for endpoint in &explored_endpoints {
+        if let Some(mac) = endpoint.boot_interface_mac {
+            boot_macs.insert(mac);
+        }
+    }
+    for prediction in &predicted_interfaces {
+        if prediction.primary_interface {
+            boot_macs.insert(prediction.mac_address);
+        }
+    }
+    let divergent = boot_macs.len() > 1;
+
+    Ok(Response::new(rpc::GetMachineBootInterfacesResponse {
+        machine_id: Some(machine_id),
+        machine_interfaces: owned_interfaces
+            .iter()
+            .map(|i| rpc::MachineInterfaceBootInterface {
+                mac_address: i.mac_address.to_string(),
+                primary_interface: i.primary_interface,
+                boot_interface_id: i.boot_interface_id.clone(),
+                network_segment_type: i.network_segment_type.map(|t| t.to_string()),
+            })
+            .collect(),
+        predicted_interfaces: predicted_interfaces
+            .iter()
+            .map(|p| rpc::PredictedBootInterface {
+                mac_address: p.mac_address.to_string(),
+                primary_interface: p.primary_interface,
+                boot_interface_id: p.boot_interface_id.clone(),
+                network_segment_type: Some(p.expected_network_segment_type.to_string()),
+            })
+            .collect(),
+        explored_endpoints: explored_endpoints
+            .iter()
+            .map(|e| rpc::ExploredBootInterface {
+                address: e.address.to_string(),
+                boot_interface_mac: e.boot_interface_mac.map(|m| m.to_string()),
+                boot_interface_id: e.boot_interface_id.clone(),
+            })
+            .collect(),
+        retained_interfaces: retained_records
+            .iter()
+            .map(|r| rpc::RetainedBootInterface {
+                mac_address: r.mac_address.to_string(),
+                boot_interface_id: r.boot_interface_id.clone(),
+                recorded_at: Some(r.recorded_at.into()),
+            })
+            .collect(),
+        effective_boot_interface_mac: effective_mac.map(|m| m.to_string()),
+        effective_boot_interface_id: effective_boot_interface.map(|b| b.interface_id),
+        divergent,
+    }))
+}
diff --git a/crates/api-core/src/handlers/mod.rs b/crates/api-core/src/handlers/mod.rs
index f3ee50aca2..12bf5ace35 100644
--- a/crates/api-core/src/handlers/mod.rs
+++ b/crates/api-core/src/handlers/mod.rs
@@ -46,6 +46,7 @@ pub mod instance;
 pub mod instance_type;
 pub mod logical_partition;
 pub mod machine;
+pub mod machine_boot_interfaces;
 pub mod machine_discovery;
 pub mod machine_hardware_info;
 pub mod machine_identity;
diff --git a/crates/api-core/src/tests/machine_boot_interfaces.rs b/crates/api-core/src/tests/machine_boot_interfaces.rs
new file mode 100644
index 0000000000..7b72032752
--- /dev/null
+++ b/crates/api-core/src/tests/machine_boot_interfaces.rs
@@ -0,0 +1,253 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//! `GetMachineBootInterfaces` gathers one machine's boot-interface view from
+//! all four stores -- owned interface rows, predictions, the explored endpoint
+//! default, and the retained post-deletion pairs -- and reports the effective
+//! boot interface plus a divergence flag. These tests seed the stores for one
+//! 
host and assert the gathered view. + +use mac_address::MacAddress; +use model::network_segment::NetworkSegmentType; +use model::predicted_machine_interface::NewPredictedMachineInterface; +use model::test_support::ManagedHostConfig; +use rpc::forge; +use rpc::forge::forge_server::Forge; + +use crate::sqlx_test; +use crate::test_support::fixture_config::{FixtureDefault as _, ManagedHostConfigExt as _}; +use crate::tests::common::api_fixtures; + +#[sqlx_test] +async fn test_get_machine_boot_interfaces_gathers_all_four_stores( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = api_fixtures::create_test_env(pool).await; + + // A real ingested host gives us owned `machine_interfaces` rows, BMC + // topology, and explored endpoints -- stores 1 and 3. + let host = + api_fixtures::site_explorer::new_host(&env, ManagedHostConfig::default().with_dpu_count(1)) + .await?; + let host_id = host.host_snapshot.id; + + // Read the owned rows the host ended up with: the primary is the effective + // boot interface `pick_boot_interface` selects, and we reuse its MAC to + // seed a retained record (store 4). + let primary_mac = { + let mut txn = env.pool.begin().await?; + let interfaces = db::machine_interface::find_by_machine_ids(txn.as_mut(), &[host_id]) + .await? + .remove(&host_id) + .expect("host should have interface rows"); + txn.rollback().await?; + interfaces + .iter() + .find(|i| i.primary_interface) + .expect("a DPU host has a primary interface") + .mac_address + }; + // A DPU host's primary carries a boot-interface id; seed a known one so the + // effective-pick contract is asserted against a concrete value -- a regression + // to no id then fails loudly instead of defaulting the assertion away. + let primary_boot_id = "NIC.Primary.1-1-1"; + + // Store 2: a prediction for this host, flagged primary, on a MAC that is + // NOT the owned effective pick -- two disagreeing boot-MAC signals, so the + // view must flag divergence. 
+ let predicted_mac: MacAddress = "aa:bb:cc:dd:ee:01".parse()?; + // Store 4: a retained pair on the primary's MAC, aged well past any window. + // `find_records_by_macs` ignores the window, so the troubleshooting view + // surfaces it even though `find_by_mac` would hide a stale record. + { + let mut txn = env.pool.begin().await?; + db::predicted_machine_interface::create( + NewPredictedMachineInterface { + machine_id: &host_id, + mac_address: predicted_mac, + expected_network_segment_type: NetworkSegmentType::HostInband, + boot_interface_id: Some("NIC.Predicted.1-1-1".to_string()), + primary_interface: true, + }, + txn.as_mut(), + ) + .await?; + + // Seed the owned primary's boot-interface id so the effective pick has a + // concrete value to assert. + db::machine_interface::set_boot_interface_id(primary_mac, primary_boot_id, txn.as_mut()) + .await?; + + // Store 3: give the host's explored BMC endpoint a recorded boot + // interface so the explored-endpoint store has concrete data to + // surface. Resolve the BMC IP the same way the handler does (machine -> + // BMC pairs -> explored endpoint at that address) and set its default to + // the owned primary -- naming the same boot NIC, so it adds no new + // distinct boot-MAC signal and leaves the divergence verdict to the + // conflicting prediction. + let bmc_ip: std::net::IpAddr = + db::machine_topology::find_machine_bmc_pairs_by_machine_id(txn.as_mut(), vec![host_id]) + .await? 
+ .into_iter() + .find_map(|(_, ip)| ip) + .expect("the ingested host should have a BMC address") + .parse()?; + db::explored_endpoints::set_boot_interface( + bmc_ip, + &model::machine_boot_interface::MachineBootInterface { + mac_address: primary_mac, + interface_id: primary_boot_id.to_string(), + }, + txn.as_mut(), + ) + .await?; + + db::retained_boot_interface::upsert(txn.as_mut(), primary_mac, "NIC.Retained.9-9-9") + .await?; + sqlx::query( + "UPDATE retained_boot_interfaces SET recorded_at = NOW() - INTERVAL '30 days' \ + WHERE mac_address = $1", + ) + .bind(primary_mac) + .execute(txn.as_mut()) + .await?; + txn.commit().await?; + } + + let report = env + .api + .get_machine_boot_interfaces(tonic::Request::new( + forge::GetMachineBootInterfacesRequest { + machine_id: Some(host_id), + }, + )) + .await? + .into_inner(); + + assert_eq!(report.machine_id, Some(host_id)); + + // Store 1: the owned rows include the primary, and the primary is flagged. + assert!( + !report.machine_interfaces.is_empty(), + "owned interface rows should be reported" + ); + let reported_primary = report + .machine_interfaces + .iter() + .find(|i| i.mac_address == primary_mac.to_string()) + .expect("the primary should appear among owned rows"); + assert!( + reported_primary.primary_interface, + "the primary row should carry the primary flag" + ); + + // Store 2: the seeded prediction shows up with its id and primary flag. + let reported_prediction = report + .predicted_interfaces + .iter() + .find(|p| p.mac_address == predicted_mac.to_string()) + .expect("the seeded prediction should be reported"); + assert!(reported_prediction.primary_interface); + assert_eq!( + reported_prediction.boot_interface_id.as_deref(), + Some("NIC.Predicted.1-1-1"), + "the prediction's boot interface id should be reported" + ); + + // Store 3: the host's explored BMC endpoint is surfaced with the boot + // interface we recorded against it. 
+ let reported_explored = report + .explored_endpoints + .iter() + .find(|e| e.boot_interface_mac.as_deref() == Some(primary_mac.to_string().as_str())) + .expect("the host's explored endpoint default should be reported"); + assert_eq!( + reported_explored.boot_interface_id.as_deref(), + Some(primary_boot_id), + "the explored endpoint's recorded boot interface id should be reported" + ); + + // Store 4: the stale retained record is surfaced with its recorded_at, + // proving the un-window-filtered read. + let reported_retained = report + .retained_interfaces + .iter() + .find(|r| r.mac_address == primary_mac.to_string()) + .expect("the stale retained record should be surfaced"); + assert_eq!(reported_retained.boot_interface_id, "NIC.Retained.9-9-9"); + assert!( + reported_retained.recorded_at.is_some(), + "the retained record should carry recorded_at" + ); + + // Effective pick: the owned primary's MAC. + assert_eq!( + report.effective_boot_interface_mac.as_deref(), + Some(primary_mac.to_string().as_str()), + "the effective boot interface is the owned primary" + ); + assert_eq!( + report.effective_boot_interface_id.as_deref(), + Some(primary_boot_id), + "the effective boot interface id is the primary row's captured boot-interface id" + ); + + // Divergence: the predicted primary disagrees with the owned pick. + assert!( + report.divergent, + "a predicted primary on a different MAC than the owned pick is a divergence" + ); + + Ok(()) +} + +#[sqlx_test] +async fn test_get_machine_boot_interfaces_agrees_when_only_owned_rows_exist( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = api_fixtures::create_test_env(pool).await; + + let host = + api_fixtures::site_explorer::new_host(&env, ManagedHostConfig::default().with_dpu_count(1)) + .await?; + let host_id = host.host_snapshot.id; + + let report = env + .api + .get_machine_boot_interfaces(tonic::Request::new( + forge::GetMachineBootInterfacesRequest { + machine_id: Some(host_id), + }, + )) + .await? 
+ .into_inner(); + + // No predictions seeded. + assert!(report.predicted_interfaces.is_empty()); + // The owned primary is the effective pick. + assert!(report.effective_boot_interface_mac.is_some()); + + // With at most one distinct boot-MAC signal (the owned pick; the explored + // default, when recorded, names the same boot NIC for a DPU host), the + // stores do not diverge. + assert!( + !report.divergent, + "a freshly ingested host with no conflicting prediction should not diverge" + ); + + Ok(()) +} diff --git a/crates/api-core/src/tests/mod.rs b/crates/api-core/src/tests/mod.rs index 842cedd347..5338cb8f3c 100644 --- a/crates/api-core/src/tests/mod.rs +++ b/crates/api-core/src/tests/mod.rs @@ -64,6 +64,7 @@ mod lldp; mod mac_address_pool; mod machine_admin_force_delete; mod machine_bmc_metadata; +mod machine_boot_interfaces; mod machine_boot_override; mod machine_dhcp; mod machine_discovery; diff --git a/crates/api-db/src/retained_boot_interface.rs b/crates/api-db/src/retained_boot_interface.rs index 76acf57784..8201c41e33 100644 --- a/crates/api-db/src/retained_boot_interface.rs +++ b/crates/api-db/src/retained_boot_interface.rs @@ -37,9 +37,22 @@ //! Migrations consume their records within minutes either way. use mac_address::MacAddress; -use sqlx::PgConnection; +use sqlx::{FromRow, PgConnection}; use crate::DatabaseError; +use crate::db_read::DbReader; + +/// One raw `retained_boot_interfaces` row: the preserved boot interface id for +/// a MAC and when it was recorded, returned verbatim with no retention-window +/// filtering. Built for the boot-interface troubleshooting view, which wants to +/// surface stale records too -- the window-filtered [`find_by_mac`] and the +/// consuming `take_by_mac` would hide or remove them. 
+#[derive(Debug, Clone, FromRow)] +pub struct RetainedBootInterfaceRecord { + pub mac_address: MacAddress, + pub boot_interface_id: String, + pub recorded_at: chrono::DateTime, +} /// Record the boot interface pair for a MAC, overwriting any prior record /// (the newest observation wins). @@ -81,6 +94,29 @@ pub async fn find_by_mac( .map_err(|e| DatabaseError::query(query, e)) } +/// Fetch the full retained records for a set of MACs without consuming them +/// and without any retention-window filtering -- every matching row, including +/// ones aged past the configured window, is returned with its `recorded_at`. +/// +/// This is the troubleshooting read: where [`find_by_mac`] answers "would this +/// MAC's pair still apply?" (window-filtered, value only), this answers "what +/// is actually on file for these MACs, fresh or stale?" so an operator can see +/// a record that exists but has aged out. Production reuse flows must stay on +/// the window-aware `take_by_mac`/`find_by_mac`. +pub async fn find_records_by_macs( + db: impl DbReader<'_>, + mac_addresses: &[MacAddress], +) -> Result, DatabaseError> { + let query = "SELECT mac_address, boot_interface_id, recorded_at \ + FROM retained_boot_interfaces WHERE mac_address = ANY($1) \ + ORDER BY mac_address"; + sqlx::query_as(query) + .bind(mac_addresses) + .fetch_all(db) + .await + .map_err(|e| DatabaseError::query(query, e)) +} + /// Consume the retained record for a MAC, returning its boot interface id /// when the record is within `window` (always, when no window is set). 
The /// record is removed either way -- a `machine_interfaces` row now diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index 6be0fa6776..346f28a7c9 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -315,6 +315,26 @@ fn main() -> Result<(), Box> { "forge.ManagedHostNetworkConfigResponse", "#[derive(serde::Serialize)]", ) + .type_attribute( + "forge.GetMachineBootInterfacesResponse", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.MachineInterfaceBootInterface", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.PredictedBootInterface", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.ExploredBootInterface", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.RetainedBootInterface", + "#[derive(serde::Serialize)]", + ) .type_attribute( "forge.RoutingProfile", "#[derive(serde::Serialize)]", diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index e1744d3579..16e801d546 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -376,6 +376,12 @@ service Forge { rpc SetMachineBootOverride(MachineBootOverride) returns (google.protobuf.Empty); rpc ClearMachineBootOverride(common.MachineInterfaceId) returns (google.protobuf.Empty); + // Gather one machine's boot-interface view from every store that records it + // (owned interface rows, predictions, the explored endpoint default, and the + // retained post-deletion pairs), plus the effective boot interface the + // system would select. Read-only; built for troubleshooting and verification. + rpc GetMachineBootInterfaces(GetMachineBootInterfacesRequest) returns (GetMachineBootInterfacesResponse); + // Get Network topology rpc GetNetworkTopology(NetworkTopologyRequest) returns (NetworkTopologyData); rpc FindNetworkDevicesByDeviceIds(NetworkDeviceIdList) returns (NetworkTopologyData); @@ -8743,3 +8749,79 @@ message ReWrapSecretsResponse { // unrouted KEK can be retired. 
uint64 stale_remaining = 3; } + +// --------------------------------------------------------------------------- +// GetMachineBootInterfaces: one machine's boot-interface view, gathered from +// every store that records it. Each store gets its own purpose-built report +// message that explicitly carries `boot_interface_id` -- the regular +// `MachineInterface` message drops it, so these are separate report types. +// --------------------------------------------------------------------------- + +message GetMachineBootInterfacesRequest { + common.MachineId machine_id = 1; +} + +// A `machine_interfaces` row's boot interface: the authoritative store for an +// owned machine. `primary_interface` is the designation `pick_boot_interface` +// keys on. +message MachineInterfaceBootInterface { + string mac_address = 1; + bool primary_interface = 2; + // Vendor-named Redfish EthernetInterface.Id, absent until site-explorer has + // captured it for this MAC. + optional string boot_interface_id = 3; + // Segment type of the row, when one is known (denormalized from the segment). + optional string network_segment_type = 4; +} + +// A `predicted_machine_interfaces` row's boot interface: the candidate a host +// offers in the window before its first DHCP lease creates an owned row. +message PredictedBootInterface { + string mac_address = 1; + bool primary_interface = 2; + optional string boot_interface_id = 3; + // The predicted/expected segment type carried on the prediction. + optional string network_segment_type = 4; +} + +// An `explored_endpoints` row's boot interface: site-explorer's per-cycle +// automatic pick for a BMC endpoint, used only for endpoints no machine owns. +message ExploredBootInterface { + // BMC endpoint address this explored default was recorded against. 
+ string address = 1; + optional string boot_interface_mac = 2; + optional string boot_interface_id = 3; +} + +// A `retained_boot_interfaces` row: the last-known boot pair preserved past +// interface deletion, keyed by MAC. `recorded_at` is included so the view can +// show stale records (ones aged past the retention window). +message RetainedBootInterface { + string mac_address = 1; + string boot_interface_id = 2; + google.protobuf.Timestamp recorded_at = 3; +} + +message GetMachineBootInterfacesResponse { + common.MachineId machine_id = 1; + + // Boot interfaces from the four stores. + repeated MachineInterfaceBootInterface machine_interfaces = 2; + repeated PredictedBootInterface predicted_interfaces = 3; + repeated ExploredBootInterface explored_endpoints = 4; + repeated RetainedBootInterface retained_interfaces = 5; + + // The boot interface MAC the system would select for this machine right now, + // applying `pick_boot_interface` to the owned `machine_interfaces` rows. + // Absent when there is no owned candidate yet. + optional string effective_boot_interface_mac = 6; + // The fully-populated effective boot interface id (MAC + Redfish id), when + // the selected row has its interface id captured. Absent otherwise. + optional string effective_boot_interface_id = 7; + + // True when the stores do not all agree on the boot MAC -- a signal worth a + // closer look during troubleshooting (e.g. the explored default points at a + // different NIC than the effective owned pick, or a predicted primary + // disagrees). See the handler for the exact comparison. + bool divergent = 8; +}