diff --git a/crates/admin-cli/src/debug_bundle/cmds.rs b/crates/admin-cli/src/debug_bundle/cmds.rs index 690ed197e1..e629811871 100644 --- a/crates/admin-cli/src/debug_bundle/cmds.rs +++ b/crates/admin-cli/src/debug_bundle/cmds.rs @@ -580,6 +580,7 @@ struct MachineAnalysis { } /// Helper function to get BMC IP and MAC address from machine_id +#[allow(deprecated)] async fn get_bmc_ip_from_host_id( api_client: &ApiClient, host_id: &str, @@ -1698,6 +1699,7 @@ impl<'a> ZipBundleCreator<'a> { Ok(()) } + #[allow(deprecated)] fn add_machine_analysis_json( &self, zip: &mut ZipWriter, @@ -1801,6 +1803,7 @@ impl<'a> ZipBundleCreator<'a> { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] fn add_metadata( &self, zip: &mut ZipWriter, diff --git a/crates/admin-cli/src/dpu/network/cmd.rs b/crates/admin-cli/src/dpu/network/cmd.rs index b595553103..8e69a1fc0b 100644 --- a/crates/admin-cli/src/dpu/network/cmd.rs +++ b/crates/admin-cli/src/dpu/network/cmd.rs @@ -196,6 +196,7 @@ pub async fn show_dpu_network_config( Ok(()) } +#[allow(deprecated)] pub async fn show_dpu_status( api_client: &ApiClient, output_file: &mut Box, diff --git a/crates/admin-cli/src/dpu/reprovision/cmd.rs b/crates/admin-cli/src/dpu/reprovision/cmd.rs index dc0311f724..d732e38b40 100644 --- a/crates/admin-cli/src/dpu/reprovision/cmd.rs +++ b/crates/admin-cli/src/dpu/reprovision/cmd.rs @@ -48,6 +48,7 @@ pub async fn reprovision(api_client: &ApiClient, reprov: Args) -> CarbideCliResu } } +#[allow(deprecated)] async fn apply_health_report( api_client: &ApiClient, id: carbide_uuid::machine::MachineId, diff --git a/crates/admin-cli/src/dpu/status/cmd.rs b/crates/admin-cli/src/dpu/status/cmd.rs index 6eb1f8a850..15f17f9058 100644 --- a/crates/admin-cli/src/dpu/status/cmd.rs +++ b/crates/admin-cli/src/dpu/status/cmd.rs @@ -45,6 +45,7 @@ struct DpuStatus { } impl From for DpuStatus { + #[allow(deprecated)] fn from(machine: Machine) -> Self { let state = match machine.state.split_once(' ') { Some((state, _)) 
=> state.to_owned(), @@ -105,6 +106,7 @@ impl From for Row { } } +#[allow(deprecated)] pub fn get_dpu_version_status(build_info: &BuildInfo, machine: &Machine) -> String { let mut version_statuses = Vec::default(); diff --git a/crates/admin-cli/src/dpu/versions/cmd.rs b/crates/admin-cli/src/dpu/versions/cmd.rs index d04bdf149f..eedd992681 100644 --- a/crates/admin-cli/src/dpu/versions/cmd.rs +++ b/crates/admin-cli/src/dpu/versions/cmd.rs @@ -58,6 +58,7 @@ struct DpuVersions { } impl From for DpuVersions { + #[allow(deprecated)] fn from(machine: Machine) -> Self { let state = match machine.state.split_once(' ') { Some((state, _)) => state.to_owned(), @@ -144,6 +145,7 @@ pub fn generate_firmware_status_table(machines: Vec) -> Box { Box::new(table) } +#[allow(deprecated)] pub async fn handle_dpu_versions( output_file: &mut Box, output_format: OutputFormat, diff --git a/crates/admin-cli/src/host/reprovision/cmd.rs b/crates/admin-cli/src/host/reprovision/cmd.rs index f99870db9f..71ca4b9780 100644 --- a/crates/admin-cli/src/host/reprovision/cmd.rs +++ b/crates/admin-cli/src/host/reprovision/cmd.rs @@ -24,6 +24,7 @@ use crate::errors::{CarbideCliError, CarbideCliResult}; use crate::machine::{HealthReportTemplates, get_health_report}; use crate::rpc::ApiClient; +#[allow(deprecated)] pub async fn trigger_reprovisioning_set( data: ReprovisionSet, api_client: &ApiClient, diff --git a/crates/admin-cli/src/inventory/cmds.rs b/crates/admin-cli/src/inventory/cmds.rs index e03e0662ab..389193a9ba 100644 --- a/crates/admin-cli/src/inventory/cmds.rs +++ b/crates/admin-cli/src/inventory/cmds.rs @@ -88,6 +88,7 @@ struct DpuMachineInfo<'a> { } /// Generate element containing all information needed to write a Machine Host. +#[allow(deprecated)] fn get_host_machine_info<'a>( machines: &'a [&'a ::rpc::Machine], ) -> HashMap<&'a str, HostMachineInfo<'a>> { @@ -127,6 +128,7 @@ fn get_host_machine_info<'a>( } /// Generate element containing all information needed to write a Machine Host. 
+#[allow(deprecated)] fn get_dpu_machine_info<'a>( machines: &'a [&'a ::rpc::Machine], ) -> HashMap<&'a str, DpuMachineInfo<'a>> { @@ -153,6 +155,7 @@ fn get_dpu_machine_info<'a>( } /// Generate element containing all information needed to write a BMC Host. +#[allow(deprecated)] fn get_bmc_info<'a>( machines: &[&'a ::rpc::Machine], managed_hosts: &'a [ExploredManagedHost], @@ -330,6 +333,7 @@ type CreateInventoryReturnType<'a> = ( ); /// Generate inventory item for instances. +#[allow(deprecated)] fn create_inventory_for_instances<'a>( instances: &'a InstanceList, machines: &'a MachineList, diff --git a/crates/admin-cli/src/machine/hardware_info/cmd.rs b/crates/admin-cli/src/machine/hardware_info/cmd.rs index 4ed87a598c..28def2a3ee 100644 --- a/crates/admin-cli/src/machine/hardware_info/cmd.rs +++ b/crates/admin-cli/src/machine/hardware_info/cmd.rs @@ -42,6 +42,7 @@ pub async fn handle_update_machine_hardware_info_gpus( .await } +#[allow(deprecated)] pub async fn handle_show_machine_hardware_info( api_client: &ApiClient, output_file: &mut Box, diff --git a/crates/admin-cli/src/machine/metadata/cmd.rs b/crates/admin-cli/src/machine/metadata/cmd.rs index 5c8e476067..76c00580f1 100644 --- a/crates/admin-cli/src/machine/metadata/cmd.rs +++ b/crates/admin-cli/src/machine/metadata/cmd.rs @@ -117,6 +117,7 @@ pub async fn metadata_remove_labels( Ok(()) } +#[allow(deprecated)] pub async fn metadata_from_expected_machine( api_client: &ApiClient, cmd: MachineMetadataCommandFromExpectedMachine, diff --git a/crates/admin-cli/src/machine/nvlink_info/cmd.rs b/crates/admin-cli/src/machine/nvlink_info/cmd.rs index 3aefe4139d..e43fa83984 100644 --- a/crates/admin-cli/src/machine/nvlink_info/cmd.rs +++ b/crates/admin-cli/src/machine/nvlink_info/cmd.rs @@ -22,6 +22,7 @@ use super::args::{NvlinkInfoArgs, NvlinkInfoPopulateArgs}; use crate::errors::{CarbideCliError, CarbideCliResult}; use crate::rpc::ApiClient; +#[allow(deprecated)] pub async fn handle_nvlink_info_show( args: 
NvlinkInfoArgs, api_client: &ApiClient, @@ -58,6 +59,7 @@ pub async fn handle_nvlink_info_show( Ok(()) } +#[allow(deprecated)] pub async fn handle_nvlink_info_populate( args: NvlinkInfoPopulateArgs, _output_format: OutputFormat, diff --git a/crates/admin-cli/src/machine/show/cmd.rs b/crates/admin-cli/src/machine/show/cmd.rs index d63f180bf2..81a71b6bd2 100644 --- a/crates/admin-cli/src/machine/show/cmd.rs +++ b/crates/admin-cli/src/machine/show/cmd.rs @@ -31,6 +31,7 @@ use crate::errors::{CarbideCliError, CarbideCliResult}; use crate::rpc::ApiClient; use crate::{async_write, async_write_table_as_csv, async_writeln}; +#[allow(deprecated)] fn convert_machine_to_nice_format( machine: forgerpc::Machine, history_count: u32, @@ -217,6 +218,7 @@ fn get_machine_type(machine_id: Option) -> String { .unwrap_or_else(|| "Unknown".to_string()) } +#[allow(deprecated)] fn convert_machines_to_nice_table(machines: forgerpc::MachineList) -> Box
{ let mut table = Box::new(Table::new()); @@ -425,6 +427,7 @@ pub async fn handle_show( Ok(()) } +#[allow(deprecated)] pub async fn get_next_free_machine( api_client: &ApiClient, machine_ids: &mut VecDeque, diff --git a/crates/admin-cli/src/managed_host/show/cmd.rs b/crates/admin-cli/src/managed_host/show/cmd.rs index 454892e5f3..5c23389934 100644 --- a/crates/admin-cli/src/managed_host/show/cmd.rs +++ b/crates/admin-cli/src/managed_host/show/cmd.rs @@ -497,6 +497,7 @@ fn format_health_alerts(alerts: &[HealthProbeAlert], width: usize) -> String { .join(&format!("\n{:, args: Args, diff --git a/crates/admin-cli/src/rpc.rs b/crates/admin-cli/src/rpc.rs index 927e7b4ca0..58c427f676 100644 --- a/crates/admin-cli/src/rpc.rs +++ b/crates/admin-cli/src/rpc.rs @@ -1276,6 +1276,7 @@ impl ApiClient { } /// Build an InstanceAllocationRequest from CLI args and machine info. + #[allow(deprecated)] pub async fn build_instance_request( &self, machine: Machine, diff --git a/crates/api-core/src/attestation/mod.rs b/crates/api-core/src/attestation/mod.rs index 12626f039f..d8d072fec9 100644 --- a/crates/api-core/src/attestation/mod.rs +++ b/crates/api-core/src/attestation/mod.rs @@ -53,6 +53,7 @@ pub async fn get_ek_cert_by_machine_id( // obtain an ek cert let tpm_ek_cert = machine + .status .hardware_info .as_ref() .ok_or_else(|| CarbideError::internal("Hardware Info not found.".to_string()))? 
diff --git a/crates/api-core/src/dpa/handler.rs b/crates/api-core/src/dpa/handler.rs index 25e8e158e9..f0b7597a13 100644 --- a/crates/api-core/src/dpa/handler.rs +++ b/crates/api-core/src/dpa/handler.rs @@ -201,7 +201,7 @@ async fn handle_dpa_message(services: Arc, message: SetVni, topic: String) let machine = machine.unwrap(); - let cur_spx_status_observations = machine.spx_status_observation.unwrap_or_default(); + let cur_spx_status_observations = machine.status.spx_status_observation.unwrap_or_default(); let mut new_spx_status_observations = MachineSpxStatusObservation::default(); let mut add_new_observation = true; diff --git a/crates/api-core/src/ethernet_virtualization.rs b/crates/api-core/src/ethernet_virtualization.rs index 984ac9034f..2ff07f5eba 100644 --- a/crates/api-core/src/ethernet_virtualization.rs +++ b/crates/api-core/src/ethernet_virtualization.rs @@ -332,10 +332,15 @@ pub async fn admin_network( // If we loop through the machine interfaces for the host snapshot and look for // that combo, the segment_id of that interface should be the network segment we want, // but checking against known admin segments adds a little bit of defense. - let interface = snapshot.host_snapshot.interfaces.iter().find(|interface| { - interface.attached_dpu_machine_id.as_ref() == Some(dpu_machine_id) - && admin_segment_ids.contains(&interface.segment_id) - }); + let interface = snapshot + .host_snapshot + .status + .interfaces + .iter() + .find(|interface| { + interface.attached_dpu_machine_id.as_ref() == Some(dpu_machine_id) + && admin_segment_ids.contains(&interface.segment_id) + }); let host_machine_id = snapshot.host_snapshot.id; let Some(interface) = interface else { @@ -351,6 +356,7 @@ pub async fn admin_network( // still disables the admin DHCP path on non-primary DPUs via is_primary_dpu. 
let active_interface = snapshot .host_snapshot + .status .interfaces .iter() .find(|interface| { diff --git a/crates/api-core/src/handlers/attestation.rs b/crates/api-core/src/handlers/attestation.rs index 7716ec3372..14b5ad8a33 100644 --- a/crates/api-core/src/handlers/attestation.rs +++ b/crates/api-core/src/handlers/attestation.rs @@ -58,7 +58,7 @@ pub(crate) async fn trigger_machine_attestation( id: format!("{}", machine_id), })); } - 1 => &machines[0].bmc_info, + 1 => &machines[0].status.bmc_info, _ => { return Err(Status::from(CarbideError::Internal { message: format!("Found more than one machine for machine id {}", machine_id), diff --git a/crates/api-core/src/handlers/bmc_endpoint_explorer.rs b/crates/api-core/src/handlers/bmc_endpoint_explorer.rs index bfb353b44c..7a99e01998 100644 --- a/crates/api-core/src/handlers/bmc_endpoint_explorer.rs +++ b/crates/api-core/src/handlers/bmc_endpoint_explorer.rs @@ -594,6 +594,7 @@ pub(crate) async fn admin_power_control( if let Some(power_state) = snapshot .host_snapshot + .status .power_options .map(|x| x.desired_power_state) && power_state == model::power_manager::PowerState::On @@ -1091,13 +1092,13 @@ pub(crate) async fn validate_and_complete_bmc_endpoint_request( id: machine_id.to_string(), })?; - let bmc_ip = machine.bmc_info.ip.as_ref().ok_or_else(|| { + let bmc_ip = machine.status.bmc_info.ip.as_ref().ok_or_else(|| { CarbideError::internal(format!( "Machine found for {machine_id} but BMC IP is missing" )) })?; - let bmc_mac_address = machine.bmc_info.mac.ok_or_else(|| { + let bmc_mac_address = machine.status.bmc_info.mac.ok_or_else(|| { CarbideError::internal(format!("BMC endpoint for {bmc_ip} ({machine_id}) found but does not have associated MAC")) })?; diff --git a/crates/api-core/src/handlers/component_manager.rs b/crates/api-core/src/handlers/component_manager.rs index bb91ef849c..0b2c8d77b2 100644 --- a/crates/api-core/src/handlers/component_manager.rs +++ 
b/crates/api-core/src/handlers/component_manager.rs @@ -585,6 +585,7 @@ async fn group_machine_ids_by_rack( /// Returns whether the machine is a rack-scale server (today just GB200, but will later include other SKUs) fn is_rack_scale_server(machine: &Machine) -> bool { machine + .status .hardware_info .as_ref() .is_some_and(|hw| hw.is_gbx00()) @@ -1126,7 +1127,7 @@ async fn resolve_compute_tray_endpoints( continue; }; - let Some(bmc_mac) = machine.bmc_info.mac else { + let Some(bmc_mac) = machine.status.bmc_info.mac else { unresolved.push(UnresolvedDevice { id: machine_id, reason: "BMC MAC not available".into(), @@ -1134,7 +1135,7 @@ async fn resolve_compute_tray_endpoints( continue; }; - let Some(bmc_ip) = machine.bmc_info.ip else { + let Some(bmc_ip) = machine.status.bmc_info.ip else { unresolved.push(UnresolvedDevice { id: machine_id, reason: "BMC IP not configured".into(), diff --git a/crates/api-core/src/handlers/dpf.rs b/crates/api-core/src/handlers/dpf.rs index 54c7082116..61ea4d7012 100644 --- a/crates/api-core/src/handlers/dpf.rs +++ b/crates/api-core/src/handlers/dpf.rs @@ -49,7 +49,7 @@ pub(crate) async fn modify_dpf_state( id: machine_id.to_string(), })?; - if !request.dpf_enabled && machine_snapshot.host_snapshot.dpf.used_for_ingestion { + if !request.dpf_enabled && machine_snapshot.host_snapshot.config.dpf.used_for_ingestion { return Err(CarbideError::FailedPrecondition(format!( "Cannot disable DPF for host {}: machine was ingested via DPF.", machine_id diff --git a/crates/api-core/src/handlers/dpu.rs b/crates/api-core/src/handlers/dpu.rs index dd6b0620b5..104fd60c12 100644 --- a/crates/api-core/src/handlers/dpu.rs +++ b/crates/api-core/src/handlers/dpu.rs @@ -104,6 +104,7 @@ pub(crate) async fn get_managed_host_network_config_inner( let primary_dpu_snapshot = snapshot .host_snapshot + .status .interfaces .iter() .find(|x| x.primary_interface) @@ -190,6 +191,7 @@ pub(crate) async fn get_managed_host_network_config_inner( let booturl_override = if 
snapshot .host_snapshot + .status .hardware_info .as_ref() .map(|h| h.machine_type) @@ -927,7 +929,7 @@ pub(crate) async fn record_dpu_network_status( id: dpu_machine_id.to_string(), })?; - if snapshot.host_snapshot.dpf.used_for_ingestion { + if snapshot.host_snapshot.config.dpf.used_for_ingestion { // DPF-managed DPUs don't use this upgrade path. Clear any stale flag so the DPU // doesn't keep receiving upgrade signals after the host was switched to DPF. if dpu_machine.needs_agent_upgrade() { diff --git a/crates/api-core/src/handlers/instance.rs b/crates/api-core/src/handlers/instance.rs index 650a74d79e..41a4dc66c9 100644 --- a/crates/api-core/src/handlers/instance.rs +++ b/crates/api-core/src/handlers/instance.rs @@ -893,16 +893,16 @@ pub(crate) async fn invoke_power( log_tenant_organization_id(instance.config.tenant.tenant_organization_id.as_str()); } - let bmc_ip = - snapshot - .host_snapshot - .bmc_info - .ip - .as_ref() - .ok_or_else(|| CarbideError::NotFoundError { - kind: "bmc_ip", - id: machine_id.to_string(), - })?; + let bmc_ip = snapshot + .host_snapshot + .status + .bmc_info + .ip + .as_ref() + .ok_or_else(|| CarbideError::NotFoundError { + kind: "bmc_ip", + id: machine_id.to_string(), + })?; let run_provisioning_instructions_on_every_boot = snapshot .instance @@ -1030,6 +1030,7 @@ pub(crate) async fn invoke_power( let bmc_mac_address = snapshot .host_snapshot + .status .bmc_info .mac .ok_or_else(|| CarbideError::NotFoundError { @@ -1045,7 +1046,7 @@ pub(crate) async fn invoke_power( .redfish_pool .create_client( &bmc_ip, - snapshot.host_snapshot.bmc_info.port, + snapshot.host_snapshot.status.bmc_info.port, RedfishAuth::Key(CredentialKey::BmcCredentials { credential_type: BmcCredentialType::BmcRoot { bmc_mac_address }, }), diff --git a/crates/api-core/src/handlers/instance_type.rs b/crates/api-core/src/handlers/instance_type.rs index c608ae68f6..7928be86bd 100644 --- a/crates/api-core/src/handlers/instance_type.rs +++ 
b/crates/api-core/src/handlers/instance_type.rs @@ -659,7 +659,7 @@ pub(crate) async fn remove_machine_association( .into()); } - if let Some(ref instance_type_id) = machine.instance_type_id { + if let Some(ref instance_type_id) = machine.config.instance_type_id { // Query the DB for the instance type so that we can use a row-level lock for coordination. // We need this so that ComputeAllocation additions and updates that increase allocations can't exceed the number // of machines associated with a type. diff --git a/crates/api-core/src/handlers/machine.rs b/crates/api-core/src/handlers/machine.rs index a666978ae4..b501b1968f 100644 --- a/crates/api-core/src/handlers/machine.rs +++ b/crates/api-core/src/handlers/machine.rs @@ -338,6 +338,7 @@ pub(crate) async fn admin_force_delete_machine( .and_then(|ctx| ctx.get_external_user_name()); let serial = machine + .status .hardware_info .as_ref() .and_then(|hw| hw.dmi_data.as_ref()) @@ -349,7 +350,7 @@ pub(crate) async fn admin_force_delete_machine( machine.id ); - if machine.instance_type_id.is_some() { + if machine.config.instance_type_id.is_some() { return Err(CarbideError::FailedPrecondition(format!( "association with instance type must be removed before deleting machine {}", &machine.id @@ -387,10 +388,10 @@ pub(crate) async fn admin_force_delete_machine( if let Some(host_machine) = &host_machine { response.managed_host_machine_id = host_machine.id.to_string(); - if let Some(iface) = host_machine.interfaces.first() { + if let Some(iface) = host_machine.status.interfaces.first() { response.managed_host_machine_interface_id = iface.id.to_string(); } - if let Some(ip) = host_machine.bmc_info.ip.as_ref() { + if let Some(ip) = host_machine.status.bmc_info.ip.as_ref() { response.managed_host_bmc_ip = ip.to_string(); } } @@ -401,7 +402,7 @@ pub(crate) async fn admin_force_delete_machine( let dpu_interfaces = dpu_machines .iter() - .flat_map(|m| m.interfaces.clone()) + .flat_map(|m| m.status.interfaces.clone()) 
.collect::>(); if let Some(iface) = dpu_interfaces.first() { response.dpu_machine_interface_ids = @@ -409,7 +410,7 @@ pub(crate) async fn admin_force_delete_machine( // deprecated field: response.dpu_machine_interface_id = iface.id.to_string(); } - if let Some(ip) = dpu_machine.bmc_info.ip.as_ref() { + if let Some(ip) = dpu_machine.status.bmc_info.ip.as_ref() { response.dpu_bmc_ip = ip.to_string(); } } @@ -418,7 +419,7 @@ pub(crate) async fn admin_force_delete_machine( } if let Some(machine) = &host_machine - && machine.dpf.used_for_ingestion + && machine.config.dpf.used_for_ingestion && api.dpf_sdk.is_none() && !request.allow_delete_with_orphaned_dpf_crds { @@ -463,8 +464,8 @@ pub(crate) async fn admin_force_delete_machine( } if let Some(machine) = &host_machine { - if let Some(ip) = machine.bmc_info.ip { - if let Some(bmc_mac_address) = machine.bmc_info.mac { + if let Some(ip) = machine.status.bmc_info.ip { + if let Some(bmc_mac_address) = machine.status.bmc_info.mac { let ip_address = ip.to_string(); tracing::info!( %ip, @@ -476,7 +477,7 @@ pub(crate) async fn admin_force_delete_machine( .redfish_pool .create_client( &ip_address, - machine.bmc_info.port, + machine.status.bmc_info.port, RedfishAuth::Key(CredentialKey::BmcCredentials { credential_type: BmcCredentialType::BmcRoot { bmc_mac_address }, }), @@ -582,7 +583,7 @@ pub(crate) async fn admin_force_delete_machine( if let Some(machine) = &host_machine { if request.delete_bmc_interfaces - && let Some(bmc_ip) = machine.bmc_info.ip + && let Some(bmc_ip) = machine.status.bmc_info.ip { response.host_bmc_interface_associated = true; if db::machine_interface::delete_by_ip(&mut txn, bmc_ip) @@ -595,7 +596,7 @@ pub(crate) async fn admin_force_delete_machine( db::machine::force_cleanup(&mut txn, &machine.id).await?; if request.delete_interfaces { - for interface in &machine.interfaces { + for interface in &machine.status.interfaces { // The delete retains each row's boot interface pair in // `retained_boot_interfaces`, 
so a re-ingested machine // recovers its boot target before its first DHCP. @@ -604,7 +605,7 @@ pub(crate) async fn admin_force_delete_machine( response.host_interfaces_deleted = true; } - if let Some(addr) = machine.bmc_info.ip { + if let Some(addr) = machine.status.bmc_info.ip { tracing::info!("Cleaning up explored endpoint at {addr} {}", machine.id); db::explored_endpoints::delete(&mut txn, addr).await?; @@ -666,7 +667,7 @@ pub(crate) async fn admin_force_delete_machine( db::network_devices::dpu_to_network_device_map::delete(&mut txn, &dpu_machine.id).await?; if request.delete_bmc_interfaces - && let Some(bmc_ip) = dpu_machine.bmc_info.ip + && let Some(bmc_ip) = dpu_machine.status.bmc_info.ip { response.dpu_bmc_interface_associated = true; if db::machine_interface::delete_by_ip(&mut txn, bmc_ip) @@ -683,13 +684,13 @@ pub(crate) async fn admin_force_delete_machine( db::machine::force_cleanup(&mut txn, &dpu_machine.id).await?; if request.delete_interfaces { - for interface in &dpu_machine.interfaces { + for interface in &dpu_machine.status.interfaces { db::machine_interface::delete(&interface.id, &mut txn).await?; } response.dpu_interfaces_deleted = true; } - if let Some(addr) = dpu_machine.bmc_info.ip { + if let Some(addr) = dpu_machine.status.bmc_info.ip { tracing::info!("Cleaning up explored endpoint at {addr} {}", dpu_machine.id); db::explored_endpoints::delete(&mut txn, addr).await?; @@ -755,7 +756,7 @@ fn snapshot_map_to_rpc_machines( } async fn clear_bmc_credentials(api: &Api, machine: &Machine) -> Result<(), CarbideError> { - if let Some(mac_address) = machine.bmc_info.mac { + if let Some(mac_address) = machine.status.bmc_info.mac { tracing::info!( "Cleaning up BMC credentials in vault at {} for machine {}", mac_address, diff --git a/crates/api-core/src/handlers/machine_scout.rs b/crates/api-core/src/handlers/machine_scout.rs index 94816d9761..2b1e415da3 100644 --- a/crates/api-core/src/handlers/machine_scout.rs +++ 
b/crates/api-core/src/handlers/machine_scout.rs @@ -245,7 +245,7 @@ pub(crate) async fn forge_agent_control( // cleanup: send it to discovery, which promotes it; the promoted host then // waits for its storage cleanup. Mirrors the state handler's // WaitingForDiscovery guard. - if host_machine.last_cleanup_time.is_some() + if host_machine.status.last_cleanup_time.is_some() || !host_machine.id.machine_type().is_host() { (Action::discovery(), Some(txn)) @@ -282,7 +282,7 @@ pub(crate) async fn forge_agent_control( }, .. } => { - let last_cleanup_time = host_machine.last_cleanup_time; + let last_cleanup_time = host_machine.status.last_cleanup_time; let state_version = host_machine.state.version; tracing::info!( "last_cleanup_time: {:?}, state_version: {:?}", @@ -302,10 +302,10 @@ pub(crate) async fn forge_agent_control( } => { tracing::info!( "Request Discovery {} < {}", - machine.last_discovery_time.unwrap_or_default(), + machine.status.last_discovery_time.unwrap_or_default(), machine.current_version().timestamp() ); - if machine.last_discovery_time.unwrap_or_default() + if machine.status.last_discovery_time.unwrap_or_default() < machine.current_version().timestamp() { (Action::discovery(), Some(txn)) @@ -386,7 +386,7 @@ fn record_reboot_duration_metric( metric_emitter: &ApiMetricsEmitter, machine: &model::machine::Machine, ) { - let Some(last_reboot_requested) = &machine.last_reboot_requested else { + let Some(last_reboot_requested) = &machine.status.last_reboot_requested else { return; }; @@ -407,6 +407,7 @@ fn record_reboot_duration_metric( // Extract product name and vendor from hardware info let product_name = machine + .status .hardware_info .as_ref() .and_then(|hi| hi.dmi_data.as_ref()) @@ -414,6 +415,7 @@ fn record_reboot_duration_metric( .unwrap_or_else(|| "unknown".to_string()); let vendor = machine + .status .hardware_info .as_ref() .and_then(|hi| hi.dmi_data.as_ref()) diff --git a/crates/api-core/src/handlers/managed_host.rs 
b/crates/api-core/src/handlers/managed_host.rs index 9762f87deb..825d1bf7c9 100644 --- a/crates/api-core/src/handlers/managed_host.rs +++ b/crates/api-core/src/handlers/managed_host.rs @@ -236,6 +236,7 @@ async fn set_primary_interface_core( })?; let bmc_addr = machine + .status .bmc_info .ip .ok_or_else(|| CarbideError::NotFoundError { diff --git a/crates/api-core/src/instance/mod.rs b/crates/api-core/src/instance/mod.rs index 104608a801..fe868eecc8 100644 --- a/crates/api-core/src/instance/mod.rs +++ b/crates/api-core/src/instance/mod.rs @@ -430,6 +430,7 @@ pub fn allocate_ib_port_guid( let mut updated_ib_config = ib_config.clone(); let ib_hw_info = machine + .status .hardware_info .as_ref() .ok_or(CarbideError::MissingArgument("no hardware info in machine"))? @@ -478,7 +479,7 @@ pub fn allocate_ib_port_guid( // Do additional ib ports verification if !guids.is_empty() { - if let Some(ib_interfaces_status) = &machine.infiniband_status_observation { + if let Some(ib_interfaces_status) = &machine.status.infiniband_status_observation { for guid in guids.iter() { for ib_status in ib_interfaces_status.ib_interfaces.iter() { if *guid == ib_status.guid && ib_status.lid == 0xffff_u16 { @@ -965,6 +966,7 @@ pub async fn batch_allocate_instances( // networking. 
let allowed_segment_ids: HashSet<_> = mh_snapshot .host_snapshot + .status .interfaces .iter() .filter(|iface| { diff --git a/crates/api-core/src/ipxe.rs b/crates/api-core/src/ipxe.rs index 14cd663514..ab2af8f325 100644 --- a/crates/api-core/src/ipxe.rs +++ b/crates/api-core/src/ipxe.rs @@ -457,7 +457,7 @@ exit || if target.arch == rpc::MachineArchitecture::Arm { console = "ttyAMA0"; qcow_imager_url = "chain ${base-url}/internal/aarch64/qcow-imager.efi loglevel=7 console=tty0 pci=realloc=off "; - } else if let Some(hardware_info) = machine.hardware_info.as_ref() + } else if let Some(hardware_info) = machine.status.hardware_info.as_ref() && let Some(dmi_info) = hardware_info.dmi_data.as_ref() && (dmi_info.sys_vendor == "Lenovo" || dmi_info.sys_vendor == "Supermicro") { diff --git a/crates/api-core/src/tests/common/api_fixtures/host.rs b/crates/api-core/src/tests/common/api_fixtures/host.rs index 81ad36b7d8..e2678407db 100644 --- a/crates/api-core/src/tests/common/api_fixtures/host.rs +++ b/crates/api-core/src/tests/common/api_fixtures/host.rs @@ -69,7 +69,7 @@ pub async fn host_discover_dhcp( &mut txn, ObjectColumnFilter::One( network_prefix::SegmentIdColumn, - &predicted_host.interfaces[0].segment_id, + &predicted_host.status.interfaces[0].segment_id, ), ) .await diff --git a/crates/api-core/src/tests/common/api_fixtures/mod.rs b/crates/api-core/src/tests/common/api_fixtures/mod.rs index 59d0f3c553..1dd9a394d5 100644 --- a/crates/api-core/src/tests/common/api_fixtures/mod.rs +++ b/crates/api-core/src/tests/common/api_fixtures/mod.rs @@ -421,7 +421,7 @@ impl TestEnv { } => ManagedHostState::Failed { details: FailureDetails { cause: details.cause, - failed_at: machine.failure_details.failed_at, + failed_at: machine.status.failure_details.failed_at, source: details.source, }, machine_id, @@ -2536,16 +2536,17 @@ pub async fn update_time_params( time: if let Some(last_reboot_requested) = last_reboot_requested { last_reboot_requested } else { - 
machine.last_reboot_requested.as_ref().unwrap().time - Duration::minutes(1) + machine.status.last_reboot_requested.as_ref().unwrap().time - Duration::minutes(1) }, - mode: machine.last_reboot_requested.as_ref().unwrap().mode, + mode: machine.status.last_reboot_requested.as_ref().unwrap().mode, restart_verified: None, verification_attempts: None, }; - let last_reboot_time = machine.last_reboot_time.unwrap() - Duration::minutes(2i64); + let last_reboot_time = machine.status.last_reboot_time.unwrap() - Duration::minutes(2i64); - let ts = machine.last_reboot_requested.as_ref().unwrap().time - Duration::minutes(retry_count); + let ts = machine.status.last_reboot_requested.as_ref().unwrap().time + - Duration::minutes(retry_count); let last_discovery_time = ts - Duration::minutes(1); let version = format!( diff --git a/crates/api-core/src/tests/common/api_fixtures/site_explorer.rs b/crates/api-core/src/tests/common/api_fixtures/site_explorer.rs index a69a8aeb03..2595cda9c3 100644 --- a/crates/api-core/src/tests/common/api_fixtures/site_explorer.rs +++ b/crates/api-core/src/tests/common/api_fixtures/site_explorer.rs @@ -104,7 +104,7 @@ async fn current_host_state_and_cleanup_needed( ( machine.current_state().clone(), - machine.last_cleanup_time.is_none(), + machine.status.last_cleanup_time.is_none(), ) } diff --git a/crates/api-core/src/tests/common/api_fixtures/test_managed_host.rs b/crates/api-core/src/tests/common/api_fixtures/test_managed_host.rs index 0977661064..36c7df7b26 100644 --- a/crates/api-core/src/tests/common/api_fixtures/test_managed_host.rs +++ b/crates/api-core/src/tests/common/api_fixtures/test_managed_host.rs @@ -52,14 +52,29 @@ impl From for (MachineId, MachineId) { type Txn<'a> = sqlx::Transaction<'a, sqlx::Postgres>; impl TestManagedHost { + // from_rpc_machine() will first try to read from m.status.interfaces + // and fall back to the deprecated m.interfaces only if status is absent + // or if status.interfaces yields zero DPU ids + 
#[allow(deprecated)] pub fn from_rpc_machine(m: &rpc::Machine, api: Arc) -> Self { TestManagedHost { id: m.id.unwrap(), dpu_ids: m - .interfaces - .iter() - .filter_map(|i| i.attached_dpu_machine_id) - .collect(), + .status + .as_ref() + .map(|s| { + s.interfaces + .iter() + .filter_map(|i| i.attached_dpu_machine_id) + .collect::>() + }) + .filter(|ids| !ids.is_empty()) + .unwrap_or_else(|| { + m.interfaces + .iter() + .filter_map(|i| i.attached_dpu_machine_id) + .collect() + }), api, } } diff --git a/crates/api-core/src/tests/connected_device.rs b/crates/api-core/src/tests/connected_device.rs index a07a5438f1..3bf2f2b0f7 100644 --- a/crates/api-core/src/tests/connected_device.rs +++ b/crates/api-core/src/tests/connected_device.rs @@ -26,6 +26,8 @@ async fn test_find_connected_devices_by_machine_ids_single_id(pool: sqlx::PgPool let mh = create_managed_host_multi_dpu(&env, 1).await; let host_machine = mh.host().rpc_machine().await; let expected_machine_id = host_machine + .status + .unwrap() .associated_dpu_machine_ids .into_iter() .next() diff --git a/crates/api-core/src/tests/dpf/happy_path.rs b/crates/api-core/src/tests/dpf/happy_path.rs index b8f59de18e..a69ba62e38 100644 --- a/crates/api-core/src/tests/dpf/happy_path.rs +++ b/crates/api-core/src/tests/dpf/happy_path.rs @@ -69,7 +69,7 @@ async fn test_dpu_and_host_till_ready(pool: sqlx::PgPool) { let host = mh.host().db_machine(&mut txn).await; let dpu = mh.dpu().db_machine(&mut txn).await; - assert!(host.dpf.used_for_ingestion); + assert!(host.config.dpf.used_for_ingestion); assert!(matches!(dpu.current_state(), ManagedHostState::Ready)); let carbide_machines_per_state = env.test_meter.parsed_metrics("carbide_machines_per_state"); diff --git a/crates/api-core/src/tests/dpu_agent_upgrade.rs b/crates/api-core/src/tests/dpu_agent_upgrade.rs index 6192b2076e..60ce1e7060 100644 --- a/crates/api-core/src/tests/dpu_agent_upgrade.rs +++ b/crates/api-core/src/tests/dpu_agent_upgrade.rs @@ -417,6 +417,8 @@ impl 
TestManagedHost { .into_iter() .next() .expect("expected host machine to be found") + .status + .unwrap() .health .expect("expected health report") .alerts; diff --git a/crates/api-core/src/tests/dpu_nic_firmware.rs b/crates/api-core/src/tests/dpu_nic_firmware.rs index 43eb90493d..872dfe5d2c 100644 --- a/crates/api-core/src/tests/dpu_nic_firmware.rs +++ b/crates/api-core/src/tests/dpu_nic_firmware.rs @@ -80,7 +80,7 @@ async fn test_start_updates_with_multidpu( let mh = create_managed_host_multi_dpu(&env, 2).await; let host = mh.host().rpc_machine().await; - let dpu_ids = host.associated_dpu_machine_ids; + let dpu_ids = host.status.unwrap().associated_dpu_machine_ids; let dpu_machine_id = dpu_ids[0]; let dpu_machine_id2 = dpu_ids[1]; let mut txn = env.pool.begin().await?; diff --git a/crates/api-core/src/tests/dpu_reprovisioning.rs b/crates/api-core/src/tests/dpu_reprovisioning.rs index 20c6a1f82a..f3005ce51c 100644 --- a/crates/api-core/src/tests/dpu_reprovisioning.rs +++ b/crates/api-core/src/tests/dpu_reprovisioning.rs @@ -308,12 +308,12 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade(pool: sqlx::PgPool) { let dpu = mh.dpu().db_machine(&mut txn).await; assert_eq!(&dpu.reprovision_requested.unwrap().initiator, "AdminCli"); - let last_reboot_requested_time = dpu.last_reboot_requested; + let last_reboot_requested_time = dpu.status.last_reboot_requested; env.run_machine_state_controller_iteration().await; let dpu = mh.dpu().db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time, last_reboot_requested_time.as_ref().unwrap().time ); // DPU restart on Ready -> Reprovision state @@ -1174,14 +1174,14 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { let dpu = mh.dpu().db_machine(&mut txn).await; assert_eq!(dpu.reprovision_requested.unwrap().initiator, "AdminCli"); - let last_reboot_requested_time = dpu.last_reboot_requested.as_ref(); + let 
last_reboot_requested_time = dpu.status.last_reboot_requested.as_ref(); for _ in 0..3 { env.run_machine_state_controller_iteration().await; } let dpu = mh.dpu().db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.unwrap().time, + dpu.status.last_reboot_requested.unwrap().time, last_reboot_requested_time.unwrap().time ); @@ -1227,7 +1227,7 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { update_time_params(&env.pool, &dpu, 1, None).await; let dpu = mh.dpu().next_iteration_machine(&env).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); @@ -1237,18 +1237,18 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { let dpu_ = mh.dpu().db_machine(&mut txn).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu_.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); txn.commit().await.unwrap(); let dpu = mh.dpu().next_iteration_machine(&env).await; assert_ne!( - dpu_.last_reboot_requested.as_ref().unwrap().time, - dpu.last_reboot_requested.as_ref().unwrap().time + dpu_.status.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time ); assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); @@ -1256,7 +1256,7 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { update_time_params(&env.pool, &dpu, 3, None).await; let dpu = mh.dpu().next_iteration_machine(&env).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); @@ -1264,7 +1264,7 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { update_time_params(&env.pool, &dpu, 4, None).await; let dpu = 
mh.dpu().next_iteration_machine(&env).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::PowerOff )); @@ -1272,7 +1272,7 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { update_time_params(&env.pool, &dpu, 5, None).await; let dpu = mh.dpu().next_iteration_machine(&env).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::PowerOn )); @@ -1280,7 +1280,7 @@ async fn test_reboot_retry(pool: sqlx::PgPool) { update_time_params(&env.pool, &dpu, 5, None).await; let dpu = mh.dpu().next_iteration_machine(&env).await; assert!(matches!( - dpu.last_reboot_requested.as_ref().unwrap().mode, + dpu.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); } @@ -1304,7 +1304,7 @@ async fn test_reboot_no_retry_during_firmware_update(pool: sqlx::PgPool) { "AdminCli" ); - let last_reboot_requested_time = dpu.last_reboot_requested.as_ref(); + let last_reboot_requested_time = dpu.status.last_reboot_requested.as_ref(); let handler = MachineStateHandlerBuilder::builder() .hardware_models(env.config.get_firmware_config()) @@ -1321,7 +1321,7 @@ async fn test_reboot_no_retry_during_firmware_update(pool: sqlx::PgPool) { env.run_machine_state_controller_iteration().await; let dpu = mh.dpu().db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time, last_reboot_requested_time.unwrap().time ); @@ -1351,11 +1351,11 @@ async fn test_reboot_no_retry_during_firmware_update(pool: sqlx::PgPool) { let mut txn: sqlx::Transaction<'_, sqlx::Postgres> = env.pool.begin().await.unwrap(); let host = mh.host().db_machine(&mut txn).await; let dpu = mh.dpu().db_machine(&mut txn).await; - let last_reboot_requested = 
host.last_reboot_requested.as_ref().unwrap(); + let last_reboot_requested = host.status.last_reboot_requested.as_ref().unwrap(); tracing::info!("power request: {:?}", last_reboot_requested); assert!(matches!( - host.last_reboot_requested.as_ref().unwrap().mode, + host.status.last_reboot_requested.as_ref().unwrap().mode, MachineLastRebootRequestedMode::Reboot )); @@ -1635,7 +1635,7 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_onedpu_repro "AdminCli" ); - let last_reboot_requested_time = dpu.last_reboot_requested.as_ref(); + let last_reboot_requested_time = dpu.status.last_reboot_requested.as_ref(); env.run_machine_state_controller_iteration().await; let dpu = mh.dpu_n(0).db_machine(&mut txn).await; @@ -1654,7 +1654,7 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_onedpu_repro env.run_machine_state_controller_iteration().await; let dpu = mh.dpu_n(0).db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time, last_reboot_requested_time.unwrap().time ); assert_eq!( @@ -1772,12 +1772,12 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_bothdpu(pool "AdminCli" ); - let last_reboot_requested_time = dpu.last_reboot_requested.as_ref(); + let last_reboot_requested_time = dpu.status.last_reboot_requested.as_ref(); env.run_machine_state_controller_iteration().await; let dpu = mh.dpu_n(0).db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time, last_reboot_requested_time.unwrap().time ); assert_eq!( @@ -1798,7 +1798,7 @@ async fn test_dpu_for_reprovisioning_with_firmware_upgrade_multidpu_bothdpu(pool let dpu = mh.dpu_n(0).db_machine(&mut txn).await; assert_ne!( - dpu.last_reboot_requested.as_ref().unwrap().time, + dpu.status.last_reboot_requested.as_ref().unwrap().time, last_reboot_requested_time.unwrap().time ); assert_eq!( 
diff --git a/crates/api-core/src/tests/finder.rs b/crates/api-core/src/tests/finder.rs index 087d31525f..8666c23f5e 100644 --- a/crates/api-core/src/tests/finder.rs +++ b/crates/api-core/src/tests/finder.rs @@ -36,6 +36,26 @@ async fn test_ip_finder(db_pool: sqlx::PgPool) -> Result<(), eyre::Report> { let mh = create_managed_host(&env).await; let host_machine = mh.host().rpc_machine().await; + assert!( + host_machine.bmc_info.is_some(), + "bmc_info must be populated for the host machine" + ); + let status = host_machine + .status + .as_ref() + .expect("host machine must have a status message"); + assert!( + !status.interfaces.is_empty(), + "status.interfaces must be populated for the host machine" + ); + #[allow(deprecated)] + { + assert_eq!( + host_machine.interfaces, status.interfaces, + "interfaces must equal status.interfaces" + ); + } + mh.instance_builer(&env) .single_interface_network_config(segment_id) .keyset_ids(&["keyset1", "keyset2"]) @@ -150,7 +170,7 @@ async fn test_identify_uuid(db_pool: sqlx::PgPool) -> Result<(), eyre::Report> { .build() .await; let res = mh.host().rpc_machine().await; - let interface_id = &res.interfaces[0].id; + let interface_id = &res.status.as_ref().unwrap().interfaces[0].id; // Network segment let req = rpc::forge::IdentifyUuidRequest { @@ -243,8 +263,12 @@ async fn test_identify_mac(db_pool: sqlx::PgPool) -> Result<(), eyre::Report> { .into_inner() .machines .remove(0); - let interface_id = res.interfaces[0].id.as_ref().unwrap().to_string(); - let mac_address = &res.interfaces[0].mac_address; + let interface_id = res.status.as_ref().unwrap().interfaces[0] + .id + .as_ref() + .unwrap() + .to_string(); + let mac_address = &res.status.as_ref().unwrap().interfaces[0].mac_address; let req = rpc::forge::IdentifyMacRequest { mac_address: mac_address.to_string(), @@ -276,7 +300,16 @@ async fn test_identify_serial(db_pool: sqlx::PgPool) -> Result<(), eyre::Report> let res = mh.dpu().rpc_machine().await; assert_eq!( - 
res.discovery_info.unwrap().dmi_data.unwrap().product_serial, + res.status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap() + .dmi_data + .as_ref() + .unwrap() + .product_serial, dpu_config.serial ); diff --git a/crates/api-core/src/tests/host_bmc_firmware_test.rs b/crates/api-core/src/tests/host_bmc_firmware_test.rs index 04c12d7404..179071244e 100644 --- a/crates/api-core/src/tests/host_bmc_firmware_test.rs +++ b/crates/api-core/src/tests/host_bmc_firmware_test.rs @@ -544,10 +544,12 @@ async fn test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> }; // "Site explorer" pass - let endpoints = - db::explored_endpoints::find_by_ips(txn.as_mut(), vec![host.bmc_info.ip_addr().unwrap()]) - .await - .unwrap(); + let endpoints = db::explored_endpoints::find_by_ips( + txn.as_mut(), + vec![host.status.bmc_info.ip_addr().unwrap()], + ) + .await + .unwrap(); let mut endpoint = endpoints.into_iter().next().unwrap(); endpoint.report.service[0].inventories[1].version = Some("1.13.2".to_string()); endpoint @@ -555,7 +557,7 @@ async fn test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> .versions .insert(FirmwareComponentType::Uefi, "1.13.2".to_string()); db::explored_endpoints::try_update( - host.bmc_info.ip_addr().unwrap(), + host.status.bmc_info.ip_addr().unwrap(), endpoint.report_version, &endpoint.report, false, @@ -665,9 +667,11 @@ async fn test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> }; // "Site explorer" pass to indicate that we're at the desired version - let endpoints = - db::explored_endpoints::find_by_ips(txn.as_mut(), vec![host.bmc_info.ip_addr().unwrap()]) - .await?; + let endpoints = db::explored_endpoints::find_by_ips( + txn.as_mut(), + vec![host.status.bmc_info.ip_addr().unwrap()], + ) + .await?; let mut endpoint = endpoints.into_iter().next().unwrap(); endpoint.report.service[0].inventories[0].version = Some("6.00.30.00".to_string()); endpoint @@ -675,7 +679,7 @@ async fn 
test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> .versions .insert(FirmwareComponentType::Bmc, "6.00.30.00".to_string()); db::explored_endpoints::try_update( - host.bmc_info.ip_addr().unwrap(), + host.status.bmc_info.ip_addr().unwrap(), endpoint.report_version, &endpoint.report, false, @@ -742,7 +746,7 @@ async fn test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> let mut txn = env.pool.begin().await.unwrap(); let host = mh.host().db_machine(&mut txn).await; assert!(host.host_reprovision_requested.is_none()); // Should be cleared or we'd right back in - assert!(host.update_complete); + assert!(host.status.update_complete); let reqs = db::host_machine_update::find_upgrade_needed(&mut txn, true, false).await?; assert!(reqs.is_empty()); txn.commit().await.unwrap(); @@ -762,11 +766,12 @@ async fn test_postingestion_bmc_upgrade(pool: sqlx::PgPool) -> CarbideResult<()> // Validate update_firmware_version_by_machine_id behavior assert_eq!( - host.bmc_info.firmware_version, + host.status.bmc_info.firmware_version, Some("6.00.30.00".to_string()) ); assert_eq!( - host.hardware_info + host.status + .hardware_info .as_ref() .unwrap() .dmi_data @@ -1490,10 +1495,12 @@ async fn test_instance_upgrading_actual_part_2( }; // "Site explorer" pass - let endpoints = - db::explored_endpoints::find_by_ips(txn.as_mut(), vec![host.bmc_info.ip_addr().unwrap()]) - .await - .unwrap(); + let endpoints = db::explored_endpoints::find_by_ips( + txn.as_mut(), + vec![host.status.bmc_info.ip_addr().unwrap()], + ) + .await + .unwrap(); let mut endpoint = endpoints.into_iter().next().unwrap(); endpoint.report.service[0].inventories[1].version = Some("1.13.2".to_string()); endpoint @@ -1501,7 +1508,7 @@ async fn test_instance_upgrading_actual_part_2( .versions .insert(FirmwareComponentType::Uefi, "1.13.2".to_string()); db::explored_endpoints::try_update( - host.bmc_info.ip_addr().unwrap(), + host.status.bmc_info.ip_addr().unwrap(), endpoint.report_version, 
&endpoint.report, false, @@ -1695,10 +1702,12 @@ async fn test_instance_upgrading_actual_part_2( ); // "Site explorer" pass to indicate that we're at the desired version - let endpoints = - db::explored_endpoints::find_by_ips(txn.as_mut(), vec![host.bmc_info.ip_addr().unwrap()]) - .await - .unwrap(); + let endpoints = db::explored_endpoints::find_by_ips( + txn.as_mut(), + vec![host.status.bmc_info.ip_addr().unwrap()], + ) + .await + .unwrap(); let mut endpoint = endpoints.into_iter().next().unwrap(); endpoint.report.service[0].inventories[0].version = Some("6.00.30.00".to_string()); endpoint @@ -1706,7 +1715,7 @@ async fn test_instance_upgrading_actual_part_2( .versions .insert(FirmwareComponentType::Bmc, "6.00.30.00".to_string()); db::explored_endpoints::try_update( - host.bmc_info.ip_addr().unwrap(), + host.status.bmc_info.ip_addr().unwrap(), endpoint.report_version, &endpoint.report, false, @@ -1873,11 +1882,12 @@ async fn test_instance_upgrading_actual_part_2( // Validate update_firmware_version_by_machine_id behavior assert_eq!( - host.bmc_info.firmware_version, + host.status.bmc_info.firmware_version, Some("6.00.30.00".to_string()) ); assert_eq!( - host.hardware_info + host.status + .hardware_info .as_ref() .unwrap() .dmi_data @@ -3064,10 +3074,12 @@ async fn test_manual_firmware_upgrade_workflow(pool: sqlx::PgPool) -> CarbideRes env.run_machine_state_controller_iteration().await; // "Site explorer" pass - let endpoints = - db::explored_endpoints::find_by_ips(txn.as_mut(), vec![host.bmc_info.ip_addr().unwrap()]) - .await - .unwrap(); + let endpoints = db::explored_endpoints::find_by_ips( + txn.as_mut(), + vec![host.status.bmc_info.ip_addr().unwrap()], + ) + .await + .unwrap(); let mut endpoint = endpoints.into_iter().next().unwrap(); endpoint.report.service[0].inventories[0].version = Some("6.00.30.00".to_string()); endpoint.report.service[0].inventories[1].version = Some("1.13.2".to_string()); @@ -3080,7 +3092,7 @@ async fn 
test_manual_firmware_upgrade_workflow(pool: sqlx::PgPool) -> CarbideRes .versions .insert(FirmwareComponentType::Bmc, "6.00.30.00".to_string()); db::explored_endpoints::try_update( - host.bmc_info.ip_addr().unwrap(), + host.status.bmc_info.ip_addr().unwrap(), endpoint.report_version, &endpoint.report, false, @@ -3170,7 +3182,7 @@ async fn test_forge_agent_control_waiting_for_scout_upgrade_returns_task_without let mut txn = env.pool.begin().await.unwrap(); let host = mh.host().db_machine(&mut txn).await; - assert!(host.last_cleanup_time.is_none()); + assert!(host.status.last_cleanup_time.is_none()); txn.commit().await.unwrap(); let response = env diff --git a/crates/api-core/src/tests/ib_fabric_monitor.rs b/crates/api-core/src/tests/ib_fabric_monitor.rs index 0eba25ce27..859d244cf0 100644 --- a/crates/api-core/src/tests/ib_fabric_monitor.rs +++ b/crates/api-core/src/tests/ib_fabric_monitor.rs @@ -143,11 +143,23 @@ async fn test_ib_port_down_sets_prevent_allocations_alert( } let machine = env.find_machine(host_machine_id).await.remove(0); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); let guid1 = discovery_info.infiniband_interfaces[0].guid.clone(); let machine = env.find_machine(host_machine_id).await.remove(0); - let health = machine.health.as_ref().expect("Machine should have health"); + let health = machine + .status + .as_ref() + .unwrap() + .health + .as_ref() + .expect("Machine should have health"); let has_ib_port_down_alert = health.alerts.iter().any(|alert| alert.id == "IbPortDown"); assert!( !has_ib_port_down_alert, @@ -160,7 +172,13 @@ async fn test_ib_port_down_sets_prevent_allocations_alert( env.run_ib_fabric_monitor_iteration().await; let machine = env.find_machine(host_machine_id).await.remove(0); - let health = machine.health.as_ref().expect("Machine should have health"); + let health = machine + .status + .as_ref() + 
.unwrap() + .health + .as_ref() + .expect("Machine should have health"); let ib_port_down_alert = health.alerts.iter().find(|alert| alert.id == "IbPortDown"); assert!( ib_port_down_alert.is_some(), @@ -186,7 +204,13 @@ async fn test_ib_port_down_sets_prevent_allocations_alert( // Verify IbPortDown alert is cleared let machine = env.find_machine(host_machine_id).await.remove(0); - let health = machine.health.as_ref().expect("Machine should have health"); + let health = machine + .status + .as_ref() + .unwrap() + .health + .as_ref() + .expect("Machine should have health"); let has_ib_port_down_alert = health.alerts.iter().any(|alert| alert.id == "IbPortDown"); assert!( !has_ib_port_down_alert, @@ -222,7 +246,13 @@ async fn test_ib_multiple_ports_down(pool: sqlx::PgPool) -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box = discovery_info @@ -604,9 +635,10 @@ async fn test_ib_skip_update_infiniband_status(pool: sqlx::PgPool) { assert_eq!(machine.current_state(), &ManagedHostState::Ready); assert!(!machine.is_dpu()); - assert!(machine.hardware_info.as_ref().is_some()); + assert!(machine.status.hardware_info.as_ref().is_some()); assert_eq!( machine + .status .hardware_info .as_ref() .unwrap() @@ -614,7 +646,13 @@ async fn test_ib_skip_update_infiniband_status(pool: sqlx::PgPool) { .len(), 6 ); - assert!(machine.infiniband_status_observation.as_ref().is_none()); + assert!( + machine + .status + .infiniband_status_observation + .as_ref() + .is_none() + ); } #[crate::sqlx_test] @@ -669,11 +707,36 @@ async fn test_update_instance_ib_config(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); let machine_guids = guids_by_device(&machine); assert_eq!(discovery_info.infiniband_interfaces.len(), 6); - assert!(machine.ib_status.as_ref().is_some()); 
- assert_eq!(machine.ib_status.as_ref().unwrap().ib_interfaces.len(), 6); + assert!( + machine + .status + .as_ref() + .unwrap() + .infiniband + .as_ref() + .is_some() + ); + assert_eq!( + machine + .status + .as_ref() + .unwrap() + .infiniband + .as_ref() + .unwrap() + .ib_interfaces + .len(), + 6 + ); // select the second MT2910 Family [ConnectX-7] and the first MT27800 Family [ConnectX-5] which are sorted by slots let ib_config = rpc::forge::InstanceInfinibandConfig { @@ -1076,6 +1139,9 @@ pub async fn try_allocate_instance( fn guids_by_device(machine: &rpc::forge::Machine) -> HashMap> { let mut ib_ifaces = machine + .status + .as_ref() + .unwrap() .discovery_info .as_ref() .unwrap() diff --git a/crates/api-core/src/tests/ib_machine.rs b/crates/api-core/src/tests/ib_machine.rs index b63eae397d..4001acd3c0 100644 --- a/crates/api-core/src/tests/ib_machine.rs +++ b/crates/api-core/src/tests/ib_machine.rs @@ -62,8 +62,20 @@ async fn monitor_ib_status_and_fix_incorrect_pkey_associations(pool: sqlx::PgPoo let machine_guids = guids.entry(host_machine_id).or_default(); - let discovery_info = machine.discovery_info.as_ref().unwrap(); - let ib_status = machine.ib_status.expect("IB status is missing"); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); + let ib_status = machine + .status + .as_ref() + .unwrap() + .infiniband + .clone() + .expect("IB status is missing"); assert_eq!( discovery_info.infiniband_interfaces.len(), ib_status.ib_interfaces.len() @@ -311,8 +323,20 @@ async fn monitor_ib_status_and_fix_incorrect_pkey_associations(pool: sqlx::PgPoo let machine = env.find_machine(rpc_machine_id).await.remove(0); - let discovery_info = machine.discovery_info.as_ref().unwrap(); - let ib_status = machine.ib_status.expect("IB status is missing"); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); + let ib_status = machine + .status + .as_ref() + .unwrap() + 
.infiniband + .clone() + .expect("IB status is missing"); assert_eq!( discovery_info.infiniband_interfaces.len(), ib_status.ib_interfaces.len() diff --git a/crates/api-core/src/tests/instance.rs b/crates/api-core/src/tests/instance.rs index 97090d3fea..4ca3b13258 100644 --- a/crates/api-core/src/tests/instance.rs +++ b/crates/api-core/src/tests/instance.rs @@ -1447,7 +1447,7 @@ async fn test_instance_cloud_init_metadata( let machine = mh.host().db_machine(&mut txn).await; let request = tonic::Request::new(rpc::forge::CloudInitInstructionsRequest { - ip: machine.interfaces[0].addresses[0].to_string(), + ip: machine.status.interfaces[0].addresses[0].to_string(), }); let response = env.api.get_cloud_init_instructions(request).await?; diff --git a/crates/api-core/src/tests/instance_allocate.rs b/crates/api-core/src/tests/instance_allocate.rs index 42e27abf65..320683c6b4 100644 --- a/crates/api-core/src/tests/instance_allocate.rs +++ b/crates/api-core/src/tests/instance_allocate.rs @@ -460,7 +460,13 @@ async fn test_zero_dpu_instance_allocation_auto( .await .remove(0); - let instance_network_restrictions = rpc_machine.instance_network_restrictions.unwrap(); + let instance_network_restrictions = rpc_machine + .status + .as_ref() + .unwrap() + .instance_network_restrictions + .clone() + .unwrap(); assert_eq!( instance_network_restrictions.network_segment_membership_type, forge::InstanceNetworkSegmentMembershipType::Static as i32, @@ -1217,7 +1223,13 @@ async fn test_reject_zero_dpu_instance_allocation_multiple_vpcs( db::network_segment::find_by_name(env.pool.begin().await?.deref_mut(), "HOST_INBAND_2") .await?; - let instance_network_restrictions = host_snapshot_rpc.instance_network_restrictions.unwrap(); + let instance_network_restrictions = host_snapshot_rpc + .status + .as_ref() + .unwrap() + .instance_network_restrictions + .clone() + .unwrap(); assert_eq!( instance_network_restrictions.network_segment_membership_type, 
forge::InstanceNetworkSegmentMembershipType::Static as i32, @@ -1385,7 +1397,13 @@ async fn test_single_dpu_instance_allocation( .machines .remove(0); - let dpu_machine_id = machine.associated_dpu_machine_ids.remove(0).into(); + let dpu_machine_id = machine + .status + .as_mut() + .unwrap() + .associated_dpu_machine_ids + .remove(0) + .into(); let response = env .api diff --git a/crates/api-core/src/tests/instance_type.rs b/crates/api-core/src/tests/instance_type.rs index 5fde606758..5e204bb44e 100644 --- a/crates/api-core/src/tests/instance_type.rs +++ b/crates/api-core/src/tests/instance_type.rs @@ -512,7 +512,7 @@ async fn test_instance_type_delete(pool: sqlx::PgPool) -> Result<(), Box Result<(), Box Option { - m.health.map(|r| r.try_into().unwrap()) + m.status + .and_then(|s| s.health) + .map(|r| r.try_into().unwrap()) } /// Loads aggregate health via FindMachinesByIds api @@ -966,7 +979,8 @@ async fn load_health_via_find_machines_by_ids( .into_inner() .machines .remove(0) - .health + .status + .and_then(|s| s.health) .map(|r| r.try_into().unwrap()) } @@ -1088,12 +1102,15 @@ async fn test_tenant_reported_issue_health_override_template( let machine = find_machine(&env, &host_machine_id).await; // Check that the override was stored - assert_eq!(machine.health_sources.len(), 2); + assert_eq!(machine.status.as_ref().unwrap().health_sources.len(), 2); assert_eq!( - machine.health_sources[1].mode, + machine.status.as_ref().unwrap().health_sources[1].mode, HealthReportApplyMode::Merge as i32 ); - assert_eq!(machine.health_sources[1].source, "tenant-reported-issue"); + assert_eq!( + machine.status.as_ref().unwrap().health_sources[1].source, + "tenant-reported-issue" + ); // Verify aggregate health includes the override let aggregate_health = aggregate(machine).unwrap(); @@ -1169,13 +1186,13 @@ async fn test_request_repair_health_override_template( let machine = find_machine(&env, &host_machine_id).await; // Check that the override was stored - 
assert_eq!(machine.health_sources.len(), 2); + assert_eq!(machine.status.as_ref().unwrap().health_sources.len(), 2); assert_eq!( - machine.health_sources[1].mode, + machine.status.as_ref().unwrap().health_sources[1].mode, HealthReportApplyMode::Merge as i32 ); assert_eq!( - machine.health_sources[1].source, + machine.status.as_ref().unwrap().health_sources[1].source, health_report::REPAIR_REQUEST_MERGE_SOURCE ); @@ -1274,8 +1291,11 @@ async fn test_tenant_reported_issue_and_request_repair_combined( let aggregate_health = aggregate(machine.clone()).unwrap(); // Check that both overrides were stored - assert_eq!(machine.health_sources.len(), 3); + assert_eq!(machine.status.as_ref().unwrap().health_sources.len(), 3); let sources: Vec = machine + .status + .as_ref() + .unwrap() .health_sources .iter() .map(|o| o.source.clone()) @@ -1284,7 +1304,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( assert!(sources.contains(&health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string())); // All should be merge mode - for override_entry in &machine.health_sources { + for override_entry in &machine.status.as_ref().unwrap().health_sources { assert_eq!(override_entry.mode, HealthReportApplyMode::Merge as i32); } assert_eq!(aggregate_health.alerts.len(), 2); diff --git a/crates/api-core/src/tests/machine_interfaces.rs b/crates/api-core/src/tests/machine_interfaces.rs index cb16808d68..389f3b95f2 100644 --- a/crates/api-core/src/tests/machine_interfaces.rs +++ b/crates/api-core/src/tests/machine_interfaces.rs @@ -923,13 +923,17 @@ async fn machine_bmc_info_uses_bmc_interface_and_interfaces_exclude_it( assert_eq!(dpu_bmc_interface_mac, dpu_bmc_mac); assert_eq!( - host_machine.bmc_info.machine_interface_id, + host_machine.status.bmc_info.machine_interface_id, Some(host_bmc_interface_id) ); - assert_eq!(host_machine.bmc_info.mac, Some(host_bmc_interface_mac)); - assert_eq!(host_machine.bmc_info.ip, Some(host_bmc_interface_ip)); + assert_eq!( + 
host_machine.status.bmc_info.mac, + Some(host_bmc_interface_mac) + ); + assert_eq!(host_machine.status.bmc_info.ip, Some(host_bmc_interface_ip)); assert!( host_machine + .status .interfaces .iter() .all(|interface| interface.interface_type != InterfaceType::Bmc @@ -937,13 +941,14 @@ async fn machine_bmc_info_uses_bmc_interface_and_interfaces_exclude_it( ); assert_eq!( - dpu_machine.bmc_info.machine_interface_id, + dpu_machine.status.bmc_info.machine_interface_id, Some(dpu_bmc_interface_id) ); - assert_eq!(dpu_machine.bmc_info.mac, Some(dpu_bmc_interface_mac)); - assert_eq!(dpu_machine.bmc_info.ip, Some(dpu_bmc_interface_ip)); + assert_eq!(dpu_machine.status.bmc_info.mac, Some(dpu_bmc_interface_mac)); + assert_eq!(dpu_machine.status.bmc_info.ip, Some(dpu_bmc_interface_ip)); assert!( dpu_machine + .status .interfaces .iter() .all(|interface| interface.interface_type != InterfaceType::Bmc @@ -978,6 +983,9 @@ async fn machine_bmc_info_uses_bmc_interface_and_interfaces_exclude_it( ); assert!( host_rpc_machine + .status + .as_ref() + .unwrap() .interfaces .iter() .all(|interface| interface.interface_type != Some(rpc_bmc_type) @@ -1002,6 +1010,9 @@ async fn machine_bmc_info_uses_bmc_interface_and_interfaces_exclude_it( ); assert!( dpu_rpc_machine + .status + .as_ref() + .unwrap() .interfaces .iter() .all(|interface| interface.interface_type != Some(rpc_bmc_type) diff --git a/crates/api-core/src/tests/machine_network.rs b/crates/api-core/src/tests/machine_network.rs index d907e21215..0ffbcd7cfa 100644 --- a/crates/api-core/src/tests/machine_network.rs +++ b/crates/api-core/src/tests/machine_network.rs @@ -843,8 +843,16 @@ async fn test_managed_host_network_config_multi_dpu(pool: sqlx::PgPool) { let mh = api_fixtures::create_managed_host_multi_dpu(&env, 2).await; let host_machine = mh.host().rpc_machine().await; - let dpu_1_id = host_machine.associated_dpu_machine_ids[0]; - let dpu_2_id = host_machine.associated_dpu_machine_ids[1]; + let dpu_1_id = host_machine + .status + 
.as_ref() + .unwrap() + .associated_dpu_machine_ids[0]; + let dpu_2_id = host_machine + .status + .as_ref() + .unwrap() + .associated_dpu_machine_ids[1]; // And: Multiple admin segments exist when the DPU network config is rendered. let _second_admin_segment = create_network_segment( @@ -964,8 +972,16 @@ async fn test_managed_host_network_config_uses_non_dpu_primary_admin_interface(p // Given: A managed host with 2 DPUs and a separate host admin NIC marked primary. let mh = api_fixtures::create_managed_host_multi_dpu(&env, 2).await; let host_machine = mh.host().rpc_machine().await; - let dpu_1_id = host_machine.associated_dpu_machine_ids[0]; - let dpu_2_id = host_machine.associated_dpu_machine_ids[1]; + let dpu_1_id = host_machine + .status + .as_ref() + .unwrap() + .associated_dpu_machine_ids[0]; + let dpu_2_id = host_machine + .status + .as_ref() + .unwrap() + .associated_dpu_machine_ids[1]; let mut txn = env.pool.begin().await.unwrap(); let admin_segment = db::network_segment::admin(&mut txn) @@ -1157,6 +1173,8 @@ async fn test_managed_host_network_status(pool: sqlx::PgPool) { .into_inner() .machines .remove(0) + .status + .unwrap() .health; let mut reported_health = reported_health.unwrap(); assert!(reported_health.observed_at.is_some()); @@ -1418,6 +1436,8 @@ async fn test_retain_in_alert_since(pool: sqlx::PgPool) { .into_inner() .machines .remove(0) + .status + .unwrap() .health; let reported_health = reported_health.unwrap(); @@ -1445,6 +1465,8 @@ async fn test_retain_in_alert_since(pool: sqlx::PgPool) { .into_inner() .machines .remove(0) + .status + .unwrap() .health; let reported_health = reported_health.unwrap(); assert!(reported_health.observed_at.is_some()); diff --git a/crates/api-core/src/tests/machine_states.rs b/crates/api-core/src/tests/machine_states.rs index 5360430185..c11977d70a 100644 --- a/crates/api-core/src/tests/machine_states.rs +++ b/crates/api-core/src/tests/machine_states.rs @@ -309,10 +309,17 @@ async fn 
test_dpu_and_host_till_ready(pool: sqlx::PgPool) { let mut txn = env.db_txn().await; let dpu = mh.dpu().db_machine(&mut txn).await; - assert!(!mh.host().db_machine(&mut txn).await.dpf.used_for_ingestion); + assert!( + !mh.host() + .db_machine(&mut txn) + .await + .config + .dpf + .used_for_ingestion + ); for i in 0..mh.dpu_ids.len() { let dpu = mh.dpu_n(i).db_machine(&mut txn).await; - assert!(!dpu.dpf.used_for_ingestion); + assert!(!dpu.config.dpf.used_for_ingestion); } assert!(matches!(dpu.current_state(), ManagedHostState::Ready)); @@ -551,15 +558,25 @@ async fn test_machine_creator_created_host_advances_through_dpu_discovery( dpu_machine.current_state(), ); assert_eq!( - dpu_machine.hardware_info.as_ref().unwrap().machine_type, + dpu_machine + .status + .hardware_info + .as_ref() + .unwrap() + .machine_type, CpuArchitecture::Aarch64, ); - assert_eq!(dpu_machine.bmc_info.ip, Some(dpu_bmc_ip)); + assert_eq!(dpu_machine.status.bmc_info.ip, Some(dpu_bmc_ip)); assert_eq!( format!( "BF-{}", - dpu_machine.bmc_info.firmware_version.clone().unwrap() + dpu_machine + .status + .bmc_info + .firmware_version + .clone() + .unwrap() ), InitialDpuConfig::default() .find_bf3_entry() @@ -568,6 +585,7 @@ async fn test_machine_creator_created_host_advances_through_dpu_discovery( ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -579,6 +597,7 @@ async fn test_machine_creator_created_host_advances_through_dpu_discovery( ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -590,6 +609,7 @@ async fn test_machine_creator_created_host_advances_through_dpu_discovery( ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -612,7 +632,7 @@ async fn test_machine_creator_created_host_advances_through_dpu_discovery( "expected DpuDiscoveringState, got {:?}", host_machine.current_state(), ); - assert!(host_machine.bmc_info.ip.is_some()); + assert!(host_machine.status.bmc_info.ip.is_some()); txn.commit().await.unwrap(); // 2nd 
creation does nothing. @@ -1147,7 +1167,7 @@ async fn test_nvme_clean_failed_state_host(pool: sqlx::PgPool) { &env.pool, &host, 1, - Some(host.last_reboot_requested.as_ref().unwrap().time - Duration::seconds(59)), + Some(host.status.last_reboot_requested.as_ref().unwrap().time - Duration::seconds(59)), ) .await; // let state machine check the failure condition. @@ -1282,10 +1302,10 @@ async fn test_repeated_initial_discovery_cleanup_failure_preserves_host_init_sou } )); assert!(matches!( - host.failure_details.source, + host.status.failure_details.source, FailureSource::StateMachineArea(StateMachineArea::HostInit) )); - let first_failed_at = host.failure_details.failed_at; + let first_failed_at = host.status.failure_details.failed_at; txn.commit().await.unwrap(); tokio::time::sleep(std::time::Duration::from_millis(1)).await; @@ -1298,11 +1318,11 @@ async fn test_repeated_initial_discovery_cleanup_failure_preserves_host_init_sou let mut txn = env.db_txn().await; let host = mh.host().db_machine(&mut txn).await; assert!(matches!( - host.failure_details.source, + host.status.failure_details.source, FailureSource::StateMachineArea(StateMachineArea::HostInit) )); assert!( - host.failure_details.failed_at > first_failed_at, + host.status.failure_details.failed_at > first_failed_at, "repeated cleanup failure should refresh failure details" ); txn.commit().await.unwrap(); @@ -1362,7 +1382,7 @@ async fn test_hdd_clean_failed_state_host(pool: sqlx::PgPool) { &env.pool, &host, 1, - Some(host.last_reboot_requested.as_ref().unwrap().time - Duration::seconds(59)), + Some(host.status.last_reboot_requested.as_ref().unwrap().time - Duration::seconds(59)), ) .await; // let state machine check the failure condition. 
@@ -1559,7 +1579,7 @@ async fn test_failed_state_host_discovery_recovery(pool: sqlx::PgPool) { .get_pxe_instructions(tonic::Request::new(rpc::forge::PxeInstructionRequest { arch: rpc::forge::MachineArchitecture::X86 as i32, product: None, - client_ip: Some(host.interfaces[0].addresses[0].to_string()), + client_ip: Some(host.status.interfaces[0].addresses[0].to_string()), ..Default::default() })) .await @@ -1592,8 +1612,8 @@ async fn test_failed_state_host_discovery_recovery(pool: sqlx::PgPool) { let mut txn = env.db_txn().await; let host = mh.host().db_machine(&mut txn).await; - assert!(host.last_reboot_requested.is_some()); - let last_reboot_requested_time = host.last_reboot_requested.as_ref().unwrap().time; + assert!(host.status.last_reboot_requested.is_some()); + let last_reboot_requested_time = host.status.last_reboot_requested.as_ref().unwrap().time; assert!(matches!( host.current_state(), @@ -1657,7 +1677,7 @@ async fn test_failed_state_host_discovery_recovery(pool: sqlx::PgPool) { assert_ne!( last_reboot_requested_time, - host.last_reboot_requested.as_ref().unwrap().time + host.status.last_reboot_requested.as_ref().unwrap().time ); txn.commit().await.unwrap(); @@ -2518,7 +2538,7 @@ async fn test_forge_agent_control_assigned_discovery_boot_does_not_reset_without let mut txn = env.db_txn().await; let host = mh.host().db_machine(&mut txn).await; - assert!(host.last_cleanup_time.is_none()); + assert!(host.status.last_cleanup_time.is_none()); txn.commit().await.unwrap(); let response = mh.host().forge_agent_control().await; @@ -2571,11 +2591,13 @@ async fn test_update_reboot_requested_time_off(pool: sqlx::PgPool) { assert_ne!( snapshot.dpu_snapshots[i] .clone() + .status .last_reboot_requested .map(|x| x.time) .unwrap_or_default(), snapshot1.dpu_snapshots[i] .clone() + .status .last_reboot_requested .unwrap() .time @@ -2603,11 +2625,13 @@ async fn test_update_reboot_requested_time_off(pool: sqlx::PgPool) { assert_ne!( snapshot1.dpu_snapshots[i] .clone() + .status 
.last_reboot_requested .map(|x| x.time) .unwrap_or_default(), snapshot2.dpu_snapshots[i] .clone() + .status .last_reboot_requested .unwrap() .time @@ -2640,11 +2664,13 @@ async fn test_update_reboot_requested_time_off(pool: sqlx::PgPool) { assert_eq!( snapshot2.dpu_snapshots[i] .clone() + .status .last_reboot_requested .map(|x| x.time) .unwrap_or_default(), snapshot3.dpu_snapshots[i] .clone() + .status .last_reboot_requested .unwrap() .time @@ -2853,7 +2879,7 @@ async fn test_polling_bios_setup_full_recovery_reruns_machine_setup_and_succeeds }, } ) { - if host.last_reboot_requested.is_some() { + if host.status.last_reboot_requested.is_some() { update_time_params(&env.pool, &host, 1, None).await; } mh.network_configured(&env).await; diff --git a/crates/api-core/src/tests/machine_topology.rs b/crates/api-core/src/tests/machine_topology.rs index 1846afe124..b272763820 100644 --- a/crates/api-core/src/tests/machine_topology.rs +++ b/crates/api-core/src/tests/machine_topology.rs @@ -123,7 +123,7 @@ async fn test_crud_machine_topology(pool: sqlx::PgPool) -> Result<(), Box Result<(), Box Result<(), Box Result<(), eyre::Report> { // Check that the expected alert is set on the Machine let mut host_machine = env.find_machine(rpc_host_id).await.remove(0); assert_eq!( - host_machine.maintenance_reference.clone().unwrap(), + host_machine + .config + .as_ref() + .unwrap() + .maintenance_reference + .clone() + .unwrap(), "https://jira.example.com/ABC-123" ); - assert!(host_machine.maintenance_start_time.is_some()); - let alerts = &mut host_machine.health.as_mut().unwrap().alerts; + assert!( + host_machine + .config + .as_ref() + .unwrap() + .maintenance_start_time + .is_some() + ); + let alerts = &mut host_machine + .status + .as_mut() + .unwrap() + .health + .as_mut() + .unwrap() + .alerts; assert_eq!(alerts.len(), 1); let alert = &mut alerts[0]; assert!(alert.in_alert_since.is_some()); @@ -139,9 +159,30 @@ async fn test_maintenance(db_pool: sqlx::PgPool) -> Result<(), 
eyre::Report> { // Maintenance reference is cleared and there's no alarm anymore let host_machine = env.find_machine(rpc_host_id).await.remove(0); - assert!(host_machine.maintenance_reference.is_none()); - assert!(host_machine.maintenance_start_time.is_none()); - let alerts = &host_machine.health.as_ref().unwrap().alerts; + assert!( + host_machine + .config + .as_ref() + .unwrap() + .maintenance_reference + .is_none() + ); + assert!( + host_machine + .config + .as_ref() + .unwrap() + .maintenance_start_time + .is_none() + ); + let alerts = &host_machine + .status + .as_ref() + .unwrap() + .health + .as_ref() + .unwrap() + .alerts; assert!(alerts.is_empty()); // There are now no machines in maintenance mode diff --git a/crates/api-core/src/tests/nvl_instance.rs b/crates/api-core/src/tests/nvl_instance.rs index 67c8614e97..787200b4ca 100644 --- a/crates/api-core/src/tests/nvl_instance.rs +++ b/crates/api-core/src/tests/nvl_instance.rs @@ -179,7 +179,13 @@ async fn test_create_instance_with_nvl_config(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -349,7 +355,13 @@ async fn test_detach_gpus_from_partition_by_clearing_nvlink_config(pool: sqlx::P let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -557,7 +569,13 @@ async fn test_with_multiple_nv_link_logical_partitions(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = 
machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -647,7 +665,14 @@ async fn test_nvl_partition_monitor_adds_successful_partitions_when_some_creates ) .await; - let discovery_info = mh.host().rpc_machine().await.discovery_info.unwrap(); + let discovery_info = mh + .host() + .rpc_machine() + .await + .status + .unwrap() + .discovery_info + .unwrap(); let gpus: Vec = discovery_info.gpus.to_vec(); let nvl_config = rpc::forge::InstanceNvLinkConfig { @@ -775,8 +800,20 @@ async fn test_create_instances_with_nvl_configs_same_logical_partition_different assert_eq!(&machine1.state, "Ready"); assert_eq!(&machine2.state, "Ready"); - let discovery_info1 = machine1.discovery_info.as_ref().unwrap(); - let discovery_info2 = machine2.discovery_info.as_ref().unwrap(); + let discovery_info1 = machine1 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); + let discovery_info2 = machine2 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info1.gpus.len(), 4); assert_eq!(discovery_info2.gpus.len(), 4); let gpus1: Vec = discovery_info1.gpus.to_vec(); @@ -974,7 +1011,13 @@ async fn test_update_instance_with_nvl_config(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -1176,7 +1219,13 @@ async fn test_instance_update_logical_partition(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -1311,7 +1360,13 @@ 
async fn test_instance_delete_with_nvl_config(pool: sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -1420,7 +1475,13 @@ async fn test_create_instance_remove_from_default_partition(pool: sqlx::PgPool) let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -1567,7 +1628,13 @@ async fn test_create_instance_add_to_existing_partition(pool: sqlx::PgPool) { .await; let machine1 = mh1.host().rpc_machine().await; assert_eq!(&machine1.state, "Ready"); - let discovery_info1 = machine1.discovery_info.as_ref().unwrap(); + let discovery_info1 = machine1 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info1.gpus.len(), 4); @@ -1644,7 +1711,13 @@ async fn test_create_instance_add_to_existing_partition(pool: sqlx::PgPool) { .await; let machine2 = mh2.host().rpc_machine().await; assert_eq!(&machine2.state, "Ready"); - let discovery_info2 = machine2.discovery_info.as_ref().unwrap(); + let discovery_info2 = machine2 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info2.gpus.len(), 4); let gpus2: Vec = discovery_info2.gpus.to_vec(); @@ -1749,7 +1822,13 @@ async fn test_logical_partition_delete_with_instance_config(pool: sqlx::PgPool) let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); 
assert_eq!(discovery_info.gpus.len(), 4); @@ -1957,7 +2036,13 @@ async fn test_create_instance_gpu_in_unknown_partition(pool: sqlx::PgPool) { .await; let machine1 = mh1.host().rpc_machine().await; assert_eq!(&machine1.state, "Ready"); - let discovery_info1 = machine1.discovery_info.as_ref().unwrap(); + let discovery_info1 = machine1 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info1.gpus.len(), 4); @@ -2070,6 +2155,9 @@ async fn assert_machine_nvlink_observation_present( ) { let machine = mh.host().rpc_machine().await; let observation = machine + .status + .as_ref() + .unwrap() .nvlink_status_observation .as_ref() .expect("expected nvlink_status_observation to be set"); @@ -2090,9 +2178,14 @@ async fn assert_machine_nvlink_observation_present( async fn assert_machine_nvlink_observation_null(mh: &TestManagedHost, pool: &sqlx::PgPool) { let machine = mh.host().rpc_machine().await; assert!( - machine.nvlink_status_observation.is_none(), + machine + .status + .as_ref() + .unwrap() + .nvlink_status_observation + .is_none(), "expected null nvlink_status_observation via RPC, got {:?}", - machine.nvlink_status_observation + machine.status.as_ref().unwrap().nvlink_status_observation ); let mut txn = pool @@ -2101,9 +2194,9 @@ async fn assert_machine_nvlink_observation_null(mh: &TestManagedHost, pool: &sql .expect("begin txn for nvlink observation check"); let db_machine = mh.host().db_machine(&mut txn).await; assert!( - db_machine.nvlink_status_observation.is_none(), + db_machine.status.nvlink_status_observation.is_none(), "expected null nvlink_status_observation in DB, got {:?}", - db_machine.nvlink_status_observation + db_machine.status.nvlink_status_observation ); txn.commit().await.expect("commit nvlink observation check"); } @@ -2158,7 +2251,13 @@ async fn run_create_instance_with_nvl_config_nmxc_simulator_scenario( let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info 
= machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -2514,7 +2613,13 @@ async fn test_rack_switch_create_instance_with_nvl_config_use_nmxc_simulator(poo assert_eq!(machine.rack_id.as_ref(), Some(&rack_id)); assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); let gpus: Vec = discovery_info.gpus.to_vec(); @@ -2689,8 +2794,20 @@ async fn test_create_instance_multiple_domains_use_nmxc_simulator(pool: sqlx::Pg assert_eq!(&machine4.state, "Ready"); assert_eq!(&machine5.state, "Ready"); - let discovery_info4 = machine4.discovery_info.as_ref().unwrap(); - let discovery_info5 = machine5.discovery_info.as_ref().unwrap(); + let discovery_info4 = machine4 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); + let discovery_info5 = machine5 + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info4.gpus.len(), 4); assert_eq!(discovery_info5.gpus.len(), 4); @@ -2803,7 +2920,13 @@ async fn test_instance_delete_with_nvl_config_use_nmxc_simulator(pool: sqlx::PgP let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -2907,7 +3030,13 @@ async fn test_managed_host_creation_with_tray_default_partition_use_nmxc_simulat let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + 
.discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); @@ -2982,7 +3111,13 @@ async fn test_null_nvlink_observation_after_nmxc_unreachable_use_nmxc_simulator( let machine = mh.host().rpc_machine().await; assert_eq!(&machine.state, "Ready"); - let discovery_info = machine.discovery_info.as_ref().unwrap(); + let discovery_info = machine + .status + .as_ref() + .unwrap() + .discovery_info + .as_ref() + .unwrap(); assert_eq!(discovery_info.gpus.len(), 4); let nvl_config = rpc::forge::InstanceNvLinkConfig { diff --git a/crates/api-core/src/tests/site_explorer.rs b/crates/api-core/src/tests/site_explorer.rs index 7b6ce06b90..f398a0d526 100644 --- a/crates/api-core/src/tests/site_explorer.rs +++ b/crates/api-core/src/tests/site_explorer.rs @@ -582,8 +582,8 @@ async fn test_delete_explored_endpoint(pool: PgPool) -> Result<(), Box for InstanceInterfaceConfig { // Find which interface on the machine is in this prefix let host_interfaces_in_instance_segment = machine + .status .interfaces .iter() .filter(|i| { diff --git a/crates/api-db/src/machine.rs b/crates/api-db/src/machine.rs index a5bfa05f18..576b678b6c 100644 --- a/crates/api-db/src/machine.rs +++ b/crates/api-db/src/machine.rs @@ -2750,7 +2750,7 @@ mod test { .await .unwrap() .unwrap(); - assert!(host.firmware_autoupdate.is_some()); + assert_eq!(host.config.firmware_autoupdate, Some(true)); txn.commit().await?; let mut txn: sqlx::Transaction<'_, sqlx::Postgres> = pool.begin().await.unwrap(); @@ -2759,7 +2759,7 @@ mod test { .await .unwrap() .unwrap(); - assert!(host.firmware_autoupdate.is_none()); + assert!(host.config.firmware_autoupdate.is_none()); Ok(()) } } diff --git a/crates/api-db/src/sku.rs b/crates/api-db/src/sku.rs index 60a74b1bcf..fd7b926d5c 100644 --- a/crates/api-db/src/sku.rs +++ b/crates/api-db/src/sku.rs @@ -314,7 +314,7 @@ pub async fn generate_sku_from_machine_at_version_0_or_1( )); }; - let Some(hardware_info) = machine.hardware_info.as_ref() else { + let 
Some(hardware_info) = machine.status.hardware_info.as_ref() else { return Err(DatabaseError::new( "generate sku: load hardware info", sqlx::Error::RowNotFound, @@ -369,7 +369,7 @@ pub async fn generate_sku_from_machine_at_version_0_or_1( let ib_capabilities = MachineCapabilityInfiniband::from_ib_interfaces_and_status( &hardware_info.infiniband_interfaces, - machine.infiniband_status_observation.as_ref(), + machine.status.infiniband_status_observation.as_ref(), ); let ib_components: Vec = ib_capabilities .into_iter() @@ -444,9 +444,9 @@ pub fn generate_base_sku_from_hardware( let capabilities = MachineCapabilitiesSet::from_hardware_info( hardware_info.clone(), - machine.infiniband_status_observation.as_ref(), + machine.status.infiniband_status_observation.as_ref(), machine.associated_dpu_machine_ids(), - machine.interfaces.clone(), + machine.status.interfaces.clone(), ); let chassis = SkuComponentChassis { @@ -567,7 +567,7 @@ pub async fn generate_sku_from_machine_at_version_2( )); }; - let Some(hardware_info) = machine.hardware_info.as_ref() else { + let Some(hardware_info) = machine.status.hardware_info.as_ref() else { return Err(DatabaseError::new( "generate sku: load hardware info (v2)", sqlx::Error::RowNotFound, @@ -620,7 +620,7 @@ pub async fn generate_sku_from_machine_at_version_3( )); }; - let Some(hardware_info) = machine.hardware_info.as_ref() else { + let Some(hardware_info) = machine.status.hardware_info.as_ref() else { return Err(DatabaseError::new( "generate sku: load hardware info (v3)", sqlx::Error::RowNotFound, @@ -669,7 +669,7 @@ pub async fn generate_sku_from_machine_at_version_4( )); }; - let Some(hardware_info) = machine.hardware_info.as_ref() else { + let Some(hardware_info) = machine.status.hardware_info.as_ref() else { return Err(DatabaseError::new( "generate sku: load hardware info (v4)", sqlx::Error::RowNotFound, diff --git a/crates/api-model/src/dpu_machine_update.rs b/crates/api-model/src/dpu_machine_update.rs index 2f1aad969b..b9772fc996 
100644 --- a/crates/api-model/src/dpu_machine_update.rs +++ b/crates/api-model/src/dpu_machine_update.rs @@ -122,10 +122,11 @@ impl DpuMachineUpdate { .filter_map(|dpu| { // TODO: implement the logic to find the outdated DPUs which are ingested // using DPF. - if managed_host.host_snapshot.dpf.used_for_ingestion { + if managed_host.host_snapshot.config.dpf.used_for_ingestion { return None; } let firmware_version = dpu + .status .hardware_info .as_ref() .and_then(|info| info.dpu_info.as_ref()) diff --git a/crates/api-model/src/machine/capabilities.rs b/crates/api-model/src/machine/capabilities.rs index bda0713bed..2db100a906 100644 --- a/crates/api-model/src/machine/capabilities.rs +++ b/crates/api-model/src/machine/capabilities.rs @@ -22,8 +22,8 @@ use carbide_uuid::machine::MachineId; use serde::{Deserialize, Serialize}; use super::infiniband::MachineInfinibandStatusObservation; -use crate::hardware_info::{CpuInfo, InfinibandInterface}; -use crate::machine::{HardwareInfo, MachineInterfaceSnapshot}; +use crate::hardware_info::{CpuInfo, HardwareInfo, InfinibandInterface}; +use crate::machine::MachineInterfaceSnapshot; lazy_static::lazy_static! { static ref BLOCK_STORAGE_REGEX: regex::Regex = regex::Regex::new(r"(Virtual_CDROM\d+|Virtual_SD\d+|NO_MODEL|LOGICAL_VOLUME)").unwrap(); diff --git a/crates/api-model/src/machine/config.rs b/crates/api-model/src/machine/config.rs new file mode 100644 index 0000000000..ac6cd249a3 --- /dev/null +++ b/crates/api-model/src/machine/config.rs @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +use carbide_uuid::instance_type::InstanceTypeId; + +use crate::machine::Dpf; + +/// Operator-set desired state for a machine, mutable via API calls that increment the +/// machine version. +/// +/// Corresponds to `MachineConfig` in the forge proto. Fields here are changed via +/// explicit operator API calls (maintenance, instance-type assignment, firmware policy, +/// DPF toggle). +#[derive(Debug, Clone, Default)] +pub struct MachineConfig { + /// Override to enable or disable firmware auto-update. + pub firmware_autoupdate: Option, + + /// The instance type this machine is associated with, if any. + pub instance_type_id: Option, + + /// DPF configuration for this machine (operator-enabled). 
+ pub dpf: Dpf, +} diff --git a/crates/api-model/src/machine/json.rs b/crates/api-model/src/machine/json.rs index 56bf193210..75fb798e5f 100644 --- a/crates/api-model/src/machine/json.rs +++ b/crates/api-model/src/machine/json.rs @@ -35,8 +35,9 @@ use crate::machine::nvlink::MachineNvLinkStatusObservation; use crate::machine::spx::MachineSpxStatusObservation; use crate::machine::topology::MachineTopology; use crate::machine::{ - Dpf, FailureDetails, HostProfile, HostReprovisionRequest, Machine, MachineInterfaceSnapshot, - MachineLastRebootRequested, ManagedHostState, ReprovisionRequest, UpgradeDecision, + Dpf, FailureDetails, HostProfile, HostReprovisionRequest, Machine, MachineConfig, + MachineInterfaceSnapshot, MachineLastRebootRequested, MachineStatus, ManagedHostState, + ReprovisionRequest, UpgradeDecision, }; use crate::metadata::Metadata; use crate::power_manager::PowerOptions; @@ -154,7 +155,6 @@ impl TryFrom for Machine { Ok(Self { id: value.id, - rack_id: value.rack_id, state: Versioned { value: value.controller_state, version: value.controller_state_version.parse().map_err(|e| { @@ -174,53 +174,54 @@ impl TryFrom for Machine { })?, }, network_status_observation: value.network_status_observation, - infiniband_status_observation: value.infiniband_status_observation, - nvlink_status_observation: value.nvlink_status_observation, - spx_status_observation: value.spx_status_observation, history, - interfaces: value.interfaces, - hardware_info, - bmc_info: value.bmc_info, - last_reboot_time: value.last_reboot_time, - last_cleanup_time: value.last_cleanup_time, - last_discovery_time: value.last_discovery_time, - last_scout_contact_time: value.last_scout_contact_time, - last_scout_observed_version: value.last_scout_observed_version, - failure_details: value.failure_details, + metadata, + version, + rack_id: value.rack_id, + hw_sku: value.hw_sku, + config: MachineConfig { + firmware_autoupdate: value.firmware_autoupdate, + instance_type_id: value.instance_type_id, + 
dpf: value.dpf, + }, + status: MachineStatus { + interfaces: value.interfaces, + hardware_info, + bmc_info: value.bmc_info, + last_reboot_time: value.last_reboot_time, + last_cleanup_time: value.last_cleanup_time, + last_discovery_time: value.last_discovery_time, + last_scout_contact_time: value.last_scout_contact_time, + last_scout_observed_version: value.last_scout_observed_version, + failure_details: value.failure_details, + inventory: value.agent_reported_inventory, + last_reboot_requested: value.last_reboot_requested, + hw_sku: value.hw_sku_status, + hw_sku_device_type: value.hw_sku_device_type, + update_complete: value.update_complete, + nvlink_info: value.nvlink_info, + infiniband_status_observation: value.infiniband_status_observation, + nvlink_status_observation: value.nvlink_status_observation, + spx_status_observation: value.spx_status_observation, + slot_number: value.slot_number, + tray_index: value.tray_index, + power_options: value.power_options, + }, + health_reports: value.health_reports.unwrap_or_default(), reprovision_requested: value.reprovisioning_requested, host_reprovision_requested: value.host_reprovisioning_requested, - manual_firmware_upgrade_completed: value.manual_firmware_upgrade_completed, dpu_agent_upgrade_requested: value.dpu_agent_upgrade_requested, - health_reports: value.health_reports.unwrap_or_default(), - inventory: value.agent_reported_inventory, - last_reboot_requested: value.last_reboot_requested, controller_state_outcome: value.controller_state_outcome, bios_password_set_time: value.bios_password_set_time, last_machine_validation_time: value.last_machine_validation_time, discovery_machine_validation_id: value.discovery_machine_validation_id, cleanup_machine_validation_id: value.cleanup_machine_validation_id, - firmware_autoupdate: value.firmware_autoupdate, on_demand_machine_validation_id: value.on_demand_machine_validation_id, on_demand_machine_validation_request: value.on_demand_machine_validation_request, asn: value.asn, 
- metadata, - instance_type_id: value.instance_type_id, - version, - // Columns for these exist, but are unused in rust code - // deployed: value.deployed, - // created: value.created, - // updated: value.updated, - hw_sku: value.hw_sku, - hw_sku_status: value.hw_sku_status, - power_options: value.power_options, - hw_sku_device_type: value.hw_sku_device_type, - update_complete: value.update_complete, - nvlink_info: value.nvlink_info, - dpf: value.dpf, host_profile: value.host_profile, rack_fw_details: value.rack_fw_details, - slot_number: value.slot_number, - tray_index: value.tray_index, + manual_firmware_upgrade_completed: value.manual_firmware_upgrade_completed, }) } } diff --git a/crates/api-model/src/machine/mod.rs b/crates/api-model/src/machine/mod.rs index f176f1ed27..6c14c373b2 100644 --- a/crates/api-model/src/machine/mod.rs +++ b/crates/api-model/src/machine/mod.rs @@ -20,7 +20,6 @@ use std::fmt::Display; use std::net::{IpAddr, SocketAddr}; use carbide_uuid::domain::DomainId; -use carbide_uuid::instance_type::InstanceTypeId; use carbide_uuid::machine::{MachineId, MachineInterfaceId}; use carbide_uuid::machine_validation::MachineValidationId; use carbide_uuid::network::NetworkSegmentId; @@ -38,25 +37,18 @@ use sqlx::postgres::PgRow; use sqlx::{Column, FromRow, Row}; use strum_macros::EnumIter; -use self::infiniband::MachineInfinibandStatusObservation; use self::network::{MachineNetworkStatusObservation, ManagedHostNetworkConfig}; -use self::nvlink::MachineNvLinkStatusObservation; -use self::spx::MachineSpxStatusObservation; use super::StateSla; -use super::bmc_info::BmcInfo; -use super::hardware_info::MachineInventory; use super::instance::snapshot::InstanceSnapshot; use super::instance::status::extension_service::InstanceExtensionServiceStatusObservation; use super::instance::status::network::InstanceNetworkStatusObservation; use super::machine_boot_interface::MachineBootInterface; use super::metadata::Metadata; -use super::sku::SkuStatus; use 
crate::controller_outcome::PersistentStateHandlerOutcome; use crate::dpa_interface::DpaInterface; use crate::errors::{ModelError, ModelResult}; use crate::expected_machine::ExpectedMachineData; use crate::firmware::FirmwareComponentType; -use crate::hardware_info::{HardwareInfo, MachineNvLinkInfo}; use crate::instance::config::network::DeviceLocator; use crate::instance::snapshot::InstanceSnapshotPgJson; use crate::machine::capabilities::MachineCapabilitiesSet; @@ -64,13 +56,13 @@ use crate::machine::health_override::HealthReportSources; use crate::machine_interface::InterfaceType; use crate::machine_interface_address::InterfaceAssociationType; use crate::network_segment::NetworkSegmentType; -use crate::power_manager::PowerOptions; use crate::predicted_machine_interface::PredictedMachineInterface; use crate::state_history::StateHistoryRecord; pub mod slas; pub mod capabilities; +pub mod config; pub mod health_override; pub mod infiniband; pub mod json; @@ -79,9 +71,13 @@ pub mod machine_search_config; pub mod network; pub mod nvlink; pub mod spx; +pub mod status; pub mod topology; pub mod upgrade_policy; +pub use self::config::MachineConfig; +pub use self::status::MachineStatus; + #[derive(Clone, Debug, PartialEq, Eq)] pub struct DpuOsOperationalState { pub state_detail: String, @@ -406,7 +402,7 @@ impl ManagedHostStateSnapshot { /// to be duplicated at every state controller callsite needing to pass a MAC /// into things like machine_setup, is_bios_setup, etc. pub fn boot_interface_mac(&self) -> Option { - pick_boot_interface_mac(&self.host_snapshot.interfaces) + pick_boot_interface_mac(&self.host_snapshot.status.interfaces) } /// Returns the host's boot interface as a fully-populated @@ -419,7 +415,7 @@ impl ManagedHostStateSnapshot { /// alone. Because the MAC and id come from one row, the pair can never name a /// different interface than `boot_interface_mac`. 
pub fn boot_interface(&self) -> Option { - pick_boot_interface_pair(&self.host_snapshot.interfaces) + pick_boot_interface_pair(&self.host_snapshot.status.interfaces) } /// Returns `true` if override report is hw_health, `false` otherwise. @@ -625,6 +621,7 @@ impl ManagedHostStateSnapshot { pub fn sort_dpu_snapshots(&mut self) -> Result<(), ManagedHostStateSnapshotError> { let mac_pci_map: HashMap> = self .host_snapshot + .status .hardware_info .iter() .flat_map(|hi| &hi.network_interfaces) @@ -641,6 +638,7 @@ impl ManagedHostStateSnapshot { self.dpu_snapshots.sort_by(|lhs, rhs| { let Some(lhs_dpu_mac) = lhs + .status .hardware_info .as_ref() .and_then(|hi| hi.dpu_info.as_ref()) @@ -650,6 +648,7 @@ impl ManagedHostStateSnapshot { }; let Some(rhs_dpu_mac) = rhs + .status .hardware_info .as_ref() .and_then(|hi| hi.dpu_info.as_ref()) @@ -671,6 +670,7 @@ impl ManagedHostStateSnapshot { let primary_interface = self .host_snapshot + .status .interfaces .iter() .find(|interface| interface.primary_interface); @@ -760,44 +760,38 @@ pub struct Machine { /// applied yet, and other useful things. pub network_status_observation: Option, - /// The most recent status of infiniband interfaces. - pub infiniband_status_observation: Option, - - // The most recent status of the nvlink GPUs. - pub nvlink_status_observation: Option, - - // The most recent status of the SPX attachments. - pub spx_status_observation: Option, - /// A list of [StateHistoryRecord]s that this machine has experienced pub history: Vec, - /// A list of [MachineInterfaceSnapshot]s that this machine owns - pub interfaces: Vec, - - /// The Hardware information that was discovered for this machine - pub hardware_info: Option, - - /// The BMC info for this machine - pub bmc_info: BmcInfo, - - /// Last time when machine came up. - pub last_reboot_time: Option>, + /// Machine metadata + pub metadata: Metadata, - /// Last time when cleanup was performed successfully. 
- pub last_cleanup_time: Option>, + pub version: ConfigVersion, + // Columns for these exist, but are unused in rust code + // /// When this machine record was created + // pub created: DateTime, + // /// When the machine record was last modified + // pub updated: DateTime, + // /// When the machine was last deployed + // pub deployed: Option>, + /// The rack this machine is assigned to (sourced from the expected-machine record at + /// ingestion time; not operator-mutable). + pub rack_id: Option, - /// Last time when discovery finished. - pub last_discovery_time: Option>, + /// The declared desired hardware SKU (sourced from the expected-machine record). + /// Distinct from `MachineStatus::hw_sku`, which reflects the observed match. + pub hw_sku: Option, - /// Last time when scout contacted the machine. - pub last_scout_contact_time: Option>, + /// Operator-set desired state. + pub config: MachineConfig, - /// Build version of forge-scout last observed during machine discovery registration. - pub last_scout_observed_version: Option, + /// System-observed state. + pub status: MachineStatus, - /// Failure cause. If failure cause is critical, machine will move into Failed state. - pub failure_details: FailureDetails, + /// All health report sources + pub health_reports: HealthReportSources, /// Last time when machine reprovision requested. pub reprovision_requested: Option, @@ -808,17 +802,6 @@ pub struct Machine { /// Does the forge-dpu-agent on this DPU need upgrading? pub dpu_agent_upgrade_requested: Option, - /// All health report sources - pub health_reports: HealthReportSources, - - // Inventory related to a DPU machine as reported by the agent there. - // Software and versions installed on the machine. - pub inventory: Option, - - /// Last time when machine reboot was requested. - /// This field takes care of reboot requested from state machine only.
- pub last_reboot_requested: Option, - /// The result of the last attempt to change state pub controller_state_outcome: Option, @@ -834,68 +817,25 @@ pub struct Machine { /// current cleanup validation id. pub cleanup_machine_validation_id: Option, - /// Override to enable or disable firmware auto update - pub firmware_autoupdate: Option, - /// current on demand validation id. pub on_demand_machine_validation_id: Option, pub on_demand_machine_validation_request: Option, - /// The InstanceType with which a machine is associated if any - pub instance_type_id: Option, - pub asn: Option, - /// Machine metadata - pub metadata: Metadata, - - /// Version field that tracks changes to - /// - Metadata - pub version: ConfigVersion, - // Columns for these exist, but are unused in rust code - // /// When this machine record was created - // pub created: DateTime, - // /// When the machine record was last modified - // pub updated: DateTime, - // /// When the machine was last deployed - // pub deployed: Option>, - pub hw_sku: Option, - pub hw_sku_status: Option, - - /// Host's power options. - pub power_options: Option, - - /// The hardware SKU's device type - pub hw_sku_device_type: Option, - - /// If host upgrades have been completed since the last start explicit start request or actual start - pub update_complete: bool, - - /// The NVLink GPU info for this machine. - pub nvlink_info: Option, - - /// Whether the DPF is enabled for this machine - pub dpf: Dpf, - /// Per-host profile for state-machine-affecting settings, seeded from the /// expected-machine record. Future per-host knobs that influence ingestion /// or state transitions should be added here. pub host_profile: HostProfile, + /// Rack-level firmware upgrade status, updated by the rack state machine. 
+ pub rack_fw_details: Option, + /// Timestamp when manual firmware upgrade was marked as completed /// TEMPORARY: Used for workflow where manual upgrades are required before automatic ones /// TODO: Remove after upgrade-through-scout is complete pub manual_firmware_upgrade_completed: Option>, - - /// The rack that this machine is associated with - pub rack_id: Option, - - /// Rack-level firmware upgrade status, updated by the rack state machine. - pub rack_fw_details: Option, - - pub slot_number: Option, - pub tray_index: Option, } // Dpf status field. @@ -956,7 +896,7 @@ impl Machine { } pub fn bmc_vendor(&self) -> bmc_vendor::BMCVendor { - match self.hardware_info.as_ref() { + match self.status.hardware_info.as_ref() { Some(hw) => hw.bmc_vendor(), None => bmc_vendor::BMCVendor::Unknown, } @@ -1020,7 +960,8 @@ impl Machine { /// e.g. `9C:63:C0:E6:B4:3D` -> `9c-63-c0-e6-b4-3d`. /// Not using Machine ID because it's too long, and not using IP because it's not stable. pub fn dpf_id(&self) -> Option { - self.bmc_info + self.status + .bmc_info .mac .map(|mac| mac.to_string().to_lowercase().replace(':', "-")) } @@ -1035,16 +976,18 @@ impl Machine { return Vec::new(); } - self.interfaces + self.status + .interfaces .iter() .filter_map(|i| i.attached_dpu_machine_id) .collect::>() } pub fn bmc_addr(&self) -> Option { - self.bmc_info + self.status + .bmc_info .ip - .map(|ip| SocketAddr::new(ip, self.bmc_info.port.unwrap_or(443))) + .map(|ip| SocketAddr::new(ip, self.status.bmc_info.port.unwrap_or(443))) } /// If this machine is a DPU, returns whether the version of the @@ -1070,12 +1013,12 @@ impl Machine { } pub fn to_capabilities(&self) -> Option { - self.hardware_info.clone().map(|info| { + self.status.hardware_info.clone().map(|info| { MachineCapabilitiesSet::from_hardware_info( info, - self.infiniband_status_observation.as_ref(), + self.status.infiniband_status_observation.as_ref(), self.associated_dpu_machine_ids(), - self.interfaces.clone(), + 
self.status.interfaces.clone(), ) }) } @@ -1102,7 +1045,8 @@ impl Machine { } pub fn primary_attached_dpu_machine_id(&self) -> Option { - self.interfaces + self.status + .interfaces .iter() .find(|iface| iface.primary_interface) .and_then(|iface| iface.attached_dpu_machine_id) @@ -1115,22 +1059,24 @@ impl Machine { )); } - let hardware_info = self - .hardware_info - .as_ref() - .ok_or(ModelError::DpuMappingError(format!( - "Missing hardware information for machine {}", - self.id - )))?; + let hardware_info = + self.status + .hardware_info + .as_ref() + .ok_or(ModelError::DpuMappingError(format!( + "Missing hardware information for machine {}", + self.id + )))?; let mut id_to_device_map: HashMap = HashMap::default(); let mut device_to_id_map: HashMap> = HashMap::default(); // in order to ensure that the primary dpu is assigned a network config, it is configured first. - // hardware_interfaces has the primary dpu as the first interface, self.interfaces may not. - // iterate over hardware_interfaces and match it to self.interfaces using the mac address + // hardware_interfaces has the primary dpu as the first interface, self.status.interfaces may not. + // iterate over hardware_interfaces and match it to self.status.interfaces using the mac address for hardware_iface in &hardware_info.network_interfaces { if let Some(pci) = &hardware_iface.pci_properties && let Some(iface) = self + .status .interfaces .iter() .find(|i| i.mac_address == hardware_iface.mac_address) @@ -1447,7 +1393,7 @@ fn bfb_install_support(dpu_snapshots: &[Machine]) -> bool { let bfb_install_support_ = |dpu_snapshots: &[Machine]| -> bool { dpu_snapshots .iter() - .all(|m| m.bmc_info.supports_bfb_install()) + .all(|m| m.status.bmc_info.supports_bfb_install()) }; bfb_install_support_(dpu_snapshots) @@ -2864,7 +2810,7 @@ pub fn dpf_based_dpu_provisioning_possible( } // DPF should be enabled for host. 
- if !state.host_snapshot.dpf.enabled { + if !state.host_snapshot.config.dpf.enabled { tracing::info!( "DPF based DPU provisioning is not possible because DPF is not enabled for the host {}.", state.host_snapshot.id @@ -2883,7 +2829,7 @@ pub fn dpf_based_dpu_provisioning_possible( // to continue or we should be trying to reprovision all the dpus (switching // to DPF). Reprovisioning only a subset of DPUs cannot flip the host to DPF. if reprovisioning_case - && !state.host_snapshot.dpf.used_for_ingestion + && !state.host_snapshot.config.dpf.used_for_ingestion && !state .dpu_snapshots .iter() @@ -2905,7 +2851,8 @@ pub fn dpf_based_dpu_provisioning_possible( // All DPUs should not be Bluefield 2. if state.dpu_snapshots.iter().any(|dpu| { - dpu.hardware_info + dpu.status + .hardware_info .as_ref() .and_then(|hardware_info| hardware_info.dpu_info.as_ref()) .map(|dpu_data| crate::site_explorer::is_bf2_dpu(&dpu_data.part_number)) @@ -2928,7 +2875,7 @@ pub fn dpf_based_dpu_provisioning_possible( if !state .dpu_snapshots .iter() - .all(|dpu| dpu.bmc_info.supports_bfb_install()) + .all(|dpu| dpu.status.bmc_info.supports_bfb_install()) { tracing::info!( "DPF based DPU provisioning is not possible because some DPUs do not support BFB install via Redfish." diff --git a/crates/api-model/src/machine/status.rs b/crates/api-model/src/machine/status.rs new file mode 100644 index 0000000000..91ae96b2df --- /dev/null +++ b/crates/api-model/src/machine/status.rs @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +use chrono::{DateTime, Utc}; + +use crate::bmc_info::BmcInfo; +use crate::hardware_info::{HardwareInfo, MachineInventory, MachineNvLinkInfo}; +use crate::machine::infiniband::MachineInfinibandStatusObservation; +use crate::machine::nvlink::MachineNvLinkStatusObservation; +use crate::machine::spx::MachineSpxStatusObservation; +use crate::machine::{FailureDetails, MachineInterfaceSnapshot, MachineLastRebootRequested}; +use crate::power_manager::PowerOptions; +use crate::sku::SkuStatus; + +/// System-observed state for a machine. +/// +/// Corresponds to `MachineStatus` in the protobuf. +#[derive(Debug, Clone)] +pub struct MachineStatus { + pub interfaces: Vec, + pub hardware_info: Option, + pub bmc_info: BmcInfo, + pub last_reboot_time: Option>, + pub last_cleanup_time: Option>, + pub last_discovery_time: Option>, + pub last_scout_contact_time: Option>, + pub last_scout_observed_version: Option, + pub failure_details: FailureDetails, + pub inventory: Option, + pub last_reboot_requested: Option, + pub hw_sku: Option, + pub hw_sku_device_type: Option, + pub update_complete: bool, + pub nvlink_info: Option, + pub infiniband_status_observation: Option, + pub nvlink_status_observation: Option, + pub spx_status_observation: Option, + pub slot_number: Option, + pub tray_index: Option, + /// Power management state for this machine (hosts only; absent for DPUs). 
+ pub power_options: Option, +} diff --git a/crates/api-web/src/dpu_versions.rs b/crates/api-web/src/dpu_versions.rs index 73457cd039..723b9f0d52 100644 --- a/crates/api-web/src/dpu_versions.rs +++ b/crates/api-web/src/dpu_versions.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. +// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::sync::Arc; diff --git a/crates/api-web/src/health.rs b/crates/api-web/src/health.rs index aa036f738f..71dfa79b2d 100644 --- a/crates/api-web/src/health.rs +++ b/crates/api-web/src/health.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. +// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::str::FromStr; use std::sync::Arc; diff --git a/crates/api-web/src/machine.rs b/crates/api-web/src/machine.rs index e1de2ce593..5d33a28aad 100644 --- a/crates/api-web/src/machine.rs +++ b/crates/api-web/src/machine.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. 
+// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::collections::{HashMap, HashSet}; use std::sync::Arc; diff --git a/crates/api-web/src/managed_host.rs b/crates/api-web/src/managed_host.rs index 0dba7c859c..83aaa5c280 100644 --- a/crates/api-web/src/managed_host.rs +++ b/crates/api-web/src/managed_host.rs @@ -129,6 +129,7 @@ impl ManagedHostRowDisplay { // Decompose hardware_info into the pieces we want to show let (vendor, model, num_gpus, num_ib_ifs, host_memory) = host_snapshot + .status .hardware_info .map(|hardware_info| { let (vendor, model) = hardware_info @@ -153,17 +154,20 @@ impl ManagedHostRowDisplay { }) .unwrap_or_default(); let host_bmc_ip = host_snapshot + .status .bmc_info .ip .map(|ip| ip.to_string()) .unwrap_or_default(); let host_bmc_mac = host_snapshot + .status .bmc_info .mac .map(|m| m.to_string()) .unwrap_or_default(); let (host_admin_ip, host_admin_mac) = host_snapshot + .status .interfaces .into_iter() .find(|i| i.primary_interface) @@ -213,8 +217,8 @@ impl ManagedHostRowDisplay { maintenance_reference, maintenance_start_time, dpus: dpu_snapshots.into_iter().map_into().collect(), - dpf_enabled: host_snapshot.dpf.enabled, - dpf_used_for_ingestion: host_snapshot.dpf.used_for_ingestion, + dpf_enabled: host_snapshot.config.dpf.enabled, + dpf_used_for_ingestion: host_snapshot.config.dpf.used_for_ingestion, } } } @@ -222,12 +226,18 @@ impl ManagedHostRowDisplay { impl From for AttachedDpuRowDisplay { fn from(item: Machine) -> Self { let bmc_ip = item + .status .bmc_info .ip .map(|ip| ip.to_string()) .unwrap_or_default(); - let bmc_mac = item.bmc_info.mac.map(|m| m.to_string()).unwrap_or_default(); - let primary_iface = item.interfaces.iter().find(|i| i.primary_interface); + let bmc_mac = item + .status + .bmc_info + .mac + .map(|m| m.to_string()) + .unwrap_or_default(); + let primary_iface = item.status.interfaces.iter().find(|i| i.primary_interface); let oob_ip = primary_iface .and_then(|t| 
t.addresses.first().map(|a| a.to_string())) .unwrap_or_default(); diff --git a/crates/api-web/src/network_status.rs b/crates/api-web/src/network_status.rs index 92d15a3af8..6593659810 100644 --- a/crates/api-web/src/network_status.rs +++ b/crates/api-web/src/network_status.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. +// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::cmp::min; use std::collections::HashMap; diff --git a/crates/api-web/src/nvlink.rs b/crates/api-web/src/nvlink.rs index 5d061f9564..f69b21a461 100644 --- a/crates/api-web/src/nvlink.rs +++ b/crates/api-web/src/nvlink.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. +// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::collections::HashMap; use std::sync::Arc; diff --git a/crates/api-web/src/redfish_browser.rs b/crates/api-web/src/redfish_browser.rs index 68c4a62bf2..879684daf6 100644 --- a/crates/api-web/src/redfish_browser.rs +++ b/crates/api-web/src/redfish_browser.rs @@ -14,6 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +// Flat `rpc::forge::Machine` fields are deprecated in favour of `status`/`config` +// sub-messages, but this module must still read them until the REST API is migrated. 
+// See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] use std::sync::Arc; diff --git a/crates/dpa-manager/src/card_handler/svpc.rs b/crates/dpa-manager/src/card_handler/svpc.rs index 275a794490..876920ec6a 100644 --- a/crates/dpa-manager/src/card_handler/svpc.rs +++ b/crates/dpa-manager/src/card_handler/svpc.rs @@ -105,6 +105,7 @@ impl SvpcInterfaceHandler { let observed = Self::at_most_one( machine + .status .spx_status_observation .iter() .flat_map(|o| &o.spx_attachments) @@ -195,6 +196,7 @@ impl SvpcInterfaceHandler { let this_mac = dpa_interface.mac_address; let this_nic_observed_attachments = machine + .status .spx_status_observation .clone() .map(|observed| { diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 5ffe158532..ba79381a53 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,6 +15,10 @@ * limitations under the License. */ +// The deprecated fields on `rpc::forge::Machine` must still be read here for +// backwards-compat. 
See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] + use std::collections::{HashMap, HashSet}; use std::convert::TryFrom; use std::net::IpAddr; diff --git a/crates/ib-fabric/src/lib.rs b/crates/ib-fabric/src/lib.rs index 3f3b735d97..48eede70c2 100755 --- a/crates/ib-fabric/src/lib.rs +++ b/crates/ib-fabric/src/lib.rs @@ -684,7 +684,7 @@ async fn record_machine_infiniband_status_observation( ) -> Result { let mut result = MachineIbStatusEvaluation::default(); - if mh_snapshot.host_snapshot.hardware_info.is_none() { + if mh_snapshot.host_snapshot.status.hardware_info.is_none() { // Skip status update while hardware info is not available *metrics .num_machines_by_port_states @@ -700,6 +700,7 @@ async fn record_machine_infiniband_status_observation( let machine_id = &mh_snapshot.host_snapshot.id; let ib_hw_info = &mh_snapshot .host_snapshot + .status .hardware_info .as_ref() .unwrap() @@ -781,6 +782,7 @@ async fn record_machine_infiniband_status_observation( let mut prev = mh_snapshot .host_snapshot + .status .infiniband_status_observation .clone() .unwrap_or_default(); @@ -1074,7 +1076,10 @@ async fn record_machine_infiniband_status_observation( .map_err(|e| DatabaseError::new("acquire connection", e))?; db::machine::update_infiniband_status_observation(&mut conn, machine_id, &cur).await?; metrics.num_machine_ib_status_updates += 1; - mh_snapshot.host_snapshot.infiniband_status_observation = Some(cur); + mh_snapshot + .host_snapshot + .status + .infiniband_status_observation = Some(cur); } Ok(result) diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index d4ccb28b10..24b0191f7f 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -477,6 +477,7 @@ impl MachineStateHandler { ) { for dpu_snapshot in state.dpu_snapshots.iter() { let fw_version = dpu_snapshot + .status .hardware_info .as_ref() .and_then(|hi| hi.dpu_info.as_ref().map(|di| 
di.firmware_version.clone())); @@ -488,6 +489,7 @@ impl MachineStateHandler { } for mut component in dpu_snapshot + .status .inventory .as_ref() .map(|i| i.components.clone()) @@ -543,6 +545,7 @@ impl MachineStateHandler { ctx.metrics.is_usable_as_instance = state.is_usable_as_instance(false).is_ok(); ctx.metrics.num_gpus = state .host_snapshot + .status .hardware_info .as_ref() .map(|info| info.gpus.len()) @@ -554,7 +557,7 @@ impl MachineStateHandler { ctx.metrics.is_host_bios_password_set = state.host_snapshot.bios_password_set_time.is_some(); ctx.metrics.sku = state.host_snapshot.hw_sku.clone(); - ctx.metrics.sku_device_type = state.host_snapshot.hw_sku_device_type.clone(); + ctx.metrics.sku_device_type = state.host_snapshot.status.hw_sku_device_type.clone(); // Note that DPU alerts may be suppressed (classifications removed) in the aggregate health report. ctx.metrics.health.populate( @@ -891,7 +894,7 @@ impl MachineStateHandler { handler_restart_dpu( dpu_snapshot, ctx, - mh_snapshot.host_snapshot.dpf.used_for_ingestion, + mh_snapshot.host_snapshot.config.dpf.used_for_ingestion, ) .await?; ctx.pending_db_writes.push( @@ -1099,7 +1102,7 @@ impl MachineStateHandler { CleanupState::HostCleanup { boss_controller_id } => { if !cleanedup_after_state_transition( mh_snapshot.host_snapshot.state.version, - mh_snapshot.host_snapshot.last_cleanup_time, + mh_snapshot.host_snapshot.status.last_cleanup_time, ) { let status = trigger_reboot_if_needed( &mh_snapshot.host_snapshot, @@ -1282,7 +1285,7 @@ impl MachineStateHandler { // of failed state. if discovered_after_state_transition( mh_snapshot.host_snapshot.state.version, - mh_snapshot.host_snapshot.last_discovery_time, + mh_snapshot.host_snapshot.status.last_discovery_time, ) { ctx.metrics .machine_reboot_attempts_in_failed_during_discovery = @@ -1342,10 +1345,11 @@ impl MachineStateHandler { FailureCause::NVMECleanFailed { .. 
} if machine_id.machine_type().is_host() => { if cleanedup_after_state_transition( mh_snapshot.host_snapshot.state.version, - mh_snapshot.host_snapshot.last_cleanup_time, - ) && mh_snapshot.host_snapshot.failure_details.failed_at + mh_snapshot.host_snapshot.status.last_cleanup_time, + ) && mh_snapshot.host_snapshot.status.failure_details.failed_at < mh_snapshot .host_snapshot + .status .last_cleanup_time .unwrap_or_default() { @@ -1682,7 +1686,8 @@ impl MachineStateHandler { ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, ) -> Result>, StateHandlerError> { let host_machine_id = &mh_snapshot.host_snapshot.id; - let Some(last_scout_contact) = mh_snapshot.host_snapshot.last_scout_contact_time else { + let Some(last_scout_contact) = mh_snapshot.host_snapshot.status.last_scout_contact_time + else { return Ok(None); }; @@ -1871,7 +1876,8 @@ impl MachineStateHandler { machine_id: dpu.id, time: Utc::now(), }); - handler_restart_dpu(dpu, ctx, state.host_snapshot.dpf.used_for_ingestion).await?; + handler_restart_dpu(dpu, ctx, state.host_snapshot.config.dpf.used_for_ingestion) + .await?; } return Ok(next_state); } @@ -1952,7 +1958,7 @@ async fn handle_restart_verification( const MAX_VERIFICATION_ATTEMPTS: i32 = 2; // Check host first - if let Some(last_reboot) = &mh_snapshot.host_snapshot.last_reboot_requested + if let Some(last_reboot) = &mh_snapshot.host_snapshot.status.last_reboot_requested && last_reboot.restart_verified == Some(false) { let verification_attempts = last_reboot.verification_attempts.unwrap_or(0); @@ -2058,7 +2064,7 @@ async fn handle_restart_verification( let mut pending_message = Vec::new(); for dpu in &mh_snapshot.dpu_snapshots { - if let Some(last_reboot) = dpu.last_reboot_requested + if let Some(last_reboot) = dpu.status.last_reboot_requested && last_reboot.restart_verified == Some(false) { let verification_attempts = last_reboot.verification_attempts.unwrap_or(0); @@ -2311,7 +2317,7 @@ impl StateHandler for MachineStateHandler { 
let was_ready = matches!(mh_snapshot.managed_state, ManagedHostState::Ready); - if !mh_snapshot.host_snapshot.dpf.used_for_ingestion { + if !mh_snapshot.host_snapshot.config.dpf.used_for_ingestion { tracing::debug!( machine_id = %host_machine_id, removed_in = "v2.1", @@ -2716,6 +2722,7 @@ async fn check_if_not_in_original_failure_cause_anymore( /// Return `DpuModel` if the explored endpoint is a DPU pub fn identify_dpu(dpu_snapshot: &Machine) -> DpuModel { let model = dpu_snapshot + .status .hardware_info .as_ref() .and_then(|hi| { @@ -2835,6 +2842,7 @@ async fn handle_dpu_reprovision( ReprovisionState::PowerDown => { let basetime = state .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) @@ -2887,7 +2895,7 @@ async fn handle_dpu_reprovision( )), ReprovisionState::VerifyFirmareVersions => { // No need to compare version if machine is reprovisioned by DPF. - if !state.host_snapshot.dpf.used_for_ingestion + if !state.host_snapshot.config.dpf.used_for_ingestion && let Some(outcome) = check_fw_component_version(ctx, dpu_snapshot, hardware_models).await? 
{ @@ -3400,19 +3408,21 @@ async fn handle_dpu_reprovision( &state.host_snapshot.id ); - let bmc_mac_address = state.host_snapshot.bmc_info.mac.ok_or_else(|| { - StateHandlerError::MissingData { - object_id: state.host_snapshot.id.to_string(), - missing: "bmc_mac", - } - })?; + let bmc_mac_address = + state.host_snapshot.status.bmc_info.mac.ok_or_else(|| { + StateHandlerError::MissingData { + object_id: state.host_snapshot.id.to_string(), + missing: "bmc_mac", + } + })?; - let bmc_ip_address = state.host_snapshot.bmc_info.ip.ok_or_else(|| { - StateHandlerError::MissingData { - object_id: state.host_snapshot.id.to_string(), - missing: "bmc_ip", - } - })?; + let bmc_ip_address = + state.host_snapshot.status.bmc_info.ip.ok_or_else(|| { + StateHandlerError::MissingData { + object_id: state.host_snapshot.id.to_string(), + missing: "bmc_ip", + } + })?; if let Err(ipmitool_error) = ctx .services @@ -3624,6 +3634,7 @@ async fn check_host_boot_config( fn should_skip_boot_order_remediation(mh_snapshot: &ManagedHostStateSnapshot) -> bool { mh_snapshot .host_snapshot + .status .hardware_info .as_ref() .is_some_and(|hw| hw.is_dgx_h100()) @@ -3645,7 +3656,8 @@ async fn should_wait_for_dpus_before_host_boot_config( !are_dpus_up_trigger_reboot_if_needed(mh_snapshot, reachability_params, ctx).await } HostBootConfigDpuFreshness::SinceLastHostRebootRequest => { - let Some(last_reboot_requested) = mh_snapshot.host_snapshot.last_reboot_requested + let Some(last_reboot_requested) = + mh_snapshot.host_snapshot.status.last_reboot_requested else { tracing::warn!( machine_id = %mh_snapshot.host_snapshot.id, @@ -3710,7 +3722,7 @@ pub async fn try_wait_for_dpu_discovery( } if !discovered_after_state_transition( dpu_snapshot.state.version, - dpu_snapshot.last_discovery_time, + dpu_snapshot.status.last_discovery_time, ) { // Reboot only the DPU for which the handler loop is called. 
if current_dpu_machine_id == &dpu_snapshot.id { @@ -3842,6 +3854,7 @@ async fn check_fw_component_version( // BMC FW version need to update in machine_topology->bmc_info if component == FirmwareComponentType::Bmc && dpu_snapshot + .status .bmc_info .clone() .firmware_version @@ -3858,7 +3871,7 @@ async fn check_fw_component_version( .and_then(|uefi| uefi.version) .unwrap_or_else(|| { dpu_snapshot - .hardware_info + .status.hardware_info .as_ref() .and_then(|h| h.dmi_data.as_ref()) .map(|d| d.bios_version.clone()) @@ -3904,16 +3917,16 @@ fn set_managed_host_topology_update_needed( fn get_failed_state(state: &ManagedHostStateSnapshot) -> Option<(MachineId, FailureDetails)> { // Return updated state only for errors which should cause machine to move into failed // state. - if state.host_snapshot.failure_details.cause != FailureCause::NoError { + if state.host_snapshot.status.failure_details.cause != FailureCause::NoError { return Some(( state.host_snapshot.id, - state.host_snapshot.failure_details.clone(), + state.host_snapshot.status.failure_details.clone(), )); } else { for dpu_snapshot in &state.dpu_snapshots { // In case of the DPU, use first failed DPU and recover it before moving forward. 
- if dpu_snapshot.failure_details.cause != FailureCause::NoError { - return Some((dpu_snapshot.id, dpu_snapshot.failure_details.clone())); + if dpu_snapshot.status.failure_details.cause != FailureCause::NoError { + return Some((dpu_snapshot.id, dpu_snapshot.status.failure_details.clone())); } } } @@ -4042,6 +4055,7 @@ impl DpuMachineStateHandler { tracing::info!( "DPU {dpu_machine_id} (BMC FW version: {}); next_state: {}.", dpu_snapshot + .status .bmc_info .firmware_version .clone() @@ -4129,7 +4143,7 @@ impl DpuMachineStateHandler { handler_restart_dpu( dpu_snapshot, ctx, - state.host_snapshot.dpf.used_for_ingestion, + state.host_snapshot.config.dpf.used_for_ingestion, ) .await?; } @@ -4199,7 +4213,7 @@ impl DpuMachineStateHandler { handler_restart_dpu( dpu_snapshot, ctx, - state.host_snapshot.dpf.used_for_ingestion, + state.host_snapshot.config.dpf.used_for_ingestion, ) .await?; } @@ -4243,6 +4257,7 @@ impl DpuMachineStateHandler { } => { let basetime = state .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) @@ -4294,7 +4309,7 @@ impl DpuMachineStateHandler { // fixme: in case of DPF ingested machine, the fw version compare should be done // with the image with which the ingestion is done. 
- if !state.host_snapshot.dpf.used_for_ingestion + if !state.host_snapshot.config.dpf.used_for_ingestion && let Some(outcome) = check_fw_component_version( ctx, dpu_snapshot, @@ -4377,7 +4392,7 @@ impl DpuMachineStateHandler { handler_restart_dpu( dpu_snapshot, ctx, - state.host_snapshot.dpf.used_for_ingestion, + state.host_snapshot.config.dpf.used_for_ingestion, ) .await?; @@ -4926,7 +4941,7 @@ pub async fn trigger_reboot_if_needed_with_location( let host = &state.host_snapshot; // Its highly unlikely that the host has never been rebooted (and the last_reboot_reqeusted // field shouldn't get cleared), but default it if its not set - let last_reboot_requested = match &target.last_reboot_requested { + let last_reboot_requested = match &target.status.last_reboot_requested { None => &MachineLastRebootRequested { time: host.state.version.timestamp(), mode: MachineLastRebootRequestedMode::Reboot, @@ -5070,8 +5085,12 @@ pub async fn trigger_reboot_if_needed_with_location( } else { // Reboot if target.id.machine_type().is_dpu() { - handler_restart_dpu(target, ctx, state.host_snapshot.dpf.used_for_ingestion) - .await?; + handler_restart_dpu( + target, + ctx, + state.host_snapshot.config.dpf.used_for_ingestion, + ) + .await?; } else { if let Ok(client) = ctx.services.create_redfish_client_from_machine(host).await { @@ -5119,7 +5138,7 @@ pub async fn trigger_reboot_if_needed_with_location( /// machine has come up or not after reboot. // True if machine is rebooted after state change. pub fn rebooted(target: &Machine) -> bool { - target.last_reboot_time.unwrap_or_default() > target.state.version.timestamp() + target.status.last_reboot_time.unwrap_or_default() > target.state.version.timestamp() } pub fn machine_validation_completed(target: &Machine) -> bool { @@ -5778,7 +5797,7 @@ impl StateHandler for HostMachineStateHandler { // reset, so only a real, discovered host enters it. A predicted host waits for // discovery to promote it; the promoted host then does the cleanup. 
// (machine_scout.rs mirrors this on the scout side.) - if mh_snapshot.host_snapshot.last_cleanup_time.is_none() + if mh_snapshot.host_snapshot.status.last_cleanup_time.is_none() && host_machine_id.machine_type().is_host() { return Ok(StateHandlerOutcome::transition(waiting_for_cleanup_state( @@ -5789,13 +5808,13 @@ impl StateHandler for HostMachineStateHandler { if !discovered_after_state_transition( mh_snapshot.host_snapshot.state.version, - mh_snapshot.host_snapshot.last_discovery_time, + mh_snapshot.host_snapshot.status.last_discovery_time, ) { tracing::trace!( machine_id = %host_machine_id, "Waiting for forge-scout to report host online. \ Host last seen {:?}, must come after DPU's {}", - mh_snapshot.host_snapshot.last_discovery_time, + mh_snapshot.host_snapshot.status.last_discovery_time, mh_snapshot.host_snapshot.state.version.timestamp() ); let status = trigger_reboot_if_needed( @@ -6200,7 +6219,7 @@ impl StateHandler for InstanceStateHandler { for dpa_interface in &mh_snapshot.dpa_interface_snapshots { if !dpa_interface.managed_host_network_config_version_synced( &mh_snapshot.instance, - &mh_snapshot.host_snapshot.spx_status_observation, + &mh_snapshot.host_snapshot.status.spx_status_observation, ) { return Ok(StateHandlerOutcome::wait( "Waiting for DPA agent(s) to apply network config and report healthy network" @@ -6241,6 +6260,7 @@ impl StateHandler for InstanceStateHandler { if let Err(not_synced_reason) = ib_config_synced( mh_snapshot .host_snapshot + .status .infiniband_status_observation .as_ref(), Some(&instance.config.infiniband), @@ -6254,7 +6274,11 @@ impl StateHandler for InstanceStateHandler { // Check if the nvlink config has been applied if let Err(not_synced_reason) = nvlink_config_synced( - mh_snapshot.host_snapshot.nvlink_status_observation.as_ref(), + mh_snapshot + .host_snapshot + .status + .nvlink_status_observation + .as_ref(), Some(&instance.config.nvlink), ) { return Ok(StateHandlerOutcome::wait(format!( @@ -6677,7 +6701,7 @@ impl 
StateHandler for InstanceStateHandler { handler_restart_dpu( dpu_snapshot, ctx, - mh_snapshot.host_snapshot.dpf.used_for_ingestion, + mh_snapshot.host_snapshot.config.dpf.used_for_ingestion, ) .await?; dpus_for_reprov.push(dpu_snapshot); @@ -6812,7 +6836,7 @@ impl StateHandler for InstanceStateHandler { } if !dpa_interface.managed_host_network_config_version_synced( &None, - &mh_snapshot.host_snapshot.spx_status_observation, + &mh_snapshot.host_snapshot.status.spx_status_observation, ) { return Ok(StateHandlerOutcome::wait( "Waiting for DPA agent(s) to apply network config and report healthy network" @@ -6828,6 +6852,7 @@ impl StateHandler for InstanceStateHandler { match ib_config_synced( mh_snapshot .host_snapshot + .status .infiniband_status_observation .as_ref(), Some(&instance.config.infiniband), @@ -7064,7 +7089,7 @@ impl StateHandler for InstanceStateHandler { for dpa_interface in &mh_snapshot.dpa_interface_snapshots { if !dpa_interface.managed_host_network_config_version_synced( &mh_snapshot.instance, - &mh_snapshot.host_snapshot.spx_status_observation, + &mh_snapshot.host_snapshot.status.spx_status_observation, ) { return Ok(StateHandlerOutcome::wait(format!( "Waiting for DPA agent {dpa_id} to apply network config and report healthy network", @@ -7383,6 +7408,7 @@ fn check_instance_network_synced_and_dpu_healthy( // allow primary dpu to be used when using one config with no device_locators match mh_snapshot .host_snapshot + .status .interfaces .iter() .find(|iface| iface.primary_interface) @@ -8378,15 +8404,12 @@ impl HostUpgradeState { let script = to_install.script.unwrap_or("/bin/false".into()); // Should always be Some at this point let upgrade_script_state = self.upgrade_script_state.clone(); let (username, password) = if let Some(credential_reader) = &self.credential_reader { - let bmc_mac_address = - state - .host_snapshot - .bmc_info - .mac - .ok_or_else(|| StateHandlerError::MissingData { - object_id: state.host_snapshot.id.to_string(), - missing: 
"bmc_mac", - })?; + let bmc_mac_address = state.host_snapshot.status.bmc_info.mac.ok_or_else(|| { + StateHandlerError::MissingData { + object_id: state.host_snapshot.id.to_string(), + missing: "bmc_mac", + } + })?; let key = CredentialKey::BmcCredentials { credential_type: BmcCredentialType::BmcRoot { bmc_mac_address }, }; @@ -8883,6 +8906,7 @@ impl HostUpgradeState { let address = state .host_snapshot + .status .bmc_info .ip_addr() .map_err(StateHandlerError::GenericError)?; @@ -9569,6 +9593,7 @@ fn requires_manual_firmware_upgrade( let is_gb200 = state .host_snapshot + .status .hardware_info .as_ref() .map(|hi| hi.is_gbx00()) @@ -10030,6 +10055,7 @@ async fn handle_boss_job_failure( PowerState::On => { let basetime = mh_snapshot .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) @@ -10166,8 +10192,8 @@ async fn restart_dpu( // We have seen the boot order be reset on DPUs in some edge cases (for example, after upgrading the BMC and CEC on BF3s) // This should take care of handling such cases. It is a no-op most of the time. // Skip for DPUs that get their BFB installed via redfish or DPF, they don't need to HTTP boot. 
- let redfish_install = - machine.bmc_info.supports_bfb_install() && services.site_config.dpu_enable_secure_boot; + let redfish_install = machine.status.bmc_info.supports_bfb_install() + && services.site_config.dpu_enable_secure_boot; if !redfish_install && !dpf_used_for_ingestion { let _ = dpu_redfish_client @@ -10198,6 +10224,7 @@ async fn needs_ipmi_restart( ctx: &mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, ) -> Result { let addr = machine + .status .bmc_info .ip_addr() .map_err(StateHandlerError::GenericError)?; @@ -10244,6 +10271,7 @@ async fn do_ipmi_restart( }); let bmc_mac = machine + .status .bmc_info .mac .ok_or_else(|| StateHandlerError::MissingData { @@ -10251,6 +10279,7 @@ async fn do_ipmi_restart( missing: "bmc_mac", })?; let ip: IpAddr = machine + .status .bmc_info .ip .ok_or_else(|| StateHandlerError::MissingData { @@ -10281,6 +10310,7 @@ pub async fn find_explored_refreshed_endpoint( ) -> Result, StateHandlerError> { let addr: IpAddr = state .host_snapshot + .status .bmc_info .ip_addr() .map_err(StateHandlerError::GenericError)?; @@ -10557,6 +10587,7 @@ async fn handle_instance_host_platform_config( // Wait for the power-down grace period before powering back on let basetime = mh_snapshot .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) @@ -11012,7 +11043,11 @@ async fn set_host_boot_order( e ); - let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() + let reboot_status = if mh_snapshot + .host_snapshot + .status + .last_reboot_requested + .is_none() { handler_host_power_control( mh_snapshot, @@ -11220,6 +11255,7 @@ async fn set_host_boot_order( // Wait for the BMC to come back online after reset before powering on let basetime = mh_snapshot .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) diff --git a/crates/machine-controller/src/handler/attestation.rs b/crates/machine-controller/src/handler/attestation.rs index 08dcebcfb1..5ae6eed724 100644 --- 
a/crates/machine-controller/src/handler/attestation.rs +++ b/crates/machine-controller/src/handler/attestation.rs @@ -307,7 +307,7 @@ pub(crate) async fn handle_spdm_trigger_state( let devices_scheduled = trigger_attestation( &services.db_pool, redfish_client, - &mh_snapshot.host_snapshot.bmc_info, + &mh_snapshot.host_snapshot.status.bmc_info, host_machine_id, std::time::Duration::MAX, ) diff --git a/crates/machine-controller/src/handler/bios_config.rs b/crates/machine-controller/src/handler/bios_config.rs index b94aaa5992..d734cfd0fe 100644 --- a/crates/machine-controller/src/handler/bios_config.rs +++ b/crates/machine-controller/src/handler/bios_config.rs @@ -117,7 +117,12 @@ pub(super) async fn configure_host_bios( // // As of July 2024, Josh Price said there's an NBU FR to fix // this, but it wasn't target to a release yet. - let reboot_status = if mh_snapshot.host_snapshot.last_reboot_requested.is_none() { + let reboot_status = if mh_snapshot + .host_snapshot + .status + .last_reboot_requested + .is_none() + { handler_host_power_control(mh_snapshot, ctx, SystemPowerControl::ForceRestart) .await?; @@ -310,6 +315,7 @@ pub(super) async fn advance_bios_config_job( if current_power_state != libredfish::PowerState::On { let basetime = mh_snapshot .host_snapshot + .status .last_reboot_requested .as_ref() .map(|x| x.time) diff --git a/crates/machine-controller/src/handler/dpf.rs b/crates/machine-controller/src/handler/dpf.rs index d513462ac5..5cda09b09b 100644 --- a/crates/machine-controller/src/handler/dpf.rs +++ b/crates/machine-controller/src/handler/dpf.rs @@ -43,7 +43,7 @@ fn dpf_error(error: DpfError) -> StateHandlerError { } fn bmc_ip(machine: &Machine) -> Result { - machine.bmc_info.ip.ok_or_else(|| { + machine.status.bmc_info.ip.ok_or_else(|| { StateHandlerError::GenericError(eyre::eyre!("BMC IP is not set for machine {}", machine.id)) }) } @@ -179,6 +179,7 @@ async fn create_and_register_dpudevices_and_dpunode( ) -> Result<(), StateHandlerError> { let 
primary_dpu_id = state .host_snapshot + .status .interfaces .iter() .find(|iface| iface.primary_interface) @@ -190,6 +191,7 @@ async fn create_and_register_dpudevices_and_dpunode( for dpu in &state.dpu_snapshots { let serial_number = dpu + .status .hardware_info .as_ref() .and_then(|x| x.dmi_data.as_ref()) @@ -274,6 +276,7 @@ async fn handle_dpf_reboot( ) -> Result, StateHandlerError> { let reboot_already_requested = state .host_snapshot + .status .last_reboot_requested .as_ref() .is_some_and(|r| r.time > state.host_snapshot.state.version.timestamp()); diff --git a/crates/machine-controller/src/handler/machine_validation.rs b/crates/machine-controller/src/handler/machine_validation.rs index 79f975d0cd..24d2c48d0a 100644 --- a/crates/machine-controller/src/handler/machine_validation.rs +++ b/crates/machine-controller/src/handler/machine_validation.rs @@ -137,7 +137,7 @@ pub(crate) async fn handle_machine_validation_state( } // Host validation completed if machine_validation_completed(&mh_snapshot.host_snapshot) { - if mh_snapshot.host_snapshot.failure_details.cause == FailureCause::NoError { + if mh_snapshot.host_snapshot.status.failure_details.cause == FailureCause::NoError { tracing::info!( "{} machine validation completed", mh_snapshot.host_snapshot.id @@ -166,7 +166,7 @@ pub(crate) async fn handle_machine_validation_state( } else { tracing::info!("{} machine validation failed", mh_snapshot.host_snapshot.id); return Ok(StateHandlerOutcome::transition(ManagedHostState::Failed { - details: mh_snapshot.host_snapshot.failure_details.clone(), + details: mh_snapshot.host_snapshot.status.failure_details.clone(), machine_id: mh_snapshot.host_snapshot.id, retry_count: 0, })); diff --git a/crates/machine-controller/src/handler/power.rs b/crates/machine-controller/src/handler/power.rs index d66493f240..83e4e7c3a1 100644 --- a/crates/machine-controller/src/handler/power.rs +++ b/crates/machine-controller/src/handler/power.rs @@ -41,7 +41,7 @@ pub async fn handle_power( ctx: 
&mut StateHandlerContext<'_, MachineStateHandlerContextObjects>, power_options_config: &PowerOptionConfig, ) -> Result { - if let Some(power_options) = &mh_snapshot.host_snapshot.power_options { + if let Some(power_options) = &mh_snapshot.host_snapshot.status.power_options { match power_options.desired_power_state { model::power_manager::PowerState::On => { handle_power_desired_on(power_options, mh_snapshot, ctx, power_options_config).await diff --git a/crates/machine-controller/src/handler/sku.rs b/crates/machine-controller/src/handler/sku.rs index 64aa02ef6f..2b40b95d71 100644 --- a/crates/machine-controller/src/handler/sku.rs +++ b/crates/machine-controller/src/handler/sku.rs @@ -72,7 +72,7 @@ async fn match_sku_for_machine( host_handler_params: &HostHandlerParams, mh_snapshot: &ManagedHostStateSnapshot, ) -> Result, StateHandlerError> { - let sku_status = mh_snapshot.host_snapshot.hw_sku_status.as_ref(); + let sku_status = mh_snapshot.host_snapshot.status.hw_sku.as_ref(); if sku_status.is_none() || sku_status.is_some_and(|ss| { ss.last_match_attempt.is_some_and(|t| { @@ -111,7 +111,7 @@ async fn generate_missing_sku_for_machine( }; // its unlikely we got here without a bmc mac - let Some(bmc_mac_address) = mh_snapshot.host_snapshot.bmc_info.mac else { + let Some(bmc_mac_address) = mh_snapshot.host_snapshot.status.bmc_info.mac else { tracing::debug!("No bmc mac for machine {}", mh_snapshot.host_snapshot.id); return false; }; @@ -127,7 +127,7 @@ async fn generate_missing_sku_for_machine( return false; } - let sku_status = mh_snapshot.host_snapshot.hw_sku_status.as_ref(); + let sku_status = mh_snapshot.host_snapshot.status.hw_sku.as_ref(); if sku_status.is_some_and(|ss| { ss.last_generate_attempt.is_some_and(|t| { t > (Utc::now() @@ -254,7 +254,8 @@ pub(crate) async fn handle_bom_validation_requested( // If there is a request for verification pending, update the inventory regardless of other configs if let Some(verify_request_time) = mh_snapshot .host_snapshot - 
.hw_sku_status + .status + .hw_sku .as_ref() .and_then(|ss| ss.verify_request_time) && verify_request_time > mh_snapshot.host_snapshot.state.version.timestamp() @@ -466,7 +467,7 @@ pub(crate) async fn handle_bom_validation_state( BomValidating::UpdatingInventory(bom_validating_context) => { if !discovered_after_state_transition( mh_snapshot.host_snapshot.state.version, - mh_snapshot.host_snapshot.last_discovery_time, + mh_snapshot.host_snapshot.status.last_discovery_time, ) { match trigger_reboot_if_needed( &mh_snapshot.host_snapshot, @@ -598,7 +599,8 @@ pub(crate) async fn handle_bom_validation_state( ) } else if mh_snapshot .host_snapshot - .hw_sku_status + .status + .hw_sku .as_ref() .is_some_and(|ss| { ss.verify_request_time.is_some_and(|t| { diff --git a/crates/nvlink-manager/src/lib.rs b/crates/nvlink-manager/src/lib.rs index 821436b019..01b5c32a4e 100644 --- a/crates/nvlink-manager/src/lib.rs +++ b/crates/nvlink-manager/src/lib.rs @@ -131,7 +131,7 @@ fn build_machine_nvlink_info_from_nmx_c_hello( } if let Some(snapshot_info) = - snapshot.and_then(|snapshot| snapshot.host_snapshot.nvlink_info.as_ref()) + snapshot.and_then(|snapshot| snapshot.host_snapshot.status.nvlink_info.as_ref()) { return MachineNvLinkInfo { domain_uuid, @@ -145,7 +145,7 @@ fn build_machine_nvlink_info_from_nmx_c_hello( } let gpus = snapshot - .and_then(|snapshot| snapshot.host_snapshot.hardware_info.as_ref()) + .and_then(|snapshot| snapshot.host_snapshot.status.hardware_info.as_ref()) .map(nvlink_gpus_from_hardware_info) .unwrap_or_default(); @@ -156,7 +156,7 @@ fn build_machine_nvlink_info_from_nmx_c_hello( } } -/// Populates missing `machines.nvlink_info` entries (or nil `domain_uuid`) using NMX-C hello. +/// Populates missing `machines.status.nvlink_info` entries (or nil `domain_uuid`) using NMX-C hello. 
fn populate_machine_nvlink_info_if_needed( machine_nvlink_info: &mut HashMap>, managed_host_snapshots: &HashMap, @@ -1029,7 +1029,7 @@ impl NvlPartitionMonitor { > = managed_host_snapshots.iter().fold( HashMap::new(), |mut acc, (_machine_id, snapshot)| { - if let Some(nvlink_info) = snapshot.host_snapshot.nvlink_info.as_ref() { + if let Some(nvlink_info) = snapshot.host_snapshot.status.nvlink_info.as_ref() { let serial = nvlink_info.chassis_serial.trim(); if !serial.is_empty() { acc.entry(serial.to_string()).or_default().push(snapshot); @@ -1184,7 +1184,7 @@ impl NvlPartitionMonitor { ); for (machine_id, nvlink_info) in &nvlink_info_db_updates { if let Some(snapshot) = managed_host_snapshots_domain.get_mut(machine_id) { - snapshot.host_snapshot.nvlink_info = Some(nvlink_info.clone()); + snapshot.host_snapshot.status.nvlink_info = Some(nvlink_info.clone()); } } } @@ -1741,7 +1741,7 @@ impl NvlPartitionMonitor { return Ok(()); } - if let Some(nvlink_info) = &mh.host_snapshot.nvlink_info { + if let Some(nvlink_info) = &mh.host_snapshot.status.nvlink_info { for gpu in &nvlink_info.gpus { let nmxc_partition = match partition_ctx.gpu_to_partition_map.get(&gpu.guid) { // GPU is in a partition, so we need to remove it from the partition. diff --git a/crates/rpc-utils/src/managed_host_display.rs b/crates/rpc-utils/src/managed_host_display.rs index 4a1b671e18..397e69c622 100644 --- a/crates/rpc-utils/src/managed_host_display.rs +++ b/crates/rpc-utils/src/managed_host_display.rs @@ -15,6 +15,10 @@ * limitations under the License. */ +// The deprecated flat fields on `rpc::forge::Machine` must still be read here for +// backwards-compat until a follow-up PR migrates this crate to the new config/status sub-messages. 
+#![allow(deprecated)] + use std::collections::{BTreeMap, HashMap}; use std::fmt::Display; use std::sync::Arc; diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index 6be0fa6776..6349dbf682 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -258,6 +258,8 @@ fn main() -> Result<(), Box> { ) .type_attribute("forge.InstanceList", "#[derive(serde::Serialize)]") .type_attribute("forge.Machine", "#[derive(serde::Serialize)]") + .type_attribute("forge.MachineConfig", "#[derive(serde::Serialize)]") + .type_attribute("forge.MachineStatus", "#[derive(serde::Serialize)]") .type_attribute( "forge.MachineCapabilitiesSet", "#[derive(serde::Serialize)]", diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index e1744d3579..b5ad2d7006 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -3486,6 +3486,49 @@ message SwitchNvosInfo { optional uint32 port = 3; } +message MachineConfig { + // Maintenance annotation set by the operator. + optional string maintenance_reference = 1; + optional google.protobuf.Timestamp maintenance_start_time = 2; + + // Override to enable or disable firmware auto-update. + optional bool firmware_autoupdate = 3; + + // The instance type with which this machine is associated, if any. + optional string instance_type_id = 4; + + // DPF configuration for this machine. + optional DpfMachineState dpf = 5; +} + +message MachineStatus { + // Deprecated Machine.interfaces is superseded by this field. 
+ repeated MachineInterface interfaces = 1; + optional machine_discovery.DiscoveryInfo discovery_info = 2; + google.protobuf.Timestamp last_reboot_time = 3; + google.protobuf.Timestamp last_observation_time = 4; + optional common.MachineId associated_host_machine_id = 5; + repeated common.MachineId associated_dpu_machine_ids = 6; + optional google.protobuf.Timestamp last_reboot_requested_time = 7; + optional string last_reboot_requested_mode = 8; + optional string dpu_agent_version = 9; + health.HealthReport health = 10; + repeated HealthSourceOrigin health_sources = 11; + optional string failure_details = 12; + optional InfinibandStatusObservation infiniband = 13; + MachineCapabilitiesSet capabilities = 14; + optional SkuStatus hw_sku = 15; + optional ManagedHostQuarantineState quarantine_state = 16; + optional string hw_sku_device_type = 17; + bool update_complete = 18; + optional MachineNVLinkInfo nvlink_info = 19; + optional MachineNVLinkStatusObservation nvlink_status_observation = 20; + optional MachineSpxStatusObservation spx_status_observation = 21; + optional string last_scout_observed_version = 22; + optional InstanceNetworkRestrictions instance_network_restrictions = 23; + LifecycleStatus lifecycle = 24; +} + message Machine { // Uniquely identifies a Forge machine. // The value of this field is globally unique. @@ -3509,53 +3552,89 @@ message Machine { // Note: This is not yet removed in this change to limit the amount of breaking // changes at once - but users shouldn't rely on it. The actually interesting // information around interfaces for Instance users is in `Instance::InstanceNetworkStatus`. - repeated MachineInterface interfaces = 9; + // Deprecated: use status.interfaces + // TODO: change to reserved once rest-api uses MachineStatus + repeated MachineInterface interfaces = 9 [deprecated = true]; // TODO: This field might actually move Instance since the discovered hardware // could be different per lifecycle. 
When the machine is updated with different // hardware and restarted, the data can change. The Instance would have the // capability to store the actual discovery information for each lifecycle - optional machine_discovery.DiscoveryInfo discovery_info = 10; + // Deprecated: use status.discovery_info + // TODO: change to reserved once rest-api uses MachineStatus + optional machine_discovery.DiscoveryInfo discovery_info = 10 [deprecated = true]; - // Machine type if it is a DPU or HOST. + // Machine type (DPU or HOST). Set at ingestion time; not operator-mutable. MachineType machine_type = 11; + // BMC connection details. Discovered at ingestion time; not operator-mutable. BmcInfo bmc_info = 12; - google.protobuf.Timestamp last_reboot_time = 14; + // Deprecated: use status.last_reboot_time + // TODO: change to reserved once rest-api uses MachineStatus + google.protobuf.Timestamp last_reboot_time = 14 [deprecated = true]; // This is used by legacy versions of forge-admin-cli as well as by Forge Cloud reserved 15; // Was NetworkHealth network_health = 15; - google.protobuf.Timestamp last_observation_time = 16; - optional string maintenance_reference = 17; - optional google.protobuf.Timestamp maintenance_start_time = 18; + // Deprecated: use status.last_observation_time + // TODO: change to reserved once rest-api uses MachineStatus + google.protobuf.Timestamp last_observation_time = 16 [deprecated = true]; + + // Deprecated: use config.maintenance_reference + // TODO: change to reserved once rest-api uses MachineConfig + optional string maintenance_reference = 17 [deprecated = true]; + + // Deprecated: use config.maintenance_start_time + // TODO: change to reserved once rest-api uses MachineConfig + optional google.protobuf.Timestamp maintenance_start_time = 18 [deprecated = true]; // Other machine ids associated with this machine - optional common.MachineId associated_host_machine_id = 19; + // Deprecated: use status.associated_host_machine_id + // TODO: change to 
reserved once rest-api uses MachineStatus + optional common.MachineId associated_host_machine_id = 19 [deprecated = true]; reserved 20; /* previously: optional common.MachineId associated_dpu_machine_id = 20; */ + // Hardware inventory (NIC firmware, etc.). Discovered at ingestion time; not operator-mutable. optional MachineInventory inventory = 21; - optional google.protobuf.Timestamp last_reboot_requested_time = 22; - optional string last_reboot_requested_mode = 23; + // Deprecated: use status.last_reboot_requested_time + // TODO: change to reserved once rest-api uses MachineStatus + optional google.protobuf.Timestamp last_reboot_requested_time = 22 [deprecated = true]; + // Deprecated: use status.last_reboot_requested_mode + // TODO: change to reserved once rest-api uses MachineStatus + optional string last_reboot_requested_mode = 23 [deprecated = true]; - optional string dpu_agent_version = 24; + // Deprecated: use status.dpu_agent_version + // TODO: change to reserved once rest-api uses MachineStatus + optional string dpu_agent_version = 24 [deprecated = true]; // Fields are not ordered numerically, check the whole message for max field number - repeated common.MachineId associated_dpu_machine_ids = 26; + // Deprecated: use status.associated_dpu_machine_ids + // TODO: change to reserved once rest-api uses MachineStatus + repeated common.MachineId associated_dpu_machine_ids = 26 [deprecated = true]; // For Host Machines returns the aggregate health of the host // For DPUs, it returns the individual DPU health - health.HealthReport health = 27; + // Deprecated: use status.health + // TODO: change to reserved once rest-api uses MachineStatus + health.HealthReport health = 27 [deprecated = true]; + + // Deprecated: use config.firmware_autoupdate + // TODO: change to reserved once rest-api uses MachineConfig + optional bool firmware_autoupdate = 28 [deprecated = true]; // Health report sources (identified by a specific source name and mode) that are applied // to 
the aggregate health of this machine - repeated HealthSourceOrigin health_sources = 29; - - optional bool firmware_autoupdate = 28; + // Deprecated: use status.health_sources + // TODO: change to reserved once rest-api uses MachineStatus + repeated HealthSourceOrigin health_sources = 29 [deprecated = true]; - optional string failure_details = 30; + // Deprecated: use status.failure_details + // TODO: change to reserved once rest-api uses MachineStatus + optional string failure_details = 30 [deprecated = true]; // Infiniband devices information attached to the machine. - optional InfinibandStatusObservation ib_status = 32; + // Deprecated: use status.infiniband + // TODO: change to reserved once rest-api uses MachineStatus + optional InfinibandStatusObservation ib_status = 32 [deprecated = true]; // Metadata associated with the Machine Metadata metadata = 33; @@ -3565,38 +3644,68 @@ message Machine { string version = 34; // Restrictions on the network configuration for any instances that can be allocated on this machine. - optional InstanceNetworkRestrictions instance_network_restrictions = 35; + // Deprecated: use status.instance_network_restrictions + // TODO: change to reserved once rest-api uses MachineStatus + optional InstanceNetworkRestrictions instance_network_restrictions = 35 [deprecated = true]; // The instance type with which a machine is associated if any. - optional string instance_type_id = 36; + // Deprecated: use config.instance_type_id + // TODO: change to reserved once rest-api uses MachineConfig + optional string instance_type_id = 36 [deprecated = true]; // The known capabilities of a machine - MachineCapabilitiesSet capabilities = 37; + // Deprecated: use status.capabilities + // TODO: change to reserved once rest-api uses MachineStatus + MachineCapabilitiesSet capabilities = 37 [deprecated = true]; + // The declared desired hardware SKU (from expected machine record). + // Distinct from status.hw_sku which reflects observed match.
optional string hw_sku = 38; - optional SkuStatus hw_sku_status = 39; + // Deprecated: use status.hw_sku + // TODO: change to reserved once rest-api uses MachineStatus + optional SkuStatus hw_sku_status = 39 [deprecated = true]; - optional ManagedHostQuarantineState quarantine_state = 40; + // Deprecated: use status.quarantine_state + // TODO: change to reserved once rest-api uses MachineStatus + optional ManagedHostQuarantineState quarantine_state = 40 [deprecated = true]; - optional string hw_sku_device_type = 41; + // Deprecated: use status.hw_sku_device_type + // TODO: change to reserved once rest-api uses MachineStatus + optional string hw_sku_device_type = 41 [deprecated = true]; - bool update_complete = 42; + // Deprecated: use status.update_complete + // TODO: change to reserved once rest-api uses MachineStatus + bool update_complete = 42 [deprecated = true]; - optional MachineNVLinkInfo nvlink_info = 43; + // Deprecated: use status.nvlink_info + // TODO: change to reserved once rest-api uses MachineStatus + optional MachineNVLinkInfo nvlink_info = 43 [deprecated = true]; - optional MachineNVLinkStatusObservation nvlink_status_observation = 44; + // Deprecated: use status.nvlink_status_observation + // TODO: change to reserved once rest-api uses MachineStatus + optional MachineNVLinkStatusObservation nvlink_status_observation = 44 [deprecated = true]; - // The rack that this machine is associated with + // The rack this machine is assigned to (from expected machine record). optional common.RackId rack_id = 45; + // Slot and tray position within the rack (from expected machine record).
optional PlacementInRack placement_in_rack = 46; - optional MachineSpxStatusObservation spx_status_observation = 47; + // Deprecated: use status.spx_status_observation + // TODO: change to reserved once rest-api uses MachineStatus + optional MachineSpxStatusObservation spx_status_observation = 47 [deprecated = true]; // Build version of forge-scout last observed during machine discovery registration. - optional string last_scout_observed_version = 48; + // Deprecated: use status.last_scout_observed_version + // TODO: change to reserved once rest-api uses MachineStatus + optional string last_scout_observed_version = 48 [deprecated = true]; + + // Deprecated: use config.dpf + // TODO: change to reserved once rest-api uses MachineConfig + optional DpfMachineState dpf = 49 [deprecated = true]; - optional DpfMachineState dpf = 49; + MachineConfig config = 50; + MachineStatus status = 51; } message DpfMachineState { diff --git a/crates/rpc/src/model/machine/mod.rs b/crates/rpc/src/model/machine/mod.rs index b535072746..9c2ba0070f 100644 --- a/crates/rpc/src/model/machine/mod.rs +++ b/crates/rpc/src/model/machine/mod.rs @@ -121,10 +121,19 @@ impl RpcTryFrom for Option { reprovision_request, snapshot .host_snapshot + .status .infiniband_status_observation .as_ref(), - snapshot.host_snapshot.nvlink_status_observation.as_ref(), - snapshot.host_snapshot.spx_status_observation.as_ref(), + snapshot + .host_snapshot + .status + .nvlink_status_observation + .as_ref(), + snapshot + .host_snapshot + .status + .spx_status_observation + .as_ref(), &snapshot.host_snapshot.health_reports, )?; @@ -141,7 +150,7 @@ impl RpcTryFrom for Option { .version_string(), instance_type_id: instance.instance_type_id.map(|i| i.to_string()), metadata: Some(instance.metadata.into()), - tpm_ek_certificate: snapshot.host_snapshot.hardware_info.and_then(|hi| { + tpm_ek_certificate: snapshot.host_snapshot.status.hardware_info.and_then(|hi| { hi.tpm_ek_certificate .map(|cert| 
BASE64_STANDARD.encode(cert.into_bytes())) }), @@ -154,8 +163,8 @@ impl From for rpc::forge::dpf_state_response::DpfState { fn from(value: Machine) -> Self { Self { machine_id: value.id.into(), - enabled: value.dpf.enabled, - used_for_ingestion: value.dpf.used_for_ingestion, + enabled: value.config.dpf.enabled, + used_for_ingestion: value.config.dpf.used_for_ingestion, } } } @@ -187,6 +196,9 @@ impl From for rpc::forge::DpfMachineState { } } +// The deprecated flat fields on `rpc::forge::Machine` must still be populated here for +// backwards-compat until a follow-up PR migrates callers to the new config/status sub-messages. +#[allow(deprecated)] impl From for rpc::forge::Machine { fn from(mut machine: Machine) -> Self { let health = match machine.is_dpu() { @@ -233,7 +245,7 @@ impl From for rpc::forge::Machine { }; let dpf = if !machine.is_dpu() { - Some(machine.dpf.clone().into()) + Some(machine.config.dpf.clone().into()) } else { // Dpf state is stored in host. None @@ -242,20 +254,164 @@ impl From for rpc::forge::Machine { let associated_dpu_machine_ids = machine.associated_dpu_machine_ids(); let instance_network_restrictions = Some(machine_instance_network_restrictions(&machine)); + // -- Pre-compute values shared between the new sub-messages and the deprecated flat fields. + // Each field appears once here, then is cloned into whichever site needs an owned copy. 
+ + let capabilities = machine.to_capabilities().map(|mut c| { + c.sort(); + c.into() + }); + + let interfaces_rpc: Vec = machine + .status + .interfaces + .iter() + .cloned() + .map(|i| i.into()) + .collect(); + + let discovery_info = + machine.status.hardware_info.as_ref().and_then(|hw_info| { + match hw_info.clone().try_into() { + Ok(di) => Some(di), + Err(e) => { + tracing::warn!( + machine_id = %machine.id, + error = %e, + "Hardware information couldn't be parsed into discovery info", + ); + None + } + } + }); + + let failure_details: Option = + if machine.status.failure_details.cause != FailureCause::NoError { + Some(machine.status.failure_details.to_string()) + } else { + None + }; + + let ib_status = Some( + machine + .status + .infiniband_status_observation + .clone() + .map(|s| s.into()) + .unwrap_or_default(), + ); + + let placement_in_rack = Some(rpc::forge::PlacementInRack { + slot_number: machine.status.slot_number, + tray_index: machine.status.tray_index, + }); + + // Pre-compute lifecycle state fields shared between status.lifecycle and the flat + // Machine aliases (state, state_version, state_reason, state_sla). 
+ let rpc_state = if machine.is_dpu() { + machine.state.value.dpu_state_string(&machine.id) + } else { + machine.state.value.to_string() + }; + let rpc_state_version = machine.state.version.version_string(); + let rpc_state_reason: Option = + machine.controller_state_outcome.map(Into::into); + + let quarantine_state = machine + .network_config + .quarantine_state + .take() + .map(Into::into); + + let health_sources: Vec = machine + .health_reports + .clone() + .into_iter() + .map(|(hr, m)| rpc::forge::HealthSourceOrigin { + mode: m as i32, + source: hr.source, + }) + .collect(); + + let last_observation_time = machine + .network_status_observation + .as_ref() + .map(|obs| obs.observed_at.into()); + + let dpu_agent_version = machine + .network_status_observation + .take() + .and_then(|obs| obs.agent_version); + + // -- Build the new structured config sub-message -- + let config_msg = rpc::forge::MachineConfig { + maintenance_reference: maintenance_reference.clone(), + maintenance_start_time: maintenance_start_time.map(rpc::Timestamp::from), + firmware_autoupdate: machine.config.firmware_autoupdate, + instance_type_id: machine + .config + .instance_type_id + .as_ref() + .map(|i| i.to_string()), + dpf, + }; + + // -- Build the new structured status sub-message -- + let status_msg = rpc::forge::MachineStatus { + interfaces: interfaces_rpc.clone(), + discovery_info: discovery_info.clone(), + last_reboot_time: machine.status.last_reboot_time.map(|t| t.into()), + last_observation_time, + associated_host_machine_id: None, // Gets filled in the `ManagedHostStateSnapshot` conversion + associated_dpu_machine_ids: associated_dpu_machine_ids.clone(), + last_reboot_requested_time: machine + .status + .last_reboot_requested + .as_ref() + .map(|x| x.time.into()), + last_reboot_requested_mode: machine + .status + .last_reboot_requested + .as_ref() + .map(|x| x.mode.to_string()), + dpu_agent_version: dpu_agent_version.clone(), + health: Some(health.clone().into()), + health_sources, 
+ failure_details: failure_details.clone(), + infiniband: ib_status.clone(), + capabilities: capabilities.clone(), + hw_sku: machine.status.hw_sku.clone().map(|s| s.into()), + quarantine_state: quarantine_state.clone(), + hw_sku_device_type: machine.status.hw_sku_device_type.clone(), + update_complete: machine.status.update_complete, + nvlink_info: machine.status.nvlink_info.clone().map(|i| i.into()), + nvlink_status_observation: machine + .status + .nvlink_status_observation + .clone() + .map(|s| s.into()), + spx_status_observation: machine + .status + .spx_status_observation + .clone() + .map(|s| s.into()), + last_scout_observed_version: machine.status.last_scout_observed_version.clone(), + instance_network_restrictions: instance_network_restrictions.clone(), + lifecycle: Some(rpc::forge::LifecycleStatus { + state: rpc_state.clone(), + version: rpc_state_version.clone(), + state_reason: rpc_state_reason.clone(), + sla: None, // calculated at RPC handler, see ManagedHostStateSnapshot::rpc_machine_state + }), + }; + rpc::Machine { id: Some(machine.id), rack_id: machine.rack_id.clone(), - state: if machine.is_dpu() { - machine.state.value.dpu_state_string(&machine.id) - } else { - machine.state.value.to_string() - }, - capabilities: machine.to_capabilities().map(|mut c| { - c.sort(); - c.into() - }), - instance_type_id: machine.instance_type_id.map(|i| i.to_string()), - state_version: machine.state.version.version_string(), + state: rpc_state, + capabilities, + instance_type_id: machine.config.instance_type_id.map(|i| i.to_string()), + state_version: rpc_state_version, // calculated at RPC handler, see ManagedHostStateSnapshot::rpc_machine_state state_sla: None, machine_type: *RpcMachineTypeWrapper::from(machine.id.machine_type()) as _, @@ -266,46 +422,29 @@ impl From for rpc::forge::Machine { .into_iter() .map(|event| event.into()) .collect(), - interfaces: machine - .interfaces - .into_iter() - .map(|interface| interface.into()) - .collect(), - discovery_info: 
machine - .hardware_info - .and_then(|hw_info| match hw_info.try_into() { - Ok(di) => Some(di), - Err(e) => { - tracing::warn!( - machine_id = %machine.id, - error = %e, - "Hardware information couldn't be parsed into discovery info", - ); - None - } - }), - bmc_info: Some(machine.bmc_info.into()), - last_reboot_time: machine.last_reboot_time.map(|t| t.into()), - last_observation_time: machine - .network_status_observation - .as_ref() - .map(|obs| obs.observed_at.into()), - dpu_agent_version: machine - .network_status_observation - .and_then(|obs| obs.agent_version), + interfaces: interfaces_rpc, + discovery_info, + bmc_info: Some(machine.status.bmc_info.into()), + last_reboot_time: machine.status.last_reboot_time.map(|t| t.into()), + last_observation_time, + dpu_agent_version, maintenance_reference, maintenance_start_time: maintenance_start_time.map(rpc::Timestamp::from), associated_host_machine_id: None, // Gets filled in the `ManagedHostStateSnapshot` conversion associated_dpu_machine_ids, - inventory: Some(machine.inventory.unwrap_or_default().into()), + inventory: Some(machine.status.inventory.unwrap_or_default().into()), last_reboot_requested_time: machine + .status .last_reboot_requested .as_ref() .map(|x| x.time.into()), - last_reboot_requested_mode: machine.last_reboot_requested.map(|x| x.mode.to_string()), - state_reason: machine.controller_state_outcome.map(|r| r.into()), + last_reboot_requested_mode: machine + .status + .last_reboot_requested + .map(|x| x.mode.to_string()), + state_reason: rpc_state_reason, health: Some(health.into()), - firmware_autoupdate: machine.firmware_autoupdate, + firmware_autoupdate: machine.config.firmware_autoupdate, health_sources: machine .health_reports .into_iter() @@ -314,39 +453,28 @@ impl From for rpc::forge::Machine { source: hr.source, }) .collect(), - failure_details: if machine.failure_details.cause != FailureCause::NoError { - Some(machine.failure_details.to_string()) - } else { - None - }, - ib_status: Some( - 
machine - .infiniband_status_observation - .take() - .map(|status| status.into()) - .unwrap_or_default(), - ), + failure_details, + ib_status, instance_network_restrictions, hw_sku: machine.hw_sku, - hw_sku_status: machine.hw_sku_status.map(|s| s.into()), - quarantine_state: machine - .network_config - .quarantine_state - .take() - .map(Into::into), - hw_sku_device_type: machine.hw_sku_device_type, - update_complete: machine.update_complete, - nvlink_info: machine.nvlink_info.map(|info| info.into()), + hw_sku_status: machine.status.hw_sku.map(|s| s.into()), + quarantine_state, + hw_sku_device_type: machine.status.hw_sku_device_type, + update_complete: machine.status.update_complete, + nvlink_info: machine.status.nvlink_info.map(|info| info.into()), nvlink_status_observation: machine + .status .nvlink_status_observation .map(|status| status.into()), - spx_status_observation: machine.spx_status_observation.map(|status| status.into()), - placement_in_rack: Some(rpc::forge::PlacementInRack { - slot_number: machine.slot_number, - tray_index: machine.tray_index, - }), - last_scout_observed_version: machine.last_scout_observed_version, + spx_status_observation: machine + .status + .spx_status_observation + .map(|status| status.into()), + placement_in_rack, + last_scout_observed_version: machine.status.last_scout_observed_version, dpf, + config: Some(config_msg), + status: Some(status_msg), } } } @@ -492,6 +620,10 @@ pub trait ManagedHostStateSnapshotRpc { impl ManagedHostStateSnapshotRpc for ManagedHostStateSnapshot { /// Creates an RPC Machine representation for either the Host or one of the DPUs + // The `health` and `associated_host_machine_id` flat fields are deprecated in favour of + // `status.health` / `status.associated_host_machine_id`, but must still be written here + // for backwards-compat until rest-api migrates to the new sub-messages. 
+ #[allow(deprecated)] fn rpc_machine_state( &self, dpu_machine_id: Option<&MachineId>, @@ -502,17 +634,22 @@ impl ManagedHostStateSnapshotRpc for ManagedHostStateSnapshot { let mut rpc_machine: rpc::forge::Machine = self.host_snapshot.clone().into(); let state = &self.host_snapshot.state.value; let version = &self.host_snapshot.state.version; + let sla: rpc::forge::StateSla = state_sla( + &self.host_snapshot.id, + state, + version, + &self.aggregate_health, + sla_config, + ) + .into(); rpc_machine.health = Some(self.aggregate_health.clone().into()); - rpc_machine.state_sla = Some( - state_sla( - &self.host_snapshot.id, - state, - version, - &self.aggregate_health, - sla_config, - ) - .into(), - ); + rpc_machine.state_sla = Some(sla); + if let Some(status) = rpc_machine.status.as_mut() { + status.health = Some(self.aggregate_health.clone().into()); + if let Some(lifecycle) = status.lifecycle.as_mut() { + lifecycle.sla = Some(sla); + } + } Some(rpc_machine) } Some(dpu_machine_id) => { @@ -521,18 +658,23 @@ impl ManagedHostStateSnapshotRpc for ManagedHostStateSnapshot { .iter() .find(|dpu| dpu.id == *dpu_machine_id)?; let mut rpc_machine: rpc::forge::Machine = dpu_snapshot.clone().into(); + let sla: rpc::forge::StateSla = state_sla( + &dpu_snapshot.id, + &dpu_snapshot.state.value, + &dpu_snapshot.state.version, + &self.aggregate_health, + sla_config, + ) + .into(); // In case the DPU does not know the associated Host - we can backfill the data here rpc_machine.associated_host_machine_id = Some(self.host_snapshot.id); - rpc_machine.state_sla = Some( - state_sla( - &dpu_snapshot.id, - &dpu_snapshot.state.value, - &dpu_snapshot.state.version, - &self.aggregate_health, - sla_config, - ) - .into(), - ); + rpc_machine.state_sla = Some(sla); + if let Some(status) = rpc_machine.status.as_mut() { + status.associated_host_machine_id = Some(self.host_snapshot.id); + if let Some(lifecycle) = status.lifecycle.as_mut() { + lifecycle.sla = Some(sla); + } + } Some(rpc_machine) } } 
@@ -543,6 +685,7 @@ fn machine_instance_network_restrictions( machine: &Machine, ) -> rpc::forge::InstanceNetworkRestrictions { let inband_interfaces = machine + .status .interfaces .iter() .filter(|i| matches!(i.network_segment_type, Some(NetworkSegmentType::HostInband))) diff --git a/crates/rvs/src/client/mod.rs b/crates/rvs/src/client/mod.rs index 4e777a1a36..e6a5631af3 100644 --- a/crates/rvs/src/client/mod.rs +++ b/crates/rvs/src/client/mod.rs @@ -1,3 +1,7 @@ +// The deprecated fields on `rpc::forge::Machine` must still be read here for +// backwards-compat. See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] + mod io; use std::collections::HashMap; diff --git a/crates/site-explorer/src/machine_creator.rs b/crates/site-explorer/src/machine_creator.rs index 95f14ef584..f2ee3fcf29 100644 --- a/crates/site-explorer/src/machine_creator.rs +++ b/crates/site-explorer/src/machine_creator.rs @@ -412,6 +412,7 @@ impl MachineCreator { // the same MAC address as this one, so something's weird here. Log this host's mac // addresses and the ones from the colliding hosts to help in diagnosis. let existing_macs = existing_machine + .status .hardware_info .as_ref() .map(|hw| hw.all_mac_addresses()) diff --git a/crates/site-explorer/tests/health_report.rs b/crates/site-explorer/tests/health_report.rs index de4680aef3..3289b4d831 100644 --- a/crates/site-explorer/tests/health_report.rs +++ b/crates/site-explorer/tests/health_report.rs @@ -15,6 +15,10 @@ * limitations under the License. */ +// The deprecated fields on `rpc::forge::Machine` must still be read here for +// backwards-compat. 
See https://github.com/NVIDIA/infra-controller/issues/2793 +#![allow(deprecated)] + use std::sync::Arc; use std::time::Duration; diff --git a/crates/site-explorer/tests/machine_creator.rs b/crates/site-explorer/tests/machine_creator.rs index 450d67794f..b2d99aa09b 100644 --- a/crates/site-explorer/tests/machine_creator.rs +++ b/crates/site-explorer/tests/machine_creator.rs @@ -473,11 +473,17 @@ async fn test_machine_creator_creates_managed_host( dpu_machine.current_state(), ); assert_eq!( - dpu_machine.hardware_info.as_ref().unwrap().machine_type, + dpu_machine + .status + .hardware_info + .as_ref() + .unwrap() + .machine_type, CpuArchitecture::Aarch64, ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -489,6 +495,7 @@ async fn test_machine_creator_creates_managed_host( ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -500,6 +507,7 @@ async fn test_machine_creator_creates_managed_host( ); assert_eq!( dpu_machine + .status .hardware_info .as_ref() .unwrap() @@ -523,7 +531,7 @@ async fn test_machine_creator_creates_managed_host( "expected DpuDiscoveringState, got {:?}", host_machine.current_state(), ); - assert!(host_machine.bmc_info.ip.is_some()); + assert!(host_machine.status.bmc_info.ip.is_some()); // 2nd creation does nothing. assert!( @@ -721,7 +729,7 @@ async fn test_machine_creator_creates_multi_dpu_managed_host( txn.commit().await?; } let hm = host_machine.clone().unwrap(); - assert!(hm.bmc_info.ip.is_some()); + assert!(hm.status.bmc_info.ip.is_some()); if host_machine_id.is_none() { host_machine_id = Some(hm.id); } @@ -994,10 +1002,10 @@ async fn test_machine_creator_creates_managed_host_with_dpf_disabled( for machine in machines { if machine.is_dpu() { // DPU has no expected-machine entry, so it always defaults to `true`. - assert!(machine.dpf.enabled); + assert!(machine.config.dpf.enabled); } else { // Host has expected-machine entry with `dpf_enabled: Some(false)`. 
- assert!(!machine.dpf.enabled); + assert!(!machine.config.dpf.enabled); } } @@ -1044,7 +1052,7 @@ async fn test_machine_creator_creates_managed_host_with_dpf_enabled( assert_eq!(machines.len(), 2); for machine in machines { - assert!(machine.dpf.enabled); + assert!(machine.config.dpf.enabled); } Ok(()) diff --git a/crates/site-explorer/tests/site_explorer.rs b/crates/site-explorer/tests/site_explorer.rs index c3eef3dee0..13a5e7c62d 100644 --- a/crates/site-explorer/tests/site_explorer.rs +++ b/crates/site-explorer/tests/site_explorer.rs @@ -2184,7 +2184,12 @@ async fn test_fallback_dpu_serial(pool: PgPool) -> Result<(), Box as AsRef>>::as_ref(&machines) .iter() - .any(|x| { x.bmc_info.ip.is_some_and(|ip| ip.to_string() == bmc_ip) }) + .any(|x| { + x.status + .bmc_info + .ip + .is_some_and(|ip| ip.to_string() == bmc_ip) + }) ); } Ok(()) @@ -2434,7 +2439,7 @@ async fn test_machine_creation_with_sku(pool: PgPool) -> Result<(), Box for forge::Machine { fn from(value: MockHost) -> Self { Self {