diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 9cbbdb79c76..a2578586dc4 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -34,16 +34,16 @@ use nexus_reconfigurator_simulation::{ }; use nexus_reconfigurator_simulation::{SimStateBuilder, SimTufRepoSource}; use nexus_reconfigurator_simulation::{SimTufRepoDescription, Simulator}; -use nexus_types::deployment::execution; +use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::execution::blueprint_external_dns_config; use nexus_types::deployment::execution::blueprint_internal_dns_config; use nexus_types::deployment::{Blueprint, UnstableReconfiguratorState}; use nexus_types::deployment::{BlueprintArtifactVersion, PendingMgsUpdate}; +use nexus_types::deployment::{BlueprintExpungedZoneAccessReason, execution}; use nexus_types::deployment::{ BlueprintHostPhase2DesiredContents, PlannerConfig, }; use nexus_types::deployment::{BlueprintSource, SledFilter}; -use nexus_types::deployment::{BlueprintZoneDisposition, ExpectedVersion}; use nexus_types::deployment::{ BlueprintZoneImageSource, PendingMgsUpdateDetails, }; @@ -2766,9 +2766,12 @@ fn sled_with_zone( ) -> anyhow::Result { let mut parent_sled_id = None; - for sled_id in builder.sled_ids_with_zones() { + for sled_id in builder.current_commissioned_sleds() { if builder - .current_sled_zones(sled_id, BlueprintZoneDisposition::any) + .current_in_service_and_expunged_sled_zones( + sled_id, + BlueprintExpungedZoneAccessReason::ReconfiguratorCli, + ) .any(|z| z.id == *zone_id) { parent_sled_id = Some(sled_id); diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 09ae6d1ebb8..c2ddf23c3c0 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -93,12 +93,8 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { let (image_source, nexus_generation) = commissioned_sled_ids .iter() .find_map(|&sled_id| { - builder - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .find_map(|zone| { + builder.current_in_service_sled_zones(sled_id).find_map( + |zone| { if let BlueprintZoneType::Nexus( blueprint_zone_type::Nexus { nexus_generation, @@ -113,7 +109,8 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { } else { None } - }) + }, + ) }) .context( "could not find in-service Nexus in parent blueprint", diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index a9232b975dd..9fc7523b5a2 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -3111,7 +3111,6 @@ mod tests { use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::ExpectedActiveRotSlot; use nexus_types::deployment::PendingMgsUpdate; @@ -3532,10 +3531,7 @@ mod tests { // Take the first two zones and set their image sources. { let zone_ids: Vec = builder - .current_sled_zones( - new_sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(new_sled_id) .map(|zone| zone.id) .take(2) .collect(); @@ -3577,7 +3573,8 @@ mod tests { // 3. both slots set to a known version // 4. 
slot_a set to a known version; slot b set to an unknown version { - let sled_ids = builder.sled_ids_with_zones().collect::>(); + let sled_ids = + builder.current_commissioned_sleds().collect::>(); assert!(sled_ids.len() >= 4, "at least 4 sleds"); let host_phase_2_samples = [ diff --git a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs index a8c87f3be9f..c71a1e886f7 100644 --- a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -1503,7 +1503,7 @@ mod tests { .expect("created builder"); let to_expunge = builder - .current_zones(BlueprintZoneDisposition::is_in_service) + .current_in_service_zones() .filter_map(|(sled_id, zone)| { if zone.zone_type.is_external_dns() { Some((sled_id, zone.id)) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index a75cac0d324..895626882cd 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -26,6 +26,7 @@ use itertools::Either; use nexus_inventory::now_db_precision; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; use nexus_types::deployment::BlueprintPhysicalDiskConfig; @@ -110,13 +111,10 @@ pub enum Error { #[error("no Boundary NTP zones exist in parent blueprint")] NoBoundaryNtpZonesInParentBlueprint, #[error( - "invariant violation: found decommissioned sled with \ - {num_zones} non-expunged zones: {sled_id}" + "invariant violation: commissioned sled missing from planning input's \ + list of sleds: {sled_id}" )] - DecommissionedSledWithNonExpungedZones { - sled_id: SledUuid, - num_zones: usize, - }, + CommissionedSledMissingFromInput { sled_id: SledUuid }, #[error("programming error in planner")] Planner(#[source] anyhow::Error), #[error("error editing sled {sled_id}")] @@ -704,10 +702,11 @@ impl<'a> BlueprintBuilder<'a> { // Compute the "in use" subnets; this includes all in-service internal // DNS zones _and_ any "expunged but not yet confirmed to be gone" - // zones, so we use the somewhat unusual `could_be_running` filter - // instead of the more typical `is_in_service`. + // zones, so we use the somewhat unusual + // `current_could_be_running_zones()` instead of the more typical + // `current_in_service_zones()`. let internal_dns_subnets_in_use = self - .current_zones(BlueprintZoneDisposition::could_be_running) + .current_could_be_running_zones() .filter_map(|(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::InternalDns(internal_dns) => { Some(DnsSubnet::from_addr(*internal_dns.dns_address.ip())) @@ -721,51 +720,129 @@ impl<'a> BlueprintBuilder<'a> { })) } - /// Iterates over the list of sled IDs for which we have zones. + /// Iterate over the sled IDs of all commissioned sleds known to this + /// builder. /// - /// This may include decommissioned sleds. 
- pub fn sled_ids_with_zones(&self) -> impl Iterator + '_ { - self.sled_editors.keys().copied() + /// This will include: + /// + /// * All commissioned sleds present in the parent blueprint that have not + /// been decommissioned by this builder + /// * Any sleds that were added by this builder + pub fn current_commissioned_sleds( + &self, + ) -> impl Iterator + '_ { + self.sled_editors.iter().filter_map(|(sled_id, editor)| { + match editor.state() { + SledState::Active => Some(*sled_id), + SledState::Decommissioned => None, + } + }) } - /// Iterates over all zones on a sled. + /// Iterate over the in-service [`BlueprintZoneConfig`] instances on a + /// particular sled. /// /// This will include both zones from the parent blueprint, as well /// as the changes made within this builder. - pub fn current_sled_zones( + pub fn current_in_service_sled_zones( &self, sled_id: SledUuid, - filter: F, - ) -> impl Iterator - where - F: FnMut(BlueprintZoneDisposition) -> bool, - { + ) -> impl Iterator { let Some(editor) = self.sled_editors.get(&sled_id) else { return Either::Left(iter::empty()); }; - Either::Right(editor.zones(filter)) + Either::Right(editor.in_service_zones()) } - /// Iterates over all zones on all sleds. + /// Iterate over the in-service [`BlueprintZoneConfig`] instances on all + /// commissioned sleds. /// - /// Acts like a combination of [`Self::sled_ids_with_zones`] and - /// [`Self::current_sled_zones`]. + /// This will include both zones from the parent blueprint, as well + /// as the changes made within this builder. + pub fn current_in_service_zones( + &self, + ) -> impl Iterator { + self.current_commissioned_sleds().flat_map(|sled_id| { + self.current_in_service_sled_zones(sled_id) + .map(move |zone| (sled_id, zone)) + }) + } + + /// Iterate over the expunged [`BlueprintZoneConfig`] instances on a + /// particular sled. /// /// This will include both zones from the parent blueprint, as well /// as the changes made within this builder. - pub fn current_zones( - &'a self, - filter: F, - ) -> impl Iterator - where - F: FnMut(BlueprintZoneDisposition) -> bool + Clone, - { - self.sled_ids_with_zones().flat_map(move |sled_id| { - self.current_sled_zones(sled_id, filter.clone()) - .map(move |config| (sled_id, config)) + /// + /// Like [`Blueprint::expunged_zones()`], callers must specify a + /// [`BlueprintExpungedZoneAccessReason`]. This allows us to statically + /// track all uses of expunged zones, each of which we must account for in + /// the planner's logic to permanently prune expunged zones from the + /// blueprint. + pub fn current_expunged_sled_zones( + &self, + sled_id: SledUuid, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + let Some(editor) = self.sled_editors.get(&sled_id) else { + return Either::Left(iter::empty()); + }; + Either::Right(editor.expunged_zones(reason)) + } + + /// Iterate over the [`BlueprintZoneConfig`] instances on a particular sled + /// that could be running (i.e., are in service or are expunged but not yet + /// ready for cleanup). + /// + /// This will include both zones from the parent blueprint, as well + /// as the changes made within this builder. 
+ pub fn current_could_be_running_sled_zones( + &self, + sled_id: SledUuid, + ) -> impl Iterator { + let Some(editor) = self.sled_editors.get(&sled_id) else { + return Either::Left(iter::empty()); + }; + Either::Right(editor.could_be_running_zones()) + } + + /// Iterate over the [`BlueprintZoneConfig`] instances on all commissioned + /// sleds that could be running (i.e., are in service or are expunged but + /// not yet ready for cleanup). + /// + /// This will include both zones from the parent blueprint, as well + /// as the changes made within this builder. + pub fn current_could_be_running_zones( + &self, + ) -> impl Iterator { + self.current_commissioned_sleds().flat_map(|sled_id| { + self.current_could_be_running_sled_zones(sled_id) + .map(move |zone| (sled_id, zone)) }) } + /// Iterate all the [`BlueprintZoneConfig`] instances on a + /// particular sled, regarless of their disposition. + /// + /// This will include both zones from the parent blueprint, as well + /// as the changes made within this builder. + /// + /// Like [`Blueprint::expunged_zones()`], callers must specify a + /// [`BlueprintExpungedZoneAccessReason`]. This allows us to statically + /// track all uses of expunged zones, each of which we must account for in + /// the planner's logic to permanently prune expunged zones from the + /// blueprint. + pub fn current_in_service_and_expunged_sled_zones( + &self, + sled_id: SledUuid, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + let Some(editor) = self.sled_editors.get(&sled_id) else { + return Either::Left(iter::empty()); + }; + Either::Right(editor.all_in_service_and_expunged_zones(reason)) + } + pub fn current_sled_disks( &self, sled_id: SledUuid, @@ -961,9 +1038,13 @@ impl<'a> BlueprintBuilder<'a> { // Expunging a disk expunges any datasets and zones that depend on it, // so expunging all in-service disks should have also expunged all - // datasets and zones. Double-check that that's true. + // datasets and zones. Double-check that that's true, and grab all the + // expunged zones so we can immediately mark them as "ready for + // cleanup". (The sled is expunged, so it can't be running the zone!) let mut zones_ready_for_cleanup = Vec::new(); - for zone in editor.zones(BlueprintZoneDisposition::any) { + for zone in editor.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::PlannerCheckReadyForCleanup, + ) { match zone.disposition { BlueprintZoneDisposition::Expunged { .. 
} => { // Since this is a full sled expungement, we'll never see an @@ -1042,7 +1123,7 @@ impl<'a> BlueprintBuilder<'a> { let mut zones_to_expunge = Vec::new(); - for zone in editor.zones(BlueprintZoneDisposition::is_in_service) { + for zone in editor.in_service_zones() { if zone.zone_type.is_clickhouse_keeper() || zone.zone_type.is_clickhouse_server() { @@ -1085,7 +1166,7 @@ impl<'a> BlueprintBuilder<'a> { let mut zones_to_expunge = Vec::new(); - for zone in editor.zones(BlueprintZoneDisposition::is_in_service) { + for zone in editor.in_service_zones() { if zone.zone_type.is_clickhouse() { zones_to_expunge.push(zone.id); } @@ -1277,10 +1358,7 @@ impl<'a> BlueprintBuilder<'a> { fn next_internal_dns_gz_address_index(&self, sled_id: SledUuid) -> u32 { let used_internal_dns_gz_address_indices = self - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .filter_map(|z| match z.zone_type { BlueprintZoneType::InternalDns( blueprint_zone_type::InternalDns { @@ -1390,9 +1468,7 @@ impl<'a> BlueprintBuilder<'a> { "tried to ensure NTP zone for unknown sled {sled_id}" )) })?; - editor - .zones(BlueprintZoneDisposition::is_in_service) - .any(|z| z.zone_type.is_ntp()) + editor.in_service_zones().any(|z| z.zone_type.is_ntp()) }; if has_ntp { return Ok(Ensure::NotNeeded); @@ -1434,17 +1510,16 @@ impl<'a> BlueprintBuilder<'a> { })?; // If this sled already has a Crucible zone on this pool, do nothing. - let has_crucible_on_this_pool = - editor.zones(BlueprintZoneDisposition::is_in_service).any(|z| { - matches!( - &z.zone_type, - BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { - dataset, - .. - }) - if dataset.pool_name == pool_name - ) - }); + let has_crucible_on_this_pool = editor.in_service_zones().any(|z| { + matches!( + &z.zone_type, + BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { + dataset, + .. + }) + if dataset.pool_name == pool_name + ) + }); if has_crucible_on_this_pool { return Ok(Ensure::NotNeeded); } @@ -1788,9 +1863,8 @@ impl<'a> BlueprintBuilder<'a> { })?; // Ensure we have no other in-service NTP zones. - if let Some(in_service_ntp_zone) = editor - .zones(BlueprintZoneDisposition::is_in_service) - .find(|zone| zone.zone_type.is_ntp()) + if let Some(in_service_ntp_zone) = + editor.in_service_zones().find(|zone| zone.zone_type.is_ntp()) { return Err(Error::Planner(anyhow!( "attempted to add boundary NTP zone to sled {sled_id} which \ @@ -1881,9 +1955,8 @@ impl<'a> BlueprintBuilder<'a> { })?; // Find the internal NTP zone and expunge it. - let mut internal_ntp_zone_id_iter = editor - .zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|zone| { + let mut internal_ntp_zone_id_iter = + editor.in_service_zones().filter_map(|zone| { if matches!(zone.zone_type, BlueprintZoneType::InternalNtp(_)) { Some(zone.id) } else { @@ -2095,10 +2168,7 @@ impl<'a> BlueprintBuilder<'a> { // up a set of invalid zpools for this sled/kind pair. 
let mut skip_zpools = BTreeSet::new(); for zone_config in self - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .filter(|z| z.zone_type.kind() == zone_kind) { if let Some(zpool) = zone_config.zone_type.durable_zpool() { @@ -2961,7 +3031,7 @@ pub mod test { let editor = builder.sled_editors.get_mut(&sled_id).expect("found sled"); let crucible_zone_id = editor - .zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|zone_config| { if zone_config.zone_type.is_crucible() { return Some(zone_config.id); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs b/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs index 6fa3380f060..70fec007615 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/clickhouse.rs @@ -8,9 +8,7 @@ use clickhouse_admin_types::keeper::{ ClickhouseKeeperClusterMembership, KeeperId, }; -use nexus_types::deployment::{ - BlueprintZoneDisposition, BlueprintZoneType, ClickhouseClusterConfig, -}; +use nexus_types::deployment::{BlueprintZoneType, ClickhouseClusterConfig}; use omicron_uuid_kinds::OmicronZoneUuid; use slog::{Logger, error}; use std::collections::BTreeSet; @@ -29,9 +27,7 @@ impl ClickhouseZonesThatShouldBeRunning { pub fn new(blueprint: &BlueprintBuilder<'_>) -> Self { let mut keepers = BTreeSet::new(); let mut servers = BTreeSet::new(); - for (_, zone) in - blueprint.current_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone) in blueprint.current_in_service_zones() { match zone.zone_type { BlueprintZoneType::ClickhouseKeeper(_) => { keepers.insert(zone.id); diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index e1f41bbc870..04feacef4fc 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -6,7 +6,6 @@ use anyhow::bail; use debug_ignore::DebugIgnore; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ExternalIpPolicy; use nexus_types::deployment::OmicronZoneExternalIp; @@ -99,9 +98,7 @@ impl ExternalNetworkingAllocator { external_ip_policy: &ExternalIpPolicy, ) -> anyhow::Result { Self::new( - builder - .current_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_sled_id, zone)| zone), + builder.current_in_service_zones().map(|(_sled_id, zone)| zone), external_ip_policy, ) } diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs index e5c2a1f4b1d..93c54f10a2b 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs @@ -22,13 +22,13 @@ use illumos_utils::zpool::ZpoolName; use itertools::Either; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; use 
nexus_types::deployment::BlueprintPhysicalDiskConfig; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::LastAllocatedSubnetIpOffset; @@ -48,6 +48,7 @@ use omicron_uuid_kinds::ZpoolUuid; use scalar::ScalarEditor; use sled_agent_types::inventory::MupdateOverrideBootInventory; use sled_agent_types::inventory::ZoneKind; +use std::iter; use std::mem; use std::net::Ipv6Addr; use underlay_ip_allocator::SledUnderlayIpAllocator; @@ -291,27 +292,66 @@ impl SledEditor { } } - pub fn zones( + pub fn in_service_zones( &self, - mut filter: F, - ) -> impl Iterator - where - F: FnMut(BlueprintZoneDisposition) -> bool, - { + ) -> impl Iterator { + match &self.0 { + InnerSledEditor::Active(editor) => { + Either::Left(editor.in_service_zones()) + } + InnerSledEditor::Decommissioned(_) => { + // A decommissioned sled cannot have any in-service zones! + Either::Right(iter::empty()) + } + } + } + + pub fn could_be_running_zones( + &self, + ) -> impl Iterator { match &self.0 { InnerSledEditor::Active(editor) => { - Either::Left(editor.zones(filter)) + Either::Left(editor.could_be_running_zones()) + } + InnerSledEditor::Decommissioned(_) => { + // A decommissioned sled cannot have any running zones! + Either::Right(iter::empty()) + } + } + } + + pub fn expunged_zones( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + match &self.0 { + InnerSledEditor::Active(editor) => { + Either::Left(editor.expunged_zones(reason)) } InnerSledEditor::Decommissioned(edited) => Either::Right( edited .config .zones .iter() - .filter(move |zone| filter(zone.disposition)), + .filter(move |zone| zone.disposition.is_expunged()), ), } } + pub fn all_in_service_and_expunged_zones( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + match &self.0 { + InnerSledEditor::Active(editor) => { + Either::Left(editor.all_in_service_and_expunged_zones(reason)) + } + InnerSledEditor::Decommissioned(edited) => { + Either::Right(edited.config.zones.iter()) + } + } + } + pub fn host_phase_2(&self) -> BlueprintHostPhase2DesiredSlots { match &self.0 { InnerSledEditor::Active(editor) => editor.host_phase_2(), @@ -592,9 +632,7 @@ impl ActiveSledEditor { fn validate_decommisionable(&self) -> Result<(), SledEditError> { // A sled is only decommissionable if all its zones have been expunged // (i.e., there are no zones left with an in-service disposition). 
- if let Some(zone) = - self.zones(BlueprintZoneDisposition::is_in_service).next() - { + if let Some(zone) = self.in_service_zones().next() { return Err(SledEditError::NonDecommissionableZoneNotExpunged { zone_id: zone.id, kind: zone.zone_type.kind(), @@ -636,14 +674,30 @@ impl ActiveSledEditor { self.datasets.datasets(filter) } - pub fn zones( + pub fn in_service_zones( &self, - filter: F, - ) -> impl Iterator - where - F: FnMut(BlueprintZoneDisposition) -> bool, - { - self.zones.zones(filter) + ) -> impl Iterator { + self.zones.in_service_zones() + } + + pub fn could_be_running_zones( + &self, + ) -> impl Iterator { + self.zones.could_be_running_zones() + } + + pub fn expunged_zones( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + self.zones.expunged_zones(reason) + } + + pub fn all_in_service_and_expunged_zones( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + self.zones.all_in_service_and_expunged_zones(reason) } pub fn host_phase_2(&self) -> BlueprintHostPhase2DesiredSlots { @@ -808,7 +862,7 @@ impl ActiveSledEditor { &mut self, rng: &mut SledPlannerRng, ) -> Result<(), SledEditError> { - for zone in self.zones.zones(BlueprintZoneDisposition::is_in_service) { + for zone in self.zones.in_service_zones() { ZoneDatasetConfigs::new(&self.disks, zone)? .ensure_in_service(&mut self.datasets, rng); } @@ -884,7 +938,7 @@ impl ActiveSledEditor { // Set all zone image sources to InstallDataset. This is an // acknowledgement of the current state of the world. let zone_ids: Vec<_> = self - .zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|zone| (zone.id, zone.kind())) .collect(); diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs index 7f710aa7cf4..3fd1c8bdfd5 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/zones.rs @@ -5,6 +5,7 @@ use crate::blueprint_builder::EditCounts; use iddqd::IdOrdMap; use iddqd::id_ord_map::Entry; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; @@ -69,7 +70,73 @@ impl ZonesEditor { self.counts } - pub fn zones( + /// Iterate over all in-service zones on this sled. + pub fn in_service_zones( + &self, + ) -> impl Iterator { + // Danger note: this call has no danger of accessing expunged zones, + // because we're filtering to in-service. + self.danger_all_zones(BlueprintZoneDisposition::is_in_service) + } + + /// Iterate over all zones on this sled that could be running; this includes + /// in-service zones and expunged zones that are not yet ready for cleanup + /// (because we haven't confirmed via inventory that they've been shut + /// down). + pub fn could_be_running_zones( + &self, + ) -> impl Iterator { + // Danger note: this may access expunged zones, but only if they're not + // yet ready for cleanup. We don't need the caller to provide a `reason` + // for that - we only need to track access reasons for cleanup purposes, + // which only acts on ready-to-clean-up expunged zones. + self.danger_all_zones(BlueprintZoneDisposition::could_be_running) + } + + /// Iterate over the expunged zones on this sled. 
+ /// + /// Like `Blueprint::expunged_zones()`, callers must specify a + /// [`BlueprintExpungedZoneAccessReason`]. This allows us to statically + /// track all uses of expunged zones, each of which we must account for in + /// the planner's logic to permanently prune expunged zones from the + /// blueprint. + pub fn expunged_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: we will definitely access expunged zones, but the caller + // has provided a known `_reason`. + self.danger_all_zones(BlueprintZoneDisposition::is_expunged) + } + + /// Iterate over all zones on this sled, regardless of whether they're + /// in-service or expunged. + /// + /// Like [`Self::expunged_zones()`], callers are required to specify a + /// reason to access expunged zones. + /// + /// The set of zones returned by this method is equivalent to the set of + /// zones returned by chaining together calls to `Self::in_service_zones()` + /// and `Self::expunged_zones(reason)`, but only iterates over the zones + /// once. + pub fn all_in_service_and_expunged_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. + self.danger_all_zones(BlueprintZoneDisposition::any) + } + + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint + /// that match the provided filter, along with the associated sled id. + /// + /// This method is prefixed with `danger_` and is private because it allows + /// the caller to potentially act on expunged zones without providing a + /// reason for doing so. It should only be called by `in_service_zones()` + /// and the helper methods that require callers to specify a + /// [`BlueprintExpungedZoneAccessReason`] defined above. + fn danger_all_zones( &self, mut filter: F, ) -> impl Iterator diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index bf264cd6e13..1970ccb7f96 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -28,6 +28,7 @@ use iddqd::IdOrdMap; use itertools::Either; use itertools::Itertools; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSource; use nexus_types::deployment::BlueprintZoneConfig; @@ -417,13 +418,9 @@ impl<'a> Planner<'a> { // we ourselves have made this change, which is fine. let all_zones_expunged = self .blueprint - .current_sled_zones(sled_id, BlueprintZoneDisposition::any) - .all(|zone| { - matches!( - zone.disposition, - BlueprintZoneDisposition::Expunged { .. } - ) - }); + .current_in_service_sled_zones(sled_id) + .next() + .is_none(); // Check 3: Are there any instances assigned to this sled? See // comment above; while we wait for omicron#4872, we just assume @@ -503,31 +500,15 @@ impl<'a> Planner<'a> { )?; } - // Check for any decommissioned sleds (i.e., sleds for which our - // blueprint has zones, but are not in the input sled list). Any zones - // for decommissioned sleds must have already be expunged for - // decommissioning to have happened; fail if we find non-expunged zones - // associated with a decommissioned sled. 
- for sled_id in self.blueprint.sled_ids_with_zones() { + // Check for any decommissioned sled mismatches: If our blueprint says a + // sled is commissioned, our input must still too. A sled can only be + // dropped from the input if we've decommissioned it via a prior + // blueprint. + for sled_id in self.blueprint.current_commissioned_sleds() { if !commissioned_sled_ids.contains(&sled_id) { - let num_zones = self - .blueprint - .current_sled_zones(sled_id, BlueprintZoneDisposition::any) - .filter(|zone| { - !matches!( - zone.disposition, - BlueprintZoneDisposition::Expunged { .. } - ) - }) - .count(); - if num_zones > 0 { - return Err( - Error::DecommissionedSledWithNonExpungedZones { - sled_id, - num_zones, - }, - ); - } + return Err(Error::CommissionedSledMissingFromInput { + sled_id, + }); } } @@ -642,10 +623,10 @@ impl<'a> Planner<'a> { }; let mut zones_ready_for_cleanup = Vec::new(); - for zone in self - .blueprint - .current_sled_zones(sled_id, BlueprintZoneDisposition::is_expunged) - { + for zone in self.blueprint.current_expunged_sled_zones( + sled_id, + BlueprintExpungedZoneAccessReason::PlannerCheckReadyForCleanup, + ) { // If this is a zone still waiting for cleanup, grab the generation // in which it was expunged. Otherwise, move on. let as_of_generation = match zone.disposition { @@ -909,17 +890,12 @@ impl<'a> Planner<'a> { // If the sled is still running some other control plane // services (which is evidence it previously had an NTP zone!), // we can go ahead and consider it eligible for new ones. - if self - .blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .any(|z| { + if self.blueprint.current_in_service_sled_zones(sled_id).any( + |z| { OmicronZoneType::from(z.zone_type.clone()) .requires_timesync() - }) - { + }, + ) { report .sleds_getting_ntp_and_discretionary_zones .insert(sled_id); @@ -1162,10 +1138,7 @@ impl<'a> Planner<'a> { .count(), discretionary_zones: self .blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .filter_map(|zone| { DiscretionaryOmicronZone::from_zone_type( &zone.zone_type, @@ -1229,14 +1202,16 @@ impl<'a> Planner<'a> { // reuse its subnet until it's ready for cleanup. For all other // services, we want to go ahead and replace them if they're below // the desired count based on purely "in service vs expunged". - let disposition_filter = if zone_kind == ZoneKind::InternalDns { - BlueprintZoneDisposition::could_be_running + let zones_iter = if zone_kind == ZoneKind::InternalDns { + Either::Left( + self.blueprint.current_could_be_running_sled_zones(sled_id), + ) } else { - BlueprintZoneDisposition::is_in_service + Either::Right( + self.blueprint.current_in_service_sled_zones(sled_id), + ) }; - num_existing_kind_zones += self - .blueprint - .current_sled_zones(sled_id, disposition_filter) + num_existing_kind_zones += zones_iter .filter(|z| { let matches_kind = z.zone_type.kind() == zone_kind; match discretionary_zone_kind { @@ -1424,7 +1399,8 @@ impl<'a> Planner<'a> { // Nexus zones. 
// // The logic is: - // - If any existing Nexus zone has the same image source, reuse its generation + // - If any existing Nexus zone has the same image source, reuse its + // generation // - Otherwise, use the highest existing generation + 1 // - If no existing zones exist, return an error // @@ -1439,10 +1415,11 @@ impl<'a> Planner<'a> { let mut highest_seen_generation = None; let mut same_image_nexus_generation = None; - // Iterate over both existing zones and ones that are actively being placed. + // Iterate over both existing zones and ones that are actively being + // placed. for (zone, nexus) in self .blueprint - .current_zones(BlueprintZoneDisposition::any) + .current_in_service_zones() .filter_map(|(_sled_id, z)| match &z.zone_type { BlueprintZoneType::Nexus(nexus) => Some((z, nexus)), _ => None, @@ -1599,10 +1576,7 @@ impl<'a> Planner<'a> { // https://github.com/oxidecomputer/omicron/issues/8589. let mut zones_currently_updating = self .blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .filter_map(|zone| { let bp_image_source = OmicronZoneImageSource::from(zone.image_source.clone()); @@ -1780,10 +1754,7 @@ impl<'a> Planner<'a> { .flat_map(|sled_id| { let log = &self.log; self.blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .filter_map(move |zone| { let desired_image_source = match target_release .zone_image_source(zone.zone_type.kind()) @@ -2100,10 +2071,7 @@ impl<'a> Planner<'a> { for sled_id in self.input.all_sled_ids(SledFilter::InService) { let mut zones_with_non_artifact = IdOrdMap::new(); // Are all zone image sources set to Artifact? - for z in self.blueprint.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) { + for z in self.blueprint.current_in_service_sled_zones(sled_id) { match &z.image_source { BlueprintZoneImageSource::InstallDataset => { zones_with_non_artifact.insert_overwrite(z); @@ -2202,26 +2170,20 @@ impl<'a> Planner<'a> { let mut old_nexuses_at_current_gen = 0; let mut nexuses_at_proposed_gen = 0; let mut nexuses_at_proposed_gen_missing_metadata_record = 0; - for sled_id in self.blueprint.sled_ids_with_zones() { - for z in self.blueprint.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) { - if let BlueprintZoneType::Nexus(nexus_zone) = &z.zone_type { - if nexus_zone.nexus_generation == proposed_generation { - nexuses_at_proposed_gen += 1; - if !self.input.not_yet_nexus_zones().contains(&z.id) { - nexuses_at_proposed_gen_missing_metadata_record += - 1; - } + for (_sled_id, z) in self.blueprint.current_in_service_zones() { + if let BlueprintZoneType::Nexus(nexus_zone) = &z.zone_type { + if nexus_zone.nexus_generation == proposed_generation { + nexuses_at_proposed_gen += 1; + if !self.input.not_yet_nexus_zones().contains(&z.id) { + nexuses_at_proposed_gen_missing_metadata_record += 1; } + } - if nexus_zone.nexus_generation == current_generation - && z.image_source - != new_repo.zone_image_source(z.zone_type.kind())? - { - old_nexuses_at_current_gen += 1; - } + if nexus_zone.nexus_generation == current_generation + && z.image_source + != new_repo.zone_image_source(z.zone_type.kind())? 
+ { + old_nexuses_at_current_gen += 1; } } } @@ -2477,17 +2439,12 @@ impl<'a> Planner<'a> { fn all_non_nexus_zones_using_new_image(&self) -> Result { let new_repo = self.input.tuf_repo().description(); - for sled_id in self.blueprint.sled_ids_with_zones() { - for z in self.blueprint.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) { - let kind = z.zone_type.kind(); - if kind != ZoneKind::Nexus - && z.image_source != new_repo.zone_image_source(kind)? - { - return Ok(false); - } + for (_sled_id, z) in self.blueprint.current_in_service_zones() { + let kind = z.zone_type.kind(); + if kind != ZoneKind::Nexus + && z.image_source != new_repo.zone_image_source(kind)? + { + return Ok(false); } } return Ok(true); @@ -2503,10 +2460,7 @@ impl<'a> Planner<'a> { // conversions can change the image source. let mut image_source = None; for sled_id in self.input.all_sled_ids(SledFilter::InService) { - for zone in self.blueprint.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) { + for zone in self.blueprint.current_in_service_sled_zones(sled_id) { if self.input.active_nexus_zones().contains(&zone.id) { image_source = Some(zone.image_source.clone()); break; diff --git a/nexus/reconfigurator/planning/src/planner/image_source.rs b/nexus/reconfigurator/planning/src/planner/image_source.rs index 11458914dd7..448f8d0c3be 100644 --- a/nexus/reconfigurator/planning/src/planner/image_source.rs +++ b/nexus/reconfigurator/planning/src/planner/image_source.rs @@ -10,8 +10,8 @@ use nexus_types::{ deployment::{ BlueprintArtifactVersion, BlueprintHostPhase2DesiredContents, BlueprintHostPhase2DesiredSlots, BlueprintZoneConfig, - BlueprintZoneDisposition, BlueprintZoneImageSource, PlanningInput, - SledFilter, TargetReleaseDescription, + BlueprintZoneImageSource, PlanningInput, SledFilter, + TargetReleaseDescription, }, inventory::Collection, }; @@ -96,10 +96,7 @@ impl NoopConvertInfo { // Out of these, which zones' hashes (as reported in the zone // manifest) match the corresponding ones in the TUF repo? let zones = blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) + .current_in_service_sled_zones(sled_id) .map(|zone| { NoopConvertZoneInfo::new( zone, diff --git a/nexus/reconfigurator/planning/src/planner/zone_safety.rs b/nexus/reconfigurator/planning/src/planner/zone_safety.rs index ded59e5fc17..2d7a18bcdb7 100644 --- a/nexus/reconfigurator/planning/src/planner/zone_safety.rs +++ b/nexus/reconfigurator/planning/src/planner/zone_safety.rs @@ -8,7 +8,6 @@ use crate::blueprint_builder::BlueprintBuilder; use itertools::Itertools; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::CockroachdbUnsafeToShutdown; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::ZoneUnsafeToShutdown; @@ -126,9 +125,7 @@ impl<'a> ZoneSafetyChecksBuilder<'a> { // Precalculate sets of particular zone kinds that we care about in // other checks below. 
- for (_sled_id, zone) in - blueprint.current_zones(BlueprintZoneDisposition::is_in_service) - { + for (_sled_id, zone) in blueprint.current_in_service_zones() { match zone.zone_type.kind() { ZoneKind::BoundaryNtp => { boundary_ntp_zones.insert(zone.id); @@ -151,10 +148,7 @@ impl<'a> ZoneSafetyChecksBuilder<'a> { } fn build(mut self) -> ZoneSafetyChecks { - for (sled_id, zone) in self - .blueprint - .current_zones(BlueprintZoneDisposition::is_in_service) - { + for (sled_id, zone) in self.blueprint.current_in_service_zones() { if let Some(reason) = self.reason_zone_unsafe_to_shut_down(zone) { self.checks.insert(sled_id, zone.id, reason); } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 5cef5f2af23..99f739e2da8 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -851,6 +851,13 @@ pub enum BlueprintExpungedZoneAccessReason { /// new producers or if it has any assigned producers. OximeterExpungeAndReassignProducers, + /// The planner must compare expunged zones against inventory to transition + /// them to "ready for cleanup". + /// + /// The planner does not need to account for this when pruning zones. + /// (Moving them to "ready for cleanup" is a _prerequisite_ for pruning.) + PlannerCheckReadyForCleanup, + // -------------------------------------------------------------------- // Catch-all variants for non-production callers. The planner does not need // to account for these when pruning. @@ -862,6 +869,10 @@ pub enum BlueprintExpungedZoneAccessReason { /// expunged zones. Omdb, + /// `reconfigurator-cli` allows manual blueprint changes, including + /// modifying expunged zones. + ReconfiguratorCli, + /// Various unit and integration tests access expunged zones. Test, } @@ -1630,12 +1641,11 @@ impl BlueprintZoneDisposition { /// Always returns true. /// /// This is intended for use with methods that take a filtering closure - /// operating on a `BlueprintZoneDisposition` (e.g., - /// `Blueprint::all_omicron_zones()`), allowing callers to make it clear - /// they accept any disposition via + /// operating on a `BlueprintZoneDisposition`, allowing callers to make it + /// clear they accept any disposition via /// /// ```rust,ignore - /// blueprint.all_omicron_zones(BlueprintZoneDisposition::any) + /// blueprint.zones_with_filter(BlueprintZoneDisposition::any) /// ``` pub fn any(self) -> bool { true
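
Illustrative usage sketch (not part of the patch above): how a caller inside the nexus/reconfigurator/planning crate might use the renamed builder iterators introduced in this diff. The method names, the (sled_id, zone) item shape, and the BlueprintExpungedZoneAccessReason::Test variant are taken from the patch itself; the helper function, its imports, and the exact iterator item types are assumptions made only to keep the example self-contained.

use nexus_types::deployment::BlueprintExpungedZoneAccessReason;
use omicron_uuid_kinds::SledUuid;

use crate::blueprint_builder::BlueprintBuilder;

// Hypothetical helper, not part of this change.
fn summarize_zones(builder: &BlueprintBuilder<'_>, sled_id: SledUuid) {
    // Old: builder.current_zones(BlueprintZoneDisposition::is_in_service)
    // New: the disposition is named by the method itself, so no filter
    // closure is passed.
    for (sled, zone) in builder.current_in_service_zones() {
        println!("sled {sled}: in-service zone {}", zone.id);
    }

    // Expunged zones can only be reached by stating a reason; tests and
    // other non-production callers use the catch-all `Test` variant added
    // in nexus/types/src/deployment.rs.
    let expunged: Vec<_> = builder
        .current_expunged_sled_zones(
            sled_id,
            BlueprintExpungedZoneAccessReason::Test,
        )
        .map(|zone| zone.id)
        .collect();
    println!("sled {sled_id}: {} expunged zone(s)", expunged.len());
}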