diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 6462d96fbc7..a90a7350940 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -1044,6 +1044,9 @@ pub struct ZoneBuilder<'a> { /// The directories that will be searched for the image tarball for the /// provided zone type ([`Self::with_zone_type`]). zone_image_paths: Option<&'a [Utf8PathBuf]>, + /// The file name of the zone image to search for in [`Self::zone_image_paths`]. + /// If unset, defaults to `{zone_type}.tar.gz`. + zone_image_file_name: Option<&'a str>, /// The name of the type of zone being created (e.g. "propolis-server") zone_type: Option<&'a str>, /// Unique ID of the instance of the zone being created. (optional) @@ -1110,6 +1113,17 @@ impl<'a> ZoneBuilder<'a> { self } + /// The file name of the zone image to search for in the zone image + /// paths ([`Self::with_zone_image_paths`]). If unset, defaults to + /// `{zone_type}.tar.gz`. + pub fn with_zone_image_file_name( + mut self, + image_file_name: &'a str, + ) -> Self { + self.zone_image_file_name = Some(image_file_name); + self + } + /// The name of the type of zone being created (e.g. "propolis-server") pub fn with_zone_type(mut self, zone_type: &'a str) -> Self { self.zone_type = Some(zone_type); @@ -1227,6 +1241,7 @@ impl<'a> ZoneBuilder<'a> { underlay_vnic_allocator: Some(underlay_vnic_allocator), zone_root_path: Some(mut zone_root_path), zone_image_paths: Some(zone_image_paths), + zone_image_file_name, zone_type: Some(zone_type), unique_name, datasets: Some(datasets), @@ -1255,15 +1270,18 @@ impl<'a> ZoneBuilder<'a> { InstalledZone::get_zone_name(zone_type, unique_name); // Looks for the image within `zone_image_path`, in order. - let image = format!("{}.tar.gz", zone_type); + let image_file_name = match zone_image_file_name { + Some(image) => image, + None => &format!("{}.tar.gz", zone_type), + }; let zone_image_path = zone_image_paths .iter() .find_map(|image_path| { - let path = image_path.join(&image); + let path = image_path.join(image_file_name); if path.exists() { Some(path) } else { None } }) .ok_or_else(|| InstallZoneError::ImageNotFound { - image: image.to_string(), + image: image_file_name.to_string(), paths: zone_image_paths .iter() .map(|p| p.to_path_buf()) diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 680223920cd..5b8fdb39133 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -654,6 +654,18 @@ pub enum OmicronZoneImageSource { Artifact { hash: ArtifactHash }, } +impl OmicronZoneImageSource { + /// Return the artifact hash used for the zone image, if the zone's image + /// source is from the artifact store. + pub fn artifact_hash(&self) -> Option { + if let OmicronZoneImageSource::Artifact { hash } = self { + Some(*hash) + } else { + None + } + } +} + // See `OmicronZoneConfig`. This is a separate function instead of being `impl // Default` because we don't want to accidentally use this default in Rust code. fn deserialize_image_source_default() -> OmicronZoneImageSource { diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs index b658aafa5f8..d19c4017aed 100644 --- a/sled-agent/src/artifact_store.rs +++ b/sled-agent/src/artifact_store.rs @@ -17,6 +17,7 @@ //! Operations that list or modify artifacts or the configuration are called by //! Nexus and handled by the Sled Agent API. +use std::collections::BTreeMap; use std::future::Future; use std::io::{ErrorKind, Write}; use std::net::SocketAddrV6; @@ -48,6 +49,8 @@ use tokio::sync::{mpsc, oneshot, watch}; use tokio::task::JoinSet; use tufaceous_artifact::ArtifactHash; +use crate::services::ServiceManager; + // These paths are defined under the artifact storage dataset. They // cannot conflict with any artifact paths because all artifact paths are // hexadecimal-encoded SHA-256 checksums. @@ -87,7 +90,11 @@ pub(crate) struct ArtifactStore { } impl ArtifactStore { - pub(crate) async fn new(log: &Logger, storage: T) -> ArtifactStore { + pub(crate) async fn new( + log: &Logger, + storage: T, + services: Option, + ) -> ArtifactStore { let log = log.new(slog::o!("component" => "ArtifactStore")); let mut ledger_paths = Vec::new(); @@ -126,6 +133,7 @@ impl ArtifactStore { tokio::task::spawn(ledger_manager( log.clone(), ledger_paths, + services, ledger_rx, config_tx, )); @@ -244,15 +252,27 @@ impl ArtifactStore { pub(crate) async fn get( &self, sha256: ArtifactHash, + ) -> Result { + Self::get_from_storage(&self.storage, &self.log, sha256).await + } + + /// Open an artifact file by hash from a storage handle. + /// + /// This is the same as [ArtifactStore::get], but can be called with only + /// a [StorageHandle]. + pub(crate) async fn get_from_storage( + storage: &T, + log: &Logger, + sha256: ArtifactHash, ) -> Result { let sha256_str = sha256.to_string(); let mut last_error = None; - for mountpoint in self.storage.artifact_storage_paths().await { + for mountpoint in storage.artifact_storage_paths().await { let path = mountpoint.join(&sha256_str); match File::open(&path).await { Ok(file) => { info!( - &self.log, + &log, "Retrieved artifact"; "sha256" => &sha256_str, "path" => path.as_str(), @@ -261,7 +281,7 @@ impl ArtifactStore { } Err(err) if err.kind() == ErrorKind::NotFound => {} Err(err) => { - log_and_store!(last_error, &self.log, "open", path, err); + log_and_store!(last_error, &log, "open", path, err); } } } @@ -430,9 +450,11 @@ type LedgerManagerRequest = async fn ledger_manager( log: Logger, ledger_paths: Vec, + services: Option, mut rx: mpsc::Receiver, config_channel: watch::Sender>, ) { + let services = services.as_ref(); let handle_request = async |new_config: ArtifactConfig| { if ledger_paths.is_empty() { return Err(Error::NoUpdateDataset); @@ -441,7 +463,25 @@ async fn ledger_manager( Ledger::::new(&log, ledger_paths.clone()).await { if new_config.generation > ledger.data().generation { - // New config generation; update the ledger. + // New config generation. First check that the configuration + // contains all artifacts that are presently in use. + let mut missing = BTreeMap::new(); + // Check artifacts from the current zone configuration. + if let Some(services) = services { + for zone in services.omicron_zones_list().await.zones { + if let Some(hash) = zone.image_source.artifact_hash() { + if !new_config.artifacts.contains(&hash) { + missing + .insert(hash, "current zone configuration"); + } + } + } + } + if !missing.is_empty() { + return Err(Error::InUseArtifactsMissing(missing)); + } + + // Everything looks okay; update the ledger. *ledger.data_mut() = new_config; ledger } else if new_config == *ledger.data() { @@ -740,7 +780,7 @@ impl RepoDepotApi for RepoDepotImpl { } #[derive(Debug, thiserror::Error, SlogInlineError)] -pub(crate) enum Error { +pub enum Error { #[error("Error while reading request body")] Body(dropshot::HttpError), @@ -784,6 +824,9 @@ pub(crate) enum Error { #[error("Digest mismatch: expected {expected}, actual {actual}")] HashMismatch { expected: ArtifactHash, actual: ArtifactHash }, + #[error("Artifacts in use are not present in new config: {0:?}")] + InUseArtifactsMissing(BTreeMap), + #[error("Blocking task failed")] Join(#[source] tokio::task::JoinError), @@ -813,6 +856,7 @@ impl From for HttpError { match err { // 4xx errors Error::HashMismatch { .. } + | Error::InUseArtifactsMissing { .. } | Error::NoConfig | Error::NotInConfig { .. } => { HttpError::for_bad_request(None, err.to_string()) @@ -951,7 +995,7 @@ mod test { let log = test_setup_log("generations"); let backend = TestBackend::new(2); - let store = ArtifactStore::new(&log.log, backend).await; + let store = ArtifactStore::new(&log.log, backend, None).await; // get_config returns None assert!(store.get_config().is_none()); @@ -1004,7 +1048,7 @@ mod test { async fn list_get_put() { let log = test_setup_log("list_get_put"); let backend = TestBackend::new(2); - let mut store = ArtifactStore::new(&log.log, backend).await; + let mut store = ArtifactStore::new(&log.log, backend, None).await; // get fails, because it doesn't exist yet assert!(matches!( @@ -1126,7 +1170,7 @@ mod test { let log = test_setup_log("no_dataset"); let backend = TestBackend::new(0); - let store = ArtifactStore::new(&log.log, backend).await; + let store = ArtifactStore::new(&log.log, backend, None).await; assert!(matches!( store.get(TEST_HASH).await, @@ -1154,7 +1198,7 @@ mod test { let log = test_setup_log("wrong_hash"); let backend = TestBackend::new(2); - let store = ArtifactStore::new(&log.log, backend).await; + let store = ArtifactStore::new(&log.log, backend, None).await; let mut config = ArtifactConfig { generation: 1u32.into(), artifacts: BTreeSet::new(), @@ -1214,7 +1258,7 @@ mod test { let log = test_setup_log("issue_7796"); let backend = TestBackend::new(2); - let store = ArtifactStore::new(&log.log, backend).await; + let store = ArtifactStore::new(&log.log, backend, None).await; let mut config = ArtifactConfig { generation: 1u32.into(), diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 7490a6f3d92..89fa551a5a1 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -25,6 +25,7 @@ //! - [ServiceManager::activate_switch] exposes an API to specifically enable //! or disable (via [ServiceManager::deactivate_switch]) the switch zone. +use crate::artifact_store::ArtifactStore; use crate::bootstrap::BootstrapNetworking; use crate::bootstrap::early_networking::{ EarlyNetworkSetup, EarlyNetworkSetupError, @@ -68,7 +69,8 @@ use internal_dns_types::names::DNS_ZONE; use itertools::Itertools; use nexus_config::{ConfigDropshotWithTls, DeploymentConfig}; use nexus_sled_agent_shared::inventory::{ - OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, ZoneKind, + OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, + OmicronZonesConfig, ZoneKind, }; use omicron_common::address::AZ_PREFIX; use omicron_common::address::COCKROACH_PORT; @@ -108,7 +110,9 @@ use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware_types::Baseboard; use sled_storage::config::MountConfig; -use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET}; +use sled_storage::dataset::{ + CONFIG_DATASET, INSTALL_DATASET, M2_ARTIFACT_DATASET, ZONE_DATASET, +}; use sled_storage::manager::StorageHandle; use slog::Logger; use slog_error_chain::InlineErrorChain; @@ -122,6 +126,7 @@ use tokio::io::AsyncWriteExt; use tokio::sync::Mutex; use tokio::sync::{MutexGuard, oneshot}; use tokio::task::JoinHandle; +use tufaceous_artifact::ArtifactHash; use uuid::Uuid; use illumos_utils::zone::Zones; @@ -312,6 +317,18 @@ pub enum Error { #[error("Unexpected zone config: zone {zone_id} is running on ramdisk ?!")] ZoneIsRunningOnRamdisk { zone_id: OmicronZoneUuid }, + + #[error( + "Couldn't find requested zone image ({hash}) for \ + {zone_kind:?} {id} in artifact store: {err}" + )] + ZoneArtifactNotFound { + hash: ArtifactHash, + zone_kind: &'static str, + id: OmicronZoneUuid, + #[source] + err: crate::artifact_store::Error, + }, } impl Error { @@ -1697,22 +1714,64 @@ impl ServiceManager { .map(|d| zone::Device { name: d.to_string() }) .collect(); - // Look for the image in the ramdisk first - let mut zone_image_paths = vec![Utf8PathBuf::from("/opt/oxide")]; - // Inject an image path if requested by a test. - if let Some(path) = self.inner.image_directory_override.get() { - zone_image_paths.push(path.clone()); + // TODO: `InstallDataset` should be renamed to something more accurate + // when all the major changes here have landed. Some zones are + // distributed from the host OS image and are never placed in the + // install dataset; that enum variant more accurately reflects that we + // are falling back to searching `/opt/oxide` in addition to the install + // datasets. + let image_source = match &request { + ZoneArgs::Omicron(zone_config) => &zone_config.zone.image_source, + ZoneArgs::Switch(_) => &OmicronZoneImageSource::InstallDataset, + }; + let zone_image_file_name = match image_source { + OmicronZoneImageSource::InstallDataset => None, + OmicronZoneImageSource::Artifact { hash } => Some(hash.to_string()), }; - - // If the boot disk exists, look for the image in the "install" dataset - // there too. let all_disks = self.inner.storage.get_latest_disks().await; - if let Some((_, boot_zpool)) = all_disks.boot_disk() { - zone_image_paths.push(boot_zpool.dataset_mountpoint( - &all_disks.mount_config().root, - INSTALL_DATASET, - )); - } + let zone_image_paths = match image_source { + OmicronZoneImageSource::InstallDataset => { + // Look for the image in the ramdisk first + let mut zone_image_paths = + vec![Utf8PathBuf::from("/opt/oxide")]; + // Inject an image path if requested by a test. + if let Some(path) = self.inner.image_directory_override.get() { + zone_image_paths.push(path.clone()); + }; + + // If the boot disk exists, look for the image in the "install" + // dataset there too. + if let Some((_, boot_zpool)) = all_disks.boot_disk() { + zone_image_paths.push(boot_zpool.dataset_mountpoint( + &all_disks.mount_config().root, + INSTALL_DATASET, + )); + } + + zone_image_paths + } + OmicronZoneImageSource::Artifact { .. } => { + // Search both artifact datasets, but look on the boot disk first. + let boot_zpool = + all_disks.boot_disk().map(|(_, boot_zpool)| boot_zpool); + // This iterator starts with the zpool for the boot disk (if it + // exists), and then is followed by all other zpools. + let zpool_iter = boot_zpool.clone().into_iter().chain( + all_disks + .all_m2_zpools() + .into_iter() + .filter(|zpool| Some(zpool) != boot_zpool.as_ref()), + ); + zpool_iter + .map(|zpool| { + zpool.dataset_mountpoint( + &all_disks.mount_config().root, + M2_ARTIFACT_DATASET, + ) + }) + .collect() + } + }; let zone_type_str = match &request { ZoneArgs::Omicron(zone_config) => { @@ -1736,6 +1795,9 @@ impl ServiceManager { if let Some(vnic) = bootstrap_vnic { zone_builder = zone_builder.with_bootstrap_vnic(vnic); } + if let Some(file_name) = &zone_image_file_name { + zone_builder = zone_builder.with_zone_image_file_name(file_name); + } let installed_zone = zone_builder .with_log(self.inner.log.clone()) .with_underlay_vnic_allocator(&self.inner.underlay_vnic_allocator) @@ -3609,6 +3671,26 @@ impl ServiceManager { let mut existing_zones = self.inner.zones.lock().await; + // Ensure that any zone images from the artifact store are present. + for zone in &request.zones { + if let Some(hash) = zone.image_source.artifact_hash() { + if let Err(err) = ArtifactStore::get_from_storage( + &self.inner.storage, + &self.inner.log, + hash, + ) + .await + { + return Err(Error::ZoneArtifactNotFound { + hash, + zone_kind: zone.zone_type.kind().report_str(), + id: zone.id, + err, + }); + } + } + } + // Read the existing set of services from the ledger. let zone_ledger_paths = self.all_omicron_zone_ledgers().await; let mut ledger = match Ledger::::new( diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 7e896fc24a8..98174dbdd61 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -155,9 +155,10 @@ impl SledAgent { simulated_upstairs.register_storage(id, &storage); - let repo_depot = ArtifactStore::new(&log, SimArtifactStorage::new()) - .await - .start(&log, &config.dropshot); + let repo_depot = + ArtifactStore::new(&log, SimArtifactStorage::new(), None) + .await + .start(&log, &config.dropshot); Arc::new(SledAgent { id, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 18aa53d6db0..6ab2a21c29a 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -586,10 +586,14 @@ impl SledAgent { ) .await?; - let repo_depot = ArtifactStore::new(&log, storage_manager.clone()) - .await - .start(sled_address, &config.dropshot) - .await?; + let repo_depot = ArtifactStore::new( + &log, + storage_manager.clone(), + Some(services.clone()), + ) + .await + .start(sled_address, &config.dropshot) + .await?; // Spawn a background task for managing notifications to nexus // about this sled-agent.