diff --git a/crates/api-core/src/cfg/file.rs b/crates/api-core/src/cfg/file.rs index 2dbde9d57f..3d5e8fa8c5 100644 --- a/crates/api-core/src/cfg/file.rs +++ b/crates/api-core/src/cfg/file.rs @@ -730,9 +730,9 @@ pub struct CarbideConfig { #[serde(default)] pub tracing: TracingConfig, - /// Secrets backend configuration. When present, credentials live - /// encrypted in Postgres and vault leaves the credential chain - /// entirely; when absent, vault remains the credential store. + /// Secrets backend configuration. When present, the credential reader + /// chain and write target are operator-configured (defaulting to the same + /// env -> file -> vault behavior as when it is absent); see `SecretsConfig`. pub secrets: Option, } @@ -846,24 +846,26 @@ pub enum BgpLeafSessionPassword { SiteWide, } -/// Configures the Postgres secrets backend. When this section is present, -/// credentials live encrypted in Postgres and vault is not in the -/// credential chain at all -- the one-time import either completes before -/// the process serves traffic, or the process does not start. Vault keeps -/// serving PKI certificates either way. +/// Configures the Postgres secrets backend and how credentials flow. When +/// this section is present the reader chain and the write target come from +/// `backends` / `writer` below; their defaults keep today's behavior +/// (env -> file -> vault, writes to vault), so adding `[secrets]` does not +/// change credential routing on its own. Operators choose which backends to +/// read, in what order, and which one takes writes, by editing `backends` +/// and `writer`. Vault keeps serving PKI certificates regardless of the +/// chain. /// -/// Enabling this on an existing site has two prerequisites that live -/// outside this process: +/// Two prerequisites live outside this process and matter once writes move +/// to Postgres (`writer = "postgres"`) or vault leaves `backends`: /// /// - Services that read credentials from vault through their own chains -/// (`bmc-proxy`, `dsx-exchange-consumer`) keep reading vault and will -/// not see anything carbide-api writes to Postgres afterwards. They must -/// be migrated or fed another way before credentials here change. -/// - During a rolling upgrade, replicas still running the vault config -/// keep writing rotated credentials to vault, where they are stranded -/// once the import has completed. Keep autonomous credential writers -/// (site-explorer credential rotation) disabled until the whole fleet -/// runs this config. +/// (`bmc-proxy`, `dsx-exchange-consumer`) will not see anything carbide-api +/// writes to Postgres. They must be pointed at the same backend, or fed +/// another way, before the credentials they read change. +/// - During a rolling upgrade, replicas still on an older config keep writing +/// rotated credentials to their own writer. Keep autonomous credential +/// writers (site-explorer credential rotation) disabled until the whole +/// fleet runs a consistent config. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields)] pub struct SecretsConfig { @@ -884,9 +886,37 @@ pub struct SecretsConfig { /// ``` pub routing: std::collections::HashMap, + /// The credential *backend* read order, highest priority first (first match + /// wins). The local-override readers (env, file) are always tried ahead of + /// these, when their `[credentials.*]` section is enabled; this list only + /// orders the backends behind them. Order is the operator's choice -- list + /// the backends you want, in the priority you want. Defaults to `["vault"]` + /// -- with the local overrides, that is the env -> file -> vault chain. + /// + /// For example, to roll Postgres in gradually, walk this list: + /// + /// 1. `["vault"]` -- Postgres configured but not yet read. + /// 2. `["postgres", "vault"]` -- Postgres in front, vault as the safety net + /// for anything Postgres misses. + /// 3. `["postgres"]` -- vault no longer read. + /// + /// An empty list, or a backend named twice, fails the boot. + #[serde(default = "default_secret_backends")] + pub backends: Vec, + + /// Where new credential writes go. Defaults to `vault`; set to `postgres` + /// to send new writes to the journal. Independent of `backends`: e.g. + /// `writer = "postgres"` while `postgres` is not in `backends` (reads still + /// served by vault) is a valid shadow-write -- it confirms writes land + /// before reads start trusting Postgres -- and only logs a warning. + #[serde(default)] + pub writer: CredentialBackend, + /// A source backend to import secrets from at startup. Unset means a /// fresh site with nothing to import; unsupported values fail config - /// parsing rather than silently skipping the import. + /// parsing rather than silently skipping the import. Independent of + /// `backends`/`writer` -- importing from vault is orthogonal to where + /// reads and writes flow. pub import_from: Option, /// How to treat secrets that already exist in Postgres during import. @@ -902,6 +932,27 @@ pub enum ImportSource { Vault, } +/// A credential backend -- postgres or vault. Listed in `[secrets].backends` to +/// order the backends behind the always-first local overrides (env, file; +/// first match wins, see `ChainedCredentialReader`), and named by +/// `[secrets].writer` to choose where new writes go. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, Deserialize, Serialize)] +#[serde(rename_all = "lowercase")] +pub enum CredentialBackend { + /// The Postgres secrets journal. + Postgres, + /// Vault/OpenBao KV. The default write target (today's behavior). + #[default] + Vault, +} + +/// The default backend order (just vault). With the always-first env/file +/// local overrides, this is the env -> file -> vault chain, so adding +/// `[secrets]` changes nothing until an operator edits it. +fn default_secret_backends() -> Vec { + vec![CredentialBackend::Vault] +} + /// Configures the KMS backends that wrap DEKs. Several named providers can /// be defined: the active one wraps DEKs for new writes, and every provider /// answers unwraps for the kek_ids it has. @@ -4156,6 +4207,11 @@ firmware_url = "https://firmware.example.com/fw-b.bin" secrets.import_approach, crate::secrets::ImportApproach::MissingOnly ); + + // backends/writer were omitted above, so they default to vault-only + // (env/file are prepended separately) writing to vault. + assert_eq!(secrets.backends, vec![CredentialBackend::Vault]); + assert_eq!(secrets.writer, CredentialBackend::Vault); } // Verifies that a typo'd import source fails config parsing instead of @@ -4186,6 +4242,96 @@ firmware_url = "https://firmware.example.com/fw-b.bin" assert!(toml::from_str::(toml_str).is_err()); } + // Verifies the backends list and writer parse from their enum values -- + // one with Postgres in front of vault (writes to Postgres) and a + // postgres-only one (vault not read, writes to Postgres). + #[test] + fn secrets_config_parses_backends_and_writer() { + #[derive(Deserialize)] + struct Wrapper { + secrets: SecretsConfig, + } + + let pg_first = r#" + [secrets] + backends = ["postgres", "vault"] + writer = "postgres" + + [secrets.kms] + active = "local" + [secrets.kms.providers.local] + type = "integrated" + keys.default-key = { env = "K" } + + [secrets.routing] + "/" = "default-key" + "#; + let secrets = toml::from_str::(pg_first) + .expect("parse pg-first") + .secrets; + assert_eq!( + secrets.backends, + vec![CredentialBackend::Postgres, CredentialBackend::Vault] + ); + assert_eq!(secrets.writer, CredentialBackend::Postgres); + + // Postgres-only reads, writes to postgres too. (The + // writer-defaults-to-vault case is covered by the deserialize test + // above, with vault still in backends -- pairing a postgres-only chain + // with a vault writer is the read-after-write gap run.rs warns about.) + let postgres_only = r#" + [secrets] + backends = ["postgres"] + writer = "postgres" + + [secrets.kms] + active = "local" + [secrets.kms.providers.local] + type = "integrated" + keys.default-key = { env = "K" } + + [secrets.routing] + "/" = "default-key" + "#; + let secrets = toml::from_str::(postgres_only) + .expect("parse postgres-only") + .secrets; + assert_eq!(secrets.backends, vec![CredentialBackend::Postgres]); + assert_eq!(secrets.writer, CredentialBackend::Postgres); + } + + // Verifies a typo'd backend or writer value fails parsing rather than + // silently dropping a backend from the chain. + #[test] + fn secrets_config_rejects_unknown_backend() { + #[derive(Deserialize)] + struct Wrapper { + #[expect(dead_code)] + secrets: SecretsConfig, + } + + let base_kms = r#" + [secrets.kms] + active = "local" + [secrets.kms.providers.local] + type = "integrated" + keys.default-key = { env = "K" } + [secrets.routing] + "/" = "default-key" + "#; + + let bad_backend = format!("[secrets]\nbackends = [\"postgrez\"]\n{base_kms}"); + assert!(toml::from_str::(&bad_backend).is_err()); + + // env/file are local overrides, not backends -- they belong in + // [credentials.*], not [secrets].backends, so they're rejected here. + let env_as_backend = format!("[secrets]\nbackends = [\"env\"]\n{base_kms}"); + assert!(toml::from_str::(&env_as_backend).is_err()); + + let bad_writer = format!("[secrets]\nwriter = \"valt\"\n{base_kms}"); + assert!(toml::from_str::(&bad_writer).is_err()); + } + // Verifies that a misspelled optional key in [secrets] -- here // `import_fom` for `import_from` -- fails to parse instead of leaving // the import silently disabled. Without deny_unknown_fields, the typo'd diff --git a/crates/api-core/src/run.rs b/crates/api-core/src/run.rs index c88a0e92e7..5241671930 100644 --- a/crates/api-core/src/run.rs +++ b/crates/api-core/src/run.rs @@ -34,7 +34,9 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::subscriber::NoSubscriber; -use crate::cfg::file::{CarbideConfig, ImportSource, ProviderConfig, SecretsConfig}; +use crate::cfg::file::{ + CarbideConfig, CredentialBackend, ImportSource, ProviderConfig, SecretsConfig, +}; use crate::listener::AdminUiRoutesBuilder; use crate::logging::metrics_endpoint::{MetricsEndpointConfig, run_metrics_endpoint}; use crate::logging::setup::{ @@ -200,55 +202,97 @@ pub async fn run( let db_pool = setup::create_and_connect_postgres_pool(&carbide_config).await?; - // Build the credential reader chain. Lookups try each reader in order - // and the first answer wins. - let mut readers: Vec> = Vec::new(); - if credential_config.env.enabled() { - readers.push(Box::new( + // Build the local-override readers (env, file); each is consulted only when + // its [credentials.*] section is enabled. The backends (postgres, + // vault) and the writer are chosen below. + let env_reader: Option> = if credential_config.env.enabled() { + Some(Box::new( carbide_secrets::local_credentials::EnvCredentials::new(credential_config.env.clone())?, - )); - } - if credential_config.file.enabled() { - readers.push(Box::new( + )) + } else { + None + }; + let file_reader: Option> = if credential_config.file.enabled() { + Some(Box::new( carbide_secrets::local_credentials::FileCredentialsWatcher::new( credential_config.file.clone(), ) .await?, - )); - } - - // With a [secrets] section, Postgres is the whole credential backend: - // it answers reads, takes every write, and vault is not in the chain - // at all -- the strict one-time import below either completes or the - // process does not start. Without the section, the store comes from - // CARBIDE_CREDENTIAL_STORE: vault (the default), or an in-memory store - // suitable only for development and testing. - let (writer, secrets_context): (Arc, Option) = - if let Some(ref secrets_config) = carbide_config.secrets { - let routing = SecretRouting::from_config(&secrets_config.routing) - .map_err(eyre::Report::new) - .wrap_err("secrets routing configuration")?; - let kms = build_kms_backend( - secrets_config, - &vault_config, - &routing, - &mut join_set, - &cancel_token, - )?; - - let pg_mgr = Arc::new( - crate::secrets::PostgresCredentialManager::new( - db_pool.clone(), - routing.clone(), - kms.clone(), - ) - .with_metrics(crate::secrets::SecretsMetrics::new(&metrics.meter)), - ); - tracing::info!( - active_provider = %secrets_config.kms.active, - "Postgres secrets backend configured" + )) + } else { + None + }; + // The local overrides that ended up enabled, in order -- always tried + // ahead of the backends. + let local_overrides: Vec> = + [env_reader, file_reader].into_iter().flatten().collect(); + + // With a [secrets] section, the credential chain and write target come from + // `backends`/`writer` -- defaulting to env -> file -> vault writing to vault, + // so the section alone changes nothing. The one-time vault import is + // independent: it runs iff `import_from` is set. Without the section, the + // store comes from CARBIDE_CREDENTIAL_STORE: vault (the default), or an + // in-memory store for development and testing. + let (credential_manager, secrets_context): ( + Arc, + Option, + ) = if let Some(ref secrets_config) = carbide_config.secrets { + // Reject a nonsensical backends list before anything with side effects + // runs (KMS task setup, the one-time vault import): a config error + // should fail the boot cleanly, not after a partial, hard-to-undo + // import that has already written the completion marker. + crate::secrets::validate_backends(&secrets_config.backends)?; + + let routing = SecretRouting::from_config(&secrets_config.routing) + .map_err(eyre::Report::new) + .wrap_err("secrets routing configuration")?; + let kms = build_kms_backend( + secrets_config, + &vault_config, + &routing, + &mut join_set, + &cancel_token, + )?; + + let pg_mgr = Arc::new( + crate::secrets::PostgresCredentialManager::new( + db_pool.clone(), + routing.clone(), + kms.clone(), + ) + .with_metrics(crate::secrets::SecretsMetrics::new(&metrics.meter)), + ); + tracing::info!( + active_provider = %secrets_config.kms.active, + backends = ?secrets_config.backends, + writer = ?secrets_config.writer, + "Postgres secrets backend configured" + ); + // New writes all go to `writer`, but reads take the first backend in + // `backends` that holds the path -- first-match-wins, evaluated per path. + // So unless `writer` is the highest-priority backend, a write can be + // shadowed: if a higher-priority backend also holds that path, reads keep + // returning its value and never reach the writer's. E.g. with + // backends = [vault, postgres] and writer = postgres, a path that exists + // in both reads vault's value until vault's copy of that path is + // removed (a read-after-write gap). The same gap exists when `writer` + // is not in `backends` at all. Both reduce to "writer isn't the top + // backend." We allow it -- a deliberate shadow-write is a valid, if + // advanced, setup -- but warn so an accidental one is visible. + let writer_is_top_backend = secrets_config.backends.first() == Some(&secrets_config.writer); + if !writer_is_top_backend { + tracing::warn!( + writer = ?secrets_config.writer, + backends = ?secrets_config.backends, + "secrets writer's backend is not the highest-priority backend: a write to a path a \ + higher-priority backend also holds is shadowed on read until that copy is removed \ + (read-after-write gap)" ); + } + // A one-time bulk import from vault, only when the operator asks for + // one. Independent of backends/writer. + if secrets_config.import_from == Some(ImportSource::Vault) { import_vault_secrets_once( &db_pool, secrets_config, @@ -257,29 +301,54 @@ pub async fn run( &vault_client, ) .await?; + } - readers.push(Box::new(pg_mgr.clone()) as Box); - (pg_mgr, Some(SecretsContext { routing, kms })) - } else { - let credential_store: Arc = match std::env::var( - "CARBIDE_CREDENTIAL_STORE", - ) + // Read order: the always-first local overrides, then the configured + // backends in the operator's chosen order (first match wins). The write + // target is the single backend `writer` names. (`backends` was + // validated at the top of this branch, before any side effects.) + let backend_readers = + secrets_config + .backends + .iter() + .map(|backend| -> Box { + match backend { + CredentialBackend::Postgres => Box::new(pg_mgr.clone()), + CredentialBackend::Vault => Box::new(vault_client.clone()), + } + }); + let chain: Vec> = + local_overrides.into_iter().chain(backend_readers).collect(); + let writer: Arc = match secrets_config.writer { + CredentialBackend::Vault => vault_client.clone(), + CredentialBackend::Postgres => pg_mgr.clone(), + }; + ( + create_credential_manager_from(writer, chain), + Some(SecretsContext { routing, kms }), + ) + } else { + let store: Arc = match std::env::var("CARBIDE_CREDENTIAL_STORE") .as_deref() .unwrap_or("vault") - { - "vault" => vault_client.clone(), - "memory" => Arc::new(MemoryCredentialStore::default()), - other => { - return Err(eyre::eyre!( - "Invalid CARBIDE_CREDENTIAL_STORE value {other:?}: expected \"vault\" or \"memory\"" - )); - } - }; - readers.push(Box::new(credential_store.clone())); - (credential_store, None) + { + "vault" => vault_client.clone(), + "memory" => Arc::new(MemoryCredentialStore::default()), + other => { + return Err(eyre::eyre!( + "Invalid CARBIDE_CREDENTIAL_STORE value {other:?}: expected \"vault\" or \"memory\"" + )); + } }; - - let credential_manager = create_credential_manager_from(writer, readers); + // env -> file -> the configured store; nothing from [secrets] applies. + let chain: Vec> = local_overrides + .into_iter() + .chain(std::iter::once( + Box::new(store.clone()) as Box + )) + .collect(); + (create_credential_manager_from(store, chain), None) + }; let redfish_pool = { let rf_pool = libredfish::RedfishClientPool::builder() @@ -478,23 +547,27 @@ fn build_kms_backend( Ok(Arc::new(MultiKmsProvider::new(active, providers))) } -/// Run the one-time vault import if the config asks for one and the -/// completion marker is not written yet. +/// Run the one-time vault import, skipping if the completion marker is +/// already written. The caller gates this on `import_from` (a fresh site +/// simply omits it), so by the time we are here an import is wanted. /// /// The import either completes before this process serves traffic, or the /// process does not start: enumeration is strict (any vault list or read /// failure aborts the boot), and an empty enumeration aborts too, because /// an empty vault on a site configured to import from it is far more /// likely a vault problem than a truly empty vault. A genuinely fresh -/// site simply omits `import_from`. Keeping it this absolute is what lets -/// the credential chain exclude vault entirely whenever `[secrets]` is -/// active -- there is no degraded half-migrated state to reason about. +/// site simply omits `import_from`. Keeping it strict gives a clean, +/// all-or-nothing bulk copy with no half-imported state to reason about. /// -/// Rolling upgrades still need care: a replica running the old vault -/// config can write credentials to vault after this import completes, and -/// those writes are stranded there. Site-explorer credential rotation is -/// the writer to worry about; keep it disabled until the whole fleet runs -/// the `[secrets]` config. +/// This is orthogonal to the reader chain and writer: an import seeds +/// Postgres with vault's secrets, but the read order and write target stay +/// exactly as `backends` / `writer` set them -- importing changes neither. +/// +/// Rolling upgrades still need care once writes move to Postgres: a replica +/// running an older config can write rotated credentials to its own writer, +/// where they are stranded. Site-explorer credential rotation is the writer +/// to worry about; keep it disabled until the whole fleet runs a consistent +/// config. async fn import_vault_secrets_once( db_pool: &PgPool, secrets_config: &SecretsConfig, @@ -502,10 +575,6 @@ async fn import_vault_secrets_once( kms: &dyn KmsBackend, vault_client: &ForgeVaultClient, ) -> eyre::Result<()> { - if secrets_config.import_from != Some(ImportSource::Vault) { - return Ok(()); - } - if is_import_complete(db_pool).await? { tracing::info!("Vault import already completed"); return Ok(()); diff --git a/crates/api-core/src/secrets/mod.rs b/crates/api-core/src/secrets/mod.rs index 975ea5e160..8883adc3fb 100644 --- a/crates/api-core/src/secrets/mod.rs +++ b/crates/api-core/src/secrets/mod.rs @@ -64,6 +64,25 @@ pub struct SecretsContext { pub kms: Arc, } +/// Reject a nonsensical `[secrets].backends` list at boot: empty (at least one +/// backend is required -- the local-override readers alone can't be the whole +/// credential source), or a backend named twice (dead after the first, by +/// first-match-wins). The order of the backends is the operator's choice. +pub fn validate_backends(backends: &[crate::cfg::file::CredentialBackend]) -> eyre::Result<()> { + if backends.is_empty() { + return Err(eyre::eyre!( + "[secrets].backends is empty; at least one backend (postgres or vault) is required" + )); + } + let unique: std::collections::HashSet<_> = backends.iter().collect(); + if unique.len() != backends.len() { + return Err(eyre::eyre!( + "[secrets].backends names a backend more than once" + )); + } + Ok(()) +} + /// The secret path that records vault import completion. It starts with a /// slash on purpose: real credential paths never do, so no `CredentialKey` /// can collide with it, and the kek-scoped journal queries exclude it. @@ -452,3 +471,32 @@ impl CredentialWriter for PostgresCredentialManager { } impl CredentialManager for PostgresCredentialManager {} + +#[cfg(test)] +mod backend_validation_tests { + use super::validate_backends; + use crate::cfg::file::CredentialBackend; + + #[test] + fn accepts_any_order_of_distinct_backends() { + assert!(validate_backends(&[CredentialBackend::Vault]).is_ok()); + assert!(validate_backends(&[CredentialBackend::Postgres]).is_ok()); + assert!( + validate_backends(&[CredentialBackend::Postgres, CredentialBackend::Vault]).is_ok() + ); + // Order is the operator's choice -- vault ahead of postgres is fine. + assert!( + validate_backends(&[CredentialBackend::Vault, CredentialBackend::Postgres]).is_ok() + ); + } + + #[test] + fn rejects_empty_backends() { + assert!(validate_backends(&[]).is_err()); + } + + #[test] + fn rejects_a_backend_named_twice() { + assert!(validate_backends(&[CredentialBackend::Vault, CredentialBackend::Vault]).is_err()); + } +}