From 7ec05142a85ccab71856141a49a4e4329d6b9aff Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Fri, 19 Sep 2025 17:13:25 +0200 Subject: [PATCH 1/7] feat: Add retirement duration for CA certificates --- Cargo.nix | 4 +- Cargo.toml | 2 +- deploy/helm/secret-operator/crds/crds.yaml | 14 +- .../secret-operator/pages/secretclass.adoc | 1 + rust/operator-binary/src/backend/tls/ca.rs | 229 +++++++++++++++++- rust/operator-binary/src/backend/tls/mod.rs | 72 +++++- rust/operator-binary/src/crd/mod.rs | 28 ++- rust/operator-binary/src/crd/v1alpha1_impl.rs | 4 + 8 files changed, 331 insertions(+), 23 deletions(-) diff --git a/Cargo.nix b/Cargo.nix index 464a50e2..c1b93238 100644 --- a/Cargo.nix +++ b/Cargo.nix @@ -10608,7 +10608,7 @@ rec { { name = "time"; packageId = "time"; - features = [ "parsing" ]; + features = [ "macros" "parsing" ]; } { name = "tokio"; @@ -11424,7 +11424,7 @@ rec { "std" = [ "alloc" "deranged/std" ]; "wasm-bindgen" = [ "dep:js-sys" ]; }; - resolvedDefaultFeatures = [ "alloc" "default" "formatting" "parsing" "std" ]; + resolvedDefaultFeatures = [ "alloc" "default" "formatting" "macros" "parsing" "std" ]; }; "time-core" = rec { crateName = "time-core"; diff --git a/Cargo.toml b/Cargo.toml index 32b5ee66..567bd763 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,7 +40,7 @@ socket2 = { version = "0.6", features = ["all"] } strum = { version = "0.27", features = ["derive"] } sys-mount = { version = "3.0", default-features = false } tempfile = "3.12" -time = { version = "0.3", features = ["parsing"] } +time = { version = "0.3", features = ["macros", "parsing"] } tokio = { version = "1.40", features = ["full"] } tokio-stream = { version = "0.1", features = ["net"] } tonic = "0.14" diff --git a/deploy/helm/secret-operator/crds/crds.yaml b/deploy/helm/secret-operator/crds/crds.yaml index cfa8b732..2c433ba8 100644 --- a/deploy/helm/secret-operator/crds/crds.yaml +++ b/deploy/helm/secret-operator/crds/crds.yaml @@ -96,6 +96,13 @@ spec: If `autoGenerate: true` 
then the Secret Operator will prepare a new CA certificate the old CA approaches expiration. If `autoGenerate: false` then the Secret Operator will log a warning instead. type: string + caCertificateRetirementDuration: + default: 1h + description: |- + Duration at the end of the CA certificate lifetime where no signed certificate will exist. + + Retired (or expired) CA certificates will not be published. + type: string keyGeneration: default: rsa: @@ -136,7 +143,12 @@ spec: type: object maxCertificateLifetime: default: 15d - description: Maximum lifetime the created certificates are allowed to have. In case consumers request a longer lifetime than allowed by this setting, the lifetime will be the minimum of both, so this setting takes precedence. The default value is 15 days. + description: |- + Maximum lifetime the created certificates are allowed to have. In case consumers request a longer lifetime than allowed by this setting, the lifetime will be the minimum of both, so this setting takes precedence. The default value is 15 days. + + The maximum lifetime must be less than a quarter of the active CA certificate lifetime where the active CA certificate lifetime is `ca.ca_certificate_lifetime - ca.ca_certificate_retirement_duration` to ensure that two subjects always have a common CA certificate in their trust stores – assuming that CAs are rotated at half of their active lifetimes. + + For instance, if a pod is created right before half of the active CA lifetime has passed, then it is signed by this CA but it does not know yet the new CA certificate which is created right afterwards. If another pod is created so that its certificate lifetime ends right after the first active CA lifetime then it is signed by the new CA. The `max_certificate_lifetime` must be chosen so that these two pods have no overlapping lifetimes, otherwise the first pod would see the second one signed by an unknown CA certificate. This can be achieved by the mentioned formula. 
type: string required: - ca diff --git a/docs/modules/secret-operator/pages/secretclass.adoc b/docs/modules/secret-operator/pages/secretclass.adoc index 10fb54a1..77e54804 100644 --- a/docs/modules/secret-operator/pages/secretclass.adoc +++ b/docs/modules/secret-operator/pages/secretclass.adoc @@ -92,6 +92,7 @@ Native support for customizing certificate lifetimes in Stacklet CRDs might be a Certificate authorities also have a limited lifetime, and need to be rotated before they expire to avoid cluster disruption. +// TODO Adapt this section If configured to provision its own CA (`autoTls.ca.autoGenerate`), the Secret Operator will create CA certificates that are valid for 365 days (≃ 1 year, configurable via `autoTls.ca.caCertificateLifetime`), and initiate rotation once less than half of that time remains. To avoid disruption and let the new CA propagate through the cluster, the Secret Operator will prefer using the oldest CA that will last for the entire lifetime of the issued certificate. diff --git a/rust/operator-binary/src/backend/tls/ca.rs b/rust/operator-binary/src/backend/tls/ca.rs index d7fdb2c8..71d56886 100644 --- a/rust/operator-binary/src/backend/tls/ca.rs +++ b/rust/operator-binary/src/backend/tls/ca.rs @@ -189,6 +189,10 @@ pub struct Config { /// The duration of any new CA certificates provisioned. pub ca_certificate_lifetime: Duration, + /// The retirement duration at the end of the CA certificate lifetime, where the CA is not used + /// to sign certificates and where the CA certificate does not have to be published. + pub ca_certificate_retirement_duration: Duration, + /// If no existing CA certificate outlives `rotate_if_ca_expires_before`, a new /// certificate will be generated. 
/// @@ -342,6 +346,7 @@ pub struct Manager { source_secret: ObjectRef<Secret>, certificate_authorities: Vec<CertificateAuthority>, additional_trusted_certificates: Vec<X509>, + ca_certificate_retirement_duration: Duration, } impl Manager { @@ -513,6 +518,7 @@ impl Manager { .get() .map(|secret| secret.to_object_ref(())) .unwrap_or_else(|| secret_ref.into()), + ca_certificate_retirement_duration: config.ca_certificate_retirement_duration, }) } @@ -616,25 +622,230 @@ impl Manager { /// Get an appropriate [`CertificateAuthority`] for signing a given certificate. pub fn find_certificate_authority_for_signing( &self, - valid_until_at_least: OffsetDateTime, + active_until_at_least: OffsetDateTime, ) -> Result<&CertificateAuthority, GetCaError> { use get_ca_error::*; - self.certificate_authorities - .iter() - .filter(|ca| ca.not_after > valid_until_at_least) + self.active_certificate_authorities(active_until_at_least) + .into_iter() // pick the oldest valid CA, since it will be trusted by the most peers .min_by_key(|ca| ca.not_after) .with_context(|| NoCaLivesLongEnoughSnafu { - cutoff: valid_until_at_least, + cutoff: active_until_at_least, secret: self.source_secret.clone(), }) } /// Get all active trust root certificates. 
- pub fn trust_roots(&self) -> impl IntoIterator<Item = &X509> + '_ { - self.certificate_authorities - .iter() + pub fn trust_roots( + &self, + active_until_at_least: OffsetDateTime, + ) -> impl IntoIterator<Item = &X509> + '_ { + self.active_certificate_authorities(active_until_at_least) + .into_iter() .map(|ca| &ca.certificate) - .chain(&self.additional_trusted_certificates) + .chain(self.active_additional_trusted_certificates(active_until_at_least)) + } + + /// Returns all certificate authorities which are not retired or expired + fn active_certificate_authorities( + &self, + active_until_at_least: OffsetDateTime, + ) -> impl IntoIterator<Item = &CertificateAuthority> { + self.certificate_authorities.iter().filter(move |ca| { + ca.not_after - self.ca_certificate_retirement_duration >= active_until_at_least + }) + } + + /// Returns all additional trusted certificates which are not retired or expired + fn active_additional_trusted_certificates( + &self, + active_until_at_least: OffsetDateTime, + ) -> impl IntoIterator<Item = &X509> + '_ { + self.additional_trusted_certificates + .iter() + .filter(move |cert| { + asn1time_to_offsetdatetime(cert.not_after()).is_ok_and(|not_after| { + not_after - self.ca_certificate_retirement_duration >= active_until_at_least + }) + }) + } +} + +#[cfg(test)] +mod tests { + use kube_runtime::reflector::ObjectRef; + use openssl::{ + asn1::{Asn1Integer, Asn1Time}, + bn::BigNum, + hash::MessageDigest, + pkey::{PKey, Private}, + rsa::Rsa, + x509::{X509, X509Builder}, + }; + use stackable_operator::{ + k8s_openapi::{ByteString, api::core::v1::Secret}, + shared::time::Duration, + }; + use stackable_secret_operator_utils::crd::SecretReference; + use time::{OffsetDateTime, macros::datetime}; + + use super::{CertificateAuthority, Manager}; + + fn create_certificate( + serial_number: u32, + not_before: OffsetDateTime, + not_after: OffsetDateTime, + ) -> Result<(X509, PKey<Private>), openssl::error::ErrorStack> { + let key_pair = Rsa::generate(512)?; + let pkey = PKey::try_from(key_pair)?; + + let mut x509_builder = 
X509Builder::new()?; + x509_builder.set_serial_number( + Asn1Integer::from_bn(BigNum::from_u32(serial_number)?.as_ref())?.as_ref(), + )?; + x509_builder.set_not_before(Asn1Time::from_unix(not_before.unix_timestamp())?.as_ref())?; + x509_builder.set_not_after(Asn1Time::from_unix(not_after.unix_timestamp())?.as_ref())?; + x509_builder.set_pubkey(&pkey)?; + x509_builder.sign(&pkey, MessageDigest::sha256())?; + let x509 = x509_builder.build(); + + Ok((x509, pkey)) + } + + fn create_certificate_authority( + serial_number: u32, + not_before: OffsetDateTime, + not_after: OffsetDateTime, + ) -> Result<CertificateAuthority, openssl::error::ErrorStack> { + let (certificate, pkey) = create_certificate(serial_number, not_before, not_after)?; + + let key_certificate = "crt"; + let key_private_key = "key"; + + Ok(CertificateAuthority::from_secret_data( + &[ + ( + key_certificate.to_owned(), + ByteString(certificate.to_pem()?), + ), + ( + key_private_key.to_owned(), + ByteString(pkey.private_key_to_pem_pkcs8()?), + ), + ] + .into(), + &SecretReference { + namespace: "default".to_owned(), + name: "secret-provisioner-tls-ca".to_owned(), + }, + key_certificate, + key_private_key, + ) + .expect("")) + } + + #[test] + fn test_find_certificate_authority_for_signing() { + let ca_certificate_retirement_duration = Duration::from_hours_unchecked(1); + + let ca1 = create_certificate_authority( + 1, + datetime!(2025-01-01 0:00 UTC), + datetime!(2025-01-01 12:00 UTC), + ) + .expect("should create a valid certificate"); + let ca2 = create_certificate_authority( + 2, + datetime!(2025-01-01 6:00 UTC), + datetime!(2025-01-01 18:00 UTC), + ) + .expect("should create a valid certificate"); + + let manager = Manager { + source_secret: ObjectRef::<Secret>::new("secret-provisioner-tls-ca"), + certificate_authorities: vec![ca1, ca2], + additional_trusted_certificates: vec![], + ca_certificate_retirement_duration, + }; + + let signing_ca_at_11_00 = + manager.find_certificate_authority_for_signing(datetime!(2025-01-01 11:00 UTC)); + + assert_eq!( + 
Some("CertificateAuthority(serial=1)".to_owned()), + signing_ca_at_11_00.ok().map(|ca| format!("{}", ca)) + ); + + let signing_ca_at_11_01 = + manager.find_certificate_authority_for_signing(datetime!(2025-01-01 11:01 UTC)); + + assert_eq!( + Some("CertificateAuthority(serial=2)".to_owned()), + signing_ca_at_11_01.ok().map(|ca| format!("{}", ca)) + ); + } + + #[test] + fn test_trust_roots() { + let ca_certificate_retirement_duration = Duration::from_hours_unchecked(1); + + let ca1 = create_certificate_authority( + 1, + datetime!(2025-01-01 0:00 UTC), + datetime!(2025-01-01 12:00 UTC), + ) + .expect("should create a valid certificate"); + let ca1_certificate = ca1.certificate.clone(); + + let ca2 = create_certificate_authority( + 2, + datetime!(2025-01-01 6:00 UTC), + datetime!(2025-01-01 18:00 UTC), + ) + .expect("should create a valid certificate"); + let ca2_certificate = ca2.certificate.clone(); + + let (trust_root1, _) = create_certificate( + 3, + datetime!(2025-01-01 0:00 UTC), + datetime!(2025-01-01 12:00 UTC), + ) + .expect("should create a valid certificate"); + + let (trust_root2, _) = create_certificate( + 4, + datetime!(2025-01-01 6:00 UTC), + datetime!(2025-01-01 18:00 UTC), + ) + .expect("should create a valid certificate"); + + let manager = Manager { + source_secret: ObjectRef::<Secret>::new("secret-provisioner-tls-ca"), + certificate_authorities: vec![ca1, ca2], + additional_trusted_certificates: vec![trust_root1.clone(), trust_root2.clone()], + ca_certificate_retirement_duration, + }; + + let trust_roots_at_11_00: Vec<&X509> = manager + .trust_roots(datetime!(2025-01-01 11:00 UTC)) + .into_iter() + .collect(); + + assert_eq!( + vec![ + &ca1_certificate, + &ca2_certificate, + &trust_root1, + &trust_root2 + ], + trust_roots_at_11_00 + ); + + let trust_roots_at_11_01: Vec<&X509> = manager + .trust_roots(datetime!(2025-01-01 11:01 UTC)) + .into_iter() + .collect(); + + assert_eq!(vec![&ca2_certificate, &trust_root2], trust_roots_at_11_01); } } diff --git 
a/rust/operator-binary/src/backend/tls/mod.rs b/rust/operator-binary/src/backend/tls/mod.rs index 3713935c..9cc245e2 100644 --- a/rust/operator-binary/src/backend/tls/mod.rs +++ b/rust/operator-binary/src/backend/tls/mod.rs @@ -1,6 +1,6 @@ //! Dynamically provisions TLS certificates -use std::ops::Range; +use std::{cmp::min, ops::Range}; use async_trait::async_trait; use openssl::{ @@ -20,7 +20,7 @@ use openssl::{ }, }; use rand::Rng; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu, ensure}; use stackable_operator::{ k8s_openapi::chrono::{self, FixedOffset, TimeZone}, shared::time::Duration, @@ -40,15 +40,33 @@ use crate::{ mod ca; +/// Fraction of the active lifetime of a CA after which it is rotated. +// Use a fraction instead of a factor because [`Duration::mul`] is not defined for floating point +// numbers. +pub const CA_ROTATION_FRACTION: u32 = 2; + /// How long CA certificates should last for. Also used for calculating when they should be rotated. -/// [`DEFAULT_MAX_CERT_LIFETIME`] must be less than half of [`DEFAULT_CA_CERT_LIFETIME`]. pub const DEFAULT_CA_CERT_LIFETIME: Duration = Duration::from_days_unchecked(365); +/// Duration at the end of the CA certificate lifetime where no certificates signed by the CA +/// certificate may exist. +/// +/// The CA certificate is not published anymore while in retirement to avoid that pods get almost +/// expired certificates. 
+/// +/// see https://github.com/stackabletech/secret-operator/issues/625 +pub const DEFAULT_CA_CERT_RETIREMENT_DURATION: Duration = Duration::from_hours_unchecked(1); + /// As the Pods will be evicted [`DEFAULT_CERT_RESTART_BUFFER`] before /// the cert actually expires, this results in a restart in approx every 2 weeks, /// which matches the rolling re-deploy of k8s nodes of e.g.: /// * 1 week for IONOS /// * 2 weeks for some on-prem k8s clusters +/// +/// [`DEFAULT_MAX_CERT_LIFETIME`] must be less than `([DEFAULT_CA_CERT_LIFETIME] - +/// [DEFAULT_CA_CERT_RETIREMENT_DURATION]) / [CA_ROTATION_FACTOR] / 2`. +/// +/// see the explanation in [`AutoTlsBackend::max_certificate_lifetime`] pub const DEFAULT_MAX_CERT_LIFETIME: Duration = Duration::from_days_unchecked(15); /// Default lifetime of certs when no annotations are set on the Volume. @@ -94,6 +112,12 @@ pub enum Error { #[snafu(display("invalid certificate lifetime"))] InvalidCertLifetime { source: DateTimeOutOfBoundsError }, + #[snafu(display("retirement duration is not shorter than the CA certificate lifetime"))] + RetirementDurationNotShorterThanCertificateLifetime { + ca_certificate_lifetime: Duration, + ca_certificate_retirement_duration: Duration, + }, + #[snafu(display( "certificate expiring at {expires_at} would schedule the pod to be restarted at {restart_at}, which is in the past (and we don't have a time machine (yet and/or anymore))" ))] @@ -123,6 +147,9 @@ impl SecretBackendError for Error { Error::BuildCertificate { .. } => tonic::Code::FailedPrecondition, Error::SerializeCertificate { .. } => tonic::Code::FailedPrecondition, Error::InvalidCertLifetime { .. } => tonic::Code::Internal, + Error::RetirementDurationNotShorterThanCertificateLifetime { .. } => { + tonic::Code::InvalidArgument + } Error::TooShortCertLifetimeRequiresTimeTravel { .. } => tonic::Code::InvalidArgument, Error::JitterOutOfRange { .. 
} => tonic::Code::InvalidArgument, } @@ -140,6 +167,7 @@ impl SecretBackendError for Error { Error::BuildCertificate { .. } => None, Error::SerializeCertificate { .. } => None, Error::InvalidCertLifetime { .. } => None, + Error::RetirementDurationNotShorterThanCertificateLifetime { .. } => None, Error::TooShortCertLifetimeRequiresTimeTravel { .. } => None, Error::JitterOutOfRange { .. } => None, } @@ -166,11 +194,33 @@ impl TlsGenerate { secret: ca_secret, auto_generate: auto_generate_ca, ca_certificate_lifetime, + ca_certificate_retirement_duration, key_generation, }: &v1alpha1::AutoTlsCa, additional_trust_roots: &[v1alpha1::AdditionalTrustRoot], max_cert_lifetime: Duration, ) -> Result { + ensure!( + ca_certificate_retirement_duration < ca_certificate_lifetime, + RetirementDurationNotShorterThanCertificateLifetimeSnafu { + ca_certificate_lifetime: *ca_certificate_lifetime, + ca_certificate_retirement_duration: *ca_certificate_retirement_duration + } + ); + + let active_ca_certificate_lifetime = + *ca_certificate_lifetime - *ca_certificate_retirement_duration; + + // Safe maximum certificate lifetime that ensures that the lifetimes of two certificates do + // not overlap if their issuer certificates are not known to each other. 
+ let safe_max_cert_lifetime = active_ca_certificate_lifetime / CA_ROTATION_FRACTION / 2; + + if max_cert_lifetime > safe_max_cert_lifetime { + tracing::warn!(%max_cert_lifetime, %safe_max_cert_lifetime, "maxCertificateLifetime is longer than (caCertificateLifetime - caCertificateRetirementDuration) / {} and will be capped", CA_ROTATION_FRACTION * 2); + } + + let max_cert_lifetime = min(max_cert_lifetime, safe_max_cert_lifetime); + Ok(Self { ca_manager: ca::Manager::load_or_create( client, @@ -179,7 +229,10 @@ impl TlsGenerate { &ca::Config { manage_ca: *auto_generate_ca, ca_certificate_lifetime: *ca_certificate_lifetime, - rotate_if_ca_expires_before: Some(*ca_certificate_lifetime / 2), + ca_certificate_retirement_duration: *ca_certificate_retirement_duration, + rotate_if_ca_expires_before: Some( + active_ca_certificate_lifetime / CA_ROTATION_FRACTION, + ), key_generation: key_generation.clone(), }, ) @@ -347,7 +400,7 @@ impl SecretBackend for TlsGenerate { SecretContents::new(SecretData::WellKnown(WellKnownSecretData::TlsPem( well_known::TlsPem { ca_pem: iterator_try_concat_bytes( - self.ca_manager.trust_roots().into_iter().map(|ca| { + self.ca_manager.trust_roots(now).into_iter().map(|ca| { ca.to_pem() .context(SerializeCertificateSnafu { tpe: CertType::Ca }) }), @@ -374,14 +427,15 @@ impl SecretBackend for TlsGenerate { &self, _selector: &super::TrustSelector, ) -> Result { + let now = OffsetDateTime::now_utc(); Ok(SecretContents::new(SecretData::WellKnown( WellKnownSecretData::TlsPem(well_known::TlsPem { - ca_pem: iterator_try_concat_bytes(self.ca_manager.trust_roots().into_iter().map( - |ca| { + ca_pem: iterator_try_concat_bytes( + self.ca_manager.trust_roots(now).into_iter().map(|ca| { ca.to_pem() .context(SerializeCertificateSnafu { tpe: CertType::Ca }) - }, - ))?, + }), + )?, certificate_pem: None, key_pem: None, }), diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 41b156f7..275ff775 100644 --- 
a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -126,6 +126,20 @@ pub mod versioned { /// In case consumers request a longer lifetime than allowed by this setting, /// the lifetime will be the minimum of both, so this setting takes precedence. /// The default value is 15 days. + /// + /// The maximum lifetime must be less than a quarter of the active CA certificate lifetime + /// where the active CA certificate lifetime is `ca.ca_certificate_lifetime - + /// ca.ca_certificate_retirement_duration` to ensure that two subjects always have a common + /// CA certificate in their trust stores – assuming that CAs are rotated at half of their + /// active lifetimes. + /// + /// For instance, if a pod is created right before half of the active CA lifetime has + /// passed, then it is signed by this CA but it does not know yet the new CA certificate + /// which is created right afterwards. If another pod is created so that its certificate + /// lifetime ends right after the first active CA lifetime then it is signed by the new CA. + /// The `max_certificate_lifetime` must be chosen so that these two pods have no + /// overlapping lifetimes, otherwise the first pod would see the second one signed by an + /// unknown CA certificate. This can be achieved by the mentioned formula. #[serde(default = "AutoTlsBackend::default_max_certificate_lifetime")] pub max_certificate_lifetime: Duration, } @@ -152,6 +166,12 @@ pub mod versioned { #[serde(default = "AutoTlsCa::default_ca_certificate_lifetime")] pub ca_certificate_lifetime: Duration, + /// Duration at the end of the CA certificate lifetime where no signed certificate will exist. + /// + /// Retired (or expired) CA certificates will not be published. + #[serde(default = "AutoTlsCa::default_ca_certificate_retirement_duration")] + pub ca_certificate_retirement_duration: Duration, + /// The algorithm used to generate a key pair and required configuration settings. 
/// Currently only RSA and a key length of 2048, 3072 or 4096 bits can be configured. #[serde(default)] @@ -342,7 +362,10 @@ pub mod versioned { mod test { use super::*; use crate::{ - backend::tls::{DEFAULT_CA_CERT_LIFETIME, DEFAULT_MAX_CERT_LIFETIME}, + backend::tls::{ + DEFAULT_CA_CERT_LIFETIME, DEFAULT_CA_CERT_RETIREMENT_DURATION, + DEFAULT_MAX_CERT_LIFETIME, + }, crd::v1alpha1::{ AdditionalTrustRoot, AutoTlsBackend, AutoTlsCa, CertificateKeyGeneration, SecretClass, SecretClassBackend, SecretClassSpec, @@ -381,6 +404,7 @@ mod test { }, auto_generate: false, ca_certificate_lifetime: DEFAULT_CA_CERT_LIFETIME, + ca_certificate_retirement_duration: DEFAULT_CA_CERT_RETIREMENT_DURATION, key_generation: CertificateKeyGeneration::Rsa { length: CertificateKeyGeneration::RSA_KEY_LENGTH_3072 } @@ -405,6 +429,7 @@ mod test { namespace: default autoGenerate: true caCertificateLifetime: 100d + caCertificateRetirementDuration: 1d additionalTrustRoots: - configMap: name: tls-root-ca-config-map @@ -428,6 +453,7 @@ mod test { }, auto_generate: true, ca_certificate_lifetime: Duration::from_days_unchecked(100), + ca_certificate_retirement_duration: Duration::from_days_unchecked(1), key_generation: CertificateKeyGeneration::default() }, additional_trust_roots: vec![ diff --git a/rust/operator-binary/src/crd/v1alpha1_impl.rs b/rust/operator-binary/src/crd/v1alpha1_impl.rs index e3010ef8..88588d0c 100644 --- a/rust/operator-binary/src/crd/v1alpha1_impl.rs +++ b/rust/operator-binary/src/crd/v1alpha1_impl.rs @@ -124,6 +124,10 @@ impl AutoTlsCa { pub(crate) fn default_ca_certificate_lifetime() -> Duration { backend::tls::DEFAULT_CA_CERT_LIFETIME } + + pub(crate) fn default_ca_certificate_retirement_duration() -> Duration { + backend::tls::DEFAULT_CA_CERT_RETIREMENT_DURATION + } } impl CertificateKeyGeneration { From 44f62dcf80259495f3147d388512576e15181319 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 23 Oct 2025 11:25:58 +0200 Subject: [PATCH 2/7] docs: Document the CA 
certificate retirement --- docs/modules/secret-operator/pages/secretclass.adoc | 11 +++++++---- docs/modules/secret-operator/pages/truststore.adoc | 2 ++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/modules/secret-operator/pages/secretclass.adoc b/docs/modules/secret-operator/pages/secretclass.adoc index bc67d5b8..c6983a22 100644 --- a/docs/modules/secret-operator/pages/secretclass.adoc +++ b/docs/modules/secret-operator/pages/secretclass.adoc @@ -88,17 +88,18 @@ In case an operator sets a higher lifetime, a tracking issue must be created to Users can use xref:concepts:overrides.adoc#pod-overrides[podOverrides] to extend the certificate lifetime by adding volume annotations. Native support for customizing certificate lifetimes in Stacklet CRDs might be added in the future. +[#ca-rotation] ==== Certificate Authority rotation Certificate authorities also have a limited lifetime, and need to be rotated before they expire to avoid cluster disruption. -// TODO Adapt this section -If configured to provision its own CA (`autoTls.ca.autoGenerate`), the Secret Operator will create CA certificates that are valid for 365 days (≃ 1 year, configurable via `autoTls.ca.caCertificateLifetime`), and initiate rotation once less than half of that time remains. +If configured to provision its own CA (`autoTls.ca.autoGenerate`), the Secret Operator will create CA certificates that are valid for 365 days (≃ 1 year, configurable via `autoTls.ca.caCertificateLifetime`). +The CA certificate is retired one hour before its expiration (configurable via `autoTls.ca.caCertificateRetirementDuration`), to avoid that an almost expired certificate must be deployed, which causes problems in some products, e.g. OpenSearch. +Once less than half of the active lifetime (= lifetime - retirement duration) remains, the rotation is initiated. 
To avoid disruption and let the new CA propagate through the cluster, the Secret Operator will prefer using the oldest CA that will last for the entire lifetime of the issued certificate. -NOTE: Expired CA certificates will currently not be deleted automatically. -They should be cleaned up manually. +NOTE: Expired and retired CA certificates will not be deployed. ==== Reference @@ -113,6 +114,7 @@ spec: namespace: default autoGenerate: true caCertificateLifetime: 700d + caCertificateRetirementDuration: 1d keyGeneration: rsa: length: 4096 @@ -132,6 +134,7 @@ spec: and `ca.key` respectively. `autoTls.ca.autoGenerate`:: Whether the certificate authority should be provisioned and managed by the Secret Operator. `autoTls.ca.caCertificateLifetime` :: The lifetime of the certificate authority's root certificate. +`autoTls.ca.caCertificateRetirementDuration` :: Duration at the end of the CA certificate lifetime where no signed certificate will exist. `autoTls.ca.keyGeneration`:: Configures how keys should be generated. `autoTls.ca.keyGeneration.rsa`:: Declares that keys should be generated using the RSA algorithm. `autoTls.ca.keyGeneration.rsa.length`:: The amount of bits used for generating the RSA key pair. Currently, `2048`, `3072` and `4096` are supported. Defaults to `2048` bits. diff --git a/docs/modules/secret-operator/pages/truststore.adoc b/docs/modules/secret-operator/pages/truststore.adoc index ba3c4525..8cfb1c53 100644 --- a/docs/modules/secret-operator/pages/truststore.adoc +++ b/docs/modules/secret-operator/pages/truststore.adoc @@ -20,5 +20,7 @@ include::example$truststore-tls.yaml[] This will create a ConfigMap (or `Secret` based on `targetKind`) named `truststore-pem` containing a `ca.crt` with the trust root certificates. It can then either be mounted into a Pod or retrieved and used from outside of Kubernetes. 
+Expired or retired (see xref:secretclass.adoc#ca-rotation[Certificate Authority rotation]) certificates will not be published, because they should not be needed and some products, e.g. OpenSearch, have problems if they are present at startup. + NOTE: Make sure to have a procedure for updating the retrieved certificates. The Secret Operator will automatically rotate the xref:secretclass.adoc#backend-autotls[autoTls] certificate authority as needed, but all trust roots will require some form of update occasionally. From 25fd0b8bba6d031d2fe2fb19271a725c42536728 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 23 Oct 2025 11:32:27 +0200 Subject: [PATCH 3/7] chore: Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4c48d21..360da609 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ All notable changes to this project will be documented in this file. - `EOS_DISABLED` (`--eos-disabled`) to disable the EoS checker completely. - Support exporting the TrustStore CA certificate information to Secrets or ConfigMaps ([#597]). - New helm value for `priorityClassName` ([#641]). +- CA certificates are retired one hour (configurable via + `autoTls.ca.caCertificateRetirementDuration`) before they expire ([#650]). ### Changed @@ -29,6 +31,8 @@ All notable changes to this project will be documented in this file. - `kubeletDir` has been move to `csiNodeDriver.kubeletDir`. - Bump csi-node-driver-registrar to `v2.15.0` ([#642]). - Bump csi-provisioner to `v5.3.0` ([#643]). +- BREAKING: Expired and retired CA certificates are no longer published in Volumes and TrustStores + ([#650]). [#597]: https://github.com/stackabletech/secret-operator/pull/597 [#636]: https://github.com/stackabletech/secret-operator/pull/636 @@ -37,6 +41,7 @@ All notable changes to this project will be documented in this file. 
[#643]: https://github.com/stackabletech/secret-operator/pull/643 [#644]: https://github.com/stackabletech/secret-operator/pull/644 [#645]: https://github.com/stackabletech/secret-operator/pull/645 +[#650]: https://github.com/stackabletech/secret-operator/pull/650 ## [25.7.0] - 2025-07-23 From 6585fe900c417331e8adb9d771a1977a65a94c29 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 23 Oct 2025 11:49:10 +0200 Subject: [PATCH 4/7] test: Add comment to `expect` --- rust/operator-binary/src/backend/tls/ca.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/operator-binary/src/backend/tls/ca.rs b/rust/operator-binary/src/backend/tls/ca.rs index 3fc4d101..0ff1cb51 100644 --- a/rust/operator-binary/src/backend/tls/ca.rs +++ b/rust/operator-binary/src/backend/tls/ca.rs @@ -741,7 +741,7 @@ mod tests { key_certificate, key_private_key, ) - .expect("")) + .expect("should load the valid certificates from the given Secret data")) } #[test] From 0328e3ffd7122f27d7ac31fcb03d03076602588c Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 23 Oct 2025 12:04:46 +0200 Subject: [PATCH 5/7] chore: Fix rustdoc warnings --- rust/operator-binary/src/backend/tls/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/operator-binary/src/backend/tls/mod.rs b/rust/operator-binary/src/backend/tls/mod.rs index 31cf428e..1339f756 100644 --- a/rust/operator-binary/src/backend/tls/mod.rs +++ b/rust/operator-binary/src/backend/tls/mod.rs @@ -54,7 +54,7 @@ pub const DEFAULT_CA_CERT_LIFETIME: Duration = Duration::from_days_unchecked(365 /// The CA certificate is not published anymore while in retirement to avoid that pods get almost /// expired certificates. 
/// -/// see https://github.com/stackabletech/secret-operator/issues/625 +/// see <https://github.com/stackabletech/secret-operator/issues/625> pub const DEFAULT_CA_CERT_RETIREMENT_DURATION: Duration = Duration::from_hours_unchecked(1); /// As the Pods will be evicted [`DEFAULT_CERT_RESTART_BUFFER`] before @@ -66,7 +66,7 @@ pub const DEFAULT_CA_CERT_RETIREMENT_DURATION: Duration = Duration::from_hours_u /// [`DEFAULT_MAX_CERT_LIFETIME`] must be less than `([DEFAULT_CA_CERT_LIFETIME] - /// [DEFAULT_CA_CERT_RETIREMENT_DURATION]) / [CA_ROTATION_FACTOR] / 2`. /// -/// see the explanation in [`AutoTlsBackend::max_certificate_lifetime`] +/// see the explanation in [`v1alpha2::AutoTlsBackend::max_certificate_lifetime`] pub const DEFAULT_MAX_CERT_LIFETIME: Duration = Duration::from_days_unchecked(15); /// Default lifetime of certs when no annotations are set on the Volume. From 37577156bef4f3b116169c4a9a6458e57a0de6a3 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 23 Oct 2025 17:01:54 +0200 Subject: [PATCH 6/7] chore: Address review comments --- extra/crds.yaml | 6 ++++-- rust/operator-binary/src/backend/tls/ca.rs | 12 ++++++------ rust/operator-binary/src/backend/tls/mod.rs | 14 ++++++++------ rust/operator-binary/src/crd/secret_class/mod.rs | 3 ++- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/extra/crds.yaml b/extra/crds.yaml index f5bd89dd..e3382133 100644 --- a/extra/crds.yaml +++ b/extra/crds.yaml @@ -119,7 +119,8 @@ spec: description: |- Duration at the end of the CA certificate lifetime where no signed certificate will exist. - Retired (or expired) CA certificates will not be published. + Retired (or expired) CA certificates will not be published and will not be used for + signing leaf certificates. type: string keyGeneration: default: @@ -545,7 +546,8 @@ spec: description: |- Duration at the end of the CA certificate lifetime where no signed certificate will exist. - Retired (or expired) CA certificates will not be published. 
+ Retired (or expired) CA certificates will not be published and will not be used for + signing leaf certificates. type: string keyGeneration: default: diff --git a/rust/operator-binary/src/backend/tls/ca.rs b/rust/operator-binary/src/backend/tls/ca.rs index 0ff1cb51..b88c8536 100644 --- a/rust/operator-binary/src/backend/tls/ca.rs +++ b/rust/operator-binary/src/backend/tls/ca.rs @@ -753,13 +753,13 @@ mod tests { datetime!(2025-01-01 0:00 UTC), datetime!(2025-01-01 12:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let ca2 = create_certificate_authority( 2, datetime!(2025-01-01 6:00 UTC), datetime!(2025-01-01 18:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let manager = Manager { source_secret: ObjectRef::<Secret>::new("secret-provisioner-tls-ca"), @@ -794,7 +794,7 @@ mod tests { datetime!(2025-01-01 0:00 UTC), datetime!(2025-01-01 12:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let ca1_certificate = ca1.certificate.clone(); let ca2 = create_certificate_authority( @@ -802,7 +802,7 @@ mod tests { datetime!(2025-01-01 6:00 UTC), datetime!(2025-01-01 18:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let ca2_certificate = ca2.certificate.clone(); let (trust_root1, _) = create_certificate( @@ -810,14 +810,14 @@ mod tests { datetime!(2025-01-01 0:00 UTC), datetime!(2025-01-01 12:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let (trust_root2, _) = create_certificate( 4, datetime!(2025-01-01 6:00 UTC), datetime!(2025-01-01 18:00 UTC), ) - .expect("should create a valid certificate"); + .expect("must be able to create a valid certificate"); let manager = Manager { source_secret: ObjectRef::<Secret>::new("secret-provisioner-tls-ca"), diff --git 
a/rust/operator-binary/src/backend/tls/mod.rs b/rust/operator-binary/src/backend/tls/mod.rs index 1339f756..fd7765fe 100644 --- a/rust/operator-binary/src/backend/tls/mod.rs +++ b/rust/operator-binary/src/backend/tls/mod.rs @@ -428,14 +428,16 @@ impl SecretBackend for TlsGenerate { _selector: &super::TrustSelector, ) -> Result { let now = OffsetDateTime::now_utc(); + let active_trust_roots = self.ca_manager.trust_roots(now); + let pems = active_trust_roots.into_iter().map(|ca| { + ca.to_pem() + .context(SerializeCertificateSnafu { tpe: CertType::Ca }) + }); + let concatenated_pem = iterator_try_concat_bytes(pems)?; + Ok(SecretContents::new(SecretData::WellKnown( WellKnownSecretData::TlsPem(well_known::TlsPem { - ca_pem: iterator_try_concat_bytes( - self.ca_manager.trust_roots(now).into_iter().map(|ca| { - ca.to_pem() - .context(SerializeCertificateSnafu { tpe: CertType::Ca }) - }), - )?, + ca_pem: concatenated_pem, certificate_pem: None, key_pem: None, }), diff --git a/rust/operator-binary/src/crd/secret_class/mod.rs b/rust/operator-binary/src/crd/secret_class/mod.rs index 930e5a04..6e363708 100644 --- a/rust/operator-binary/src/crd/secret_class/mod.rs +++ b/rust/operator-binary/src/crd/secret_class/mod.rs @@ -164,7 +164,8 @@ pub mod versioned { /// Duration at the end of the CA certificate lifetime where no signed certificate will exist. /// - /// Retired (or expired) CA certificates will not be published. + /// Retired (or expired) CA certificates will not be published and will not be used for + /// signing leaf certificates. 
#[serde(default = "v1alpha2::AutoTlsCa::default_ca_certificate_retirement_duration")] pub ca_certificate_retirement_duration: Duration, From dbf2eb7d04cb65d4ee3820e0d03deb0d5f888621 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Fri, 24 Oct 2025 16:04:49 +0200 Subject: [PATCH 7/7] chore: Explain the calculation of safe_max_cert_lifetime --- rust/operator-binary/src/backend/tls/mod.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rust/operator-binary/src/backend/tls/mod.rs b/rust/operator-binary/src/backend/tls/mod.rs index fd7765fe..86a7bcaf 100644 --- a/rust/operator-binary/src/backend/tls/mod.rs +++ b/rust/operator-binary/src/backend/tls/mod.rs @@ -213,6 +213,16 @@ impl TlsGenerate { // Safe maximum certificate lifetime that ensures that the lifetimes of two certificates do // not overlap if their issuer certificates are not known to each other. + // + // For instance, if the `active_ca_certificate_lifetime` is 20 days, then it is rotated + // after 10 days (= active_ca_certificate_lifetime / CA_ROTATION_FRACTION). Let us assume + // that `max_cert_lifetime` is 6 days (> 10 days / 2). If pod 1 is generated on day 9, it + // can be alive until day 15 and only knows CA 1. If pod 2 is generated on day 15, its + // certificate is signed by CA 2 because CA 1 expires while pod 2 is alive. On day 15, pod + // 1 would not be able to communicate with pod 2 because it has no knowledge of CA 2. To + // ensure that the lifetimes of these two pods cannot overlap, the + // `safe_max_cert_lifetime` is half of the 10 days (= active_ca_certificate_lifetime / + // CA_ROTATION_FRACTION / 2). let safe_max_cert_lifetime = active_ca_certificate_lifetime / CA_ROTATION_FRACTION / 2; if max_cert_lifetime > safe_max_cert_lifetime {