Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 38 additions & 15 deletions crates/api-core/src/dpa/lockdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ use sha2::Sha256;
use sqlx::PgPool;

// CURRENT_LOCKDOWN_IKM_VERSION is the site-wide lockdown IKM version the
// lock/unlock flow derives keys from, and the version recorded as each card's
// convergence target. Hardcoded to 0 until the rotation engine lands: rotating
// the IKM (v0 -> v1) is what advances this, and that logic will own making newly
// ingested NICs lock under the new IKM while already-locked cards migrate.
pub const CURRENT_LOCKDOWN_IKM_VERSION: u32 = 0;

// LOCKDOWN_KEY_LENGTH is the max length of the supported
Expand Down Expand Up @@ -144,22 +146,26 @@ fn lockdown_ikm_key(version: u32) -> CredentialKey {
}
}

// fetch_kdf_secret fetches the IKM for the KDF from the
// dedicated site-wide lockdown credential, decoupled from the BMC root so the
// two can be rotated independently.
// fetch_kdf_secret fetches the IKM for the KDF from the dedicated site-wide
// lockdown credential, decoupled from the BMC root so the two can be rotated
// independently.
//
// Returns the IKM version it resolved alongside the secret so the caller can
// durably record the exact version a card is locked under, rather than
// re-reading the (mutable) site-wide target later. Today the version is
// `CURRENT_LOCKDOWN_IKM_VERSION`; the rotation engine will own advancing it.
async fn fetch_kdf_secret(
credential_reader: &dyn CredentialReader,
) -> Result<String, eyre::Report> {
let ikm_key = lockdown_ikm_key(CURRENT_LOCKDOWN_IKM_VERSION);
) -> Result<(u32, String), eyre::Report> {
let version = CURRENT_LOCKDOWN_IKM_VERSION;
let ikm_key = lockdown_ikm_key(version);
let credentials = credential_reader
.get_credentials(&ikm_key)
.await?
.ok_or_else(|| {
eyre::eyre!("lockdown IKM v{CURRENT_LOCKDOWN_IKM_VERSION} not found; site not seeded")
})?;
.ok_or_else(|| eyre::eyre!("lockdown IKM v{version} not found; site not seeded"))?;
let Credentials::UsernamePassword { password, .. } = credentials;

Ok(password)
Ok((version, password))
}

// ensure_lockdown_ikm_seeded idempotently seeds the dedicated site-wide
Expand Down Expand Up @@ -225,16 +231,32 @@ pub async fn ensure_lockdown_ikm_seeded(
}
}

// SupernicLockdownKey is a derived lockdown key together with the site-wide
// lockdown IKM version it was derived from. The version travels with the key so
// the lock flow can durably record the exact version the card is locked under.
pub struct SupernicLockdownKey {
    // The 16-character hex lockdown key sent to the device.
    pub key: String,
    // The site-wide lockdown IKM version `key` was derived from.
    pub ikm_version: u32,
}

// Debug is written by hand rather than derived so that formatting this type
// with `{:?}` (logs, error reports, assertion output) never prints the
// lockdown key itself; the IKM version is not secret and is shown as-is.
impl std::fmt::Debug for SupernicLockdownKey {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SupernicLockdownKey")
            .field("key", &"<redacted>")
            .field("ikm_version", &self.ikm_version)
            .finish()
    }
}

// build_supernic_lockdown_key builds a single lockdown key using
// the latest KdfContextVersion. Use this for locking a card.
//
// Returns the derived key together with the IKM version it used (see
// `SupernicLockdownKey`). The unlock flow can ignore the version; the lock flow
// persists it so the recorded convergence version matches what actually locked
// the card.
pub async fn build_supernic_lockdown_key(
db_reader: &PgPool,
dpa_interface_id: DpaInterfaceId,
credential_reader: &dyn CredentialReader,
) -> Result<String, eyre::Report> {
) -> Result<SupernicLockdownKey, eyre::Report> {
let ctx = build_kdf_context(db_reader, dpa_interface_id).await?;
let secret = fetch_kdf_secret(credential_reader).await?;
build_lockdown_key(secret.as_bytes(), &ctx, KdfContextVersion::V1)
let (ikm_version, secret) = fetch_kdf_secret(credential_reader).await?;
let key = build_lockdown_key(secret.as_bytes(), &ctx, KdfContextVersion::V1)?;
Ok(SupernicLockdownKey { key, ikm_version })
}

#[cfg(test)]
Expand Down Expand Up @@ -439,7 +461,8 @@ mod tests {
.await
.unwrap();

let secret = fetch_kdf_secret(&store).await.unwrap();
let (version, secret) = fetch_kdf_secret(&store).await.unwrap();
assert_eq!(version, CURRENT_LOCKDOWN_IKM_VERSION);
assert_eq!(secret, "ikm-pass");
}

Expand Down
7 changes: 7 additions & 0 deletions crates/api-core/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,13 @@ impl From<DatabaseError> for CarbideError {
DatabaseError::InvalidArgument(e) => InvalidArgument(e),
DatabaseError::InvalidConfiguration(e) => InvalidConfiguration(e),
DatabaseError::MissingArgument(e) => MissingArgument(e),
// A corrupted/absent site-wide rotation invariant is an internal
// state error, not a client-correctable one.
DatabaseError::MissingSitewideRotationTarget(credential_type) => Internal {
message: format!(
"no site-wide rotation target for credential type: {credential_type:?}"
),
},
DatabaseError::NetworkParseError(e) => NetworkParseError(e),
DatabaseError::NetworkSegmentDelete(e) => NetworkSegmentDelete(e),
DatabaseError::NetworkSegmentDuplicateMacAddress(e) => {
Expand Down
15 changes: 15 additions & 0 deletions crates/api-core/src/handlers/credential.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,21 @@ pub(crate) async fn delete_bmc_root_credentials_by_mac(
CarbideError::internal(format!("Error deleting credential for BMC: {e:?} "))
})?;

// Drop the bmc convergence marker alongside the Vault secret it depends on:
// once NICo discards the per-device BMC secret it can no longer authenticate
// or rotate the device, so tracking convergence is meaningless. (The rotation
// engine also joins device_credential_rotation to the live device tables, so
// a row orphaned by device deletion is never acted on -- this just keeps the
// table tidy at the chokepoint where the secret actually goes away.)
let mut txn = api.txn_begin().await?;
db::credential_rotation::delete_device_converged(
&mut txn,
bmc_mac_address,
db::credential_rotation::CredentialRotationType::Bmc,
)
.await?;
txn.commit().await?;

api.bmc_session_manager.flush_mac(bmc_mac_address).await;
Comment thread
coderabbitai[bot] marked this conversation as resolved.

Ok(())
Expand Down
40 changes: 35 additions & 5 deletions crates/api-core/src/handlers/dpa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ async fn build_unlock_command(
machine_id: MachineId,
pci_name: &str,
) -> CarbideResult<DpaCommand<'static>> {
let key = crate::dpa::lockdown::build_supernic_lockdown_key(
let lockdown = crate::dpa::lockdown::build_supernic_lockdown_key(
&api.database_connection,
sn.id,
&*api.credential_manager,
Expand All @@ -271,8 +271,10 @@ async fn build_unlock_command(

tracing::info!(%machine_id, %pci_name, "Unlocking DPA");

// The unlock flow does not record convergence, so the derived IKM version is
// not persisted here.
Ok(DpaCommand {
op: OpCode::Unlock { key },
op: OpCode::Unlock { key: lockdown.key },
})
}

Expand Down Expand Up @@ -414,7 +416,7 @@ async fn build_lock_command(
machine_id: MachineId,
pci_name: &str,
) -> CarbideResult<DpaCommand<'static>> {
let key = crate::dpa::lockdown::build_supernic_lockdown_key(
let lockdown = crate::dpa::lockdown::build_supernic_lockdown_key(
&api.database_connection,
sn.id,
&*api.credential_manager,
Expand All @@ -426,9 +428,37 @@ async fn build_lock_command(
))
})?;

tracing::info!(%machine_id, %pci_name, "Locking DPA");
// Stage the IKM version we are about to lock the card with as the in-flight
// rotation marker (`rotating_to_version`) on the card's lockdown_ikm row
// *before* issuing the lock command. dpa-manager's `handle_locking` promotes
// exactly this value to the convergence version when the card reports Locked
// -- never the (possibly advanced) site-wide target re-read at observation
// time. Staging first means we only ever issue a lock for a version we have
// already recorded our intent to use; if the write fails we surface the error
// and do not lock. The writer is idempotent across the per-cycle
// re-derivation while Locking.
let ikm_version = i32::try_from(lockdown.ikm_version).map_err(|e| CarbideError::Internal {
message: format!(
"lockdown IKM version {} does not fit in i32 for DPA {pci_name}: {e}",
lockdown.ikm_version
),
})?;
let mut conn = api.database_connection.acquire().await.map_err(|e| {
CarbideError::GenericErrorFromReport(eyre!(
"failed to acquire connection to stage lockdown IKM rotation for DPA {pci_name}: {e}"
))
})?;
db::credential_rotation::mark_device_rotating_to_version(
&mut conn,
sn.mac_address,
db::credential_rotation::CredentialRotationType::LockdownIkm,
ikm_version,
)
.await?;

tracing::info!(%machine_id, %pci_name, ikm_version = lockdown.ikm_version, "Locking DPA");
Ok(DpaCommand {
op: OpCode::Lock { key },
op: OpCode::Lock { key: lockdown.key },
})
}

Expand Down
38 changes: 36 additions & 2 deletions crates/api-core/src/handlers/machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,12 +516,28 @@ pub(crate) async fn admin_force_delete_machine(
}

if machine.bios_password_set_time.is_some() {
if let Err(e) = api
match api
.redfish_pool
.clear_host_uefi_password(client.as_ref())
.await
{
tracing::warn!(%machine_id, error = %e, "Failed to clear host UEFI password while force deleting machine");
Ok(_) => {
// The UEFI password was reset on the device, so the host no
// longer carries the site-wide UEFI value: drop the host_uefi
// convergence marker (keyed by the host BMC MAC, mirroring where
// it is recorded when the password is set). Best-effort like the
// clear itself -- the machine row is being deleted anyway, so a
// surviving marker would be neutralized by the rotation engine's
// live-device join regardless.
if let Err(e) =
forget_host_uefi_convergence(api, bmc_mac_address).await
{
tracing::warn!(%machine_id, error = %e, "Cleared host UEFI password but failed to delete its credential-rotation marker");
}
}
Err(e) => {
tracing::warn!(%machine_id, error = %e, "Failed to clear host UEFI password while force deleting machine");
}
}

// TODO (spyda): have libredfish return whether the client needs to reboot the host after clearing the host uefi password
Expand Down Expand Up @@ -767,6 +783,24 @@ async fn clear_bmc_credentials(api: &Api, machine: &Machine) -> Result<(), Carbi
Ok(())
}

/// Removes a host's `host_uefi` credential-rotation convergence marker.
///
/// The marker is looked up by the host's BMC MAC (`bmc_mac_address`). The
/// delete runs in its own transaction, which is committed before returning.
/// Used by force-delete once the host UEFI password has been reset on the
/// device, at which point the host no longer carries the site-wide UEFI value.
///
/// Returns an error if the transaction cannot be opened, the delete fails, or
/// the commit fails.
async fn forget_host_uefi_convergence(
    api: &Api,
    bmc_mac_address: mac_address::MacAddress,
) -> Result<(), CarbideError> {
    // Which kind of rotation bookkeeping is being dropped for this device.
    let marker_kind = db::credential_rotation::CredentialRotationType::HostUefi;

    let mut transaction = api.txn_begin().await?;
    db::credential_rotation::delete_device_converged(&mut transaction, bmc_mac_address, marker_kind)
        .await?;
    transaction.commit().await?;

    Ok(())
}

pub async fn get_machine_position_info(
api: &Api,
request: Request<rpc::MachinePositionQuery>,
Expand Down
4 changes: 2 additions & 2 deletions crates/api-core/src/handlers/mlx_admin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,7 @@ async fn get_device_lockdown_key(
}
})?;

let lockdown_key = crate::dpa::lockdown::build_supernic_lockdown_key(
let lockdown = crate::dpa::lockdown::build_supernic_lockdown_key(
&api.database_connection,
dpa_interface.id,
&*api.credential_manager,
Expand All @@ -1328,5 +1328,5 @@ async fn get_device_lockdown_key(
),
})?;

Ok(lockdown_key)
Ok(lockdown.key)
}
50 changes: 42 additions & 8 deletions crates/api-core/src/handlers/uefi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,16 @@ pub(crate) async fn set_host_uefi_password(
CarbideError::InvalidArgument("Specified machine does not have BMC address".into())
})?;

// A known BMC MAC is a hard precondition for setting the UEFI password: it
// keys the host_uefi rotation bookkeeping recorded below, so reject the
// request up front rather than driving the device and only then discovering
// we cannot track its convergence.
let host_bmc_mac = snapshot.host_snapshot.bmc_info.mac.ok_or_else(|| {
CarbideError::InvalidArgument(
"Specified machine does not have a known BMC MAC address".into(),
)
})?;

let bmc_access_info =
db::machine_interface::lookup_bmc_access_info(&mut txn, addr.ip(), Some(addr.port()))
.await?;
Expand Down Expand Up @@ -222,14 +232,38 @@ pub(crate) async fn set_host_uefi_password(
tracing::error!(%e, "Failed to run uefi_setup call");
CarbideError::internal(format!("Failed redfish uefi_setup subtask: {e}"))
})?;
api.with_txn(|txn| db::machine::update_bios_password_set_time(&machine_id, txn).boxed())
.await?
.map_err(|e| {
tracing::error!("Failed to update bios_password_set_time: {}", e);
CarbideError::Internal {
message: format!("Failed to update BIOS password timestamp: {e}"),
}
})?;
// uefi_setup returns a BMC job_id; the password change completes
// asynchronously on the device and we do not poll it here. We optimistically
// stamp bios_password_set_time and, in the same transaction, record host_uefi
// convergence (keyed by the host BMC MAC, mirroring the backfill) so the two
// always agree -- convergence rides along with the pre-existing marker. If
// the dispatched job ultimately fails on the BMC, both are inaccurate.
//
// TODO(credential-rotation): gate both the bios_password_set_time stamp and
// the host_uefi convergence record on confirmed job_id completion (poll the
// BMC job rather than trusting dispatch). Whatever confirms completion should
// perform both updates together -- convergence does not need its own separate
// write path or operator-facing API; it follows bios_password_set_time.
api.with_txn(|txn| {
async move {
db::machine::update_bios_password_set_time(&machine_id, txn).await?;
db::credential_rotation::record_device_converged(
txn,
host_bmc_mac,
db::credential_rotation::CredentialRotationType::HostUefi,
)
.await?;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Ok::<(), db::DatabaseError>(())
}
.boxed()
})
.await?
.map_err(|e| {
tracing::error!("Failed to update bios_password_set_time: {}", e);
CarbideError::Internal {
message: format!("Failed to update BIOS password timestamp: {e}"),
}
})?;

Ok(Response::new(rpc::SetHostUefiPasswordResponse { job_id }))
}
5 changes: 5 additions & 0 deletions crates/api-core/src/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ pub async fn start_api(
// lockdown key without operator action. No-op once seeded or if the BMC
// root is not yet configured.
crate::dpa::lockdown::ensure_lockdown_ikm_seeded(&*credential_manager).await?;

// Initial credential-rotation bookkeeping is backfilled by the
// `*_credential_rotation_backfill` data migration (see its header for the
// ordering invariants), not seeded here.
};

let common_pools =
Expand Down Expand Up @@ -471,6 +475,7 @@ pub async fn start_api(
.rotate_switch_nvos_credentials
.clone(),
carbide_config.site_explorer.explore_mode,
db_pool.clone(),
);

let nvlink_config = carbide_config.nvlink_config.clone().unwrap_or_default();
Expand Down
1 change: 1 addition & 0 deletions crates/api-core/src/test_support/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ impl TestApiBuilder {
Arc::new(std::sync::atomic::AtomicBool::new(false)),
// Tests use MockEndpointExplorer. So this doesn't affect anything.
SiteExplorerExploreMode::NvRedfish,
self.db_pool.clone(),
);

let metric_emitter = self.metric_emitter.unwrap_or_else(|| {
Expand Down
Loading
Loading