Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions crates/agent/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,3 +599,14 @@ fn fill_fake_dpu_info(hardware_info: &mut DiscoveryInfo) {
switches: vec![],
});
}

/// Returns `Some(s)` when `s` is non-empty, otherwise `None`.
///
/// Used to avoid repetition when dealing with gRPC-sourced fields that use an
/// empty string to indicate an absent value.
///
/// `S` may be any string-like type, including unsized ones, so both
/// `get_non_empty_str(&owned_string)` and `get_non_empty_str("literal")` work.
/// The returned slice borrows from `s`. Only the exactly-empty string is
/// treated as absent; whitespace-only strings are returned unchanged.
pub fn get_non_empty_str<S>(s: &S) -> Option<&str>
where
    // `?Sized` lets callers pass a `&str` directly (S = str), not only `&String`.
    S: AsRef<str> + ?Sized,
{
    let s: &str = s.as_ref();
    (!s.is_empty()).then_some(s)
}
50 changes: 46 additions & 4 deletions crates/agent/src/main_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ use crate::network_monitor::{self, NetworkPingerType};
use crate::util::get_host_boot_timestamp;
use crate::{
FMDS_MINIMUM_HBN_VERSION, HBNDeviceNames, NVUE_MINIMUM_HBN_VERSION, RunOptions, command_line,
ethernet_virtualization, extension_services, hbn, health, instance_metadata_endpoint, lldp,
machine_inventory_updater, managed_files, mtu, netlink, nvue, periodic_config_fetcher,
pretty_cmd, sysfs, upgrade,
ethernet_virtualization, extension_services, get_non_empty_str, hbn, health,
instance_metadata_endpoint, lldp, machine_inventory_updater, managed_files, mtu, netlink, nvue,
periodic_config_fetcher, pretty_cmd, sysfs, upgrade,
};

// Main loop when running in daemon mode
Expand Down Expand Up @@ -379,6 +379,7 @@ pub async fn setup_and_run(
extension_service_manager,
nvue_context,
dhcp_interface_translation_mode,
current_network_version: CurrentNetworkVersion::default(),
};

main_loop.run().await
Expand Down Expand Up @@ -412,13 +413,46 @@ struct MainLoop {
extension_service_manager: extension_services::ExtensionServiceManager,
nvue_context: Option<NvueClientContext>,
dhcp_interface_translation_mode: Option<InterfaceTranslationMode>,
current_network_version: CurrentNetworkVersion,
}

/// Outcome of a single main-loop iteration.
///
/// NOTE(review): the code that builds and consumes this struct is not visible
/// in this excerpt; the field notes below are inferred from the names only and
/// should be confirmed against `MainLoop`.
struct IterationResult {
// Presumably signals that the agent should stop after this iteration — TODO confirm.
stop_agent: bool,
// Presumably the delay to wait before the next iteration — TODO confirm.
loop_period: std::time::Duration,
}

/// `CurrentNetworkVersion` tracks the versions we last successfully applied,
/// mostly so we can avoid hitting the HBN update methods more frequently than
/// needed.
///
/// Both fields start out as `None` (via the derived `Default`) and are
/// overwritten by `update_from`, which stores `None` whenever the
/// corresponding field of the response is an empty string.
#[derive(Debug, Default)]
struct CurrentNetworkVersion {
/// Last recorded `managed_host_config_version`; `None` if nothing has been
/// recorded yet or the recorded value was empty.
managed_host_config_version: Option<String>,
/// Last recorded `instance_network_config_version`; `None` if nothing has
/// been recorded yet or the recorded value was empty.
instance_network_config_version: Option<String>,
}

impl CurrentNetworkVersion {
/// Reports whether the versions carried by `conf` are the same as the ones
/// recorded in this tracker.
///
/// Empty version strings in the response are treated as absent (`None`)
/// before comparing, so a field recorded as `None` matches an empty field in
/// the response. In particular, a freshly defaulted tracker matches a
/// response whose two version fields are both empty.
pub fn matches_versions_from(
    &self,
    conf: impl AsRef<ManagedHostNetworkConfigResponse>,
) -> bool {
    let response = conf.as_ref();

    // Versions recorded by the last `update_from` call (if any).
    let recorded = (
        self.managed_host_config_version.as_deref(),
        self.instance_network_config_version.as_deref(),
    );
    // Versions advertised by the response, with "" normalised to `None`.
    let advertised = (
        get_non_empty_str(&response.managed_host_config_version),
        get_non_empty_str(&response.instance_network_config_version),
    );

    recorded == advertised
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/// Records the versions carried by `conf` as the current ones.
///
/// Each version is stored as an owned `String`; an empty string in the
/// response is stored as `None`.
pub fn update_from(&mut self, conf: impl AsRef<ManagedHostNetworkConfigResponse>) {
    let response = conf.as_ref();

    let managed_host = get_non_empty_str(&response.managed_host_config_version);
    let instance_network = get_non_empty_str(&response.instance_network_config_version);

    self.managed_host_config_version = managed_host.map(str::to_owned);
    self.instance_network_config_version = instance_network.map(str::to_owned);
}
}

/// Returns the last DHCP request timestamps for all known host interfaces.
///
/// When `dhcp_grpc_server` is `Some`, fetches timestamps from the dhcp-server
Expand Down Expand Up @@ -622,7 +656,14 @@ impl MainLoop {
)
.await;

let update_result = {
let update_result = if self.current_network_version.matches_versions_from(&conf)
{
tracing::debug!(
"No configuration change, skipping HBN updates: {:?}",
&self.current_network_version
);
Ok(false)
} else {
if self.options.agent_platform_type.is_dpu_os()
&& hbn_version >= self.fmds_minimum_hbn_version
{
Expand Down Expand Up @@ -712,6 +753,7 @@ impl MainLoop {
};
match joined_result {
Ok(has_changed) => {
self.current_network_version.update_from(&conf);
has_changed_configs = has_changed;
if self.options.agent_platform_type.is_dpu_os()
&& let Err(err) = mtu::ensure().await
Expand Down
11 changes: 11 additions & 0 deletions crates/agent/src/nvue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,13 @@ async fn run_apply(hbn_root: &Path, path: &Path) -> eyre::Result<bool> {
// Compare pending to applied config at NVUE layer.
// This avoids no-op apply cycles when textual YAML ordering changes but
// semantic config does not.
//
// BUG: It seems like under some conditions (which I don't really
// understand) this diff can return some output, but then the subsequent
// `nv config apply` will warn about "config apply executed with no config
// diff". This can result in needless PostConfigCheckWait health alerts, so
// we're working around this by avoiding even calling this code if we see
// unchanged config versions in the main loop. -drew
let stdout =
super::hbn::run_in_container(&container_id, &["nv", "config", "diff"], true).await?;
if stdout.is_empty() {
Expand All @@ -1018,6 +1025,7 @@ async fn run_apply(hbn_root: &Path, path: &Path) -> eyre::Result<bool> {
}
return Ok(false);
}
let config_diff_stdout = stdout;

// Apply the pending config.
//
Expand All @@ -1034,6 +1042,9 @@ async fn run_apply(hbn_root: &Path, path: &Path) -> eyre::Result<bool> {
super::hbn::run_in_container(&container_id, &["nv", "config", "apply", "-y"], true).await?;
if !stdout.is_empty() {
tracing::info!("nv config apply: {stdout}");
// We're logging this just to see what was in there, in case it can help
// explain the "config apply executed with no config diff" message.
tracing::info!("nv config diff: {config_diff_stdout}");
}

// Restart nl2doca
Expand Down
Loading