diff --git a/crates/mesh/src/collector.rs b/crates/mesh/src/collector.rs
deleted file mode 100644
index 1da4b208c..000000000
--- a/crates/mesh/src/collector.rs
+++ /dev/null
@@ -1,1118 +0,0 @@
-//! Incremental update collection and batching
-//!
-//! Collects local state changes and batches them for efficient transmission.
-//! `CentralCollector` runs once per gossip round, `PeerWatermark` filters
-//! the batch per peer and tracks per-key send watermarks.
-
-use std::{
-    collections::HashMap,
-    sync::{atomic::Ordering, Arc},
-    time::{SystemTime, UNIX_EPOCH},
-};
-
-use parking_lot::RwLock;
-use tracing::debug;
-
-use super::{
-    service::gossip::StateUpdate,
-    stores::{AppState, MembershipState, PolicyState, StateStores, StoreType, WorkerState},
-    tree_ops::{lz4_compress, TenantDelta, TreeState},
-};
-
-/// Trait for extracting version from state types
-trait Versioned {
-    fn version(&self) -> u64;
-}
-
-impl Versioned for WorkerState {
-    fn version(&self) -> u64 {
-        self.version
-    }
-}
-
-impl Versioned for PolicyState {
-    fn version(&self) -> u64 {
-        self.version
-    }
-}
-
-impl Versioned for AppState {
-    fn version(&self) -> u64 {
-        self.version
-    }
-}
-
-impl Versioned for MembershipState {
-    fn version(&self) -> u64 {
-        self.version
-    }
-}
-
-/// Tracks the last sent version for each key in each store
-#[derive(Debug, Clone, Default)]
-struct LastSentVersions {
-    worker: HashMap<String, u64>,
-    policy: HashMap<String, u64>,
-    app: HashMap<String, u64>,
-    membership: HashMap<String, u64>,
-    rate_limit: HashMap<String, u64>, // Track last sent timestamp for rate limit counter shards
-}
-
-/// Tracks store generation to skip unchanged stores
-#[derive(Debug, Clone, Copy, Default)]
-struct LastScannedGenerations {
-    worker: u64,
-    policy: u64,
-    app: u64,
-    membership: u64,
-    /// Separate generation for tree state changes (bumped by
-    /// `sync_tree_operation` via atomic counter instead of CRDT).
-    tree: u64,
-}
-
-/// How often to send a full tree structure snapshot for convergence,
-/// measured in gossip rounds. At the default gossip interval of ~1s,
-/// this means a full snapshot every ~30 seconds per model.
-///
-/// Tenant deltas are sent every round (~20KB/s); full snapshots are
-/// heavier (~300KB compressed) but ensure convergence after missed
-/// deltas, new nodes joining, or network partitions.
-// FIXME: Re-enable when Layer 2 (chunked snapshots) is implemented.
-#[expect(dead_code, reason = "Reserved for Layer 2 snapshot interval")]
-const STRUCTURE_SNAPSHOT_INTERVAL: u64 = 30;
-
-/// Maximum LZ4-compressed size for a single tree snapshot. Snapshots larger
-/// than this are skipped to prevent the infinite retry loop where an oversized
-/// snapshot is serialized, rejected by the gRPC size limit, and re-tried every
-/// round — ~23 MB/s of allocator churn that the OS never reclaims.
-const MAX_SNAPSHOT_BYTES: usize = 8 * 1024 * 1024;
-
-/// Get current timestamp in nanoseconds.
-#[expect(
-    clippy::expect_used,
-    reason = "system clock before UNIX epoch is a fatal misconfiguration that must not silently produce timestamp=0"
-)]
-pub(crate) fn current_timestamp() -> u64 {
-    SystemTime::now()
-        .duration_since(UNIX_EPOCH)
-        .expect("system clock before UNIX_EPOCH; cannot generate valid timestamps")
-        .as_nanos() as u64
-}
-
-/// Build the per-actor last-sent key for rate limit shards.
-pub(crate) fn rate_limit_last_sent_key(key: &str, actor: &str) -> String {
-    format!("{key}::actor:{actor}")
-}
-
-// ============================================================================
-// Central Tenant Delta Drain (v2 bug fix)
-// ============================================================================
-
-/// Tenant delta updates drained once per gossip round. Used by
-/// `CentralCollector` to drain the shared DashMap exactly once per round
-/// (v1 bug fix where destructive drain races left later peers empty-handed).
-#[derive(Debug, Clone, Default)]
-pub struct DrainedTenantDeltas {
-    /// Tenant delta StateUpdates collected from the destructive drain.
-    /// These are Policy-type updates with key "tree:{model_id}".
-    pub updates: Vec<StateUpdate>,
-    /// Set of tree keys emitted as deltas. Used to skip these keys in
-    /// Phase 1 (tree_configs full-state scan) so the same model isn't sent twice.
-    pub emitted_tree_keys: std::collections::HashSet<String>,
-}
-
-/// Drains tenant delta buffers exactly once per gossip round. The result is
-/// stored in a shared location so all per-peer collectors can include the
-/// same deltas without racing on the destructive DashMap remove.
-pub fn drain_tenant_deltas_central(stores: &StateStores, self_name: &str) -> DrainedTenantDeltas {
-    let timestamp = current_timestamp();
-    let mut updates = Vec::new();
-    let mut emitted_tree_keys = std::collections::HashSet::new();
-
-    let models_with_inserts: Vec<String> = stores
-        .tenant_delta_inserts
-        .iter()
-        .filter(|entry| !entry.value().is_empty())
-        .map(|entry| entry.key().clone())
-        .collect();
-    let models_with_evictions: Vec<String> = stores
-        .tenant_delta_evictions
-        .iter()
-        .filter(|entry| !entry.value().is_empty())
-        .map(|entry| entry.key().clone())
-        .collect();
-
-    let all_models: std::collections::HashSet<String> = models_with_inserts
-        .into_iter()
-        .chain(models_with_evictions)
-        .collect();
-
-    for model_id in all_models {
-        let key = format!("tree:{model_id}");
-        let current_version = stores.tree_version(&key);
-
-        let inserts = stores
-            .tenant_delta_inserts
-            .remove(&model_id)
-            .map(|(_, v)| v)
-            .unwrap_or_default();
-        let evictions = stores
-            .tenant_delta_evictions
-            .remove(&model_id)
-            .map(|(_, v)| v)
-            .unwrap_or_default();
-
-        if inserts.is_empty() && evictions.is_empty() {
-            continue;
-        }
-
-        let delta = TenantDelta {
-            model_id: model_id.clone(),
-            version: current_version,
-            inserts,
-            evictions,
-        };
-
-        if let Ok(delta_bytes) = delta.to_bytes() {
-            let delta_policy = PolicyState {
-                model_id: model_id.clone(),
-                policy_type: "tenant_delta".to_string(),
-                config: delta_bytes,
-                version: current_version,
-            };
-            if let Ok(serialized) = bincode::serialize(&delta_policy) {
-                updates.push(StateUpdate {
-                    key: key.clone(),
-                    value: serialized,
-                    version: current_version,
-                    actor: self_name.to_string(),
-                    timestamp,
-                });
-                debug!(
-                    "Central drain: tenant delta {} ({} inserts, {} evictions, version: {})",
-                    model_id,
-                    delta.inserts.len(),
-                    delta.evictions.len(),
-                    current_version,
-                );
-                emitted_tree_keys.insert(key);
-            }
-        }
-    }
-
-    if !updates.is_empty() {
-        let total_bytes: usize = updates.iter().map(|u| u.value.len()).sum();
-        debug!(
-            "Central drain: {} tenant delta updates ({} bytes total)",
-            updates.len(),
-            total_bytes,
-        );
-    }
-
-    DrainedTenantDeltas {
-        updates,
-        emitted_tree_keys,
-    }
-}
-
-// ============================================================================
-// CentralCollector + PeerWatermark (v2 architecture)
-// ============================================================================
-
-/// A round batch produced by the central collector. Contains ALL updates from
-/// this round, organized by store type. Per-peer watermark filtering happens
-/// at send time via `PeerWatermark::filter()`.
-#[derive(Debug, Clone, Default)]
-pub struct RoundBatch {
-    pub updates: Vec<(StoreType, Vec<StateUpdate>)>,
-}
-
-/// Central collector that runs once per gossip round. Produces a `RoundBatch`
-/// containing all changed entries across all stores. Destructive operations
-/// (tenant delta drain) happen here exactly once. Per-peer watermark filtering
-/// is NOT done here — that's `PeerWatermark`'s job.
-pub struct CentralCollector {
-    stores: Arc<StateStores>,
-    self_name: String,
-    /// Generation tracking to skip unchanged stores between rounds.
-    last_scanned: RwLock<LastScannedGenerations>,
-    /// Generations observed during the most recent `collect()`. Used by
-    /// `advance_generations()` to avoid a TOCTOU race where a concurrent
-    /// write between collect and advance would cause the new entries to
-    /// be skipped on the next round.
-    collected_generations: RwLock<LastScannedGenerations>,
-}
-
-impl CentralCollector {
-    pub fn new(stores: Arc<StateStores>, self_name: String) -> Self {
-        Self {
-            stores,
-            self_name,
-            last_scanned: RwLock::new(LastScannedGenerations::default()),
-            collected_generations: RwLock::new(LastScannedGenerations::default()),
-        }
-    }
-
-    /// Collect all changes for this round. Called exactly once per gossip round
-    /// by the event loop. Returns a `RoundBatch` that per-peer watermarks filter.
-    pub fn collect(&self) -> RoundBatch {
-        let mut all_updates = Vec::new();
-
-        // Snapshot all generations UP FRONT, before any reads. This locks in
-        // the values we'll use for skip checks AND for advance_generations,
-        // so concurrent writes between here and advance() aren't silently
-        // skipped on the next round.
-        let snapshot = LastScannedGenerations {
-            worker: self.stores.worker.generation(),
-            policy: self.stores.policy.generation(),
-            app: self.stores.app.generation(),
-            membership: self.stores.membership.generation(),
-            tree: self.stores.tree_generation.load(Ordering::Acquire),
-        };
-        *self.collected_generations.write() = snapshot;
-
-        for store_type in [
-            StoreType::Worker,
-            StoreType::Policy,
-            StoreType::App,
-            StoreType::Membership,
-            StoreType::RateLimit,
-        ] {
-            let updates = self.collect_store(store_type, &snapshot);
-            if !updates.is_empty() {
-                all_updates.push((store_type, updates));
-            }
-        }
-
-        RoundBatch {
-            updates: all_updates,
-        }
-    }
-
-    /// Record the generations observed during the last `collect()` so the next
-    /// round can skip unchanged stores. Uses the captured snapshot (not a
-    /// re-read) to avoid a TOCTOU race.
-    pub fn advance_generations(&self) {
-        let collected = *self.collected_generations.read();
-        *self.last_scanned.write() = collected;
-    }
-
-    /// Collect all entries for a store type. No watermark filtering — includes
-    /// ALL current entries from stores that changed since last round. Uses
-    /// the pre-captured `snapshot` for skip checks so the values are consistent
-    /// with what `advance_generations` will record.
-    fn collect_store(
-        &self,
-        store_type: StoreType,
-        snapshot: &LastScannedGenerations,
-    ) -> Vec<StateUpdate> {
-        let last_scanned = self.last_scanned.read();
-        let timestamp = current_timestamp();
-
-        match store_type {
-            StoreType::Worker => {
-                if snapshot.worker == last_scanned.worker {
-                    return vec![];
-                }
-                self.collect_serializable_store(
-                    self.stores.worker.all(),
-                    "worker",
-                    timestamp,
-                    |s: &WorkerState| s.worker_id.clone(),
-                )
-            }
-            StoreType::Policy => {
-                let policy_changed = snapshot.policy != last_scanned.policy;
-                let tree_changed = snapshot.tree != last_scanned.tree;
-                if !policy_changed && !tree_changed {
-                    return vec![];
-                }
-                self.collect_policy_store(timestamp, policy_changed, tree_changed)
-            }
-            StoreType::App => {
-                if snapshot.app == last_scanned.app {
-                    return vec![];
-                }
-                self.collect_serializable_store(
-                    self.stores.app.all(),
-                    "app",
-                    timestamp,
-                    |s: &AppState| s.key.clone(),
-                )
-            }
-            StoreType::Membership => {
-                if snapshot.membership == last_scanned.membership {
-                    return vec![];
-                }
-                self.collect_serializable_store(
-                    self.stores.membership.all(),
-                    "membership",
-                    timestamp,
-                    |s: &MembershipState| s.name.clone(),
-                )
-            }
-            StoreType::RateLimit => {
-                // Reuse outer `timestamp` — the RateLimit branch previously
-                // called current_timestamp() again for no meaningful difference.
-                let mut updates = Vec::new();
-                for (key, actor, counter_value) in self.stores.rate_limit.all_shards() {
-                    if !self.stores.rate_limit.is_owner(&key) {
-                        continue;
-                    }
-                    if let Ok(serialized) = bincode::serialize(&counter_value) {
-                        updates.push(StateUpdate {
-                            key,
-                            value: serialized,
-                            version: timestamp,
-                            actor,
-                            timestamp,
-                        });
-                    }
-                }
-                updates
-            }
-        }
-    }
-
-    /// Collect all entries from a serializable store. No watermark filtering.
-    fn collect_serializable_store<S>(
-        &self,
-        all_items: std::collections::BTreeMap<String, S>,
-        store_name: &str,
-        timestamp: u64,
-        get_id: impl Fn(&S) -> String,
-    ) -> Vec<StateUpdate>
-    where
-        S: serde::Serialize + Versioned,
-    {
-        let mut updates = Vec::new();
-        for (key, state) in all_items {
-            if let Ok(serialized) = bincode::serialize(&state) {
-                debug!(
-                    "Central collect {} update: {} (version: {})",
-                    store_name,
-                    get_id(&state),
-                    state.version(),
-                );
-                updates.push(StateUpdate {
-                    key,
-                    value: serialized,
-                    version: state.version(),
-                    actor: self.self_name.clone(),
-                    timestamp,
-                });
-            }
-        }
-        updates
-    }
-
-    /// Collect policy store entries + tenant deltas + tree_configs.
-    /// Tenant deltas are destructively drained (safe because this runs once).
-    /// `policy_changed` gates the non-tree policy scan; `tree_changed` gates
-    /// the tenant delta drain + tree_configs scan. A tenant-delta-only round
-    /// skips the full policy.all() sweep, which can be expensive.
-    fn collect_policy_store(
-        &self,
-        timestamp: u64,
-        policy_changed: bool,
-        tree_changed: bool,
-    ) -> Vec<StateUpdate> {
-        let mut updates = Vec::new();
-        let mut emitted_tree_keys = std::collections::HashSet::new();
-
-        // Non-tree policy entries — only scan when the policy CRDT generation
-        // has changed since last round. Gating avoids O(policy_count) work
-        // on every tenant-delta round.
-        if policy_changed {
-            let all_policies = self.stores.policy.all();
-            for (key, state) in &all_policies {
-                if key.starts_with("tree:") {
-                    continue;
-                }
-                if let Ok(serialized) = bincode::serialize(state) {
-                    updates.push(StateUpdate {
-                        key: key.clone(),
-                        value: serialized,
-                        version: state.version(),
-                        actor: self.self_name.clone(),
-                        timestamp,
-                    });
-                }
-            }
-        }
-
-        if !tree_changed {
-            return updates;
-        }
-
-        // Phase 0: Drain tenant deltas (destructive, runs once)
-        let drained = drain_tenant_deltas_central(&self.stores, &self.self_name);
-        updates.extend(drained.updates);
-        emitted_tree_keys.extend(drained.emitted_tree_keys);
-
-        // Phase 1: tree_configs scan for keys not emitted as deltas.
-        //
-        // Collect (key, value) snapshots FIRST so DashMap shard locks are
-        // released before the slow per-entry work (TreeSnapshot/TreeState
-        // parsing + lz4_compress). Holding shard locks across those would
-        // block concurrent writers to tree_configs and risk deadlock on any
-        // nested map operation.
-        let tree_entries: Vec<(String, Vec<u8>)> = self
-            .stores
-            .tree_configs
-            .iter()
-            .map(|e| (e.key().clone(), e.value().clone()))
-            .collect();
-
-        for (key, config_bytes) in tree_entries {
-            if emitted_tree_keys.contains(key.as_str()) {
-                continue;
-            }
-            if config_bytes.is_empty() {
-                continue;
-            }
-            let model_id = key.strip_prefix("tree:").unwrap_or(&key).to_string();
-            let current_version = self.stores.tree_version(&key);
-            let tree_version = if let Ok(ts) = TreeState::from_bytes(&config_bytes) {
-                ts.version
-            } else if kv_index::snapshot::TreeSnapshot::from_bytes(&config_bytes).is_ok() {
-                current_version
-            } else {
-                continue;
-            };
-            let compressed = lz4_compress(&config_bytes);
-            if compressed.len() > MAX_SNAPSHOT_BYTES {
-                debug!(
-                    key = %key,
-                    compressed_bytes = compressed.len(),
-                    "Skipping oversized tree snapshot"
-                );
-                continue;
-            }
-            let full_state = PolicyState {
-                model_id,
-                policy_type: "tree_state_lz4".to_string(),
-                config: compressed,
-                version: tree_version,
-            };
-            if let Ok(serialized) = bincode::serialize(&full_state) {
-                updates.push(StateUpdate {
-                    key,
-                    value: serialized,
-                    version: current_version,
-                    actor: self.self_name.clone(),
-                    timestamp,
-                });
-            }
-        }
-
-        updates
-    }
-}
-
-/// Per-peer watermark tracker. Filters a centrally collected `RoundBatch` to
-/// include only entries this peer hasn't seen yet, and tracks what was sent.
-#[derive(Debug)]
-pub struct PeerWatermark {
-    /// Peer name, used for Debug output.
-    _peer_name: String,
-    last_sent: LastSentVersions,
-}
-
-impl PeerWatermark {
-    pub fn new(peer_name: String) -> Self {
-        Self {
-            _peer_name: peer_name,
-            last_sent: LastSentVersions::default(),
-        }
-    }
-
-    /// Filter a round batch to include only entries this peer hasn't received.
-    /// Returns updates organized by store type, ready to send.
-    pub fn filter(&self, batch: &RoundBatch) -> Vec<(StoreType, Vec<StateUpdate>)> {
-        let mut filtered = Vec::new();
-
-        for (store_type, updates) in &batch.updates {
-            let peer_updates: Vec<StateUpdate> = updates
-                .iter()
-                .filter(|u| self.should_send(*store_type, u))
-                .cloned()
-                .collect();
-            if !peer_updates.is_empty() {
-                filtered.push((*store_type, peer_updates));
-            }
-        }
-
-        filtered
-    }
-
-    /// Mark updates as successfully sent to this peer. Advances watermark.
-    pub fn mark_sent(&mut self, store_type: StoreType, updates: &[StateUpdate]) {
-        for update in updates {
-            match store_type {
-                StoreType::Worker => {
-                    self.last_sent
-                        .worker
-                        .insert(update.key.clone(), update.version);
-                }
-                StoreType::Policy => {
-                    self.last_sent
-                        .policy
-                        .insert(update.key.clone(), update.version);
-                }
-                StoreType::App => {
-                    self.last_sent
-                        .app
-                        .insert(update.key.clone(), update.version);
-                }
-                StoreType::Membership => {
-                    self.last_sent
-                        .membership
-                        .insert(update.key.clone(), update.version);
-                }
-                StoreType::RateLimit => {
-                    let shard_key = rate_limit_last_sent_key(&update.key, &update.actor);
-                    self.last_sent.rate_limit.insert(shard_key, update.version);
-                }
-            }
-        }
-    }
-
-    fn should_send(&self, store_type: StoreType, update: &StateUpdate) -> bool {
-        let last_sent_version = match store_type {
-            StoreType::Worker => self.last_sent.worker.get(&update.key).copied().unwrap_or(0),
-            StoreType::Policy => self.last_sent.policy.get(&update.key).copied().unwrap_or(0),
-            StoreType::App => self.last_sent.app.get(&update.key).copied().unwrap_or(0),
-            StoreType::Membership => self
-                .last_sent
-                .membership
-                .get(&update.key)
-                .copied()
-                .unwrap_or(0),
-            StoreType::RateLimit => {
-                let shard_key = rate_limit_last_sent_key(&update.key, &update.actor);
-                self.last_sent
-                    .rate_limit
-                    .get(&shard_key)
-                    .copied()
-                    .unwrap_or(0)
-            }
-        };
-        update.version > last_sent_version
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::stores::{AppState, MembershipState, PolicyState, StateStores, WorkerState};
-
-    /// Test-only helper: collect updates for one store type via CentralCollector.
-    fn collect_store_updates(
-        stores: &Arc<StateStores>,
-        self_name: &str,
-        store_type: StoreType,
-    ) -> Vec<StateUpdate> {
-        let central = CentralCollector::new(stores.clone(), self_name.to_string());
-        let batch = central.collect();
-        batch
-            .updates
-            .into_iter()
-            .find(|(t, _)| *t == store_type)
-            .map(|(_, v)| v)
-            .unwrap_or_default()
-    }
-
-    fn make_worker(version: u64, health: bool, load: f64) -> WorkerState {
-        WorkerState {
-            worker_id: "worker1".to_string(),
-            model_id: "model1".to_string(),
-            url: "http://localhost:8000".to_string(),
-            health,
-            load,
-            version,
-            spec: vec![],
-        }
-    }
-
-    #[test]
-    fn test_collect_worker_updates() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(1, true, 0.5));
-
-        // CentralCollector returns ALL entries — no watermark filter inside collect().
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-        let batch = central.collect();
-        let updates: Vec<_> = batch
-            .updates
-            .iter()
-            .find(|(t, _)| *t == StoreType::Worker)
-            .map(|(_, v)| v.clone())
-            .unwrap_or_default();
-        assert_eq!(updates.len(), 1);
-        assert_eq!(updates[0].key, "worker1");
-        assert_eq!(updates[0].version, 1);
-        assert_eq!(updates[0].actor, "node1");
-
-        // PeerWatermark applies filtering + mark_sent.
-        let mut watermark = PeerWatermark::new("peer".to_string());
-        let filtered1 = watermark.filter(&batch);
-        assert_eq!(filtered1[0].1.len(), 1);
-        // Filter again before mark_sent — still visible.
-        let filtered2 = watermark.filter(&batch);
-        assert_eq!(filtered2[0].1.len(), 1);
-        // After mark_sent — filtered out.
-        watermark.mark_sent(StoreType::Worker, &filtered2[0].1);
-        let filtered_after = watermark.filter(&batch);
-        assert!(filtered_after.is_empty());
-
-        // Advance and write a new version — next batch should carry v2.
-        central.advance_generations();
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(2, false, 0.8));
-        let batch2 = central.collect();
-        let updates3: Vec<_> = batch2
-            .updates
-            .iter()
-            .find(|(t, _)| *t == StoreType::Worker)
-            .map(|(_, v)| v.clone())
-            .unwrap_or_default();
-        assert_eq!(updates3.len(), 1);
-        assert_eq!(updates3[0].version, 2);
-    }
-
-    #[test]
-    fn test_collect_policy_updates() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores.policy.insert(
-            "policy:model1".to_string(),
-            PolicyState {
-                model_id: "model1".to_string(),
-                policy_type: "cache_aware".to_string(),
-                config: b"config_data".to_vec(),
-                version: 1,
-            },
-        );
-
-        let updates = collect_store_updates(&stores, "node1", StoreType::Policy);
-        assert_eq!(updates.len(), 1);
-        assert_eq!(updates[0].key, "policy:model1");
-    }
-
-    #[test]
-    fn test_collect_app_updates() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores.app.insert(
-            "app_key1".to_string(),
-            AppState {
-                key: "app_key1".to_string(),
-                value: b"app_value".to_vec(),
-                version: 1,
-            },
-        );
-
-        let updates = collect_store_updates(&stores, "node1", StoreType::App);
-        assert_eq!(updates.len(), 1);
-        assert_eq!(updates[0].key, "app_key1");
-    }
-
-    #[test]
-    fn test_collect_membership_updates() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores.membership.insert(
-            "node2".to_string(),
-            MembershipState {
-                name: "node2".to_string(),
-                address: "127.0.0.1:8001".to_string(),
-                status: 1, // Alive
-                version: 1,
-                metadata: std::collections::BTreeMap::new(),
-            },
-        );
-
-        let updates = collect_store_updates(&stores, "node1", StoreType::Membership);
-        assert_eq!(updates.len(), 1);
-        assert_eq!(updates[0].key, "node2");
-    }
-
-    #[test]
-    fn test_collect_all_updates() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(1, true, 0.5));
-        let _ = stores.policy.insert(
-            "policy:model1".to_string(),
-            PolicyState {
-                model_id: "model1".to_string(),
-                policy_type: "cache_aware".to_string(),
-                config: vec![],
-                version: 1,
-            },
-        );
-
-        let central = CentralCollector::new(stores, "node1".to_string());
-        let batch = central.collect();
-        assert_eq!(batch.updates.len(), 2); // Worker and Policy
-    }
-
-    #[test]
-    fn test_mark_sent() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(1, true, 0.5));
-
-        let central = CentralCollector::new(stores, "node1".to_string());
-        let batch = central.collect();
-
-        let mut watermark = PeerWatermark::new("peer".to_string());
-        let filtered = watermark.filter(&batch);
-        assert_eq!(filtered[0].1.len(), 1);
-
-        // Mark as sent — subsequent filter on the same batch returns empty.
-        for (store_type, updates) in &filtered {
-            watermark.mark_sent(*store_type, updates);
-        }
-        let filtered2 = watermark.filter(&batch);
-        assert!(filtered2.is_empty());
-    }
-
-    #[test]
-    fn test_rate_limit_version_dedup() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        stores.rate_limit.update_membership(&["node1".to_string()]);
-
-        let test_key = "test_rate_limit_key".to_string();
-        assert!(
-            stores.rate_limit.is_owner(&test_key),
-            "single-node membership should own every rate-limit shard"
-        );
-
-        stores
-            .rate_limit
-            .inc(test_key.clone(), "node1".to_string(), 1);
-
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-        let mut watermark = PeerWatermark::new("peer".to_string());
-
-        let batch1 = central.collect();
-        let filtered1 = watermark.filter(&batch1);
-        let rl1 = filtered1
-            .iter()
-            .find(|(t, _)| *t == StoreType::RateLimit)
-            .map(|(_, v)| v)
-            .expect("rate limit updates expected on first filter");
-        assert!(!rl1.is_empty());
-        for (store_type, updates) in &filtered1 {
-            watermark.mark_sent(*store_type, updates);
-        }
-
-        let filtered2 = watermark.filter(&batch1);
-        let rl2_count = filtered2
-            .iter()
-            .find(|(t, _)| *t == StoreType::RateLimit)
-            .map(|(_, v)| v.len())
-            .unwrap_or(0);
-        assert_eq!(rl2_count, 0, "same batch should be fully deduped");
-
-        stores
-            .rate_limit
-            .inc(test_key.clone(), "node1".to_string(), 1);
-        central.advance_generations();
-        let batch3 = central.collect();
-        let filtered3 = watermark.filter(&batch3);
-        let rl3 = filtered3
-            .iter()
-            .find(|(t, _)| *t == StoreType::RateLimit)
-            .map(|(_, v)| v)
-            .expect("bumped rate-limit shard should re-appear in filter");
-        assert!(!rl3.is_empty());
-    }
-
-    #[test]
-    fn test_version_tracking() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(1, true, 0.5));
-        let batch1 = central.collect();
-        let v1 = batch1
-            .updates
-            .iter()
-            .find(|(t, _)| *t == StoreType::Worker)
-            .map(|(_, v)| v[0].version)
-            .unwrap();
-        assert_eq!(v1, 1);
-
-        central.advance_generations();
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(2, false, 0.8));
-        let batch2 = central.collect();
-        let v2 = batch2
-            .updates
-            .iter()
-            .find(|(t, _)| *t == StoreType::Worker)
-            .map(|(_, v)| v[0].version)
-            .unwrap();
-        assert_eq!(v2, 2);
-
-        central.advance_generations();
-        let _ = stores
-            .worker
-            .insert("worker1".to_string(), make_worker(3, true, 0.3));
-        let batch3 = central.collect();
-        let v3 = batch3
-            .updates
-            .iter()
-            .find(|(t, _)| *t == StoreType::Worker)
-            .map(|(_, v)| v[0].version)
-            .unwrap();
-        assert_eq!(v3, 3);
-    }
-
-    // ========================================================================
-    // Multi-peer delivery tests (v2 bug fix verification)
-    // ========================================================================
-
-    use crate::tree_ops::TenantInsert;
-
-    fn insert_tenant_delta(stores: &StateStores, model_id: &str, hash: u64) {
-        stores
-            .tenant_delta_inserts
-            .entry(model_id.to_string())
-            .or_default()
-            .push(TenantInsert {
-                node_path_hash: hash,
-                worker_url: "http://w1:8000".to_string(),
-                epoch: 0,
-            });
-        stores.bump_tree_version(&format!("tree:{model_id}"));
-    }
-
-    /// Regression test for v1 per-peer collector bug: tenant deltas were
-    /// destructively drained from the shared DashMap, so only the first peer's
-    /// collector received them. This test verifies that with CentralCollector
-    /// + PeerWatermark, ALL peers see the same deltas.
-    #[test]
-    fn test_all_peers_receive_tenant_deltas() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-
-        // Simulate a tree insert producing a tenant delta
-        insert_tenant_delta(&stores, "model-x", 0xABCD);
-        insert_tenant_delta(&stores, "model-x", 0xBEEF);
-        insert_tenant_delta(&stores, "model-y", 0xDEAD);
-
-        // Central collector runs once per round
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-        let batch = central.collect();
-
-        // Three simulated peers, each with their own watermark
-        let mut peer_a = PeerWatermark::new("peer-a".to_string());
-        let mut peer_b = PeerWatermark::new("peer-b".to_string());
-        let mut peer_c = PeerWatermark::new("peer-c".to_string());
-
-        // All three peers see the same tenant delta updates from the batch
-        let a_updates = peer_a.filter(&batch);
-        let b_updates = peer_b.filter(&batch);
-        let c_updates = peer_c.filter(&batch);
-
-        // Helper: count tree:* entries (tenant deltas)
-        let count_tree_updates = |updates: &[(StoreType, Vec<StateUpdate>)]| -> usize {
-            updates
-                .iter()
-                .flat_map(|(_, v)| v.iter())
-                .filter(|u| u.key.starts_with("tree:"))
-                .count()
-        };
-
-        let a_tree = count_tree_updates(&a_updates);
-        let b_tree = count_tree_updates(&b_updates);
-        let c_tree = count_tree_updates(&c_updates);
-
-        assert_eq!(
-            a_tree, 2,
-            "peer-a should see 2 tenant deltas (model-x and model-y)"
-        );
-        assert_eq!(
-            b_tree, 2,
-            "peer-b should see 2 tenant deltas (v1 bug: only peer-a would see them)"
-        );
-        assert_eq!(c_tree, 2, "peer-c should see 2 tenant deltas");
-
-        // After each peer marks their updates as sent, a second collect
-        // (no new changes) should return nothing for those peers.
-        for (store_type, updates) in &a_updates {
-            peer_a.mark_sent(*store_type, updates);
-        }
-        for (store_type, updates) in &b_updates {
-            peer_b.mark_sent(*store_type, updates);
-        }
-        for (store_type, updates) in &c_updates {
-            peer_c.mark_sent(*store_type, updates);
-        }
-    }
-
-    #[test]
-    fn test_peer_watermark_filters_by_version() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-
-        let _ = stores.worker.insert(
-            "worker:1".to_string(),
-            WorkerState {
-                worker_id: "worker:1".to_string(),
-                model_id: "model1".to_string(),
-                url: "http://localhost:8000".to_string(),
-                health: true,
-                load: 0.0,
-                version: 5,
-                spec: vec![],
-            },
-        );
-
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-        let batch = central.collect();
-
-        let mut peer_a = PeerWatermark::new("peer-a".to_string());
-
-        // First filter: peer-a has no watermark, gets the update
-        let updates1 = peer_a.filter(&batch);
-        assert_eq!(updates1.len(), 1);
-        assert_eq!(updates1[0].1.len(), 1);
-        assert_eq!(updates1[0].1[0].version, 5);
-
-        // Mark sent: peer-a's watermark is now at version 5
-        for (store_type, updates) in &updates1 {
-            peer_a.mark_sent(*store_type, updates);
-        }
-
-        // Second filter: peer-a already has version 5, filtered out
-        let updates2 = peer_a.filter(&batch);
-        assert_eq!(
-            updates2.iter().flat_map(|(_, v)| v.iter()).count(),
-            0,
-            "peer-a should filter out already-sent versions"
-        );
-    }
-
-    #[test]
-    fn test_peers_with_different_watermarks() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-
-        // Two workers, one at version 3, one at version 7
-        let _ = stores.worker.insert(
-            "worker:1".to_string(),
-            WorkerState {
-                worker_id: "worker:1".to_string(),
-                model_id: "m1".to_string(),
-                url: "http://w1:8000".to_string(),
-                health: true,
-                load: 0.0,
-                version: 3,
-                spec: vec![],
-            },
-        );
-        let _ = stores.worker.insert(
-            "worker:2".to_string(),
-            WorkerState {
-                worker_id: "worker:2".to_string(),
-                model_id: "m2".to_string(),
-                url: "http://w2:8000".to_string(),
-                health: true,
-                load: 0.0,
-                version: 7,
-                spec: vec![],
-            },
-        );
-
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-        let batch = central.collect();
-
-        // peer-a is at worker:1 v=3, worker:2 v=0 (new)
-        // peer-b is at worker:1 v=0 (new), worker:2 v=7 (caught up)
-        let mut peer_a = PeerWatermark::new("peer-a".to_string());
-        let mut peer_b = PeerWatermark::new("peer-b".to_string());
-
-        // Seed peer_a's watermark: already has worker:1 v=3, missing worker:2
-        peer_a.mark_sent(
-            StoreType::Worker,
-            &[StateUpdate {
-                key: "worker:1".to_string(),
-                value: vec![],
-                version: 3,
-                actor: "node1".to_string(),
-                timestamp: 0,
-            }],
-        );
-        // Seed peer_b's watermark: already has worker:2 v=7, missing worker:1
-        peer_b.mark_sent(
-            StoreType::Worker,
-            &[StateUpdate {
-                key: "worker:2".to_string(),
-                value: vec![],
-                version: 7,
-                actor: "node1".to_string(),
-                timestamp: 0,
-            }],
-        );
-
-        let a_updates = peer_a.filter(&batch);
-        let b_updates = peer_b.filter(&batch);
-
-        // peer_a: only worker:2 is new (v=7 > 0)
-        let a_keys: Vec<String> = a_updates
-            .iter()
-            .flat_map(|(_, v)| v.iter().map(|u| u.key.clone()))
-            .collect();
-        assert_eq!(a_keys, vec!["worker:2"], "peer-a should only get worker:2");
-
-        // peer_b: only worker:1 is new (v=3 > 0)
-        let b_keys: Vec<String> = b_updates
-            .iter()
-            .flat_map(|(_, v)| v.iter().map(|u| u.key.clone()))
-            .collect();
-        assert_eq!(b_keys, vec!["worker:1"], "peer-b should only get worker:1");
-    }
-
-    #[test]
-    fn test_central_collector_drains_tenant_deltas_once() {
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        insert_tenant_delta(&stores, "model-x", 0xABCD);
-
-        let central = CentralCollector::new(stores.clone(), "node1".to_string());
-
-        // First collect: drains tenant deltas
-        let batch1 = central.collect();
-        let tree_updates_1: usize = batch1
-            .updates
-            .iter()
-            .flat_map(|(_, v)| v.iter())
-            .filter(|u| u.key.starts_with("tree:"))
-            .count();
-        assert_eq!(
-            tree_updates_1, 1,
-            "first collect should drain the tenant delta"
-        );
-
-        // Second collect (no new changes): tenant deltas already drained, so no tree updates
-        // (but generation hasn't changed, so Policy store is skipped entirely)
-        central.advance_generations();
-        let batch2 = central.collect();
-        let tree_updates_2: usize = batch2
-            .updates
-            .iter()
-            .flat_map(|(_, v)| v.iter())
-            .filter(|u| u.key.starts_with("tree:"))
-            .count();
-        assert_eq!(
-            tree_updates_2, 0,
-            "second collect should have no tenant deltas"
-        );
-    }
-}
diff --git a/crates/mesh/src/consistent_hash.rs b/crates/mesh/src/consistent_hash.rs
deleted file mode 100644
index a81d726c5..000000000
--- a/crates/mesh/src/consistent_hash.rs
+++ /dev/null
@@ -1,217 +0,0 @@
-//! Consistent hashing for rate-limit ownership
-//!
-//! Implements consistent hashing ring to determine K owners (K=1-3) for each rate-limit key.
-//! Supports ownership transfer on node failures.
-
-use std::{
-    collections::{hash_map::DefaultHasher, BTreeMap, HashSet},
-    hash::{Hash, Hasher},
-};
-
-/// Number of virtual nodes per physical node (for better distribution)
-const VIRTUAL_NODES_PER_NODE: usize = 150;
-
-/// Number of owners (K) for each key
-const NUM_OWNERS: usize = 3;
-
-/// Consistent hash ring
-#[derive(Debug, Clone)]
-pub struct ConsistentHashRing {
-    /// Ring: hash -> node_name
-    ring: BTreeMap<u64, String>,
-    /// Node -> set of virtual node hashes
-    node_hashes: BTreeMap<String, HashSet<u64>>,
-}
-
-impl ConsistentHashRing {
-    pub fn new() -> Self {
-        Self {
-            ring: BTreeMap::new(),
-            node_hashes: BTreeMap::new(),
-        }
-    }
-
-    /// Add a node to the ring
-    pub fn add_node(&mut self, node_name: &str) {
-        if self.node_hashes.contains_key(node_name) {
-            // Node already exists
-            return;
-        }
-
-        let mut hashes = HashSet::new();
-        for i in 0..VIRTUAL_NODES_PER_NODE {
-            let virtual_node = format!("{node_name}:{i}");
-            let hash = Self::hash(&virtual_node);
-            self.ring.insert(hash, node_name.to_string());
-            hashes.insert(hash);
-        }
-        self.node_hashes.insert(node_name.to_string(), hashes);
-    }
-
-    /// Remove a node from the ring
-    pub fn remove_node(&mut self, node_name: &str) {
-        if let Some(hashes) = self.node_hashes.remove(node_name) {
-            for hash in hashes {
-                self.ring.remove(&hash);
-            }
-        }
-    }
-
-    /// Get K owners for a key
-    pub fn get_owners(&self, key: &str) -> Vec<String> {
-        if self.ring.is_empty() {
-            return Vec::new();
-        }
-
-        let key_hash = Self::hash(key);
-        let mut owners = Vec::new();
-        let mut seen_nodes = HashSet::new();
-        let total_unique_nodes = self.node_hashes.len();
-
-        // Find the first node >= key_hash (clockwise)
-        let mut iter = self.ring.range(key_hash..);
-        while owners.len() < NUM_OWNERS && seen_nodes.len() < total_unique_nodes {
-            if let Some((_, node)) = iter.next() {
-                if !seen_nodes.contains(node) {
-                    owners.push(node.clone());
-                    seen_nodes.insert(node.clone());
-                }
-            } else {
-                // Wrap around to the beginning
-                iter = self.ring.range(..);
-            }
-        }
-
-        owners
-    }
-
-    /// Check if a node is an owner of a key
-    pub fn is_owner(&self, key: &str, node_name: &str) -> bool {
-        self.get_owners(key).contains(&node_name.to_string())
-    }
-
-    /// Get all nodes in the ring
-    #[cfg(test)]
-    pub fn get_nodes(&self) -> Vec<String> {
-        self.node_hashes.keys().cloned().collect()
-    }
-
-    /// Check if a node exists in the ring
-    #[cfg(test)]
-    pub fn has_node(&self, node_name: &str) -> bool {
-        self.node_hashes.contains_key(node_name)
-    }
-
-    /// Hash a string to u64
-    fn hash(s: &str) -> u64 {
-        let mut hasher = DefaultHasher::new();
-        s.hash(&mut hasher);
-        hasher.finish()
-    }
-
-    /// Update ring with current membership
-    pub fn update_membership(&mut self, nodes: &[String]) {
-        let current_nodes: HashSet<String> = self.node_hashes.keys().cloned().collect();
-        let new_nodes: HashSet<String> = nodes.iter().cloned().collect();
-
-        // Remove nodes that are no longer present
-        for node in current_nodes.difference(&new_nodes) {
-            self.remove_node(node);
-        }
-
-        // Add new nodes
-        for node in new_nodes.difference(&current_nodes) {
-            self.add_node(node);
-        }
-    }
-}
-
-impl Default for ConsistentHashRing {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_add_remove_node() {
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-        assert!(ring.has_node("node1"));
-        assert_eq!(ring.get_nodes().len(), 1);
-
-        ring.add_node("node2");
-        assert_eq!(ring.get_nodes().len(), 2);
-
-        ring.remove_node("node1");
-        assert!(!ring.has_node("node1"));
-        assert_eq!(ring.get_nodes().len(), 1);
-    }
-
-    #[test]
-    fn test_get_owners() {
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-        ring.add_node("node2");
-        ring.add_node("node3");
-
-        let owners = ring.get_owners("test_key");
-        assert_eq!(owners.len(), NUM_OWNERS);
-        assert!(owners.iter().all(|n| ring.has_node(n)));
-    }
-
-    #[test]
-    fn test_is_owner() {
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-        ring.add_node("node2");
-        ring.add_node("node3");
-
-        let owners = ring.get_owners("test_key");
-        for owner in &owners {
-            assert!(ring.is_owner("test_key", owner));
-        }
-    }
-
-    #[test]
-    fn test_update_membership() {
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-        ring.add_node("node2");
-
-        ring.update_membership(&["node2".to_string(), "node3".to_string()]);
-        assert!(!ring.has_node("node1"));
-        assert!(ring.has_node("node2"));
-        assert!(ring.has_node("node3"));
-    }
-
-    #[test]
-    fn test_get_owners_with_fewer_nodes_than_owners() {
-        // Test that the loop terminates correctly when there are fewer nodes than NUM_OWNERS
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-        ring.add_node("node2");
-        // Only 2 nodes, but NUM_OWNERS is 3
-
-        let owners = ring.get_owners("test_key");
-        // Should return all available nodes (2) without infinite loop
-        assert_eq!(owners.len(), 2);
-        assert!(owners.contains(&"node1".to_string()));
-        assert!(owners.contains(&"node2".to_string()));
-    }
-
-    #[test]
-    fn test_get_owners_with_single_node() {
-        // Test with only one node
-        let mut ring = ConsistentHashRing::new();
-        ring.add_node("node1");
-
-        let owners = ring.get_owners("test_key");
-        // Should return the single node without infinite loop
-        assert_eq!(owners.len(), 1);
-        assert_eq!(owners[0], "node1");
-    }
-}
diff --git a/crates/mesh/src/controller.rs b/crates/mesh/src/controller.rs
index 608536e8d..291e231d3 100644
--- a/crates/mesh/src/controller.rs
+++ b/crates/mesh/src/controller.rs
@@ -26,18 +26,14 @@ use super::{
         },
         try_ping, ClusterState,
     },
-    stores::StateStores,
-    sync::MeshSyncManager,
 };
 use crate::{
     chunking::{
         build_stream_batches, chunk_value, dispatch_stream_batch, next_generation,
         DEFAULT_MAX_CHUNKS_PER_BATCH, MAX_STREAM_CHUNK_BYTES,
     },
-    collector::{CentralCollector, PeerWatermark, RoundBatch},
-    flow_control::{MessageSizeValidator, MAX_MESSAGE_SIZE},
+    flow_control::MAX_MESSAGE_SIZE,
     metrics,
-    service::gossip::IncrementalUpdate,
 };
 
 pub struct MeshController {
@@ -45,16 +41,9 @@ pub struct MeshController {
     self_name: String,
     self_addr: SocketAddr,
     init_peer: Option<String>,
-    stores: Arc<StateStores>,
-    sync_manager: Arc<MeshSyncManager>,
     mtls_manager: Option<Arc<MtlsManager>>,
     // Track active sync_stream connections
     sync_connections: Arc<Mutex<HashMap<String, tokio::task::JoinHandle<()>>>>,
-    /// Central collector that runs once per gossip round.
-    central_collector: Arc<CentralCollector>,
-    /// Current round batch, updated once per round by the central collector.
-    /// Per-peer senders read and apply their own watermark filtering.
-    current_batch: Arc<parking_lot::RwLock<Arc<RoundBatch>>>,
     /// Current stream round batch, drained once per round from MeshKV.
     /// Per-peer senders read this and filter targeted entries to their
     /// own peer; drain_entries are broadcast to every peer.
@@ -66,29 +55,20 @@ impl MeshController {
-    /// Create a new MeshController with stores and sync manager
     pub fn new(
         state: ClusterState,
         self_addr: SocketAddr,
         self_name: &str,
         init_peer: Option<String>,
-        stores: Arc<StateStores>,
-        sync_manager: Arc<MeshSyncManager>,
         mtls_manager: Option<Arc<MtlsManager>>,
     ) -> Self {
-        let central_collector =
-            Arc::new(CentralCollector::new(stores.clone(), self_name.to_string()));
         Self {
             state,
             self_name: self_name.to_string(),
            self_addr,
             init_peer,
-            stores,
-            sync_manager,
             mtls_manager,
             sync_connections: Arc::new(Mutex::new(HashMap::new())),
-            central_collector,
-            current_batch: Arc::new(parking_lot::RwLock::new(Arc::new(RoundBatch::default()))),
             current_stream_batch: Arc::new(parking_lot::RwLock::new(Arc::new(
                 crate::kv::RoundBatch::default(),
             ))),
@@ -105,12 +85,6 @@ impl MeshController {
         self
     }
 
-    /// Get a handle to the shared RoundBatch. Used by GossipService to
-    /// share the centrally collected batch with server-side sync_stream handlers.
-    pub fn current_batch(&self) -> Arc<parking_lot::RwLock<Arc<RoundBatch>>> {
-        self.current_batch.clone()
-    }
-
     /// Get a handle to the shared stream RoundBatch. Used by GossipService
     /// so server-side sync_stream handlers see the same drained stream
     /// entries as client-side handlers.
@@ -171,13 +145,6 @@ impl MeshController {
             };
             cnt += 1;
 
-            // Checkpoint tree state every 10 rounds (~10s) by exporting
-            // the live radix tree from CacheAwarePolicy into tree_configs.
-            // This keeps the periodic structure snapshot fresh.
-            if cnt.is_multiple_of(10) {
-                self.sync_manager.checkpoint_tree_states();
-            }
-
             // Chunk assembler GC: every 5 rounds (~5s), drop partial
             // assemblies older than 30s. Partial chunks the receiver has
             // been holding for a full assembly timeout are assumed lost;
@@ -189,83 +156,11 @@
                 }
             }
 
-            // Periodic GC: clean up tombstoned CRDT metadata every 60 rounds (~60s)
+            // Periodic retry-manager cleanup every 60 rounds (~60s).
             if cnt.is_multiple_of(60) {
-                let removed = self.stores.gc_tombstones();
-                if removed > 0 {
-                    log::info!("GC: removed {removed} tombstoned CRDT metadata entries");
-                }
-                let tree_removed = self.stores.gc_stale_tree_entries();
-                if tree_removed > 0 {
-                    log::info!("GC: removed {tree_removed} stale tree_configs entries");
-                }
-                // Record store sizes for monitoring
-                metrics::record_store_sizes(
-                    self.stores.worker.len(),
-                    self.stores.policy.len(),
-                    self.stores.membership.len(),
-                    self.stores.app.len(),
-                );
-
-                // Log all mesh data structure sizes for memory debugging.
- let tree_configs_bytes: usize = self - .stores - .tree_configs - .iter() - .map(|e| e.value().len()) - .sum(); - let tenant_inserts: usize = self - .stores - .tenant_delta_inserts - .iter() - .map(|e| e.value().len()) - .sum(); - let tenant_evictions: usize = self - .stores - .tenant_delta_evictions - .iter() - .map(|e| e.value().len()) - .sum(); - let tree_ops_pending: usize = self - .stores - .tree_ops_pending - .iter() - .map(|e| e.value().len()) - .sum(); - log::info!( - "Mesh memory: tree_configs={} entries ({} bytes), tree_versions={}, \ - tenant_inserts={}, tenant_evictions={}, tree_ops_pending={}, \ - policy_crdt={}, worker_crdt={}", - self.stores.tree_configs.len(), - tree_configs_bytes, - self.stores.tree_versions.len(), - tenant_inserts, - tenant_evictions, - tree_ops_pending, - self.stores.policy.len(), - self.stores.worker.len(), - ); - - // Log CRDT policy store operation log length for memory debugging - let policy_oplog_len = self.stores.policy.get_operation_log().len(); - log::info!( - policy_oplog_len, - "GC: CRDT policy store operation log length" - ); - - // Clean up retry managers for peers no longer in cluster state retry_managers.retain(|peer_name, _| map.contains_key(peer_name)); } - // Central collection: run once per round. Drains tenant deltas - // (destructive) and collects all store changes into one batch. - // Per-peer senders read this batch and filter by their watermarks. - { - let batch = self.central_collector.collect(); - *self.current_batch.write() = Arc::new(batch); - self.central_collector.advance_generations(); - } - // Stream round collection: drain stream namespace buffers and // drain callbacks exactly once per round (destructive). Per-peer // senders filter targeted_entries by their own peer_id and @@ -490,10 +385,7 @@ impl MeshController { self_name: String, peer_name: String, ) -> tokio::task::JoinHandle<()> { - let stores = self.stores.clone(); - let sync_manager = self.sync_manager.clone(); let sync_connections = self.sync_connections.clone(); - let current_batch = self.current_batch.clone(); let current_stream_batch = self.current_stream_batch.clone(); let mesh_kv = self.mesh_kv.clone(); @@ -536,20 +428,12 @@ impl MeshController { return; } - // Spawn a task to periodically send incremental updates (client-side sender). - // Uses PeerWatermark to filter the centrally collected batch. + // Spawn a task to periodically broadcast v2 stream batches. let incremental_sender_handle = { - let mut watermark = PeerWatermark::new(peer_name.clone()); - log::debug!( - peer = %peer_name, - "PeerWatermark created for centralized gossip" - ); let tx_incremental = tx.clone(); let self_name_incremental = self_name.clone(); let peer_name_incremental = peer_name.clone(); let shared_sequence = sequence.clone(); - let size_validator = MessageSizeValidator::default(); - let batch_handle = current_batch.clone(); let stream_batch_handle = current_stream_batch.clone(); #[expect(clippy::disallowed_methods, reason = "incremental sender handle is stored and aborted when the parent sync_stream handler exits")] @@ -564,93 +448,6 @@ impl MeshController { let round_start = std::time::Instant::now(); - // Read the centrally collected batch and filter by - // this peer's watermark. No collection happens here. 
- let batch = batch_handle.read().clone(); - let all_updates = watermark.filter(&batch); - - let collect_elapsed = round_start.elapsed(); - - if !all_updates.is_empty() { - for (store_type, updates) in &all_updates { - let proto_store_type = store_type.to_proto(); - - // Validate message size before sending - let batch_size: usize = updates.iter().map(|u| u.value.len()).sum(); - - log::debug!( - peer = %peer_name_incremental, - store = ?store_type, - updates = updates.len(), - batch_bytes = batch_size, - "mesh sync store batch" - ); - metrics::record_sync_batch_bytes( - &peer_name_incremental, - store_type.as_str(), - batch_size, - ); - - if let Err(e) = size_validator.validate(batch_size) { - log::warn!( - "Incremental update too large, skipping store {:?}: {} (max: {} bytes)", - store_type, - e, - size_validator.max_size() - ); - // Mark non-tree stores as sent to prevent infinite - // retry loops. Tree updates are retried next round. - let is_tree_update = - updates.iter().any(|u| u.key.starts_with("tree:")); - if !is_tree_update { - watermark.mark_sent(*store_type, updates); - } - continue; - } - - let incremental_update = StreamMessage { - message_type: StreamMessageType::IncrementalUpdate as i32, - payload: Some( - super::service::gossip::stream_message::Payload::Incremental( - IncrementalUpdate { - store: proto_store_type, - updates: updates.clone(), - version: 0, - }, - ), - ), - sequence: shared_sequence.fetch_add(1, Ordering::Relaxed), - peer_id: self_name_incremental.clone(), - }; - - log::debug!( - "Sending incremental update to {}: store={:?}, {} updates", - peer_name_incremental, - store_type, - updates.len(), - ); - - match tx_incremental.try_send(incremental_update) { - Ok(()) => { - // Mark as sent after successful transmission - watermark.mark_sent(*store_type, updates); - } - Err(mpsc::error::TrySendError::Full(_)) => { - log::debug!( - "Backpressure: channel full, skipping send (will retry next interval)" - ); - continue; - } - Err(mpsc::error::TrySendError::Closed(_)) => { - log::warn!( - "Channel closed, stopping incremental update sender" - ); - break; - } - } - } - } - // Stream batches: drain-portion (broadcast) + // targeted entries addressed to this peer. Each // entry is chunked if oversized. 
On channel @@ -727,12 +524,10 @@ impl MeshController { &peer_name_incremental, round_elapsed, ); - if round_elapsed.as_millis() > 10 || !all_updates.is_empty() { + if round_elapsed.as_millis() > 10 { log::info!( peer = %peer_name_incremental, round_ms = round_elapsed.as_millis(), - collect_ms = collect_elapsed.as_millis(), - stores_with_updates = all_updates.len(), "mesh sync round" ); } @@ -748,312 +543,6 @@ impl MeshController { sequence.fetch_add(1, Ordering::Relaxed); match msg.message_type() { - StreamMessageType::IncrementalUpdate => { - log::info!( - "[CLIENT] Received incremental update from {} (seq: {})", - peer_name, - msg.sequence - ); - - // Apply incremental updates to local stores - if let Some( - super::service::gossip::stream_message::Payload::Incremental( - update, - ), - ) = &msg.payload - { - use super::stores::StoreType as LocalStoreType; - - let store_type = LocalStoreType::from_proto(update.store); - log::info!( - "[CLIENT] Applying incremental update from {}: store={:?}, {} updates", - peer_name, - store_type, - update.updates.len() - ); - - // Apply updates based on store type - for state_update in &update.updates { - match store_type { - LocalStoreType::App => { - // Deserialize and apply app state - if let Ok(app_state) = bincode::deserialize::< - super::stores::AppState, - >( - &state_update.value - ) { - let dominated = stores.app.get(&app_state.key) - .is_some_and(|existing| existing.version >= app_state.version); - if !dominated { - // Mirror into the v2 `config:` CRDT - // namespace so v2-only readers can - // reach the same value during a - // rolling upgrade, even when the - // source is a v1 node still writing - // to AppStore. - if let Some(ref kv) = mesh_kv { - kv.configs().put( - &format!( - "config:{}", - app_state.key - ), - app_state.value.clone(), - ); - } - if let Err(err) = stores.app.insert( - app_state.key.clone(), - app_state, - ) { - log::warn!(error = %err, "Failed to apply app state update"); - } - } - } - } - LocalStoreType::Membership => { - // Deserialize and apply membership state - if let Ok(membership_state) = bincode::deserialize::< - super::stores::MembershipState, - >( - &state_update.value - ) { - if let Err(err) = stores.membership.insert( - membership_state.name.clone(), - membership_state, - ) { - log::warn!(error = %err, "Failed to apply membership state update"); - } - } - } - LocalStoreType::Worker => { - // Deserialize and apply worker state - if let Ok(worker_state) = bincode::deserialize::< - super::stores::WorkerState, - >( - &state_update.value - ) { - let actor = Some(state_update.actor.clone()); - sync_manager.apply_remote_worker_state( - worker_state, - actor, - ); - } - } - LocalStoreType::Policy => { - // Deserialize and apply policy state - if let Ok(policy_state) = bincode::deserialize::< - super::stores::PolicyState, - >( - &state_update.value - ) { - let actor = Some(state_update.actor.clone()); - - if policy_state.policy_type - == "tenant_delta" - { - // Lightweight tenant delta — no tree structure, no prompt text - match super::tree_ops::TenantDelta::from_bytes( - &policy_state.config, - ) { - Ok(delta) => { - sync_manager - .apply_remote_tenant_delta( - delta, actor, - ); - } - Err(e) => { - log::warn!( - "Failed to deserialize tenant delta for model {}: {e}", - policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state_lz4" - { - // LZ4-compressed snapshot (TreeState or TreeSnapshot bytes) - match super::tree_ops::lz4_decompress( - &policy_state.config, - ) { - Ok(decompressed) 
=> { - // Try TreeState first (backward compat) - if let Ok(tree_state) = - super::tree_ops::TreeState::from_bytes( - &decompressed, - ) - { - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor, - ); - } else if let Ok(snap) = - kv_index::snapshot::TreeSnapshot::from_bytes( - &decompressed, - ) - { - let tree_state = - super::tree_ops::TreeState::from_snapshot( - policy_state - .model_id - .clone(), - &snap, - policy_state.version, - ); - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor, - ); - } else { - log::warn!( - "Failed to deserialize tree_state_lz4 payload for model {}", - policy_state.model_id - ); - } - } - Err(e) => { - log::warn!( - "Failed to LZ4-decompress tree state for model {}: {e}", - policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state_delta" - { - // Delta: apply only the new operations - match super::tree_ops::TreeStateDelta::from_bytes( - &policy_state.config, - ) - { - Ok(delta) => { - sync_manager - .apply_remote_tree_delta( - delta, actor, - ); - } - Err(e) => { - log::warn!( - "Failed to deserialize tree state delta for model {}: {e}", - policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state" - { - // Full state: replace (backward compatible) - match super::tree_ops::TreeState::from_bytes( - &policy_state.config, - ) - { - Ok(tree_state) => { - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor, - ); - } - Err(e) => { - log::warn!( - "Failed to deserialize tree state for model {}: {e}", - policy_state.model_id - ); - } - } - } else { - // Regular policy state update - sync_manager.apply_remote_policy_state( - policy_state, - actor, - ); - } - } - } - LocalStoreType::RateLimit => { - // Backward-compatible rate-limit decoding: - // old payloads may send OperationLog, newer ones send raw i64. 
- if let Ok(log) = bincode::deserialize::< - super::crdt_kv::OperationLog, - >(&state_update.value) - { - sync_manager - .apply_remote_rate_limit_counter(&log); - } else if let Ok(counter_value) = - bincode::deserialize::( - &state_update.value, - ) - { - sync_manager - .apply_remote_rate_limit_counter_value_with_actor_and_timestamp( - state_update.key.clone(), - state_update.actor.clone(), - counter_value, - state_update.timestamp, - ); - } else { - log::warn!( - key = %state_update.key, - "Failed to decode rate-limit update as OperationLog or i64" - ); - } - } - } - } - } - - // Send ACK - let ack = StreamMessage { - message_type: StreamMessageType::Ack as i32, - payload: Some(StreamPayload::Ack( - super::service::gossip::StreamAck { - sequence: msg.sequence, - success: true, - error_message: String::new(), - }, - )), - sequence: sequence.fetch_add(1, Ordering::Relaxed), - peer_id: self_name.clone(), - }; - if tx.send(ack).await.is_err() { - log::warn!("Failed to send ACK to {}", peer_name); - break; - } - } - StreamMessageType::SnapshotChunk => { - log::info!( - "Received snapshot chunk from {} (seq: {})", - peer_name, - msg.sequence - ); - // Server side handles snapshot assembly - // Send ACK - let ack = StreamMessage { - message_type: StreamMessageType::Ack as i32, - payload: Some(StreamPayload::Ack( - super::service::gossip::StreamAck { - sequence: msg.sequence, - success: true, - error_message: String::new(), - }, - )), - sequence: sequence.fetch_add(1, Ordering::Relaxed), - peer_id: self_name.clone(), - }; - if tx.send(ack).await.is_err() { - log::warn!("Failed to send ACK to {}", peer_name); - break; - } - } StreamMessageType::Heartbeat => { log::trace!("Received heartbeat from {}", peer_name); // Send heartbeat back @@ -1068,73 +557,6 @@ impl MeshController { break; } } - StreamMessageType::SnapshotRequest => { - log::info!("Received snapshot request from {}", peer_name); - // Handle snapshot request - generate and send snapshot using GossipService - if let Some(StreamPayload::SnapshotRequest(req)) = &msg.payload { - use std::net::SocketAddr; - - use super::{ - ping_server::GossipService, - stores::StoreType as LocalStoreType, - }; - - let store_type = LocalStoreType::from_proto(req.store); - log::info!( - "Generating snapshot for store {:?}", - store_type - ); - - // Create a temporary GossipService to generate snapshot chunks - let service = GossipService::new( - Arc::new(parking_lot::RwLock::new(BTreeMap::new())), - SocketAddr::from(([0, 0, 0, 0], 0)), - SocketAddr::from(([0, 0, 0, 0], 0)), - &self_name, - ) - .with_stores(stores.clone()) - .with_sync_manager(sync_manager.clone()); - - let chunks = - service.create_snapshot_chunks(store_type, 100); - let total_chunks = chunks.len() as u64; - - log::info!( - "Sending {} snapshot chunks for store {:?}", - total_chunks, - store_type - ); - - let mut sent_chunks: u64 = 0; - for chunk in chunks { - let snapshot_chunk = StreamMessage { - message_type: StreamMessageType::SnapshotChunk - as i32, - payload: Some(StreamPayload::SnapshotChunk(chunk)), - sequence: sequence.fetch_add(1, Ordering::Relaxed), - peer_id: self_name.clone(), - }; - - if tx.send(snapshot_chunk).await.is_err() { - log::warn!( - "Failed to send snapshot chunk {} to {}", - sent_chunks, - peer_name - ); - break; - } - - sent_chunks += 1; - } - - log::info!( - "Sent {} snapshot chunks for store {:?} to {}", - sent_chunks, - store_type, - peer_name - ); - } - } StreamMessageType::Ack => { log::trace!( "Received ACK from {} (seq: {})", @@ -1149,11 +571,14 @@ impl 
MeshController {
             msg.sequence
         );
     }
-    StreamMessageType::SnapshotComplete => {
+    StreamMessageType::IncrementalUpdate
+    | StreamMessageType::SnapshotRequest
+    | StreamMessageType::SnapshotChunk
+    | StreamMessageType::SnapshotComplete => {
         log::debug!(
-            "Received message type {:?} from {}",
-            msg.message_type,
-            peer_name
+            peer = %peer_name,
+            message_type = ?msg.message_type(),
+            "ignoring v1 wire message (state-sync removed)",
         );
     }
     StreamMessageType::StreamBatch => {
diff --git a/crates/mesh/src/crdt_kv/crdt.rs b/crates/mesh/src/crdt_kv/crdt.rs
index 941e50504..574ee6616 100644
--- a/crates/mesh/src/crdt_kv/crdt.rs
+++ b/crates/mesh/src/crdt_kv/crdt.rs
@@ -168,121 +168,6 @@ impl CrdtOrMap {
         result
     }
 
-    /// Update a key using the current store value and CRDT insert semantics.
-    pub fn upsert<F>(&self, key: String, updater: F) -> Vec<u8>
-    where
-        F: FnOnce(Option<&[u8]>) -> Vec<u8>,
-    {
-        let key_lock = self.key_lock_for(&key);
-        let key_guard = key_lock.lock();
-
-        let current_value = self.store.get(&key);
-        let updated_value = updater(current_value.as_deref());
-        let timestamp = self.clock.tick();
-
-        let result = if self.record_insert_metadata(&key, timestamp, self.replica_id) {
-            let operation = Operation::insert(
-                key.clone(),
-                updated_value.clone(),
-                timestamp,
-                self.replica_id,
-            );
-
-            self.store.insert(key.clone(), updated_value.clone());
-            self.operation_log.write().append(operation);
-
-            updated_value
-        } else {
-            self.store.get(&key).unwrap_or_default()
-        };
-
-        drop(key_guard);
-        self.try_cleanup_key_lock(&key, &key_lock);
-        result
-    }
-
-    /// Fallible variant of upsert that returns serializer/updater errors.
-    pub fn try_upsert<F, E>(&self, key: String, updater: F) -> Result<Vec<u8>, E>
-    where
-        F: FnOnce(Option<&[u8]>) -> Result<Vec<u8>, E>,
-    {
-        let key_lock = self.key_lock_for(&key);
-        let key_guard = key_lock.lock();
-
-        let current_value = self.store.get(&key);
-        let updated_value = match updater(current_value.as_deref()) {
-            Ok(value) => value,
-            Err(err) => {
-                drop(key_guard);
-                self.try_cleanup_key_lock(&key, &key_lock);
-                return Err(err);
-            }
-        };
-        let timestamp = self.clock.tick();
-
-        let result = if self.record_insert_metadata(&key, timestamp, self.replica_id) {
-            let operation = Operation::insert(
-                key.clone(),
-                updated_value.clone(),
-                timestamp,
-                self.replica_id,
-            );
-
-            self.store.insert(key.clone(), updated_value.clone());
-            self.operation_log.write().append(operation);
-
-            updated_value
-        } else {
-            self.store.get(&key).unwrap_or_default()
-        };
-
-        drop(key_guard);
-        self.try_cleanup_key_lock(&key, &key_lock);
-        Ok(result)
-    }
-
-    /// Fallible atomic upsert with conditional write. Returning `Ok(None)` skips CRDT write.
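// Shape of the conditional upsert being removed below, as a minimal
// self-contained sketch: the updater sees the current bytes and returns
// Ok(None) to skip the write, so read-mostly callers never tick the clock or
// grow the operation log. `TinyKv` and its fields are illustrative stand-ins
// (a single map-wide lock replaces the per-key locks used here).
use std::{
    collections::HashMap,
    sync::{
        atomic::{AtomicU64, Ordering},
        Mutex,
    },
};

struct TinyKv {
    store: Mutex<HashMap<String, Vec<u8>>>,
    ops: Mutex<Vec<(String, Vec<u8>, u64)>>, // (key, value, logical timestamp)
    clock: AtomicU64,
}

impl TinyKv {
    fn try_upsert_if<F, E>(&self, key: String, updater: F) -> Result<(Vec<u8>, bool), E>
    where
        F: FnOnce(Option<&[u8]>) -> Result<Option<Vec<u8>>, E>,
    {
        let mut store = self.store.lock().unwrap();
        let current = store.get(&key).cloned();
        match updater(current.as_deref())? {
            Some(next) => {
                let ts = self.clock.fetch_add(1, Ordering::Relaxed);
                self.ops.lock().unwrap().push((key.clone(), next.clone(), ts));
                store.insert(key, next.clone());
                Ok((next, true))
            }
            None => Ok((current.unwrap_or_default(), false)),
        }
    }
}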
-    pub fn try_upsert_if<F, E>(&self, key: String, updater: F) -> Result<(Vec<u8>, bool), E>
-    where
-        F: FnOnce(Option<&[u8]>) -> Result<Option<Vec<u8>>, E>,
-    {
-        let key_lock = self.key_lock_for(&key);
-        let key_guard = key_lock.lock();
-
-        let current_value = self.store.get(&key);
-        let maybe_updated_value = match updater(current_value.as_deref()) {
-            Ok(value) => value,
-            Err(err) => {
-                drop(key_guard);
-                self.try_cleanup_key_lock(&key, &key_lock);
-                return Err(err);
-            }
-        };
-
-        let (result, changed) = if let Some(updated_value) = maybe_updated_value {
-            let timestamp = self.clock.tick();
-            if self.record_insert_metadata(&key, timestamp, self.replica_id) {
-                let operation = Operation::insert(
-                    key.clone(),
-                    updated_value.clone(),
-                    timestamp,
-                    self.replica_id,
-                );
-                self.store.insert(key.clone(), updated_value.clone());
-                self.operation_log.write().append(operation);
-                (updated_value, true)
-            } else {
-                (self.store.get(&key).unwrap_or_default(), false)
-            }
-        } else {
-            (self.store.get(&key).unwrap_or_default(), false)
-        };
-
-        drop(key_guard);
-        self.try_cleanup_key_lock(&key, &key_lock);
-        Ok((result, changed))
-    }
-
     /// Remove key (transparent operation)
     pub fn remove(&self, key: &str) -> Option<Vec<u8>> {
         let key_lock = self.key_lock_for(key);
diff --git a/crates/mesh/src/crdt_kv/mod.rs b/crates/mesh/src/crdt_kv/mod.rs
index c7570b405..bbde4d660 100644
--- a/crates/mesh/src/crdt_kv/mod.rs
+++ b/crates/mesh/src/crdt_kv/mod.rs
@@ -11,8 +11,7 @@ mod replica;
 
 // Export core types
 pub use crdt::CrdtOrMap;
 pub use epoch_max_wins::{decode, encode, merge, EpochCount, EPOCH_MAX_WINS_ENCODED_LEN};
-pub use operation::{Operation, OperationLog};
-pub use replica::ReplicaId;
+pub use operation::OperationLog;
 
 #[cfg(test)]
 mod tests;
diff --git a/crates/mesh/src/flow_control.rs b/crates/mesh/src/flow_control.rs
index 6f465e2b3..35c4547cd 100644
--- a/crates/mesh/src/flow_control.rs
+++ b/crates/mesh/src/flow_control.rs
@@ -1,9 +1,8 @@
 //! Flow control for mesh cluster communication
 //!
 //! Provides:
-//! - Backpressure control (channel capacity monitoring)
-//! - Message size limits and validation
-//! - Exponential backoff for reconnection
+//! - Message size limit constant for gRPC encode/decode caps
+//!
- Exponential backoff for peer reconnection use std::{ sync::Arc, @@ -15,97 +14,6 @@ use parking_lot::RwLock; /// Maximum message size in bytes (default: 10MB) pub const MAX_MESSAGE_SIZE: usize = 10 * 1024 * 1024; -/// Channel capacity threshold for backpressure (default: 20% remaining) -pub const BACKPRESSURE_THRESHOLD: usize = 25; // 25 out of 128 = ~20% - -/// Backpressure controller for managing channel capacity -#[derive(Debug, Clone)] -pub struct BackpressureController { - channel_capacity: usize, - threshold: usize, -} - -impl BackpressureController { - pub fn new(channel_capacity: usize, threshold: usize) -> Self { - Self { - channel_capacity, - threshold, - } - } - - #[expect(dead_code)] - /// Check if channel has capacity for sending - pub fn can_send(&self, current_len: usize) -> bool { - let remaining = self.channel_capacity.saturating_sub(current_len); - remaining > self.threshold - } - - #[expect(dead_code)] - /// Get remaining capacity - pub fn remaining_capacity(&self, current_len: usize) -> usize { - self.channel_capacity.saturating_sub(current_len) - } -} - -impl Default for BackpressureController { - fn default() -> Self { - Self::new(128, BACKPRESSURE_THRESHOLD) - } -} - -/// Message size validator -#[derive(Debug, Clone)] -pub struct MessageSizeValidator { - max_size: usize, -} - -impl MessageSizeValidator { - pub fn new(max_size: usize) -> Self { - Self { max_size } - } - - /// Validate message size - pub fn validate(&self, size: usize) -> Result<(), MessageSizeError> { - if size > self.max_size { - Err(MessageSizeError::TooLarge { - size, - max: self.max_size, - }) - } else { - Ok(()) - } - } - - /// Get maximum allowed size - pub fn max_size(&self) -> usize { - self.max_size - } -} - -impl Default for MessageSizeValidator { - fn default() -> Self { - Self::new(MAX_MESSAGE_SIZE) - } -} - -/// Message size validation error -#[derive(Debug, Clone)] -pub enum MessageSizeError { - TooLarge { size: usize, max: usize }, -} - -impl std::fmt::Display for MessageSizeError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - MessageSizeError::TooLarge { size, max } => { - write!(f, "Message size {size} exceeds maximum {max}") - } - } - } -} - -impl std::error::Error for MessageSizeError {} - /// Exponential backoff calculator for reconnection #[derive(Debug, Clone)] pub struct ExponentialBackoff { diff --git a/crates/mesh/src/lib.rs b/crates/mesh/src/lib.rs index bcdc9f694..c5422b015 100644 --- a/crates/mesh/src/lib.rs +++ b/crates/mesh/src/lib.rs @@ -7,8 +7,6 @@ mod chunk_assembler; mod chunking; -mod collector; -mod consistent_hash; mod controller; mod crdt_kv; mod flow_control; @@ -16,15 +14,9 @@ mod hash; pub mod kv; mod metrics; mod mtls; -mod node_state_machine; mod partition; mod ping_server; -mod rate_limit_window; mod service; -mod stores; -mod sync; -mod topology; -mod tree_ops; mod types; // Internal tests module with full access to private types diff --git a/crates/mesh/src/metrics.rs b/crates/mesh/src/metrics.rs index 45c8adadd..48d7e9a54 100644 --- a/crates/mesh/src/metrics.rs +++ b/crates/mesh/src/metrics.rs @@ -8,39 +8,12 @@ //! - State integrity metrics //! 
- Rate-limit/LB drift metrics -use std::time::{Duration, Instant}; +use std::time::Duration; use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram}; /// Initialize mesh metrics descriptions pub fn init_mesh_metrics() { - // Convergence latency - describe_histogram!( - "router_mesh_convergence_ms", - "Time for state to converge across mesh in milliseconds" - ); - - // Traffic metrics - describe_counter!( - "router_mesh_batches_total", - "Total number of state update batches sent/received" - ); - describe_counter!("router_mesh_bytes_total", "Total bytes transmitted in mesh"); - - // Snapshot metrics - describe_counter!( - "router_mesh_snapshot_trigger_total", - "Total number of snapshot triggers" - ); - describe_histogram!( - "router_mesh_snapshot_duration_seconds", - "Time to generate and send snapshot" - ); - describe_counter!( - "router_mesh_snapshot_bytes_total", - "Total bytes in snapshots" - ); - // Peer health metrics describe_gauge!( "router_mesh_peer_connections", @@ -56,7 +29,8 @@ pub fn init_mesh_metrics() { "Total number of NACK messages" ); - // State integrity metrics + // State integrity metrics (drift gauges currently retained as scaffolding; + // recorder helpers below are `#[expect(dead_code)]`). describe_gauge!( "router_mesh_store_cardinality", "Number of entries in each store" @@ -66,27 +40,13 @@ pub fn init_mesh_metrics() { "Hash of store state for integrity checking" ); - // Sync round profiling metrics + // Sync round profiling describe_histogram!( "router_mesh_sync_round_duration_seconds", "Duration of a mesh sync round" ); - describe_histogram!("router_mesh_sync_batch_bytes", "Size of mesh sync batch"); - describe_gauge!( - "router_mesh_store_workers", - "Number of entries in worker store" - ); - describe_gauge!( - "router_mesh_store_policies", - "Number of entries in policy store" - ); - describe_gauge!( - "router_mesh_store_memberships", - "Number of entries in membership store" - ); - describe_gauge!("router_mesh_store_apps", "Number of entries in app store"); - // Rate-limit and LB drift metrics + // Rate-limit and LB drift gauges describe_gauge!( "router_rl_drift_ratio", "Rate-limit drift ratio (actual vs expected)" @@ -97,69 +57,6 @@ pub fn init_mesh_metrics() { ); } -/// Record convergence latency -pub fn record_convergence_latency(duration: Duration) { - histogram!("router_mesh_convergence_ms", - "quantile" => "p50" - ) - .record(duration.as_millis() as f64); -} - -/// Record batch transmission -pub fn record_batch_sent(peer: &str, batch_size: usize) { - counter!("router_mesh_batches_total", - "direction" => "sent", - "peer" => peer.to_string() - ) - .increment(1); - counter!("router_mesh_bytes_total", - "direction" => "sent", - "peer" => peer.to_string() - ) - .increment(batch_size as u64); -} - -#[expect(dead_code)] -/// Record batch reception -pub fn record_batch_received(peer: &str, batch_size: usize) { - counter!("router_mesh_batches_total", - "direction" => "received", - "peer" => peer.to_string() - ) - .increment(1); - counter!("router_mesh_bytes_total", - "direction" => "received", - "peer" => peer.to_string() - ) - .increment(batch_size as u64); -} - -/// Record snapshot trigger -pub fn record_snapshot_trigger(store: &str, reason: &str) { - counter!("router_mesh_snapshot_trigger_total", - "store" => store.to_string(), - "reason" => reason.to_string() - ) - .increment(1); -} - -/// Record snapshot generation duration -pub fn record_snapshot_duration(store: &str, duration: Duration) { - 
histogram!("router_mesh_snapshot_duration_seconds", - "store" => store.to_string() - ) - .record(duration.as_secs_f64()); -} - -/// Record snapshot bytes -pub fn record_snapshot_bytes(store: &str, direction: &str, bytes: usize) { - counter!("router_mesh_snapshot_bytes_total", - "store" => store.to_string(), - "direction" => direction.to_string() - ) - .increment(bytes as u64); -} - /// Update peer connection status pub fn update_peer_connections(peer: &str, connected: bool) { gauge!("router_mesh_peer_connections", @@ -237,50 +134,3 @@ pub fn record_sync_round_duration(peer: &str, duration: Duration) { ) .record(duration.as_secs_f64()); } - -/// Record mesh sync batch size in bytes -pub fn record_sync_batch_bytes(peer: &str, store: &str, bytes: usize) { - histogram!("router_mesh_sync_batch_bytes", - "peer" => peer.to_string(), - "store" => store.to_string() - ) - .record(bytes as f64); -} - -/// Record mesh store sizes for monitoring unbounded growth -pub fn record_store_sizes( - worker_count: usize, - policy_count: usize, - membership_count: usize, - app_count: usize, -) { - gauge!("router_mesh_store_workers").set(worker_count as f64); - gauge!("router_mesh_store_policies").set(policy_count as f64); - gauge!("router_mesh_store_memberships").set(membership_count as f64); - gauge!("router_mesh_store_apps").set(app_count as f64); -} - -/// Helper struct for tracking convergence time -pub struct ConvergenceTracker { - start_time: Instant, -} - -impl ConvergenceTracker { - pub fn new() -> Self { - Self { - start_time: Instant::now(), - } - } - - #[expect(dead_code)] - pub fn record_convergence(&self) { - let duration = self.start_time.elapsed(); - record_convergence_latency(duration); - } -} - -impl Default for ConvergenceTracker { - fn default() -> Self { - Self::new() - } -} diff --git a/crates/mesh/src/node_state_machine.rs b/crates/mesh/src/node_state_machine.rs deleted file mode 100644 index 2b26c5902..000000000 --- a/crates/mesh/src/node_state_machine.rs +++ /dev/null @@ -1,546 +0,0 @@ -//! Node state machine for cold start -//! -//! 
Manages node lifecycle: NotReady -> Joining -> SnapshotPull -> Converging -> Ready
-
-use std::{
-    sync::Arc,
-    time::{Duration, Instant},
-};
-
-use parking_lot::RwLock;
-use tracing::info;
-
-use super::stores::StateStores;
-
-/// Node readiness state
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum NodeReadiness {
-    /// Node is not ready (initial state)
-    NotReady,
-    /// Node is joining the cluster
-    Joining,
-    /// Node is pulling snapshot from peers
-    SnapshotPull,
-    /// Node is converging (applying state updates)
-    Converging,
-    /// Node is ready to serve traffic
-    Ready,
-}
-
-impl NodeReadiness {
-    pub fn as_str(self) -> &'static str {
-        match self {
-            NodeReadiness::NotReady => "not_ready",
-            NodeReadiness::Joining => "joining",
-            NodeReadiness::SnapshotPull => "snapshot_pull",
-            NodeReadiness::Converging => "converging",
-            NodeReadiness::Ready => "ready",
-        }
-    }
-}
-
-/// Convergence detection configuration
-#[derive(Debug, Clone)]
-pub struct ConvergenceConfig {
-    /// Time window for convergence detection (seconds)
-    pub convergence_window: Duration,
-    /// Minimum number of state updates without changes to consider converged
-    pub min_stable_updates: usize,
-    /// Timeout for snapshot pull (seconds)
-    pub snapshot_timeout: Duration,
-}
-
-impl Default for ConvergenceConfig {
-    fn default() -> Self {
-        Self {
-            convergence_window: Duration::from_secs(10),
-            min_stable_updates: 5,
-            snapshot_timeout: Duration::from_secs(60),
-        }
-    }
-}
-
-/// Convergence tracker
-#[derive(Debug)]
-struct ConvergenceTracker {
-    last_update_time: Option<Instant>,
-    stable_update_count: usize,
-    last_state_hash: Option<u64>,
-}
-
-impl ConvergenceTracker {
-    fn new() -> Self {
-        Self {
-            last_update_time: None,
-            stable_update_count: 0,
-            last_state_hash: None,
-        }
-    }
-
-    fn record_update(&mut self, state_hash: u64, config: &ConvergenceConfig) -> bool {
-        let now = Instant::now();
-
-        if let Some(last_hash) = self.last_state_hash {
-            if last_hash == state_hash {
-                // State unchanged
-                self.stable_update_count += 1;
-            } else {
-                // State changed, reset counter
-                self.stable_update_count = 0;
-            }
-        } else {
-            // First update
-            self.stable_update_count = 0;
-        }
-
-        self.last_state_hash = Some(state_hash);
-
-        // Check elapsed time since the first stable update, not since this update
-        if let Some(last_time) = self.last_update_time {
-            let elapsed = now.duration_since(last_time);
-            if elapsed >= config.convergence_window
-                && self.stable_update_count >= config.min_stable_updates
-            {
-                return true;
-            }
-        }
-
-        // Only set the timestamp if this is the first update or state changed
-        if self.last_update_time.is_none() || self.stable_update_count == 0 {
-            self.last_update_time = Some(now);
-        }
-
-        false
-    }
-
-    fn reset(&mut self) {
-        self.last_update_time = None;
-        self.stable_update_count = 0;
-        self.last_state_hash = None;
-    }
-}
-
-/// Node state machine for managing cold start
-#[derive(Debug)]
-pub struct NodeStateMachine {
-    readiness: Arc<RwLock<NodeReadiness>>,
-    config: ConvergenceConfig,
-    convergence_tracker: Arc<RwLock<ConvergenceTracker>>,
-    snapshot_start_time: Arc<RwLock<Option<Instant>>>,
-    stores: Arc<StateStores>,
-}
-
-impl NodeStateMachine {
-    pub fn new(stores: Arc<StateStores>, config: ConvergenceConfig) -> Self {
-        Self {
-            readiness: Arc::new(RwLock::new(NodeReadiness::NotReady)),
-            config,
-            convergence_tracker: Arc::new(RwLock::new(ConvergenceTracker::new())),
-            snapshot_start_time: Arc::new(RwLock::new(None)),
-            stores,
-        }
-    }
-
-    /// Get current readiness state
-    pub fn readiness(&self) -> NodeReadiness {
-        *self.readiness.read()
-    }
-
-    /// Transition to joining state
-    pub fn
start_joining(&self) { - let mut readiness = self.readiness.write(); - if *readiness == NodeReadiness::NotReady { - *readiness = NodeReadiness::Joining; - info!("Node state: NotReady -> Joining"); - } - } - - /// Transition to snapshot pull state - pub fn start_snapshot_pull(&self) { - let mut readiness = self.readiness.write(); - if *readiness == NodeReadiness::Joining { - *readiness = NodeReadiness::SnapshotPull; - *self.snapshot_start_time.write() = Some(Instant::now()); - info!("Node state: Joining -> SnapshotPull"); - } - } - - /// Check if snapshot pull has timed out - pub fn is_snapshot_timeout(&self) -> bool { - if let Some(start_time) = *self.snapshot_start_time.read() { - start_time.elapsed() > self.config.snapshot_timeout - } else { - false - } - } - - /// Transition to converging state - pub fn start_converging(&self) { - let mut readiness = self.readiness.write(); - if *readiness == NodeReadiness::SnapshotPull { - *readiness = NodeReadiness::Converging; - *self.snapshot_start_time.write() = None; - self.convergence_tracker.write().reset(); - info!("Node state: SnapshotPull -> Converging"); - } - } - - /// Record a state update and check for convergence - pub fn record_state_update(&self) -> bool { - if self.readiness() != NodeReadiness::Converging { - return false; - } - - // Calculate a simple hash of store states - let state_hash = self.calculate_state_hash(); - let mut tracker = self.convergence_tracker.write(); - let converged = tracker.record_update(state_hash, &self.config); - - if converged { - self.transition_to_ready(); - return true; - } - - false - } - - /// Transition to ready state - pub fn transition_to_ready(&self) { - let mut readiness = self.readiness.write(); - if *readiness == NodeReadiness::Converging { - *readiness = NodeReadiness::Ready; - info!("Node state: Converging -> Ready"); - } - } - - /// Check if node is ready - pub fn is_ready(&self) -> bool { - self.readiness() == NodeReadiness::Ready - } - - /// Check if stores are empty (need snapshot) - pub fn needs_snapshot(&self) -> bool { - self.stores.membership.is_empty() - || self.stores.worker.is_empty() - || self.stores.policy.is_empty() - } - - /// Calculate a simple hash of current state (for convergence detection) - fn calculate_state_hash(&self) -> u64 { - use std::{ - collections::hash_map::DefaultHasher, - hash::{Hash, Hasher}, - }; - - let mut hasher = DefaultHasher::new(); - self.stores.membership.len().hash(&mut hasher); - self.stores.worker.len().hash(&mut hasher); - self.stores.policy.len().hash(&mut hasher); - self.stores.app.len().hash(&mut hasher); - hasher.finish() - } - - /// Reset state machine (for testing or recovery) - pub fn reset(&self) { - *self.readiness.write() = NodeReadiness::NotReady; - self.convergence_tracker.write().reset(); - *self.snapshot_start_time.write() = None; - } -} - -impl Default for NodeStateMachine { - fn default() -> Self { - Self::new( - Arc::new(StateStores::default()), - ConvergenceConfig::default(), - ) - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use super::*; - - fn create_test_stores() -> Arc { - Arc::new(StateStores::default()) - } - - fn create_test_config() -> ConvergenceConfig { - ConvergenceConfig { - convergence_window: Duration::from_millis(100), - min_stable_updates: 3, - snapshot_timeout: Duration::from_secs(1), - } - } - - #[test] - fn test_node_readiness_as_str() { - assert_eq!(NodeReadiness::NotReady.as_str(), "not_ready"); - assert_eq!(NodeReadiness::Joining.as_str(), "joining"); - 
assert_eq!(NodeReadiness::SnapshotPull.as_str(), "snapshot_pull"); - assert_eq!(NodeReadiness::Converging.as_str(), "converging"); - assert_eq!(NodeReadiness::Ready.as_str(), "ready"); - } - - #[test] - fn test_convergence_config_default() { - let config = ConvergenceConfig::default(); - assert_eq!(config.convergence_window, Duration::from_secs(10)); - assert_eq!(config.min_stable_updates, 5); - assert_eq!(config.snapshot_timeout, Duration::from_secs(60)); - } - - #[test] - fn test_node_state_machine_initial_state() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores, config); - - assert_eq!(sm.readiness(), NodeReadiness::NotReady); - assert!(!sm.is_ready()); - } - - #[test] - fn test_state_transition_flow() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores, config); - - // Start joining - sm.start_joining(); - assert_eq!(sm.readiness(), NodeReadiness::Joining); - - // Start snapshot pull - sm.start_snapshot_pull(); - assert_eq!(sm.readiness(), NodeReadiness::SnapshotPull); - assert!(!sm.is_snapshot_timeout()); - - // Start converging - sm.start_converging(); - assert_eq!(sm.readiness(), NodeReadiness::Converging); - - // Transition to ready - sm.transition_to_ready(); - assert_eq!(sm.readiness(), NodeReadiness::Ready); - assert!(sm.is_ready()); - } - - #[test] - fn test_state_transition_guards() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores, config); - - // Cannot start snapshot pull without joining first - sm.start_snapshot_pull(); - assert_eq!(sm.readiness(), NodeReadiness::NotReady); - - // Cannot start converging without snapshot pull - sm.start_joining(); - sm.start_converging(); - assert_eq!(sm.readiness(), NodeReadiness::Joining); - - // Cannot transition to ready without converging - sm.transition_to_ready(); - assert_eq!(sm.readiness(), NodeReadiness::Joining); - } - - #[test] - fn test_snapshot_timeout() { - let stores = create_test_stores(); - let mut config = create_test_config(); - config.snapshot_timeout = Duration::from_millis(50); - let sm = NodeStateMachine::new(stores, config); - - sm.start_joining(); - sm.start_snapshot_pull(); - assert!(!sm.is_snapshot_timeout()); - - // Wait for timeout - std::thread::sleep(Duration::from_millis(100)); - assert!(sm.is_snapshot_timeout()); - } - - #[test] - fn test_needs_snapshot() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores.clone(), config); - - // Empty stores need snapshot - assert!(sm.needs_snapshot()); - - // Add some data to stores - use super::super::stores::{MembershipState, PolicyState, WorkerState}; - - let _ = stores.membership.insert( - "node1".to_string(), - MembershipState { - name: "node1".to_string(), - address: "127.0.0.1:8080".to_string(), - status: 1, - version: 1, - metadata: Default::default(), - }, - ); - - let _ = stores.worker.insert( - "worker1".to_string(), - WorkerState { - worker_id: "worker1".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8000".to_string(), - health: true, - load: 0.5, - version: 1, - spec: vec![], - }, - ); - - let _ = stores.policy.insert( - "policy1".to_string(), - PolicyState { - model_id: "model1".to_string(), - policy_type: "round_robin".to_string(), - config: vec![], - version: 1, - }, - ); - - // Now should not need snapshot - assert!(!sm.needs_snapshot()); - } - - #[test] - fn 
test_record_state_update_not_converging() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores, config); - - // Should return false when not in converging state - assert!(!sm.record_state_update()); - assert_eq!(sm.readiness(), NodeReadiness::NotReady); - } - - #[test] - fn test_convergence_detection() { - let stores = create_test_stores(); - let mut config = create_test_config(); - config.convergence_window = Duration::from_millis(50); - config.min_stable_updates = 2; - let sm = NodeStateMachine::new(stores, config); - - // Transition to converging state - sm.start_joining(); - sm.start_snapshot_pull(); - sm.start_converging(); - assert_eq!(sm.readiness(), NodeReadiness::Converging); - - // Record multiple updates with same state - let converged1 = sm.record_state_update(); - assert!(!converged1); - - // Wait a bit and record more updates - std::thread::sleep(Duration::from_millis(60)); - let converged2 = sm.record_state_update(); - assert!(!converged2); // Still not enough stable updates - - // Record more stable updates - std::thread::sleep(Duration::from_millis(10)); - let converged3 = sm.record_state_update(); - // Should converge after enough stable updates within window - if converged3 { - assert_eq!(sm.readiness(), NodeReadiness::Ready); - } - } - - #[test] - fn test_convergence_reset_on_state_change() { - let stores = create_test_stores(); - let mut config = create_test_config(); - config.convergence_window = Duration::from_millis(100); - config.min_stable_updates = 2; - let sm = NodeStateMachine::new(stores.clone(), config); - - sm.start_joining(); - sm.start_snapshot_pull(); - sm.start_converging(); - - // Record update - sm.record_state_update(); - - // Change state by adding data - use super::super::stores::AppState; - let _ = stores.app.insert( - "app1".to_string(), - AppState { - key: "app1".to_string(), - value: vec![1, 2, 3], - version: 1, - }, - ); - - // Record update with changed state - sm.record_state_update(); - - // The stable count should be reset - std::thread::sleep(Duration::from_millis(110)); - let converged = sm.record_state_update(); - // Should not converge immediately after state change - assert!(!converged || sm.readiness() == NodeReadiness::Converging); - } - - #[test] - fn test_reset() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores, config); - - // Go through states - sm.start_joining(); - sm.start_snapshot_pull(); - sm.start_converging(); - sm.transition_to_ready(); - - assert_eq!(sm.readiness(), NodeReadiness::Ready); - - // Reset - sm.reset(); - assert_eq!(sm.readiness(), NodeReadiness::NotReady); - assert!(!sm.is_ready()); - assert!(!sm.is_snapshot_timeout()); - } - - #[test] - fn test_calculate_state_hash() { - let stores = create_test_stores(); - let config = create_test_config(); - let sm = NodeStateMachine::new(stores.clone(), config); - - let hash1 = sm.calculate_state_hash(); - - // Add some data - use super::super::stores::AppState; - let _ = stores.app.insert( - "app1".to_string(), - AppState { - key: "app1".to_string(), - value: vec![], - version: 1, - }, - ); - - // Hash should change - let hash2 = sm.calculate_state_hash(); - assert_ne!(hash1, hash2); - } - - #[test] - fn test_default_implementation() { - let sm = NodeStateMachine::default(); - assert_eq!(sm.readiness(), NodeReadiness::NotReady); - assert!(!sm.is_ready()); - } -} diff --git a/crates/mesh/src/ping_server.rs b/crates/mesh/src/ping_server.rs 
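// The deleted state machine above gates readiness on convergence: the store
// hash must be stable for min_stable_updates consecutive observations across
// at least convergence_window of wall time. A minimal sketch of that rule,
// with illustrative names:
use std::time::{Duration, Instant};

struct ConvergenceRule {
    window: Duration,
    min_stable: usize,
    first_stable_at: Option<Instant>,
    stable_count: usize,
    last_hash: Option<u64>,
}

impl ConvergenceRule {
    fn observe(&mut self, hash: u64) -> bool {
        if self.last_hash == Some(hash) {
            self.stable_count += 1;
        } else {
            // First observation or a state change: restart the stability run.
            self.stable_count = 0;
            self.first_stable_at = Some(Instant::now());
        }
        self.last_hash = Some(hash);
        self.stable_count >= self.min_stable
            && self
                .first_stable_at
                .is_some_and(|t| t.elapsed() >= self.window)
    }
}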
index 23829b543..8bdbfe5ee 100644
--- a/crates/mesh/src/ping_server.rs
+++ b/crates/mesh/src/ping_server.rs
@@ -1,10 +1,4 @@
-use std::{
-    collections::HashMap,
-    net::SocketAddr,
-    pin::Pin,
-    sync::Arc,
-    time::{Duration, Instant},
-};
+use std::{net::SocketAddr, pin::Pin, sync::Arc, time::Duration};
 
 use anyhow::Result;
 use futures::Stream;
@@ -22,29 +16,20 @@ use super::{
         build_stream_batches, chunk_value, dispatch_stream_batch, next_generation,
         DEFAULT_MAX_CHUNKS_PER_BATCH, MAX_STREAM_CHUNK_BYTES,
     },
-    flow_control::{MessageSizeValidator, MAX_MESSAGE_SIZE},
-    metrics::{
-        record_ack, record_batch_sent, record_nack, record_peer_reconnect, record_snapshot_bytes,
-        record_snapshot_duration, record_snapshot_trigger, update_peer_connections,
-        ConvergenceTracker,
-    },
+    flow_control::MAX_MESSAGE_SIZE,
+    metrics::{record_ack, record_nack, record_peer_reconnect, update_peer_connections},
     mtls::MTLSManager,
-    node_state_machine::NodeStateMachine,
     partition::PartitionDetector,
     service::{
         gossip::{
             self,
             gossip_server::{Gossip, GossipServer},
-            GossipMessage, IncrementalUpdate, NodeState, NodeStatus, NodeUpdate, PingReq,
-            SnapshotChunk, SnapshotRequest, StateUpdate, StreamAck, StreamMessage,
+            GossipMessage, NodeState, NodeStatus, NodeUpdate, PingReq, StreamMessage,
             StreamMessageType,
         },
         try_ping, ClusterState,
     },
-    stores::{StateStores, StoreType as LocalStoreType},
-    sync::MeshSyncManager,
 };
-use crate::collector::{PeerWatermark, RoundBatch};
 
 #[derive(Debug)]
 pub struct GossipService {
@@ -52,15 +37,8 @@ pub struct GossipService {
     listen_addr: SocketAddr,
     advertise_addr: SocketAddr,
     self_name: String,
-    stores: Option<Arc<StateStores>>, // Optional state stores for CRDT-based sync
-    sync_manager: Option<Arc<MeshSyncManager>>, // Optional sync manager for applying remote updates
-    state_machine: Option<Arc<NodeStateMachine>>,
     partition_detector: Option<Arc<PartitionDetector>>,
     mtls_manager: Option<Arc<MTLSManager>>,
-    /// Shared reference to the current RoundBatch, updated once per round by
-    /// the MeshController's central collector. Server-side sync_stream handlers
-    /// read from this and apply per-peer watermark filtering.
-    current_batch: Option<Arc<RwLock<Arc<RoundBatch>>>>,
     /// Shared reference to the current stream RoundBatch, drained once
     /// per round by the MeshController.
Server-side handlers read /// broadcast drain_entries and also emit targeted_entries addressed @@ -73,212 +51,6 @@ pub struct GossipService { mesh_kv: Option>, } -impl GossipService { - /// Create snapshot chunks for a store - #[expect( - clippy::expect_used, - reason = "system clock before UNIX epoch is a fatal misconfiguration that must not silently produce timestamp=0" - )] - pub fn create_snapshot_chunks( - &self, - store_type: LocalStoreType, - chunk_size: usize, - ) -> Vec { - let stores = match self.stores.as_ref() { - Some(s) => s, - None => { - log::warn!("State stores not available for snapshot generation"); - return vec![]; - } - }; - - let proto_store_type = store_type.to_proto(); - - // Get all entries from the store - let entries: Vec<(String, Vec)> = match store_type { - LocalStoreType::Membership => stores - .membership - .all() - .into_iter() - .map(|(k, v)| { - let serialized = bincode::serialize(&v).unwrap_or_else(|e| { - log::error!("Failed to serialize membership state: {}", e); - vec![] - }); - (k, serialized) - }) - .collect(), - LocalStoreType::App => stores - .app - .all() - .into_iter() - .map(|(k, v)| { - let serialized = bincode::serialize(&v).unwrap_or_else(|e| { - log::error!("Failed to serialize app state: {}", e); - vec![] - }); - (k, serialized) - }) - .collect(), - LocalStoreType::Worker => stores - .worker - .all() - .into_iter() - .map(|(k, v)| { - let serialized = bincode::serialize(&v).unwrap_or_else(|e| { - log::error!("Failed to serialize worker state: {}", e); - vec![] - }); - (k, serialized) - }) - .collect(), - LocalStoreType::Policy => { - let entries: Vec<(String, Vec)> = stores - .policy - .all() - .into_iter() - .filter(|(k, _)| { - // Tree configs are handled separately below via - // stores.tree_configs — skip stale CRDT policy - // entries with "tree:" keys. - !k.starts_with("tree:") - }) - .map(|(k, v)| { - let serialized = bincode::serialize(&v).unwrap_or_else(|e| { - log::error!("Failed to serialize policy state: {}", e); - vec![] - }); - (k, serialized) - }) - .collect(); - - // Tree data is synced via Layer 1 (tenant deltas) and Layer 2 - // (periodic compressed snapshots). No longer include tree_configs - // in the snapshot exchange — cloning large TreeState bytes on - // every ping round caused multi-GB memory growth. 
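// The chunking below splits a store's entries into fixed-size pieces, each
// tagged with (chunk_index, total_chunks) and a checksum over its keys and
// values so the receiver can verify reassembly. A self-contained sketch of
// that scheme; `Chunk` is an illustrative stand-in, not the proto type. Note
// DefaultHasher is only guaranteed stable across identical builds, which the
// deleted code relied on as well.
use std::{
    collections::hash_map::DefaultHasher,
    hash::{Hash, Hasher},
};

struct Chunk {
    index: u64,
    total: u64,
    entries: Vec<(String, Vec<u8>)>,
    checksum: Vec<u8>,
}

fn chunk_entries(entries: Vec<(String, Vec<u8>)>, chunk_size: usize) -> Vec<Chunk> {
    let total = entries.len().div_ceil(chunk_size) as u64;
    entries
        .chunks(chunk_size)
        .enumerate()
        .map(|(i, part)| {
            // Hash keys and values in order; both sides must agree on order.
            let mut hasher = DefaultHasher::new();
            for (k, v) in part {
                k.hash(&mut hasher);
                v.hash(&mut hasher);
            }
            Chunk {
                index: i as u64,
                total,
                entries: part.to_vec(),
                checksum: hasher.finish().to_le_bytes().to_vec(),
            }
        })
        .collect()
}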
- entries - } - LocalStoreType::RateLimit => { - // For rate limit, serialize all counters from owners - stores - .rate_limit - .keys() - .into_iter() - .filter_map(|key| { - if stores.rate_limit.is_owner(&key) { - stores.rate_limit.get_counter(&key).map(|counter_value| { - let serialized = - bincode::serialize(&counter_value).unwrap_or_else(|e| { - log::error!( - "Failed to serialize rate limit counter: {}", - e - ); - vec![] - }); - (key.clone(), serialized) - }) - } else { - None - } - }) - .collect() - } - }; - - if entries.is_empty() { - return vec![]; - } - - // Split entries into chunks - let mut chunks = Vec::new(); - let total_chunks = entries.len().div_ceil(chunk_size); - - for (chunk_idx, chunk_entries) in entries.chunks(chunk_size).enumerate() { - let state_updates: Vec = chunk_entries - .iter() - .map(|(key, value)| { - // Get actual version from CRDT metadata - let version = match store_type { - LocalStoreType::Membership => { - stores.membership.get(key).map(|s| s.version).unwrap_or(1) - } - LocalStoreType::App => stores.app.get(key).map(|s| s.version).unwrap_or(1), - LocalStoreType::Worker => { - stores.worker.get(key).map(|s| s.version).unwrap_or(1) - } - LocalStoreType::Policy => { - // For tree keys, version comes from tree_configs - // (not the CRDT policy store). - if key.starts_with("tree:") { - stores - .tree_configs - .get(key) - .and_then(|bytes| { - super::tree_ops::TreeState::from_bytes(&bytes) - .ok() - .map(|ts| ts.version) - }) - .unwrap_or(1) - } else { - stores.policy.get(key).map(|s| s.version).unwrap_or(1) - } - } - LocalStoreType::RateLimit => { - // For rate limit, use timestamp as version - { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .expect("system clock before UNIX_EPOCH; cannot generate valid timestamps") - .as_nanos() as u64 - } - } - }; - - let timestamp = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .expect("system clock before UNIX_EPOCH; cannot generate valid timestamps") - .as_nanos() as u64; - - StateUpdate { - key: key.clone(), - value: value.clone(), - version, - actor: self.self_name.clone(), - timestamp, - } - }) - .collect(); - - // Calculate checksum for integrity verification - use std::{ - collections::hash_map::DefaultHasher, - hash::{Hash, Hasher}, - }; - let mut hasher = DefaultHasher::new(); - for update in &state_updates { - update.key.hash(&mut hasher); - update.value.hash(&mut hasher); - } - let checksum = hasher.finish().to_le_bytes().to_vec(); - - chunks.push(SnapshotChunk { - store: proto_store_type, - chunk_index: chunk_idx as u64, - total_chunks: total_chunks as u64, - entries: state_updates, - checksum, - }); - } - - log::info!( - "Generated {} snapshot chunks for store {:?}", - chunks.len(), - store_type - ); - chunks - } -} - impl GossipService { pub fn new( state: ClusterState, @@ -291,28 +63,13 @@ impl GossipService { listen_addr, advertise_addr, self_name: self_name.to_string(), - stores: None, - sync_manager: None, - state_machine: None, partition_detector: None, mtls_manager: None, - current_batch: None, current_stream_batch: None, mesh_kv: None, } } - /// Attach the shared RoundBatch reference from the MeshController. - /// Server-side sync_stream handlers read from this single batch - /// produced by the CentralCollector rather than re-collecting per peer. - pub fn with_current_batch( - mut self, - current_batch: Arc>>, - ) -> Self { - self.current_batch = Some(current_batch); - self - } - /// Attach the shared stream RoundBatch reference. 
Server-side
     /// handlers emit broadcast drain_entries plus targeted_entries
     /// whose target matches the remote peer learned from the first
@@ -335,24 +92,6 @@ impl GossipService {
         self
     }
 
-    pub fn with_stores(mut self, stores: Arc<StateStores>) -> Self {
-        self.stores = Some(stores.clone());
-        // Create state machine if stores are provided
-        if self.state_machine.is_none() {
-            use super::node_state_machine::ConvergenceConfig;
-            self.state_machine = Some(Arc::new(NodeStateMachine::new(
-                stores,
-                ConvergenceConfig::default(),
-            )));
-        }
-        self
-    }
-
-    pub fn with_sync_manager(mut self, sync_manager: Arc<MeshSyncManager>) -> Self {
-        self.sync_manager = Some(sync_manager);
-        self
-    }
-
     pub fn with_partition_detector(mut self, partition_detector: Arc<PartitionDetector>) -> Self {
         self.partition_detector = Some(partition_detector);
         self
@@ -471,46 +210,28 @@ impl Gossip for GossipService {
     ) -> Result<Response<Self::SyncStreamStream>, Status> {
         let mut incoming = request.into_inner();
         let self_name = self.self_name.clone();
-        let state = self.state.clone();
-        let stores = self.stores.clone();
-        let sync_manager = self.sync_manager.clone();
         let mesh_kv = self.mesh_kv.clone();
 
-        // Create output stream with flow control
         const CHANNEL_CAPACITY: usize = 128;
         let (tx, rx) = mpsc::channel::<Result<StreamMessage, Status>>(CHANNEL_CAPACITY);
-        let size_validator = MessageSizeValidator::default();
 
-        // Remote peer identity, discovered from inbound StreamMessage.peer_id.
-        // Shared between the inbound handler (writer) and the server-side
-        // sender task (reader) so the sender can emit targeted_entries whose
-        // `target` matches the learned peer. Before the first inbound message
-        // this is `None` and the sender emits only broadcast drain_entries —
-        // the targeted entries for that first round are dropped under
-        // at-most-once semantics and the application retries.
+        // Remote peer identity, learned from the first inbound message and
+        // used by the sender to filter targeted_entries.
         let learned_peer: Arc<parking_lot::RwLock<Option<String>>> =
             Arc::new(parking_lot::RwLock::new(None));
 
-        // Spawn task to periodically send incremental updates.
-        // Uses PeerWatermark reading from the shared RoundBatch (central collector).
-        // If current_batch is not set (e.g., temporary GossipService for snapshots),
-        // skip the sender task.
-        let incremental_sender_handle = if let Some(batch_handle) = self.current_batch.clone() {
-            let tx_incremental = tx.clone();
-            let self_name_incremental = self_name.clone();
-            let size_validator_clone = size_validator.clone();
-            // The remote peer's name isn't known until the first stream message
-            // arrives. Use a placeholder label for debug output — this doesn't
-            // affect filtering correctness (each task has its own watermark).
-            let peer_name_for_watermark = "server-inbound".to_string();
-            let stream_batch_handle = self.current_stream_batch.clone();
+        // Server-side stream sender: periodically emit fresh stream batches
+        // (broadcast drain_entries + targeted entries addressed to the
+        // learned peer). Skipped when no current_stream_batch is attached.
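// The replacement sender below dedupes rounds by Arc identity rather than by
// comparing batch contents: the controller publishes each round as a fresh
// Arc, so pointer equality is a cheap "already seen this round" test. A
// minimal sketch with an illustrative Batch type:
use std::sync::Arc;

use parking_lot::RwLock;

struct Batch {
    entries: Vec<(String, Vec<u8>)>,
}

fn next_fresh_batch(
    handle: &RwLock<Arc<Batch>>,
    last_seen: &mut Option<Arc<Batch>>,
) -> Option<Arc<Batch>> {
    let current = handle.read().clone();
    if last_seen.as_ref().is_some_and(|l| Arc::ptr_eq(l, &current)) {
        return None; // same round as the previous tick; nothing new to send
    }
    *last_seen = Some(current.clone());
    Some(current)
}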
+ let sender_handle = if let Some(stream_batch_handle) = self.current_stream_batch.clone() { + let tx_sender = tx.clone(); + let self_name_sender = self_name.clone(); let learned_peer_sender = learned_peer.clone(); #[expect( clippy::disallowed_methods, - reason = "server-side incremental sender that runs for the lifetime of the sync_stream; terminates when the channel closes or handle is aborted" + reason = "server-side sender bound to sync_stream lifetime; terminates when channel closes or handle is aborted on disconnect" )] Some(tokio::spawn(async move { - let mut watermark = PeerWatermark::new(peer_name_for_watermark); let mut interval = tokio::time::interval(Duration::from_secs(1)); let mut sequence_counter: u64 = 0; let mut last_stream_batch: Option> = None; @@ -518,100 +239,35 @@ impl Gossip for GossipService { loop { interval.tick().await; - // Read the centrally collected batch and filter by this peer's watermark. - let batch = batch_handle.read().clone(); - let all_updates = watermark.filter(&batch); - - if !all_updates.is_empty() { - for (store_type, updates) in all_updates { - let proto_store_type = store_type.to_proto(); - - sequence_counter += 1; - let batch_size: usize = updates.iter().map(|u| u.value.len()).sum(); - - // Validate message size - if let Err(e) = size_validator_clone.validate(batch_size) { - log::warn!( - "Incremental update too large, skipping store {:?}: {} (max: {} bytes)", - store_type, - e, - size_validator_clone.max_size() - ); - // Mark as sent to prevent infinite retry loop. - watermark.mark_sent(store_type, &updates); - continue; - } - - let incremental_update = StreamMessage { - message_type: StreamMessageType::IncrementalUpdate as i32, - payload: Some(gossip::stream_message::Payload::Incremental( - IncrementalUpdate { - store: proto_store_type, - updates: updates.clone(), - version: 0, // Version is tracked per key in StateUpdate - }, - )), - sequence: sequence_counter, - peer_id: self_name_incremental.clone(), - }; - - // Check backpressure using try_send - match tx_incremental.try_send(Ok(incremental_update)) { - Ok(()) => { - record_batch_sent(&self_name_incremental, batch_size); - watermark.mark_sent(store_type, &updates); - } - Err(mpsc::error::TrySendError::Full(_)) => { - log::debug!( - "Backpressure: channel full, skipping send (will retry next interval)" - ); - continue; - } - Err(mpsc::error::TrySendError::Closed(_)) => { - log::warn!( - "Channel closed, stopping incremental update sender" - ); - break; - } - } + let stream_batch = stream_batch_handle.read().clone(); + let fresh = last_stream_batch + .as_ref() + .is_none_or(|last| !Arc::ptr_eq(last, &stream_batch)); + if !fresh { + continue; + } + last_stream_batch = Some(stream_batch.clone()); - log::debug!( - "Sent incremental update: store={:?}, {} updates", - store_type, - updates.len() - ); - } + let peer_for_targeted = learned_peer_sender.read().clone(); + let has_targeted = peer_for_targeted.as_ref().is_some_and(|p| { + stream_batch.targeted_entries.iter().any(|(t, _, _)| t == p) + }); + if stream_batch.drain_entries.is_empty() && !has_targeted { + continue; } - // Server-side stream emission: broadcast drain_entries - // plus targeted_entries addressed to the learned remote - // peer. Covers the non-initiator → initiator direction - // of each peer pair; the client-side sender in - // controller.rs handles the other direction. Before - // learned_peer is set (first inbound message not yet - // received), targeted entries in this round are dropped - // under at-most-once. 
On channel full, drop without retry. - if let Some(sbh) = &stream_batch_handle { - let stream_batch = sbh.read().clone(); - let fresh_batch = last_stream_batch - .as_ref() - .is_none_or(|last| !Arc::ptr_eq(last, &stream_batch)); - if fresh_batch { - // Advance the tracker for every fresh Arc, not just - // ones that produced work — otherwise a batch with - // empty drain_entries stays "fresh" across ticks and - // we keep re-checking it. - last_stream_batch = Some(stream_batch.clone()); - } - let peer_for_targeted = learned_peer_sender.read().clone(); - let has_targeted = peer_for_targeted.as_ref().is_some_and(|p| { - stream_batch.targeted_entries.iter().any(|(t, _, _)| t == p) - }); - if fresh_batch && (!stream_batch.drain_entries.is_empty() || has_targeted) { - let mut entries = Vec::new(); - // Generation is per-value so concurrent publishes - // to the same key get distinct tags. - for (key, value) in &stream_batch.drain_entries { + let mut entries = Vec::new(); + for (key, value) in &stream_batch.drain_entries { + entries.extend(chunk_value( + key.clone(), + next_generation(), + value.clone(), + MAX_STREAM_CHUNK_BYTES, + )); + } + if let Some(ref peer) = peer_for_targeted { + for (target, key, value) in &stream_batch.targeted_entries { + if target == peer { entries.extend(chunk_value( key.clone(), next_generation(), @@ -619,56 +275,32 @@ impl Gossip for GossipService { MAX_STREAM_CHUNK_BYTES, )); } - if let Some(ref peer) = peer_for_targeted { - for (target, key, value) in &stream_batch.targeted_entries { - if target == peer { - entries.extend(chunk_value( - key.clone(), - next_generation(), - value.clone(), - MAX_STREAM_CHUNK_BYTES, - )); - } - } - } - if !entries.is_empty() { - for batch in build_stream_batches( - entries, - DEFAULT_MAX_CHUNKS_PER_BATCH, - MAX_STREAM_CHUNK_BYTES, - ) { - sequence_counter += 1; - let msg = StreamMessage { - message_type: StreamMessageType::StreamBatch as i32, - payload: Some( - gossip::stream_message::Payload::StreamBatch(batch), - ), - sequence: sequence_counter, - peer_id: self_name_incremental.clone(), - }; - match tx_incremental.try_send(Ok(msg)) { - Ok(()) => {} - Err(mpsc::error::TrySendError::Full(_)) => { - log::debug!( - "server-side stream batch dropped on backpressure" - ); - // TODO(metrics): bump - // stream_dropped_on_backpressure - break; - } - Err(mpsc::error::TrySendError::Closed(_)) => { - log::warn!( - "server-side stream sender: channel closed, stopping" - ); - return; - } - } - } + } + } + if entries.is_empty() { + continue; + } + + for batch in build_stream_batches( + entries, + DEFAULT_MAX_CHUNKS_PER_BATCH, + MAX_STREAM_CHUNK_BYTES, + ) { + sequence_counter += 1; + let msg = StreamMessage { + message_type: StreamMessageType::StreamBatch as i32, + payload: Some(gossip::stream_message::Payload::StreamBatch(batch)), + sequence: sequence_counter, + peer_id: self_name_sender.clone(), + }; + match tx_sender.try_send(Ok(msg)) { + Ok(()) => {} + Err(mpsc::error::TrySendError::Full(_)) => { + log::debug!("server-side stream batch dropped on backpressure"); + break; } + Err(mpsc::error::TrySendError::Closed(_)) => return, } - } else if last_stream_batch.is_some() { - // If handle became None (shouldn't normally), clear state. 
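// Both the removed sender above and its replacement use non-blocking
// try_send so a slow peer cannot stall the round loop: Full drops this
// round's output (the next round re-derives it), Closed ends the task. A
// minimal sketch of that decision:
use tokio::sync::mpsc;

/// Returns false when the receiver is gone and the sender task should exit.
fn send_or_drop<T>(tx: &mpsc::Sender<T>, msg: T) -> bool {
    match tx.try_send(msg) {
        Ok(()) => true,
        Err(mpsc::error::TrySendError::Full(_)) => {
            // Backpressure: drop the message, keep the task alive.
            true
        }
        Err(mpsc::error::TrySendError::Closed(_)) => false,
    }
}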
- last_stream_batch = None; } } })) @@ -676,838 +308,106 @@ impl Gossip for GossipService { None }; - // Spawn task to handle incoming messages - let mut sequence: u64 = 0; - let _convergence_tracker = ConvergenceTracker::new(); - - // Track snapshot reception state: store_type -> (received_chunks, expected_total) - // Keyed by store_type only — a new snapshot request for the same store - // replaces any incomplete previous attempt (prevents stale chunk mixing). - let mut snapshot_state: HashMap, u64)> = HashMap::new(); - let learned_peer_inbound = learned_peer.clone(); #[expect( clippy::disallowed_methods, - reason = "server-side stream handler that runs for the lifetime of the sync_stream gRPC connection; terminates when the stream closes" + reason = "server-side inbound handler bound to sync_stream lifetime; terminates when the stream closes" )] tokio::spawn(async move { + // Close the stream if no inbound message arrives within + // this window — protects against idle clients pinning the + // server-side task and mpsc channel indefinitely. + const STREAM_IDLE_TIMEOUT: Duration = Duration::from_secs(60); + let mut peer_id = String::new(); update_peer_connections(&peer_id, true); + let mut sequence: u64 = 0; - // Check if we need to request snapshots on connection - // This happens when: - // 1. We're a new node joining (stores are empty or very small) - // 2. We detect a version gap - if let Some(ref stores) = stores { - for store_type in [ - LocalStoreType::Membership, - LocalStoreType::App, - LocalStoreType::Worker, - LocalStoreType::Policy, - LocalStoreType::RateLimit, - ] { - let store_len = match store_type { - LocalStoreType::Membership => stores.membership.len(), - LocalStoreType::App => stores.app.len(), - LocalStoreType::Worker => stores.worker.len(), - LocalStoreType::Policy => stores.policy.len(), - LocalStoreType::RateLimit => stores.rate_limit.keys().len(), - }; - - // If store is empty or very small, request snapshot - if store_len == 0 { - log::info!( - "Store {:?} is empty, requesting snapshot from {}", - store_type, - peer_id - ); - let proto_store_type = store_type.to_proto(); - - let snapshot_request = StreamMessage { - message_type: StreamMessageType::SnapshotRequest as i32, - payload: Some(gossip::stream_message::Payload::SnapshotRequest( - SnapshotRequest { - store: proto_store_type, - from_version: 0, // Request from beginning - }, - )), - sequence: 0, - peer_id: self_name.clone(), - }; - - if tx.send(Ok(snapshot_request)).await.is_err() { - log::warn!("Failed to send snapshot request"); - } - } - } - } - - const STREAM_IDLE_TIMEOUT: Duration = Duration::from_secs(60); - // Short-circuits the RwLock::read() once we've accepted an - // identity for this stream — learned_peer_inbound is - // write-once-per-connection, so after the first learn every - // subsequent message would otherwise take a pointless lock. - let mut peer_learned = false; loop { - match tokio::time::timeout(STREAM_IDLE_TIMEOUT, incoming.next()).await { - Ok(Some(Ok(msg))) => { - sequence += 1; - // Accept the claimed peer_id before copying it into - // the task-local `peer_id`: the task-local drives - // the teardown log and connection-gauge decrement - // below, and if we wrote a mismatched value here - // before the reject, the cleanup would attribute - // the close to the spoofed peer and leak the real - // peer's connection gauge. 
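// The idle timeout declared above wraps every stream read in a deadline, so
// a peer that stops sending releases the task instead of pinning it and its
// channel forever. A minimal sketch of that loop shape, with generic
// stream/item types standing in for the gRPC stream:
use std::time::Duration;

use futures::{Stream, StreamExt};

async fn read_loop<S, T>(mut incoming: S)
where
    S: Stream<Item = T> + Unpin,
{
    const IDLE: Duration = Duration::from_secs(60);
    loop {
        match tokio::time::timeout(IDLE, incoming.next()).await {
            Ok(Some(_msg)) => { /* handle the message */ }
            Ok(None) => break,      // peer closed the stream
            Err(_elapsed) => break, // idle too long: close our side too
        }
    }
}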
- // - // Reject mid-stream peer_id changes: the stream's - // remote identity is fixed for the connection, so - // a different peer_id on a later message is either - // a bug or a peer trying to claim another node's - // targeted entries. Close the stream and let the - // client reconnect. Pre-mTLS-binding defence; - // mTLS-derived identity is the authoritative - // long-term fix. - if !peer_learned && !msg.peer_id.is_empty() { - let mut learned = learned_peer_inbound.write(); - match learned.as_ref() { - None => { - *learned = Some(msg.peer_id.clone()); - peer_id.clone_from(&msg.peer_id); - peer_learned = true; - } - Some(existing) if existing == &msg.peer_id => { - peer_id.clone_from(existing); - peer_learned = true; - } - Some(existing) => { - log::warn!( - expected_peer_id = %existing, - received_peer_id = %msg.peer_id, - "peer_id changed mid-stream; closing sync_stream" - ); - break; - } - } - } else if peer_learned && msg.peer_id != peer_id { - // Empty peer_id after learn is also an identity - // change — a stream bound to a learned peer - // shouldn't accept unowned frames. - log::warn!( - expected_peer_id = %peer_id, - received_peer_id = %msg.peer_id, - "peer_id changed mid-stream; closing sync_stream" - ); - break; - } - - match msg.message_type() { - StreamMessageType::IncrementalUpdate => { - if let Some(gossip::stream_message::Payload::Incremental(update)) = - &msg.payload - { - // Validate message size - let msg_size: usize = - update.updates.iter().map(|u| u.value.len()).sum(); - if let Err(e) = size_validator.validate(msg_size) { - log::warn!( - "Received oversized incremental update from {}: {} (max: {} bytes), rejecting", - peer_id, e, size_validator.max_size() - ); - let nack = StreamMessage { - message_type: StreamMessageType::Nack as i32, - payload: Some(gossip::stream_message::Payload::Ack( - StreamAck { - sequence: msg.sequence, - success: false, - error_message: format!( - "Message too large: {e}" - ), - }, - )), - sequence, - peer_id: self_name.clone(), - }; - if tx.send(Ok(nack)).await.is_err() { - break; - } - record_nack(&peer_id); - continue; - } - - let store_type = LocalStoreType::from_proto(update.store); - log::debug!("Received incremental update from {}: store={:?}, {} updates", - peer_id, store_type, update.updates.len()); - - // Apply incremental updates to state stores - // This will be handled by the sync manager if available - // For now, we acknowledge and the sync manager will handle it - if let Some(ref sync_manager) = sync_manager { - for state_update in &update.updates { - match store_type { - LocalStoreType::Worker => { - // Deserialize and apply worker state - if let Ok(worker_state) = bincode::deserialize::< - super::stores::WorkerState, - >( - &state_update.value - ) { - // Extract actor from StateUpdate - let actor = - Some(state_update.actor.clone()); - sync_manager.apply_remote_worker_state( - worker_state, - actor, - ); - } - } - LocalStoreType::Policy => { - // Deserialize and apply policy state - if let Ok(policy_state) = bincode::deserialize::< - super::stores::PolicyState, - >( - &state_update.value - ) { - let actor = - Some(state_update.actor.clone()); - - if policy_state.policy_type - == "tenant_delta" - { - // Lightweight tenant delta — no tree structure, no prompt text - match super::tree_ops::TenantDelta::from_bytes( - &policy_state.config, - ) - { - Ok(delta) => { - sync_manager - .apply_remote_tenant_delta( - delta, actor, - ); - } - Err(e) => { - log::warn!( - "Failed to deserialize tenant delta for model {}: {e}", - 
policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state_delta" - { - // Operation-level delta: apply only the new operations - match super::tree_ops::TreeStateDelta::from_bytes( - &policy_state.config, - ) - { - Ok(delta) => { - sync_manager - .apply_remote_tree_delta( - delta, actor, - ); - } - Err(e) => { - log::warn!( - "Failed to deserialize tree state delta for model {}: {e}", - policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state_lz4" - { - // LZ4-compressed snapshot (TreeState or TreeSnapshot bytes) - match super::tree_ops::lz4_decompress( - &policy_state.config, - ) { - Ok(decompressed) => { - // Try TreeState first (backward compat) - if let Ok(tree_state) = - super::tree_ops::TreeState::from_bytes( - &decompressed, - ) - { - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor.clone(), - ); - } else if let Ok(snap) = - kv_index::snapshot::TreeSnapshot::from_bytes( - &decompressed, - ) - { - // Compact TreeSnapshot — convert to TreeState - let tree_state = - super::tree_ops::TreeState::from_snapshot( - policy_state - .model_id - .clone(), - &snap, - policy_state.version, - ); - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor.clone(), - ); - } else { - log::warn!( - "Failed to deserialize tree_state_lz4 payload for model {}", - policy_state.model_id - ); - } - } - Err(e) => { - log::warn!( - "Failed to LZ4-decompress tree state for model {}: {e}", - policy_state.model_id - ); - } - } - } else if policy_state.policy_type - == "tree_state" - { - // Uncompressed full state (backward compatible) - if let Ok(tree_state) = - super::tree_ops::TreeState::from_bytes( - &policy_state.config, - ) - { - sync_manager - .apply_remote_tree_operation( - policy_state - .model_id - .clone(), - tree_state, - actor, - ); - } - } else { - // Regular policy state update - sync_manager.apply_remote_policy_state( - policy_state, - actor, - ); - } - } - } - LocalStoreType::App => { - // Deserialize and apply app state - if let Ok(app_state) = bincode::deserialize::< - super::stores::AppState, - >( - &state_update.value - ) { - // Apply app state directly to the store, skipping stale versions - if let Some(ref stores) = stores { - let dominated = stores - .app - .get(&app_state.key) - .is_some_and(|existing| { - existing.version - >= app_state.version - }); - if !dominated { - // Mirror into the v2 `config:` CRDT - // namespace so v2-only readers can - // reach the same value during a - // rolling upgrade, even when the - // source is a v1 node still - // writing to AppStore. 
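// The dominance check and mirror write below implement the rolling-upgrade
// dual-write: apply an update only if its version beats the stored one, and
// copy the value into the v2 "config:" namespace so v2-only readers see
// v1-origin writes. A self-contained sketch with plain HashMaps standing in
// for the stores:
use std::collections::HashMap;

struct VersionedValue {
    version: u64,
    value: Vec<u8>,
}

fn apply_if_newer(
    store: &mut HashMap<String, VersionedValue>,
    v2_mirror: &mut HashMap<String, Vec<u8>>,
    key: &str,
    incoming: VersionedValue,
) {
    let dominated = store
        .get(key)
        .is_some_and(|existing| existing.version >= incoming.version);
    if !dominated {
        v2_mirror.insert(format!("config:{key}"), incoming.value.clone());
        store.insert(key.to_string(), incoming);
    }
}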
- if let Some(ref kv) = mesh_kv { - kv.configs().put( - &format!( - "config:{}", - app_state.key - ), - app_state.value.clone(), - ); - } - if let Err(err) = stores.app.insert( - app_state.key.clone(), - app_state, - ) { - log::warn!(error = %err, "Failed to apply app state update"); - } - } - } - } - } - LocalStoreType::Membership => { - // Deserialize and apply membership state - if let Ok(membership_state) = - bincode::deserialize::< - super::stores::MembershipState, - >( - &state_update.value - ) - { - // Apply membership state directly to the store - if let Some(ref stores) = stores { - if let Err(err) = - stores.membership.insert( - membership_state.name.clone(), - membership_state, - ) - { - log::warn!(error = %err, "Failed to apply membership state update"); - } - } - } - } - LocalStoreType::RateLimit => { - if let Ok(op_log) = bincode::deserialize::< - super::crdt_kv::OperationLog, - >( - &state_update.value - ) { - if let Some(counter_value) = op_log - .latest_counter_value(&state_update.key) - .or_else(|| { - op_log.latest_counter_value_any() - }) - { - sync_manager - .apply_remote_rate_limit_counter_value_with_actor_and_timestamp( - state_update.key.clone(), - state_update.actor.clone(), - counter_value, - state_update.timestamp, - ); - } else { - log::warn!( - key = %state_update.key, - "Rate-limit OperationLog does not contain a decodable counter value" - ); - } - } else if let Ok(counter_value) = - bincode::deserialize::( - &state_update.value, - ) - { - sync_manager - .apply_remote_rate_limit_counter_value_with_actor_and_timestamp( - state_update.key.clone(), - state_update.actor.clone(), - counter_value, - state_update.timestamp, - ); - } else { - log::warn!( - key = %state_update.key, - "Failed to decode rate-limit update as OperationLog or i64" - ); - } - } - } - } - } - let ack = StreamMessage { - message_type: StreamMessageType::Ack as i32, - payload: Some(gossip::stream_message::Payload::Ack( - StreamAck { - sequence: msg.sequence, - success: true, - error_message: String::new(), - }, - )), - sequence, - peer_id: self_name.clone(), - }; - if tx.send(Ok(ack)).await.is_err() { - break; - } - } - } - StreamMessageType::SnapshotRequest => { - if let Some(gossip::stream_message::Payload::SnapshotRequest(req)) = - &msg.payload - { - let store_type = LocalStoreType::from_proto(req.store); - let store_name = store_type.as_str(); - log::info!("Received snapshot request from {}: store={:?}, from_version={}", - peer_id, store_type, req.from_version); - - record_snapshot_trigger(store_name, "request"); - let snapshot_start = Instant::now(); - - // Generate and send snapshot chunks - let service = GossipService { - state: state.clone(), - listen_addr: SocketAddr::from(([0, 0, 0, 0], 0)), // Not used in snapshot generation - advertise_addr: SocketAddr::from(([0, 0, 0, 0], 0)), // Not used in snapshot generation - self_name: self_name.clone(), - stores: stores.clone(), - sync_manager: sync_manager.clone(), - state_machine: None, - partition_detector: None, - mtls_manager: None, - current_batch: None, - current_stream_batch: None, - mesh_kv: None, - }; - let chunks = service.create_snapshot_chunks(store_type, 100); // chunk_size = 100 entries - let total_chunks = chunks.len() as u64; - let mut total_bytes = 0; - - for (idx, chunk) in chunks.into_iter().enumerate() { - let chunk_bytes = chunk - .entries - .iter() - .map(|e| e.value.len()) - .sum::(); - total_bytes += chunk_bytes; - - let mut chunk_msg = StreamMessage { - message_type: StreamMessageType::SnapshotChunk as i32, - 
payload: Some( - gossip::stream_message::Payload::SnapshotChunk( - chunk, - ), - ), - sequence: sequence + idx as u64 + 1, - peer_id: self_name.clone(), - }; - // Update chunk metadata - if let Some( - gossip::stream_message::Payload::SnapshotChunk( - ref mut c, - ), - ) = chunk_msg.payload - { - c.chunk_index = idx as u64; - c.total_chunks = total_chunks; - } - - // Check backpressure using try_send - match tx.try_send(Ok(chunk_msg)) { - Ok(()) => { - // Successfully queued - } - Err(mpsc::error::TrySendError::Full(msg)) => { - log::debug!( - "Backpressure: channel full, waiting for drain" - ); - // Wait a bit for channel to drain, then use blocking send - tokio::time::sleep(Duration::from_millis(100)) - .await; - if tx.send(msg).await.is_err() { - log::warn!("Backpressure: channel closed, stopping snapshot"); - break; - } - } - Err(mpsc::error::TrySendError::Closed(_)) => { - log::warn!("Channel closed, stopping snapshot"); - break; - } - } - } - - record_snapshot_duration(store_name, snapshot_start.elapsed()); - record_snapshot_bytes(store_name, "sent", total_bytes); - - // Send snapshot complete message - let complete = StreamMessage { - message_type: StreamMessageType::SnapshotComplete as i32, - payload: None, - sequence: sequence + total_chunks + 1, - peer_id: self_name.clone(), - }; - if tx.send(Ok(complete)).await.is_err() { - break; - } - - // Send ACK - let ack = StreamMessage { - message_type: StreamMessageType::Ack as i32, - payload: Some(gossip::stream_message::Payload::Ack( - StreamAck { - sequence: msg.sequence, - success: true, - error_message: String::new(), - }, - )), - sequence, - peer_id: self_name.clone(), - }; - record_ack(&peer_id, true); - if tx.send(Ok(ack)).await.is_err() { - break; - } - } - } - StreamMessageType::SnapshotChunk => { - if let Some(gossip::stream_message::Payload::SnapshotChunk(chunk)) = - &msg.payload - { - let store_type = LocalStoreType::from_proto(chunk.store); - let store_name = store_type.as_str(); - log::info!( - "Received snapshot chunk from {}: store={:?}, chunk={}/{}", - peer_id, - store_type, - chunk.chunk_index, - chunk.total_chunks - ); - - // Record metrics - let chunk_bytes: usize = - chunk.entries.iter().map(|e| e.value.len()).sum(); - record_snapshot_bytes(store_name, "received", chunk_bytes); - - // Store chunk. Reset on chunk_index == 0 (start of a - // new snapshot transfer) to prevent stale chunks from a - // previous attempt mixing with new ones — even if - // total_chunks is the same. 
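The reset rule in the comment above is easy to get wrong, so it helps to state it in isolation: chunk 0 always starts a fresh transfer, stale partial chunks from an earlier attempt are dropped even when `total_chunks` happens to match, and completion additionally requires the indices to be exactly 0..total with no gaps or duplicates. A distilled sketch under assumed simplified types (`Chunk` and `Reassembly` are hypothetical):

```rust
#[derive(Clone)]
pub struct Chunk {
    pub chunk_index: u64,
    pub total_chunks: u64,
}

/// One in-flight snapshot transfer: chunks received so far plus the
/// expected total advertised by the sender.
#[derive(Default)]
pub struct Reassembly {
    chunks: Vec<Chunk>,
    expected: u64,
}

impl Reassembly {
    /// Returns true once every index 0..expected is present exactly once.
    pub fn push(&mut self, chunk: Chunk) -> bool {
        // Index 0 marks a brand-new transfer: drop stale partial chunks,
        // even when total_chunks matches the previous attempt.
        if chunk.chunk_index == 0 && !self.chunks.is_empty() {
            self.chunks.clear();
        }
        self.expected = chunk.total_chunks; // the sender is authoritative
        self.chunks.push(chunk);
        if self.chunks.len() as u64 != self.expected {
            return false;
        }
        let mut indices: Vec<u64> = self.chunks.iter().map(|c| c.chunk_index).collect();
        indices.sort_unstable();
        indices.iter().enumerate().all(|(i, idx)| *idx == i as u64)
    }
}
```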
- let (chunks, expected) = snapshot_state - .entry(store_type) - .or_insert_with(|| (Vec::new(), chunk.total_chunks)); - if chunk.chunk_index == 0 && !chunks.is_empty() { - log::info!( - "New snapshot transfer for {:?}, discarding {} partial chunks", - store_type, chunks.len() - ); - chunks.clear(); - } - *expected = chunk.total_chunks; - chunks.push(chunk.clone()); - - // Check if we've received all chunks with valid indices - if let Some((received_chunks, total)) = - snapshot_state.get(&store_type) - { - if received_chunks.len() as u64 == *total { - // Verify all indices 0..total are present (no duplicates/gaps) - let mut sorted_chunks = received_chunks.to_vec(); - sorted_chunks.sort_by_key(|c| c.chunk_index); - let indices_valid = sorted_chunks - .iter() - .enumerate() - .all(|(i, c)| c.chunk_index == i as u64); - if !indices_valid { - log::warn!( - "Snapshot for {:?} has {} chunks but indices are not contiguous 0..{}, discarding", - store_type, sorted_chunks.len(), total - ); - snapshot_state.remove(&store_type); - continue; - } - - log::info!("All {} chunks received for store {:?}, applying snapshot", - total, store_type); - - if let Some(ref stores) = stores { - // Apply all entries from chunks - for chunk in &sorted_chunks { - for entry in &chunk.entries { - let key = entry.key.clone(); - - match store_type { - LocalStoreType::Membership => { - if let Ok(membership_state) = bincode::deserialize::(&entry.value) { - let _ = stores.membership.insert(key, membership_state); - } - } - LocalStoreType::App => { - if let Ok(app_state) = bincode::deserialize::(&entry.value) { - let dominated = stores.app.get(&key) - .is_some_and(|existing| existing.version >= app_state.version); - if !dominated { - // Mirror into the v2 `config:` CRDT - // namespace so v2-only readers can - // reach values a v1 snapshot sender - // is still populating via AppStore. - if let Some(ref kv) = mesh_kv { - kv.configs().put( - &format!("config:{}", app_state.key), - app_state.value.clone(), - ); - } - let _ = stores.app.insert(key, app_state); - } - } - } - LocalStoreType::Worker => { - if let Ok(worker_state) = bincode::deserialize::(&entry.value) { - let _ = stores.worker.insert(key, worker_state.clone()); - // Also update sync manager if available - if let Some(ref sync_manager) = sync_manager { - sync_manager.apply_remote_worker_state(worker_state, Some(entry.actor.clone())); - } - } - } - LocalStoreType::Policy => { - if let Ok(policy_state) = bincode::deserialize::(&entry.value) { - if policy_state.policy_type == "tree_state" { - // Tree state entries go to tree_configs - // (plain DashMap, no CRDT operation log). - if let Some(ref sync_manager) = sync_manager { - if let Ok(tree_state) = super::tree_ops::TreeState::from_bytes( - &policy_state.config - ) { - sync_manager.apply_remote_tree_operation( - policy_state.model_id.clone(), - tree_state, - Some(entry.actor.clone()), - ); - } - } else { - // No sync manager — write directly to tree_configs. 
- stores.tree_configs.insert(key, policy_state.config.clone()); - } - } else { - let _ = stores.policy.insert(key, policy_state.clone()); - if let Some(ref sync_manager) = sync_manager { - sync_manager.apply_remote_policy_state(policy_state, Some(entry.actor.clone())); - } - } - } - } - LocalStoreType::RateLimit => { - if let Some(ref sync_manager) = sync_manager { - if let Ok(op_log) = bincode::deserialize::(&entry.value) { - if let Some(counter_value) = op_log - .latest_counter_value(&entry.key) - .or_else(|| op_log.latest_counter_value_any()) - { - sync_manager - .apply_remote_rate_limit_counter_value_with_actor_and_timestamp( - entry.key.clone(), - entry.actor.clone(), - counter_value, - entry.timestamp, - ); - } else { - log::warn!( - key = %entry.key, - "Snapshot OperationLog does not contain a decodable rate-limit counter" - ); - } - } else if let Ok(counter_value) = bincode::deserialize::(&entry.value) { - sync_manager - .apply_remote_rate_limit_counter_value_with_actor_and_timestamp( - entry.key.clone(), - entry.actor.clone(), - counter_value, - entry.timestamp, - ); - } else { - log::warn!( - key = %entry.key, - "Failed to decode snapshot rate-limit entry as i64 or OperationLog" - ); - } - } - } - } - } - } - - // Clear snapshot state - snapshot_state.remove(&store_type); - log::info!( - "Snapshot applied successfully for store {:?}", - store_type - ); - } - } - } - - let ack = StreamMessage { - message_type: StreamMessageType::Ack as i32, - payload: Some(gossip::stream_message::Payload::Ack( - StreamAck { - sequence: msg.sequence, - success: true, - error_message: String::new(), - }, - )), - sequence, - peer_id: self_name.clone(), - }; - record_ack(&peer_id, true); - if tx.send(Ok(ack)).await.is_err() { - break; - } - } - } - StreamMessageType::Ack => { - log::debug!( - "Received ACK from {}: sequence={}", - peer_id, - msg.sequence - ); - if let Some(gossip::stream_message::Payload::Ack(ack)) = - &msg.payload - { - record_ack(&peer_id, ack.success); - } - } - StreamMessageType::Heartbeat => { - // Send heartbeat back - let heartbeat = StreamMessage { - message_type: StreamMessageType::Heartbeat as i32, - payload: None, - sequence, - peer_id: self_name.clone(), - }; - if tx.send(Ok(heartbeat)).await.is_err() { - break; - } - } - StreamMessageType::StreamBatch => { - if let Some(mesh_kv) = &mesh_kv { - if let Some(gossip::stream_message::Payload::StreamBatch( - batch, - )) = msg.payload - { - dispatch_stream_batch(mesh_kv, &msg.peer_id, batch.entries); - } - } - } - _ => { - log::warn!( - "Unknown message type from {}: {:?}", - peer_id, - msg.message_type - ); - } - } - } + let msg = match tokio::time::timeout(STREAM_IDLE_TIMEOUT, incoming.next()).await { + Ok(Some(Ok(msg))) => msg, Ok(Some(Err(e))) => { log::error!("Error receiving stream message: {}", e); - record_nack(&peer_id); - update_peer_connections(&peer_id, false); - record_peer_reconnect(&peer_id); break; } Ok(None) => break, Err(_) => { - tracing::warn!( + log::warn!( + peer = %peer_id, "sync_stream idle timeout ({STREAM_IDLE_TIMEOUT:?}) — closing" ); break; } + }; + + // Bind peer_id to the first non-empty inbound id. A later + // frame whose msg.peer_id (empty or otherwise) doesn't + // match is treated as identity change and closes the + // stream. Pre-mTLS-binding defence; mTLS-derived + // identity is the authoritative long-term fix. 
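The binding policy in that comment reduces to three cases, and distilling it into a pure function makes the invariant easier to audit. A sketch of the rule the code below implements (function and enum names are assumptions, not the surrounding code):

```rust
/// Outcome of checking one inbound frame against the stream's bound identity.
pub enum IdCheck {
    Bind(String),
    Keep,
    Close,
}

/// First non-empty id wins; any later frame that disagrees (including an
/// empty id after one was bound) is treated as an identity change.
pub fn check_peer_id(bound: Option<&str>, frame_id: &str) -> IdCheck {
    match bound {
        None if frame_id.is_empty() => IdCheck::Keep, // nothing to bind yet
        None => IdCheck::Bind(frame_id.to_string()),
        Some(b) if frame_id == b => IdCheck::Keep,
        Some(_) => IdCheck::Close, // identity changed mid-stream
    }
}
```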
+ if peer_id.is_empty() { + if !msg.peer_id.is_empty() { + peer_id = msg.peer_id.clone(); + update_peer_connections(&peer_id, true); + *learned_peer_inbound.write() = Some(peer_id.clone()); + } + } else if msg.peer_id != peer_id { + log::warn!( + expected_peer_id = %peer_id, + received_peer_id = %msg.peer_id, + "peer_id changed mid-stream; closing sync_stream" + ); + break; + } + sequence = sequence.max(msg.sequence); + + match msg.message_type() { + StreamMessageType::Heartbeat => { + let heartbeat = StreamMessage { + message_type: StreamMessageType::Heartbeat as i32, + payload: None, + sequence, + peer_id: self_name.clone(), + }; + if tx.send(Ok(heartbeat)).await.is_err() { + break; + } + } + StreamMessageType::Ack => { + if let Some(gossip::stream_message::Payload::Ack(ack)) = &msg.payload { + record_ack(&peer_id, ack.success); + } + } + StreamMessageType::Nack => record_nack(&peer_id), + StreamMessageType::StreamBatch => { + if let ( + Some(mesh_kv), + Some(gossip::stream_message::Payload::StreamBatch(batch)), + ) = (&mesh_kv, msg.payload) + { + dispatch_stream_batch(mesh_kv, &msg.peer_id, batch.entries); + } + } + StreamMessageType::IncrementalUpdate + | StreamMessageType::SnapshotRequest + | StreamMessageType::SnapshotChunk + | StreamMessageType::SnapshotComplete => { + log::debug!( + peer = %peer_id, + message_type = ?msg.message_type(), + "ignoring v1 wire message (state-sync removed)", + ); + } } } - // Abort the incremental sender task to release the tx - // channel and allow the stream to close cleanly. - if let Some(handle) = incremental_sender_handle { + + update_peer_connections(&peer_id, false); + record_peer_reconnect(&peer_id); + if let Some(handle) = sender_handle { handle.abort(); - let _ = handle.await; } - log::info!("Stream from {} closed", peer_id); - update_peer_connections(&peer_id, false); }); - // Convert receiver to stream let output_stream = tokio_stream::wrappers::ReceiverStream::new(rx); Ok(Response::new( Box::pin(output_stream) as Self::SyncStreamStream diff --git a/crates/mesh/src/rate_limit_window.rs b/crates/mesh/src/rate_limit_window.rs deleted file mode 100644 index 9a4ed9782..000000000 --- a/crates/mesh/src/rate_limit_window.rs +++ /dev/null @@ -1,292 +0,0 @@ -//! Rate limit time window management -//! -//! Manages time windows for global rate limiting with periodic counter resets - -use std::{sync::Arc, time::Duration}; - -use tokio::{sync::watch, time::interval}; -use tracing::{debug, info}; - -use super::sync::MeshSyncManager; - -/// Rate limit window manager -/// Handles periodic reset of rate limit counters for time window management -pub struct RateLimitWindow { - sync_manager: Arc, - window_seconds: u64, -} - -impl RateLimitWindow { - pub fn new(sync_manager: Arc, window_seconds: u64) -> Self { - Self { - sync_manager, - window_seconds, - } - } - - /// Start the window reset task - /// This task periodically resets the global rate limit counter - /// - /// # Arguments - /// * `shutdown_rx` - A watch receiver that signals when to stop the task - pub async fn start_reset_task(self, mut shutdown_rx: watch::Receiver) { - let mut interval_timer = interval(Duration::from_secs(self.window_seconds)); - info!( - "Starting rate limit window reset task with {}s interval", - self.window_seconds - ); - - loop { - tokio::select! 
{ - _ = interval_timer.tick() => { - debug!("Resetting global rate limit counter"); - self.sync_manager.reset_global_rate_limit_counter(); - } - _ = shutdown_rx.changed() => { - info!("Rate limit window reset task received shutdown signal"); - break; - } - } - } - - info!("Rate limit window reset task stopped"); - } -} - -#[cfg(test)] -mod tests { - use std::{sync::Arc, time::Duration}; - - use tokio::time::sleep; - - use super::*; - use crate::stores::{ - RateLimitConfig, StateStores, GLOBAL_RATE_LIMIT_COUNTER_KEY, GLOBAL_RATE_LIMIT_KEY, - }; - - #[test] - fn test_rate_limit_window_new() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - let window = RateLimitWindow::new(sync_manager, 60); - // Should create without panicking - assert_eq!(window.window_seconds, 60); - } - - #[test] - fn test_rate_limit_window_different_intervals() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - let window1 = RateLimitWindow::new(sync_manager.clone(), 30); - assert_eq!(window1.window_seconds, 30); - - let window2 = RateLimitWindow::new(sync_manager, 120); - assert_eq!(window2.window_seconds, 120); - } - - #[tokio::test] - async fn test_rate_limit_window_reset_task_interval() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - // Set a very short window for testing (1 second) - let window = RateLimitWindow::new(sync_manager, 1); - - // Create shutdown channel - let (shutdown_tx, shutdown_rx) = watch::channel(false); - - // Spawn the reset task - #[expect( - clippy::disallowed_methods, - reason = "test: handle is awaited with timeout below" - )] - let task_handle = tokio::spawn(async move { - window.start_reset_task(shutdown_rx).await; - }); - - // Wait a bit to allow the task to run - sleep(Duration::from_millis(1500)).await; - - // Send shutdown signal - shutdown_tx - .send(true) - .expect("failed to send shutdown signal"); - - // Wait for task to complete gracefully - let res = tokio::time::timeout(Duration::from_secs(1), task_handle).await; - assert!(res.is_ok(), "reset task did not shut down in time"); - let join_res = res.unwrap(); - assert!(join_res.is_ok(), "reset task panicked"); - - // The task should have started and stopped gracefully - } - - #[tokio::test] - async fn test_rate_limit_window_reset_task() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores.clone(), "node1".to_string())); - - // Setup membership - stores.rate_limit.update_membership(&["node1".to_string()]); - - // Setup config - let key = GLOBAL_RATE_LIMIT_KEY.to_string(); - let config = RateLimitConfig { - limit_per_second: 100, - }; - let serialized = serde_json::to_vec(&config).unwrap(); - let _ = stores.app.insert( - key.clone(), - crate::stores::AppState { - key: GLOBAL_RATE_LIMIT_KEY.to_string(), - value: serialized, - version: 1, - }, - ); - - // Increment counter - if stores.rate_limit.is_owner(GLOBAL_RATE_LIMIT_COUNTER_KEY) { - sync_manager.sync_rate_limit_inc(GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(), 10); - let value_before = sync_manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - assert!(value_before.is_some() && value_before.unwrap() > 0); - - // Create window manager with short interval for 
testing - let window = RateLimitWindow::new(sync_manager.clone(), 1); // 1 second - - // Create shutdown channel - let (shutdown_tx, shutdown_rx) = watch::channel(false); - - // Start reset task in background - #[expect( - clippy::disallowed_methods, - reason = "test: handle is awaited with timeout below" - )] - let reset_handle = tokio::spawn(async move { - window.start_reset_task(shutdown_rx).await; - }); - - // Wait a bit for reset to happen - sleep(Duration::from_millis(1500)).await; - - // Check that counter was reset (or at least decremented) - let _value_after = sync_manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - // Counter should be reset or significantly reduced - // Note: The exact value depends on timing, but it should be less than initial - - // Send shutdown signal - shutdown_tx - .send(true) - .expect("failed to send shutdown signal"); - - // Wait for task to complete gracefully - let res = tokio::time::timeout(Duration::from_secs(1), reset_handle).await; - assert!(res.is_ok(), "reset task did not shut down in time"); - let join_res = res.unwrap(); - assert!(join_res.is_ok(), "reset task panicked"); - } - } - - #[tokio::test] - async fn test_rate_limit_window_reset_with_counter() { - use crate::stores::MembershipState; - - // Use with_self_name to ensure RateLimitStore uses the same self_name - let stores = Arc::new(StateStores::with_self_name("test_node".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new( - stores.clone(), - "test_node".to_string(), - )); - - // First, add this node to membership so it can be an owner - let membership_key = "test_node".to_string(); - let membership_state = MembershipState { - name: "test_node".to_string(), - address: "127.0.0.1:8080".to_string(), - status: 1, // NodeStatus::Alive - version: 1, - metadata: Default::default(), - }; - let _ = stores.membership.insert(membership_key, membership_state); - - // Update rate limit membership so this node becomes an owner - sync_manager.update_rate_limit_membership(); - - // Check if node is owner before incrementing - let key = GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(); - let is_owner = stores.rate_limit.is_owner(&key); - assert!(is_owner, "Node should be owner of the rate limit key"); - - // Set up a rate limit counter via sync_manager - // This should increment the counter if the node is an owner - sync_manager.sync_rate_limit_inc(key.clone(), 10); - - // Verify counter exists (was created) - // Note: The actual value might be 0 due to PNCounter implementation details, - // but the counter should exist after inc is called - let counter_opt = stores.rate_limit.get_counter(&key); - assert!(counter_opt.is_some(), "Counter should exist after inc call"); - - // Verify counter was created after inc call - // Note: The actual value depends on PNCounter implementation, - // but the counter should exist after inc is called - - // Reset the counter - sync_manager.reset_global_rate_limit_counter(); - - // Verify reset was called (counter should still exist) - // The reset implementation decrements by current count, - // so the value should be 0 or negative after reset - let reset_value = stores.rate_limit.value(&key).unwrap_or(0); - // After reset, value should be <= 0 (since we decrement by current count) - assert!( - reset_value <= 0, - "Counter should be reset to 0 or less, got: {reset_value}" - ); - } - - #[test] - fn test_rate_limit_window_zero_seconds() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = 
Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - // Should handle zero seconds (though not recommended in practice) - let window = RateLimitWindow::new(sync_manager, 0); - assert_eq!(window.window_seconds, 0); - } - - #[test] - fn test_rate_limit_window_large_interval() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - // Test with a large interval - let window = RateLimitWindow::new(sync_manager, 86400); // 24 hours - assert_eq!(window.window_seconds, 86400); - } - - #[tokio::test] - async fn test_reset_global_rate_limit_counter_logic() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores.clone(), "node1".to_string())); - - // Setup membership - stores.rate_limit.update_membership(&["node1".to_string()]); - - if stores.rate_limit.is_owner(GLOBAL_RATE_LIMIT_COUNTER_KEY) { - // Increment counter - sync_manager.sync_rate_limit_inc(GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(), 20); - let value_before = sync_manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - assert!(value_before.is_some() && value_before.unwrap() > 0); - - // Reset - sync_manager.reset_global_rate_limit_counter(); - - // Check that counter was reset - let value_after = sync_manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - // Should be 0 or negative after reset - assert!(value_after.is_none() || value_after.unwrap() <= 0); - } - } -} diff --git a/crates/mesh/src/service.rs b/crates/mesh/src/service.rs index 847c172bd..afc416d32 100644 --- a/crates/mesh/src/service.rs +++ b/crates/mesh/src/service.rs @@ -30,11 +30,8 @@ use gossip::{ use crate::{ controller::MeshController, mtls::{MTLSConfig, MTLSManager}, - node_state_machine::{ConvergenceConfig, NodeStateMachine}, partition::PartitionDetector, ping_server::GossipService, - stores::{AppState, StateStores}, - sync::MeshSyncManager, }; pub type ClusterState = Arc>>; @@ -53,14 +50,10 @@ pub struct MeshServerConfig { /// node discovery(TODO), node status update(TODO), etc. pub struct MeshServerHandler { pub state: ClusterState, - pub stores: Arc, - pub sync_manager: Arc, pub self_name: String, _self_addr: SocketAddr, signal_tx: watch::Sender, partition_detector: Option>, - state_machine: Option>, - rate_limit_task_handle: std::sync::Mutex>>, /// Shared with the MeshServer so adapters can subscribe to stream /// namespaces (broadcast/targeted) and publish values that reach /// peers via the gossip loop. 
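After this change the handler keeps only the `watch::Sender` for shutdown: the removed per-task `JoinHandle` bookkeeping is replaced by having every background task select on its own `watch::Receiver` clone. A minimal sketch of that pattern, assuming the tokio runtime and illustrative task contents:

```rust
use std::time::Duration;
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (signal_tx, signal_rx) = watch::channel(false);

    let mut task_rx = signal_rx.clone();
    let task = tokio::spawn(async move {
        loop {
            tokio::select! {
                _ = tokio::time::sleep(Duration::from_millis(50)) => {
                    // periodic work would go here
                }
                _ = task_rx.changed() => break, // shutdown signal observed
            }
        }
    });

    // Mirrors the `signal_tx.send(true).ok()` call sites in the handler.
    signal_tx.send(true).ok();
    task.await.unwrap();
}
```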
@@ -73,19 +66,6 @@ impl MeshServerHandler { self.partition_detector.as_ref() } - /// Get state machine - pub fn state_machine(&self) -> Option<&Arc> { - self.state_machine.as_ref() - } - - /// Check if node is ready - pub fn is_ready(&self) -> bool { - self.state_machine - .as_ref() - .map(|sm| sm.is_ready()) - .unwrap_or(true) // If no state machine, consider ready - } - /// Check if we should serve (have quorum) pub fn should_serve(&self) -> bool { self.partition_detector @@ -94,48 +74,9 @@ impl MeshServerHandler { .unwrap_or(true) // If no partition detector, consider should serve } - /// Start rate limit window reset task - /// This task will periodically reset the global rate limit counter - pub fn start_rate_limit_task(&self, window_seconds: u64) { - use crate::rate_limit_window::RateLimitWindow; - - let window_manager = RateLimitWindow::new(self.sync_manager.clone(), window_seconds); - let shutdown_rx = self.signal_tx.subscribe(); - - #[expect( - clippy::disallowed_methods, - reason = "handle is stored in rate_limit_task_handle and awaited on shutdown via stop_rate_limit_task" - )] - let handle = tokio::spawn(async move { - window_manager.start_reset_task(shutdown_rx).await; - }); - - if let Ok(mut task_handle) = self.rate_limit_task_handle.lock() { - *task_handle = Some(handle); - } - } - - /// Stop rate limit window reset task - pub fn stop_rate_limit_task(&self) { - self.signal_tx.send(true).ok(); - if let Ok(mut task_handle) = self.rate_limit_task_handle.lock() { - if let Some(handle) = task_handle.take() { - #[expect( - clippy::disallowed_methods, - reason = "short-lived join task that awaits the rate_limit_task handle during shutdown; completes when the inner task finishes" - )] - tokio::spawn(async move { - if let Err(err) = handle.await { - log::warn!("Rate limit task shutdown failed: {}", err); - } - }); - } - } - } - /// Shutdown immediately without graceful shutdown pub fn shutdown(&self) { - self.stop_rate_limit_task(); + self.signal_tx.send(true).ok(); } /// Graceful shutdown: broadcast LEAVING status to all alive nodes, @@ -172,7 +113,7 @@ impl MeshServerHandler { let (leaving_node, alive_nodes) = match maybe_leaving { Some(values) => values, None => { - self.stop_rate_limit_task(); + self.signal_tx.send(true).ok(); return Ok(()); } }; @@ -204,69 +145,11 @@ impl MeshServerHandler { ); tokio::time::sleep(propagation_delay).await; - log::info!("Stopping rate limit task and signaling shutdown"); - self.stop_rate_limit_task(); - Ok(()) - } - - /// Calculate the next version for a key - /// If the key exists, increment its version by 1 - /// If the key doesn't exist, start with version 1 - fn next_version(&self, key: &str) -> u64 { - self.stores - .app - .get(key) - .map(|app_state| app_state.version + 1) - .unwrap_or(1) - } - - pub fn write_data(&self, key: String, value: Vec) -> Result<()> { - // Keep app store write and metadata/version update in one lock scope. 
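The removed `write_data` path pairs `next_version` with the insert: read the current version, bump it, and write value and version together so concurrent writers cannot interleave between the read and the write. A standalone sketch of just the versioning rule (a plain `HashMap` stands in for the CRDT-backed `AppStore`; the single `&mut` borrow plays the role of the lock scope noted above):

```rust
use std::collections::HashMap;

#[derive(Clone)]
pub struct AppState {
    pub key: String,
    pub value: Vec<u8>,
    pub version: u64,
}

/// Version starts at 1 for new keys and increments monotonically after that.
pub fn write_versioned(
    store: &mut HashMap<String, AppState>,
    key: String,
    value: Vec<u8>,
) -> u64 {
    let version = store.get(&key).map(|s| s.version + 1).unwrap_or(1);
    store.insert(key.clone(), AppState { key, value, version });
    version
}
```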
- let mut state = self.state.write(); - let node = state.get_mut(&self.self_name).ok_or_else(|| { - anyhow::anyhow!( - "Node {} not found in cluster state during write_data", - self.self_name - ) - })?; - - let version = self.next_version(&key); - let app_state = AppState { - key: key.clone(), - value: value.clone(), - version, - }; - self.stores - .app - .insert(key.clone(), app_state) - .map_err(|err| anyhow::anyhow!("Failed to persist app state for key {key}: {err}"))?; - - node.metadata.insert(key, value); - node.version += 1; + log::info!("Signaling shutdown"); + self.signal_tx.send(true).ok(); Ok(()) } - pub fn read_data(&self, key: String) -> Option> { - // Read from the app store - self.stores - .app - .get(&key) - .map(|app_state| app_state.value.clone()) - } - - /// Get operation log of the app store for synchronization - /// Returns an operation log that can be merged into other nodes - pub fn get_operation_log(&self) -> crate::crdt_kv::OperationLog { - self.stores.app.get_operation_log() - } - - /// Sync app store data from an operation log (for testing and manual sync) - /// This will be replaced by automatic sync stream in the future - pub fn sync_app_from_log(&self, log: &crate::crdt_kv::OperationLog) { - // Merge operation log into our app store using CRDT merge - self.stores.app.merge(log); - } - /// Shared MeshKV handle — adapters subscribe to stream namespaces /// and publish values through this. The handle is Arc-cloned, so /// subscribers created here see the same events as the gossip loop. @@ -277,7 +160,6 @@ impl MeshServerHandler { pub struct MeshServerBuilder { state: ClusterState, - stores: Arc, self_name: String, bind_addr: SocketAddr, advertise_addr: SocketAddr, @@ -302,10 +184,8 @@ impl MeshServerBuilder { metadata: HashMap::new(), }, )]))); - let stores = Arc::new(StateStores::with_self_name(self_name.clone())); Self { state, - stores, self_name, bind_addr, advertise_addr, @@ -322,22 +202,10 @@ impl MeshServerBuilder { pub fn build(&self) -> (MeshServer, MeshServerHandler) { let (signal_tx, signal_rx) = watch::channel(false); let partition_detector = Arc::new(PartitionDetector::default()); - let sync_manager = Arc::new(MeshSyncManager::new( - self.stores.clone(), - self.self_name.clone(), - )); - let state_machine = Arc::new(NodeStateMachine::new( - self.stores.clone(), - ConvergenceConfig::default(), - )); - // Initialize rate-limit hash ring with current membership - sync_manager.update_rate_limit_membership(); let mesh_kv = Arc::new(crate::kv::MeshKV::new(self.self_name.clone())); ( MeshServer { state: self.state.clone(), - stores: self.stores.clone(), - sync_manager: sync_manager.clone(), self_name: self.self_name.clone(), bind_addr: self.bind_addr, advertise_addr: self.advertise_addr, @@ -349,14 +217,10 @@ impl MeshServerBuilder { }, MeshServerHandler { state: self.state.clone(), - stores: self.stores.clone(), - sync_manager, self_name: self.self_name.clone(), _self_addr: self.advertise_addr, signal_tx, partition_detector: Some(partition_detector), - state_machine: Some(state_machine), - rate_limit_task_handle: std::sync::Mutex::new(None), mesh_kv, }, ) @@ -380,8 +244,6 @@ impl From<&MeshServerConfig> for MeshServerBuilder { pub struct MeshServer { state: ClusterState, - stores: Arc, - sync_manager: Arc, self_name: String, bind_addr: SocketAddr, advertise_addr: SocketAddr, @@ -410,8 +272,6 @@ impl MeshServer { self.advertise_addr, &self.self_name, self.init_peer, - self.stores.clone(), - self.sync_manager.clone(), self.mtls_manager.clone(), ) 
.with_mesh_kv(self.mesh_kv.clone()) @@ -453,22 +313,17 @@ impl MeshServer { .clone() .expect("partition detector missing"); - // Build controller first so we can share its current_batch with the - // server-side sync_stream handlers. This ensures both client-side - // (outgoing connections) and server-side (incoming connections) use - // the same centrally collected RoundBatch. + // Build controller first so we can share its current_stream_batch + // with server-side sync_stream handlers. let controller = self.build_controller(); let mut service = self.build_ping_server(); - service = service.with_stores(self.stores.clone()); - - service = service.with_sync_manager(self.sync_manager.clone()); service = service.with_partition_detector(partition_detector); - // Share the controller's current_batch so server-side sync_stream - // handlers use the same centrally collected data as client-side. - service = service.with_current_batch(controller.current_batch()); + // Share the controller's current_stream_batch so server-side + // sync_stream handlers see the same drained stream entries as + // client-side. service = service.with_current_stream_batch(controller.current_stream_batch()); // Add mTLS support if configured @@ -791,10 +646,6 @@ mod tests { ) .await; - handler_a - .write_data("hello".into(), "world".into()) - .unwrap(); - // 2. Add C and D let (listener_c, addr_c) = bind_node().await; let handler_c = mesh_run!("C", listener_c, addr_c, Some(addr_a)); diff --git a/crates/mesh/src/stores.rs b/crates/mesh/src/stores.rs deleted file mode 100644 index 188907c7d..000000000 --- a/crates/mesh/src/stores.rs +++ /dev/null @@ -1,1091 +0,0 @@ -//! State stores for mesh cluster synchronization -//! -//! Four types of state stores: -//! - MembershipStore: Router node membership -//! - AppStore: Application configuration, rate-limiting rules, LB algorithms -//! - WorkerStore: Worker status, load, health -//! - PolicyStore: Routing policy internal state - -use std::{ - collections::{BTreeMap, BTreeSet}, - marker::PhantomData, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, -}; - -use dashmap::DashMap; -use parking_lot::RwLock; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use tracing::debug; - -use super::{ - consistent_hash::ConsistentHashRing, - crdt_kv::{CrdtOrMap, Operation, OperationLog, ReplicaId}, - tree_ops::TreeOperation, -}; - -// ============================================================================ -// Type-Safe Serialization Layer - Transparent T ↔ Vec Conversion -// ============================================================================ - -/// Trait for CRDT-compatible value types. -/// Uses bincode for compact binary serialization. This is critical for -/// PolicyState which contains TreeState with token payloads — JSON -/// serialization of Vec is ~4x larger than binary. -trait CrdtValue: Serialize + DeserializeOwned + Clone { - fn to_bytes(&self) -> Result, CrdtSerError> { - bincode::serialize(self).map_err(CrdtSerError) - } - - fn from_bytes(bytes: &[u8]) -> Result { - bincode::deserialize(bytes).map_err(CrdtSerError) - } -} - -/// Serialization error wrapper for CRDT values. 
-#[derive(Debug)]
-pub struct CrdtSerError(Box<bincode::ErrorKind>);
-
-impl std::fmt::Display for CrdtSerError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "CRDT serialization error: {}", self.0)
-    }
-}
-
-impl std::error::Error for CrdtSerError {}
-
-// Blanket implementation for all types that satisfy the bounds
-impl<T> CrdtValue for T where T: Serialize + DeserializeOwned + Clone {}
-
-// ============================================================================
-// Generic CRDT Store Wrapper - Type-Safe Interface Over CrdtOrMap
-// ============================================================================
-
-/// Generic store wrapper providing type-safe operations over CrdtOrMap
-#[derive(Clone)]
-struct CrdtStore<T> {
-    inner: CrdtOrMap,
-    _phantom: PhantomData<T>,
-}
-
-impl<T> std::fmt::Debug for CrdtStore<T> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("CrdtStore")
-            .field("inner", &"<CrdtOrMap>")
-            .finish()
-    }
-}
-
-impl<T: CrdtValue> CrdtStore<T> {
-    fn new() -> Self {
-        Self {
-            inner: CrdtOrMap::new(),
-            _phantom: PhantomData,
-        }
-    }
-
-    /// Mutation generation counter. Cheap check to skip unchanged stores.
-    fn generation(&self) -> u64 {
-        self.inner.generation()
-    }
-
-    fn get(&self, key: &str) -> Option<T> {
-        self.inner.get(key).and_then(|bytes| {
-            T::from_bytes(&bytes)
-                .map_err(|err| {
-                    debug!(error = %err, %key, "Failed to deserialize CRDT value");
-                })
-                .ok()
-        })
-    }
-
-    fn insert(&self, key: String, value: T) -> Result<Option<T>, CrdtSerError> {
-        let bytes = value.to_bytes().map_err(|err| {
-            debug!(error = %err, %key, "Failed to serialize CRDT value");
-            err
-        })?;
-
-        Ok(self.inner.insert(key, bytes).and_then(|old_bytes| {
-            T::from_bytes(&old_bytes)
-                .map_err(|err| {
-                    debug!(error = %err, "Failed to deserialize old CRDT value");
-                })
-                .ok()
-        }))
-    }
-
-    fn remove(&self, key: &str) -> Option<T> {
-        self.inner.remove(key).and_then(|bytes| {
-            T::from_bytes(&bytes)
-                .map_err(|err| {
-                    debug!(error = %err, %key, "Failed to deserialize removed CRDT value");
-                })
-                .ok()
-        })
-    }
-
-    fn update<F>(&self, key: String, updater: F) -> Result<Option<T>, CrdtSerError>
-    where
-        F: FnOnce(Option<T>) -> T,
-    {
-        let updated_bytes = self.inner.try_upsert(key, |current_bytes| {
-            let current = current_bytes.and_then(|bytes| {
-                T::from_bytes(bytes)
-                    .map_err(|err| {
-                        debug!(error = %err, "Failed to deserialize current CRDT value");
-                    })
-                    .ok()
-            });
-
-            let updated = updater(current);
-            updated.to_bytes()
-        })?;
-
-        Ok(T::from_bytes(&updated_bytes)
-            .map_err(|err| {
-                debug!(error = %err, "Failed to deserialize updated CRDT value");
-                err
-            })
-            .ok())
-    }
-
-    fn update_if<F>(&self, key: String, updater: F) -> Result<(Option<T>, bool), CrdtSerError>
-    where
-        F: FnOnce(Option<T>) -> Option<T>,
-    {
-        let (updated_bytes, changed) = self.inner.try_upsert_if(key, |current_bytes| {
-            let current = current_bytes.and_then(|bytes| {
-                T::from_bytes(bytes)
-                    .map_err(|err| {
-                        debug!(error = %err, "Failed to deserialize current CRDT value");
-                    })
-                    .ok()
-            });
-
-            let updated = updater(current);
-            updated.map(|value| value.to_bytes()).transpose()
-        })?;
-
-        let value = T::from_bytes(&updated_bytes)
-            .map_err(|err| {
-                debug!(error = %err, "Failed to deserialize conditionally updated CRDT value");
-                err
-            })
-            .ok();
-
-        Ok((value, changed))
-    }
-
-    fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    fn merge(&self, log: &OperationLog) {
-        self.inner.merge(log);
-    }
-
-    fn get_operation_log(&self) -> OperationLog {
-        self.inner.get_operation_log()
-    }
-
-    fn all(&self) -> BTreeMap<String, T> {
-        self.inner
-            .all()
-            .into_iter()
-            .filter_map(|(k, v)| {
-                let key_for_log = k.clone();
-                T::from_bytes(&v)
-                    .map(|val| (k, val))
-                    .map_err(|err| {
-                        debug!(error = %err, key = %key_for_log, "Failed to deserialize CRDT value in all()");
-                    })
-                    .ok()
-            })
-            .collect()
-    }
-
-    /// Remove tombstoned keys from CRDT metadata maps.
-    fn gc_tombstones(&self) -> usize {
-        self.inner.gc_tombstones()
-    }
-}
-
-impl<T: CrdtValue> Default for CrdtStore<T> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Store type identifier
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum StoreType {
-    Membership,
-    App,
-    Worker,
-    Policy,
-    RateLimit,
-}
-
-impl StoreType {
-    pub fn as_str(self) -> &'static str {
-        match self {
-            StoreType::Membership => "membership",
-            StoreType::App => "app",
-            StoreType::Worker => "worker",
-            StoreType::Policy => "policy",
-            StoreType::RateLimit => "rate_limit",
-        }
-    }
-
-    /// Convert to proto StoreType (i32)
-    pub fn to_proto(self) -> i32 {
-        use super::service::gossip::StoreType as ProtoStoreType;
-        match self {
-            StoreType::Membership => ProtoStoreType::Membership as i32,
-            StoreType::App => ProtoStoreType::App as i32,
-            StoreType::Worker => ProtoStoreType::Worker as i32,
-            StoreType::Policy => ProtoStoreType::Policy as i32,
-            StoreType::RateLimit => ProtoStoreType::RateLimit as i32,
-        }
-    }
-
-    /// Convert from proto StoreType (i32) to local StoreType
-    pub fn from_proto(proto_value: i32) -> Self {
-        match proto_value {
-            0 => StoreType::Membership,
-            1 => StoreType::App,
-            2 => StoreType::Worker,
-            3 => StoreType::Policy,
-            4 => StoreType::RateLimit,
-            unknown => {
-                tracing::warn!(
-                    proto_value = unknown,
-                    "Unknown StoreType proto value, defaulting to Membership"
-                );
-                StoreType::Membership
-            }
-        }
-    }
-}
-
-/// Membership state entry
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)]
-pub struct MembershipState {
-    pub name: String,
-    pub address: String,
-    pub status: i32, // NodeStatus enum value
-    pub version: u64,
-    pub metadata: BTreeMap<String, Vec<u8>>,
-}
-
-/// App state entry (application configuration)
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)]
-pub struct AppState {
-    pub key: String,
-    pub value: Vec<u8>, // Serialized config
-    pub version: u64,
-}
-
-/// Global rate limit configuration
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
-pub struct RateLimitConfig {
-    pub limit_per_second: u64,
-}
-
-/// Key for global rate limit configuration in AppStore
-pub const GLOBAL_RATE_LIMIT_KEY: &str = "global_rate_limit";
-/// Key for global rate limit counter in RateLimitStore
-pub const GLOBAL_RATE_LIMIT_COUNTER_KEY: &str = "global";
-
-// `WorkerState` is now defined in `crate::types`; re-exported here
-// so legacy internal modules (sync, collector) keep compiling
-// during the v1 teardown. The whole `stores.rs` file is slated
-// for deletion once those callers are gone.
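The `to_proto`/`from_proto` pair above is total: every local variant round-trips through the wire value, and unknown values degrade to `Membership` with a warning instead of failing. A test sketch pinning that property down (the proto constants are mirrored as plain `i32`s here, so this is an assumption-laden standalone version, not the original mapping):

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
enum StoreType { Membership, App, Worker, Policy, RateLimit }

impl StoreType {
    fn to_proto(self) -> i32 { self as i32 } // variants declared in wire order
    fn from_proto(v: i32) -> Self {
        match v {
            0 => Self::Membership,
            1 => Self::App,
            2 => Self::Worker,
            3 => Self::Policy,
            4 => Self::RateLimit,
            _ => Self::Membership, // unknown values degrade safely
        }
    }
}

#[test]
fn proto_round_trip() {
    for st in [StoreType::Membership, StoreType::App, StoreType::Worker,
               StoreType::Policy, StoreType::RateLimit] {
        assert_eq!(StoreType::from_proto(st.to_proto()), st);
    }
    assert_eq!(StoreType::from_proto(99), StoreType::Membership);
}
```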
-pub use crate::types::WorkerState; - -/// Policy state entry -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)] -pub struct PolicyState { - pub model_id: String, - pub policy_type: String, - pub config: Vec, // Serialized policy config - pub version: u64, -} - -/// Helper function to get policy state key for a model -pub fn policy_key(model_id: &str) -> String { - format!("policy:{model_id}") -} - -/// Helper function to get tree state key for a model -pub fn tree_state_key(model_id: &str) -> String { - format!("tree:{model_id}") -} - -macro_rules! define_state_store { - ($store_name:ident, $value_type:ty) => { - #[derive(Debug, Clone)] - pub struct $store_name { - inner: CrdtStore<$value_type>, - } - - impl $store_name { - pub fn new() -> Self { - Self { - inner: CrdtStore::new(), - } - } - - /// Mutation generation counter. Cheap check to skip unchanged stores. - pub fn generation(&self) -> u64 { - self.inner.generation() - } - - pub fn get(&self, key: &str) -> Option<$value_type> { - self.inner.get(key) - } - - pub fn insert( - &self, - key: String, - value: $value_type, - ) -> Result, CrdtSerError> { - self.inner.insert(key, value) - } - - pub fn remove(&self, key: &str) { - self.inner.remove(key); - } - - pub fn merge(&self, log: &OperationLog) { - self.inner.merge(log); - } - - pub fn get_operation_log(&self) -> OperationLog { - self.inner.get_operation_log() - } - - pub fn update( - &self, - key: String, - updater: F, - ) -> Result, CrdtSerError> - where - F: FnOnce(Option<$value_type>) -> $value_type, - { - self.inner.update(key, updater) - } - - pub fn update_if( - &self, - key: String, - updater: F, - ) -> Result<(Option<$value_type>, bool), CrdtSerError> - where - F: FnOnce(Option<$value_type>) -> Option<$value_type>, - { - self.inner.update_if(key, updater) - } - - pub fn len(&self) -> usize { - self.inner.len() - } - - pub fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - pub fn all(&self) -> BTreeMap { - self.inner.all() - } - - /// Remove tombstoned keys from CRDT metadata to bound memory growth. 
- pub fn gc_tombstones(&self) -> usize { - self.inner.gc_tombstones() - } - } - - impl Default for $store_name { - fn default() -> Self { - Self::new() - } - } - }; -} - -define_state_store!(MembershipStore, MembershipState); -define_state_store!(AppStore, AppState); -define_state_store!(WorkerStore, WorkerState); -define_state_store!(PolicyStore, PolicyState); - -// ============================================================================ -// Rate Limit Counter - Simplified Counter Using CrdtOrMap -// ============================================================================ - -/// Counter value wrapper for rate limiting -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] -struct CounterValue { - value: i64, -} - -/// Rate-limit counter store (using CrdtOrMap with consistent hashing) -#[derive(Debug, Clone)] -pub struct RateLimitStore { - counters: CrdtStore, - hash_ring: Arc>, - self_name: String, - actor_replica_ids: Arc>, -} - -impl RateLimitStore { - const SHARD_SEPARATOR: &'static str = "::actor:"; - - pub fn new(self_name: String) -> Self { - Self { - counters: CrdtStore::new(), - hash_ring: Arc::new(RwLock::new(ConsistentHashRing::new())), - self_name, - actor_replica_ids: Arc::new(DashMap::new()), - } - } - - fn shard_key(key: &str, actor: &str) -> String { - format!("{key}{}{actor}", Self::SHARD_SEPARATOR) - } - - fn split_shard_key(shard_key: &str) -> Option<(&str, &str)> { - shard_key.rsplit_once(Self::SHARD_SEPARATOR) - } - - fn base_key(shard_key: &str) -> &str { - Self::split_shard_key(shard_key).map_or(shard_key, |(base, _)| base) - } - - fn replica_id_for_actor(&self, actor: &str) -> ReplicaId { - if let Ok(replica_id) = ReplicaId::from_string(actor) { - return replica_id; - } - - *self.actor_replica_ids.entry(actor.to_string()).or_default() - } - - fn aggregate_counter(&self, key: &str) -> Option { - let all_counters = self.counters.all(); - let mut has_shard = false; - let mut total = 0; - - for (shard_key, counter) in all_counters { - if Self::base_key(&shard_key) == key { - has_shard = true; - total += counter.value; - } - } - - if has_shard { - Some(total) - } else { - None - } - } - - /// Update the hash ring with current membership - pub fn update_membership(&self, nodes: &[String]) { - let mut ring = self.hash_ring.write(); - ring.update_membership(nodes); - debug!("Updated rate-limit hash ring with {} nodes", nodes.len()); - } - - /// Check if this node is an owner of a key - pub fn is_owner(&self, key: &str) -> bool { - let ring = self.hash_ring.read(); - ring.is_owner(key, &self.self_name) - } - - /// Get owners for a key - pub fn get_owners(&self, key: &str) -> Vec { - let ring = self.hash_ring.read(); - ring.get_owners(key) - } - - /// Get or create counter (only if this node is an owner) - #[expect(dead_code)] - fn get_or_create_counter_internal(&self, key: String) -> Option { - if !self.is_owner(&key) { - return None; - } - - let shard_key = Self::shard_key(&key, &self.self_name); - if let Some(counter) = self.counters.get(&shard_key) { - return Some(counter.value); - } - - let _ = self.counters.insert(shard_key, CounterValue::default()); - Some(0) - } - - pub fn get_counter(&self, key: &str) -> Option { - if !self.is_owner(key) { - return None; - } - self.aggregate_counter(key) - } - - /// Get all actor shards as (base_key, actor, value). 
- pub fn all_shards(&self) -> Vec<(String, String, i64)> { - self.counters - .all() - .into_iter() - .filter_map(|(shard_key, counter)| { - Self::split_shard_key(&shard_key).map(|(base_key, actor)| { - (base_key.to_string(), actor.to_string(), counter.value) - }) - }) - .collect() - } - - /// Increment counter (only if this node is an owner) - pub fn inc(&self, key: String, actor: String, delta: i64) { - if !self.is_owner(&key) { - return; - } - - let shard_key = Self::shard_key(&key, &actor); - if let Err(err) = self.counters.update(shard_key, |current| CounterValue { - value: current.map_or(delta, |existing| existing.value + delta), - }) { - debug!(error = %err, %key, %actor, "Failed to update rate-limit counter shard"); - } - } - - /// Set a snapshot value for one actor shard. - pub fn set_counter_snapshot(&self, key: String, actor: String, counter_value: i64) { - if !self.is_owner(&key) { - return; - } - - let shard_key = Self::shard_key(&key, &actor); - if let Err(err) = self.counters.insert( - shard_key, - CounterValue { - value: counter_value, - }, - ) { - debug!(error = %err, %key, %actor, "Failed to set rate-limit counter snapshot"); - } - } - - /// Build serialized snapshot payload and shard key for a counter value. - /// - /// NOTE: This intentionally does not fabricate CRDT operation IDs. - pub fn snapshot_payload_for_counter_value( - key: String, - actor: String, - counter_value: i64, - ) -> Option<(String, Vec)> { - let bytes = match (CounterValue { - value: counter_value, - }) - .to_bytes() - { - Ok(bytes) => bytes, - Err(err) => { - debug!(error = %err, "Failed to serialize rate-limit counter snapshot"); - return None; - } - }; - - let shard_key = Self::shard_key(&key, &actor); - Some((shard_key, bytes)) - } - - pub fn apply_counter_snapshot_payload( - &self, - shard_key: String, - actor: &str, - timestamp: u64, - payload: &[u8], - ) { - let Some((base_key, _)) = Self::split_shard_key(&shard_key) else { - debug!(%shard_key, "Invalid rate-limit shard key in snapshot payload"); - return; - }; - - if !self.is_owner(base_key) { - return; - } - - if let Err(err) = CounterValue::from_bytes(payload) { - debug!(error = %err, %shard_key, "Failed to decode rate-limit snapshot payload"); - return; - } - - let replica_id = self.replica_id_for_actor(actor); - let mut log = OperationLog::new(); - log.append(Operation::insert( - shard_key, - payload.to_vec(), - timestamp, - replica_id, - )); - self.counters.merge(&log); - } - - /// Get counter value - pub fn value(&self, key: &str) -> Option { - self.aggregate_counter(key) - } - - /// Merge operation log from another node - pub fn merge(&self, log: &OperationLog) { - self.counters.merge(log); - } - - /// Get operation log for synchronization - pub fn get_operation_log(&self) -> OperationLog { - self.counters.get_operation_log() - } - - /// Get all counter keys - pub fn keys(&self) -> Vec { - self.counters - .all() - .keys() - .map(|key| Self::base_key(key).to_string()) - .collect::>() - .into_iter() - .collect() - } - - /// Check if we need to transfer ownership due to node failure - pub fn check_ownership_transfer(&self, failed_nodes: &[String]) -> Vec { - let mut affected_keys = Vec::new(); - let ring = self.hash_ring.read(); - for key in self.keys() { - let owners = ring.get_owners(&key); - if owners.iter().any(|owner| failed_nodes.contains(owner)) - && ring.is_owner(&key, &self.self_name) - { - affected_keys.push(key); - } - } - - affected_keys - } -} - -impl Default for RateLimitStore { - fn default() -> Self { - 
Self::new("default".to_string()) - } -} - -/// All state stores container -#[derive(Debug, Clone)] -pub struct StateStores { - pub membership: MembershipStore, - pub app: AppStore, - pub worker: WorkerStore, - pub policy: PolicyStore, - pub rate_limit: RateLimitStore, - /// Pending tree operations for delta sync. - /// Key: tree key (e.g., "tree:model-name"), Value: operations since last successful send. - pub tree_ops_pending: DashMap>, - /// Per-key version counters for tree state, bumped atomically on every - /// `sync_tree_operation` call. Replaces the expensive CRDT - /// `policy.update()` that previously serialized the entire TreeState - /// config blob (~1 MB) on every request. - pub tree_versions: DashMap>, - /// Global generation counter for tree changes. The incremental collector - /// checks this (in addition to `policy.generation()`) to decide whether - /// the policy store needs scanning. - pub tree_generation: Arc, - /// Materialized tree state config blobs, stored outside the CRDT policy - /// store to avoid operation log memory accumulation (~50 MB/min leak). - /// Key: tree key (e.g., "tree:model-name"), Value: bincode-serialized TreeState. - pub tree_configs: DashMap>, - /// Tenant delta buffer for efficient two-layer sync. - /// Key: model_id, Value: pending tenant inserts since last gossip round. - /// Drained by the collector each round and sent as TenantDelta messages. - pub tenant_delta_inserts: DashMap>, - /// Tenant eviction buffer — same pattern as inserts. - pub tenant_delta_evictions: DashMap>, -} - -impl StateStores { - pub fn new() -> Self { - Self { - membership: MembershipStore::new(), - app: AppStore::new(), - worker: WorkerStore::new(), - policy: PolicyStore::new(), - rate_limit: RateLimitStore::new("default".to_string()), - tree_ops_pending: DashMap::new(), - tree_versions: DashMap::new(), - tree_generation: Arc::new(AtomicU64::new(0)), - tree_configs: DashMap::new(), - tenant_delta_inserts: DashMap::new(), - tenant_delta_evictions: DashMap::new(), - } - } - - pub fn with_self_name(self_name: String) -> Self { - Self { - membership: MembershipStore::new(), - app: AppStore::new(), - worker: WorkerStore::new(), - policy: PolicyStore::new(), - rate_limit: RateLimitStore::new(self_name), - tree_ops_pending: DashMap::new(), - tree_versions: DashMap::new(), - tree_generation: Arc::new(AtomicU64::new(0)), - tree_configs: DashMap::new(), - tenant_delta_inserts: DashMap::new(), - tenant_delta_evictions: DashMap::new(), - } - } - - /// Get the current version for a tree key. - pub fn tree_version(&self, key: &str) -> u64 { - self.tree_versions - .get(key) - .map(|v| v.load(Ordering::Acquire)) - .unwrap_or(0) - } - - /// Atomically bump the version for a tree key and the global tree - /// generation. Returns the new version. This is O(1) with no - /// serialization — unlike `policy.update()` which deserializes and - /// re-serializes the entire config blob. - /// - /// On the first call for a given key the counter is seeded from the - /// existing PolicyState version (if any) so that local ops don't - /// regress the advertised version below a remote/checkpointed baseline. - pub fn bump_tree_version(&self, key: &str) -> u64 { - let version = self - .tree_versions - .entry(key.to_string()) - .or_insert_with(|| { - // Seed from the committed tree config version so deltas - // start above any existing remote/checkpointed state. 
- let base = self - .tree_configs - .get(key) - .and_then(|bytes| { - super::tree_ops::TreeState::from_bytes(&bytes) - .ok() - .map(|ts| ts.version) - }) - .unwrap_or(0); - Arc::new(AtomicU64::new(base)) - }) - .fetch_add(1, Ordering::Release) - + 1; - self.tree_generation.fetch_add(1, Ordering::Release); - version - } - - /// Advance the tree version counter to at least `version`. - /// Called after applying remote deltas/full-state updates so that - /// subsequent local ops start above the remote baseline. - pub fn advance_tree_version(&self, key: &str, version: u64) { - self.tree_versions - .entry(key.to_string()) - .or_insert_with(|| Arc::new(AtomicU64::new(0))) - .fetch_max(version, Ordering::Release); - } - - /// Run garbage collection across all stores, removing tombstoned CRDT - /// metadata entries. Returns the total number of entries removed. - pub fn gc_tombstones(&self) -> usize { - self.membership.gc_tombstones() - + self.app.gc_tombstones() - + self.worker.gc_tombstones() - + self.policy.gc_tombstones() - } - - /// Remove stale tree entries that have no pending operations. - /// Returns the total number of entries removed across all tree maps. - /// - /// `tree_versions` entries are never removed during normal operation, so - /// using `tree_versions.contains_key()` as a liveness signal is - /// ineffective — it always returns true for any key that was ever used. - /// Instead, we use pending ops as the primary liveness indicator. - pub fn gc_stale_tree_entries(&self) -> usize { - let before = - self.tree_configs.len() + self.tree_versions.len() + self.tree_ops_pending.len(); - - // tree_configs is the authoritative store — only remove entries - // for models that are truly gone (no version counter AND no - // pending ops). - // - // An active tree has: tree_configs entry (from checkpoint or - // remote apply). Pending ops drain every 10 rounds via - // checkpoint, so empty pending does NOT mean the tree is stale. - - // Remove tree_versions for models with no tree_configs AND no - // pending ops (model was fully deregistered). - self.tree_versions.retain(|k, _| { - self.tree_configs.contains_key(k) - || self.tree_ops_pending.get(k).is_some_and(|v| !v.is_empty()) - }); - - // Remove empty pending op buffers for models with no tree_configs. - self.tree_ops_pending - .retain(|k, v| !v.is_empty() || self.tree_configs.contains_key(k)); - - // Only remove tree_configs for models with no version counter - // AND no pending ops — these are truly orphaned entries. 
- self.tree_configs.retain(|k, _| { - self.tree_versions.contains_key(k) - || self.tree_ops_pending.get(k).is_some_and(|v| !v.is_empty()) - }); - - let after = - self.tree_configs.len() + self.tree_versions.len() + self.tree_ops_pending.len(); - before.saturating_sub(after) - } -} - -impl Default for StateStores { - fn default() -> Self { - Self::new() - } -} - -#[cfg(test)] -mod tests { - use std::collections::BTreeMap; - - use super::*; - use crate::service::gossip::NodeStatus; - - #[test] - fn test_membership_store() { - let store = MembershipStore::new(); - let key = "node1".to_string(); - let state = MembershipState { - name: "node1".to_string(), - address: "127.0.0.1:8000".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: BTreeMap::new(), - }; - - let _ = store.insert(key.clone(), state.clone()); - assert_eq!(store.get(&key).unwrap().name, "node1"); - - store.remove(&key); - assert!(store.get(&key).is_none()); - } - - #[test] - fn test_app_store() { - let store = AppStore::new(); - let key = "app_key1".to_string(); - let state = AppState { - key: "app_key1".to_string(), - value: b"app_value".to_vec(), - version: 1, - }; - - let _ = store.insert(key.clone(), state.clone()); - assert_eq!(store.get(&key).unwrap().key, "app_key1"); - } - - #[test] - fn test_worker_store() { - let store = WorkerStore::new(); - let key = "worker1".to_string(); - let state = WorkerState { - worker_id: "worker1".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8000".to_string(), - health: true, - load: 0.5, - version: 1, - spec: vec![], - }; - - let _ = store.insert(key.clone(), state.clone()); - assert_eq!(store.get(&key).unwrap().worker_id, "worker1"); - } - - #[test] - fn test_policy_store() { - let store = PolicyStore::new(); - let key = "policy:model1".to_string(); - let state = PolicyState { - model_id: "model1".to_string(), - policy_type: "cache_aware".to_string(), - config: b"config_data".to_vec(), - version: 1, - }; - - let _ = store.insert(key.clone(), state.clone()); - assert_eq!(store.get(&key).unwrap().model_id, "model1"); - } - - #[test] - fn test_rate_limit_store_update_membership() { - let store = RateLimitStore::new("node1".to_string()); - - store.update_membership(&[ - "node1".to_string(), - "node2".to_string(), - "node3".to_string(), - ]); - - let owners = store.get_owners("test_key"); - assert_eq!(owners.len(), 3); - assert!( - owners.contains(&"node1".to_string()) - || owners.contains(&"node2".to_string()) - || owners.contains(&"node3".to_string()) - ); - } - - #[test] - fn test_rate_limit_store_is_owner() { - let store = RateLimitStore::new("node1".to_string()); - - store.update_membership(&["node1".to_string()]); - - let test_key = "test_key".to_string(); - let is_owner = store.is_owner(&test_key); - // node1 should be owner since it's the only node - assert!(is_owner); - } - - #[test] - fn test_rate_limit_store_inc_only_owner() { - let store = RateLimitStore::new("node1".to_string()); - - store.update_membership(&["node1".to_string()]); - - let test_key = "test_key".to_string(); - if store.is_owner(&test_key) { - store.inc(test_key.clone(), "node1".to_string(), 5); - - let value = store.value(&test_key); - assert_eq!(value, Some(5)); - } - } - - #[test] - fn test_rate_limit_store_inc_non_owner() { - let store = RateLimitStore::new("node1".to_string()); - - // Setup membership without node1 as owner - store.update_membership(&["node2".to_string(), "node3".to_string()]); - - let test_key = "test_key".to_string(); - if 
!store.is_owner(&test_key) { - store.inc(test_key.clone(), "node1".to_string(), 5); - - // Should not increment if not owner - let value = store.value(&test_key); - assert_eq!(value, None); - } - } - - #[test] - fn test_rate_limit_store_merge_counter() { - let store1 = RateLimitStore::new("node1".to_string()); - let store2 = RateLimitStore::new("node2".to_string()); - - store1.update_membership(&["node1".to_string()]); - store2.update_membership(&["node2".to_string()]); - - let test_key = "test_key".to_string(); - - // Both nodes increment their counters - if store1.is_owner(&test_key) { - store1.inc(test_key.clone(), "node1".to_string(), 10); - } - - if store2.is_owner(&test_key) { - store2.inc(test_key.clone(), "node2".to_string(), 5); - } - - // Merge operation log from store2 into store1 - let log2 = store2.get_operation_log(); - store1.merge(&log2); - - // Get aggregated value (if node1 is owner) - if store1.is_owner(&test_key) { - let value = store1.value(&test_key); - assert_eq!(value, Some(15)); - } - } - - #[test] - fn test_rate_limit_store_check_ownership_transfer() { - let store = RateLimitStore::new("node1".to_string()); - - store.update_membership(&[ - "node1".to_string(), - "node2".to_string(), - "node3".to_string(), - ]); - - let test_key = "test_key".to_string(); - - // Setup a counter (if node1 is owner) - if store.is_owner(&test_key) { - store.inc(test_key.clone(), "node1".to_string(), 10); - } - - // Check ownership transfer when node2 fails - let affected = store.check_ownership_transfer(&["node2".to_string()]); - // Should detect if node2 was an owner - let _ = affected; - } - - #[test] - fn test_rate_limit_store_keys() { - let store = RateLimitStore::new("node1".to_string()); - - store.update_membership(&["node1".to_string()]); - - let key1 = "key1".to_string(); - let key2 = "key2".to_string(); - - if store.is_owner(&key1) { - store.inc(key1.clone(), "node1".to_string(), 1); - } - - if store.is_owner(&key2) { - store.inc(key2.clone(), "node1".to_string(), 1); - } - - let keys = store.keys(); - // Should contain keys where node1 is owner - let _ = keys; - } - - #[test] - fn test_state_stores_new() { - let stores = StateStores::new(); - assert_eq!(stores.membership.len(), 0); - assert_eq!(stores.app.len(), 0); - assert_eq!(stores.worker.len(), 0); - assert_eq!(stores.policy.len(), 0); - } - - #[test] - fn test_state_stores_with_self_name() { - let stores = StateStores::with_self_name("test_node".to_string()); - // Rate limit store should have the self_name - let test_key = "test_key".to_string(); - stores - .rate_limit - .update_membership(&["test_node".to_string()]); - assert!(stores.rate_limit.is_owner(&test_key)); - } -} diff --git a/crates/mesh/src/sync.rs b/crates/mesh/src/sync.rs deleted file mode 100644 index 76809b5c8..000000000 --- a/crates/mesh/src/sync.rs +++ /dev/null @@ -1,2374 +0,0 @@ -//! Mesh state synchronization module -//! -//! 
Handles synchronization of worker and policy states across mesh cluster nodes
-
-use std::{
-    fmt::Debug,
-    sync::{atomic::Ordering, Arc},
-};
-
-use parking_lot::RwLock;
-use tracing::{debug, warn};
-
-use super::{
-    service::gossip::NodeStatus,
-    stores::{
-        policy_key, tree_state_key, PolicyState, RateLimitConfig, StateStores, WorkerState,
-        GLOBAL_RATE_LIMIT_COUNTER_KEY, GLOBAL_RATE_LIMIT_KEY,
-    },
-    tree_ops::{
-        hash_node_path, hash_token_path, TenantDelta, TenantEvict, TenantInsert, TreeKey,
-        TreeOperation, TreeState, TreeStateDelta,
-    },
-};
-
-pub trait TreeStateSubscriber: Send + Sync + Debug {
-    fn apply_remote_tree_state(&self, model_id: &str, tree_state: &TreeState);
-
-    /// Apply lightweight tenant delta — inserts and evictions by hash.
-    /// Default: process global evictions only (where `node_path_hash == GLOBAL_EVICTION_HASH`).
-    /// Inserts require the actual tree to resolve hashes to nodes,
-    /// so they are dropped here; implementations that maintain a
-    /// hash→node index (e.g. `CacheAwarePolicy`) should override.
-    fn apply_tenant_delta(
-        &self,
-        model_id: &str,
-        _inserts: &[TenantInsert],
-        evictions: &[TenantEvict],
-    ) {
-        // Default: only convert global evictions (hash=GLOBAL_EVICTION_HASH)
-        // into Remove ops. Targeted evictions (non-zero hash) are skipped
-        // because we can't resolve the hash without a path index.
-        let global_evictions: Vec<&TenantEvict> = evictions
-            .iter()
-            .filter(|e| e.node_path_hash == crate::tree_ops::GLOBAL_EVICTION_HASH)
-            .collect();
-
-        if !global_evictions.is_empty() {
-            let mut tree_state = TreeState::new(model_id.to_string());
-            for evict in global_evictions {
-                tree_state.add_operation(TreeOperation::Remove(crate::tree_ops::TreeRemoveOp {
-                    tenant: evict.worker_url.clone(),
-                }));
-            }
-            self.apply_remote_tree_state(model_id, &tree_state);
-        }
-    }
-
-    /// Export the current tree state for a model from the live radix tree.
-    /// Used by `checkpoint_tree_states` to build periodic structure snapshots
-    /// WITHOUT accumulating full prompt text in memory on every request.
-    /// Returns None if the subscriber doesn't have a tree for this model.
-    fn export_tree_state(&self, _model_id: &str) -> Option<TreeState> {
-        None
-    }
-
-    /// Export a compact tree snapshot for a model from the live radix tree.
-    /// Returns a [`kv_index::snapshot::TreeSnapshot`] that encodes the tree
-    /// structure with shared prefixes — much smaller than the flat
-    /// `TreeState` returned by [`export_tree_state`].
-    ///
-    /// Used by `checkpoint_tree_states` to populate `tree_configs` for
-    /// Layer 2 periodic snapshots.
-    fn export_tree_snapshot(&self, _model_id: &str) -> Option<kv_index::snapshot::TreeSnapshot> {
-        None
-    }
-}
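// Editor's sketch (not from the original file): the override described in the
// `apply_tenant_delta` doc comment above, for a subscriber that keeps its own
// hash-to-node index so targeted inserts and evictions can be resolved instead
// of dropped. `NodeHandle` and its methods are hypothetical placeholders; only
// the trait and the `TenantInsert`/`TenantEvict` fields come from this crate.
#[derive(Debug)]
struct NodeHandle; // hypothetical stand-in for a reference into the live radix tree
impl NodeHandle {
    fn add_tenant(&self, _worker_url: &str) {}
    fn remove_tenant(&self, _worker_url: &str) {}
}

#[derive(Debug)]
struct IndexedSubscriber {
    // node_path_hash -> live tree node, maintained on every local insert
    index: RwLock<std::collections::HashMap<u64, NodeHandle>>,
}

impl TreeStateSubscriber for IndexedSubscriber {
    fn apply_remote_tree_state(&self, _model_id: &str, _tree_state: &TreeState) {
        // Full-state path elided in this sketch.
    }

    fn apply_tenant_delta(
        &self,
        _model_id: &str,
        inserts: &[TenantInsert],
        evictions: &[TenantEvict],
    ) {
        let index = self.index.read();
        for ins in inserts {
            // Unknown hashes are simply skipped; the next structure snapshot
            // repairs anything that was missed.
            if let Some(node) = index.get(&ins.node_path_hash) {
                node.add_tenant(&ins.worker_url);
            }
        }
        for ev in evictions {
            if let Some(node) = index.get(&ev.node_path_hash) {
                node.remove_tenant(&ev.worker_url);
            }
        }
    }
}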
-
-pub trait WorkerStateSubscriber: Send + Sync + Debug {
-    fn on_remote_worker_state(&self, state: &WorkerState);
-}
-
-/// Mesh sync manager for coordinating state synchronization
-#[derive(Clone, Debug)]
-pub struct MeshSyncManager {
-    pub(crate) stores: Arc<StateStores>,
-    self_name: String,
-    tree_state_subscribers: Arc<RwLock<Vec<Arc<dyn TreeStateSubscriber>>>>,
-    worker_state_subscribers: Arc<RwLock<Vec<Arc<dyn WorkerStateSubscriber>>>>,
-}
-
-impl MeshSyncManager {
-    pub fn new(stores: Arc<StateStores>, self_name: String) -> Self {
-        Self {
-            stores,
-            self_name,
-            tree_state_subscribers: Arc::new(RwLock::new(Vec::new())),
-            worker_state_subscribers: Arc::new(RwLock::new(Vec::new())),
-        }
-    }
-
-    pub fn register_tree_state_subscriber(&self, subscriber: Arc<dyn TreeStateSubscriber>) {
-        self.tree_state_subscribers.write().push(subscriber);
-    }
-
-    fn notify_tree_state_subscribers(&self, model_id: &str, tree_state: &TreeState) {
-        let subscribers = self.tree_state_subscribers.read().clone();
-        for subscriber in subscribers {
-            subscriber.apply_remote_tree_state(model_id, tree_state);
-        }
-    }
-
-    pub fn register_worker_state_subscriber(&self, subscriber: Arc<dyn WorkerStateSubscriber>) {
-        self.worker_state_subscribers.write().push(subscriber);
-    }
-
-    fn notify_worker_state_subscribers(&self, state: &WorkerState) {
-        let subscribers = self.worker_state_subscribers.read().clone();
-        for subscriber in subscribers {
-            subscriber.on_remote_worker_state(state);
-        }
-    }
-
-    /// Get the node name (actor) for this sync manager
-    pub fn self_name(&self) -> &str {
-        &self.self_name
-    }
-
-    /// Sync worker state to mesh stores
-    pub fn sync_worker_state(
-        &self,
-        worker_id: String,
-        model_id: String,
-        url: String,
-        health: bool,
-        load: f64,
-        spec: Vec<u8>,
-    ) {
-        let key = worker_id.clone();
-
-        let updated_state = self.stores.worker.update(key, |current| {
-            let new_version = current
-                .map(|state| state.version)
-                .unwrap_or(0)
-                .saturating_add(1);
-
-            WorkerState {
-                worker_id: worker_id.clone(),
-                model_id,
-                url,
-                health,
-                load,
-                version: new_version,
-                spec,
-            }
-        });
-
-        match updated_state {
-            Ok(Some(state)) => {
-                debug!(
-                    "Synced worker state to mesh {} (version: {})",
-                    state.worker_id, state.version
-                );
-            }
-            Ok(None) => {}
-            Err(err) => {
-                debug!(error = %err, worker_id = %worker_id, "Failed to sync worker state");
-            }
-        }
-    }
-
-    /// Remove worker state from mesh stores
-    pub fn remove_worker_state(&self, worker_id: &str) {
-        self.stores.worker.remove(worker_id);
-        debug!("Removed worker state from mesh {}", worker_id);
-    }
-
-    /// Sync policy state to mesh stores
-    pub fn sync_policy_state(&self, model_id: String, policy_type: String, config: Vec<u8>) {
-        let key = policy_key(&model_id);
-        let model_id_for_update = model_id.clone();
-
-        let updated_state = self.stores.policy.update(key, move |current| {
-            let new_version = current
-                .map(|state| state.version)
-                .unwrap_or(0)
-                .saturating_add(1);
-
-            PolicyState {
-                model_id: model_id_for_update,
-                policy_type,
-                config,
-                version: new_version,
-            }
-        });
-
-        match updated_state {
-            Ok(Some(state)) => {
-                debug!(
-                    "Synced policy state to mesh model={} (version: {})",
-                    state.model_id, state.version
-                );
-            }
-            Ok(None) => {}
-            Err(err) => {
-                debug!(error = %err, model_id = %model_id, "Failed to sync policy state");
-            }
-        }
-    }
-
-    /// Remove policy state from mesh stores
-    pub fn remove_policy_state(&self, model_id: &str) {
-        let key = policy_key(model_id);
-        self.stores.policy.remove(&key);
-        debug!("Removed policy state from mesh model={}", model_id);
-    }
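    // Editor's sketch (not part of the original file): the local-write flow
    // through the two sync_* methods above. Each call bumps the per-key
    // version by one; the gossip collector uses that version to decide what
    // still needs to be sent. "worker1" and the URLs are illustrative.
    fn example_local_writes(manager: &MeshSyncManager) {
        manager.sync_worker_state(
            "worker1".to_string(),
            "model1".to_string(),
            "http://localhost:8000".to_string(),
            true,   // health
            0.25,   // load
            vec![], // spec bytes
        );
        // Re-syncing the same worker_id produces version 2, not a new entry.
        manager.sync_worker_state(
            "worker1".to_string(),
            "model1".to_string(),
            "http://localhost:8000".to_string(),
            true,
            0.40,
            vec![],
        );
        assert_eq!(
            manager.get_worker_state("worker1").map(|s| s.version),
            Some(2)
        );
    }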
-
-    /// Get worker state from mesh stores
-    pub fn get_worker_state(&self, worker_id: &str) -> Option<WorkerState> {
-        self.stores.worker.get(worker_id)
-    }
-
-    /// Get all worker states from mesh stores
-    pub fn get_all_worker_states(&self) -> Vec<WorkerState> {
-        self.stores.worker.all().into_values().collect()
-    }
-
-    /// Get policy state from mesh stores
-    pub fn get_policy_state(&self, model_id: &str) -> Option<PolicyState> {
-        let key = policy_key(model_id);
-        self.stores.policy.get(&key)
-    }
-
-    /// Get all policy states from mesh stores
-    pub fn get_all_policy_states(&self) -> Vec<PolicyState> {
-        self.stores.policy.all().into_values().collect()
-    }
-
-    /// Apply worker state update from remote node
-    /// The actor should be extracted from the state update context (e.g., from StateUpdate message)
-    pub fn apply_remote_worker_state(&self, state: WorkerState, actor: Option<String>) {
-        let key = state.worker_id.clone();
-        let actor = actor.unwrap_or_else(|| "remote".to_string());
-        let mut current_version = 0;
-
-        let update_result = self.stores.worker.update_if(key, |current| {
-            current_version = current
-                .as_ref()
-                .map(|existing| existing.version)
-                .unwrap_or(0);
-            if state.version > current_version {
-                Some(state.clone())
-            } else {
-                None
-            }
-        });
-
-        match update_result {
-            Ok((_, true)) => {
-                debug!(
-                    "Applied remote worker state update: {} (version: {} -> {})",
-                    state.worker_id, current_version, state.version
-                );
-                self.notify_worker_state_subscribers(&state);
-            }
-            Ok((_, false)) => {
-                debug!(
-                    "Skipped remote worker state update: {} (version {} <= current {})",
-                    state.worker_id, state.version, current_version
-                );
-            }
-            Err(err) => {
-                debug!(error = %err, worker_id = %state.worker_id, actor = %actor, "Failed to apply remote worker state update");
-            }
-        }
-    }
-
-    /// Apply policy state update from remote node
-    /// The actor should be extracted from the state update context (e.g., from StateUpdate message)
-    pub fn apply_remote_policy_state(&self, state: PolicyState, actor: Option<String>) {
-        let key = policy_key(&state.model_id);
-        let actor = actor.unwrap_or_else(|| "remote".to_string());
-        let mut current_version = 0;
-
-        let update_result = self.stores.policy.update_if(key, |current| {
-            current_version = current
-                .as_ref()
-                .map(|existing| existing.version)
-                .unwrap_or(0);
-            if state.version > current_version {
-                Some(state.clone())
-            } else {
-                None
-            }
-        });
-
-        match update_result {
-            Ok((_, true)) => {
-                debug!(
-                    "Applied remote policy state update: {} (version: {} -> {})",
-                    state.model_id, current_version, state.version
-                );
-            }
-            Ok((_, false)) => {
-                debug!(
-                    "Skipped remote policy state update: {} (version {} <= current {})",
-                    state.model_id, state.version, current_version
-                );
-            }
-            Err(err) => {
-                debug!(error = %err, model_id = %state.model_id, actor = %actor, "Failed to apply remote policy state update");
-            }
-        }
-    }
-
-    /// Update rate-limit hash ring with current membership
-    pub fn update_rate_limit_membership(&self) {
-        // Get all alive nodes from membership store
-        let all_members = self.stores.membership.all();
-        let alive_nodes: Vec<String> = all_members
-            .values()
-            .filter(|m| m.status == NodeStatus::Alive as i32)
-            .map(|m| m.name.clone())
-            .collect();
-
-        self.stores.rate_limit.update_membership(&alive_nodes);
-        debug!(
-            "Updated rate-limit hash ring with {} alive nodes",
-            alive_nodes.len()
-        );
-    }
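    // Editor's sketch (not part of the original file): how the hash ring
    // rebuilt by update_rate_limit_membership above gates counter writes.
    // The key name is illustrative; sync_rate_limit_inc (below) silently
    // drops the increment on nodes that do not own the key.
    fn example_ownership_gated_inc(manager: &MeshSyncManager) {
        // Rebuild the ring from the membership store (alive nodes only).
        manager.update_rate_limit_membership();

        // Owners apply the increment locally and gossip it; non-owners no-op.
        manager.sync_rate_limit_inc("rate:tenant_a".to_string(), 1);
    }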
-
-    /// Handle node failure and transfer rate-limit ownership
-    pub fn handle_node_failure(&self, failed_nodes: &[String]) {
-        if failed_nodes.is_empty() {
-            return;
-        }
-
-        debug!("Handling node failure for rate-limit: {:?}", failed_nodes);
-
-        // Check which keys need ownership transfer
-        let affected_keys = self
-            .stores
-            .rate_limit
-            .check_ownership_transfer(failed_nodes);
-
-        if !affected_keys.is_empty() {
-            debug!(
-                "Ownership transfer needed for {} rate-limit keys",
-                affected_keys.len()
-            );
-
-            // Update membership to reflect node failures
-            self.update_rate_limit_membership();
-
-            // For each affected key, we may need to initialize counters if we're now an owner
-            for key in &affected_keys {
-                if self.stores.rate_limit.is_owner(key) {
-                    debug!("This node is now owner of rate-limit key: {}", key);
-                    // Counter will be created on first inc() call
-                }
-            }
-        }
-    }
-
-    /// Sync rate-limit counter increment (only if this node is an owner)
-    pub fn sync_rate_limit_inc(&self, key: String, delta: i64) {
-        if !self.stores.rate_limit.is_owner(&key) {
-            // Not an owner, skip
-            return;
-        }
-
-        self.stores
-            .rate_limit
-            .inc(key.clone(), self.self_name.clone(), delta);
-        debug!("Synced rate-limit increment: key={}, delta={}", key, delta);
-    }
-
-    /// Apply remote rate-limit counter update (merge CRDT)
-    pub fn apply_remote_rate_limit_counter(&self, log: &super::crdt_kv::OperationLog) {
-        // Merge operation log regardless of ownership (for CRDT consistency)
-        self.stores.rate_limit.merge(log);
-        debug!("Applied remote rate-limit counter update");
-    }
-
-    /// Apply remote rate-limit counter snapshot encoded as raw i64.
-    pub fn apply_remote_rate_limit_counter_value(&self, key: String, counter_value: i64) {
-        self.apply_remote_rate_limit_counter_value_with_actor_and_timestamp(
-            key,
-            "remote".to_string(),
-            counter_value,
-            0,
-        );
-    }
-
-    pub fn apply_remote_rate_limit_counter_value_with_actor(
-        &self,
-        key: String,
-        actor: String,
-        counter_value: i64,
-    ) {
-        self.apply_remote_rate_limit_counter_value_with_actor_and_timestamp(
-            key,
-            actor,
-            counter_value,
-            0,
-        );
-    }
-
-    pub fn apply_remote_rate_limit_counter_value_with_actor_and_timestamp(
-        &self,
-        key: String,
-        actor: String,
-        counter_value: i64,
-        timestamp: u64,
-    ) {
-        if let Some((shard_key, payload)) =
-            super::stores::RateLimitStore::snapshot_payload_for_counter_value(
-                key,
-                actor.clone(),
-                counter_value,
-            )
-        {
-            self.stores
-                .rate_limit
-                .apply_counter_snapshot_payload(shard_key, &actor, timestamp, &payload);
-            debug!("Applied remote rate-limit counter snapshot payload");
-        }
-    }
-
-    /// Get rate-limit value (aggregate from all owners)
-    pub fn get_rate_limit_value(&self, key: &str) -> Option<i64> {
-        self.stores.rate_limit.value(key)
-    }
-
-    /// Get global rate limit configuration from AppStore
-    pub fn get_global_rate_limit_config(&self) -> Option<RateLimitConfig> {
-        self.stores
-            .app
-            .get(GLOBAL_RATE_LIMIT_KEY)
-            .and_then(|app_state| bincode::deserialize::<RateLimitConfig>(&app_state.value).ok())
-    }
-
-    /// Check if global rate limit is exceeded
-    /// Returns (is_exceeded, current_count, limit)
-    pub fn check_global_rate_limit(&self) -> (bool, i64, u64) {
-        let config = self.get_global_rate_limit_config().unwrap_or_default();
-
-        if config.limit_per_second == 0 {
-            // Rate limit disabled
-            return (false, 0, 0);
-        }
-
-        // Increment counter if this node is an owner
-        self.sync_rate_limit_inc(GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(), 1);
-
-        // Get aggregated counter value from all owners
-        let current_count = self
-            .get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY)
-            .unwrap_or(0);
-
-        let is_exceeded = current_count > config.limit_per_second as i64;
-        (is_exceeded, current_count, config.limit_per_second)
-    }
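    // Editor's sketch (not part of the original file): using
    // check_global_rate_limit above as an admission gate. The tuple is
    // (is_exceeded, current_count, limit); a configured limit of 0 disables
    // the check. Mapping the error to an HTTP 429 is up to the caller.
    fn example_admit_request(manager: &MeshSyncManager) -> Result<(), String> {
        let (exceeded, count, limit) = manager.check_global_rate_limit();
        if exceeded {
            return Err(format!("rate limited: {count} > {limit}"));
        }
        Ok(())
    }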
-
-    /// Reset global rate limit counter (called periodically for time window reset)
-    pub fn reset_global_rate_limit_counter(&self) {
-        // Reset by decrementing the current value
-        // Since we use PNCounter, we can't directly reset, but we can track the window
-        // For simplicity, we'll use a time-based approach where counters are reset periodically
-        // The actual reset logic will be handled by the window manager
-        let current_count = self
-            .get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY)
-            .unwrap_or(0);
-
-        if current_count > 0 {
-            // Decrement by current count to effectively reset
-            // Note: This is a workaround since PNCounter doesn't support direct reset
-            // In production, you might want to use a different approach like timestamped counters
-            self.sync_rate_limit_inc(GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(), -current_count);
-        }
-    }
-
-    /// Sync tree operation to mesh stores.
-    ///
-    /// This is called on every request (hot path). The operation is appended to
-    /// the pending buffer for delta sync — the collector serializes and sends it
-    /// to peers. We do NOT read/deserialize/serialize the full TreeState here,
-    /// because that is O(tree_size) per request and caused multi-GB memory usage
-    /// at 200+ rps.
-    ///
-    /// The policy store version is bumped so the generation-based collector
-    /// detects the change, but the `config` blob is NOT updated on every call.
-    /// It is rebuilt lazily by the collector when a full-state fallback is needed.
-    /// Lightweight sync: accepts a pre-computed hash + tenant, avoiding
-    /// the 80k+ String allocation from TreeKey::Text on every request.
-    pub fn sync_tree_insert_hash(&self, model_id: &str, path_hash: u64, tenant: &str) {
-        let key = tree_state_key(model_id);
-
-        self.stores
-            .tenant_delta_inserts
-            .entry(model_id.to_string())
-            .or_default()
-            .push(TenantInsert {
-                node_path_hash: path_hash,
-                worker_url: tenant.to_string(),
-                epoch: self.stores.tree_version(&key),
-            });
-
-        self.stores.bump_tree_version(&key);
-    }
-
-    #[expect(
-        clippy::unnecessary_wraps,
-        reason = "Public API — callers handle Result; changing return type is a cross-crate break"
-    )]
-    pub fn sync_tree_operation(
-        &self,
-        model_id: String,
-        operation: TreeOperation,
-    ) -> Result<(), String> {
-        let key = tree_state_key(&model_id);
-
-        // Buffer a lightweight tenant delta — 24 bytes per insert (hash + epoch)
-        // instead of 80k+ bytes (full prompt text).
-        match &operation {
-            TreeOperation::Insert(insert) => {
-                let path_hash = match &insert.key {
-                    TreeKey::Text(text) => hash_node_path(text),
-                    TreeKey::Tokens(tokens) => hash_token_path(tokens),
-                };
-                self.stores
-                    .tenant_delta_inserts
-                    .entry(model_id.clone())
-                    .or_default()
-                    .push(TenantInsert {
-                        node_path_hash: path_hash,
-                        worker_url: insert.tenant.clone(),
-                        epoch: self.stores.tree_version(&key),
-                    });
-            }
-            TreeOperation::Remove(remove) => {
-                // TODO: capture the specific prefix hash being evicted.
-                // For now, 0 means "evict from all nodes" (global eviction).
-                // This is overly aggressive but correct — the next structure
-                // snapshot will restore any wrongly evicted entries.
-                self.stores
-                    .tenant_delta_evictions
-                    .entry(model_id.clone())
-                    .or_default()
-                    .push(TenantEvict {
-                        node_path_hash: crate::tree_ops::GLOBAL_EVICTION_HASH,
-                        worker_url: remove.tenant.clone(),
-                    });
-            }
-        }
-
-        // NOTE: We intentionally do NOT push to tree_ops_pending here.
-        // That would store the full TreeOperation (including 20KB+ prompt text)
-        // on every request — 40MB between checkpoints at 200 rps.
-        // Instead, checkpoint_tree_states exports the live tree via subscribers.
-
-        // Bump the lightweight atomic version counter (O(1), no serialization).
-        self.stores.bump_tree_version(&key);
-
-        Ok(())
-    }
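    // Editor's sketch (not part of the original file): the hot-path variant
    // described above. The caller hashes the matched prefix once and hands
    // over only the 8-byte hash, so each request buffers a small TenantInsert
    // instead of an owned copy of the prompt text. The argument names are
    // illustrative; hash_node_path is the same helper sync_tree_operation uses.
    fn example_record_insert(
        manager: &MeshSyncManager,
        model_id: &str,
        matched_text: &str,
        worker_url: &str,
    ) {
        let path_hash = hash_node_path(matched_text);
        manager.sync_tree_insert_hash(model_id, path_hash, worker_url);
    }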
-
-    /// Load the materialized TreeState from `tree_configs`.
-    /// Returns None if no checkpoint exists for this key.
-    ///
-    /// Handles two storage formats:
-    /// - `TreeState` bytes (from remote full-state updates)
-    /// - `TreeSnapshot` bytes (from local `checkpoint_tree_states`)
-    fn materialize_tree_state(&self, key: &str, model_id: &str) -> Option<TreeState> {
-        let config_bytes = self.stores.tree_configs.get(key)?;
-        let bytes = config_bytes.value();
-        if bytes.is_empty() {
-            return Some(TreeState::new(model_id.to_string()));
-        }
-        // Try TreeState first (remote full-state updates store this format).
-        if let Ok(ts) = TreeState::from_bytes(bytes) {
-            return Some(ts);
-        }
-        // Fall back to TreeSnapshot (local checkpoint format).
-        if let Ok(snap) = kv_index::snapshot::TreeSnapshot::from_bytes(bytes) {
-            let version = self.stores.tree_version(key);
-            return Some(TreeState::from_snapshot(
-                model_id.to_string(),
-                &snap,
-                version,
-            ));
-        }
-        None
-    }
-
-    /// Get tree state for a model from mesh stores.
-    /// Reads from `tree_configs` (populated by periodic checkpoint from live tree).
-    pub fn get_tree_state(&self, model_id: &str) -> Option<TreeState> {
-        let key = tree_state_key(model_id);
-        self.materialize_tree_state(&key, model_id)
-    }
-
-    pub fn get_all_tree_states(&self) -> Vec<TreeState> {
-        let mut results = Vec::new();
-
-        for entry in &self.stores.tree_configs {
-            let key = entry.key().clone();
-            let model_id = key.strip_prefix("tree:").unwrap_or(&key).to_string();
-            if let Some(ts) = self.materialize_tree_state(&key, &model_id) {
-                results.push(ts);
-            }
-        }
-
-        results
-    }
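    // Editor's sketch (not part of the original file): the DashMap::entry()
    // read-modify-write pattern that apply_remote_tree_operation below relies
    // on. The version check and the insert happen under one shard lock, so no
    // writer can slip in between a get() and an insert() (the TOCTOU gap the
    // doc comment mentions). Plain std/dashmap types, not the crate's stores.
    fn put_if_newer(
        map: &dashmap::DashMap<String, (u64, Vec<u8>)>,
        key: &str,
        version: u64,
        bytes: Vec<u8>,
    ) -> bool {
        use dashmap::mapref::entry::Entry;
        match map.entry(key.to_string()) {
            Entry::Occupied(mut entry) => {
                if version > entry.get().0 {
                    entry.insert((version, bytes));
                    true
                } else {
                    false
                }
            }
            Entry::Vacant(entry) => {
                entry.insert((version, bytes));
                true
            }
        }
    }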
-
-    /// Apply remote tree operation to local stores.
-    /// This is called when receiving full tree state updates from other nodes.
-    ///
-    /// Writes to `tree_configs` (plain DashMap) instead of the CRDT policy
-    /// store to avoid operation log memory accumulation.
-    ///
-    /// Uses `DashMap::entry()` for atomic read-modify-write on `tree_configs`
-    /// to avoid the TOCTOU gap between `get()` and `insert()`.
-    pub fn apply_remote_tree_operation(
-        &self,
-        model_id: String,
-        tree_state: TreeState,
-        actor: Option<String>,
-    ) {
-        use dashmap::mapref::entry::Entry;
-
-        let key = tree_state_key(&model_id);
-        let _actor = actor.unwrap_or_else(|| "remote".to_string());
-
-        let serialized = match tree_state.to_bytes() {
-            Ok(bytes) => bytes,
-            Err(err) => {
-                debug!(error = %err, model_id = %model_id, "Failed to serialize remote tree state");
-                return;
-            }
-        };
-
-        // Atomic read-modify-write via entry() — version check and insert
-        // happen under the same shard lock, closing the TOCTOU gap.
-        let applied = match self.stores.tree_configs.entry(key.clone()) {
-            Entry::Occupied(mut entry) => {
-                // tree_configs may hold TreeState bytes (from remote) or
-                // TreeSnapshot bytes (from local checkpoint). Fall back to
-                // the authoritative atomic version counter if deserialization fails.
-                let current_version = TreeState::from_bytes(entry.get())
-                    .ok()
-                    .map(|ts| ts.version)
-                    .unwrap_or_else(|| self.stores.tree_version(&key));
-                if tree_state.version > current_version {
-                    entry.insert(serialized);
-                    debug!(
-                        "Applied remote tree state update: model={} (version: {} -> {})",
-                        model_id, current_version, tree_state.version
-                    );
-                    true
-                } else {
-                    debug!(
-                        "Skipped remote tree state update: model={} (version {} <= current {})",
-                        model_id, tree_state.version, current_version
-                    );
-                    false
-                }
-            }
-            Entry::Vacant(entry) => {
-                entry.insert(serialized);
-                debug!(
-                    "Applied remote tree state update (new): model={} (version: {})",
-                    model_id, tree_state.version
-                );
-                true
-            }
-        };
-
-        // Subscriber notification and version advancement happen after
-        // dropping the entry (shard lock released).
-        if applied {
-            self.stores.advance_tree_version(&key, tree_state.version);
-            self.stores.tree_generation.fetch_add(1, Ordering::Release);
-            self.notify_tree_state_subscribers(&model_id, &tree_state);
-        }
-    }
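    // Editor's sketch (not part of the original file): the version gate that
    // apply_remote_tree_delta below enforces. A delta labeled
    // [base_version -> new_version] is applied only if it starts at or before
    // the local version AND moves it forward, which rejects gapped, stale,
    // and duplicate deliveries alike (see the tests further down).
    fn delta_applicable(base_version: u64, new_version: u64, current: u64) -> bool {
        base_version <= current && current < new_version
    }
    // With current = 11: [5 -> 12] applies, [20 -> 21] is a gap, [0 -> 3] is stale.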
-
-    /// Apply a delta (incremental operations) from a remote node.
-    /// Merges the delta operations into the existing local tree state,
-    /// avoiding the cost of replacing the entire tree state on every sync.
-    ///
-    /// Uses `DashMap::entry()` for atomic read-modify-write on `tree_configs`
-    /// to avoid the TOCTOU gap between `get()` and `insert()`.
-    pub fn apply_remote_tree_delta(&self, delta: TreeStateDelta, actor: Option<String>) {
-        use dashmap::mapref::entry::Entry;
-
-        let key = tree_state_key(&delta.model_id);
-        let _actor = actor.unwrap_or_else(|| "remote".to_string());
-        let model_id = delta.model_id.clone();
-        let ops_count = delta.operations.len();
-
-        // Perform the atomic read-modify-write inside the entry block.
-        // Tree construction and serialization happen while holding the
-        // shard write lock; subscriber notification happens after.
-        let result: Option<(TreeState, u64)> = match self.stores.tree_configs.entry(key.clone()) {
-            Entry::Occupied(mut entry) => {
-                let bytes = entry.get();
-                let current_version = if bytes.is_empty() {
-                    0
-                } else {
-                    match TreeState::from_bytes(bytes) {
-                        Ok(ts) => ts.version,
-                        Err(_) => 0,
-                    }
-                };
-
-                // Version checks
-                if delta.base_version > current_version || current_version >= delta.new_version {
-                    debug!(
-                        "Skipped remote tree delta: model={} (base_version={}, new_version={}, current={})",
-                        model_id, delta.base_version, delta.new_version, current_version
-                    );
-                    return;
-                }
-
-                // Build base tree from config only.
-                let mut tree_state = if bytes.is_empty() {
-                    if current_version > 0 {
-                        debug!(
-                            "Skipped remote tree delta: model={} (base_version={}, new_version={}, current={})",
-                            model_id, delta.base_version, delta.new_version, current_version
-                        );
-                        return;
-                    }
-                    TreeState::new(delta.model_id.clone())
-                } else {
-                    match TreeState::from_bytes(bytes) {
-                        Ok(state) => state,
-                        Err(err) => {
-                            warn!(
-                                model_id = %delta.model_id,
-                                error = %err,
-                                "Corrupted tree state — rejecting delta to avoid data loss"
-                            );
-                            return;
-                        }
-                    }
-                };
-
-                let old_version = current_version;
-                for op in &delta.operations {
-                    tree_state.add_operation(op.clone());
-                }
-                let new_version = tree_state.version;
-
-                match tree_state.to_bytes() {
-                    Ok(serialized) => {
-                        entry.insert(serialized);
-                        debug!(
-                            "Applied remote tree delta: model={} (version: {} -> +{} ops)",
-                            model_id, old_version, ops_count
-                        );
-                        Some((tree_state, new_version))
-                    }
-                    Err(err) => {
-                        debug!(error = %err, model_id = %model_id, "Failed to serialize tree state after delta apply");
-                        None
-                    }
-                }
-            }
-            Entry::Vacant(entry) => {
-                // No existing config — new tree from delta.
-                if delta.base_version > 0 {
-                    debug!(
-                        "Skipped remote tree delta: model={} (base_version={}, new_version={}, no local state)",
-                        model_id, delta.base_version, delta.new_version
-                    );
-                    return;
-                }
-                let mut tree_state = TreeState::new(delta.model_id.clone());
-                for op in &delta.operations {
-                    tree_state.add_operation(op.clone());
-                }
-                let new_version = tree_state.version;
-
-                match tree_state.to_bytes() {
-                    Ok(serialized) => {
-                        entry.insert(serialized);
-                        debug!(
-                            "Applied remote tree delta (new tree): model={} (+{} ops)",
-                            model_id, ops_count
-                        );
-                        Some((tree_state, new_version))
-                    }
-                    Err(err) => {
-                        debug!(error = %err, model_id = %model_id, "Failed to serialize new tree state from delta");
-                        None
-                    }
-                }
-            }
-        };
-
-        // Notification happens outside the entry block (shard lock released).
-        if let Some((tree_state, new_version)) = result {
-            self.stores.advance_tree_version(&key, new_version);
-            self.stores.tree_generation.fetch_add(1, Ordering::Release);
-            self.notify_tree_state_subscribers(&model_id, &tree_state);
-        }
-    }
-
-    /// Apply a lightweight tenant delta from a remote node.
-    /// Updates the local radix tree directly via subscribers without
-    /// going through the CRDT or the full TreeState machinery.
-    pub fn apply_remote_tenant_delta(&self, delta: TenantDelta, _actor: Option<String>) {
-        let key = tree_state_key(&delta.model_id);
-
-        if delta.inserts.is_empty() && delta.evictions.is_empty() {
-            return;
-        }
-
-        // No version check — both routers independently bump tree_version
-        // on local inserts, so the remote delta's version can be lower than
-        // the local version even though it contains novel inserts. Tenant
-        // inserts are idempotent (insert_text is a no-op if the tenant
-        // already exists at the node), so applying "stale" deltas is safe.
-
-        debug!(
-            model_id = %delta.model_id,
-            inserts = delta.inserts.len(),
-            evictions = delta.evictions.len(),
-            version = delta.version,
-            "Applying remote tenant delta"
-        );
-
-        // Clone subscriber list before calling back — same pattern as
-        // notify_tree_state_subscribers — so we don't hold the read guard
-        // during potentially expensive subscriber callbacks.
- let subscribers = self.tree_state_subscribers.read().clone(); - for subscriber in &subscribers { - subscriber.apply_tenant_delta(&delta.model_id, &delta.inserts, &delta.evictions); - } - - // Advance version and bump generation so collector re-scans - self.stores.advance_tree_version(&key, delta.version); - self.stores.tree_generation.fetch_add(1, Ordering::Release); - } - - /// Checkpoint tree state by exporting compact snapshots from the live - /// radix tree via subscribers. - /// - /// Called periodically (~every 10s) to keep `tree_configs` fresh for - /// the periodic structure snapshot (every 30 gossip rounds). - /// - /// Uses [`TreeStateSubscriber::export_tree_snapshot`] to obtain a - /// compact [`kv_index::snapshot::TreeSnapshot`] that preserves shared - /// prefixes. This is much smaller than the flat `TreeState` produced - /// by `export_tree_state` (~2-4 MB vs ~40 MB for 2048 entries sharing - /// 80% prefixes) and avoids accumulating full prompt text in memory. - #[expect( - clippy::unused_self, - reason = "Public API called by controller — removing &self is a breaking change" - )] - pub fn checkpoint_tree_states(&self) { - // FIXME: Layer 2 (full tree snapshots) is disabled because the - // snapshot can be 170+ MB for large trees with long prompts, and - // allocating it every 10s causes allocator fragmentation. Tree data - // currently syncs via Layer 1 only (tenant deltas, ~50 bytes each). - // TODO: implement chunked snapshots or incremental tree diffs so - // Layer 2 works for large trees without excessive memory allocation. - } -} - -#[cfg(test)] -mod tests { - use std::{ - collections::BTreeMap, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - }; - - use super::*; - use crate::{ - collector::CentralCollector, - service::gossip::StateUpdate, - stores::{ - AppState, MembershipState, RateLimitConfig, StateStores, StoreType, - GLOBAL_RATE_LIMIT_COUNTER_KEY, GLOBAL_RATE_LIMIT_KEY, - }, - }; - - /// Test-only helper: collect Policy updates via CentralCollector. - /// Skips PeerWatermark since these tests don't exercise watermark filtering. 
-    fn collect_policy_updates(stores: Arc<StateStores>, self_name: &str) -> Vec<StateUpdate> {
-        let central = CentralCollector::new(stores, self_name.to_string());
-        let batch = central.collect();
-        batch
-            .updates
-            .into_iter()
-            .find(|(t, _)| *t == StoreType::Policy)
-            .map(|(_, v)| v)
-            .unwrap_or_default()
-    }
-
-    fn create_test_sync_manager() -> MeshSyncManager {
-        let stores = Arc::new(StateStores::new());
-        MeshSyncManager::new(stores, "test_node".to_string())
-    }
-
-    fn create_test_manager(self_name: String) -> MeshSyncManager {
-        let stores = Arc::new(StateStores::with_self_name(self_name.clone()));
-        MeshSyncManager::new(stores, self_name)
-    }
-
-    #[derive(Debug)]
-    struct LockCheckingSubscriber {
-        manager: Arc<MeshSyncManager>,
-        can_acquire_write_lock: Arc<AtomicBool>,
-    }
-
-    impl TreeStateSubscriber for LockCheckingSubscriber {
-        fn apply_remote_tree_state(&self, _model_id: &str, _tree_state: &TreeState) {
-            let can_acquire_write_lock = self.manager.tree_state_subscribers.try_write().is_some();
-            self.can_acquire_write_lock
-                .store(can_acquire_write_lock, Ordering::SeqCst);
-        }
-    }
-
-    #[test]
-    fn test_sync_manager_new() {
-        let manager = create_test_sync_manager();
-        // Should create without panicking
-        assert_eq!(manager.get_all_worker_states().len(), 0);
-        assert_eq!(manager.get_all_policy_states().len(), 0);
-    }
-
-    #[test]
-    fn test_sync_worker_state() {
-        let manager = create_test_manager("node1".to_string());
-
-        manager.sync_worker_state(
-            "worker1".to_string(),
-            "model1".to_string(),
-            "http://localhost:8000".to_string(),
-            true,
-            0.5,
-            vec![],
-        );
-
-        let state = manager.get_worker_state("worker1").unwrap();
-        assert_eq!(state.worker_id, "worker1");
-        assert_eq!(state.model_id, "model1");
-        assert_eq!(state.url, "http://localhost:8000");
-        assert!(state.health);
-        assert_eq!(state.load, 0.5);
-        assert_eq!(state.version, 1);
-    }
-
-    #[test]
-    fn test_sync_multiple_worker_states() {
-        let manager = create_test_sync_manager();
-
-        manager.sync_worker_state(
-            "worker1".to_string(),
-            "model1".to_string(),
-            "http://localhost:8000".to_string(),
-            true,
-            0.5,
-            vec![],
-        );
-
-        manager.sync_worker_state(
-            "worker2".to_string(),
-            "model1".to_string(),
-            "http://localhost:8001".to_string(),
-            false,
-            0.8,
-            vec![],
-        );
-
-        manager.sync_worker_state(
-            "worker3".to_string(),
-            "model2".to_string(),
-            "http://localhost:8002".to_string(),
-            true,
-            0.3,
-            vec![],
-        );
-
-        let all_states = manager.get_all_worker_states();
-        assert_eq!(all_states.len(), 3);
-
-        let worker1 = manager.get_worker_state("worker1").unwrap();
-        assert_eq!(worker1.worker_id, "worker1");
-        assert!(worker1.health);
-
-        let worker2 = manager.get_worker_state("worker2").unwrap();
-        assert_eq!(worker2.worker_id, "worker2");
-        assert!(!worker2.health);
-
-        let worker3 = manager.get_worker_state("worker3").unwrap();
-        assert_eq!(worker3.worker_id, "worker3");
-        assert_eq!(worker3.model_id, "model2");
-    }
-
-    #[test]
-    fn test_sync_worker_state_version_increment() {
-        let manager = create_test_manager("node1".to_string());
-
-        manager.sync_worker_state(
-            "worker1".to_string(),
-            "model1".to_string(),
-            "http://localhost:8000".to_string(),
-            true,
-            0.5,
-            vec![],
-        );
-
-        let state1 = manager.get_worker_state("worker1").unwrap();
-        assert_eq!(state1.version, 1);
-
-        manager.sync_worker_state(
-            "worker1".to_string(),
-            "model1".to_string(),
-            "http://localhost:8000".to_string(),
-            false,
-            0.8,
-            vec![],
-        );
-
-        let state2 = manager.get_worker_state("worker1").unwrap();
-        assert_eq!(state2.version, 2);
-        assert!(!state2.health);
assert_eq!(state2.load, 0.8); - } - - #[test] - fn test_remove_worker_state() { - let manager = create_test_manager("node1".to_string()); - - manager.sync_worker_state( - "worker1".to_string(), - "model1".to_string(), - "http://localhost:8000".to_string(), - true, - 0.5, - vec![], - ); - - assert!(manager.get_worker_state("worker1").is_some()); - - manager.remove_worker_state("worker1"); - - assert!(manager.get_worker_state("worker1").is_none()); - assert_eq!(manager.get_all_worker_states().len(), 0); - } - - #[test] - fn test_remove_nonexistent_worker_state() { - let manager = create_test_sync_manager(); - - // Should not panic - manager.remove_worker_state("nonexistent"); - assert!(manager.get_worker_state("nonexistent").is_none()); - } - - #[test] - fn test_sync_policy_state() { - let manager = create_test_manager("node1".to_string()); - - manager.sync_policy_state( - "model1".to_string(), - "cache_aware".to_string(), - b"config_data".to_vec(), - ); - - let state = manager.get_policy_state("model1").unwrap(); - assert_eq!(state.model_id, "model1"); - assert_eq!(state.policy_type, "cache_aware"); - assert_eq!(state.config, b"config_data"); - assert_eq!(state.version, 1); - } - - #[test] - fn test_sync_multiple_policy_states() { - let manager = create_test_sync_manager(); - - manager.sync_policy_state( - "model1".to_string(), - "round_robin".to_string(), - b"config1".to_vec(), - ); - - manager.sync_policy_state( - "model2".to_string(), - "random".to_string(), - b"config2".to_vec(), - ); - - manager.sync_policy_state( - "model3".to_string(), - "consistent_hash".to_string(), - b"config3".to_vec(), - ); - - let all_states = manager.get_all_policy_states(); - assert_eq!(all_states.len(), 3); - - let policy1 = manager.get_policy_state("model1").unwrap(); - assert_eq!(policy1.model_id, "model1"); - assert_eq!(policy1.policy_type, "round_robin"); - - let policy2 = manager.get_policy_state("model2").unwrap(); - assert_eq!(policy2.model_id, "model2"); - assert_eq!(policy2.policy_type, "random"); - } - - #[test] - fn test_remove_policy_state() { - let manager = create_test_sync_manager(); - - manager.sync_policy_state( - "model1".to_string(), - "round_robin".to_string(), - b"config".to_vec(), - ); - - assert!(manager.get_policy_state("model1").is_some()); - - manager.remove_policy_state("model1"); - - assert!(manager.get_policy_state("model1").is_none()); - assert_eq!(manager.get_all_policy_states().len(), 0); - } - - #[test] - fn test_remove_nonexistent_policy_state() { - let manager = create_test_sync_manager(); - - // Should not panic - manager.remove_policy_state("nonexistent"); - assert!(manager.get_policy_state("nonexistent").is_none()); - } - - #[test] - fn test_apply_remote_worker_state() { - let manager = create_test_manager("node1".to_string()); - - // Apply remote state with higher version - let remote_state = WorkerState { - worker_id: "worker1".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8000".to_string(), - health: true, - load: 0.5, - version: 5, - spec: vec![], - }; - - manager.apply_remote_worker_state(remote_state.clone(), Some("node2".to_string())); - - let state = manager.get_worker_state("worker1").unwrap(); - assert_eq!(state.version, 5); - } - - #[test] - fn test_apply_remote_worker_state_basic() { - let manager = create_test_sync_manager(); - - let remote_state = WorkerState { - worker_id: "remote_worker1".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8000".to_string(), - health: true, - load: 0.6, - version: 1, - spec: 
vec![], - }; - - manager.apply_remote_worker_state(remote_state.clone(), None); - - let state = manager.get_worker_state("remote_worker1"); - assert!(state.is_some()); - let state = state.unwrap(); - assert_eq!(state.worker_id, "remote_worker1"); - assert_eq!(state.model_id, "model1"); - assert_eq!(state.url, "http://localhost:8000"); - assert!(state.health); - assert_eq!(state.load, 0.6); - } - - #[test] - fn test_apply_remote_worker_state_version_check() { - let manager = create_test_manager("node1".to_string()); - - // First insert local state - manager.sync_worker_state( - "worker1".to_string(), - "model1".to_string(), - "http://localhost:8000".to_string(), - true, - 0.5, - vec![], - ); - - // Try to apply older version - should be skipped - let old_state = WorkerState { - worker_id: "worker1".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8000".to_string(), - health: false, - load: 0.8, - version: 0, // Older version - spec: vec![], - }; - - manager.apply_remote_worker_state(old_state, Some("node2".to_string())); - - // Should still have version 1 - let state = manager.get_worker_state("worker1").unwrap(); - assert_eq!(state.version, 1); - assert!(state.health); // Not updated - } - - #[test] - fn test_apply_remote_policy_state() { - let manager = create_test_sync_manager(); - - let remote_state = PolicyState { - model_id: "model1".to_string(), - policy_type: "remote_policy".to_string(), - config: b"remote_config".to_vec(), - version: 1, - }; - - manager.apply_remote_policy_state(remote_state.clone(), None); - - let state = manager.get_policy_state("model1"); - assert!(state.is_some()); - let state = state.unwrap(); - assert_eq!(state.model_id, "model1"); - assert_eq!(state.policy_type, "remote_policy"); - assert_eq!(state.config, b"remote_config"); - } - - #[test] - fn test_mixed_local_and_remote_states() { - let manager = create_test_sync_manager(); - - // Add local worker - manager.sync_worker_state( - "local_worker".to_string(), - "model1".to_string(), - "http://localhost:8000".to_string(), - true, - 0.5, - vec![], - ); - - // Add remote worker - let remote_state = WorkerState { - worker_id: "remote_worker".to_string(), - model_id: "model1".to_string(), - url: "http://localhost:8001".to_string(), - health: true, - load: 0.7, - version: 1, - spec: vec![], - }; - manager.apply_remote_worker_state(remote_state, None); - - let all_states = manager.get_all_worker_states(); - assert_eq!(all_states.len(), 2); - - assert!(manager.get_worker_state("local_worker").is_some()); - assert!(manager.get_worker_state("remote_worker").is_some()); - } - - #[test] - fn test_update_worker_state() { - let manager = create_test_sync_manager(); - - // Initial state - manager.sync_worker_state( - "worker1".to_string(), - "model1".to_string(), - "http://localhost:8000".to_string(), - true, - 0.5, - vec![], - ); - - // Update state - manager.sync_worker_state( - "worker1".to_string(), - "model1".to_string(), - "http://localhost:8000".to_string(), - false, - 0.9, - vec![], - ); - - let state = manager.get_worker_state("worker1").unwrap(); - assert!(!state.health); - assert_eq!(state.load, 0.9); - assert_eq!(manager.get_all_worker_states().len(), 1); - } - - #[test] - fn test_update_policy_state() { - let manager = create_test_sync_manager(); - - // Initial state - manager.sync_policy_state( - "model1".to_string(), - "round_robin".to_string(), - b"config1".to_vec(), - ); - - // Update state - manager.sync_policy_state( - "model1".to_string(), - "random".to_string(), - 
b"config2".to_vec(), - ); - - let state = manager.get_policy_state("model1").unwrap(); - assert_eq!(state.policy_type, "random"); - assert_eq!(state.config, b"config2"); - assert_eq!(manager.get_all_policy_states().len(), 1); - } - - #[test] - fn test_get_all_worker_states_empty() { - let manager = create_test_sync_manager(); - let states = manager.get_all_worker_states(); - assert!(states.is_empty()); - } - - #[test] - fn test_get_all_policy_states_empty() { - let manager = create_test_sync_manager(); - let states = manager.get_all_policy_states(); - assert!(states.is_empty()); - } - - #[test] - fn test_update_rate_limit_membership() { - let manager = create_test_manager("node1".to_string()); - - // Add membership nodes - let _ = manager.stores.membership.insert( - "node1".to_string(), - MembershipState { - name: "node1".to_string(), - address: "127.0.0.1:8000".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: BTreeMap::new(), - }, - ); - - let _ = manager.stores.membership.insert( - "node2".to_string(), - MembershipState { - name: "node2".to_string(), - address: "127.0.0.1:8001".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: BTreeMap::new(), - }, - ); - - manager.update_rate_limit_membership(); - - // Check that hash ring was updated - let owners = manager.stores.rate_limit.get_owners("test_key"); - assert!(!owners.is_empty()); - } - - #[test] - fn test_handle_node_failure() { - let manager = create_test_manager("node1".to_string()); - - // Setup membership - let _ = manager.stores.membership.insert( - "node1".to_string(), - MembershipState { - name: "node1".to_string(), - address: "127.0.0.1:8000".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: BTreeMap::new(), - }, - ); - - let _ = manager.stores.membership.insert( - "node2".to_string(), - MembershipState { - name: "node2".to_string(), - address: "127.0.0.1:8001".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: BTreeMap::new(), - }, - ); - - manager.update_rate_limit_membership(); - - // Handle node failure - manager.handle_node_failure(&["node2".to_string()]); - - // Membership should be updated - manager.update_rate_limit_membership(); - } - - #[test] - fn test_sync_rate_limit_inc() { - let manager = create_test_manager("node1".to_string()); - - // Setup membership to make node1 an owner - manager - .stores - .rate_limit - .update_membership(&["node1".to_string()]); - - let test_key = "test_key".to_string(); - if manager.stores.rate_limit.is_owner(&test_key) { - manager.sync_rate_limit_inc(test_key.clone(), 5); - - let value = manager.get_rate_limit_value(&test_key); - assert_eq!(value, Some(5)); - } - } - - #[test] - fn test_sync_rate_limit_inc_non_owner() { - let manager = create_test_manager("node1".to_string()); - - // Setup membership without node1 - manager - .stores - .rate_limit - .update_membership(&["node2".to_string(), "node3".to_string()]); - - let test_key = "test_key".to_string(); - if !manager.stores.rate_limit.is_owner(&test_key) { - manager.sync_rate_limit_inc(test_key.clone(), 5); - - // Should not increment if not owner - let value = manager.get_rate_limit_value(&test_key); - assert_eq!(value, None); - } - } - - #[test] - fn test_get_global_rate_limit_config() { - let manager = create_test_manager("node1".to_string()); - - // Initially should be None - assert!(manager.get_global_rate_limit_config().is_none()); - - // Set config - let config = RateLimitConfig { - limit_per_second: 100, - }; - let serialized = 
bincode::serialize(&config).unwrap(); - let _ = manager.stores.app.insert( - GLOBAL_RATE_LIMIT_KEY.to_string(), - AppState { - key: GLOBAL_RATE_LIMIT_KEY.to_string(), - value: serialized, - version: 1, - }, - ); - - let retrieved = manager.get_global_rate_limit_config().unwrap(); - assert_eq!(retrieved.limit_per_second, 100); - } - - #[test] - fn test_check_global_rate_limit() { - let manager = create_test_manager("node1".to_string()); - - // Setup config - let config = RateLimitConfig { - limit_per_second: 10, - }; - let serialized = bincode::serialize(&config).unwrap(); - let _ = manager.stores.app.insert( - GLOBAL_RATE_LIMIT_KEY.to_string(), - AppState { - key: GLOBAL_RATE_LIMIT_KEY.to_string(), - value: serialized, - version: 1, - }, - ); - - // Setup membership - manager - .stores - .rate_limit - .update_membership(&["node1".to_string()]); - - // Check rate limit - let (is_exceeded, _current_count, limit) = manager.check_global_rate_limit(); - assert!(!is_exceeded); // First check should not exceed - assert_eq!(limit, 10); - - // Increment multiple times - for _ in 0..15 { - manager.check_global_rate_limit(); - } - - let (is_exceeded2, current_count2, _) = manager.check_global_rate_limit(); - // Should exceed after many increments - assert!(is_exceeded2 || current_count2 > 10); - } - - #[test] - fn test_reset_global_rate_limit_counter() { - let manager = create_test_manager("node1".to_string()); - - // Setup membership - manager - .stores - .rate_limit - .update_membership(&["node1".to_string()]); - - // Increment counter - if manager - .stores - .rate_limit - .is_owner(GLOBAL_RATE_LIMIT_COUNTER_KEY) - { - manager.sync_rate_limit_inc(GLOBAL_RATE_LIMIT_COUNTER_KEY.to_string(), 10); - let value = manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - assert!(value.is_some() && value.unwrap() > 0); - - // Reset - manager.reset_global_rate_limit_counter(); - let value_after = manager.get_rate_limit_value(GLOBAL_RATE_LIMIT_COUNTER_KEY); - // Should be reset (0 or negative) - assert!(value_after.is_none() || value_after.unwrap() <= 0); - } - } - - #[test] - fn test_sync_tree_operation() { - let manager = create_test_manager("node1".to_string()); - - use crate::tree_ops::{TreeInsertOp, TreeKey, TreeOperation}; - - let op = TreeOperation::Insert(TreeInsertOp { - key: TreeKey::Text("test_text".to_string()), - tenant: "http://localhost:8000".to_string(), - }); - - let result = manager.sync_tree_operation("model1".to_string(), op); - assert!(result.is_ok()); - - // sync_tree_operation no longer populates tree_configs (no subscribers - // in unit tests), so get_tree_state returns None. Instead, verify - // that the tenant delta was buffered. - let inserts = manager.stores.tenant_delta_inserts.get("model1").unwrap(); - assert_eq!(inserts.len(), 1); - assert_eq!(inserts[0].worker_url, "http://localhost:8000"); - assert_eq!(inserts[0].node_path_hash, hash_node_path("test_text")); - } - - #[test] - fn test_get_tree_state() { - let manager = create_test_manager("node1".to_string()); - - // Initially should be None - assert!(manager.get_tree_state("model1").is_none()); - - // sync_tree_operation only buffers tenant deltas and bumps the version - // counter — it does NOT populate tree_configs (that requires a - // subscriber-backed checkpoint). Verify get_tree_state returns None - // after sync, but the tenant delta was buffered. 
- use crate::tree_ops::{TreeInsertOp, TreeKey, TreeOperation}; - let op = TreeOperation::Insert(TreeInsertOp { - key: TreeKey::Text("test_text".to_string()), - tenant: "http://localhost:8000".to_string(), - }); - manager - .sync_tree_operation("model1".to_string(), op) - .unwrap(); - - // get_tree_state reads from tree_configs which is empty (no subscriber) - assert!(manager.get_tree_state("model1").is_none()); - // But the tenant delta insert was buffered - assert!(manager.stores.tenant_delta_inserts.get("model1").is_some()); - } - - #[test] - fn test_apply_remote_tree_operation() { - let manager = create_test_manager("node1".to_string()); - - use crate::tree_ops::{TreeInsertOp, TreeKey, TreeOperation, TreeState}; - - let mut tree_state = TreeState::new("model1".to_string()); - tree_state.version = 5; - tree_state.add_operation(TreeOperation::Insert(TreeInsertOp { - key: TreeKey::Text("remote_text".to_string()), - tenant: "http://localhost:8001".to_string(), - })); - // add_operation increments version, so version is now 6 - - manager.apply_remote_tree_operation( - "model1".to_string(), - tree_state, - Some("node2".to_string()), - ); - - let retrieved = manager.get_tree_state("model1").unwrap(); - assert_eq!(retrieved.version, 6); // add_operation increments version from 5 to 6 - assert_eq!(retrieved.operations.len(), 1); - } - - #[test] - fn test_notify_tree_state_subscribers_drops_lock_before_callback() { - let manager = Arc::new(create_test_manager("node1".to_string())); - let can_acquire_write_lock = Arc::new(AtomicBool::new(false)); - let subscriber = Arc::new(LockCheckingSubscriber { - manager: manager.clone(), - can_acquire_write_lock: can_acquire_write_lock.clone(), - }); - - manager.register_tree_state_subscriber(subscriber); - manager.notify_tree_state_subscribers("model1", &TreeState::new("model1".to_string())); - - assert!(can_acquire_write_lock.load(Ordering::SeqCst)); - } - - #[test] - fn test_get_all_tree_states() { - let manager = create_test_manager("node1".to_string()); - - // get_all_tree_states reads from tree_configs. In unit tests there are - // no subscribers, so sync_tree_operation won't populate tree_configs. - // Instead, insert TreeStates directly into tree_configs. 
-        let mut ts1 = TreeState::new("model1".to_string());
-        ts1.add_operation(make_insert_op("alpha", "http://localhost:8000"));
-        let mut ts2 = TreeState::new("model2".to_string());
-        ts2.add_operation(make_insert_op("beta", "http://localhost:8001"));
-
-        manager
-            .stores
-            .tree_configs
-            .insert("tree:model1".to_string(), ts1.to_bytes().unwrap());
-        manager
-            .stores
-            .tree_configs
-            .insert("tree:model2".to_string(), ts2.to_bytes().unwrap());
-
-        let mut states = manager.get_all_tree_states();
-        states.sort_by(|left, right| left.model_id.cmp(&right.model_id));
-
-        assert_eq!(states.len(), 2);
-        assert_eq!(states[0].model_id, "model1");
-        assert_eq!(states[1].model_id, "model2");
-    }
-
-    #[test]
-    fn test_get_all_worker_states() {
-        let manager = create_test_manager("node1".to_string());
-
-        manager.sync_worker_state(
-            "worker1".to_string(),
-            "model1".to_string(),
-            "http://localhost:8000".to_string(),
-            true,
-            0.5,
-            vec![],
-        );
-
-        manager.sync_worker_state(
-            "worker2".to_string(),
-            "model2".to_string(),
-            "http://localhost:8001".to_string(),
-            false,
-            0.8,
-            vec![],
-        );
-
-        let all_states = manager.get_all_worker_states();
-        assert_eq!(all_states.len(), 2);
-    }
-
-    #[test]
-    fn test_get_all_policy_states() {
-        let manager = create_test_manager("node1".to_string());
-
-        manager.sync_policy_state("model1".to_string(), "cache_aware".to_string(), vec![]);
-
-        manager.sync_policy_state("model2".to_string(), "round_robin".to_string(), vec![]);
-
-        let all_states = manager.get_all_policy_states();
-        assert_eq!(all_states.len(), 2);
-    }
-
-    // ── Delta encoding tests ────────────────────────────────────────────
-
-    use crate::tree_ops::{TreeInsertOp, TreeKey, TreeOperation, TreeRemoveOp, TreeStateDelta};
-
-    fn make_insert_op(text: &str, tenant: &str) -> TreeOperation {
-        TreeOperation::Insert(TreeInsertOp {
-            key: TreeKey::Text(text.to_string()),
-            tenant: tenant.to_string(),
-        })
-    }
-
-    fn make_delta(model_id: &str, ops: Vec<TreeOperation>, base: u64, new: u64) -> TreeStateDelta {
-        TreeStateDelta {
-            model_id: model_id.to_string(),
-            operations: ops,
-            base_version: base,
-            new_version: new,
-        }
-    }
-
-    #[test]
-    fn test_delta_basic_apply() {
-        let manager = create_test_manager("node1".to_string());
-
-        let ops = vec![
-            make_insert_op("a", "http://w1:8000"),
-            make_insert_op("b", "http://w2:8000"),
-            make_insert_op("c", "http://w3:8000"),
-        ];
-
-        let delta = make_delta("model1", ops, 0, 3);
-        manager.apply_remote_tree_delta(delta, Some("node2".to_string()));
-
-        let tree = manager.get_tree_state("model1").unwrap();
-        assert_eq!(tree.version, 3);
-        assert_eq!(tree.operations.len(), 3);
-    }
-
-    #[test]
-    fn test_delta_version_check_rejects_gap() {
-        let manager = create_test_manager("node1".to_string());
-
-        // Seed tree at version 10
-        let mut seed = TreeState::new("model1".to_string());
-        for i in 0..10 {
-            seed.add_operation(make_insert_op(&format!("seed_{i}"), "http://w:8000"));
-        }
-        assert_eq!(seed.version, 10);
-        manager.apply_remote_tree_operation("model1".to_string(), seed, Some("seed".to_string()));
-
-        // Delta with base_version=5 should be accepted (base <= current)
-        let delta_ok = make_delta("model1", vec![make_insert_op("ok", "http://w:8000")], 5, 11);
-        manager.apply_remote_tree_delta(delta_ok, None);
-        let tree = manager.get_tree_state("model1").unwrap();
-        assert_eq!(tree.version, 11);
-
-        // Delta with base_version=20 should be rejected (gap: base > current)
-        let delta_gap = make_delta(
-            "model1",
-            vec![make_insert_op("gap", "http://w:8000")],
-            20,
-            21,
-        );
-        manager.apply_remote_tree_delta(delta_gap, None);
-        let tree = manager.get_tree_state("model1").unwrap();
-        // Version should still be 11 — the gap delta was rejected
-        assert_eq!(tree.version, 11);
-    }
-
-    #[test]
-    fn test_delta_concurrent_apply() {
-        let manager = Arc::new(create_test_manager("node1".to_string()));
-
-        // Both deltas target the same empty tree. At least one must succeed,
-        // and the resulting version must reflect the applied operations.
-        let m1 = manager.clone();
-        let m2 = manager.clone();
-
-        let t1 = std::thread::spawn(move || {
-            let delta = make_delta("model1", vec![make_insert_op("t1", "http://w1:8000")], 0, 1);
-            m1.apply_remote_tree_delta(delta, Some("thread1".to_string()));
-        });
-
-        let t2 = std::thread::spawn(move || {
-            let delta = make_delta("model1", vec![make_insert_op("t2", "http://w2:8000")], 0, 1);
-            m2.apply_remote_tree_delta(delta, Some("thread2".to_string()));
-        });
-
-        t1.join().unwrap();
-        t2.join().unwrap();
-
-        // At least one delta should have been applied
-        let tree = manager.get_tree_state("model1").unwrap();
-        assert!(tree.version >= 1);
-        assert!(!tree.operations.is_empty());
-    }
-
-    #[test]
-    fn test_delta_empty_tree() {
-        let manager = create_test_manager("node1".to_string());
-
-        // No pre-existing tree for "new_model"
-        assert!(manager.get_tree_state("new_model").is_none());
-
-        let delta = make_delta(
-            "new_model",
-            vec![make_insert_op("first", "http://w1:8000")],
-            0,
-            1,
-        );
-        manager.apply_remote_tree_delta(delta, None);
-
-        let tree = manager.get_tree_state("new_model").unwrap();
-        assert_eq!(tree.model_id, "new_model");
-        assert_eq!(tree.version, 1);
-        assert_eq!(tree.operations.len(), 1);
-    }
-
-    #[test]
-    fn test_delta_notifies_subscribers() {
-        let manager = Arc::new(create_test_manager("node1".to_string()));
-        let notified = Arc::new(AtomicBool::new(false));
-
-        #[derive(Debug)]
-        struct FlagSubscriber(Arc<AtomicBool>);
-        impl TreeStateSubscriber for FlagSubscriber {
-            fn apply_remote_tree_state(&self, _model_id: &str, _tree_state: &TreeState) {
-                self.0.store(true, Ordering::SeqCst);
-            }
-        }
-
-        manager.register_tree_state_subscriber(Arc::new(FlagSubscriber(notified.clone())));
-
-        let delta = make_delta("model1", vec![make_insert_op("x", "http://w:8000")], 0, 1);
-        manager.apply_remote_tree_delta(delta, None);
-
-        assert!(
-            notified.load(Ordering::SeqCst),
-            "subscriber was not notified after delta apply"
-        );
-    }
-
-    #[test]
-    fn test_collector_sends_tenant_delta() {
-        use crate::tree_ops::TenantDelta;
-
-        let stores = Arc::new(StateStores::with_self_name("node1".to_string()));
-        let manager = MeshSyncManager::new(stores.clone(), "node1".to_string());
-
-        // Sync a tree operation — buffers a tenant insert
-        manager
-            .sync_tree_operation(
-                "model1".to_string(),
-                make_insert_op("hello world", "http://w:8000"),
-            )
-            .unwrap();
-
-        let updates = collect_policy_updates(stores.clone(), "node1");
-
-        assert!(!updates.is_empty(), "expected at least one policy update");
-
-        // The update should be a tenant delta (not full tree state)
-        let tree_update = updates
-            .iter()
-            .find(|u| u.key.starts_with("tree:"))
-            .expect("expected a tree key update");
-
-        let policy_state: PolicyState =
-            bincode::deserialize(&tree_update.value).expect("deserialize PolicyState");
-        assert_eq!(
-            policy_state.policy_type, "tenant_delta",
-            "expected tenant_delta, got {}",
-            policy_state.policy_type
-        );
-
-        // Verify the tenant delta deserializes and contains the insert
-        let delta = TenantDelta::from_bytes(&policy_state.config).expect("deserialize
TenantDelta"); - assert_eq!(delta.model_id, "model1"); - assert_eq!(delta.inserts.len(), 1); - assert_eq!(delta.inserts[0].worker_url, "http://w:8000"); - assert_eq!( - delta.inserts[0].node_path_hash, - hash_node_path("hello world") - ); - assert!(delta.evictions.is_empty()); - } - - #[test] - fn test_collector_falls_back_to_full_state() { - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - - // Directly insert a tree state into tree_configs WITHOUT going through - // sync_tree_operation (so tree_ops_pending is empty). This simulates - // a remote tree state received via apply_remote_tree_operation. - let mut tree = TreeState::new("model1".to_string()); - tree.add_operation(make_insert_op("direct", "http://w:8000")); - let serialized = tree.to_bytes().unwrap(); - stores - .tree_configs - .insert("tree:model1".to_string(), serialized); - // Advance tree version so the collector sees it as changed. - stores.advance_tree_version("tree:model1", tree.version); - // Bump tree_generation so the collector's tree_changed check fires. - stores.bump_tree_version("tree:model1"); - - let updates = collect_policy_updates(stores.clone(), "node1"); - - assert!(!updates.is_empty(), "expected at least one policy update"); - - let tree_update = updates - .iter() - .find(|u| u.key.starts_with("tree:")) - .expect("expected a tree key update"); - - // Since there are no pending ops, it should fall back to full PolicyState - let policy_state: PolicyState = - bincode::deserialize(&tree_update.value).expect("deserialize PolicyState"); - assert_eq!( - policy_state.policy_type, "tree_state_lz4", - "expected full state fallback, got delta" - ); - } - - // test_collector_buffer_survives_mark_sent removed: tested tree_ops_pending - // buffer survival across mark_sent calls, which is a dead code path now - // that sync_tree_operation no longer pushes to tree_ops_pending. - - #[test] - fn test_receiver_dispatches_delta_vs_full() { - let manager = create_test_manager("node1".to_string()); - - // 1. Apply via delta path - let delta = make_delta( - "model_d", - vec![make_insert_op("delta_op", "http://w:8000")], - 0, - 1, - ); - manager.apply_remote_tree_delta(delta, Some("remote".to_string())); - - let tree_d = manager.get_tree_state("model_d").unwrap(); - assert_eq!(tree_d.version, 1); - assert_eq!(tree_d.operations.len(), 1); - - // 2. 
Apply via full state path - let mut full_tree = TreeState::new("model_f".to_string()); - full_tree.add_operation(make_insert_op("full_op1", "http://w1:8000")); - full_tree.add_operation(make_insert_op("full_op2", "http://w2:8000")); - - manager.apply_remote_tree_operation( - "model_f".to_string(), - full_tree, - Some("remote".to_string()), - ); - - let tree_f = manager.get_tree_state("model_f").unwrap(); - assert_eq!(tree_f.version, 2); - assert_eq!(tree_f.operations.len(), 2); - } - - #[test] - fn test_delta_backward_compatible_full_state() { - let manager = create_test_manager("node1".to_string()); - - // Simulate receiving a full TreeState (the old, pre-delta format) - let mut old_format_tree = TreeState::new("legacy_model".to_string()); - old_format_tree.add_operation(make_insert_op("old1", "http://w:8000")); - old_format_tree.add_operation(make_insert_op("old2", "http://w:8000")); - - // The full-state path (apply_remote_tree_operation) should handle it - manager.apply_remote_tree_operation( - "legacy_model".to_string(), - old_format_tree.clone(), - Some("old_node".to_string()), - ); - - let tree = manager.get_tree_state("legacy_model").unwrap(); - assert_eq!(tree.version, old_format_tree.version); - assert_eq!(tree.operations.len(), 2); - assert_eq!(tree.model_id, "legacy_model"); - } - - // ── Edge-case delta encoding tests ───────────────────────────────── - - #[test] - fn test_delta_reconnect_falls_back_to_full_state() { - // Simulate a reconnected peer scenario: tree_configs has a materialized - // tree state but tenant delta buffers are empty. The collector should - // produce a full PolicyState (lz4-compressed), not a delta. - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - - // Directly insert a tree state into tree_configs (simulating a - // checkpoint that ran with real subscribers in production). - let mut tree = TreeState::new("model1".to_string()); - for i in 0..10 { - tree.add_operation(make_insert_op(&format!("op_{i}"), "http://w:8000")); - } - let serialized = tree.to_bytes().unwrap(); - stores - .tree_configs - .insert("tree:model1".to_string(), serialized); - stores.advance_tree_version("tree:model1", tree.version); - stores.bump_tree_version("tree:model1"); - - // Ensure tenant delta buffers are empty (simulating buffer drain) - stores.tenant_delta_inserts.remove("model1"); - stores.tenant_delta_evictions.remove("model1"); - - // Collect via v2 central collector (simulating reconnected peer) - let updates = collect_policy_updates(stores.clone(), "node1"); - - assert!(!updates.is_empty(), "expected at least one update"); - - let tree_update = updates - .iter() - .find(|u| u.key.starts_with("tree:")) - .expect("expected a tree key update"); - - let policy_state: PolicyState = - bincode::deserialize(&tree_update.value).expect("deserialize PolicyState"); - assert_eq!( - policy_state.policy_type, "tree_state_lz4", - "expected full state fallback when tenant delta buffers are empty, got: {}", - policy_state.policy_type - ); - } - - // test_delta_compaction_divergence removed: tested TreeState compaction - // via sync_tree_operation + get_tree_state, which relied on tree_ops_pending - // replay. sync_tree_operation no longer pushes to tree_ops_pending, and - // get_tree_state reads only from tree_configs (populated by subscribers). - - #[test] - fn test_delta_out_of_order_delivery() { - // Create tree at version 0. Apply delta [0→5], then apply stale - // delta [0→3]. 
The second delta should be rejected because the - // tree is already at version 5. - let manager = create_test_manager("node1".to_string()); - - let ops_1_to_5: Vec<_> = (1..=5) - .map(|i| make_insert_op(&format!("op_{i}"), "http://w:8000")) - .collect(); - let delta1 = make_delta("model1", ops_1_to_5, 0, 5); - manager.apply_remote_tree_delta(delta1, Some("peer_a".to_string())); - - let tree = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree.version, 5); - assert_eq!(tree.operations.len(), 5); - - // Late-arriving delta with lower new_version - let ops_1_to_3: Vec<_> = (1..=3) - .map(|i| make_insert_op(&format!("late_op_{i}"), "http://w:8000")) - .collect(); - let delta2 = make_delta("model1", ops_1_to_3, 0, 3); - manager.apply_remote_tree_delta(delta2, Some("peer_b".to_string())); - - // Tree should be unchanged — stale delta rejected - let tree_after = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree_after.version, 5); - assert_eq!(tree_after.operations.len(), 5); - } - - #[test] - fn test_delta_duplicate_delivery() { - // Apply the same delta twice. The second application must be a - // no-op because current version >= delta.new_version. - let manager = create_test_manager("node1".to_string()); - - let ops = vec![ - make_insert_op("dup1", "http://w:8000"), - make_insert_op("dup2", "http://w:8000"), - ]; - let delta = make_delta("model1", ops.clone(), 0, 2); - - manager.apply_remote_tree_delta(delta.clone(), Some("peer".to_string())); - let tree1 = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree1.version, 2); - assert_eq!(tree1.operations.len(), 2); - - // Second apply — duplicate - manager.apply_remote_tree_delta(delta, Some("peer".to_string())); - let tree2 = manager.get_tree_state("model1").unwrap(); - assert_eq!( - tree2.version, 2, - "duplicate delta should not change version" - ); - assert_eq!( - tree2.operations.len(), - 2, - "duplicate delta should not add extra ops" - ); - } - - #[test] - fn test_delta_split_brain_recovery() { - // Node A and Node B both start at version 5. - // A processes 3 ops (version 8). B has the seed at version 5 - // in tree_configs (local ops via sync_tree_operation only bump - // the atomic counter, not tree_configs). - // A sends delta(base=5, new=8) to B. - // B's tree_configs version is 5. - // base(5) <= current(5) ✓ - // current(5) < new(8) ✓ - // So B accepts and applies the 3 ops. - let manager = create_test_manager("nodeB".to_string()); - - // Seed the tree at version 5 (common ancestor) — writes to tree_configs - let mut seed = TreeState::new("model1".to_string()); - for i in 0..5 { - seed.add_operation(make_insert_op(&format!("seed_{i}"), "http://w:8000")); - } - assert_eq!(seed.version, 5); - manager.apply_remote_tree_operation("model1".to_string(), seed, Some("origin".to_string())); - - // Verify tree_configs has version 5 - let tree_b = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree_b.version, 5); - - // A's delta: base=5, new=8, 3 ops - let a_ops: Vec<_> = (0..3) - .map(|i| make_insert_op(&format!("A_op_{i}"), "http://wA:8000")) - .collect(); - let delta_a = make_delta("model1", a_ops, 5, 8); - manager.apply_remote_tree_delta(delta_a, Some("nodeA".to_string())); - - // After apply, tree should have seed ops + A's ops. 
- let tree_merged = manager.get_tree_state("model1").unwrap(); - assert_eq!( - tree_merged.version, 8, - "tree_configs version should be 8 (seed 5 + 3 delta ops), got {}", - tree_merged.version - ); - assert_eq!(tree_merged.operations.len(), 8); - } - - // test_delta_buffer_trim_multi_peer removed: tested tree_ops_pending trim - // behavior across multiple peer collectors. sync_tree_operation no longer - // pushes to tree_ops_pending, making this a dead code path. - - // test_delta_empty_pending_vec removed: tested empty tree_ops_pending - // fallback to full state. sync_tree_operation no longer pushes to - // tree_ops_pending, making this a dead code path. - - #[test] - fn test_delta_concurrent_write_and_collect() { - // Spawn a thread that adds 100 ops via sync_tree_operation. - // Simultaneously run the collector. The collector should get a - // consistent snapshot — either some ops or all ops, but never - // corrupted data. - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let manager = Arc::new(MeshSyncManager::new(stores.clone(), "node1".to_string())); - - let manager_clone = manager.clone(); - let writer = std::thread::spawn(move || { - for i in 0..100 { - manager_clone - .sync_tree_operation( - "model1".to_string(), - make_insert_op(&format!("concurrent_op_{i}"), "http://w:8000"), - ) - .unwrap(); - } - }); - - // Collect multiple times while writer is active - let mut collected_any = false; - for _ in 0..10 { - let updates = collect_policy_updates(stores.clone(), "node1"); - for update in &updates { - if update.key.starts_with("tree:") { - // Verify the data deserializes without corruption - let policy_state: PolicyState = - bincode::deserialize(&update.value).expect("data should not be corrupted"); - match policy_state.policy_type.as_str() { - "tenant_delta" => { - TenantDelta::from_bytes(&policy_state.config) - .expect("tenant delta should deserialize cleanly"); - } - "tree_state_delta" => { - let delta = TreeStateDelta::from_bytes(&policy_state.config) - .expect("delta should deserialize cleanly"); - assert!(!delta.operations.is_empty()); - } - "tree_state_lz4" => { - let decompressed = - crate::tree_ops::lz4_decompress(&policy_state.config) - .expect("lz4 should decompress cleanly"); - let tree = TreeState::from_bytes(&decompressed) - .expect("tree state should deserialize cleanly"); - assert!(!tree.operations.is_empty()); - } - "tree_state" => { - let tree = TreeState::from_bytes(&policy_state.config) - .expect("tree state should deserialize cleanly"); - assert!(!tree.operations.is_empty()); - } - other => panic!("unexpected policy_type: {other}"), - } - collected_any = true; - } - } - } - - writer.join().unwrap(); - - // After writer finishes, one final collect should succeed - let final_updates = collect_policy_updates(stores.clone(), "node1"); - if !collected_any { - // Writer may have been too fast; at least final collection must succeed - assert!( - !final_updates.is_empty(), - "final collection after writer finished should have updates" - ); - } - } - - // test_delta_oversized_mark_sent_trims_buffer removed: tested - // tree_ops_pending trim threshold during mark_sent. sync_tree_operation - // no longer pushes to tree_ops_pending, making this a dead code path. - - // test_delta_version_monotonic_after_compaction removed: tested version - // monotonicity across compaction by calling sync_tree_operation 3000 times - // and reading back via get_tree_state. Both paths relied on tree_ops_pending - // replay, which is a dead code path now. 
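The out-of-order, duplicate, and split-brain tests above all pin down one acceptance rule for incoming deltas. A minimal sketch of that predicate, assuming the gate inside `apply_remote_tree_delta` reduces to a pure version comparison (the free function below is hypothetical, for illustration only):

fn delta_is_applicable(current: u64, base_version: u64, new_version: u64) -> bool {
    // Accept only when the local tree already has the delta's base
    // (no gap to replay across) and the delta moves the version forward
    // (this rejects duplicates and stale, late-arriving deltas).
    base_version <= current && current < new_version
}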
- - #[test] - fn test_delta_with_remove_operations() { - // Verify that deltas containing Remove operations work correctly - let manager = create_test_manager("node1".to_string()); - - let ops = vec![ - make_insert_op("text1", "http://w1:8000"), - TreeOperation::Remove(TreeRemoveOp { - tenant: "http://w1:8000".to_string(), - }), - make_insert_op("text2", "http://w2:8000"), - ]; - - let delta = make_delta("model1", ops, 0, 3); - manager.apply_remote_tree_delta(delta, Some("peer".to_string())); - - let tree = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree.version, 3); - assert_eq!(tree.operations.len(), 3); - // Verify the remove op is present - assert!(matches!( - tree.operations[1], - TreeOperation::Remove(TreeRemoveOp { .. }) - )); - } - - #[test] - fn test_delta_multiple_models_independent() { - // Verify that deltas for different models don't interfere with - // each other - let manager = create_test_manager("node1".to_string()); - - let delta_a = make_delta( - "model_a", - vec![make_insert_op("a_op", "http://w:8000")], - 0, - 1, - ); - let delta_b = make_delta( - "model_b", - vec![ - make_insert_op("b_op1", "http://w:8000"), - make_insert_op("b_op2", "http://w:8000"), - ], - 0, - 2, - ); - - manager.apply_remote_tree_delta(delta_a, None); - manager.apply_remote_tree_delta(delta_b, None); - - let tree_a = manager.get_tree_state("model_a").unwrap(); - let tree_b = manager.get_tree_state("model_b").unwrap(); - - assert_eq!(tree_a.version, 1); - assert_eq!(tree_a.operations.len(), 1); - assert_eq!(tree_b.version, 2); - assert_eq!(tree_b.operations.len(), 2); - } - - #[test] - fn test_delta_incremental_chain() { - // Apply a chain of sequential deltas: 0→3, 3→5, 5→8 - // Each should be accepted and the tree should accumulate all ops. - let manager = create_test_manager("node1".to_string()); - - let delta1 = make_delta( - "model1", - (0..3) - .map(|i| make_insert_op(&format!("batch1_op_{i}"), "http://w:8000")) - .collect(), - 0, - 3, - ); - manager.apply_remote_tree_delta(delta1, None); - let tree = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree.version, 3); - - let delta2 = make_delta( - "model1", - (0..2) - .map(|i| make_insert_op(&format!("batch2_op_{i}"), "http://w:8000")) - .collect(), - 3, - 5, - ); - manager.apply_remote_tree_delta(delta2, None); - let tree = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree.version, 5); - - let delta3 = make_delta( - "model1", - (0..3) - .map(|i| make_insert_op(&format!("batch3_op_{i}"), "http://w:8000")) - .collect(), - 5, - 8, - ); - manager.apply_remote_tree_delta(delta3, None); - let tree = manager.get_tree_state("model1").unwrap(); - assert_eq!(tree.version, 8); - assert_eq!(tree.operations.len(), 8); - } - - #[test] - fn test_delta_token_key_serialization_round_trip() { - // Verify that deltas with TreeKey::Tokens survive serialization - // through the full delta encode/decode path. 
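    // (Grounded in the to_bytes docs over in tree_ops.rs: bincode writes each
    // u32 token as 4 fixed-width bytes plus a length prefix, so a Vec<u32> of
    // 1000 tokens is ~4KB, versus ~7KB as JSON digit strings.)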
- use crate::tree_ops::TreeInsertOp; - - let tokens = vec![42u32, 100, 200, 999, u32::MAX]; - let ops = vec![TreeOperation::Insert(TreeInsertOp { - key: TreeKey::Tokens(tokens.clone()), - tenant: "http://w:8000".to_string(), - })]; - - let delta = TreeStateDelta { - model_id: "token_model".to_string(), - operations: ops, - base_version: 0, - new_version: 1, - }; - - // Serialize and deserialize - let bytes = delta.to_bytes().unwrap(); - let restored = TreeStateDelta::from_bytes(&bytes).unwrap(); - assert_eq!(restored.operations.len(), 1); - - match &restored.operations[0] { - TreeOperation::Insert(op) => { - assert_eq!(op.key, TreeKey::Tokens(tokens)); - } - TreeOperation::Remove(_) => panic!("expected Insert operation"), - } - - // Apply the delta to a manager and verify the tree - let manager = create_test_manager("node1".to_string()); - manager.apply_remote_tree_delta(restored, None); - - let tree = manager.get_tree_state("token_model").unwrap(); - assert_eq!(tree.version, 1); - assert_eq!(tree.operations.len(), 1); - } -} diff --git a/crates/mesh/src/tests/comprehensive.rs b/crates/mesh/src/tests/comprehensive.rs deleted file mode 100644 index 99b305325..000000000 --- a/crates/mesh/src/tests/comprehensive.rs +++ /dev/null @@ -1,837 +0,0 @@ -//! Comprehensive Mesh Service Tests -//! -//! This module implements High Priority Steps 1-5 from the test plan: -//! - Step 1: Test Infrastructure Setup -//! - Step 2: Basic Component Unit Tests -//! - Step 3: Single Node Integration Tests -//! - Step 4: Two-Node Cluster Tests -//! - Step 5: Multi-Node Cluster Formation -//! -//! ## Internal Tests -//! These tests are now crate-internal and have full access to private modules. - -use std::{ - collections::BTreeMap, - sync::{Arc, Once}, - time::Duration, -}; - -use tracing as log; -use tracing_subscriber::{ - filter::LevelFilter, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, -}; - -use super::test_utils::{self, bind_node, wait_for}; -// Internal crate imports - now can access private modules -use crate::{ - node_state_machine::{ConvergenceConfig, NodeReadiness, NodeStateMachine}, - partition::{PartitionConfig, PartitionDetector, PartitionState}, - service::gossip::{NodeState as GossipNodeState, NodeStatus}, - stores::{AppState, StateStores}, - sync::MeshSyncManager, -}; - -// -// ==================================================================================== -// STEP 1: Test Infrastructure Setup -// ==================================================================================== -// - -static INIT: Once = Once::new(); - -/// Initialize test logging infrastructure -fn init_test_logging() { - INIT.call_once(|| { - let _ = tracing_subscriber::registry() - .with(tracing_subscriber::fmt::layer()) - .with( - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(), - ) - .try_init(); - }); -} - -#[test] -fn test_infrastructure_utilities() { - init_test_logging(); - - // Test using test_utils module - let stores = test_utils::create_test_stores("test_node".to_string()); - assert!(stores.membership.all().is_empty()); - - let sync_manager = test_utils::create_test_sync_manager("test_node".to_string()); - assert_eq!(sync_manager.self_name(), "test_node"); - - // Now we can test create_test_cluster_state with NodeState - let cluster_state = test_utils::create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive 
as i32, - ), - ]); - assert_eq!(cluster_state.read().len(), 2); -} - -// -// ==================================================================================== -// STEP 2: Basic Component Unit Tests -// ==================================================================================== -// - -#[test] -fn test_partition_detector_initialization() { - let config = PartitionConfig::default(); - let detector = PartitionDetector::new(config); - - // Test with empty cluster state - let empty_state = BTreeMap::new(); - let state = detector.detect_partition(&empty_state); - assert_eq!(state, PartitionState::Normal); -} - -#[test] -fn test_partition_detector_quorum_calculation() { - let detector = PartitionDetector::default(); - - // Test quorum with 3 nodes (need 2 for quorum) - let mut cluster_state = BTreeMap::new(); - cluster_state.insert( - "node1".to_string(), - GossipNodeState { - name: "node1".to_string(), - address: "127.0.0.1:8000".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: Default::default(), - }, - ); - cluster_state.insert( - "node2".to_string(), - GossipNodeState { - name: "node2".to_string(), - address: "127.0.0.1:8001".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: Default::default(), - }, - ); - cluster_state.insert( - "node3".to_string(), - GossipNodeState { - name: "node3".to_string(), - address: "127.0.0.1:8002".to_string(), - status: NodeStatus::Down as i32, - version: 1, - metadata: Default::default(), - }, - ); - - // Update last_seen for alive nodes - detector.update_last_seen("node1"); - detector.update_last_seen("node2"); - - let state = detector.detect_partition(&cluster_state); - assert_eq!(state, PartitionState::Normal); -} - -#[test] -fn test_node_state_machine_lifecycle() { - let stores = test_utils::create_test_stores("test_node".to_string()); - let config = ConvergenceConfig::default(); - let state_machine = NodeStateMachine::new(stores, config); - - // Initial state should be NotReady - assert!(!state_machine.is_ready()); - assert_eq!(state_machine.readiness(), NodeReadiness::NotReady); - - // Transition to Joining - state_machine.start_joining(); - assert_eq!(state_machine.readiness(), NodeReadiness::Joining); - - // Transition to SnapshotPull - state_machine.start_snapshot_pull(); - assert_eq!(state_machine.readiness(), NodeReadiness::SnapshotPull); - - // Transition to Converging - state_machine.start_converging(); - assert_eq!(state_machine.readiness(), NodeReadiness::Converging); - - // Transition to Ready - state_machine.transition_to_ready(); - assert!(state_machine.is_ready()); - assert_eq!(state_machine.readiness(), NodeReadiness::Ready); -} - -#[test] -fn test_state_stores_basic_operations() { - let stores = test_utils::create_test_stores("test_node".to_string()); - - // Test app data write/read - let app_state = AppState { - key: "key1".to_string(), - value: vec![1, 2, 3], - version: 1, - }; - let _ = stores.app.insert("key1".to_string(), app_state.clone()); - let value = stores.app.get("key1"); - assert!(value.is_some()); - assert_eq!(value.unwrap().value, vec![1, 2, 3]); - - // Test that keys don't exist initially - assert_eq!(stores.app.get("nonexistent"), None); -} - -#[test] -fn test_sync_manager_rate_limit_membership() { - let sync_manager = test_utils::create_test_sync_manager("node1".to_string()); - - // Update membership should not panic - sync_manager.update_rate_limit_membership(); - - // Test self name - assert_eq!(sync_manager.self_name(), "node1"); -} - -#[tokio::test] -async fn 
test_rate_limit_window_creation() { - use crate::rate_limit_window::RateLimitWindow; - - let stores = Arc::new(StateStores::with_self_name("node1".to_string())); - let sync_manager = Arc::new(MeshSyncManager::new(stores, "node1".to_string())); - - let _window = RateLimitWindow::new(sync_manager, 60); - // Window created successfully - no public fields to assert -} - -// -// ==================================================================================== -// STEP 3: Single Node Integration Tests -// ==================================================================================== -// - -#[tokio::test] -async fn test_single_node_creation_and_shutdown() { - init_test_logging(); - log::info!("Starting test_single_node_creation_and_shutdown"); - - let (listener, addr) = bind_node().await; - let handler = crate::mesh_run!("single_node", listener, addr, None); - - wait_for( - || handler.state.read().contains_key("single_node"), - Duration::from_secs(5), - "single_node appears in cluster state", - ) - .await; - - handler.graceful_shutdown().await.unwrap(); - log::info!("Single node shutdown completed"); -} - -#[tokio::test] -async fn test_single_node_data_operations() { - init_test_logging(); - log::info!("Starting test_single_node_data_operations"); - - let (listener, addr) = bind_node().await; - let handler = crate::mesh_run!("data_node", listener, addr, None); - - wait_for( - || handler.state.read().contains_key("data_node"), - Duration::from_secs(5), - "data_node appears in cluster state", - ) - .await; - - handler - .write_data("test_key".into(), "test_value".into()) - .unwrap(); - - assert!(handler.stores.app.get("test_key").is_some()); - - handler.shutdown(); - log::info!("Data operations test completed"); -} - -#[tokio::test] -async fn test_single_node_subsystems_initialized() { - init_test_logging(); - log::info!("Starting test_single_node_subsystems_initialized"); - - let (listener, addr) = bind_node().await; - let handler = crate::mesh_run!("subsystem_node", listener, addr, None); - - wait_for( - || handler.state.read().contains_key("subsystem_node"), - Duration::from_secs(5), - "subsystem_node appears in cluster state", - ) - .await; - - assert!(handler.partition_detector().is_some()); - assert!(handler.state_machine().is_some()); - - handler.shutdown(); - log::info!("Subsystems initialization test completed"); -} - -// -// ==================================================================================== -// STEP 4: Two-Node Cluster Tests -// ==================================================================================== -// - -#[tokio::test] -async fn test_two_node_cluster_formation() { - init_test_logging(); - log::info!("Starting test_two_node_cluster_formation"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("node_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("node_b", listener_b, addr_b, Some(addr_a)); - - wait_for( - || handler_a.state.read().len() == 2 && handler_b.state.read().len() == 2, - Duration::from_secs(15), - "both nodes see each other", - ) - .await; - - let state_a = handler_a.state.read(); - assert!(state_a.contains_key("node_a")); - assert!(state_a.contains_key("node_b")); - drop(state_a); - - handler_a.shutdown(); - handler_b.shutdown(); - log::info!("Two-node cluster formation test completed"); -} - -#[tokio::test] -async fn test_two_node_data_synchronization() { - init_test_logging(); - log::info!("Starting 
test_two_node_data_synchronization"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("sync_node_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("sync_node_b", listener_b, addr_b, Some(addr_a)); - - // Wait for cluster formation - wait_for( - || handler_a.state.read().len() == 2 && handler_b.state.read().len() == 2, - Duration::from_secs(15), - "both nodes see each other", - ) - .await; - - // Write data on node A - handler_a - .write_data("shared_key".into(), "shared_value".into()) - .unwrap(); - - // Poll until data syncs to B via incremental sync stream - wait_for( - || { - handler_b - .stores - .app - .get("shared_key") - .is_some_and(|v| v.value == b"shared_value") - }, - Duration::from_secs(15), - "shared_key synced to node B", - ) - .await; - - // Allow the sync cycle to settle before writing a second update. - // The incremental collector runs on a 1s interval and the mark_sent - // bookkeeping must complete before the next version can be detected. - tokio::time::sleep(Duration::from_secs(2)).await; - - // Update data on node A - handler_a - .write_data("shared_key".into(), "shared_value2".into()) - .unwrap(); - - // Poll until updated value syncs to B - wait_for( - || { - handler_b - .stores - .app - .get("shared_key") - .is_some_and(|v| v.value == b"shared_value2") - }, - Duration::from_secs(30), - "updated shared_key synced to node B", - ) - .await; - - let value_a = handler_a.stores.app.get("shared_key").unwrap(); - let value_b = handler_b.stores.app.get("shared_key").unwrap(); - assert_eq!(value_a.value, value_b.value); - - handler_a.shutdown(); - handler_b.shutdown(); - log::info!("Two-node data synchronization test completed"); -} - -#[tokio::test] -async fn test_two_node_heartbeat_monitoring() { - init_test_logging(); - log::info!("Starting test_two_node_heartbeat_monitoring"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("heartbeat_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("heartbeat_b", listener_b, addr_b, Some(addr_a)); - - // Wait for cluster formation - wait_for( - || { - handler_a - .state - .read() - .get("heartbeat_b") - .is_some_and(|n| n.status == NodeStatus::Alive as i32) - }, - Duration::from_secs(15), - "node A sees heartbeat_b as Alive", - ) - .await; - - // Shutdown node B abruptly - handler_b.shutdown(); - - // Poll until A detects B as no longer Alive - wait_for( - || { - handler_a - .state - .read() - .get("heartbeat_b") - .is_some_and(|n| n.status != NodeStatus::Alive as i32) - }, - Duration::from_secs(30), - "node A detects heartbeat_b as not Alive", - ) - .await; - - let status = handler_a.state.read().get("heartbeat_b").map(|n| n.status); - log::info!("Node B status after shutdown: {:?}", status); - - handler_a.shutdown(); - log::info!("Two-node heartbeat monitoring test completed"); -} - -// -// ==================================================================================== -// STEP 5: Multi-Node Cluster Formation -// ==================================================================================== -// - -#[tokio::test] -async fn test_three_node_cluster_formation() { - init_test_logging(); - log::info!("Starting test_three_node_cluster_formation"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("cluster_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let 
handler_b = crate::mesh_run!("cluster_b", listener_b, addr_b, Some(addr_a)); - - let (listener_c, addr_c) = bind_node().await; - let handler_c = crate::mesh_run!("cluster_c", listener_c, addr_c, Some(addr_a)); - - wait_for( - || { - handler_a.state.read().len() == 3 - && handler_b.state.read().len() == 3 - && handler_c.state.read().len() == 3 - }, - Duration::from_secs(30), - "all 3 nodes see each other", - ) - .await; - - let state_a = handler_a.state.read(); - assert!(state_a.contains_key("cluster_a")); - assert!(state_a.contains_key("cluster_b")); - assert!(state_a.contains_key("cluster_c")); - drop(state_a); - - handler_a.shutdown(); - handler_b.shutdown(); - handler_c.shutdown(); - log::info!("Three-node cluster formation test completed"); -} - -#[tokio::test] -async fn test_multi_node_data_propagation() { - init_test_logging(); - log::info!("Starting test_multi_node_data_propagation"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("prop_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("prop_b", listener_b, addr_b, Some(addr_a)); - - let (listener_c, addr_c) = bind_node().await; - let handler_c = crate::mesh_run!("prop_c", listener_c, addr_c, Some(addr_a)); - - // Wait for 3-node cluster - wait_for( - || { - handler_a.state.read().len() == 3 - && handler_b.state.read().len() == 3 - && handler_c.state.read().len() == 3 - }, - Duration::from_secs(60), - "all 3 nodes see each other", - ) - .await; - - // Write data on node A - handler_a - .write_data("propagated_key".into(), "propagated_value".into()) - .unwrap(); - - // Poll until data reaches B and C - wait_for( - || { - handler_b.stores.app.get("propagated_key").is_some() - && handler_c.stores.app.get("propagated_key").is_some() - }, - Duration::from_secs(60), - "propagated_key synced to B and C", - ) - .await; - - let val_a = handler_a.stores.app.get("propagated_key").unwrap().value; - assert_eq!( - val_a, - handler_b.stores.app.get("propagated_key").unwrap().value - ); - assert_eq!( - val_a, - handler_c.stores.app.get("propagated_key").unwrap().value - ); - - // Write updated data on node B - handler_b - .write_data("propagated_key".into(), "propagated_value2".into()) - .unwrap(); - - // Poll until updated value reaches A and C - wait_for( - || { - handler_a - .stores - .app - .get("propagated_key") - .is_some_and(|v| v.value == b"propagated_value2") - && handler_c - .stores - .app - .get("propagated_key") - .is_some_and(|v| v.value == b"propagated_value2") - }, - Duration::from_secs(60), - "updated propagated_key synced to A and C", - ) - .await; - - handler_a.shutdown(); - handler_b.shutdown(); - handler_c.shutdown(); - log::info!("Multi-node data propagation test completed"); -} - -/// Regression test: one publish of a tenant delta on node A must land on -/// BOTH connected peers (B and C), not just the first peer whose collector -/// drains the shared buffer. 
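The failure mode under test: the tenant-delta buffer drain is destructive, so if each per-peer collector drained the buffer independently, the first peer's drain would leave every later peer empty-handed. The shape being asserted, as a sketch (names hypothetical):

    let drained = drain_once_per_round();        // destructive drain happens once
    for peer in peers { send(peer, &drained); }  // every peer gets the same copy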
-#[tokio::test] -async fn test_multi_peer_tenant_delta_broadcast() { - use std::sync::atomic::{AtomicUsize, Ordering}; - - use crate::{ - sync::TreeStateSubscriber, - tree_ops::{TenantEvict, TenantInsert, TreeInsertOp, TreeKey, TreeOperation, TreeState}, - }; - - #[derive(Debug)] - struct CountingSubscriber { - target_model_id: String, - inserts_received: Arc, - } - impl TreeStateSubscriber for CountingSubscriber { - fn apply_remote_tree_state(&self, _model_id: &str, _tree_state: &TreeState) {} - fn apply_tenant_delta( - &self, - model_id: &str, - inserts: &[TenantInsert], - _evictions: &[TenantEvict], - ) { - if model_id == self.target_model_id { - self.inserts_received - .fetch_add(inserts.len(), Ordering::SeqCst); - } - } - } - - init_test_logging(); - log::info!("Starting test_multi_peer_tenant_delta_broadcast"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("td_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("td_b", listener_b, addr_b, Some(addr_a)); - - let (listener_c, addr_c) = bind_node().await; - let handler_c = crate::mesh_run!("td_c", listener_c, addr_c, Some(addr_a)); - - wait_for( - || { - handler_a.state.read().len() == 3 - && handler_b.state.read().len() == 3 - && handler_c.state.read().len() == 3 - }, - Duration::from_secs(60), - "all 3 nodes see each other", - ) - .await; - - let model_id = "test-model".to_string(); - let count_b = Arc::new(AtomicUsize::new(0)); - let count_c = Arc::new(AtomicUsize::new(0)); - handler_b - .sync_manager - .register_tree_state_subscriber(Arc::new(CountingSubscriber { - target_model_id: model_id.clone(), - inserts_received: count_b.clone(), - })); - handler_c - .sync_manager - .register_tree_state_subscriber(Arc::new(CountingSubscriber { - target_model_id: model_id.clone(), - inserts_received: count_c.clone(), - })); - - // SWIM membership converges on a separate channel from sync_stream. - // Tenant delta is at-most-once — if the stream isn't up when the - // collector drains, the delta is gone. Warm the pipe with a CRDT app - // write first (retried via watermark until it lands on both peers); - // once both observe it, sync_stream is proven active in both directions. 
- handler_a - .write_data("td-sync-ready".into(), "1".into()) - .unwrap(); - wait_for( - || { - handler_b - .stores - .app - .get("td-sync-ready") - .is_some_and(|v| v.value == b"1") - && handler_c - .stores - .app - .get("td-sync-ready") - .is_some_and(|v| v.value == b"1") - }, - Duration::from_secs(30), - "sync_stream is active on both B and C", - ) - .await; - - handler_a - .sync_manager - .sync_tree_operation( - model_id, - TreeOperation::Insert(TreeInsertOp { - key: TreeKey::Text("multi-peer-prompt".to_string()), - tenant: "http://worker:8000".to_string(), - }), - ) - .unwrap(); - - wait_for( - || count_b.load(Ordering::SeqCst) > 0 && count_c.load(Ordering::SeqCst) > 0, - Duration::from_secs(30), - "tenant delta reached BOTH B and C (v1 bug would leave one at 0)", - ) - .await; - - assert!( - count_b.load(Ordering::SeqCst) > 0, - "B did not receive the tenant delta" - ); - assert!( - count_c.load(Ordering::SeqCst) > 0, - "C did not receive the tenant delta" - ); - - handler_a.shutdown(); - handler_b.shutdown(); - handler_c.shutdown(); - log::info!("test_multi_peer_tenant_delta_broadcast completed"); -} - -#[tokio::test] -#[ignore = "SWIM failure detection for hard-shutdown nodes needs many gossip rounds; flaky under parallel CI load"] -async fn test_five_node_cluster_with_failure() { - init_test_logging(); - log::info!("Starting test_five_node_cluster_with_failure"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("multi_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("multi_b", listener_b, addr_b, Some(addr_a)); - - // Wait for A-B cluster - wait_for( - || handler_a.state.read().len() == 2, - Duration::from_secs(15), - "A-B cluster formed", - ) - .await; - - handler_a - .write_data("test_data".into(), "initial_value".into()) - .unwrap(); - - // Add C and D - let (listener_c, addr_c) = bind_node().await; - let handler_c = crate::mesh_run!("multi_c", listener_c, addr_c, Some(addr_a)); - - let (listener_d, addr_d) = bind_node().await; - let handler_d = crate::mesh_run!("multi_d", listener_d, addr_d, Some(addr_c)); - - wait_for( - || handler_a.state.read().len() == 4, - Duration::from_secs(30), - "4-node cluster formed", - ) - .await; - - // Add E, wait for it to join, then kill it - { - let (listener_e, addr_e) = bind_node().await; - let handler_e = crate::mesh_run!("multi_e", listener_e, addr_e, Some(addr_d)); - - wait_for( - || handler_a.state.read().len() == 5, - Duration::from_secs(30), - "5-node cluster formed", - ) - .await; - - handler_e.shutdown(); - log::info!("Node E shutdown"); - } - - // Gracefully shutdown D - handler_d.graceful_shutdown().await.unwrap(); - log::info!("Node D gracefully shutdown"); - - // Wait for D to be Leaving - wait_for( - || { - handler_a - .state - .read() - .get("multi_d") - .is_some_and(|n| n.status == NodeStatus::Leaving as i32) - }, - Duration::from_secs(30), - "node D detected as Leaving", - ) - .await; - - // Wait for E to be detected as not Alive (Suspected or Down). - // SWIM failure detection requires multiple gossip rounds, so allow ample time - // especially when other tests are running in parallel. 
- wait_for( - || { - handler_a - .state - .read() - .get("multi_e") - .is_some_and(|n| n.status != NodeStatus::Alive as i32) - }, - Duration::from_secs(60), - "node E detected as not Alive", - ) - .await; - - let state_a = handler_a.state.read(); - assert!(state_a.contains_key("multi_a")); - assert!(state_a.contains_key("multi_b")); - assert!(state_a.contains_key("multi_c")); - assert_eq!( - state_a.get("multi_d").map(|n| n.status), - Some(NodeStatus::Leaving as i32) - ); - let e_status = state_a.get("multi_e").map(|n| n.status); - log::info!("Node E final status: {:?}", e_status); - drop(state_a); - - handler_a.shutdown(); - handler_b.shutdown(); - handler_c.shutdown(); - log::info!("Five-node cluster test completed"); -} - -#[tokio::test] -async fn test_cluster_formation_different_join_patterns() { - init_test_logging(); - log::info!("Starting test_cluster_formation_different_join_patterns"); - - let (listener_a, addr_a) = bind_node().await; - let handler_a = crate::mesh_run!("pattern_a", listener_a, addr_a, None); - - let (listener_b, addr_b) = bind_node().await; - let handler_b = crate::mesh_run!("pattern_b", listener_b, addr_b, Some(addr_a)); - - // Node C joins through B (chain topology) - let (listener_c, addr_c) = bind_node().await; - let handler_c = crate::mesh_run!("pattern_c", listener_c, addr_c, Some(addr_b)); - - // Node D joins through A (star topology) - let (listener_d, addr_d) = bind_node().await; - let handler_d = crate::mesh_run!("pattern_d", listener_d, addr_d, Some(addr_a)); - - wait_for( - || { - handler_a.state.read().len() == 4 - && handler_b.state.read().len() == 4 - && handler_c.state.read().len() == 4 - && handler_d.state.read().len() == 4 - }, - Duration::from_secs(30), - "all 4 nodes see each other (chain + star topology)", - ) - .await; - - handler_a.shutdown(); - handler_b.shutdown(); - handler_c.shutdown(); - handler_d.shutdown(); - log::info!("Different join patterns test completed"); -} diff --git a/crates/mesh/src/tests/mod.rs b/crates/mesh/src/tests/mod.rs index 5f562e2e2..533464c0f 100644 --- a/crates/mesh/src/tests/mod.rs +++ b/crates/mesh/src/tests/mod.rs @@ -1,10 +1,7 @@ //! Internal tests module //! -//! This module contains comprehensive integration and unit tests -//! that have full access to private crate internals. +//! Tests that need access to private crate internals. #[cfg(test)] mod chunking_integration; -#[cfg(test)] -mod comprehensive; pub(crate) mod test_utils; diff --git a/crates/mesh/src/tests/test_utils.rs b/crates/mesh/src/tests/test_utils.rs index a0d82bcd5..ba5bbf7d5 100644 --- a/crates/mesh/src/tests/test_utils.rs +++ b/crates/mesh/src/tests/test_utils.rs @@ -1,20 +1,8 @@ //! Test utilities for mesh module -use std::{ - collections::{BTreeMap, HashMap}, - net::SocketAddr, - sync::Arc, - time::Duration, -}; +use std::{net::SocketAddr, time::Duration}; -use parking_lot::RwLock; use tokio::net::TcpListener; -use crate::{ - service::{gossip::NodeState, ClusterState}, - stores::StateStores, - sync::MeshSyncManager, -}; - /// Bind to an ephemeral port and return the listener + address. /// The caller must keep the listener alive and pass it to `mesh_run!` /// to avoid a TOCTOU port race. 
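A sketch of the pattern that doc comment describes, assuming tokio and an ephemeral-port bind; the helper name and body here are illustrative, not the kept implementation:

async fn bind_node_sketch() -> (tokio::net::TcpListener, std::net::SocketAddr) {
    // Bind port 0 so the OS picks a free ephemeral port...
    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    // ...and hand the live listener to the caller. Returning only `addr` and
    // re-binding it later would reopen the check-then-use (TOCTOU) race.
    (listener, addr)
}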
@@ -39,63 +27,3 @@ where
         tokio::time::sleep(Duration::from_millis(100)).await;
     }
 }
-
-/// Create test StateStores with a given node name
-pub fn create_test_stores(self_name: String) -> Arc<StateStores> {
-    Arc::new(StateStores::with_self_name(self_name))
-}
-
-/// Create test MeshSyncManager
-pub fn create_test_sync_manager(self_name: String) -> Arc<MeshSyncManager> {
-    let stores = create_test_stores(self_name.clone());
-    Arc::new(MeshSyncManager::new(stores, self_name))
-}
-
-/// Create test cluster state with given nodes
-pub fn create_test_cluster_state(
-    nodes: Vec<(String, String, i32)>, // (name, address, status)
-) -> ClusterState {
-    let mut state = BTreeMap::new();
-    for (name, address, status) in nodes {
-        state.insert(
-            name.clone(),
-            NodeState {
-                name: name.clone(),
-                address,
-                status,
-                version: 1,
-                metadata: HashMap::new(),
-            },
-        );
-    }
-    Arc::new(RwLock::new(state))
-}
-
-#[cfg(test)]
-mod test_utils_tests {
-    use super::*;
-
-    #[test]
-    fn test_create_test_stores() {
-        let stores = create_test_stores("test_node".to_string());
-        assert!(!stores.rate_limit.is_owner("key1"));
-    }
-
-    #[test]
-    fn test_create_test_sync_manager() {
-        let manager = create_test_sync_manager("test_node".to_string());
-        assert_eq!(manager.self_name(), "test_node");
-    }
-
-    #[test]
-    fn test_create_test_cluster_state() {
-        let state = create_test_cluster_state(vec![
-            ("node1".to_string(), "127.0.0.1:8000".to_string(), 1),
-            ("node2".to_string(), "127.0.0.1:8001".to_string(), 1),
-        ]);
-        let read_state = state.read();
-        assert_eq!(read_state.len(), 2);
-        assert!(read_state.contains_key("node1"));
-        assert!(read_state.contains_key("node2"));
-    }
-}
diff --git a/crates/mesh/src/topology.rs b/crates/mesh/src/topology.rs
deleted file mode 100644
index 6d8f74b98..000000000
--- a/crates/mesh/src/topology.rs
+++ /dev/null
@@ -1,633 +0,0 @@
-//! Topology management for mesh cluster
-//!
-//! Supports:
-//! - Full mesh for small/medium clusters
-//! - Sparse mesh for large clusters (by region/AZ)
-
-// TopologyConfig/TopologyManager are currently only exercised through in-crate tests.
-// Suppress dead_code for the entire module in non-test builds.
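// A sketch of the mode switch the module docs above describe, grounded in
// `is_full_mesh` below; the decision is a single threshold comparison:
//
//     fn use_full_mesh(cluster_size: usize, threshold: usize) -> bool {
//         cluster_size <= threshold // above this, peers are picked by region/AZ
//     }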
-#![cfg_attr(not(test), allow(dead_code, clippy::allow_attributes))]
-
-use std::{
-    collections::{BTreeMap, HashSet},
-    sync::Arc,
-};
-
-use parking_lot::RwLock;
-use tracing::debug;
-
-use super::{service::ClusterState, stores::MembershipState};
-
-/// Topology configuration
-#[derive(Debug, Clone)]
-pub struct TopologyConfig {
-    /// Maximum nodes for full mesh (beyond this, use sparse)
-    pub full_mesh_threshold: usize,
-    /// Region identifier (for sparse mesh)
-    pub region: Option<String>,
-    /// Availability zone identifier (for sparse mesh)
-    pub availability_zone: Option<String>,
-}
-
-impl Default for TopologyConfig {
-    fn default() -> Self {
-        Self {
-            full_mesh_threshold: 10,
-            region: None,
-            availability_zone: None,
-        }
-    }
-}
-
-/// Topology manager
-pub struct TopologyManager {
-    config: TopologyConfig,
-    state: ClusterState,
-    self_name: String,
-    /// Active peer connections (for sparse mesh)
-    active_peers: Arc<RwLock<HashSet<String>>>,
-}
-
-impl TopologyManager {
-    pub fn new(config: TopologyConfig, state: ClusterState, self_name: String) -> Self {
-        Self {
-            config,
-            state,
-            self_name,
-            active_peers: Arc::new(RwLock::new(HashSet::new())),
-        }
-    }
-
-    /// Get peers to connect to based on topology
-    pub fn get_peers(&self, count: usize) -> Vec<MembershipState> {
-        let state = self.state.read();
-        let total_nodes = state.len();
-
-        if total_nodes <= self.config.full_mesh_threshold {
-            // Full mesh: connect to all nodes
-            self.get_full_mesh_peers(&state, count)
-        } else {
-            // Sparse mesh: connect based on region/AZ
-            self.get_sparse_mesh_peers(&state, count)
-        }
-    }
-
-    /// Get peers for full mesh topology
-    fn get_full_mesh_peers(
-        &self,
-        state: &BTreeMap<String, super::service::gossip::NodeState>,
-        count: usize,
-    ) -> Vec<MembershipState> {
-        let mut peers = Vec::new();
-        let active = self.active_peers.read();
-
-        for (name, node) in state {
-            if name != &self.self_name
-                && node.status == super::service::gossip::NodeStatus::Alive as i32
-                && !active.contains(name)
-            {
-                let metadata: BTreeMap<String, Vec<u8>> = node
-                    .metadata
-                    .iter()
-                    .map(|(k, v)| (k.clone(), v.clone()))
-                    .collect::<BTreeMap<_, _>>();
-                peers.push(MembershipState {
-                    name: node.name.clone(),
-                    address: node.address.clone(),
-                    status: node.status,
-                    version: node.version,
-                    metadata,
-                });
-                if peers.len() >= count {
-                    break;
-                }
-            }
-        }
-
-        peers
-    }
-
-    /// Get peers for sparse mesh topology (by region/AZ)
-    fn get_sparse_mesh_peers(
-        &self,
-        state: &BTreeMap<String, super::service::gossip::NodeState>,
-        count: usize,
-    ) -> Vec<MembershipState> {
-        let mut peers = Vec::new();
-        let active = self.active_peers.read();
-
-        // First, try to connect to nodes in same region/AZ
-        if let (Some(ref region), Some(ref az)) =
-            (&self.config.region, &self.config.availability_zone)
-        {
-            for (name, node) in state {
-                if name != &self.self_name
-                    && node.status == super::service::gossip::NodeStatus::Alive as i32
-                    && !active.contains(name)
-                {
-                    // Check if node is in same region/AZ (from metadata)
-                    let node_region = node
-                        .metadata
-                        .get("region")
-                        .and_then(|v| String::from_utf8(v.clone()).ok());
-                    let node_az = node
-                        .metadata
-                        .get("availability_zone")
-                        .and_then(|v| String::from_utf8(v.clone()).ok());
-
-                    if node_region.as_ref() == Some(region) && node_az.as_ref() == Some(az) {
-                        let metadata: BTreeMap<String, Vec<u8>> = node
-                            .metadata
-                            .iter()
-                            .map(|(k, v)| (k.clone(), v.clone()))
-                            .collect();
-                        peers.push(MembershipState {
-                            name: node.name.clone(),
-                            address: node.address.clone(),
-                            status: node.status,
-                            version: node.version,
-                            metadata,
-                        });
-                        if peers.len() >= count {
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        // If not enough peers, add from other regions
-        if peers.len() < count {
-            for (name, node) in state {
-                if name != &self.self_name
-                    && node.status == super::service::gossip::NodeStatus::Alive as i32
-                    && !active.contains(name)
-                    && !peers.iter().any(|p| p.name == node.name)
-                {
-                    let metadata: BTreeMap<String, Vec<u8>> = node
-                        .metadata
-                        .iter()
-                        .map(|(k, v)| (k.clone(), v.clone()))
-                        .collect();
-                    peers.push(MembershipState {
-                        name: node.name.clone(),
-                        address: node.address.clone(),
-                        status: node.status,
-                        version: node.version,
-                        metadata,
-                    });
-                    if peers.len() >= count {
-                        break;
-                    }
-                }
-            }
-        }
-
-        peers
-    }
-
-    /// Mark peer as active
-    pub fn mark_peer_active(&self, peer_name: &str) {
-        self.active_peers.write().insert(peer_name.to_string());
-        debug!("Marked peer {} as active", peer_name);
-    }
-
-    /// Mark peer as inactive
-    pub fn mark_peer_inactive(&self, peer_name: &str) {
-        self.active_peers.write().remove(peer_name);
-        debug!("Marked peer {} as inactive", peer_name);
-    }
-
-    /// Get number of active peers
-    pub fn active_peer_count(&self) -> usize {
-        self.active_peers.read().len()
-    }
-
-    /// Check if we should use full mesh
-    pub fn is_full_mesh(&self) -> bool {
-        let state = self.state.read();
-        state.len() <= self.config.full_mesh_threshold
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::BTreeMap;
-
-    use super::*;
-    use crate::service::gossip::{NodeState, NodeStatus};
-
-    fn create_test_cluster_state(nodes: Vec<(String, String, i32)>) -> ClusterState {
-        let mut state = BTreeMap::new();
-        for (name, address, status) in nodes {
-            state.insert(
-                name.clone(),
-                NodeState {
-                    name: name.clone(),
-                    address,
-                    status,
-                    version: 1,
-                    metadata: std::collections::HashMap::new(),
-                },
-            );
-        }
-        Arc::new(RwLock::new(state))
-    }
-
-    #[test]
-    fn test_full_mesh_topology() {
-        let state = create_test_cluster_state(vec![
-            (
-                "node1".to_string(),
-                "127.0.0.1:8000".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-            (
-                "node2".to_string(),
-                "127.0.0.1:8001".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-            (
-                "node3".to_string(),
-                "127.0.0.1:8002".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-        ]);
-
-        let config = TopologyConfig {
-            full_mesh_threshold: 10,
-            region: None,
-            availability_zone: None,
-        };
-
-        let manager = TopologyManager::new(config, state, "node1".to_string());
-
-        let peers = manager.get_peers(5);
-        // Should return all available peers (node2 and node3)
-        assert_eq!(peers.len(), 2);
-        assert!(peers.iter().any(|p| p.name == "node2"));
-        assert!(peers.iter().any(|p| p.name == "node3"));
-    }
-
-    #[test]
-    fn test_full_mesh_topology_excludes_self() {
-        let state = create_test_cluster_state(vec![
-            (
-                "node1".to_string(),
-                "127.0.0.1:8000".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-            (
-                "node2".to_string(),
-                "127.0.0.1:8001".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-        ]);
-
-        let config = TopologyConfig {
-            full_mesh_threshold: 10,
-            region: None,
-            availability_zone: None,
-        };
-
-        let manager = TopologyManager::new(config, state, "node1".to_string());
-
-        let peers = manager.get_peers(5);
-        // Should not include self (node1)
-        assert_eq!(peers.len(), 1);
-        assert_eq!(peers[0].name, "node2");
-    }
-
-    #[test]
-    fn test_full_mesh_topology_filters_down_nodes() {
-        let state = create_test_cluster_state(vec![
-            (
-                "node1".to_string(),
-                "127.0.0.1:8000".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-            (
-                "node2".to_string(),
-                "127.0.0.1:8001".to_string(),
-                NodeStatus::Down as i32,
-            ),
-            (
-                "node3".to_string(),
-                "127.0.0.1:8002".to_string(),
-                NodeStatus::Alive as i32,
-            ),
-        ]);
-
-        let config = TopologyConfig {
-            full_mesh_threshold: 10,
region: None, - availability_zone: None, - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - - let peers = manager.get_peers(5); - // Should only return alive nodes (node3) - assert_eq!(peers.len(), 1); - assert_eq!(peers[0].name, "node3"); - } - - #[test] - fn test_sparse_mesh_topology() { - let state = create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node3".to_string(), - "127.0.0.1:8002".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node4".to_string(), - "127.0.0.1:8003".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node5".to_string(), - "127.0.0.1:8004".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node6".to_string(), - "127.0.0.1:8005".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node7".to_string(), - "127.0.0.1:8006".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node8".to_string(), - "127.0.0.1:8007".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node9".to_string(), - "127.0.0.1:8008".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node10".to_string(), - "127.0.0.1:8009".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node11".to_string(), - "127.0.0.1:8010".to_string(), - NodeStatus::Alive as i32, - ), - ]); - - let config = TopologyConfig { - full_mesh_threshold: 10, // 11 nodes > 10, should use sparse - region: None, - availability_zone: None, - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - - let peers = manager.get_peers(5); - // Should return peers (sparse mesh mode) - assert!(!peers.is_empty()); - assert!(peers.len() <= 5); - } - - #[test] - fn test_sparse_mesh_with_region_az() { - let mut state_map = BTreeMap::new(); - - // Create nodes with region/AZ metadata - let mut node1_metadata = std::collections::HashMap::new(); - node1_metadata.insert("region".to_string(), b"us-west".to_vec()); - node1_metadata.insert("availability_zone".to_string(), b"us-west-1a".to_vec()); - state_map.insert( - "node1".to_string(), - NodeState { - name: "node1".to_string(), - address: "127.0.0.1:8000".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: node1_metadata.clone(), - }, - ); - - let mut node2_metadata = std::collections::HashMap::new(); - node2_metadata.insert("region".to_string(), b"us-west".to_vec()); - node2_metadata.insert("availability_zone".to_string(), b"us-west-1a".to_vec()); - state_map.insert( - "node2".to_string(), - NodeState { - name: "node2".to_string(), - address: "127.0.0.1:8001".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: node2_metadata, - }, - ); - - let mut node3_metadata = std::collections::HashMap::new(); - node3_metadata.insert("region".to_string(), b"us-east".to_vec()); - node3_metadata.insert("availability_zone".to_string(), b"us-east-1a".to_vec()); - state_map.insert( - "node3".to_string(), - NodeState { - name: "node3".to_string(), - address: "127.0.0.1:8002".to_string(), - status: NodeStatus::Alive as i32, - version: 1, - metadata: node3_metadata, - }, - ); - - let state = Arc::new(RwLock::new(state_map)); - - let config = TopologyConfig { - full_mesh_threshold: 2, - region: Some("us-west".to_string()), - availability_zone: Some("us-west-1a".to_string()), - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - - let peers = manager.get_peers(5); - // Should prefer nodes in same region/AZ (node2) - 
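        // (Two-phase selection, per get_sparse_mesh_peers above: the first
        // pass fills from alive peers whose metadata matches this node's
        // region and AZ; a second pass tops up from any remaining alive
        // peers until `count` is reached.)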
assert!(!peers.is_empty()); - // node2 should be in the list (same region/AZ) - assert!(peers.iter().any(|p| p.name == "node2")); - } - - #[test] - fn test_mark_peer_active_inactive() { - let state = create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive as i32, - ), - ]); - - let config = TopologyConfig { - full_mesh_threshold: 10, - region: None, - availability_zone: None, - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - - assert_eq!(manager.active_peer_count(), 0); - - manager.mark_peer_active("node2"); - assert_eq!(manager.active_peer_count(), 1); - - manager.mark_peer_inactive("node2"); - assert_eq!(manager.active_peer_count(), 0); - } - - #[test] - fn test_get_peers_excludes_active_peers() { - let state = create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node3".to_string(), - "127.0.0.1:8002".to_string(), - NodeStatus::Alive as i32, - ), - ]); - - let config = TopologyConfig { - full_mesh_threshold: 10, - region: None, - availability_zone: None, - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - - manager.mark_peer_active("node2"); - - let peers = manager.get_peers(5); - // Should exclude node2 (already active) - assert!(!peers.iter().any(|p| p.name == "node2")); - // Should include node3 - assert!(peers.iter().any(|p| p.name == "node3")); - } - - #[test] - fn test_is_full_mesh() { - let state = create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive as i32, - ), - ]); - - let config = TopologyConfig { - full_mesh_threshold: 10, - region: None, - availability_zone: None, - }; - - let manager = TopologyManager::new(config, state, "node1".to_string()); - assert!(manager.is_full_mesh()); - - let state2 = create_test_cluster_state(vec![ - ( - "node1".to_string(), - "127.0.0.1:8000".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node2".to_string(), - "127.0.0.1:8001".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node3".to_string(), - "127.0.0.1:8002".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node4".to_string(), - "127.0.0.1:8003".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node5".to_string(), - "127.0.0.1:8004".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node6".to_string(), - "127.0.0.1:8005".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node7".to_string(), - "127.0.0.1:8006".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node8".to_string(), - "127.0.0.1:8007".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node9".to_string(), - "127.0.0.1:8008".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node10".to_string(), - "127.0.0.1:8009".to_string(), - NodeStatus::Alive as i32, - ), - ( - "node11".to_string(), - "127.0.0.1:8010".to_string(), - NodeStatus::Alive as i32, - ), - ]); - - let config2 = TopologyConfig { - full_mesh_threshold: 10, - region: None, - availability_zone: None, - }; - - let manager2 = TopologyManager::new(config2, state2, "node1".to_string()); - assert!(!manager2.is_full_mesh()); - } -} diff --git a/crates/mesh/src/tree_ops.rs b/crates/mesh/src/tree_ops.rs deleted file mode 100644 
index 30c74852d..000000000 --- a/crates/mesh/src/tree_ops.rs +++ /dev/null @@ -1,647 +0,0 @@ -//! Tree operation definitions for mesh synchronization -//! -//! Defines serializable tree operations that can be synchronized across mesh cluster nodes - -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub enum TreeKey { - Text(String), - Tokens(Vec), -} - -/// Tree insert operation -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct TreeInsertOp { - pub key: TreeKey, - pub tenant: String, // worker URL -} - -/// Tree remove operation -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct TreeRemoveOp { - pub tenant: String, // worker URL -} - -/// Tree operation type -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub enum TreeOperation { - Insert(TreeInsertOp), - Remove(TreeRemoveOp), -} - -/// Delta encoding for tree state synchronization. -/// Contains only the new operations since the last successful sync, rather than the full tree state. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct TreeStateDelta { - pub model_id: String, - pub operations: Vec, - /// Tree state version before these operations were applied. - pub base_version: u64, - /// Tree state version after these operations are applied. - pub new_version: u64, -} - -impl TreeStateDelta { - /// Serialize to bincode. - pub fn to_bytes(&self) -> Result, String> { - bincode::serialize(self).map_err(|e| format!("Failed to serialize TreeStateDelta: {e}")) - } - - /// Deserialize from bincode bytes. - pub fn from_bytes(bytes: &[u8]) -> Result { - bincode::deserialize(bytes) - .map_err(|e| format!("Failed to deserialize TreeStateDelta: {e}")) - } -} - -// ── Tenant delta types for efficient two-layer sync ───────────────── - -/// Lightweight tenant change set for high-frequency sync (every gossip round). -/// Contains only which tenants changed at which tree nodes — no tree structure, -/// no prompt text. ~100 bytes per insert vs ~200KB for full TreeOperation. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct TenantDelta { - pub model_id: String, - pub version: u64, - pub inserts: Vec, - pub evictions: Vec, -} - -/// A tenant was added or refreshed at a tree node. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct TenantInsert { - /// Blake3 hash of the full prefix path from tree root to this node. - /// 8 bytes instead of 80k+ chars. Receiver looks up node by hash; - /// if unknown, buffers until next structure snapshot. - pub node_path_hash: u64, - /// Worker URL that cached this prefix. - pub worker_url: String, - /// Epoch (timestamp) of the cache event. Max-epoch-wins on merge. - pub epoch: u64, -} - -pub use crate::hash::{hash_node_path, hash_token_path, GLOBAL_EVICTION_HASH}; - -/// A tenant was evicted from a tree node. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct TenantEvict { - /// Blake3 hash of the prefix path where the tenant was evicted. - /// Use [`GLOBAL_EVICTION_HASH`] (0) to evict from all nodes. - pub node_path_hash: u64, - /// Worker URL that evicted this prefix. 
-
-impl TenantDelta {
-    pub fn new(model_id: String, version: u64) -> Self {
-        Self {
-            model_id,
-            version,
-            inserts: Vec::new(),
-            evictions: Vec::new(),
-        }
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.inserts.is_empty() && self.evictions.is_empty()
-    }
-
-    pub fn to_bytes(&self) -> Result<Vec<u8>, String> {
-        bincode::serialize(self).map_err(|e| format!("Failed to serialize TenantDelta: {e}"))
-    }
-
-    pub fn from_bytes(bytes: &[u8]) -> Result<Self, String> {
-        bincode::deserialize(bytes).map_err(|e| format!("Failed to deserialize TenantDelta: {e}"))
-    }
-}
-
-// ── Compression helpers for structure snapshots ─────────────────────
-
-/// Compress bytes with LZ4 for wire efficiency.
-/// Radix tree data compresses well (repetitive edge labels, worker URLs).
-pub fn lz4_compress(data: &[u8]) -> Vec<u8> {
-    lz4_flex::compress_prepend_size(data)
-}
-
-/// Decompress LZ4-compressed bytes with a size safety check.
-/// Rejects payloads claiming > 256 MB decompressed size to prevent
-/// OOM from corrupted or malicious size headers.
-pub fn lz4_decompress(data: &[u8]) -> Result<Vec<u8>, String> {
-    const MAX_DECOMPRESSED_SIZE: usize = 256 * 1024 * 1024; // 256 MB
-    if data.len() >= 4 {
-        let claimed_size = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
-        if claimed_size > MAX_DECOMPRESSED_SIZE {
-            return Err(format!(
-                "LZ4 claimed decompressed size {claimed_size} exceeds limit {MAX_DECOMPRESSED_SIZE}"
-            ));
-        }
-    }
-    lz4_flex::decompress_size_prepended(data).map_err(|e| format!("LZ4 decompression failed: {e}"))
-}
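A quick round-trip sketch for the two helpers above. The 4-byte header that `lz4_decompress` inspects is the little-endian uncompressed-size prefix written by `lz4_flex::compress_prepend_size`; the payload below is illustrative:

// Round-trip sketch; a run of repeated bytes compresses well, much like
// the repetitive edge labels and worker URLs mentioned in the doc comment.
fn lz4_roundtrip() -> Result<(), String> {
    let payload = vec![42u8; 4096];
    let compressed = lz4_compress(&payload);
    assert!(compressed.len() < payload.len());
    let restored = lz4_decompress(&compressed)?;
    assert_eq!(restored, payload);
    Ok(())
}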
-
-// ── Legacy types (still used for periodic structure snapshots) ───────
-
-/// Maximum number of operations stored in a TreeState before compaction.
-/// Prevents unbounded growth of the operation log, especially with token payloads.
-const MAX_TREE_OPERATIONS: usize = 2048;
-
-/// Tree state for a specific model.
-/// Contains a sequence of operations that can be applied to reconstruct the tree.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Default)]
-pub struct TreeState {
-    pub model_id: String,
-    pub operations: Vec<TreeOperation>,
-    pub version: u64,
-}
-
-impl TreeState {
-    pub fn new(model_id: String) -> Self {
-        Self {
-            model_id,
-            operations: Vec::new(),
-            version: 0,
-        }
-    }
-
-    pub fn add_operation(&mut self, operation: TreeOperation) {
-        self.operations.push(operation);
-        self.version += 1;
-        if self.operations.len() > MAX_TREE_OPERATIONS {
-            // Keep the most recent half — oldest operations are least relevant for routing
-            let drain_count = self.operations.len() - MAX_TREE_OPERATIONS / 2;
-            self.operations.drain(..drain_count);
-        }
-    }
-
-    /// Serialize to bincode (compact binary format).
-    /// A Vec<u32> of 1000 tokens is ~4KB in bincode vs ~7KB in JSON.
-    pub fn to_bytes(&self) -> Result<Vec<u8>, String> {
-        bincode::serialize(self).map_err(|e| format!("Failed to serialize TreeState: {e}"))
-    }
-
-    /// Deserialize from bincode bytes.
-    pub fn from_bytes(bytes: &[u8]) -> Result<Self, String> {
-        bincode::deserialize(bytes).map_err(|e| format!("Failed to deserialize TreeState: {e}"))
-    }
-
-    /// Reconstruct a `TreeState` from a compact [`kv_index::snapshot::TreeSnapshot`].
-    ///
-    /// Walks the pre-order node list, rebuilding full prefix paths and emitting
-    /// an `Insert` operation for each `(tenant, prefix)` pair. This is the
-    /// inverse of [`CacheAwarePolicy::export_tree_state`] and is used on the
-    /// receiver side to convert compact snapshots back into the `TreeState`
-    /// format that `apply_remote_tree_operation` expects.
-    #[expect(
-        clippy::unwrap_used,
-        reason = "pop() after last_mut().is_some() is infallible"
-    )]
-    pub fn from_snapshot(
-        model_id: String,
-        snapshot: &kv_index::snapshot::TreeSnapshot,
-        version: u64,
-    ) -> Self {
-        let mut tree_state = Self::new(model_id);
-        let mut path_stack: Vec<(String, u32)> = Vec::new();
-        let mut current_prefix = String::new();
-
-        for node in &snapshot.nodes {
-            // Pop completed parents from the stack
-            while let Some((_, remaining)) = path_stack.last_mut() {
-                if *remaining == 0 {
-                    let (parent_prefix, _) = path_stack.pop().unwrap();
-                    current_prefix = parent_prefix;
-                } else {
-                    *remaining -= 1;
-                    break;
-                }
-            }
-
-            // Build this node's full prefix
-            let node_prefix = format!("{}{}", current_prefix, node.edge);
-
-            // Emit an Insert operation for each tenant at this node
-            for (tenant_url, _epoch) in &node.tenants {
-                if !node_prefix.is_empty() {
-                    tree_state.add_operation(TreeOperation::Insert(TreeInsertOp {
-                        key: TreeKey::Text(node_prefix.clone()),
-                        tenant: tenant_url.clone(),
-                    }));
-                }
-            }
-
-            // Push this node onto the stack for its children
-            if node.child_count > 0 {
-                path_stack.push((current_prefix.clone(), node.child_count));
-                current_prefix = node_prefix;
-            }
-        }
-
-        tree_state.version = version;
-        tree_state
-    }
-}
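The interaction between `version` and compaction in `add_operation` is worth making explicit: `version` counts every operation ever applied, while the log itself is trimmed to the most recent `MAX_TREE_OPERATIONS / 2` entries once it crosses the cap. A self-contained sketch of that behavior, derived directly from the code above:

// Push one operation past the 2048-op cap and observe the trim.
fn compaction_behavior() {
    let mut state = TreeState::new("m".to_string());
    for _ in 0..2049 {
        state.add_operation(TreeOperation::Remove(TreeRemoveOp {
            tenant: "grpc://w:8000".to_string(),
        }));
    }
    assert_eq!(state.version, 2049); // version is never rewound
    assert_eq!(state.operations.len(), 1024); // only the recent half survives
}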
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_tree_insert_op_creation() {
-        let op = TreeInsertOp {
-            key: TreeKey::Text("test_text".to_string()),
-            tenant: "http://worker1:8000".to_string(),
-        };
-        assert_eq!(op.key, TreeKey::Text("test_text".to_string()));
-        assert_eq!(op.tenant, "http://worker1:8000");
-    }
-
-    #[test]
-    fn test_tree_remove_op_creation() {
-        let op = TreeRemoveOp {
-            tenant: "http://worker1:8000".to_string(),
-        };
-        assert_eq!(op.tenant, "http://worker1:8000");
-    }
-
-    #[test]
-    fn test_tree_operation_insert() {
-        let insert_op = TreeInsertOp {
-            key: TreeKey::Text("test_text".to_string()),
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let operation = TreeOperation::Insert(insert_op.clone());
-
-        match &operation {
-            TreeOperation::Insert(op) => {
-                assert_eq!(op.key, TreeKey::Text("test_text".to_string()));
-                assert_eq!(op.tenant, "http://worker1:8000");
-            }
-            TreeOperation::Remove(_) => panic!("Expected Insert operation"),
-        }
-    }
-
-    #[test]
-    fn test_tree_operation_remove() {
-        let remove_op = TreeRemoveOp {
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let operation = TreeOperation::Remove(remove_op.clone());
-
-        match &operation {
-            TreeOperation::Insert(_) => panic!("Expected Remove operation"),
-            TreeOperation::Remove(op) => {
-                assert_eq!(op.tenant, "http://worker1:8000");
-            }
-        }
-    }
-
-    #[test]
-    fn test_tree_operation_serialization() {
-        let insert_op = TreeInsertOp {
-            key: TreeKey::Text("test_text".to_string()),
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let operation = TreeOperation::Insert(insert_op);
-
-        let serialized = serde_json::to_string(&operation).unwrap();
-        let deserialized: TreeOperation = serde_json::from_str(&serialized).unwrap();
-
-        match (&operation, &deserialized) {
-            (TreeOperation::Insert(a), TreeOperation::Insert(b)) => {
-                assert_eq!(a.key, b.key);
-                assert_eq!(a.tenant, b.tenant);
-            }
-            _ => panic!("Operations should match"),
-        }
-    }
-
-    #[test]
-    fn test_tree_operation_token_serialization() {
-        let insert_op = TreeInsertOp {
-            key: TreeKey::Tokens(vec![1, 2, 3, 4]),
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let operation = TreeOperation::Insert(insert_op);
-
-        let serialized = serde_json::to_string(&operation).unwrap();
-        let deserialized: TreeOperation = serde_json::from_str(&serialized).unwrap();
-
-        match (&operation, &deserialized) {
-            (TreeOperation::Insert(a), TreeOperation::Insert(b)) => {
-                assert_eq!(a.key, b.key);
-                assert_eq!(a.tenant, b.tenant);
-            }
-            _ => panic!("Operations should match"),
-        }
-    }
-
-    #[test]
-    fn test_tree_state_bincode_round_trip_with_tokens() {
-        let tokens = vec![12345u32, 67890, 0, u32::MAX, 42];
-        let mut state = TreeState::new("test-model".to_string());
-        state.add_operation(TreeOperation::Insert(TreeInsertOp {
-            key: TreeKey::Tokens(tokens.clone()),
-            tenant: "http://worker1:8000".to_string(),
-        }));
-        state.add_operation(TreeOperation::Insert(TreeInsertOp {
-            key: TreeKey::Text("text_key".to_string()),
-            tenant: "http://worker2:8000".to_string(),
-        }));
-        state.add_operation(TreeOperation::Remove(TreeRemoveOp {
-            tenant: "http://worker3:8000".to_string(),
-        }));
-
-        let bytes = state.to_bytes().unwrap();
-        let restored = TreeState::from_bytes(&bytes).unwrap();
-
-        assert_eq!(restored.model_id, "test-model");
-        assert_eq!(restored.version, state.version);
-        assert_eq!(restored.operations.len(), 3);
-
-        match &restored.operations[0] {
-            TreeOperation::Insert(op) => {
-                assert_eq!(op.key, TreeKey::Tokens(tokens));
-                assert_eq!(op.tenant, "http://worker1:8000");
-            }
-            TreeOperation::Remove(_) => panic!("Expected Insert"),
-        }
-        match &restored.operations[1] {
-            TreeOperation::Insert(op) => {
-                assert_eq!(op.key, TreeKey::Text("text_key".to_string()));
-            }
-            TreeOperation::Remove(_) => panic!("Expected Insert"),
-        }
-        match &restored.operations[2] {
-            TreeOperation::Remove(op) => {
-                assert_eq!(op.tenant, "http://worker3:8000");
-            }
-            TreeOperation::Insert(_) => panic!("Expected Remove"),
-        }
-    }
-
-    #[test]
-    fn test_tree_state_bincode_round_trip_large_tokens() {
-        let mut state = TreeState::new("large-model".to_string());
-        for i in 0..100 {
-            let tokens: Vec<u32> = (0..1000).map(|j| (i * 1000 + j) as u32).collect();
-            state.add_operation(TreeOperation::Insert(TreeInsertOp {
-                key: TreeKey::Tokens(tokens),
-                tenant: format!("http://worker-{i}:8000"),
-            }));
-        }
-
-        let bytes = state.to_bytes().unwrap();
-        let restored = TreeState::from_bytes(&bytes).unwrap();
-
-        assert_eq!(restored.operations.len(), 100);
-        assert_eq!(restored.version, state.version);
-
-        // Spot-check exact token preservation
-        match &restored.operations[0] {
-            TreeOperation::Insert(op) => {
-                if let TreeKey::Tokens(tokens) = &op.key {
-                    assert_eq!(tokens.len(), 1000);
-                    assert_eq!(tokens[0], 0);
-                    assert_eq!(tokens[999], 999);
-                } else {
-                    panic!("Expected Tokens key");
-                }
-            }
-            TreeOperation::Remove(_) => panic!("Expected Insert"),
-        }
-        match &restored.operations[99] {
-            TreeOperation::Insert(op) => {
-                if let TreeKey::Tokens(tokens) = &op.key {
-                    assert_eq!(tokens[0], 99000);
-                    assert_eq!(tokens[999], 99999);
-                } else {
-                    panic!("Expected Tokens key");
-                }
-            }
-            TreeOperation::Remove(_) => panic!("Expected Insert"),
-        }
-    }
-
-    #[test]
-    fn test_tree_operation_remove_serialization() {
-        let remove_op = TreeRemoveOp {
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let operation = TreeOperation::Remove(remove_op);
-
-        let serialized = serde_json::to_string(&operation).unwrap();
-        let deserialized: TreeOperation = serde_json::from_str(&serialized).unwrap();
-
-        match (&operation, &deserialized) {
-            (TreeOperation::Remove(a), TreeOperation::Remove(b)) => {
-                assert_eq!(a.tenant, b.tenant);
-            }
-            _ => panic!("Operations should match"),
-        }
-    }
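The ~4KB-vs-~7KB figure from the `to_bytes` doc comment can be sanity-checked directly. This sketch is not part of the original suite; the six-digit token ids are an assumption chosen to mimic a realistic tokenizer vocabulary:

// bincode spends a fixed 4 bytes per u32; JSON spends one ASCII character
// per digit plus a comma per token.
fn bincode_vs_json_size() {
    let tokens: Vec<u32> = (0..1000u32).map(|i| 100_000 + i).collect();
    let bin = bincode::serialize(&tokens).unwrap();
    let json = serde_json::to_string(&tokens).unwrap();
    assert!(bin.len() < json.len()); // ~4KB vs ~7KB for six-digit ids
}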
TreeState::new("model1".to_string()); - assert_eq!(state.model_id, "model1"); - assert_eq!(state.operations.len(), 0); - assert_eq!(state.version, 0); - } - - #[test] - fn test_tree_state_default() { - let state = TreeState::default(); - assert_eq!(state.model_id, ""); - assert_eq!(state.operations.len(), 0); - assert_eq!(state.version, 0); - } - - #[test] - fn test_tree_state_add_operation() { - let mut state = TreeState::new("model1".to_string()); - - let insert_op = TreeInsertOp { - key: TreeKey::Text("text1".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - state.add_operation(TreeOperation::Insert(insert_op)); - - assert_eq!(state.operations.len(), 1); - assert_eq!(state.version, 1); - - let remove_op = TreeRemoveOp { - tenant: "http://worker1:8000".to_string(), - }; - state.add_operation(TreeOperation::Remove(remove_op)); - - assert_eq!(state.operations.len(), 2); - assert_eq!(state.version, 2); - } - - #[test] - fn test_tree_state_add_multiple_operations() { - let mut state = TreeState::new("model1".to_string()); - - for i in 0..5 { - let insert_op = TreeInsertOp { - key: TreeKey::Text(format!("text_{i}")), - tenant: format!("http://worker{i}:8000"), - }; - state.add_operation(TreeOperation::Insert(insert_op)); - } - - assert_eq!(state.operations.len(), 5); - assert_eq!(state.version, 5); - } - - #[test] - fn test_tree_state_serialization() { - let mut state = TreeState::new("model1".to_string()); - - let insert_op = TreeInsertOp { - key: TreeKey::Text("test_text".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - state.add_operation(TreeOperation::Insert(insert_op)); - - let remove_op = TreeRemoveOp { - tenant: "http://worker1:8000".to_string(), - }; - state.add_operation(TreeOperation::Remove(remove_op)); - - let serialized = serde_json::to_string(&state).unwrap(); - let deserialized: TreeState = serde_json::from_str(&serialized).unwrap(); - - assert_eq!(state.model_id, deserialized.model_id); - assert_eq!(state.operations.len(), deserialized.operations.len()); - assert_eq!(state.version, deserialized.version); - } - - #[test] - fn test_tree_state_clone() { - let mut state = TreeState::new("model1".to_string()); - - let insert_op = TreeInsertOp { - key: TreeKey::Text("test_text".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - state.add_operation(TreeOperation::Insert(insert_op)); - - let cloned = state.clone(); - assert_eq!(state.model_id, cloned.model_id); - assert_eq!(state.operations.len(), cloned.operations.len()); - assert_eq!(state.version, cloned.version); - } - - #[test] - fn test_tree_state_equality() { - let mut state1 = TreeState::new("model1".to_string()); - let mut state2 = TreeState::new("model1".to_string()); - - let insert_op = TreeInsertOp { - key: TreeKey::Text("test_text".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - state1.add_operation(TreeOperation::Insert(insert_op.clone())); - state2.add_operation(TreeOperation::Insert(insert_op)); - - assert_eq!(state1, state2); - } - - #[test] - fn test_tree_operation_hash() { - use std::collections::HashSet; - - let insert_op1 = TreeInsertOp { - key: TreeKey::Text("text1".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - let insert_op2 = TreeInsertOp { - key: TreeKey::Text("text1".to_string()), - tenant: "http://worker1:8000".to_string(), - }; - - let op1 = TreeOperation::Insert(insert_op1); - let op2 = TreeOperation::Insert(insert_op2); - - let mut set = HashSet::new(); - set.insert(op1.clone()); - set.insert(op2.clone()); - - // Same 
-
-    #[test]
-    fn test_tree_operation_hash() {
-        use std::collections::HashSet;
-
-        let insert_op1 = TreeInsertOp {
-            key: TreeKey::Text("text1".to_string()),
-            tenant: "http://worker1:8000".to_string(),
-        };
-        let insert_op2 = TreeInsertOp {
-            key: TreeKey::Text("text1".to_string()),
-            tenant: "http://worker1:8000".to_string(),
-        };
-
-        let op1 = TreeOperation::Insert(insert_op1);
-        let op2 = TreeOperation::Insert(insert_op2);
-
-        let mut set = HashSet::new();
-        set.insert(op1.clone());
-        set.insert(op2.clone());
-
-        // Same operations should be considered equal
-        assert_eq!(set.len(), 1);
-    }
-
-    #[test]
-    fn test_tenant_delta_round_trip() {
-        let path_hash = hash_node_path("Hello world, how are");
-        let mut delta = TenantDelta::new("model1".to_string(), 42);
-        delta.inserts.push(TenantInsert {
-            node_path_hash: path_hash,
-            worker_url: "grpc://w1:8000".to_string(),
-            epoch: 1000,
-        });
-        delta.evictions.push(TenantEvict {
-            node_path_hash: path_hash,
-            worker_url: "grpc://w2:8000".to_string(),
-        });
-
-        assert!(!delta.is_empty());
-
-        let bytes = delta.to_bytes().unwrap();
-        let restored = TenantDelta::from_bytes(&bytes).unwrap();
-
-        assert_eq!(restored.model_id, "model1");
-        assert_eq!(restored.version, 42);
-        assert_eq!(restored.inserts.len(), 1);
-        assert_eq!(restored.inserts[0].worker_url, "grpc://w1:8000");
-        assert_eq!(restored.inserts[0].node_path_hash, path_hash);
-        assert_eq!(restored.inserts[0].epoch, 1000);
-        assert_eq!(restored.evictions.len(), 1);
-        assert_eq!(restored.evictions[0].worker_url, "grpc://w2:8000");
-    }
-
-    #[test]
-    fn test_tenant_delta_empty() {
-        let delta = TenantDelta::new("model1".to_string(), 0);
-        assert!(delta.is_empty());
-    }
-
-    #[test]
-    fn test_tenant_delta_size_vs_tree_operation() {
-        // A TenantInsert with a hash is ~36 bytes (8 hash + ~20 URL + 8 epoch)
-        let insert = TenantInsert {
-            node_path_hash: hash_node_path(&"a".repeat(100)),
-            worker_url: "grpc://worker1:8000".to_string(),
-            epoch: 12345,
-        };
-        let delta = TenantDelta {
-            model_id: "model1".to_string(),
-            version: 1,
-            inserts: vec![insert],
-            evictions: vec![],
-        };
-        let delta_bytes = delta.to_bytes().unwrap();
-
-        // A TreeOperation with a 20k-char prompt is ~20KB+
-        let tree_op = TreeOperation::Insert(TreeInsertOp {
-            key: TreeKey::Text("x".repeat(20_000)),
-            tenant: "grpc://worker1:8000".to_string(),
-        });
-        let tree_state = TreeState {
-            model_id: "model1".to_string(),
-            operations: vec![tree_op],
-            version: 1,
-        };
-        let tree_bytes = tree_state.to_bytes().unwrap();
-
-        // TenantDelta should be orders of magnitude smaller
-        assert!(
-            delta_bytes.len() < tree_bytes.len() / 10,
-            "TenantDelta ({} bytes) should be much smaller than TreeState ({} bytes)",
-            delta_bytes.len(),
-            tree_bytes.len()
-        );
-    }
-}
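One gap worth noting in the suite above: `TreeStateDelta` itself never gets a round-trip test, although `TenantDelta` does. A sketch in the same style, with illustrative version numbers:

// Round-trip sketch for the operation-level delta type.
fn tree_state_delta_roundtrip() {
    let delta = TreeStateDelta {
        model_id: "model1".to_string(),
        operations: vec![TreeOperation::Remove(TreeRemoveOp {
            tenant: "grpc://w1:8000".to_string(),
        })],
        base_version: 41,
        new_version: 42,
    };
    let bytes = delta.to_bytes().unwrap();
    let restored = TreeStateDelta::from_bytes(&bytes).unwrap();
    assert_eq!(restored, delta);
}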