From 7a2d35a82e0b2f392da56b5afd78fecb9c8d7c58 Mon Sep 17 00:00:00 2001 From: Orlando Hohmeier Date: Sat, 3 Jan 2026 17:32:08 +0100 Subject: [PATCH 1/3] fix: adjust request timeout Change the request timeout calculation so that it scales with the configured RTT beyond 10s but cap it at 30s as anything beyond that should probably be handled with retries and longer backoffs. --- crates/data/src/network.rs | 2 +- crates/scheduler/src/network.rs | 2 +- crates/worker/src/network.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/data/src/network.rs b/crates/data/src/network.rs index f77a7a1b..d5946a4e 100644 --- a/crates/data/src/network.rs +++ b/crates/data/src/network.rs @@ -87,7 +87,7 @@ impl Network { let (action_sender, action_receiver) = mpsc::channel(64); let meter = metrics::global::meter(); let request_timeout = - (Duration::from_millis(network_config.rtt_ms()) * 10).max(Duration::from_secs(10)); + (Duration::from_millis(network_config.rtt_ms()) * 100).min(Duration::from_secs(30)); let swarm = SwarmBuilder::with_existing_identity(cert_chain, private_key, ca_certs, crls) .with_tokio() diff --git a/crates/scheduler/src/network.rs b/crates/scheduler/src/network.rs index f6c1c474..16f10a6b 100644 --- a/crates/scheduler/src/network.rs +++ b/crates/scheduler/src/network.rs @@ -113,7 +113,7 @@ impl Network { let (action_sender, action_receiver) = mpsc::channel(512); let meter = metrics::global::meter(); let request_timeout = - (Duration::from_millis(network_config.rtt_ms()) * 10).max(Duration::from_secs(10)); + (Duration::from_millis(network_config.rtt_ms()) * 100).min(Duration::from_secs(30)); // Build libp2p Swarm using the derived identity and mTLS config let swarm = SwarmBuilder::with_existing_identity(cert_chain, private_key, ca_certs, crls) diff --git a/crates/worker/src/network.rs b/crates/worker/src/network.rs index c25cb61d..c0624abf 100644 --- a/crates/worker/src/network.rs +++ b/crates/worker/src/network.rs @@ -107,7 +107,7 @@ 
impl Network { let (action_sender, action_receiver) = mpsc::channel(512); let meter = metrics::global::meter(); let request_timeout = - (Duration::from_millis(network_config.rtt_ms()) * 10).max(Duration::from_secs(10)); + (Duration::from_millis(network_config.rtt_ms()) * 100).min(Duration::from_secs(30)); let swarm = SwarmBuilder::with_existing_identity(cert_chain, private_key, ca_certs, crls) .with_tokio() From 3eba5cb4573a572732b210a157ef32aa85be1238 Mon Sep 17 00:00:00 2001 From: Orlando Hohmeier Date: Sat, 3 Jan 2026 17:33:48 +0100 Subject: [PATCH 2/3] fix: increase lease and offer timeouts Adjust offer and lease timeouts for more resilient lease renewal and offer accept flows in high RTT scenarios. --- crates/worker/src/arbiter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/worker/src/arbiter.rs b/crates/worker/src/arbiter.rs index e44cc4d9..bcd0b4f0 100644 --- a/crates/worker/src/arbiter.rs +++ b/crates/worker/src/arbiter.rs @@ -25,9 +25,9 @@ const WORKER_TOPIC: &str = "hypha/worker"; // This allows proper handling of multiple schedulers by batching advertisements const WINDOW_LIMIT: usize = 100; const WINDOW_WAIT: std::time::Duration = std::time::Duration::from_millis(200); -const OFFER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); +const OFFER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10); const PRUNE_INTERVAL: std::time::Duration = std::time::Duration::from_millis(250); -const LEASE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60); +const LEASE_TIMEOUT: std::time::Duration = std::time::Duration::from_mins(3); #[derive(Debug, Error)] #[error("lease error")] From 03e4b61defcb2651f2e972a68191324c90e890a4 Mon Sep 17 00:00:00 2001 From: Orlando Hohmeier Date: Sat, 3 Jan 2026 17:37:59 +0100 Subject: [PATCH 3/3] chore(network-sim) simulate RTT spikes and jitter Adjust the network simulation script to jitter and spike the delay. 
--- scripts/network-sim.sh | 349 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 320 insertions(+), 29 deletions(-) diff --git a/scripts/network-sim.sh b/scripts/network-sim.sh index abd3c87d..fe74cb19 100755 --- a/scripts/network-sim.sh +++ b/scripts/network-sim.sh @@ -6,15 +6,15 @@ # between Hypha components. # # Usage: -# sudo ./network-sim.sh start [delay_ms] [packet_loss_%] [bandwidth_kbit] +# sudo ./network-sim.sh start [options] # sudo ./network-sim.sh status # sudo ./network-sim.sh stop # # Examples: -# sudo ./network-sim.sh start 100 5 1000 # 100ms delay, 5% loss, 1Mbps -# sudo ./network-sim.sh start 50 0 10000 # 50ms delay, no loss, 10Mbps -# sudo ./network-sim.sh start 200 # 200ms delay only -# sudo ./network-sim.sh stop # Remove all rules +# sudo ./network-sim.sh start --min-delay 50 --max-delay 200 --loss 5 --bandwidth 1000 +# sudo ./network-sim.sh start --min-delay 100 --max-delay 100 # Static 100ms delay only +# sudo ./network-sim.sh start --spike-pct 10 --spike-mult 4 # More frequent spikes +# sudo ./network-sim.sh stop # Remove all rules set -euo pipefail @@ -23,20 +23,36 @@ set -euo pipefail ANCHOR="com.apple/hypha-test" PIPE_NUM=1 +# Jitter defaults (configure fluctuating RTT) +JITTER_MIN_DELAY_MS=20 +JITTER_MAX_DELAY_MS=120 +JITTER_INTERVAL_SEC=5 +JITTER_SPIKE_PCT=5 +JITTER_SPIKE_MULT=3 + +# Background jitter PID + metadata +PID_FILE="/tmp/hypha-network-sim.pid" +META_FILE="/tmp/hypha-network-sim.meta" + show_usage() { cat < Minimum RTT delay (default: 20) + --max-delay Maximum RTT delay (default: 120; set min=max for static delay) + --jitter-interval How often to randomize delay (default: 2) + --spike-pct <0-100> Chance (percent) to inject a spike (default: 5) + --spike-mult Multiplier applied when a spike occurs (default: 3) + --loss <0-100> Packet loss percentage (default: 0) + --bandwidth Bandwidth cap (default: unlimited) Examples: - sudo $0 start 100 5 1000 # 100ms delay, 5% loss, 1Mbps - sudo $0 start 50 # 50ms delay only + sudo 
$0 start --min-delay 100 --max-delay 100 --loss 5 --bandwidth 1000 + sudo $0 start --min-delay 20 --max-delay 120 --jitter-interval 1 + sudo $0 start --min-delay 50 --max-delay 200 --spike-pct 10 --spike-mult 4 status Show current simulation configuration @@ -46,6 +62,10 @@ Commands: Traffic affected: All IPv4 and IPv6 traffic on localhost (lo0). + +Behavior: + The start command now runs in the foreground. Press Ctrl+C to stop and clean up. + The explicit 'stop' command remains available as a fallback. EOF } @@ -56,19 +76,14 @@ check_root() { fi } -start_simulation() { - local delay_ms=${1:-100} - local loss_pct=${2:-0} - local bw_kbit=${3:-0} +log() { + echo "[$(date +'%H:%M:%S')] $*" +} - echo "Starting network simulation..." - echo " Delay: ${delay_ms}ms" - echo " Packet loss: ${loss_pct}%" - if [[ $bw_kbit -gt 0 ]]; then - echo " Bandwidth: ${bw_kbit}kbit/s" - else - echo " Bandwidth: unlimited" - fi +configure_pipe() { + local delay_ms=$1 + local loss_pct=$2 + local bw_kbit=$3 # Build dnctl pipe configuration local pipe_config="delay ${delay_ms}ms" @@ -89,8 +104,245 @@ start_simulation() { # NOTE: Split stats by flow (src/dst/proto/ports) pipe_config="$pipe_config mask all" - echo "Configuring dummynet pipe $PIPE_NUM..." dnctl pipe "$PIPE_NUM" config $pipe_config +} + +jitter_pid_running() { + [[ -f "$PID_FILE" ]] || return 1 + local pid + pid=$(cat "$PID_FILE" 2>/dev/null || true) + [[ -n "$pid" ]] || return 1 + kill -0 "$pid" 2>/dev/null +} + +stop_jitter_loop() { + if jitter_pid_running; then + local pid + pid=$(cat "$PID_FILE") + log "Stopping jitter loop (pid $pid)..." 
+ kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + fi + rm -f "$PID_FILE" "$META_FILE" +} + +random_delay_in_range() { + local min_ms=$1 + local max_ms=$2 + + if [[ $min_ms -ge $max_ms ]]; then + echo "$min_ms" + return + fi + + local range=$((max_ms - min_ms + 1)) + echo $((min_ms + RANDOM % range)) +} + +next_jitter_delay() { + local min_ms=$1 + local max_ms=$2 + local spike_pct=$3 + local spike_mult=$4 + + local delay_ms + delay_ms=$(random_delay_in_range "$min_ms" "$max_ms") + + if [[ $spike_pct -gt 0 && $spike_mult -gt 1 ]]; then + local roll=$((RANDOM % 100)) + if [[ $roll -lt $spike_pct ]]; then + delay_ms=$((delay_ms * spike_mult)) + fi + fi + + echo "$delay_ms" +} + +start_jitter_loop() { + local min_ms=$1 + local max_ms=$2 + local interval_sec=$3 + local loss_pct=$4 + local bw_kbit=$5 + local spike_pct=$6 + local spike_mult=$7 + local jitter_enabled=$8 + + ( + set -euo pipefail + trap "exit 0" TERM INT + + while true; do + if [[ $jitter_enabled -eq 1 ]]; then + local delay_ms + delay_ms=$(next_jitter_delay "$min_ms" "$max_ms" "$spike_pct" "$spike_mult") + log "Applying delay ${delay_ms}ms (loss ${loss_pct}%, bw ${bw_kbit}kbit/s)" + configure_pipe "$delay_ms" "$loss_pct" "$bw_kbit" + sleep "$interval_sec" + else + sleep "$interval_sec" + fi + done + ) & + + local loop_pid=$! + echo "$loop_pid" > "$PID_FILE" + JITTER_LOOP_PID=$loop_pid + + cat > "$META_FILE" <= min-delay" + exit 1 + fi + + # Clean up any existing jitter loop to avoid conflicting writers + if jitter_pid_running; then + echo "Stopping existing jitter loop (pid $(cat "$PID_FILE"))..." 
+ stop_jitter_loop + fi + + local jitter_enabled=0 + if [[ $min_delay_ms -ne $max_delay_ms ]]; then + jitter_enabled=1 + elif [[ $spike_pct -gt 0 && $spike_mult -gt 1 ]]; then + jitter_enabled=1 + fi + + if [[ $jitter_interval_sec -le 0 ]]; then + echo "Error: jitter-interval must be > 0" + exit 1 + fi + + # Compute initial delay (randomized if jitter is enabled so we don't start flat) + local applied_delay_ms + if [[ $jitter_enabled -eq 1 ]]; then + applied_delay_ms=$(next_jitter_delay "$min_delay_ms" "$max_delay_ms" "$spike_pct" "$spike_mult") + else + applied_delay_ms=$min_delay_ms + fi + + echo "Starting network simulation..." + echo " Delay range: ${min_delay_ms}ms - ${max_delay_ms}ms (initial ${applied_delay_ms}ms)" + if [[ $jitter_enabled -eq 1 ]]; then + echo " Jitter interval: ${jitter_interval_sec}s, spikes: ${spike_pct}% @ x${spike_mult}" + else + echo " Jitter: disabled (static delay)" + fi + echo " Packet loss: ${loss_pct}%" + if [[ $bw_kbit -gt 0 ]]; then + echo " Bandwidth: ${bw_kbit}kbit/s" + else + echo " Bandwidth: unlimited" + fi + + echo "Configuring dummynet pipe $PIPE_NUM..." + configure_pipe "$applied_delay_ms" "$loss_pct" "$bw_kbit" echo "Configuring packet filter rules (anchor: $ANCHOR) for localhost traffic..." @@ -114,10 +366,24 @@ start_simulation() { pfctl -E >/dev/null 2>&1 || pfctl -e >/dev/null 2>&1 || true fi - echo "Network simulation started successfully!" - echo "" - echo "To adjust settings, run: $0 stop && sudo $0 start [new_params]" - echo "To stop simulation, run: sudo $0 stop" + # Start loop (keeps process in foreground). Even when jitter is disabled we + # keep a sleep loop running so Ctrl+C can trigger cleanup. + start_jitter_loop "$min_delay_ms" "$max_delay_ms" "$jitter_interval_sec" "$loss_pct" "$bw_kbit" "$spike_pct" "$spike_mult" "$jitter_enabled" + echo "Loop started (pid $(cat "$PID_FILE")). Running in foreground; press Ctrl+C to stop." 
+ + cleanup_and_exit() { + echo "" + log "Signal received, stopping simulation..." + stop_simulation + exit 0 + } + trap cleanup_and_exit INT TERM + + # Wait for loop to exit (e.g., killed by stop command or errors) + wait "$JITTER_LOOP_PID" || true + + # If we get here without a signal, ensure cleanup to avoid stale rules. + stop_simulation } show_status() { @@ -135,6 +401,28 @@ show_status() { fi echo "" + echo "Jitter loop:" + if jitter_pid_running; then + echo " ✓ Running (pid $(cat "$PID_FILE"))" + if [[ -f "$META_FILE" ]]; then + # shellcheck disable=SC1090 + source "$META_FILE" + if [[ ${jitter_enabled:-1} -eq 1 ]]; then + echo " Jitter: enabled" + else + echo " Jitter: disabled (static delay)" + fi + echo " Delay range: ${min_delay_ms}ms - ${max_delay_ms}ms" + echo " Interval: ${jitter_interval_sec}s" + echo " Spikes: ${spike_pct}% @ x${spike_mult}" + fi + elif [[ -f "$PID_FILE" ]]; then + echo " ✗ Not running (stale pid $(cat "$PID_FILE"))" + else + echo " ✗ Not running" + fi + echo "" + # Show our dummynet rules in the anchor echo "Hypha dummynet rules (anchor: $ANCHOR):" if pfctl -a "$ANCHOR" -s dummynet 2>/dev/null | grep -q .; then @@ -156,6 +444,9 @@ show_status() { stop_simulation() { echo "Stopping network simulation..." + # Stop jitter writer first to avoid racing with teardown + stop_jitter_loop + # Flush rules from our anchor (only this anchor, not system rules) pfctl -q -a "$ANCHOR" -F all 2>/dev/null || true