perf: reduce latency via timing constant tuning

CaptainMirage · CaptainMirage · commit b64213625122 · 2026-05-29T23:27:28.000+03:30
Tighten all relay timing constants to cut dead-wait time and flow-control
stalls without touching any logic paths.

tunnel_client.rs:
- REPLY_TIMEOUT 35s -> 20s: GAS hard execution limit is 30s, so 35s
  can never catch a live-but-killed session; 20s still covers slow
  legitimate responses (~5-10s) with margin.
- Pre-fill poll stagger 1s -> 100ms per slot: eliminates the 1s of dead
  time at every session startup (INFLIGHT_OPTIMIST=2 means 1 slot was
  always delayed by 1s).

domain_fronter.rs:
- POOL_TTL_SECS 60 -> 30: faster turnover when IP/DNS changes.
- POOL_REFILL_INTERVAL_SECS 5 -> 2: halves h1 pool recovery window
  after an h2 outage.
- H2_READY_TIMEOUT_SECS 5 -> 3: faster h1 fallback on saturated h2
  connections.
- H1_KEEPALIVE_INTERVAL_SECS 240 -> 60: keeps GAS containers warm
  after 1-min idle instead of 4-min; eliminates 1-3s cold-start
  penalty for users who pause streaming. Quota cost is ~1,080
  extra invocations/day (86,400/60 = 1,440 pings/day vs.
  86,400/240 = 360), well under the free-tier 6M/day limit.
- H2 flow-control windows 4MB/8MB -> 16MB/32MB: eliminates flow-
  control stalls during range-parallel streaming (256 KB chunks).
  Memory overhead is zero on idle pooled connections.
- Body Vec pre-sized from content-length header: avoids O(log n)
  realloc-and-copy cycles on large GAS responses (up to 40 MB).
diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs
@@ -95,9 +95,9 @@ impl FronterError {
 }
 
 type PooledStream = TlsStream<TcpStream>;
-const POOL_TTL_SECS: u64 = 60;
+const POOL_TTL_SECS: u64 = 30;
 const POOL_MIN: usize = 8;
-const POOL_REFILL_INTERVAL_SECS: u64 = 5;
+const POOL_REFILL_INTERVAL_SECS: u64 = 2;
 const POOL_MAX: usize = 80;
 const REQUEST_TIMEOUT_SECS: u64 = 25;
 const RANGE_PARALLEL_CHUNK_BYTES: u64 = 256 * 1024;
@@ -118,7 +118,7 @@ const H2_CONN_TTL_SECS: u64 = 540;
 /// `h2_round_trip`. This way a slow but legitimate Apps Script call
 /// isn't cut off at an arbitrary fixed cap, and Full-mode batches can
 /// honor the user's `request_timeout_secs` setting.
-const H2_READY_TIMEOUT_SECS: u64 = 5;
+const H2_READY_TIMEOUT_SECS: u64 = 3;
 /// Default response-phase deadline used by `relay_uncoalesced` callers
 /// (the Apps-Script direct path). Sized to be just under the outer
 /// `REQUEST_TIMEOUT_SECS` (25 s) so an h2 timeout still leaves a few
@@ -147,7 +147,7 @@ const H1_OPEN_TIMEOUT_SECS: u64 = 8;
 /// containers go cold after ~5min idle and cost 1-3s on the first
 /// request to wake back up — most painful on YouTube / streaming where
 /// the first chunk after a quiet pause stalls the player.
-const H1_KEEPALIVE_INTERVAL_SECS: u64 = 240;
+const H1_KEEPALIVE_INTERVAL_SECS: u64 = 60;
 /// Largest response body Apps Script's `UrlFetchApp` will deliver before
 /// the script gets killed mid-execution. The hard wire ceiling is ~50 MiB;
 /// after base64 / envelope overhead and edge variance, the practical raw
@@ -1405,10 +1405,13 @@ impl DomainFronter {
         // `release_capacity` on every chunk for typical Apps Script
         // payloads (usually < 1 MB; range chunks are 256 KB). We still
         // release capacity in the body-read loop for safety on larger
-        // bodies.
+        // bodies. 16/32 MB windows eliminate stalls for range-parallel
+        // streaming (256 KB chunks × many streams) without adding memory
+        // overhead on idle connections (the window is just a counter until
+        // data flows).
         let (send, conn) = h2::client::Builder::new()
-            .initial_window_size(4 * 1024 * 1024)
-            .initial_connection_window_size(8 * 1024 * 1024)
+            .initial_window_size(16 * 1024 * 1024)
+            .initial_connection_window_size(32 * 1024 * 1024)
             .handshake(tls)
             .await
             .map_err(|e| OpenH2Error::Handshake(e.to_string()))?;
@@ -1626,9 +1629,15 @@ impl DomainFronter {
         // through Apps Script (where a 256 KB range chunk can take 30-90s
         // of wall-clock time) are not killed by the tighter `batch_timeout`.
         // Release flow-control credit per chunk so large responses don't
-        // stall after the initial 4 MB window.
+        // stall after the initial window.
+        // Pre-size from content-length to avoid O(log n) realloc cycles
+        // on large GAS responses (up to 40 MB).
         let stream_timeout = self.stream_timeout();
-        let mut buf: Vec<u8> = Vec::new();
+        let body_hint: usize = headers.iter()
+            .find(|(k, _)| k.eq_ignore_ascii_case("content-length"))
+            .and_then(|(_, v)| v.parse().ok())
+            .unwrap_or(0);
+        let mut buf: Vec<u8> = Vec::with_capacity(body_hint.min(APPS_SCRIPT_BODY_MAX_BYTES as usize));
         loop {
             match tokio::time::timeout(stream_timeout, body.data()).await {
                 Ok(None) => break,
diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs
@@ -56,7 +56,7 @@ const REPLY_TIMEOUT_SLACK: Duration = Duration::from_secs(5);
 /// Per-inflight reply timeout used by the pipelined poll loop. Each
 /// in-flight future independently times out after this duration so a
 /// dead target on the tunnel-node side doesn't block the session.
-const REPLY_TIMEOUT: Duration = Duration::from_secs(35);
+const REPLY_TIMEOUT: Duration = Duration::from_secs(20);
 
 /// How long we'll briefly hold the client socket after the local
 /// CONNECT/SOCKS5 handshake, waiting for the client's first bytes (the
@@ -1543,13 +1543,14 @@ async fn tunnel_loop(
     }
 
     // Send initial pre-fill empty polls (optimist depth), staggered
-    // 1s apart so they land in separate batches. The pending data op
+    // 100ms apart so they land in separate batches without blocking
+    // session startup for a full second per slot. The pending data op
     // (if any) already occupies one slot.
     {
         let polls_to_send = max_inflight.saturating_sub(inflight.len());
         for i in 0..polls_to_send {
             if i > 0 {
-                tokio::time::sleep(Duration::from_secs(1)).await;
+                tokio::time::sleep(Duration::from_millis(100)).await;
             }
             let (meta, reply_rx) = send_empty_poll(sid, &mut next_send_seq, mux);
             tracing::debug!(