Skip to content
1 change: 1 addition & 0 deletions crates/larql-server/examples/bench_expert_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ fn make_app_state(model: LoadedModel) -> Arc<AppState> {
api_key: None,
sessions: SessionManager::new(3600),
describe_cache: DescribeCache::new(60),
infer_timeout: std::time::Duration::from_secs(60),
})
}

Expand Down
1 change: 1 addition & 0 deletions crates/larql-server/examples/openai_demo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ fn make_app_state(model: LoadedModel) -> Arc<AppState> {
api_key: None,
sessions: SessionManager::new(3600),
describe_cache: DescribeCache::new(60),
infer_timeout: std::time::Duration::from_secs(60),
})
}

Expand Down
128 changes: 128 additions & 0 deletions crates/larql-server/src/bootstrap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ pub fn load_single_vindex(
embed_store,
release_mmap_after_request: opts.release_mmap_after_request,
weights: std::sync::OnceLock::new(),
weights_init: std::sync::Mutex::new(()),
probe_labels,
ffn_l2_cache: crate::ffn_l2_cache::FfnL2Cache::new(num_layers),
layer_latency_tracker: std::sync::Arc::new(crate::metrics::LayerLatencyTracker::new()),
Expand Down Expand Up @@ -423,6 +424,59 @@ pub struct Cli {
#[arg(long)]
pub no_infer: bool,

/// Defer model-weight loading until the first `/v1/infer` (or
/// other inference) request, instead of loading at startup.
///
/// The eager startup load is the default because:
///
/// - Lazy load happens on a request thread under HTTP handler
/// backpressure, and a 5+ GB allocation under cgroup pressure
/// reliably triggers an OOM-kill on memory-constrained hosts
/// (see `BUG-infer-deadlock.md`). Eager load surfaces the
/// same condition as a clean startup failure that systemd
/// reports loudly, *before* the listener binds.
/// - Lazy first-callers double-allocated until the single-flight
/// `weights_init` guard landed; eager load avoids that path
/// entirely on hosts where every inference call is going to
/// trigger the load anyway.
///
/// Pass this flag if you want the historical lazy behaviour
/// (e.g. for `--ffn-only` boxes that *might* be promoted to
/// inference later, or in tests).
///
/// Note: `--lazy-weights` also skips the startup memory
/// pre-flight check (there is nothing to size before the
/// deferred load), so a too-small-RAM condition surfaces on the
/// first request rather than at startup.
#[arg(long)]
pub lazy_weights: bool,

/// Skip the startup cgroup memory pre-flight check (BUG
/// `infer-deadlock-oom` §5.5). By default the server reads
/// `/sys/fs/cgroup/<self>/memory.max` and refuses to start when
/// the combined estimated resident size of all inference-enabled
/// vindexes plus the headroom reserve (`--memcheck-headroom-mib`,
/// default 512 MiB) exceeds the limit. Pass `--no-memcheck` to
/// override (e.g. for cases where the estimate is wrong, or when
/// running in an environment without cgroup v2).
#[arg(long)]
pub no_memcheck: bool,

/// Headroom (MiB) to reserve below `memory.max` for the OS,
/// allocator overhead, and the request-handling working set.
/// Used by the startup pre-flight; ignored when `--no-memcheck`
/// or `--lazy-weights` is set (both skip the pre-flight).
#[arg(long, default_value_t = 512)]
pub memcheck_headroom_mib: u64,

/// Per-request hard timeout for `/v1/infer` and other inference
/// endpoints, in seconds. When an inference request runs longer
/// than this, the handler responds 504 Gateway Timeout and drops
/// the `spawn_blocking` JoinHandle. The blocking thread runs to
/// completion in the background; its result is discarded.
/// Set to 0 to disable. See BUG-infer-deadlock §5.6.
#[arg(long, default_value_t = 60)]
pub infer_timeout_secs: u64,

/// Run as an FFN-service endpoint for remote `RemoteWalkBackend`
/// clients. Disables `/v1/infer` (like `--no-infer`) and advertises
/// `mode: ffn-service` in `/v1/stats`. This is Act 2 of the demo —
Expand Down Expand Up @@ -869,6 +923,73 @@ pub async fn serve(cli: Cli) -> Result<(), BoxError> {
return Err("no vindexes loaded".into());
}

// Cgroup memory pre-flight (BUG-infer-deadlock §5.5). Refuses to
// start when the configured cgroup leaves no room to load weights;
// converts a 10-second OOM-kill loop into a one-line startup error.
if !cli.no_memcheck && !cli.lazy_weights {
let total_estimate: u64 = models
.iter()
.filter(|m| !m.infer_disabled)
.map(|m| m.config.estimate_resident_bytes())
.sum();
if total_estimate > 0 {
let headroom = cli.memcheck_headroom_mib * 1024 * 1024;
let outcome = crate::memcheck::check_memory_headroom(total_estimate, headroom);
match &outcome {
crate::memcheck::MemCheckOutcome::Ok {
cgroup_max_bytes,
estimate_bytes,
} => {
info!(
"Memcheck: estimated {:.1} GB resident vs cgroup memory.max {:.1} GB \
(headroom {} MiB, ok)",
(*estimate_bytes as f64) / (1024.0 * 1024.0 * 1024.0),
(*cgroup_max_bytes as f64) / (1024.0 * 1024.0 * 1024.0),
cli.memcheck_headroom_mib,
);
}
crate::memcheck::MemCheckOutcome::Skipped { reason } => {
info!("Memcheck: skipped ({reason})");
}
crate::memcheck::MemCheckOutcome::Tight { .. } => {
return Err(crate::memcheck::explain_tight_outcome(&outcome).into());
}
}
}
} else if cli.no_memcheck {
info!("Memcheck: disabled (--no-memcheck)");
}

// Eager-load model weights at startup so the first /v1/infer
// request does not face a multi-GB allocation under HTTP-handler
// backpressure. Failure here is a clean startup error rather
// than an OOM-kill during the first request. See
// `BUG-infer-deadlock.md` and `LoadedModel::force_load_weights`.
if cli.lazy_weights {
info!("Lazy weight load: enabled (--lazy-weights)");
} else {
for m in &models {
if m.infer_disabled {
continue;
}
let load_start = std::time::Instant::now();
info!("Pre-loading model weights for '{}' …", m.id);
if let Err(e) = m.force_load_weights() {
return Err(format!(
"failed to load weights for '{}': {} \
(pass --lazy-weights to defer until first request)",
m.id, e
)
.into());
}
info!(
" Pre-loaded weights for '{}' in {:.1}s",
m.id,
load_start.elapsed().as_secs_f64(),
);
}
}

let rate_limiter =
cli.rate_limit
.as_ref()
Expand All @@ -893,8 +1014,15 @@ pub async fn serve(cli: Cli) -> Result<(), BoxError> {
api_key: cli.api_key.clone(),
sessions: SessionManager::new(DEFAULT_SESSION_TTL_SECS),
describe_cache: DescribeCache::new(cli.cache_ttl),
infer_timeout: std::time::Duration::from_secs(cli.infer_timeout_secs),
});

if cli.infer_timeout_secs == 0 {
info!("Infer timeout: disabled");
} else {
info!("Infer timeout: {}s", cli.infer_timeout_secs);
}

if cli.cache_ttl > 0 {
info!("DESCRIBE cache: {}s TTL", cli.cache_ttl);
}
Expand Down
11 changes: 11 additions & 0 deletions crates/larql-server/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ pub enum ServerError {

#[error("internal error: {0}")]
Internal(String),

/// Inference handler exceeded the server-side deadline. We drop
/// the in-flight `spawn_blocking` future, log the original
/// elapsed time, and respond `504 Gateway Timeout` so the
/// client can decide whether to retry. The blocking thread
/// keeps running to completion in the background — we don't
/// have cooperative cancellation on the inference path — but it
/// no longer holds up the HTTP handler or the next request.
#[error("inference timed out: {0}")]
Timeout(String),
}

impl IntoResponse for ServerError {
Expand All @@ -37,6 +47,7 @@ impl IntoResponse for ServerError {
(StatusCode::SERVICE_UNAVAILABLE, msg.clone())
}
ServerError::Internal(msg) => (StatusCode::INTERNAL_SERVER_ERROR, msg.clone()),
ServerError::Timeout(msg) => (StatusCode::GATEWAY_TIMEOUT, msg.clone()),
};

(status, axum::Json(ErrorBody { error: message })).into_response()
Expand Down
1 change: 1 addition & 0 deletions crates/larql-server/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub mod ffn_l2_cache;
pub mod grpc;
pub mod grpc_expert;
pub mod http;
pub mod memcheck;
pub mod metrics;
pub mod openapi;
pub mod ratelimit;
Expand Down
Loading
Loading