diff --git a/Cargo.lock b/Cargo.lock
index 02c03811..611d1125 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -200,43 +200,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "async-nats"
-version = "0.45.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
-dependencies = [
- "base64 0.22.1",
- "bytes",
- "futures-util",
- "memchr",
- "nkeys",
- "nuid",
- "once_cell",
- "pin-project",
- "portable-atomic",
- "rand 0.8.6",
- "regex",
- "ring",
- "rustls-native-certs 0.7.3",
- "rustls-pemfile",
- "rustls-webpki 0.102.8",
- "serde",
- "serde_json",
- "serde_nanos",
- "serde_repr",
- "thiserror 1.0.69",
- "time",
- "tokio",
- "tokio-rustls 0.26.4",
- "tokio-stream",
- "tokio-util",
- "tokio-websockets",
- "tracing",
- "tryhard",
- "url",
-]
-
 [[package]]
 name = "async-stream"
 version = "0.3.6"
@@ -316,15 +279,6 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "atomic"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340"
-dependencies = [
- "bytemuck",
-]
-
 [[package]]
 name = "atomic-polyfill"
 version = "1.0.3"
@@ -389,448 +343,6 @@ dependencies = [
  "arrayvec",
 ]
 
-[[package]]
-name = "aws-config"
-version = "1.8.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-sdk-sso",
- "aws-sdk-ssooidc",
- "aws-sdk-sts",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "fastrand",
- "hex",
- "http 1.4.0",
- "ring",
- "time",
- "tokio",
- "tracing",
- "url",
- "zeroize",
-]
-
-[[package]]
-name = "aws-credential-types"
-version = "1.2.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e26bbf46abc608f2dc61fd6cb3b7b0665497cc259a21520151ed98f8b37d2c79"
-dependencies = [
- "aws-smithy-async",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "zeroize",
-]
-
-[[package]]
-name = "aws-lc-rs"
-version = "1.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00"
-dependencies = [
- "aws-lc-sys",
- "zeroize",
-]
-
-[[package]]
-name = "aws-lc-sys"
-version = "0.41.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4"
-dependencies = [
- "cc",
- "cmake",
- "dunce",
- "fs_extra",
-]
-
-[[package]]
-name = "aws-runtime"
-version = "1.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0f92058d22a46adf53ec57a6a96f34447daf02bff52e8fb956c66bcd5c6ac12"
-dependencies = [
- "aws-credential-types",
- "aws-sigv4",
- "aws-smithy-async",
- "aws-smithy-eventstream",
- "aws-smithy-http",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "bytes-utils",
- "fastrand",
- "http 0.2.12",
- "http 1.4.0",
- "http-body 0.4.6",
- "http-body 1.0.1",
- "percent-encoding",
- "pin-project-lite",
- "tracing",
- "uuid",
-]
-
-[[package]]
-name = "aws-sdk-s3"
-version = "1.123.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c018f22146966fdd493a664f62ee2483dff256b42a08c125ab6a084bde7b77fe"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-sigv4",
- "aws-smithy-async",
- "aws-smithy-checksums",
- "aws-smithy-eventstream",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-observability",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-smithy-xml",
- "aws-types",
- "bytes",
- "fastrand",
- "hex",
- "hmac",
- "http 0.2.12",
- "http 1.4.0",
- "http-body 1.0.1",
- "lru 0.16.4",
- "percent-encoding",
- "regex-lite",
- "sha2 0.10.9",
- "tracing",
- "url",
-]
-
-[[package]]
-name = "aws-sdk-sso"
-version = "1.94.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "699da1961a289b23842d88fe2984c6ff68735fdf9bdcbc69ceaeb2491c9bf434"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-observability",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "fastrand",
- "http 0.2.12",
- "http 1.4.0",
- "regex-lite",
- "tracing",
-]
-
-[[package]]
-name = "aws-sdk-ssooidc"
-version = "1.96.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3e3a4cb3b124833eafea9afd1a6cc5f8ddf3efefffc6651ef76a03cbc6b4981"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-observability",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "fastrand",
- "http 0.2.12",
- "http 1.4.0",
- "regex-lite",
- "tracing",
-]
-
-[[package]]
-name = "aws-sdk-sts"
-version = "1.98.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89c4f19655ab0856375e169865c91264de965bd74c407c7f1e403184b1049409"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-observability",
- "aws-smithy-query",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-smithy-xml",
- "aws-types",
- "fastrand",
- "http 0.2.12",
- "http 1.4.0",
- "regex-lite",
- "tracing",
-]
-
-[[package]]
-name = "aws-sigv4"
-version = "1.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68f6ae9b71597dc5fd115d52849d7a5556ad9265885ad3492ea8d73b93bbc46e"
-dependencies = [
- "aws-credential-types",
- "aws-smithy-eventstream",
- "aws-smithy-http",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "bytes",
- "crypto-bigint 0.5.5",
- "form_urlencoded",
- "hex",
- "hmac",
- "http 0.2.12",
- "http 1.4.0",
- "p256",
- "percent-encoding",
- "ring",
- "sha2 0.10.9",
- "subtle",
- "time",
- "tracing",
- "zeroize",
-]
-
-[[package]]
-name = "aws-smithy-async"
-version = "1.2.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cba48474f1d6807384d06fec085b909f5807e16653c5af5c45dfe89539f0b70"
-dependencies = [
- "futures-util",
- "pin-project-lite",
- "tokio",
-]
-
-[[package]]
-name = "aws-smithy-checksums"
-version = "0.64.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a764fa7222922f6c0af8eea478b0ef1ba5ce1222af97e01f33ca5e957bd7f3b9"
-dependencies = [
- "aws-smithy-http",
- "aws-smithy-types",
- "bytes",
- "crc-fast",
- "hex",
- "http 1.4.0",
- "http-body 1.0.1",
- "http-body-util",
- "md-5",
- "pin-project-lite",
- "sha1",
- "sha2 0.10.9",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-eventstream"
-version = "0.60.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c0b3e587fbaa5d7f7e870544508af8ce82ea47cd30376e69e1e37c4ac746f79"
-dependencies = [
- "aws-smithy-types",
- "bytes",
- "crc32fast",
-]
-
-[[package]]
-name = "aws-smithy-http"
-version = "0.63.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af4a8a5fe3e4ac7ee871237c340bbce13e982d37543b65700f4419e039f5d78e"
-dependencies = [
- "aws-smithy-eventstream",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "bytes",
- "bytes-utils",
- "futures-core",
- "futures-util",
- "http 1.4.0",
- "http-body 1.0.1",
- "http-body-util",
- "percent-encoding",
- "pin-project-lite",
- "pin-utils",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-http-client"
-version = "1.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0709f0083aa19b704132684bc26d3c868e06bd428ccc4373b0b55c3e8748a58b"
-dependencies = [
- "aws-smithy-async",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "h2 0.3.27",
- "h2 0.4.14",
- "http 0.2.12",
- "http 1.4.0",
- "http-body 0.4.6",
- "hyper 0.14.32",
- "hyper 1.9.0",
- "hyper-rustls 0.24.2",
- "hyper-rustls 0.27.9",
- "hyper-util",
- "pin-project-lite",
- "rustls 0.21.12",
- "rustls 0.23.40",
- "rustls-native-certs 0.8.3",
- "rustls-pki-types",
- "tokio",
- "tokio-rustls 0.26.4",
- "tower",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-json"
-version = "0.62.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "27b3a779093e18cad88bbae08dc4261e1d95018c4c5b9356a52bcae7c0b6e9bb"
-dependencies = [
- "aws-smithy-types",
-]
-
-[[package]]
-name = "aws-smithy-observability"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d3f39d5bb871aaf461d59144557f16d5927a5248a983a40654d9cf3b9ba183b"
-dependencies = [
- "aws-smithy-runtime-api",
-]
-
-[[package]]
-name = "aws-smithy-query"
-version = "0.60.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05f76a580e3d8f8961e5d48763214025a2af65c2fa4cd1fb7f270a0e107a71b0"
-dependencies = [
- "aws-smithy-types",
- "urlencoding",
-]
-
-[[package]]
-name = "aws-smithy-runtime"
-version = "1.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fd3dfc18c1ce097cf81fced7192731e63809829c6cbf933c1ec47452d08e1aa"
-dependencies = [
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-http-client",
- "aws-smithy-observability",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "bytes",
- "fastrand",
- "http 0.2.12",
- "http 1.4.0",
- "http-body 0.4.6",
- "http-body 1.0.1",
- "http-body-util",
- "pin-project-lite",
- "pin-utils",
- "tokio",
- "tracing",
-]
-
-[[package]]
-name = "aws-smithy-runtime-api"
-version = "1.11.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c55e0837e9b8526f49e0b9bfa9ee18ddee70e853f5bc09c5d11ebceddcb0fec"
-dependencies = [
- "aws-smithy-async",
- "aws-smithy-types",
- "bytes",
- "http 0.2.12",
- "http 1.4.0",
- "pin-project-lite",
- "tokio",
- "tracing",
- "zeroize",
-]
-
-[[package]]
-name = "aws-smithy-types"
-version = "1.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "576b0d6991c9c32bc14fc340582ef148311f924d41815f641a308b5d11e8e7cd"
-dependencies = [
- "base64-simd",
- "bytes",
- "bytes-utils",
- "futures-core",
- "http 0.2.12",
- "http 1.4.0",
- "http-body 0.4.6",
- "http-body 1.0.1",
- "http-body-util",
- "itoa",
- "num-integer",
- "pin-project-lite",
- "pin-utils",
- "ryu",
- "serde",
- "time",
- "tokio",
- "tokio-util",
-]
-
-[[package]]
-name = "aws-smithy-xml"
-version = "0.60.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3"
-dependencies = [
- "xmlparser",
-]
-
-[[package]]
-name = "aws-types"
-version = "1.3.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c50f3cdf47caa8d01f2be4a6663ea02418e892f9bbfd82c7b9a3a37eaccdd3a"
-dependencies = [
- "aws-credential-types",
- "aws-smithy-async",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "rustc_version",
- "tracing",
-]
-
 [[package]]
 name = "axum"
 version = "0.8.9"
@@ -841,10 +353,10 @@ dependencies = [
  "bytes",
  "form_urlencoded",
  "futures-util",
- "http 1.4.0",
- "http-body 1.0.1",
+ "http",
+ "http-body",
  "http-body-util",
- "hyper 1.9.0",
+ "hyper",
  "hyper-util",
  "itoa",
  "matchit",
@@ -872,8 +384,8 @@ checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
 dependencies = [
  "bytes",
  "futures-core",
- "http 1.4.0",
- "http-body 1.0.1",
+ "http",
+ "http-body",
  "http-body-util",
  "mime",
  "pin-project-lite",
@@ -883,12 +395,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "base16ct"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
-
 [[package]]
 name = "base64"
 version = "0.13.1"
@@ -901,16 +407,6 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
-[[package]]
-name = "base64-simd"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195"
-dependencies = [
- "outref",
- "vsimd",
-]
-
 [[package]]
 name = "base64ct"
 version = "1.8.3"
@@ -1157,16 +653,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "bytes-utils"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35"
-dependencies = [
- "bytes",
- "either",
-]
-
 [[package]]
 name = "bytesize"
 version = "2.3.1"
@@ -1249,10 +735,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
  "iana-time-zone",
- "js-sys",
  "num-traits",
  "serde",
- "wasm-bindgen",
  "windows-link",
 ]
 
@@ -1334,15 +818,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
 
-[[package]]
-name = "cmake"
-version = "0.1.58"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "cobs"
 version = "0.3.0"
@@ -1444,12 +919,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "const-oid"
-version = "0.9.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
-
 [[package]]
 name = "const-oid"
 version = "0.10.2"
@@ -1535,33 +1004,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "crc"
-version = "3.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675"
-dependencies = [
- "crc-catalog",
-]
-
-[[package]]
-name = "crc-catalog"
-version = "2.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853"
-
-[[package]]
-name = "crc-fast"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d"
-dependencies = [
- "crc",
- "digest 0.10.7",
- "rustversion",
- "spin 0.10.0",
-]
-
 [[package]]
 name = "crc32fast"
 version = "1.5.0"
@@ -1733,28 +1175,6 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
 
-[[package]]
-name = "crypto-bigint"
-version = "0.4.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef"
-dependencies = [
- "generic-array",
- "rand_core 0.6.4",
- "subtle",
- "zeroize",
-]
-
-[[package]]
-name = "crypto-bigint"
-version = "0.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
-dependencies = [
- "rand_core 0.6.4",
- "subtle",
-]
-
 [[package]]
 name = "crypto-common"
 version = "0.1.7"
@@ -1784,32 +1204,6 @@ dependencies = [
  "libloading 0.9.0",
 ]
 
-[[package]]
-name = "curve25519-dalek"
-version = "4.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
-dependencies = [
- "cfg-if",
- "cpufeatures 0.2.17",
- "curve25519-dalek-derive",
- "digest 0.10.7",
- "fiat-crypto",
- "rustc_version",
- "subtle",
-]
-
-[[package]]
-name = "curve25519-dalek-derive"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "cxx"
 version = "1.0.194"
@@ -1970,40 +1364,13 @@ dependencies = [
  "parking_lot_core",
 ]
 
-[[package]]
-name = "data-encoding"
-version = "2.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8"
-
-[[package]]
-name = "der"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de"
-dependencies = [
- "const-oid 0.9.6",
- "zeroize",
-]
-
-[[package]]
-name = "der"
-version = "0.7.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
-dependencies = [
- "const-oid 0.9.6",
- "pem-rfc7468 0.7.0",
- "zeroize",
-]
-
 [[package]]
 name = "der"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b"
 dependencies = [
- "pem-rfc7468 1.0.0",
+ "pem-rfc7468",
  "zeroize",
 ]
 
@@ -2073,7 +1440,6 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
  "block-buffer 0.10.4",
  "crypto-common 0.1.7",
- "subtle",
 ]
 
 [[package]]
@@ -2083,7 +1449,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
 dependencies = [
  "block-buffer 0.12.0",
- "const-oid 0.10.2",
+ "const-oid",
  "crypto-common 0.2.1",
 ]
 
@@ -2161,12 +1527,6 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
 
-[[package]]
-name = "dunce"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.20"
@@ -2195,7 +1555,7 @@ dependencies = [
  "dashmap",
  "derive_builder",
  "dynamo-tokens",
- "flume 0.12.0",
+ "flume",
  "ordered-float",
  "parking_lot",
  "prometheus",
@@ -2259,66 +1619,12 @@ version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8072bec12b909b65aec01fa6518f387cfbf3427d4475409ad622898cd347522c"
 
-[[package]]
-name = "ecdsa"
-version = "0.14.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
-dependencies = [
- "der 0.6.1",
- "elliptic-curve",
- "rfc6979",
- "signature 1.6.4",
-]
-
-[[package]]
-name = "ed25519"
-version = "2.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
-dependencies = [
- "signature 2.2.0",
-]
-
-[[package]]
-name = "ed25519-dalek"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9"
-dependencies = [
- "curve25519-dalek",
- "ed25519",
- "sha2 0.10.9",
- "signature 2.2.0",
- "subtle",
-]
-
 [[package]]
 name = "either"
 version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
 
-[[package]]
-name = "elliptic-curve"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
-dependencies = [
- "base16ct",
- "crypto-bigint 0.4.9",
- "der 0.6.1",
- "digest 0.10.7",
- "ff",
- "generic-array",
- "group",
- "pkcs8 0.9.0",
- "rand_core 0.6.4",
- "sec1",
- "subtle",
- "zeroize",
-]
-
 [[package]]
 name = "embedded-io"
 version = "0.4.0"
@@ -2568,37 +1874,6 @@ dependencies = [
  "simd-adler32",
 ]
 
-[[package]]
-name = "ff"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160"
-dependencies = [
- "rand_core 0.6.4",
- "subtle",
-]
-
-[[package]]
-name = "fiat-crypto"
-version = "0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
-
-[[package]]
-name = "figment"
-version = "0.10.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3"
-dependencies = [
- "atomic",
- "pear",
- "serde",
- "serde_json",
- "toml 0.8.23",
- "uncased",
- "version_check",
-]
-
 [[package]]
 name = "find-msvc-tools"
 version = "0.1.9"
@@ -2621,18 +1896,6 @@ dependencies = [
  "miniz_oxide",
 ]
 
-[[package]]
-name = "flume"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095"
-dependencies = [
- "futures-core",
- "futures-sink",
- "nanorand",
- "spin 0.9.8",
-]
-
 [[package]]
 name = "flume"
 version = "0.12.0"
@@ -2642,7 +1905,7 @@ dependencies = [
  "fastrand",
  "futures-core",
  "futures-sink",
- "spin 0.9.8",
+ "spin",
 ]
 
 [[package]]
@@ -2696,23 +1959,6 @@ dependencies = [
  "futures-core",
 ]
 
-[[package]]
-name = "fs4"
-version = "0.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
-dependencies = [
- "rustix",
- "tokio",
- "windows-sys 0.59.0",
-]
-
-[[package]]
-name = "fs_extra"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
-
 [[package]]
 name = "futures"
 version = "0.3.32"
@@ -2884,36 +2130,6 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
 
-[[package]]
-name = "group"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
-dependencies = [
- "ff",
- "rand_core 0.6.4",
- "subtle",
-]
-
-[[package]]
-name = "h2"
-version = "0.3.27"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d"
-dependencies = [
- "bytes",
- "fnv",
- "futures-core",
- "futures-sink",
- "futures-util",
- "http 0.2.12",
- "indexmap 2.14.0",
- "slab",
- "tokio",
- "tokio-util",
- "tracing",
-]
-
 [[package]]
 name = "h2"
 version = "0.4.14"
@@ -2925,7 +2141,7 @@ dependencies = [
  "fnv",
  "futures-core",
  "futures-sink",
- "http 1.4.0",
+ "http",
  "indexmap 2.14.0",
  "slab",
  "tokio",
@@ -2975,8 +2191,6 @@ version = "0.15.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
 dependencies = [
- "allocator-api2",
- "equivalent",
  "foldhash 0.1.5",
 ]
 
@@ -2999,6 +2213,15 @@ version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
 
+[[package]]
+name = "hashlink"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824e001ac4f3012dd16a264bec811403a67ca9deb6c102fc5049b32c4574b35f"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
 [[package]]
 name = "heapless"
 version = "0.7.17"
@@ -3009,7 +2232,7 @@ dependencies = [
  "hash32",
  "rustc_version",
  "serde",
- "spin 0.9.8",
+ "spin",
  "stable_deref_trait",
 ]
 
@@ -3038,7 +2261,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
 dependencies = [
  "dirs",
- "http 1.4.0",
+ "http",
  "indicatif 0.17.11",
  "libc",
  "log",
@@ -3059,7 +2282,7 @@ checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213"
 dependencies = [
  "dirs",
  "futures",
- "http 1.4.0",
+ "http",
  "indicatif 0.18.4",
  "libc",
  "log",
@@ -3075,32 +2298,12 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "hmac"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
-dependencies = [
- "digest 0.10.7",
-]
-
 [[package]]
 name = "hound"
 version = "3.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
 
-[[package]]
-name = "http"
-version = "0.2.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
-dependencies = [
- "bytes",
- "fnv",
- "itoa",
-]
-
 [[package]]
 name = "http"
 version = "1.4.0"
@@ -3111,17 +2314,6 @@ dependencies = [
  "itoa",
 ]
 
-[[package]]
-name = "http-body"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2"
-dependencies = [
- "bytes",
- "http 0.2.12",
- "pin-project-lite",
-]
-
 [[package]]
 name = "http-body"
 version = "1.0.1"
@@ -3129,7 +2321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
 dependencies = [
  "bytes",
- "http 1.4.0",
+ "http",
 ]
 
 [[package]]
@@ -3140,8 +2332,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
 dependencies = [
  "bytes",
  "futures-core",
- "http 1.4.0",
- "http-body 1.0.1",
+ "http",
+ "http-body",
  "pin-project-lite",
 ]
 
@@ -3152,42 +2344,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
 
 [[package]]
-name = "httpdate"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
-
-[[package]]
-name = "hybrid-array"
-version = "0.4.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5"
-dependencies = [
- "typenum",
-]
-
-[[package]]
-name = "hyper"
-version = "0.14.32"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7"
-dependencies = [
- "bytes",
- "futures-channel",
- "futures-core",
- "futures-util",
- "h2 0.3.27",
- "http 0.2.12",
- "http-body 0.4.6",
- "httparse",
- "httpdate",
- "itoa",
- "pin-project-lite",
- "socket2 0.5.10",
- "tokio",
- "tower-service",
- "tracing",
- "want",
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
+[[package]]
+name = "hybrid-array"
+version = "0.4.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5"
+dependencies = [
+ "typenum",
 ]
 
 [[package]]
@@ -3200,9 +2368,9 @@ dependencies = [
  "bytes",
  "futures-channel",
  "futures-core",
- "h2 0.4.14",
- "http 1.4.0",
- "http-body 1.0.1",
+ "h2",
+ "http",
+ "http-body",
  "httparse",
  "httpdate",
  "itoa",
@@ -3212,34 +2380,18 @@ dependencies = [
  "want",
 ]
 
-[[package]]
-name = "hyper-rustls"
-version = "0.24.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590"
-dependencies = [
- "futures-util",
- "http 0.2.12",
- "hyper 0.14.32",
- "log",
- "rustls 0.21.12",
- "tokio",
- "tokio-rustls 0.24.1",
-]
-
 [[package]]
 name = "hyper-rustls"
 version = "0.27.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f"
 dependencies = [
- "http 1.4.0",
- "hyper 1.9.0",
+ "http",
+ "hyper",
  "hyper-util",
- "rustls 0.23.40",
- "rustls-native-certs 0.8.3",
+ "rustls",
  "tokio",
- "tokio-rustls 0.26.4",
+ "tokio-rustls",
  "tower-service",
  "webpki-roots 1.0.7",
 ]
@@ -3250,7 +2402,7 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
 dependencies = [
- "hyper 1.9.0",
+ "hyper",
  "hyper-util",
  "pin-project-lite",
  "tokio",
@@ -3265,7 +2417,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
 dependencies = [
  "bytes",
  "http-body-util",
- "hyper 1.9.0",
+ "hyper",
  "hyper-util",
  "native-tls",
  "tokio",
@@ -3283,14 +2435,14 @@ dependencies = [
  "bytes",
  "futures-channel",
  "futures-util",
- "http 1.4.0",
- "http-body 1.0.1",
- "hyper 1.9.0",
+ "http",
+ "http-body",
+ "hyper",
  "ipnet",
  "libc",
  "percent-encoding",
  "pin-project-lite",
- "socket2 0.6.3",
+ "socket2",
  "system-configuration",
  "tokio",
  "tower-service",
@@ -3538,12 +2690,6 @@ dependencies = [
  "rustversion",
 ]
 
-[[package]]
-name = "inlinable_string"
-version = "0.1.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb"
-
 [[package]]
 name = "insta"
 version = "1.47.2"
@@ -3570,6 +2716,17 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "io-uring"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62"
+dependencies = [
+ "bitflags 2.11.1",
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.12.0"
@@ -3717,26 +2874,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "kvbm-config"
-version = "1.2.0"
-dependencies = [
- "anyhow",
- "dynamo-memory",
- "figment",
- "nix 0.30.1",
- "nvtx",
- "rayon",
- "serde",
- "serde_json",
- "temp-env",
- "thiserror 2.0.18",
- "tokio",
- "tracing",
- "validator",
- "velo",
-]
-
 [[package]]
 name = "kvbm-consolidator"
 version = "1.2.0"
@@ -3764,46 +2901,6 @@ dependencies = [
  "tracing-test",
 ]
 
-[[package]]
-name = "kvbm-engine"
-version = "1.2.0"
-dependencies = [
- "anyhow",
- "async-nats",
- "aws-config",
- "aws-sdk-s3",
- "bytes",
- "chrono",
- "clap",
- "crossbeam-queue",
- "cudarc",
- "dashmap",
- "derive_builder",
- "dynamo-memory",
- "figment",
- "flume 0.11.1",
- "futures",
- "kvbm-common",
- "kvbm-config",
- "kvbm-logical",
- "kvbm-physical",
- "libc",
- "nvtx",
- "oneshot",
- "parking_lot",
- "rayon",
- "rmp-serde",
- "serde",
- "serde_json",
- "tokio",
- "tokio-rayon",
- "tokio-stream",
- "tracing",
- "tracing-subscriber",
- "uuid",
- "velo",
-]
-
 [[package]]
 name = "kvbm-kernels"
 version = "1.2.0"
@@ -3828,7 +2925,7 @@ dependencies = [
  "dynamo-tokens",
  "futures",
  "indexmap 2.14.0",
- "lru 0.16.4",
+ "lru",
  "parking_lot",
  "prometheus",
  "proptest",
@@ -3847,32 +2944,6 @@ dependencies = [
  "xxhash-rust",
 ]
 
-[[package]]
-name = "kvbm-physical"
-version = "1.2.0"
-dependencies = [
- "aligned-vec",
- "anyhow",
- "bincode 2.0.1",
- "blake3",
- "cudarc",
- "derive-getters",
- "derive_builder",
- "dynamo-memory",
- "futures",
- "kvbm-common",
- "kvbm-kernels",
- "rstest 0.26.1",
- "serde",
- "serde_json",
- "thiserror 2.0.18",
- "tokio",
- "tracing",
- "uuid",
- "validator",
- "velo",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -4095,15 +3166,6 @@ dependencies = [
  "imgref",
 ]
 
-[[package]]
-name = "lru"
-version = "0.12.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
-dependencies = [
- "hashbrown 0.15.5",
-]
-
 [[package]]
 name = "lru"
 version = "0.16.4"
@@ -4171,13 +3233,12 @@ dependencies = [
 ]
 
 [[package]]
-name = "md-5"
-version = "0.10.6"
+name = "mea"
+version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
+checksum = "2640d335e7273dacdcf51044026139b2e269c3bb0dfc3f8cb3496b85e3f6a42c"
 dependencies = [
- "cfg-if",
- "digest 0.10.7",
+ "slab",
 ]
 
 [[package]]
@@ -4201,6 +3262,15 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
 
+[[package]]
+name = "memoffset"
+version = "0.6.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.9.1"
@@ -4339,15 +3409,6 @@ version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
 
-[[package]]
-name = "nanorand"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
-dependencies = [
- "getrandom 0.2.17",
-]
-
 [[package]]
 name = "native-tls"
 version = "0.2.18"
@@ -4357,10 +3418,10 @@ dependencies = [
  "libc",
  "log",
  "openssl",
- "openssl-probe 0.2.1",
+ "openssl-probe",
  "openssl-sys",
  "schannel",
- "security-framework 3.7.0",
+ "security-framework",
  "security-framework-sys",
  "tempfile",
 ]
@@ -4401,6 +3462,19 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
 
+[[package]]
+name = "nix"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
+dependencies = [
+ "bitflags 1.3.2",
+ "cc",
+ "cfg-if",
+ "libc",
+ "memoffset 0.6.5",
+]
+
 [[package]]
 name = "nix"
 version = "0.30.1"
@@ -4411,7 +3485,6 @@ dependencies = [
  "cfg-if",
  "cfg_aliases",
  "libc",
- "memoffset",
 ]
 
 [[package]]
@@ -4444,21 +3517,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "nkeys"
-version = "0.4.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
-dependencies = [
- "data-encoding",
- "ed25519",
- "ed25519-dalek",
- "getrandom 0.2.17",
- "log",
- "rand 0.8.6",
- "signatory",
-]
-
 [[package]]
 name = "no_std_io2"
 version = "0.9.4"
@@ -4508,15 +3566,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "nuid"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
-dependencies = [
- "rand 0.8.6",
-]
-
 [[package]]
 name = "num-bigint"
 version = "0.4.6"
@@ -4872,12 +3921,6 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "openssl-probe"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
-
 [[package]]
 name = "openssl-probe"
 version = "0.2.1"
@@ -4896,6 +3939,20 @@ dependencies = [
  "vcpkg",
 ]
 
+[[package]]
+name = "opentelemetry"
+version = "0.31.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+ "js-sys",
+ "pin-project-lite",
+ "thiserror 2.0.18",
+ "tracing",
+]
+
 [[package]]
 name = "option-ext"
 version = "0.2.0"
@@ -4927,23 +3984,6 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "outref"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e"
-
-[[package]]
-name = "p256"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
-dependencies = [
- "ecdsa",
- "elliptic-curve",
- "sha2 0.10.9",
-]
-
 [[package]]
 name = "page_size"
 version = "0.6.0"
@@ -5018,26 +4058,54 @@ dependencies = [
 ]
 
 [[package]]
-name = "pear"
-version = "0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdeeaa00ce488657faba8ebf44ab9361f9365a97bd39ffb8a60663f57ff4b467"
+name = "pegaflow-common"
+version = "0.22.6"
+source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b"
 dependencies = [
- "inlinable_string",
- "pear_codegen",
- "yansi",
+ "colored",
+ "libc",
+ "log",
+ "logforth",
 ]
 
 [[package]]
-name = "pear_codegen"
-version = "0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bab5b985dc082b345f812b7df84e1bef27e7207b39e448439ba8bd69c93f147"
+name = "pegaflow-core"
+version = "0.22.6"
+source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b"
 dependencies = [
- "proc-macro2",
- "proc-macro2-diagnostics",
- "quote",
- "syn 2.0.117",
+ "ahash",
+ "bytesize",
+ "cudarc",
+ "dashmap",
+ "futures",
+ "hashlink",
+ "io-uring",
+ "libc",
+ "log",
+ "logforth",
+ "mea",
+ "offset-allocator",
+ "opentelemetry",
+ "parking_lot",
+ "pegaflow-common",
+ "pegaflow-proto",
+ "rand 0.10.1",
+ "shared_memory",
+ "smallvec",
+ "tokio",
+ "tonic",
+ "uuid",
+]
+
+[[package]]
+name = "pegaflow-proto"
+version = "0.22.6"
+source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b"
+dependencies = [
+ "prost",
+ "tonic",
+ "tonic-prost",
+ "tonic-prost-build",
 ]
 
 [[package]]
@@ -5363,6 +4431,19 @@ dependencies = [
  "pegainfer-kernels",
 ]
 
+[[package]]
+name = "pegainfer-kv-offload"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "cudarc",
+ "half",
+ "log",
+ "pegaflow-core",
+ "pegainfer-kv-cache",
+ "tokio",
+]
+
 [[package]]
 name = "pegainfer-qwen3-4b"
 version = "0.1.0"
@@ -5381,6 +4462,7 @@ dependencies = [
  "pegainfer-cupti",
  "pegainfer-kernels",
  "pegainfer-kv-cache",
+ "pegainfer-kv-offload",
  "pegainfer-vllm-support",
  "rand 0.10.1",
  "safetensors",
@@ -5489,15 +4571,6 @@ dependencies = [
  "vllm-tokenizer",
 ]
 
-[[package]]
-name = "pem-rfc7468"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
-dependencies = [
- "base64ct",
-]
-
 [[package]]
 name = "pem-rfc7468"
 version = "1.0.0"
@@ -5556,16 +4629,6 @@ dependencies = [
  "sha2 0.10.9",
 ]
 
-[[package]]
-name = "petgraph"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
-dependencies = [
- "fixedbitset",
- "indexmap 2.14.0",
-]
-
 [[package]]
 name = "petgraph"
 version = "0.8.3"
@@ -5587,47 +4650,21 @@ dependencies = [
 ]
 
 [[package]]
-name = "pin-project-internal"
-version = "1.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a990e22f43e84855daf260dded30524ef4a9021cc7541c26540500a50b624389"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.117",
-]
-
-[[package]]
-name = "pin-project-lite"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
-
-[[package]]
-name = "pin-utils"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
-
-[[package]]
-name = "pkcs8"
-version = "0.9.0"
+name = "pin-project-internal"
+version = "1.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba"
+checksum = "a990e22f43e84855daf260dded30524ef4a9021cc7541c26540500a50b624389"
 dependencies = [
- "der 0.6.1",
- "spki 0.6.0",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
 ]
 
 [[package]]
-name = "pkcs8"
-version = "0.10.2"
+name = "pin-project-lite"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
-dependencies = [
- "der 0.7.10",
- "spki 0.7.3",
-]
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
 
 [[package]]
 name = "pkg-config"
@@ -5846,19 +4883,6 @@ dependencies = [
  "unicode-ident",
 ]
 
-[[package]]
-name = "proc-macro2-diagnostics"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.117",
- "version_check",
- "yansi",
-]
-
 [[package]]
 name = "profiling"
 version = "1.0.18"
@@ -5935,16 +4959,6 @@ dependencies = [
  "unarray",
 ]
 
-[[package]]
-name = "prost"
-version = "0.13.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
-dependencies = [
- "bytes",
- "prost-derive 0.13.5",
-]
-
 [[package]]
 name = "prost"
 version = "0.14.3"
@@ -5952,27 +4966,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
 dependencies = [
  "bytes",
- "prost-derive 0.14.3",
-]
-
-[[package]]
-name = "prost-build"
-version = "0.13.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf"
-dependencies = [
- "heck",
- "itertools 0.14.0",
- "log",
- "multimap",
- "once_cell",
- "petgraph 0.7.1",
- "prettyplease",
- "prost 0.13.5",
- "prost-types 0.13.5",
- "regex",
- "syn 2.0.117",
- "tempfile",
+ "prost-derive",
 ]
 
 [[package]]
@@ -5985,10 +4979,10 @@ dependencies = [
  "itertools 0.14.0",
  "log",
  "multimap",
- "petgraph 0.8.3",
+ "petgraph",
  "prettyplease",
- "prost 0.14.3",
- "prost-types 0.14.3",
+ "prost",
+ "prost-types",
  "pulldown-cmark",
  "pulldown-cmark-to-cmark",
  "regex",
@@ -5996,19 +4990,6 @@ dependencies = [
  "tempfile",
 ]
 
-[[package]]
-name = "prost-derive"
-version = "0.13.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
-dependencies = [
- "anyhow",
- "itertools 0.14.0",
- "proc-macro2",
- "quote",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "prost-derive"
 version = "0.14.3"
@@ -6022,22 +5003,13 @@ dependencies = [
  "syn 2.0.117",
 ]
 
-[[package]]
-name = "prost-types"
-version = "0.13.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
-dependencies = [
- "prost 0.13.5",
-]
-
 [[package]]
 name = "prost-types"
 version = "0.14.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7"
 dependencies = [
- "prost 0.14.3",
+ "prost",
 ]
 
 [[package]]
@@ -6095,7 +5067,7 @@ dependencies = [
  "anyhow",
  "indoc",
  "libc",
- "memoffset",
+ "memoffset 0.9.1",
  "once_cell",
  "portable-atomic",
  "pyo3-build-config",
@@ -6181,8 +5153,8 @@ dependencies = [
  "quinn-proto",
  "quinn-udp",
  "rustc-hash 2.1.2",
- "rustls 0.23.40",
- "socket2 0.6.3",
+ "rustls",
+ "socket2",
  "thiserror 2.0.18",
  "tokio",
  "tracing",
@@ -6201,7 +5173,7 @@ dependencies = [
  "rand 0.9.4",
  "ring",
  "rustc-hash 2.1.2",
- "rustls 0.23.40",
+ "rustls",
  "rustls-pki-types",
  "slab",
  "thiserror 2.0.18",
@@ -6219,7 +5191,7 @@ dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
- "socket2 0.6.3",
+ "socket2",
  "tracing",
  "windows-sys 0.60.2",
 ]
@@ -6499,12 +5471,6 @@ dependencies = [
  "regex-syntax",
 ]
 
-[[package]]
-name = "regex-lite"
-version = "0.1.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
-
 [[package]]
 name = "regex-syntax"
 version = "0.8.10"
@@ -6529,12 +5495,12 @@ dependencies = [
  "futures-channel",
  "futures-core",
  "futures-util",
- "h2 0.4.14",
- "http 1.4.0",
- "http-body 1.0.1",
+ "h2",
+ "http",
+ "http-body",
  "http-body-util",
- "hyper 1.9.0",
- "hyper-rustls 0.27.9",
+ "hyper",
+ "hyper-rustls",
  "hyper-tls",
  "hyper-util",
  "js-sys",
@@ -6545,7 +5511,7 @@ dependencies = [
  "percent-encoding",
  "pin-project-lite",
  "quinn",
- "rustls 0.23.40",
+ "rustls",
  "rustls-pki-types",
  "serde",
  "serde_json",
@@ -6553,7 +5519,7 @@ dependencies = [
  "sync_wrapper",
  "tokio",
  "tokio-native-tls",
- "tokio-rustls 0.26.4",
+ "tokio-rustls",
  "tokio-util",
  "tower",
  "tower-http",
@@ -6566,17 +5532,6 @@ dependencies = [
  "webpki-roots 1.0.7",
 ]
 
-[[package]]
-name = "rfc6979"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb"
-dependencies = [
- "crypto-bigint 0.4.9",
- "hmac",
- "zeroize",
-]
-
 [[package]]
 name = "rgb"
 version = "0.8.53"
@@ -6775,68 +5730,21 @@ dependencies = [
  "windows-sys 0.61.2",
 ]
 
-[[package]]
-name = "rustls"
-version = "0.21.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e"
-dependencies = [
- "log",
- "ring",
- "rustls-webpki 0.101.7",
- "sct",
-]
-
 [[package]]
 name = "rustls"
 version = "0.23.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b"
 dependencies = [
- "aws-lc-rs",
  "log",
  "once_cell",
  "ring",
  "rustls-pki-types",
- "rustls-webpki 0.103.13",
+ "rustls-webpki",
  "subtle",
  "zeroize",
 ]
 
-[[package]]
-name = "rustls-native-certs"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
-dependencies = [
- "openssl-probe 0.1.6",
- "rustls-pemfile",
- "rustls-pki-types",
- "schannel",
- "security-framework 2.11.1",
-]
-
-[[package]]
-name = "rustls-native-certs"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
-dependencies = [
- "openssl-probe 0.2.1",
- "rustls-pki-types",
- "schannel",
- "security-framework 3.7.0",
-]
-
-[[package]]
-name = "rustls-pemfile"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
-dependencies = [
- "rustls-pki-types",
-]
-
 [[package]]
 name = "rustls-pki-types"
 version = "1.14.1"
@@ -6847,33 +5755,12 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "rustls-webpki"
-version = "0.101.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765"
-dependencies = [
- "ring",
- "untrusted",
-]
-
-[[package]]
-name = "rustls-webpki"
-version = "0.102.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
-dependencies = [
- "rustls-pki-types",
- "untrusted",
-]
-
 [[package]]
 name = "rustls-webpki"
 version = "0.103.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
 dependencies = [
- "aws-lc-rs",
  "ring",
  "rustls-pki-types",
  "untrusted",
@@ -6984,16 +5871,6 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2"
 
-[[package]]
-name = "sct"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
-dependencies = [
- "ring",
- "untrusted",
-]
-
 [[package]]
 name = "sdd"
 version = "4.8.6"
@@ -7003,33 +5880,6 @@ dependencies = [
  "saa",
 ]
 
-[[package]]
-name = "sec1"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
-dependencies = [
- "base16ct",
- "der 0.6.1",
- "generic-array",
- "pkcs8 0.9.0",
- "subtle",
- "zeroize",
-]
-
-[[package]]
-name = "security-framework"
-version = "2.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
-dependencies = [
- "bitflags 2.11.1",
- "core-foundation 0.9.4",
- "core-foundation-sys",
- "libc",
- "security-framework-sys",
-]
-
 [[package]]
 name = "security-framework"
 version = "3.7.0"
@@ -7154,15 +6004,6 @@ dependencies = [
  "zmij",
 ]
 
-[[package]]
-name = "serde_nanos"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "serde_path_to_error"
 version = "0.1.20"
@@ -7309,6 +6150,19 @@ dependencies = [
  "lazy_static",
 ]
 
+[[package]]
+name = "shared_memory"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba8593196da75d9dc4f69349682bd4c2099f8cde114257d1ef7ef1b33d1aba54"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "nix 0.23.2",
+ "rand 0.8.6",
+ "win-sys",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -7325,38 +6179,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "signatory"
-version = "0.27.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
-dependencies = [
- "pkcs8 0.10.2",
- "rand_core 0.6.4",
- "signature 2.2.0",
- "zeroize",
-]
-
-[[package]]
-name = "signature"
-version = "1.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c"
-dependencies = [
- "digest 0.10.7",
- "rand_core 0.6.4",
-]
-
-[[package]]
-name = "signature"
-version = "2.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
-dependencies = [
- "digest 0.10.7",
- "rand_core 0.6.4",
-]
-
 [[package]]
 name = "simd-adler32"
 version = "0.3.9"
@@ -7410,16 +6232,6 @@ dependencies = [
  "version_check",
 ]
 
-[[package]]
-name = "socket2"
-version = "0.5.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
-dependencies = [
- "libc",
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "socket2"
 version = "0.6.3"
@@ -7450,32 +6262,6 @@ dependencies = [
  "lock_api",
 ]
 
-[[package]]
-name = "spin"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591"
-
-[[package]]
-name = "spki"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b"
-dependencies = [
- "base64ct",
- "der 0.6.1",
-]
-
-[[package]]
-name = "spki"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
-dependencies = [
- "base64ct",
- "der 0.7.10",
-]
-
 [[package]]
 name = "spm_precompiled"
 version = "0.1.4"
@@ -7758,15 +6544,6 @@ dependencies = [
  "tiktoken-rs 0.7.0",
 ]
 
-[[package]]
-name = "temp-env"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96374855068f47402c3121c6eed88d29cb1de8f3ab27090e273e420bdabcf050"
-dependencies = [
- "parking_lot",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.27.0"
@@ -8045,9 +6822,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.48.0"
+version = "1.52.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
+checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe"
 dependencies = [
  "bytes",
  "libc",
@@ -8055,16 +6832,16 @@ dependencies = [
  "parking_lot",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2 0.6.3",
+ "socket2",
  "tokio-macros",
  "windows-sys 0.61.2",
 ]
 
 [[package]]
 name = "tokio-macros"
-version = "2.6.1"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
+checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -8081,33 +6858,13 @@ dependencies = [
  "tokio",
 ]
 
-[[package]]
-name = "tokio-rayon"
-version = "2.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7cf33a76e0b1dd03b778f83244137bd59887abf25c0e87bc3e7071105f457693"
-dependencies = [
- "rayon",
- "tokio",
-]
-
-[[package]]
-name = "tokio-rustls"
-version = "0.24.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081"
-dependencies = [
- "rustls 0.21.12",
- "tokio",
-]
-
 [[package]]
 name = "tokio-rustls"
 version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
 dependencies = [
- "rustls 0.23.40",
+ "rustls",
  "tokio",
 ]
 
@@ -8134,29 +6891,8 @@ dependencies = [
  "futures-io",
  "futures-sink",
  "futures-util",
- "pin-project-lite",
- "tokio",
-]
-
-[[package]]
-name = "tokio-websockets"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
-dependencies = [
- "base64 0.22.1",
- "bytes",
- "futures-core",
- "futures-sink",
- "http 1.4.0",
- "httparse",
- "rand 0.8.6",
- "ring",
- "rustls-pki-types",
+ "pin-project-lite",
  "tokio",
- "tokio-rustls 0.26.4",
- "tokio-util",
- "webpki-roots 0.26.11",
 ]
 
 [[package]]
@@ -8214,7 +6950,6 @@ dependencies = [
  "serde",
  "serde_spanned 0.6.9",
  "toml_datetime 0.6.11",
- "toml_write",
  "winnow 0.7.15",
 ]
 
@@ -8239,47 +6974,12 @@ dependencies = [
  "winnow 1.0.2",
 ]
 
-[[package]]
-name = "toml_write"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
-
 [[package]]
 name = "toml_writer"
 version = "1.1.1+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
 
-[[package]]
-name = "tonic"
-version = "0.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
-dependencies = [
- "async-trait",
- "axum",
- "base64 0.22.1",
- "bytes",
- "h2 0.4.14",
- "http 1.4.0",
- "http-body 1.0.1",
- "http-body-util",
- "hyper 1.9.0",
- "hyper-timeout",
- "hyper-util",
- "percent-encoding",
- "pin-project",
- "prost 0.13.5",
- "socket2 0.5.10",
- "tokio",
- "tokio-stream",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
 [[package]]
 name = "tonic"
 version = "0.14.5"
@@ -8290,16 +6990,16 @@ dependencies = [
  "axum",
  "base64 0.22.1",
  "bytes",
- "h2 0.4.14",
- "http 1.4.0",
- "http-body 1.0.1",
+ "h2",
+ "http",
+ "http-body",
  "http-body-util",
- "hyper 1.9.0",
+ "hyper",
  "hyper-timeout",
  "hyper-util",
  "percent-encoding",
  "pin-project",
- "socket2 0.6.3",
+ "socket2",
  "sync_wrapper",
  "tokio",
  "tokio-stream",
@@ -8309,20 +7009,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "tonic-build"
-version = "0.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847"
-dependencies = [
- "prettyplease",
- "proc-macro2",
- "prost-build 0.13.5",
- "prost-types 0.13.5",
- "quote",
- "syn 2.0.117",
-]
-
 [[package]]
 name = "tonic-build"
 version = "0.14.5"
@@ -8342,8 +7028,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309"
 dependencies = [
  "bytes",
- "prost 0.14.3",
- "tonic 0.14.5",
+ "prost",
+ "tonic",
 ]
 
 [[package]]
@@ -8354,12 +7040,12 @@ checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a"
 dependencies = [
  "prettyplease",
  "proc-macro2",
- "prost-build 0.14.3",
- "prost-types 0.14.3",
+ "prost-build",
+ "prost-types",
  "quote",
  "syn 2.0.117",
  "tempfile",
- "tonic-build 0.14.5",
+ "tonic-build",
 ]
 
 [[package]]
@@ -8390,8 +7076,8 @@ dependencies = [
  "bitflags 2.11.1",
  "bytes",
  "futures-util",
- "http 1.4.0",
- "http-body 1.0.1",
+ "http",
+ "http-body",
  "iri-string",
  "pin-project-lite",
  "tower",
@@ -8558,16 +7244,6 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
-[[package]]
-name = "tryhard"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
-dependencies = [
- "pin-project-lite",
- "tokio",
-]
-
 [[package]]
 name = "typeid"
 version = "1.0.3"
@@ -8592,15 +7268,6 @@ version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94"
 
-[[package]]
-name = "uncased"
-version = "0.9.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697"
-dependencies = [
- "version_check",
-]
-
 [[package]]
 name = "unicase"
 version = "2.9.0"
@@ -8680,7 +7347,7 @@ dependencies = [
  "flate2",
  "log",
  "once_cell",
- "rustls 0.23.40",
+ "rustls",
  "rustls-pki-types",
  "serde",
  "serde_json",
@@ -8697,12 +7364,12 @@ checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
 dependencies = [
  "base64 0.22.1",
  "cookie_store",
- "der 0.8.0",
+ "der",
  "flate2",
  "log",
  "native-tls",
  "percent-encoding",
- "rustls 0.23.40",
+ "rustls",
  "rustls-pki-types",
  "serde",
  "serde_json",
@@ -8720,7 +7387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
 dependencies = [
  "base64 0.22.1",
- "http 1.4.0",
+ "http",
  "httparse",
  "log",
 ]
@@ -8737,12 +7404,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "urlencoding"
-version = "2.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
-
 [[package]]
 name = "utf16_iter"
 version = "1.0.5"
@@ -8869,226 +7530,6 @@ version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 
-[[package]]
-name = "velo"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e0cc874c11ea3d03afd7adf90529bcf3df374039deff65bc02a26e71bb5814b"
-dependencies = [
- "anyhow",
- "bytes",
- "serde",
- "tokio",
- "tokio-util",
- "velo-common",
- "velo-discovery",
- "velo-events",
- "velo-messenger",
- "velo-observability",
- "velo-queue",
- "velo-rendezvous",
- "velo-streaming",
- "velo-transports",
-]
-
-[[package]]
-name = "velo-common"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc2824d7667c4ee992dfc58c30799518f504d39c91f1e83962d55cb8c44cfb67"
-dependencies = [
- "bytes",
- "rmp-serde",
- "serde",
- "serde_bytes",
- "serde_json",
- "thiserror 2.0.18",
- "uuid",
- "xxhash-rust",
-]
-
-[[package]]
-name = "velo-discovery"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41107947fe3d15972c56815e42fd2485ca4ba58a5a69ec1a6fd61c00966f8734"
-dependencies = [
- "anyhow",
- "async-stream",
- "bytes",
- "fs4",
- "futures",
- "parking_lot",
- "rmp-serde",
- "serde",
- "serde_json",
- "tokio",
- "tokio-util",
- "tracing",
- "uuid",
- "velo-common",
-]
-
-[[package]]
-name = "velo-events"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2504e857f1ed52ad96ce73792a724be880ba75f3579a27050395e81d38cb42d0"
-dependencies = [
- "anyhow",
- "dashmap",
- "futures",
- "parking_lot",
- "serde",
- "tokio",
- "tokio-util",
- "tracing",
- "uuid",
- "xxhash-rust",
-]
-
-[[package]]
-name = "velo-messenger"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ada8ea47ab39a82ef9101024452a3c605bcf5c0acad82c76afcf48a17c0450cf"
-dependencies = [
- "anyhow",
- "bs58",
- "bytes",
- "dashmap",
- "derive-getters",
- "derive_builder",
- "flume 0.12.0",
- "futures",
- "lru 0.12.5",
- "parking_lot",
- "rmp-serde",
- "serde",
- "serde_json",
- "thiserror 2.0.18",
- "tokio",
- "tokio-util",
- "tracing",
- "uuid",
- "velo-common",
- "velo-discovery",
- "velo-events",
- "velo-observability",
- "velo-transports",
-]
-
-[[package]]
-name = "velo-observability"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57813414b19b0f845744fcf158ed4b87afe4f5c59ee99d58872c8e5d5534d61b"
-dependencies = [
- "prometheus",
- "tracing",
-]
-
-[[package]]
-name = "velo-queue"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7bcce33f5710d87e33e1eacc5472300cbe6acdfb5f2cd6929c12f2e4908ef38"
-dependencies = [
- "bytes",
- "dashmap",
- "flume 0.12.0",
- "futures",
- "rmp-serde",
- "serde",
- "thiserror 2.0.18",
- "tokio",
- "tracing",
-]
-
-[[package]]
-name = "velo-rendezvous"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1eac77f329b0c3dba1b36114c969e0a7d680bb4a6436720754bef14a24fedde"
-dependencies = [
- "anyhow",
- "bytes",
- "dashmap",
- "futures",
- "serde",
- "serde_json",
- "tracing",
- "velo-common",
- "velo-messenger",
- "velo-observability",
-]
-
-[[package]]
-name = "velo-streaming"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd6dc0d71f73c6f369302bb94b05341a807a5693f6c321fc208d9c9a974d918a"
-dependencies = [
- "anyhow",
- "bytes",
- "dashmap",
- "derive_builder",
- "flume 0.12.0",
- "futures",
- "rmp-serde",
- "serde",
- "serde_json",
- "socket2 0.6.3",
- "thiserror 2.0.18",
- "tokio",
- "tokio-util",
- "tonic-build 0.13.1",
- "tracing",
- "uuid",
- "velo-common",
- "velo-messenger",
- "velo-observability",
- "velo-transports",
-]
-
-[[package]]
-name = "velo-transports"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb5a348cc2fe13fc560824f170d3bdc3d10ec3a11802bb32489a59f49eb12409"
-dependencies = [
- "anyhow",
- "async-nats",
- "axum",
- "bs58",
- "bytes",
- "dashmap",
- "flume 0.12.0",
- "futures",
- "http 1.4.0",
- "http-body 1.0.1",
- "http-body-util",
- "hyper 1.9.0",
- "hyper-util",
- "nix 0.30.1",
- "parking_lot",
- "prost 0.13.5",
- "rmp-serde",
- "serde",
- "serde_json",
- "socket2 0.6.3",
- "thiserror 2.0.18",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tonic 0.13.1",
- "tonic-build 0.13.1",
- "tower",
- "tracing",
- "velo-common",
- "velo-observability",
-]
-
 [[package]]
 name = "version-compare"
 version = "0.2.1"
@@ -9221,22 +7662,22 @@ dependencies = [
  "asynk-strim-attr",
  "axum",
  "futures",
- "http-body 1.0.1",
+ "http-body",
  "itertools 0.14.0",
  "libc",
  "llm-multimodal",
- "prost 0.14.3",
- "prost-types 0.14.3",
+ "prost",
+ "prost-types",
  "rmpv",
  "serde",
  "serde_json",
  "serde_with",
- "socket2 0.6.3",
+ "socket2",
  "thiserror-ext",
  "tokio",
  "tokio-stream",
  "tokio-util",
- "tonic 0.14.5",
+ "tonic",
  "tonic-prost",
  "tonic-prost-build",
  "tower-http",
@@ -9308,12 +7749,6 @@ dependencies = [
  "winnow 1.0.2",
 ]
 
-[[package]]
-name = "vsimd"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64"
-
 [[package]]
 name = "wait-timeout"
 version = "0.2.1"
@@ -9521,6 +7956,15 @@ version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
 
+[[package]]
+name = "win-sys"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b7b128a98c1cfa201b09eb49ba285887deb3cbe7466a98850eb1adabb452be5"
+dependencies = [
+ "windows",
+]
+
 [[package]]
 name = "win_uds"
 version = "0.2.2"
@@ -9529,7 +7973,7 @@ checksum = "7dd30a1a28a3799479cbf4e17284a220ea9ff6bad098a9d0224543a5d1efe1da"
 dependencies = [
  "async-io",
  "futures-io",
- "socket2 0.6.3",
+ "socket2",
 ]
 
 [[package]]
@@ -9563,6 +8007,19 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45296b64204227616fdbf2614cefa4c236b98ee64dfaaaa435207ed99fe7829f"
+dependencies = [
+ "windows_aarch64_msvc 0.34.0",
+ "windows_i686_gnu 0.34.0",
+ "windows_i686_msvc 0.34.0",
+ "windows_x86_64_gnu 0.34.0",
+ "windows_x86_64_msvc 0.34.0",
+]
+
 [[package]]
 name = "windows-core"
 version = "0.62.2"
@@ -9714,6 +8171,12 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
@@ -9726,6 +8189,12 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
@@ -9750,6 +8219,12 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
@@ -9762,6 +8237,12 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
@@ -9786,6 +8267,12 @@ version = "0.53.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
@@ -9922,12 +8409,6 @@ version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
 
-[[package]]
-name = "xmlparser"
-version = "0.13.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4"
-
 [[package]]
 name = "xxhash-rust"
 version = "0.8.15"
diff --git a/Cargo.toml b/Cargo.toml
index 61485cac..0a72b866 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ members = [
     "pegainfer-qwen3-4b",
     "pegainfer-qwen35-4b",
     "pegainfer-kv-cache",
+    "pegainfer-kv-offload",
     # ---- pegainfer-comm (EP all-to-all) ----
     "pegainfer-comm",
     "pegainfer-comm/crates/pegainfer-comm-build-utils",
@@ -40,12 +41,9 @@ members = [
     "kvbm/dynamo-kv-hashing",
     "kvbm/dynamo-kv-router",
     "kvbm/kvbm-common",
-    "kvbm/kvbm-config",
     "kvbm/kvbm-consolidator",
-    "kvbm/kvbm-engine",
     "kvbm/kvbm-kernels",
     "kvbm/kvbm-logical",
-    "kvbm/kvbm-physical",
 ]
 
 # Inherited by dynamo-ported crates that use `edition.workspace = true` etc.
@@ -66,12 +64,9 @@ dynamo-memory = { path = "kvbm/dynamo-memory" }
 dynamo-kv-hashing = { path = "kvbm/dynamo-kv-hashing" }
 dynamo-kv-router = { path = "kvbm/dynamo-kv-router", features = ["metrics"] }
 kvbm-common = { path = "kvbm/kvbm-common" }
-kvbm-config = { path = "kvbm/kvbm-config" }
 kvbm-consolidator = { path = "kvbm/kvbm-consolidator" }
-kvbm-engine = { path = "kvbm/kvbm-engine" }
 kvbm-kernels = { path = "kvbm/kvbm-kernels" }
 kvbm-logical = { path = "kvbm/kvbm-logical" }
-kvbm-physical = { path = "kvbm/kvbm-physical" }
 # ---- third-party ----
 anyhow = "1.0"
 async-nats = { version = "0.45.0", features = ["service"] }
@@ -96,6 +91,11 @@ cudarc = { version = "0.19.7", features = [
     "cublas",
     "f16",
     "nccl",
+    # nvrtc: embedded pegaflow-core's transfer/kernel.rs references the nvrtc
+    # bindings unconditionally (its KernelBackend JIT-compiles the copy kernel).
+    # Lazy per-symbol loading (0.19.5+) keeps this off the runtime driver floor,
+    # so it stays compatible with the cuda-12090 binding level (issue #263).
+    "nvrtc",
 ] }
 cxx = "1.0.187"
 cxx-build = "1.0.187"
@@ -129,6 +129,7 @@ parking_lot = "0.12.5"
 pegainfer-bench = { path = "pegainfer-bench" }
 pegainfer-core = { path = "pegainfer-core" }
 pegainfer-kv-cache = { path = "pegainfer-kv-cache" }
+pegainfer-kv-offload = { path = "pegainfer-kv-offload" }
 pegainfer-cupti = { path = "pegainfer-cupti" }
 pegainfer-deepseek-v4 = { path = "pegainfer-deepseek-v4" }
 pegainfer-engine = { path = "pegainfer-engine" }
diff --git a/docs/index.md b/docs/index.md
index b2fafd74..e07d6b96 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -92,6 +92,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l
 | --- | --- |
 | `subsystems/runtime/runtime.md` | Runtime complexity is controlled by a shared `pegainfer-core` that owns the generation contract and orchestration; per-model crates implement `ModelForward` so prefill/decode and hybrid attention stay hidden from the caller. State (`&mut`) is separated from weights (`&self`) for future bs > 1. |
 | `subsystems/runtime/kv-cache-design.md` | Dynamo 式 logical/physical 分层 KV cache：BlockManager 管 block 生命周期和 admission，PhysicalBackend trait 管 GPU 内存和布局（FullAttention / MLA）。支持 TP / DP。基于 vLLM/Dynamo/pegaflow 调研。 |
+| `subsystems/runtime/pegaflow-offload-integration.md` | 把 `pegaflow-core` 当进程内 Rust 库做 KV 卸载物理后端（HBM→DRAM/SSD/RDMA），补 kvbm 没写的卸载层。**Qwen3-4B full-attn 首发，端到端已在真实 GPU 跑通并验证**（async SAVE+LOAD 接进 executor/scheduler，纯 CPU-hit 与 GPU+CPU 组合 hit 恢复后 logits 与冷算一致）。pegaflow 经 git rev pin（#331+#333）。默认关，未接 server CLI。linear 排除，sparse 暂缓。 |
 
 ## subsystems / scheduler
 
diff --git a/docs/subsystems/runtime/pegaflow-offload-integration.md b/docs/subsystems/runtime/pegaflow-offload-integration.md
new file mode 100644
index 00000000..2af55b94
--- /dev/null
+++ b/docs/subsystems/runtime/pegaflow-offload-integration.md
@@ -0,0 +1,157 @@
+# pegaflow KV 卸载接入 Spec
+
+> **TL;DR**: 把 `pegaflow-core` 当**进程内 Rust 库**做 KV 卸载的物理后端（HBM→DRAM/SSD/RDMA），补上 kvbm 留着没写的卸载层。connector 大脑（决定 load/save 哪些 block）用 kvbm logical/physical 分层思想自建，pegaflow 退为语义无关的 raw block transfer 后端。**路线已调整为 Qwen3-4B full-attn 首发**（原计划 Kimi 首发）：page-first 单 buffer 经 pegaflow `block_stride_bytes`（PR #331）适配。**端到端已在真实 GPU 上跑通并验证**：async SAVE + async LOAD 接进 `Qwen3Executor` + scheduler，`tests/kv_offload_cpu_hit.rs` 覆盖纯 CPU-hit 与 GPU+CPU 组合 hit，恢复后 logits 与冷算一致；连接层 `OffloadEngine` + `tests/cpu_roundtrip.rs` 字节级一致。默认关（builder flag opt-in），未接 server CLI。**Qwen3.5 linear/SSM state 明确排除**；**DeepSeek sparse 暂缓**。
+>
+> Last touched: 2026-06
+
+## 0. 实现状态（2026-06）
+
+已落地并验证：
+
+- **pegaflow `block_stride_bytes`**（PR #331 → novitalabs/pegaflow，`feat/inproc-load` 基于其上）：解耦"块间步长"与"每块拷贝大小"，让 page-first fused buffer 能注册。**已合入 master**。
+- **pegaflow 进程内 load API**（PR #333，**已合入**，squash 进 #331 的 `07cac7e`）：`LoadCompletion::{Shm,Channel}` + `batch_load_kv_blocks_multi_layer_inproc` → `oneshot::Receiver`，去掉 in-process 调用方对 shm `LoadState` 的依赖（Rust 进程内不需要），非阻塞 poll。
+- **`pegainfer-kv-offload::OffloadEngine`**：拥有 `PegaEngine` + 内嵌 tokio runtime；`Registration::from_buffer` 把 fused page-first buffer 映射成 per-layer 注册（**单段 `[K|V]`**：fused layout 里 K/V 本就连续 = `layer_stride` 一段，`block_stride = page_stride`，`segments=1`——不是 K/V split，那条路需要 `kv_stride > bytes_per_block`，此处不成立）。`save`（async fire-and-forget）/`save_blocking`（eviction handoff，同步捕获）/`query`（GPU+CPU hit）/`load`（oneshot）/`flush_saves`/`evict_all`。
+- **`KvBuffer::device_ptr`**（kv-cache）：注册用的稳定基址。
+- **kvbm↔bytes 桥**（kv-cache `RequestKv`）：`prompt_block_hashes` / `assigned_block_hashes` / `prefix_matched_blocks`，`SequenceHash::as_u128()` → 16B content key。
+- **`tests/cpu_roundtrip.rs`**：真实 `KvBuffer` 上写已知 pattern → save → query → load 到**另一组** block → 字节级比对 + 零块负向控制。**通过**。
+- **live 接线（§9，已落地）**：`Qwen3Executor` 持 `Option<OffloadEngine>`（`Qwen3OffloadOptions` opt-in，默认关）；SAVE hook（`save_sealed_blocks`，async fire-and-forget）+ 非阻塞 prefetch admission（`begin_kv_prefetch`/`drain_ready_prefetch`/`wait_ready_prefetch`，scheduler `loading` 态）。`tests/kv_offload_cpu_hit.rs` 单个测试顺序跑两幕——纯 CPU restore（`gpu_hit==0`）与 GPU+CPU 组合 hit（G=3+C=3 拼成一段连续前缀）——恢复后 first-token logits 与冷算一致（mean Δ≈0.03 nat，bf16 floor）。
+- **三处正确性加固**（toxic-review 后）：① query lease 在 `reserve_loaded_blocks` 失败 / `load` 提交失败时显式 `release_query_lease`，不再泄漏到 600s TTL；② admission 拒绝（context/KV budget/未知 LoRA）时 `drop_request` 释放已 settle 的 prefetch 状态，不再泄漏已 commit 的 block；③ async SAVE 把被保存 block 的 `ImmutableBlock` 强引用（`KvBlockGuard`）随 spawn 持到 D2H 落地才 drop——封死"请求结束→slot 重分配→D2H 抓到错 KV 写进旧 hash"的静默腐蚀窗口。
+
+未接 server CLI（仅经 `start_engine_with_offload` / 测试入口）。**依赖已从 fork 摘除**：PR #331+#333 均合入上游 master（squash 进 `07cac7e`），`third_party/pegaflow` 已删，`pegaflow-core` 改为 pin 到该 rev 的 **git 依赖**（见 §5.2），GPU 测试在 git-dep 下行为不变（delta 一致）。
+
+相关：[kv-cache-design.md](kv-cache-design.md)（logical/physical 分层，已把 pegaflow 列为设计调研）· [qwen3-kvbm-integration-spec.md](qwen3-kvbm-integration-spec.md)（kvbm-logical 已接入）· `models/kimi-k2/kv-cache-design.md`（Kimi 已用 `BlockPool`）· `models/qwen3/prefix-cache.md`（HBM 内前缀复用已落地）。
+
+---
+
+## 1. 定位：pegaflow 是 raw 后端，connector 大脑要自建
+
+pegaflow（novita，Apache-2.0；早期 vendored 于 `third_party/pegaflow`，现已删除并改为 pin rev 的 git 依赖，见 §0）原本是 **vLLM 的 KV connector 服务端**：KV 的编排逻辑（何时 save、query 几个 block、prefix 匹配、与 scheduler 的 admission/preemption 交互）全在 vLLM 的 Python connector 那一侧，`pegaflow-core` 只是底下干 D2H/H2D + 分层存储的**肌肉**。
+
+pegainfer 不是 vLLM，那套 Python connector 一行用不上。接入要做的是**用 Rust 自建那颗 connector 大脑**——而 kvbm 的 logical/physical 分层正是它的骨架：
+
+```
+per-model scheduler   ← 策略：哪些 block 该 resident（full 前缀 / MLA 全前缀 / 未来稀疏选择）
+  ↓ 产出 load/save 意图（一组 block）
+connector（kvbm logical/physical 思想）← 机制：block identity、状态机、GPU slot 编排、transfer 调度
+  ↓ 语义无关的 raw transfer
+pegaflow-core         ← 机制底座：D2H/H2D、DRAM/SSD/RDMA 分层
+```
+
+## 2. 战略决策：pegaflow 取代 kvbm 死代码做物理 tier
+
+pegainfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是分层卸载，但**至今零接线、是死代码**（无任何非 kvbm crate 依赖）。同时养两套分层卸载违反项目复杂度红线。本 spec 采纳：**`kvbm-logical`（逻辑层 + 前缀匹配）保留，pegaflow-core 顶替它下面缺失的物理卸载层，砍掉 `kvbm-physical`/`kvbm-engine`**。理由：pegaflow 同组维护、已上 PyPI、有 H800 benchmark、库化干净；kvbm 那两层是纯负债。
+
+## 3. 三模型三 KV 形态 → connector 边界（实据）
+
+| 模型 | KV 形态 | active set | 跨请求复用 | 卸载结论 |
+| --- | --- | --- | --- | --- |
+| **Kimi-K2 MLA** | paged，per-layer ckv/kpe arena，后端是 `BlockPool`；latent 68.6 KiB/token，无 per-head | 无（dense 全前缀） | 有（HBM 内 prefix cache 已落地） | **首发**：接入面最干净，layout 直接适配 pegaflow registration |
+| **Qwen3 / Qwen3.5 full-attn** | paged，page-first 单 buffer，`PagePool` | 无（dense 全前缀） | 有（前缀缓存已落地） | **次发**：page-first 与 pegaflow `stride==copy-size` ABI 冲突，需加 `block_stride`（见 §7.R1） |
+| **Qwen3.5 linear（24 层）** | per-request `RecurrentState` [32,128,128] f32 2 MiB/层，非 paged、独立分配 | 无（每步读写整个 matrix） | **零**（this-request 有损摘要，非 content-addressable） | **排除**：offload 无 prefix/dedup 收益；省显存是 per-request swap-out，另一套机制 |
+| **DeepSeek-V4 sparse** | per-request per-layer dense arena [window\|compressed]，非 paged；compressor 4:1 | **显式**：`topk_idxs` = window 行 + indexer 选中 compressed 行，token/row 粒度，每步重选 | 部分 | **暂缓**：compressor 已控 footprint；indexer 信号现成但 token 粒度 ≠ block 粒度（见 §7） |
+
+**边界结论**：connector 只收 **block-structured、content-addressable** 的 KV（MLA latent / full-attn paged）。recurrent/SSM state 不进 connector。稀疏的 active-set gather 是独立的、未来的课题。
+
+证据：Kimi `pegainfer-kimi-k2/src/runner/{worker.rs:612-619, cache.rs:63-80, mla.rs:38-48}`、`scheduler.rs:16,27,146,180`、`pool.rs:123`；Qwen3.5 linear `pegainfer-qwen35-4b/src/...recurrent.rs`、`batch_decode_graph.rs:82-86`；DeepSeek `pegainfer-deepseek-v4/src/...state.rs:220, indexer.rs:609-670`、`csrc/.../deepseek_indexer.cu:470-527`。
+
+## 4. 路线
+
+1. **Kimi MLA 首发** —— pegaflow 做 `BlockPool` 下的 host/SSD tier；block evict 时 demote 到 host，前缀 query 命中时从 host restore。带宽便宜（latent 小），layout 零阻抗。
+2. **Qwen full-attn 次发** —— 先给 pegaflow 加 `block_stride_bytes`（R1），再接 page-first buffer。
+3. **linear 排除、sparse 暂缓**。（注：以上 1–2 是原计划顺序；实际落地已调整为 Qwen3-4B full-attn 首发，见 TL;DR 与 §0。）
+
+## 5. 可行性（对抗验证结论，附证据）
+
+四条承重假设由 10-agent workflow 对抗验证：
+
+1. **✅ 进程内注册裸指针，无 IPC、无第二进程**：`register_context_layer_batch(data_ptrs: &[u64])`（`pegaflow-core/src/lib.rs:242-259`）收裸设备地址，拷贝路径直接喂给 driver API `cuMemcpyDtoHAsync_v2`（`transfer/memcpy.rs:82-89`）；IPC 只在 server/Python 层，core 零 IPC 调用点。cudarc 附设备 **primary context**（与 pegainfer 同一），自建 worker stream。
+2. **✅ 依赖无致命冲突**：cudarc 单 major（0.19.3↔0.19.7 统一），cuda-12080/12090 共存（build.rs 取高版本），tokio/tonic/prost 兼容。**依赖行**（git rev pin 到上游 master `07cac7e`，含 #331+#333；`default-features=false` 砍掉 pegaflow 自带的 `cuda-12`/`rdma`，靠 workspace cudarc 提供的 `cuda-12090`+`nvrtc` 满足——pegaflow-core 无 `cfg(cuda-12)` gate）：
+   ```toml
+   pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false }
+   ```
+   下次再改 pegaflow：临时换回 path dep 共同开发 → 提 PR → 合入后 re-pin rev。
+   **为何 `cuda-12` 而非 `cuda-13`**（本机明明是 CUDA 13.3 toolkit / 13.0 driver）：pegainfer 有意锁 `cudarc/cuda-12090`（`Cargo.toml:92-93`，issue #263——配 cudarc 0.19.5+ 的 per-symbol lazy loading，压低 binding level 以**不抬高 runtime driver floor**、保宽部署兼容；故意不用 `cuda-version-from-build-system` 自动，否则 driver floor 会跟着构建机 toolkit 走）。cudarc 在 workspace 是**单实例、feature 取并集后选最高版本**：pegaflow 用 `cuda-12` 并集后仍是 12090、不抬 floor；用 `cuda-13`（→ `cudarc/cuda-13000`）会把**整个 workspace 含 pegainfer 自己**顶到 13000、driver floor 抬到 CUDA 13，撞翻 #263。整体迁 cu13 是独立决策（须同时改 pegainfer 的 cudarc + revisit #263），本期不做。
+3. **⚠️ Layout**：block-hash 键直接适配（`u64→Vec<u8>`）；page-first layout **不适配**（见 §7.R1）；Kimi per-layer 布局**天然适配**。
+4. **✅ 流同步**：host-side 粗同步可解——save 前 pegainfer 必须 `synchronize()` compute stream（pegaflow 私有 stream 只自同步，`gpu_worker.rs:520-528`），restore 前自旋 poll `LoadState`。代价：损 compute/offload 重叠（见 §7.R3）。
+
+## 6. connector 接口（dense-first，稀疏留门不展开）
+
+两层分离，稀疏复杂性全关在 policy 侧：
+
+```rust
+// mechanism —— pegaflow backend，永不懂稀疏/前缀
+trait KvOffloadBackend {
+    fn load(&self, items: &[(BlockHash, GpuSlot)]) -> LoadHandle; // 任意集合，phase 无关
+    fn offload(&self, items: &[(GpuSlot, BlockHash, OffloadHint)]);
+    fn poll(&self, h: LoadHandle) -> LoadState;
+}
+enum OffloadHint { ReusableAcrossRequests, TransientDiscard }
+
+// policy —— per-model scheduler，懂自己的拓扑
+trait KvResidencyPolicy {
+    fn required_blocks(&self, req: &RequestCtx, phase: Phase) -> SmallVec<BlockId>;
+    fn save_hint(&self, block: BlockId) -> OffloadHint;
+}
+```
+
+**现在做对、未来免费受益的三个决策**（即便 dense-first 也按这个写，成本为零）：
+- 接口说 **block 集合**不说 prefix-count（full attention 产出的集合恰好连续 = 退化特例）；
+- admission 按 **active working set ≤ HBM** 写（dense 下 active=total，退化）；
+- `load` **phase-agnostic**（不绑 prefill，未来 decode gather 是"启用"不是"重设计"）。
+
+第一版：`required_blocks` 对 Kimi/Qwen 就是"全前缀"，`OffloadHint` 全 `ReusableAcrossRequests`，只走 prefill-前 + evict 路径。
+
+## 7. 风险
+
+| # | 风险 | 等级 | 处置 |
+| --- | --- | --- | --- |
+| R1 | Qwen page-first vs pegaflow `stride==copy-size` ABI 不兼容 | major | 给 `KVCacheRegistration` 加 `block_stride_bytes`（改 pegaflow ~几十行，`instance.rs` + `transfer/mod.rs`）；**Kimi 首发绕开此风险** |
+| R2 | save 前漏 `synchronize()` → 静默 D2H 半写 KV，pegaflow 不校验 | major | bridge 层把 synchronize 设成不可绕过 + debug 断言 |
+| R3 | host-side 粗同步损 compute/offload 重叠 | minor | 第一版接受；后续给 pegaflow 加 device-side event-injection |
+| R4 | 依赖误配（裸 default-features=false / 漏 cuda-12） | minor | §5.2 依赖行已定，CI 编译验证 |
+| R5 | 稀疏 active-set offload 的 token-vs-block 粒度落差 | 已知开放 | 见下，不在本期 |
+
+**稀疏（已知开放问题，不在本期）**：连 dynamo KVBM 都没解 sparse attention offloading——它的复用是 radix 前缀、offload 是 frequency/LRU、tier 是整请求异步流动，对 SWA 只在 router 透传 `kv_cache_spec_sliding_window` 做 window-aware 前缀，对 topk 零处理。没有现成抽象可继承。pegainfer 侧 DeepSeek 的 indexer 已产出显式可拦截的 active-set 信号，但 token/row 粒度 ≠ block 粒度，且 compressor 已控 footprint 当前不需 offload。机制层（内容寻址 + 可插拔 policy + 语义无关 transfer）本就不堵稀疏，真正缺的 decode-loop gather 大脑到时候结合具体模型新写更准。
+
+## 8. 下一步：Kimi MLA 最小 spike
+
+**目标**：进程内跑通一个 page 的 register→save→evict→load，证伪"无先例"风险 + 量带宽。
+
+1. 新 bridge crate，path-dep pegaflow-core（§5.2 依赖行），`cargo build` 验依赖。
+2. Kimi：`new_with_config` → `register_context_layer_batch`（per-layer ckv/kpe，segments=2，per-layer 布局天然适配）。
+3. 一个 page：`synchronize` → `save` →（手动 evict）→ `query` 命中 → `load` 回 GPU → 比对 bytes 一致。
+4. 量 host↔HBM 带宽 + save 前 synchronize 的 host stall（确认 R3 可接受）。
+5. 通过后再决定给 pegaflow 加 `block_stride` 上 Qwen page-first（R1）。
+
+**阻塞**：等 §2 战略决策最终拍板（pegaflow 取代 kvbm 卸载层 = 是）。
+
+## 9. live 接线设计（Qwen3-4B，**已落地**）
+
+> 状态：已实现并在真实 GPU 上验证（§0）。下文是设计与实现一致的记录；落地时相对原设计的偏差与加固见末尾「实现注记」。
+
+连接层已就绪（§0），把它接进 `Qwen3Executor` + `scheduler.rs` 的真实推理路径。`Qwen3Executor` 持 `kv_mgr`（`BlockPool`+`KvBuffer`）与 `request_kvs`；在构造（`from_runtime`/`single`，model 移入 RankWorker 之前，此时 `KvBuffer` + `device_ctx().stream` 都在手）建一个 `Option<OffloadEngine>`，opt-in（builder flag，**不加 env**），默认关，保现有路径不动。
+
+**SAVE（async，best-effort）**：`apply_prefill`/`apply_decode` 封块后（此刻 compute stream 已随 `run_step` 同步 → 满足 §5.4 的跨 stream ordering 约束），取 `rkv.assigned_block_hashes()`，按 per-request `saved_cursor`（初值 = `prefix_matched_blocks()`，GPU-hit 前缀已 resident，跳过）保存新封的 `(page_id, hash)`，`offload.save(...)` fire-and-forget，推进 cursor。
+
+**LOAD（async，GPU+CPU hit，非阻塞 admission）**：admission 把 `match_and_add_prefix` 拆成"建 RequestKv → 算 GPU hit G → query CPU [G..F] → 异步 load → LoadingKv 轮询"：
+
+1. `rkv = pool.new_request(...)`；`hashes = rkv.prompt_block_hashes()`（F 块）。
+2. `manager.match_blocks(&seq_hashes)` 数出 GPU 命中前缀 G（**持其 `ImmutableBlock` 不 drop**，防 load 期间被 evict）。
+3. `offload.query(req_id, &hashes[G..F])` → CPU 命中 C（连续）+ lease。
+4. `manager.allocate_blocks(C)` 拿 C 个 `MutableBlock`（DMA 落点），取 `block_id()` 列表；`offload.load(lease, page_ids)` → `LoadHandle`。请求进 `LoadingKv{rkv, handle, muts, hashes[G..G+C], gpu_imms}` holding 态，**不 prefill**。
+5. 每 tick `handle.poll()`：`Ready` → 对每个 `mut` `.stage(hash, bs)` + `manager.register_block(..)` 注册进 registry（用的就是 `BlockPool::new` 给 padding 块用的同一套公开 API，**无需改 kvbm**）；随后 `rkv.match_and_add_prefix()` 自然命中 G+C 连续前缀，`kv_position=(G+C)*bs`；drop holding 的 imms（sequence 自持）。请求转入正常 prefill（suffix = 剩余 token）。
+6. `C==0` → 直接 prefill（纯 GPU hit，与今日行为一致）。
+
+**为何 register→rematch 而非直接注入 sequence**：复用现成的 `match_and_add_prefix`（GPU+CPU 在它眼里就是一段连续前缀），零 kvbm 改动；register 与 rematch 同 tick、且 holding 了 G 的 imms，eviction 窗口为零。最坏（真被 evict）只是少命中、退化为多 prefill，不损正确性。
+
+**scheduler 状态机**：`scheduler_loop` 新增 `loading: Vec<PendingRequest>`，每 tick `reclaim_ready_prefetch`（settle 完的回 `deferred` 队首）+ `offer_prefetch`（未 offer 的 deferred 试 prefetch，起 load 的移入 `loading`）；空闲且有 `loading` 时 `block_on_loading` 阻塞等一个 DMA。`OffloadEngine` 的 `block_on`（query/flush）只在 scheduler 这个**纯 OS 线程**调用，`debug_assert` 护住误用。
+
+风险：preemption/release 时须 drop holding 的 mutable/immutable（RAII 已覆盖）；admission KV 预算要把 loading 占用的 C 块计入 in-flight。
+
+**实现注记（相对原设计的偏差 + toxic-review 加固）**：
+
+- **prefetch 状态落在 executor 而非 scheduler**：`PrefixProbe`（持 G 的 imms + commit 后的 C 块）、`LoadReservation`（C 个 MutableBlock DMA 落点）、`LoadHandle` 都封进 `Qwen3Executor.prefetch: HashMap<RequestId, PrefetchState>`，scheduler 只跟 `RequestId`。commit 在 `seq_hashes[gpu_hit + i]`（GPU+CPU 偏移对齐，组合 hit 测试守这条）。
+- **lease 泄漏修复**：`query` 创建的 pegaflow lease 在 `reserve_loaded_blocks` 失败 / `load` 提交失败时 `OffloadEngine::release_query_lease` 显式释放（`QueryLeaseId` 是 `Copy` 裸 token、无 Drop，丢掉只会挂到 600s TTL）。
+- **拒绝清理**：admission（context/KV budget）与未知 LoRA 拒绝路径补 `drop_request`——否则一个已 settle prefetch 的请求被拒后，commit 的 block + map entry 永久泄漏。
+- **SAVE 防 slot 复用腐蚀**：async `save()` 把被存 block 的 `ImmutableBlock` 强引用（`KvBlockGuard`，与 `block_ids` 1:1）随 spawn 持到 pegaflow D2H 落地才 drop。否则短请求结束 → slot 回收重分配 → 新请求覆写 → 在途 D2H 抓到新 KV 写进旧 hash = 静默腐蚀。guard 在 offload 线程并发 drop 是安全的（kvbm `BlockStore` 单 Mutex、有并发 drop race 处理）；`flush_saves` await 各 save 任务后 guard 才落，故 evict 前先 flush 仍能把 block 排空。
+- **测试**：`tests/kv_offload_cpu_hit.rs` 合一个顺序 `#[test]`（避免两 executor 撞同一 device + pegaflow instance_id），先纯 CPU 后组合 hit。
diff --git a/kvbm/kvbm-logical/src/manager/mod.rs b/kvbm/kvbm-logical/src/manager/mod.rs
index 248480e4..699d3c07 100644
--- a/kvbm/kvbm-logical/src/manager/mod.rs
+++ b/kvbm/kvbm-logical/src/manager/mod.rs
@@ -89,6 +89,16 @@ impl<T: BlockMetadata + Sync> BlockManager<T> {
         Ok(())
     }
 
+    /// Evict every cached-but-unused block: drain the inactive pool back to the
+    /// reset pool. Active blocks (held by a request or an external strong ref,
+    /// e.g. a leaked padding reservation) are untouched. Unlike
+    /// [`reset_inactive_pool`](Self::reset_inactive_pool) this makes no
+    /// assertion about the resulting free count, so it is safe to call on a
+    /// pool that still has pinned blocks — a cold-cache flush, not a reset.
+    pub fn evict_inactive(&self) {
+        drop(self.store.drain_inactive_to_mutable());
+    }
+
     /// Register a batch of completed blocks.
     pub fn register_blocks(&self, blocks: Vec<CompleteBlock<T>>) -> Vec<ImmutableBlock<T>> {
         blocks
diff --git a/pegainfer-kv-cache/src/buffer.rs b/pegainfer-kv-cache/src/buffer.rs
index 52dad3b3..381ad711 100644
--- a/pegainfer-kv-cache/src/buffer.rs
+++ b/pegainfer-kv-cache/src/buffer.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use cudarc::driver::{CudaSlice, CudaStream};
+use cudarc::driver::{CudaSlice, CudaStream, DevicePtr};
 use half::bf16;
 
 use crate::KvLayout;
@@ -51,6 +51,18 @@ impl KvBuffer {
         &self.inner.buffer
     }
 
+    /// Base device address of the fused KV buffer.
+    ///
+    /// Stable for the buffer's lifetime — cudarc allocations don't move — so
+    /// the KV-offload connector registers this once with pegaflow and the
+    /// page-first [`KvLayout`] strides reach every (layer, block, K/V) segment
+    /// from it. The returned address outlives the transient stream-ordering
+    /// guard precisely because the `Arc<Inner>` keeps the slice alive.
+    pub fn device_ptr(&self, stream: &CudaStream) -> u64 {
+        let (ptr, _guard) = self.inner.buffer.device_ptr(stream);
+        ptr
+    }
+
     pub fn num_blocks(&self) -> usize {
         self.inner.num_blocks
     }
diff --git a/pegainfer-kv-cache/src/lib.rs b/pegainfer-kv-cache/src/lib.rs
index 26031b76..a4476644 100644
--- a/pegainfer-kv-cache/src/lib.rs
+++ b/pegainfer-kv-cache/src/lib.rs
@@ -7,7 +7,7 @@ mod view;
 pub use buffer::KvBuffer;
 pub use layout::KvLayout;
 pub use manager::KvCacheManager;
-pub use pool::{BlockPool, RequestKv};
+pub use pool::{BlockPool, KvBlockGuard, LoadReservation, PrefixProbe, RequestKv};
 pub use view::{KvView, KvViewDesc};
 
 pub use kvbm_logical;
diff --git a/pegainfer-kv-cache/src/pool.rs b/pegainfer-kv-cache/src/pool.rs
index bead1c74..dc66fdf6 100644
--- a/pegainfer-kv-cache/src/pool.rs
+++ b/pegainfer-kv-cache/src/pool.rs
@@ -1,4 +1,5 @@
 use kvbm_logical::SequenceHash;
+use kvbm_logical::blocks::{ImmutableBlock, MutableBlock};
 use kvbm_logical::integrations::{DecodeOutcome, SchedulableSequence, ScheduleError};
 use kvbm_logical::manager::BlockManager;
 use kvbm_logical::pools::BlockDuplicationPolicy;
@@ -78,6 +79,14 @@ impl BlockPool {
         self.block_manager.total_blocks().saturating_sub(1)
     }
 
+    /// Evict every cached-but-unused block from the GPU prefix cache (drain the
+    /// inactive pool). In-use blocks are untouched. A cold-cache flush — and,
+    /// for the offload path, the way to force a prefix out of HBM so a
+    /// subsequent request must restore it from the CPU tier.
+    pub fn evict_inactive(&self) {
+        self.block_manager.evict_inactive();
+    }
+
     /// `lora_name` scopes the prefix cache: blocks registered under one
     /// adapter (or the base model, `None`) never match a request running
     /// under a different adapter — the name is folded into the block-hash
@@ -100,6 +109,139 @@ impl BlockPool {
         );
         RequestKv { seq }
     }
+
+    // ── KV-offload prefetch (CPU-tier load before prefill) ─────────────
+
+    /// Resolve `prompt_tokens` against the GPU prefix cache *without* creating
+    /// a request, returning a [`PrefixProbe`] that holds the GPU-hit prefix
+    /// blocks alive so an async CPU-tier load can extend it. The connector
+    /// queries the probe's [`PrefixProbe::cpu_query_hashes`] against the host
+    /// tier, then [`reserve_loaded_blocks`](Self::reserve_loaded_blocks) +
+    /// load + [`commit_loaded_blocks`](Self::commit_loaded_blocks).
+    ///
+    /// `lora_name` must match the request's adapter — it salts the block
+    /// hashes, so probing under the wrong adapter would query unrelated keys.
+    pub fn probe_prefix(&self, prompt_tokens: Vec<u32>, lora_name: Option<&str>) -> PrefixProbe {
+        let num_input = prompt_tokens.len();
+        let rkv = self.new_request(prompt_tokens, 0, lora_name);
+        let seq_hashes = rkv.seq.inner().sequence().all_sequence_hashes();
+        // match_and_add_prefix leaves >=1 prompt token uncached, so a request
+        // can reuse at most this many leading blocks — the CPU load must not
+        // exceed it, or the trailing loaded block would never be re-matched.
+        let cacheable = num_input.saturating_sub(1) / self.block_size;
+        let gpu_guard = self.block_manager.match_blocks(&seq_hashes);
+        let gpu_hit = gpu_guard.len();
+        PrefixProbe {
+            seq_hashes,
+            gpu_hit,
+            cacheable,
+            held: gpu_guard,
+        }
+    }
+
+    /// Reserve `count` mutable blocks as the GPU destinations for a CPU→GPU
+    /// load. Returns `None` under block pressure (the caller then skips the
+    /// prefetch and prefills from scratch). The reservation's
+    /// [`LoadReservation::page_ids`] feed the connector's load; on completion
+    /// hand it to [`commit_loaded_blocks`](Self::commit_loaded_blocks).
+    pub fn reserve_loaded_blocks(&self, count: usize) -> Option<LoadReservation> {
+        let blocks = self.block_manager.allocate_blocks(count)?;
+        Some(LoadReservation { blocks })
+    }
+
+    /// Stage + register the freshly-loaded blocks under the probe's
+    /// continuation hashes (`seq_hashes[gpu_hit .. gpu_hit + reserved]`) and
+    /// fold them into the probe's held set, so a following
+    /// `new_request().match_and_add_prefix()` reuses the full GPU+CPU prefix.
+    ///
+    /// The probe keeps holding every registered block until the request
+    /// prefills, closing the eviction window between registration and re-match.
+    pub fn commit_loaded_blocks(&self, probe: &mut PrefixProbe, reservation: LoadReservation) {
+        let start = probe.gpu_hit;
+        for (i, block) in reservation.blocks.into_iter().enumerate() {
+            let hash = probe.seq_hashes[start + i];
+            let complete = block
+                .stage(hash, self.block_size)
+                .expect("loaded block stage: block_size invariant violated");
+            probe.held.push(self.block_manager.register_block(complete));
+        }
+    }
+}
+
+/// A prompt's prefix resolved against the GPU cache, ready to drive a CPU-tier
+/// prefetch. Holds every GPU-hit (and, after commit, CPU-loaded) block so they
+/// can't be evicted while the load is in flight and before the request prefills.
+pub struct PrefixProbe {
+    /// Content hashes of every complete prompt block, in order (native form).
+    seq_hashes: Vec<SequenceHash>,
+    /// Length of the contiguous GPU-resident prefix.
+    gpu_hit: usize,
+    /// Reuse cap: blocks past this are never matched (the final chunk forwards).
+    cacheable: usize,
+    /// Strong refs keeping matched/loaded blocks resident until prefill.
+    held: Vec<ImmutableBlock<()>>,
+}
+
+impl PrefixProbe {
+    /// Blocks already resident in GPU HBM (the existing prefix-cache hit).
+    pub fn gpu_hit_blocks(&self) -> usize {
+        self.gpu_hit
+    }
+
+    /// Total blocks this probe holds: the GPU-hit prefix plus any committed from
+    /// a CPU-tier load. They are already out of the free pool and become the
+    /// request's cached prefix at prefill, so admission credits them against the
+    /// request's block need (avoiding a double-count against `available_blocks`).
+    pub fn held_blocks(&self) -> usize {
+        self.held.len()
+    }
+
+    /// Content hashes to query the CPU tier with: the blocks past the GPU hit,
+    /// capped at the reuse boundary. Empty when the GPU hit already covers
+    /// every reusable block (nothing to load — prefill normally).
+    pub fn cpu_query_hashes(&self) -> Vec<Vec<u8>> {
+        if self.gpu_hit >= self.cacheable {
+            return Vec::new();
+        }
+        self.seq_hashes[self.gpu_hit..self.cacheable]
+            .iter()
+            .map(|h| sequence_hash_bytes(h).to_vec())
+            .collect()
+    }
+}
+
+/// An opaque strong pin on one registered KV block. While held it keeps the
+/// block in the active pool (out of the free/inactive pools), so the physical
+/// slot cannot be reallocated. Used to hold a block stable across an in-flight
+/// async offload save; cheap to clone/drop (one `Arc` bump). See
+/// [`RequestKv::assigned_block_guards`].
+///
+/// The inner guard is never read — it exists purely for its `Drop`, which
+/// releases the pin. Holding the value *is* the contract.
+pub struct KvBlockGuard(#[allow(dead_code)] ImmutableBlock<()>);
+
+/// GPU destination blocks reserved for a CPU→GPU load, consumed by
+/// [`BlockPool::commit_loaded_blocks`] once the DMA lands.
+pub struct LoadReservation {
+    blocks: Vec<MutableBlock<()>>,
+}
+
+impl LoadReservation {
+    /// Physical page ids the connector loads the leased CPU blocks into, in
+    /// lease order (the i-th leased block lands in `page_ids()[i]`).
+    pub fn page_ids(&self) -> Vec<i32> {
+        self.blocks.iter().map(|b| b.block_id() as i32).collect()
+    }
+
+    /// Number of reserved destination blocks.
+    pub fn len(&self) -> usize {
+        self.blocks.len()
+    }
+
+    /// True when no destinations were reserved.
+    pub fn is_empty(&self) -> bool {
+        self.blocks.is_empty()
+    }
 }
 
 /// Per-request KV state wrapping `SchedulableSequence`.
@@ -229,12 +371,105 @@ impl RequestKv {
         pages.truncate(kv_tokens.div_ceil(self.seq.block_size()));
         pages
     }
+
+    // ── KV offload bridge ──────────────────────────────────────────────
+
+    /// Content hashes of every *full* prompt block, in prompt order.
+    ///
+    /// These are the keys the KV-offload connector queries the CPU tier with,
+    /// so they must be identical across any two requests sharing a prefix.
+    /// They are kvbm's lineage-based [`SequenceHash`], which is exactly that:
+    /// position + content + parent fragment, so block `i` of prompt `P` hashes
+    /// the same no matter which request computed it.
+    pub fn prompt_block_hashes(&self) -> Vec<[u8; 16]> {
+        self.seq
+            .inner()
+            .sequence()
+            .all_sequence_hashes()
+            .iter()
+            .map(sequence_hash_bytes)
+            .collect()
+    }
+
+    /// `(page_id, content_hash)` for every block currently assigned to this
+    /// request, in prompt order. Drives the offload save once a block seals;
+    /// the first [`prefix_matched_blocks`](Self::prefix_matched_blocks) entries
+    /// are GPU-hit reuse (already resident) and are normally skipped.
+    pub fn assigned_block_hashes(&self) -> Vec<(i32, [u8; 16])> {
+        self.seq
+            .inner()
+            .assignments()
+            .assigned_iter()
+            .map(|(&id, block)| (id as i32, sequence_hash_bytes(&block.sequence_hash())))
+            .collect()
+    }
+
+    /// Strong pins for every block currently assigned to this request, aligned
+    /// 1:1 (same order) with [`assigned_block_hashes`](Self::assigned_block_hashes).
+    ///
+    /// An offload save's GPU→CPU copy runs asynchronously after the save is
+    /// submitted; holding the matching [`KvBlockGuard`] keeps that block out of
+    /// the free/inactive pool until the copy lands, so a later request can't be
+    /// allocated the same slot and overwrite it mid-copy. Drop the guard once
+    /// the save reports done.
+    pub fn assigned_block_guards(&self) -> Vec<KvBlockGuard> {
+        self.seq
+            .inner()
+            .assignments()
+            .assigned_iter()
+            .map(|(_, block)| KvBlockGuard(block.clone()))
+            .collect()
+    }
+
+    /// Number of leading blocks reused from the GPU prefix cache.
+    pub fn prefix_matched_blocks(&self) -> usize {
+        self.seq.inner().prefix_matched_blocks()
+    }
+}
+
+/// Pack a kvbm [`SequenceHash`] (lineage hash) into the 16-byte content key the
+/// offload tier addresses blocks by. Big-endian for a stable on-wire ordering.
+fn sequence_hash_bytes(hash: &SequenceHash) -> [u8; 16] {
+    hash.as_u128().to_be_bytes()
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
+    /// The offload CPU-tier query keys are `prompt_block_hashes`. The whole
+    /// load path is built on these being identical for any two requests that
+    /// share a prefix (and diverging the moment content does) — otherwise a
+    /// warm block saved by one request would never match the next. Guard it.
+    #[test]
+    fn prompt_block_hashes_stable_across_shared_prefix() {
+        let pool = BlockPool::new(16, 256).unwrap();
+        let shared: Vec<u32> = (0..48).map(|i| 1000 + i).collect(); // 3 full blocks
+        let mut a_tokens = shared.clone();
+        a_tokens.extend((0..16).map(|i| 7000 + i)); // 4th block diverges
+        let mut b_tokens = shared.clone();
+        b_tokens.extend((0..16).map(|i| 9000 + i));
+
+        let a = pool.new_request(a_tokens, 8, None);
+        let b = pool.new_request(b_tokens, 8, None);
+        let ha = a.prompt_block_hashes();
+        let hb = b.prompt_block_hashes();
+
+        assert_eq!(ha.len(), 4, "64 tokens / 16 = 4 full blocks");
+        assert_eq!(hb.len(), 4);
+        assert_eq!(ha[..3], hb[..3], "shared prefix must hash identically");
+        assert_ne!(ha[3], hb[3], "divergent block must hash differently");
+        assert!(ha.iter().all(|h| *h != [0u8; 16]), "hashes are non-trivial");
+
+        // A different LoRA salt must poison the match — same tokens, new keys.
+        let c = pool.new_request(shared, 8, Some("adapter-x"));
+        assert_ne!(
+            c.prompt_block_hashes()[0],
+            ha[0],
+            "salt (lora) must scope the prefix cache"
+        );
+    }
+
     /// kvbm's `schedule_decode` allocates the next generation block when the
     /// appended token fills the current block (`need = pending + 1`), so the
     /// raw `page_indices()` exceeds `ceil(kv_tokens / block_size)` at every
diff --git a/pegainfer-kv-offload/Cargo.toml b/pegainfer-kv-offload/Cargo.toml
new file mode 100644
index 00000000..2d8b9e3a
--- /dev/null
+++ b/pegainfer-kv-offload/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "pegainfer-kv-offload"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+# Embedded in-process: pegaflow-core is the host/SSD/RDMA KV offload tier.
+# Pinned to the upstream master commit that landed the two changes pegainfer
+# needs: block_stride_bytes (#331) and the in-process load API (#333). Bump the
+# rev when upstreaming further pegaflow changes; co-develop via a temporary path
+# dep + PR, then re-pin here.
+# default-features=false drops its `cuda-12` (which would pull a clashing
+# cudarc/cuda-12080 selector) and `rdma` (no RDMA hardware needed for the
+# dense GPU<->CPU path). The workspace cudarc already provides cuda-12090 +
+# nvrtc, which is all pegaflow-core's code needs — it has no cfg(cuda-12) gates.
+pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false }
+pegainfer-kv-cache = { workspace = true }
+cudarc = { workspace = true }
+anyhow = { workspace = true }
+half = { workspace = true }
+log = { workspace = true }
+tokio = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/pegainfer-kv-offload/src/engine.rs b/pegainfer-kv-offload/src/engine.rs
new file mode 100644
index 00000000..432b859f
--- /dev/null
+++ b/pegainfer-kv-offload/src/engine.rs
@@ -0,0 +1,447 @@
+//! [`OffloadEngine`]: the in-process connector that moves KV blocks between
+//! pegainfer's GPU paged cache and pegaflow's host/SSD tiers.
+//!
+//! It owns a [`PegaEngine`] plus a small tokio runtime to drive pegaflow's
+//! async save/query, and translates pegainfer's page-first [`KvLayout`] into
+//! pegaflow's per-layer strided registration. Block content hashes are opaque
+//! `Vec<u8>` here — the caller (scheduler) derives them from kvbm sequence
+//! hashes, so this layer never depends on the logical-cache hashing scheme.
+
+use std::sync::{Arc, Mutex};
+
+use cudarc::driver::CudaStream;
+use pegaflow_core::{
+    EngineError, LayerSave, PegaEngine, PrefetchStatus, QueryLeaseId, StorageConfig,
+};
+use pegainfer_kv_cache::KvBuffer;
+use tokio::runtime::Runtime;
+use tokio::sync::oneshot;
+use tokio::task::JoinHandle;
+
+/// Single-GPU, single-rank topology. The dense Qwen3-4B path runs one offload
+/// engine per executor rank, each owning one GPU's KV buffer.
+const NAMESPACE: &str = "pegainfer";
+const TP_RANK: usize = 0;
+const PP_RANK: usize = 0;
+const TP_SIZE: usize = 1;
+const WORLD_SIZE: usize = 1;
+
+/// bf16 KV cache: every layout stride is counted in elements, bytes are ×2.
+const ELEM_SIZE: usize = std::mem::size_of::<half::bf16>();
+
+/// Guard the `block_on` entry points: tokio panics with an opaque message if
+/// you block on a runtime from within any runtime. These methods are meant for
+/// the synchronous scheduler thread; debug builds (only) fail loud if that's violated.
+fn assert_outside_runtime(op: &str) {
+    debug_assert!(
+        tokio::runtime::Handle::try_current().is_err(),
+        "OffloadEngine::{op} drives the offload runtime with block_on and must be \
+         called from a synchronous thread, never from within a tokio runtime"
+    );
+}
+
+/// Tuning knobs for a new [`OffloadEngine`].
+pub struct OffloadConfig {
+    /// Stable identifier shared across this engine's lifetime so prefix blocks
+    /// saved by one request are query-visible to the next.
+    pub instance_id: String,
+    /// CUDA device ordinal whose KV buffer this engine offloads.
+    pub device_id: i32,
+    /// Host pinned-memory pool size in bytes (the CPU KV tier capacity).
+    pub pinned_pool_bytes: usize,
+    /// Worker threads for the embedded runtime that drives pegaflow's async
+    /// save/query. Two is plenty: save is fire-and-forget, query is a brief
+    /// memory-cache lookup.
+    pub runtime_threads: usize,
+}
+
+impl OffloadConfig {
+    pub fn new(instance_id: impl Into<String>, device_id: i32, pinned_pool_bytes: usize) -> Self {
+        Self {
+            instance_id: instance_id.into(),
+            device_id,
+            pinned_pool_bytes,
+            runtime_threads: 2,
+        }
+    }
+}
+
+/// A query hit: how many prefix blocks pegaflow can return from its CPU tier,
+/// and the lease that owns those blocks until [`OffloadEngine::load`] consumes
+/// it. `num_blocks == 0` means a full miss and `lease` is `None`.
+pub struct QueryHit {
+    pub lease: Option<QueryLeaseId>,
+    pub num_blocks: usize,
+}
+
+/// In-flight handle for a CPU→GPU load submitted to pegaflow's worker.
+///
+/// The load runs on pegaflow's GPU worker thread; this resolves when the DMA
+/// completes. [`Self::poll`] keeps scheduler admission non-blocking; [`Self::wait`]
+/// blocks for tests and non-pipelined callers.
+pub struct LoadHandle {
+    rx: oneshot::Receiver<Result<(), EngineError>>,
+}
+
+impl LoadHandle {
+    /// Non-blocking check; `None` while loading. Poll no further after the first `Some`.
+    pub fn poll(&mut self) -> Option<Result<(), EngineError>> {
+        match self.rx.try_recv() {
+            Ok(result) => Some(result),
+            Err(oneshot::error::TryRecvError::Empty) => None,
+            Err(oneshot::error::TryRecvError::Closed) => Some(Err(EngineError::Storage(
+                "load worker dropped reply".into(),
+            ))),
+        }
+    }
+
+    /// Block the current thread until the load settles.
+    pub fn wait(self) -> Result<(), EngineError> {
+        self.rx
+            .blocking_recv()
+            .unwrap_or_else(|_| Err(EngineError::Storage("load worker dropped reply".into())))
+    }
+}
+
+/// Per-layer registration geometry derived once from a [`KvBuffer`]'s layout.
+///
+/// Only `data_ptrs` and `size_bytes` differ per layer; the rest are the same
+/// scalar broadcast across all layers (kept as vectors only to feed pegaflow's
+/// one batched registration call).
+struct Registration {
+    layer_names: Vec<String>,
+    data_ptrs: Vec<u64>,
+    size_bytes: Vec<usize>,
+    num_blocks: Vec<usize>,
+    bytes_per_block: Vec<usize>,
+    kv_stride_bytes: Vec<usize>,
+    segments: Vec<usize>,
+    block_stride_bytes: Vec<usize>,
+}
+
+impl Registration {
+    /// Map the fused page-first buffer to pegaflow's per-layer view.
+    ///
+    /// Each model layer registers as one pegaflow "layer". Within a page the
+    /// layout is K then V back-to-back (`layer_stride = 2·kv_block_len`), so a
+    /// layer's K and V are *contiguous* — one single segment of `layer_stride`
+    /// bytes copies both, and pegaflow's K/V-split path (which needs the two
+    /// segments set apart, `kv_stride > bytes_per_block`) does not apply here.
+    /// What is *not* contiguous is consecutive blocks of one layer: the fused
+    /// buffer interleaves all layers within a page, so they sit `page_stride`
+    /// apart. That gap (stride ≠ copy size) is exactly what `block_stride_bytes`
+    /// decouples.
+    fn from_buffer(buffer: &KvBuffer, stream: &CudaStream) -> Self {
+        let layout = buffer.layout();
+        let num_blocks = buffer.num_blocks();
+        let base_ptr = buffer.device_ptr(stream);
+
+        // One block's copy unit for a layer = its whole [K|V] span in a page.
+        let layer_bytes = layout.layer_stride * ELEM_SIZE;
+        let page_stride_bytes = layout.page_stride * ELEM_SIZE;
+        let total_bytes = num_blocks * page_stride_bytes;
+
+        let n = layout.num_layers;
+        let mut reg = Registration {
+            layer_names: Vec::with_capacity(n),
+            data_ptrs: Vec::with_capacity(n),
+            size_bytes: Vec::with_capacity(n),
+            num_blocks: vec![num_blocks; n],
+            bytes_per_block: vec![layer_bytes; n],
+            kv_stride_bytes: vec![0; n],
+            segments: vec![1; n],
+            block_stride_bytes: vec![page_stride_bytes; n],
+        };
+        for layer in 0..n {
+            let layer_off = layer * layer_bytes;
+            reg.layer_names.push(layer.to_string());
+            reg.data_ptrs.push(base_ptr + layer_off as u64);
+            // Region = this layer's [K|V] base to the buffer end (bounds are checked
+            // against the strided last-block reach); saturate for a zero-block buffer.
+            reg.size_bytes.push(total_bytes.saturating_sub(layer_off));
+        }
+        reg
+    }
+}
+
+/// In-process bridge from pegainfer's GPU KV cache to pegaflow's offload tiers.
+///
+/// Dropping the engine drops its [`Runtime`], which abandons any in-flight
+/// fire-and-forget [`Self::save`] tasks. That is acceptable: the host tier is a
+/// cache, so a lost save only forfeits a future hit, never inference
+/// correctness. Saves that must survive a handoff (eviction) use the synchronous
+/// [`Self::save_blocking`] instead.
+pub struct OffloadEngine {
+    engine: Arc<PegaEngine>,
+    runtime: Runtime,
+    instance_id: String,
+    device_id: i32,
+    /// Owned per-layer names; load borrows these as `&[&str]`.
+    layer_names: Vec<String>,
+    /// In-flight fire-and-forget save tasks. [`Self::flush_saves`] awaits these
+    /// before draining the write pipeline, so a flush is a true barrier — the
+    /// detached D2H may not even have started when the caller flushes.
+    /// Finished handles are pruned on each [`Self::save`].
+    pending_saves: Mutex<Vec<JoinHandle<()>>>,
+}
+
+impl OffloadEngine {
+    /// Build the engine and register `buffer` as the GPU side of the offload.
+    ///
+    /// `stream` must be the stream that owns `buffer` (used only to read its
+    /// base device address). pegaflow attaches the device's primary CUDA
+    /// context for its own worker transfers — the same context pegainfer runs
+    /// on — so the registered pointers are valid across both.
+    pub fn new(
+        config: OffloadConfig,
+        buffer: &KvBuffer,
+        stream: &CudaStream,
+    ) -> Result<Self, EngineError> {
+        let runtime = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(config.runtime_threads.max(1))
+            .enable_all()
+            .build()
+            .map_err(|e| EngineError::Storage(format!("offload runtime build: {e}")))?;
+
+        let storage_config = StorageConfig::default();
+        let engine = Arc::new(PegaEngine::new_with_config(
+            config.pinned_pool_bytes,
+            false,
+            storage_config,
+        )?);
+
+        let reg = Registration::from_buffer(buffer, stream);
+        engine.register_context_layer_batch_strided(
+            &config.instance_id,
+            NAMESPACE,
+            config.device_id,
+            TP_RANK,
+            PP_RANK,
+            TP_SIZE,
+            WORLD_SIZE,
+            reg.layer_names.len(),
+            &reg.layer_names,
+            &reg.data_ptrs,
+            &reg.size_bytes,
+            &reg.num_blocks,
+            &reg.bytes_per_block,
+            &reg.kv_stride_bytes,
+            &reg.segments,
+            Some(&reg.block_stride_bytes),
+        )?;
+
+        Ok(Self {
+            engine,
+            runtime,
+            instance_id: config.instance_id,
+            device_id: config.device_id,
+            layer_names: reg.layer_names,
+            pending_saves: Mutex::new(Vec::new()),
+        })
+    }
+
+    /// Fan one (block_id, hash) list across every layer — the device data
+    /// differs per layer, the ids and hashes don't.
+    fn build_saves(&self, block_ids: &[i32], block_hashes: &[Vec<u8>]) -> Vec<LayerSave> {
+        self.layer_names
+            .iter()
+            .map(|name| LayerSave {
+                layer_name: name.clone(),
+                block_ids: block_ids.to_vec(),
+                block_hashes: block_hashes.to_vec(),
+            })
+            .collect()
+    }
+
+    /// Save the named GPU blocks to the host tier — fire-and-forget.
+    ///
+    /// Best-effort by contract: the GPU→CPU copy runs on pegaflow's worker and
+    /// any failure (pinned pool full, copy error) is logged, never surfaced.
+    /// `block_hashes[i]` is the content hash of `block_ids[i]`; all layers share
+    /// the same (block_id, hash) pairing — only the device data differs.
+    ///
+    /// ORDERING CONTRACT: pegaflow's D2H runs on *its own* stream, with no
+    /// dependency on pegainfer's compute stream. The caller must therefore only
+    /// save blocks whose KV writes are already complete — i.e. call this after
+    /// the producing forward step has synchronized (block-seal time, which is
+    /// post-step-sync in the executor). Saving a block whose attention write is
+    /// still in flight reads torn data. This connector cannot enforce the
+    /// invariant (it does not own the compute stream); the wiring must uphold it.
+    ///
+    /// REUSE CONTRACT: the copy reads the GPU block asynchronously *after* this
+    /// returns, so the block must stay stable until the copy lands. `keep_alive`
+    /// is an opaque payload (e.g. the source blocks' allocator guards) held for
+    /// the lifetime of the spawned save and dropped only once it finishes — so
+    /// the caller's blocks cannot be evicted and overwritten under the in-flight
+    /// D2H (which would snapshot the wrong KV and persist it under the old hash).
+    /// Pass `()` only when the blocks are owned elsewhere for the whole save.
+    pub fn save<G: Send + 'static>(
+        &self,
+        block_ids: &[i32],
+        block_hashes: &[Vec<u8>],
+        keep_alive: G,
+    ) {
+        debug_assert_eq!(block_ids.len(), block_hashes.len());
+        if block_ids.is_empty() {
+            return;
+        }
+        let saves = self.build_saves(block_ids, block_hashes);
+        let engine = Arc::clone(&self.engine);
+        let instance_id = self.instance_id.clone();
+        let device_id = self.device_id;
+        let handle = self.runtime.spawn(async move {
+            if let Err(e) = engine
+                .batch_save_kv_blocks_from_ipc(&instance_id, TP_RANK, PP_RANK, device_id, saves)
+                .await
+            {
+                log::warn!("pegaflow save failed (best-effort): {e}");
+            }
+            // Release the source-block pins only now that the D2H has settled; until
+            // then the blocks must not be reused (see REUSE CONTRACT).
+            drop(keep_alive);
+        });
+        // Track for `flush_saves`; prune the ones that already settled so the
+        // list stays bounded by the genuinely in-flight saves.
+        let mut pending = self.pending_saves.lock().expect("pending_saves poisoned");
+        pending.retain(|h| !h.is_finished());
+        pending.push(handle);
+    }
+
+    /// Save the named GPU blocks and block until the GPU→CPU copy has captured
+    /// the data into the host tier (the insert may still be in flight; pair with
+    /// [`Self::flush_saves`] for cache visibility).
+    ///
+    /// The synchronous contract is what makes this safe at eviction handoff: the
+    /// GPU block can be reused the moment this returns. Errors surface, unlike
+    /// the fire-and-forget [`Self::save`]. The same compute-stream ORDERING
+    /// CONTRACT as [`Self::save`] applies: blocking waits on pegaflow's D2H, not
+    /// on pegainfer's compute stream, so the writes must already be complete.
+    pub fn save_blocking(
+        &self,
+        block_ids: &[i32],
+        block_hashes: &[Vec<u8>],
+    ) -> Result<(), EngineError> {
+        debug_assert_eq!(block_ids.len(), block_hashes.len());
+        if block_ids.is_empty() {
+            return Ok(());
+        }
+        assert_outside_runtime("save_blocking");
+        let saves = self.build_saves(block_ids, block_hashes);
+        self.runtime
+            .block_on(self.engine.batch_save_kv_blocks_from_ipc(
+                &self.instance_id,
+                TP_RANK,
+                PP_RANK,
+                self.device_id,
+                saves,
+            ))
+    }
+
+    /// Look up how long a prefix of `block_hashes` is resident in the CPU tier.
+    ///
+    /// Returns the hit-block count and a lease owning those blocks; pass the
+    /// lease to [`Self::load`] to copy them to GPU. `req_id` must be non-empty
+    /// and unique enough to scope an in-flight prefetch (the request id works).
+    pub fn query(&self, req_id: &str, block_hashes: &[Vec<u8>]) -> Result<QueryHit, EngineError> {
+        if block_hashes.is_empty() {
+            return Ok(QueryHit {
+                lease: None,
+                num_blocks: 0,
+            });
+        }
+        assert_outside_runtime("query");
+        let status = self
+            .runtime
+            .block_on(self.engine.count_prefix_hit_blocks_with_prefetch(
+                &self.instance_id,
+                req_id,
+                block_hashes,
+            ))?;
+
+        match status {
+            // No SSD/RDMA tier in the dense v1 path, so a prefetch should never be
+            // left in flight; treat the rare `Loading` as a miss this tick.
+            PrefetchStatus::Loading => Ok(QueryHit {
+                lease: None,
+                num_blocks: 0,
+            }),
+            PrefetchStatus::Ready { blocks, .. } => {
+                if blocks.is_empty() {
+                    return Ok(QueryHit {
+                        lease: None,
+                        num_blocks: 0,
+                    });
+                }
+                let num_blocks = blocks.len();
+                let lease = self.engine.create_query_lease(&self.instance_id, blocks)?;
+                Ok(QueryHit {
+                    lease: Some(lease),
+                    num_blocks,
+                })
+            }
+        }
+    }
+
+    /// Copy the leased CPU blocks into the GPU blocks named by `dst_block_ids`,
+    /// across every registered layer. Returns a non-blocking [`LoadHandle`].
+    ///
+    /// `dst_block_ids.len()` must equal the lease's block count (the
+    /// `num_blocks` from [`Self::query`]); pegaflow maps the i-th leased block
+    /// onto `dst_block_ids[i]` for each layer.
+    pub fn load(
+        &self,
+        lease: QueryLeaseId,
+        dst_block_ids: Vec<i32>,
+    ) -> Result<LoadHandle, EngineError> {
+        let layer_refs: Vec<&str> = self.layer_names.iter().map(String::as_str).collect();
+        let loads = [(lease, dst_block_ids)];
+        let rx = self.engine.batch_load_kv_blocks_multi_layer_inproc(
+            &self.instance_id,
+            TP_RANK,
+            self.device_id,
+            &layer_refs,
+            &loads,
+        )?;
+        Ok(LoadHandle { rx })
+    }
+
+    /// Release a query lease without loading it.
+    ///
+    /// [`Self::query`] pins its hit blocks behind a lease until [`Self::load`]
+    /// consumes it. When the caller decides not to load (e.g. no GPU
+    /// destination blocks are free), it must release the lease here — a dropped
+    /// [`QueryLeaseId`] is an inert token, so without this the pinned host
+    /// blocks would sit unevictable until the lease's TTL expires. A no-op if
+    /// the lease was already consumed by a `load`.
+    pub fn release_query_lease(&self, lease: QueryLeaseId) {
+        self.engine.release_query_lease(&lease);
+    }
+
+    /// Flush pending saves into the read cache so a following [`Self::query`]
+    /// can see them. A correctness barrier for tests and eviction handoff, not
+    /// a steady-state call.
+    ///
+    /// First awaits every in-flight fire-and-forget [`Self::save`] (their D2H
+    /// copy + write-pipeline submit), *then* drains the write pipeline — without
+    /// the first step a detached save that has not started yet would be missed.
+    pub fn flush_saves(&self) {
+        assert_outside_runtime("flush_saves");
+        let handles: Vec<JoinHandle<()>> = {
+            let mut pending = self.pending_saves.lock().expect("pending_saves poisoned");
+            pending.drain(..).collect()
+        };
+        self.runtime.block_on(async {
+            for handle in handles {
+                let _ = handle.await;
+            }
+            self.engine.flush_saves().await;
+        });
+    }
+
+    /// Drop all resident CPU-tier blocks (test/eviction helper). Saved data in
+    /// a backing store would survive; the dense v1 path has none, so this
+    /// empties the CPU tier.
+    pub fn evict_all(&self) {
+        self.engine.cleanup_memory_cache();
+    }
+}
diff --git a/pegainfer-kv-offload/src/lib.rs b/pegainfer-kv-offload/src/lib.rs
new file mode 100644
index 00000000..3cf3a961
--- /dev/null
+++ b/pegainfer-kv-offload/src/lib.rs
@@ -0,0 +1,21 @@
+//! In-process KV cache offload bridge between pegainfer and pegaflow.
+//!
+//! pegainfer owns the GPU paged-KV (`pegainfer-kv-cache::KvBuffer`, page-first
+//! layout) and the logical prefix cache (kvbm `BlockPool`). pegaflow owns the
+//! deeper tiers (host pinned memory, SSD, RDMA). [`OffloadEngine`] is the
+//! connector that moves blocks between them; the caller decides when.
+//!
+//! Dense-attention v1 (Qwen3-4B): the GPU prefix hit stays native to kvbm's
+//! `BlockPool`; this connector covers the CPU tier and stacks a CPU-hit prefix
+//! on top of the GPU-hit prefix (both anchor at prefix 0, so the combined hit
+//! is one contiguous prefix split at a single point — GPU→CPU→GPU interleaving
+//! is deliberately excluded). Save is best-effort fire-and-forget; load is on
+//! the critical path, strongly ordered, but never blocks admission — a request
+//! polls its [`LoadHandle`] each scheduler tick.
+
+mod engine;
+
+pub use engine::{LoadHandle, OffloadConfig, OffloadEngine, QueryHit};
+
+// Re-exported so callers name pegaflow's engine types through this bridge.
+pub use pegaflow_core::{EngineError, PegaEngine, QueryLeaseId};
diff --git a/pegainfer-kv-offload/tests/cpu_roundtrip.rs b/pegainfer-kv-offload/tests/cpu_roundtrip.rs
new file mode 100644
index 00000000..f02f0c67
--- /dev/null
+++ b/pegainfer-kv-offload/tests/cpu_roundtrip.rs
@@ -0,0 +1,155 @@
+//! GPU→CPU→GPU round-trip over a real page-first [`KvBuffer`].
+//!
+//! Writes a distinct bit pattern into a set of source GPU blocks, offloads them
+//! to pegaflow's host tier, then loads them back into a *different* set of GPU
+//! blocks and checks the bytes match. This exercises the whole connector —
+//! strided per-layer registration (`block_stride` ≠ copy size), the contiguous
+//! K-then-V single-segment copy, the blocking save, the prefix query, and the
+//! in-process oneshot load — on actual device memory. If the layout math were
+//! wrong the loaded bytes would land in the wrong layer/segment/block and the
+//! compare would fail.
+//!
+//! Requires a CUDA GPU; skipped from `--lib` unit runs.
+
+use cudarc::driver::{CudaContext, result};
+use half::bf16;
+use pegainfer_kv_cache::KvBuffer;
+use pegainfer_kv_offload::{OffloadConfig, OffloadEngine};
+
+const NUM_LAYERS: usize = 4;
+const NUM_KV_HEADS: usize = 2;
+const HEAD_DIM: usize = 8;
+const PAGE_SIZE: usize = 16;
+const NUM_BLOCKS: usize = 32;
+
+/// Elements in one K (or V) segment of one block.
+const SEGMENT_LEN: usize = PAGE_SIZE * NUM_KV_HEADS * HEAD_DIM;
+const LAYER_STRIDE: usize = 2 * SEGMENT_LEN;
+const PAGE_STRIDE: usize = NUM_LAYERS * LAYER_STRIDE;
+
+/// Deterministic, finite, varied pattern for one (logical block, layer, segment).
+/// `logical` is the block's position in the saved hash list — load must restore
+/// the i-th leased block onto the i-th destination, so the destination's bytes
+/// must equal `pattern(i, ..)` regardless of which physical block held it.
+fn pattern(logical: usize, layer: usize, segment: usize) -> Vec<bf16> {
+    (0..SEGMENT_LEN)
+        .map(|e| {
+            let seed = (logical * 9973 + layer * 257 + segment * 131 + e * 7) % 4093;
+            bf16::from_f32(seed as f32 / 11.0 - 90.0)
+        })
+        .collect()
+}
+
+/// Byte address of (block, layer, segment)'s first element within the fused buffer.
+fn segment_ptr(base: u64, block_id: usize, layer: usize, segment: usize) -> u64 {
+    let elem_off = block_id * PAGE_STRIDE + layer * LAYER_STRIDE + segment * SEGMENT_LEN;
+    base + (elem_off * std::mem::size_of::<bf16>()) as u64
+}
+
+fn block_hash(logical: usize) -> Vec<u8> {
+    let mut h = vec![0xA5u8; 16];
+    h[0] = logical as u8;
+    h[1] = (logical as u8).wrapping_mul(31).wrapping_add(7);
+    h
+}
+
+#[test]
+fn gpu_cpu_gpu_roundtrip_preserves_kv_bytes() {
+    let ctx = CudaContext::new(0).expect("cuda device 0");
+    ctx.bind_to_thread().expect("bind ctx to test thread");
+    let stream = ctx.default_stream();
+
+    let buffer = KvBuffer::new(
+        &stream,
+        NUM_LAYERS,
+        NUM_KV_HEADS,
+        HEAD_DIM,
+        PAGE_SIZE,
+        NUM_BLOCKS,
+    )
+    .expect("alloc KvBuffer");
+    // Sanity: our test-local geometry constants match the buffer's layout.
+    assert_eq!(buffer.layout().page_stride, PAGE_STRIDE);
+    assert_eq!(buffer.layout().kv_block_len, SEGMENT_LEN);
+
+    let base = buffer.device_ptr(&stream);
+
+    let src_blocks = [1usize, 2, 3];
+    let dst_blocks = [10usize, 11, 12];
+    let untouched_block = 20usize;
+
+    // ── Fill source blocks with the per-(logical, layer, segment) pattern ──
+    for (logical, &block_id) in src_blocks.iter().enumerate() {
+        for layer in 0..NUM_LAYERS {
+            for segment in 0..2 {
+                let data = pattern(logical, layer, segment);
+                let dst = segment_ptr(base, block_id, layer, segment);
+                // SAFETY: dst lies inside the buffer (block < NUM_BLOCKS) and the
+                // slice is exactly one segment of bf16, the buffer's element type.
+                unsafe { result::memcpy_htod_sync(dst, &data) }.expect("htod fill");
+            }
+        }
+    }
+    stream.synchronize().expect("sync after fill");
+
+    // ── Build the offload engine (registers the fused buffer) ──
+    let engine = OffloadEngine::new(
+        OffloadConfig::new("roundtrip-test", 0, 64 * 1024 * 1024),
+        &buffer,
+        &stream,
+    )
+    .expect("build OffloadEngine");
+
+    let hashes: Vec<Vec<u8>> = (0..src_blocks.len()).map(block_hash).collect();
+    let src_ids: Vec<i32> = src_blocks.iter().map(|&b| b as i32).collect();
+
+    // ── Save GPU→CPU (blocking capture) and make the writes cache-visible ──
+    engine.save_blocking(&src_ids, &hashes).expect("save");
+    engine.flush_saves();
+
+    // ── Query the CPU tier: the full 3-block prefix must be resident ──
+    let hit = engine.query("roundtrip-req", &hashes).expect("query");
+    assert_eq!(
+        hit.num_blocks, 3,
+        "all three saved blocks should hit the CPU tier"
+    );
+    let lease = hit.lease.expect("a hit returns a lease");
+
+    // ── Load CPU→GPU into a *different* set of blocks ──
+    let dst_ids: Vec<i32> = dst_blocks.iter().map(|&b| b as i32).collect();
+    engine
+        .load(lease, dst_ids)
+        .expect("submit load")
+        .wait()
+        .expect("load completes");
+    stream.synchronize().expect("sync after load");
+
+    // ── Verify each destination block holds the matching logical pattern ──
+    for (logical, &block_id) in dst_blocks.iter().enumerate() {
+        for layer in 0..NUM_LAYERS {
+            for segment in 0..2 {
+                let expected = pattern(logical, layer, segment);
+                let mut got = vec![bf16::ZERO; SEGMENT_LEN];
+                let src = segment_ptr(base, block_id, layer, segment);
+                // SAFETY: src is one in-bounds segment of bf16.
+                unsafe { result::memcpy_dtoh_sync(&mut got, src) }.expect("dtoh verify");
+                let expected_bits: Vec<u16> = expected.iter().map(|v| v.to_bits()).collect();
+                let got_bits: Vec<u16> = got.iter().map(|v| v.to_bits()).collect();
+                assert_eq!(
+                    got_bits, expected_bits,
+                    "dst block {block_id} layer {layer} segment {segment} \
+                     must restore logical block {logical}'s bytes"
+                );
+            }
+        }
+    }
+
+    // ── Negative control: a block we never loaded stays zero ──
+    let mut zero = vec![bf16::from_f32(1.0); SEGMENT_LEN];
+    let src = segment_ptr(base, untouched_block, 0, 0);
+    unsafe { result::memcpy_dtoh_sync(&mut zero, src) }.expect("dtoh untouched");
+    assert!(
+        zero.iter().all(|v| v.to_bits() == 0),
+        "an unloaded block must remain zeroed — load must not scribble outside its destinations"
+    );
+}
diff --git a/pegainfer-qwen3-4b/Cargo.toml b/pegainfer-qwen3-4b/Cargo.toml
index 65d58a82..5a017c4c 100644
--- a/pegainfer-qwen3-4b/Cargo.toml
+++ b/pegainfer-qwen3-4b/Cargo.toml
@@ -15,6 +15,7 @@ comfy-table = { workspace = true, optional = true }
 crossbeam-channel = { workspace = true }
 cudarc = { workspace = true }
 pegainfer-kv-cache = { workspace = true }
+pegainfer-kv-offload = { workspace = true }
 fastrace = { workspace = true }
 half = { workspace = true }
 hex = { workspace = true, optional = true }
diff --git a/pegainfer-qwen3-4b/src/executor.rs b/pegainfer-qwen3-4b/src/executor.rs
index 10358a44..47b7caf4 100644
--- a/pegainfer-qwen3-4b/src/executor.rs
+++ b/pegainfer-qwen3-4b/src/executor.rs
@@ -4,16 +4,19 @@ use std::thread;
 use anyhow::Result;
 use crossbeam_channel as channel;
 
-use crate::Qwen3LoraOptions;
 use crate::batch_decode_buffers::{BATCH_BUCKETS, BatchDecodeBuffers};
 use crate::config::{Config, TensorParallelConfig};
 use crate::weights::{ModelRuntimeConfig, Qwen3Model};
+use crate::{Qwen3LoraOptions, Qwen3OffloadOptions};
 use pegainfer_core::engine::{LoadLoraAdapterRequest, TokenLogprob, UnloadLoraAdapterRequest};
 use pegainfer_core::kv_pool::KvLayout;
 use pegainfer_core::ops;
 use pegainfer_core::sampler::SamplingParams;
 use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates};
-use pegainfer_kv_cache::{KvBuffer, KvCacheManager, KvView};
+use pegainfer_kv_cache::{
+    KvBlockGuard, KvBuffer, KvCacheManager, KvView, LoadReservation, PrefixProbe,
+};
+use pegainfer_kv_offload::{LoadHandle, OffloadConfig, OffloadEngine};
 
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)]
 pub struct RequestId(pub(crate) u64);
@@ -449,6 +452,41 @@ pub(crate) trait ModelExecutor: Send {
     fn list_lora_adapters(&self) -> Vec<String> {
         Vec::new()
     }
+
+    // ── KV-offload prefetch hooks (no-op unless offload is enabled) ─────
+
+    /// Offer a freshly-submitted request for async CPU-tier KV prefetch. Returns
+    /// `true` if a load is now in flight and the scheduler must park the request
+    /// until [`Self::drain_ready_prefetch`] or [`Self::wait_ready_prefetch`] reports it.
+    fn begin_kv_prefetch(
+        &mut self,
+        _request_id: RequestId,
+        _prompt_tokens: &[u32],
+        _lora_adapter: Option<&str>,
+    ) -> bool {
+        false // default hook: never starts a load, so nothing is ever parked
+    }
+
+    /// Non-blocking sweep returning the ids of requests whose prefetch just
+    /// settled (now prefill-eligible). The default hook reports none.
+    fn drain_ready_prefetch(&mut self) -> Vec<RequestId> {
+        Vec::new()
+    }
+
+    /// Block until at least one in-flight prefetch settles (idle-only), then
+    /// sweep the rest. The default hook returns an empty list immediately.
+    fn wait_ready_prefetch(&mut self) -> Vec<RequestId> {
+        Vec::new()
+    }
+
+    /// Number of blocks `request_id` already holds via a settled prefetch (its
+    /// restored prefix). These were taken out of the free pool for this request
+    /// and become its cached prefill prefix, so admission credits them against
+    /// the request's block need to avoid double-counting. Zero unless a
+    /// prefetch has committed for `request_id`; the default hook is always zero.
+    fn prefetched_blocks(&self, _request_id: RequestId) -> usize {
+        0
+    }
 }
 
 struct Qwen3ExecutorMetadata {
@@ -466,10 +504,42 @@ pub struct Qwen3Executor {
     loaded_lora_adapters: HashSet<String>,
     prefix_cache_enabled: bool,
     lora_options: Qwen3LoraOptions,
+    /// pegaflow KV-offload bridge; `None` unless offload is opted in on the
+    /// single-GPU path. Drives both the SAVE hook and the async LOAD prefetch.
+    offload: Option<OffloadEngine>,
+    /// Per-request count of sealed blocks already saved to the host tier, so
+    /// each step only saves blocks that newly sealed. Initialized to the
+    /// GPU-hit prefix (already resident) on first save.
+    saved_cursor: HashMap<RequestId, usize>,
+    /// In-flight CPU→GPU prefetches keyed by request, parked until their load
+    /// settles and the blocks register into the prefix cache.
+    prefetch: HashMap<RequestId, PrefetchState>,
+    /// Offload pure-L2 mode. When set, completed blocks are not kept for
+    /// cross-request HBM reuse: the prefetch probe drains the inactive pool
+    /// first, so every probe sees `gpu_hit == 0` and the whole cacheable prefix
+    /// is restored from the host tier. This is what `--no-prefix-cache` means
+    /// once offload is on (the L2 restore still rides on `match_and_add_prefix`,
+    /// so prefix matching itself stays enabled). Set via
+    /// [`Self::set_no_prefix_cache`].
+    l1_retention_disabled: bool,
+}
+
+/// One request's in-flight CPU-tier KV prefetch.
+///
+/// Holds the destination blocks (via `probe`/`reservation`) and the load handle
+/// so the scheduler can poll completion without blocking. Once the load settles,
+/// the reservation is committed (blocks staged + registered) and only `probe`
+/// remains, holding the GPU+CPU prefix resident until the request prefills.
+struct PrefetchState {
+    probe: PrefixProbe, // prefix probe for the prompt; kept until the request prefills
+    /// `Some` until the load lands and the blocks are committed.
+    reservation: Option<LoadReservation>,
+    /// `Some` while the DMA is in flight; `None` once taken to wait on or settled.
+    handle: Option<LoadHandle>,
+}
 
 impl Qwen3Executor {
-    pub(crate) fn single(model: Qwen3Model) -> Result<Self> {
+    pub(crate) fn single(model: Qwen3Model, offload_opts: &Qwen3OffloadOptions) -> Result<Self> {
         let budget = model.kv_budget();
         let kv_mgr = KvCacheManager::new(
             &model.device_ctx().stream,
@@ -485,6 +555,9 @@ impl Qwen3Executor {
             config: model.config().clone(),
         };
         let kv_buffer = kv_mgr.buffer().clone();
+        // Build the offload engine while the model's stream is still in hand
+        // (it moves into the RankWorker below). Registers the fused KV buffer.
+        let offload = build_offload(offload_opts, &kv_mgr, model.device_ctx())?;
         let total_blocks = kv_mgr.pool().total_blocks();
         let padding_block_id = kv_mgr.pool().padding_block_id();
         Ok(Self {
@@ -499,6 +572,10 @@ impl Qwen3Executor {
             loaded_lora_adapters: HashSet::new(),
             prefix_cache_enabled: true,
             lora_options: Qwen3LoraOptions::default(),
+            offload,
+            saved_cursor: HashMap::new(),
+            prefetch: HashMap::new(),
+            l1_retention_disabled: false,
         })
     }
 
@@ -512,6 +589,7 @@ impl Qwen3Executor {
             enable_cuda_graph,
             device_ordinals,
             Qwen3LoraOptions::default(),
+            Qwen3OffloadOptions::disabled(),
         )
     }
 
@@ -520,12 +598,19 @@ impl Qwen3Executor {
         enable_cuda_graph: bool,
         device_ordinals: &[usize],
         lora_options: Qwen3LoraOptions,
+        offload_options: Qwen3OffloadOptions,
     ) -> Result<Self> {
         let lora_options = lora_options.validate()?;
         anyhow::ensure!(
             !device_ordinals.is_empty(),
             "Qwen3 executor requires at least one device"
         );
+        anyhow::ensure!(
+            !offload_options.enabled || device_ordinals.len() == 1,
+            "KV offload is only supported on the single-GPU path (tensor parallel \
+             shards KV per rank); got {} devices",
+            device_ordinals.len()
+        );
         if device_ordinals.len() == 1 {
             let model = Qwen3Model::from_safetensors_with_runtime(
                 model_path,
@@ -537,7 +622,7 @@ impl Qwen3Executor {
                     max_lora_rank: lora_options.max_lora_rank,
                 },
             )?;
-            let mut executor = Self::single(model)?;
+            let mut executor = Self::single(model, &offload_options)?;
             executor.lora_options = lora_options;
             return Ok(executor);
         }
@@ -635,6 +720,11 @@ impl Qwen3Executor {
             loaded_lora_adapters: HashSet::new(),
             prefix_cache_enabled: true,
             lora_options,
+            // Offload is single-GPU only (asserted above); never built here.
+            offload: None,
+            saved_cursor: HashMap::new(),
+            prefetch: HashMap::new(),
+            l1_retention_disabled: false,
         })
     }
 
@@ -677,6 +767,145 @@ impl Qwen3Executor {
         self.prefix_cache_enabled = enabled;
     }
 
+    /// vLLM-style `--no-prefix-cache`. Behaviour depends on whether offload is
+    /// active (passing `on = false` switches the respective mode back off):
+    ///   * **No offload** — classic: disable prefix matching outright, so every
+    ///     prefill recomputes the full prompt.
+    ///   * **With offload** — pure-L2 mode: keep matching on (the host-tier
+    ///     restore registers blocks and relies on `match_and_add_prefix` to pick
+    ///     them up) but stop retaining completed blocks in HBM, so no request
+    ///     ever serves its prefix from a cross-request L1 hit. Every reuse then
+    ///     comes from the host tier, which is the point of the L2 benchmark.
+    ///
+    /// A resident HBM block and its host-tier copy share one content hash, so
+    /// the cache cannot be told to prefer L2 for a block still in HBM — the only
+    /// way to force the bytes from L2 is to not keep the HBM copy around.
+    pub fn set_no_prefix_cache(&mut self, on: bool) {
+        if self.offload.is_some() {
+            self.l1_retention_disabled = on; // pure-L2 mode; `prefix_cache_enabled` untouched
+        } else {
+            self.prefix_cache_enabled = !on; // classic: `on` turns prefix matching off
+        }
+    }
+
+    /// Whether KV offload is active on this executor (an offload engine is set).
+    pub fn offload_enabled(&self) -> bool {
+        self.offload.is_some()
+    }
+
+    /// Flush pending offload saves into the host read cache so a following
+    /// query can see them. Acts as a persistence barrier for handoff and tests;
+    /// does nothing when no offload engine is present.
+    pub fn flush_offload_saves(&self) {
+        if let Some(offload) = &self.offload {
+            offload.flush_saves();
+        }
+    }
+
+    /// Drop every cached-but-unused GPU prefix block (the pool's inactive set).
+    /// With offload on, this forces a cold prefix to be restored from the host
+    /// tier on its next request (rather than served from HBM).
+    pub fn evict_cached_blocks(&self) {
+        self.kv_mgr.pool().evict_inactive();
+    }
+
+    /// Begin an async CPU-tier KV prefetch for `request_id` by forwarding to the
+    /// [`ModelExecutor`] hook (fully qualified so this wrapper cannot recurse).
+    /// Public for admission drivers and tests; `true` means a load is in flight.
+    pub fn begin_kv_prefetch(
+        &mut self,
+        request_id: RequestId,
+        prompt_tokens: &[u32],
+        lora_adapter: Option<&str>,
+    ) -> bool {
+        <Self as ModelExecutor>::begin_kv_prefetch(self, request_id, prompt_tokens, lora_adapter)
+    }
+
+    /// Public forwarder to the [`ModelExecutor`] hook: block until one in-flight
+    /// prefetch settles, sweep the rest, return the settled (prefill-ready) ids.
+    pub fn wait_ready_prefetch(&mut self) -> Vec<RequestId> {
+        <Self as ModelExecutor>::wait_ready_prefetch(self)
+    }
+
+    // ── KV-offload SAVE ────────────────────────────────────────────────
+
+    /// Save every block that sealed since this request's last save to the host
+    /// tier (fire-and-forget). Safe to call right after `apply_prefill`/
+    /// `apply_decode`: the producing step's token read-back has already
+    /// synchronized the compute stream, so the sealed KV is fully written.
+    fn save_sealed_blocks(&mut self, request_id: RequestId) {
+        if self.offload.is_none() {
+            return; // offload disabled: nothing to save
+        }
+        let Some(rkv) = self.request_kvs.get(&request_id) else {
+            return; // no KV state tracked for this request
+        };
+        // `assigned_block_hashes` lists only sealed (registered) blocks; the
+        // partial tail block has no hash and never appears here.
+        let assigned = rkv.assigned_block_hashes();
+        let prefix_matched = rkv.prefix_matched_blocks();
+        let cursor = self
+            .saved_cursor
+            .entry(request_id)
+            .or_insert(prefix_matched); // first save starts past the matched prefix
+        if assigned.len() <= *cursor {
+            return; // nothing newly sealed since the last save
+        }
+        let fresh = &assigned[*cursor..];
+        let block_ids: Vec<i32> = fresh.iter().map(|(id, _)| *id).collect();
+        let block_hashes: Vec<Vec<u8>> = fresh.iter().map(|(_, h)| h.to_vec()).collect();
+        // Pin exactly the blocks being saved (aligned 1:1 with `assigned`) for
+        // the duration of the async D2H, so a finished request can't hand the
+        // slot to a new request that overwrites it before the copy lands.
+        let pins: Vec<KvBlockGuard> = rkv
+            .assigned_block_guards()
+            .into_iter()
+            .skip(*cursor)
+            .collect();
+        *cursor = assigned.len(); // advanced before submitting; the save is not awaited here
+        self.offload
+            .as_ref()
+            .expect("offload present") // guaranteed by the `is_none` check at the top
+            .save(&block_ids, &block_hashes, pins);
+    }
+
+    // ── KV-offload LOAD (async CPU-tier prefetch) ──────────────────────
+    // The trait-facing prefetch hooks (`begin_kv_prefetch`,
+    // `drain_ready_prefetch`, `wait_ready_prefetch`, `prefetched_blocks`)
+    // live in the `ModelExecutor` impl below; `settle_prefetch` is the helper
+    // the drain/wait hooks share.
+
+    /// Finalize one prefetch whose load returned `result`. On success the
+    /// reserved blocks are committed (held by the probe until the request
+    /// prefills) — panics if `id` has no uncommitted reservation; on failure
+    /// the state is dropped so the request prefills from scratch.
+    fn settle_prefetch(
+        &mut self,
+        id: RequestId,
+        result: Result<(), pegainfer_kv_offload::EngineError>,
+    ) {
+        if let Some(st) = self.prefetch.get_mut(&id) {
+            st.handle = None; // the load has finished either way; nothing left to poll
+        }
+        match result {
+            Ok(()) => {
+                let reservation = self
+                    .prefetch
+                    .get_mut(&id)
+                    .and_then(|st| st.reservation.take())
+                    .expect("reservation present until commit");
+                let st = self.prefetch.get_mut(&id).expect("prefetch present");
+                self.kv_mgr
+                    .pool()
+                    .commit_loaded_blocks(&mut st.probe, reservation);
+            }
+            Err(e) => {
+                log::warn!("KV offload load failed for {id:?} (prefill from scratch): {e}");
+                self.prefetch.remove(&id); // drops the probe and reservation with the state
+            }
+        }
+    }
+
     fn wait_for_step_ack(
         pending: Vec<channel::Receiver<Result<WorkerStepOutcome>>>,
         op_name: &'static str,
@@ -712,6 +941,33 @@ impl Qwen3Executor {
     }
 }
 
+/// Build the KV-offload engine for the single-GPU path, or `None` when offload
+/// is disabled. Registers the fused KV buffer with pegaflow against the model's
+/// device/stream — must be called while that stream is still owned by the model
+/// (before it moves into the `RankWorker`). Engine init errors are returned.
+fn build_offload(
+    opts: &Qwen3OffloadOptions,
+    kv_mgr: &KvCacheManager,
+    ctx: &DeviceContext,
+) -> Result<Option<OffloadEngine>> {
+    if !opts.enabled {
+        return Ok(None); // offload not requested
+    }
+    let device_id = ctx.device_ordinal as i32;
+    let config = OffloadConfig::new(
+        format!("qwen3-4b-dev{device_id}"),
+        device_id,
+        opts.pinned_pool_bytes,
+    );
+    let engine = OffloadEngine::new(config, kv_mgr.buffer(), &ctx.stream)
+        .map_err(|e| anyhow::anyhow!("KV offload engine init failed: {e}"))?;
+    log::info!(
+        "KV offload enabled on device {device_id} ({} MiB host tier)",
+        opts.pinned_pool_bytes >> 20 // bytes → MiB
+    );
+    Ok(Some(engine))
+}
+
 fn ensure_lora_capacity(
     loaded_lora_adapters: &HashSet<String>,
     lora_name: &str,
@@ -760,13 +1016,146 @@ impl ModelExecutor for Qwen3Executor {
         self.metadata.stop_token_ids.contains(&token_id)
     }
 
+    fn prefetched_blocks(&self, request_id: RequestId) -> usize {
+        match self.prefetch.get(&request_id) { // parked prefetch state, if any
+            Some(state) => state.probe.held_blocks(),
+            None => 0,
+        }
+    }
+
     fn drop_request(&mut self, request_id: RequestId) -> Result<()> {
         // Remove and drop — RAII on SchedulableSequence's block guards
-        // returns all allocated blocks regardless of lifecycle state.
+        // returns all allocated blocks regardless of lifecycle state. The same
+        // RAII frees any parked prefetch's reserved/held blocks.
         self.request_kvs.remove(&request_id);
+        // A parked prefetch may still have a load in flight: pegaflow's worker
+        // is writing the reserved GPU blocks (H2D). Dropping the reservation now
+        // frees those physical pages for immediate reuse while the DMA keeps
+        // landing on them — silent KV corruption, the load-side mirror of the
+        // SAVE keep-alive pin. Block until the copy finishes before the
+        // reservation drops. The scheduler is a dedicated synchronous thread, so
+        // this brief wait costs nothing it could spend elsewhere.
+        if let Some(mut state) = self.prefetch.remove(&request_id) {
+            if let Some(handle) = state.handle.take() {
+                let _ = handle.wait(); // outcome ignored: only completion matters here
+            }
+        }
+        self.saved_cursor.remove(&request_id); // forget this request's save progress
         Ok(())
     }
 
+    fn begin_kv_prefetch(
+        &mut self,
+        request_id: RequestId,
+        prompt_tokens: &[u32],
+        lora_adapter: Option<&str>,
+    ) -> bool {
+        let Some(offload) = self.offload.as_ref() else {
+            return false; // offload disabled: nothing to prefetch from
+        };
+        if !self.prefix_cache_enabled {
+            return false; // prefix matching is off, so no prefetch is attempted
+        }
+        if self.l1_retention_disabled {
+            // Pure-L2 mode: drop any cross-request HBM retention so the probe
+            // sees gpu_hit == 0 and queries the whole cacheable prefix from the
+            // host tier. Only inactive (completed, unheld) blocks are drained —
+            // the current request holds nothing yet, and in-flight prefetches
+            // keep their reserved blocks, so this never touches live KV.
+            self.kv_mgr.pool().evict_inactive();
+        }
+        let probe = self
+            .kv_mgr
+            .pool()
+            .probe_prefix(prompt_tokens.to_vec(), lora_adapter);
+        let query_hashes = probe.cpu_query_hashes();
+        if query_hashes.is_empty() {
+            return false; // the probe left nothing to ask the host tier for
+        }
+        let hit = match offload.query(&request_id.0.to_string(), &query_hashes) {
+            Ok(hit) => hit,
+            Err(e) => {
+                log::warn!("KV offload query failed for {request_id:?} (skipping): {e}");
+                return false;
+            }
+        };
+        let (Some(lease), num_blocks) = (hit.lease, hit.num_blocks) else {
+            return false; // miss
+        };
+        let Some(reservation) = self.kv_mgr.pool().reserve_loaded_blocks(num_blocks) else {
+            // Block pressure: release the lease so its pinned host blocks aren't
+            // held for the full lease TTL, and prefill from scratch rather than
+            // stall.
+            offload.release_query_lease(lease);
+            return false;
+        };
+        let page_ids = reservation.page_ids();
+        let handle = match offload.load(lease, page_ids) {
+            Ok(handle) => handle,
+            Err(e) => {
+                log::warn!("KV offload load submit failed for {request_id:?} (skipping): {e}");
+                // `load` consumes the lease only past its early validation; a
+                // submit error may leave it pinned, so release it (no-op if it
+                // was already consumed).
+                offload.release_query_lease(lease);
+                return false; // `reservation` and `probe` drop here with the function scope
+            }
+        };
+        self.prefetch.insert(
+            request_id,
+            PrefetchState {
+                probe,
+                reservation: Some(reservation),
+                handle: Some(handle),
+            },
+        );
+        true // load submitted: caller parks the request until it settles
+    }
+
+    fn drain_ready_prefetch(&mut self) -> Vec<RequestId> {
+        let ids: Vec<RequestId> = self.prefetch.keys().copied().collect();
+        let mut done = Vec::new();
+        for id in ids { // iterate a key snapshot: settling below mutates the map
+            let poll = match self.prefetch.get_mut(&id).and_then(|st| st.handle.as_mut()) {
+                Some(handle) => handle.poll(),
+                None => continue, // already settled, awaiting prefill
+            };
+            if let Some(result) = poll {
+                self.settle_prefetch(id, result);
+                done.push(id); // reported whether the load succeeded or failed
+            }
+        }
+        done
+    }
+
+    fn wait_ready_prefetch(&mut self) -> Vec<RequestId> {
+        let mut done = Vec::new();
+        if let Some(id) = self
+            .prefetch
+            .iter()
+            .find(|(_, st)| st.handle.is_some()) // first in-flight entry in map order
+            .map(|(id, _)| *id)
+        {
+            let handle = self
+                .prefetch
+                .get_mut(&id)
+                .and_then(|st| st.handle.take())
+                .expect("in-flight handle present");
+            let result = handle.wait(); // the blocking step of this hook
+            self.settle_prefetch(id, result);
+            // `settle_prefetch` clears the handle, so the drain below skips it;
+            // record it here as the one we blocked on.
+            done.push(id);
+        }
+        // Sweep any others that completed concurrently.
+        for id in self.drain_ready_prefetch() {
+            if !done.contains(&id) {
+                done.push(id);
+            }
+        }
+        done
+    }
+
     fn execute_prefill(&mut self, plan: PrefillPlan<'_>) -> Result<PrefillResult> {
         // 1. Create RequestKvs, reuse cached prefix blocks, schedule the rest
         let mut requests = plan.requests.to_vec();
@@ -786,6 +1175,10 @@ impl ModelExecutor for Qwen3Executor {
                     anyhow::anyhow!("schedule_prefill failed for {:?}: {e}", req.request_id)
                 })?;
             self.request_kvs.insert(req.request_id, rkv);
+            // match_and_add_prefix above already absorbed any CPU-prefetched
+            // blocks (now held by the request's sequence), so release the
+            // prefetch's separate hold.
+            self.prefetch.remove(&req.request_id);
         }
 
         // 2. Build KvViews (seq_len = cached prefix + new suffix)
@@ -819,6 +1212,10 @@ impl ModelExecutor for Qwen3Executor {
                 .expect("request must exist after prefill");
             rkv.apply_prefill(req_result.first_token, self.kv_mgr.pool())?;
         }
+        // 5. Offload the blocks this prefill just sealed (post-step-sync).
+        for req_result in &result.requests {
+            self.save_sealed_blocks(req_result.request_id);
+        }
 
         Ok(result)
     }
@@ -866,6 +1263,10 @@ impl ModelExecutor for Qwen3Executor {
                 .expect("request must exist after decode");
             rkv.apply_decode(req_result.token, self.kv_mgr.pool())?;
         }
+        // 5. Offload any block this decode step just sealed (post-step-sync).
+        for req_result in &result.requests {
+            self.save_sealed_blocks(req_result.request_id);
+        }
 
         Ok(result)
     }
@@ -888,6 +1289,7 @@ impl ModelExecutor for Qwen3Executor {
                     anyhow::anyhow!("schedule_prefill failed for {:?}: {e}", req.request_id)
                 })?;
             self.request_kvs.insert(req.request_id, rkv);
+            self.prefetch.remove(&req.request_id);
         }
 
         // Schedule decode for active requests
@@ -945,6 +1347,13 @@ impl ModelExecutor for Qwen3Executor {
                 .expect("request must exist after unified decode");
             rkv.apply_decode(req_result.token, self.kv_mgr.pool())?;
         }
+        // 5. Offload sealed blocks from both halves (post-step-sync).
+        for req_result in &result.prefill_requests {
+            self.save_sealed_blocks(req_result.request_id);
+        }
+        for req_result in &result.decode_requests {
+            self.save_sealed_blocks(req_result.request_id);
+        }
 
         Ok(result)
     }
diff --git a/pegainfer-qwen3-4b/src/lib.rs b/pegainfer-qwen3-4b/src/lib.rs
index 48c20af0..83106f68 100644
--- a/pegainfer-qwen3-4b/src/lib.rs
+++ b/pegainfer-qwen3-4b/src/lib.rs
@@ -63,6 +63,45 @@ impl Default for Qwen3LoraOptions {
     }
 }
 
+/// KV-offload (pegaflow) opt-in for the single-GPU Qwen3 path.
+///
+/// Disabled by default — the existing GPU-only prefix cache is unchanged.
+/// When enabled, the executor saves sealed KV blocks to pegaflow's host tier
+/// and prefetches CPU-resident prefixes back into HBM before prefill, so a
+/// prompt that has fallen out of the GPU cache still skips recompute. Only the
+/// single-GPU topology is supported (tensor parallel shards KV per rank).
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub struct Qwen3OffloadOptions {
+    pub enabled: bool, // master switch for the offload feature
+    /// Host pinned-memory pool size (the CPU KV-tier capacity), in bytes.
+    pub pinned_pool_bytes: usize,
+}
+
+impl Qwen3OffloadOptions {
+    /// 8 GiB host tier — a few thousand dense Qwen3-4B blocks.
+    pub const DEFAULT_PINNED_POOL_BYTES: usize = 8 << 30;
+
+    pub fn disabled() -> Self { // offload off; the pool size is left at zero
+        Self {
+            enabled: false,
+            pinned_pool_bytes: 0,
+        }
+    }
+
+    pub fn enabled(pinned_pool_bytes: usize) -> Self { // offload on with this host pool size
+        Self {
+            enabled: true,
+            pinned_pool_bytes,
+        }
+    }
+}
+
+impl Default for Qwen3OffloadOptions {
+    fn default() -> Self {
+        Self::disabled() // offload is opt-in
+    }
+}
+
 /// Low-level Qwen3 execution interface.
 ///
 /// This is the production phase boundary used by the Qwen3 scheduler and by
@@ -99,6 +138,24 @@ pub fn probe_model(model_path: &Path) -> Result<Option<ModelInfo>> {
 }
 
 pub fn start_engine(model_path: &Path, options: EngineLoadOptions) -> Result<EngineHandle> {
+    // Plain start: offload disabled, `no_prefix_cache` = false.
+    start_engine_with_offload(model_path, options, Qwen3OffloadOptions::disabled(), false)
+
+/// Like [`start_engine`] but with pegaflow KV offload (single-GPU only). The
+/// host tier persists sealed KV blocks and serves CPU-resident prefixes back
+/// into HBM before prefill.
+///
+/// `no_prefix_cache` is the vLLM-style switch (see
+/// [`Qwen3Executor::set_no_prefix_cache`](runtime::Qwen3Executor::set_no_prefix_cache)):
+/// without offload it disables prefix matching outright; with offload it keeps
+/// the host tier but stops cross-request HBM reuse, so every prefix is served
+/// from L2 — the pure-L2 benchmark mode.
+pub fn start_engine_with_offload(
+    model_path: &Path,
+    options: EngineLoadOptions,
+    offload_options: Qwen3OffloadOptions,
+    no_prefix_cache: bool,
+) -> Result<EngineHandle> {
     let EngineLoadOptions {
         enable_cuda_graph,
         device_ordinals,
@@ -108,13 +165,22 @@ pub fn start_engine(model_path: &Path, options: EngineLoadOptions) -> Result<Eng
     let model_path = model_path
         .to_str()
         .ok_or_else(|| anyhow::anyhow!("model path must be valid UTF-8"))?;
-    scheduler::start_qwen3(model_path, enable_cuda_graph, &device_ordinals, seed)
+    scheduler::start_qwen3(
+        model_path,
+        enable_cuda_graph,
+        &device_ordinals,
+        seed,
+        offload_options,
+        no_prefix_cache,
+    )
 }
 
 pub fn start_engine_with_lora_control(
     model_path: &Path,
     options: EngineLoadOptions,
     lora_options: Qwen3LoraOptions,
+    offload_options: Qwen3OffloadOptions,
+    no_prefix_cache: bool,
 ) -> Result<EngineHandle> {
     let EngineLoadOptions {
         enable_cuda_graph,
@@ -131,5 +197,7 @@ pub fn start_engine_with_lora_control(
         &device_ordinals,
         seed,
         lora_options.validate()?,
+        offload_options,
+        no_prefix_cache,
     )
 }
diff --git a/pegainfer-qwen3-4b/src/scheduler.rs b/pegainfer-qwen3-4b/src/scheduler.rs
index a4d87ce0..67b09a50 100644
--- a/pegainfer-qwen3-4b/src/scheduler.rs
+++ b/pegainfer-qwen3-4b/src/scheduler.rs
@@ -18,8 +18,8 @@ use rand::SeedableRng;
 use rand::rngs::StdRng;
 use tokio::sync::mpsc;
 
-use crate::Qwen3LoraOptions;
 use crate::executor::{ModelExecutor, Qwen3Executor, RequestId};
+use crate::{Qwen3LoraOptions, Qwen3OffloadOptions};
 use pegainfer_core::engine::{
     EngineCommand, EngineControlRequest, EngineHandle, GenerateRequest, TokenEvent,
 };
@@ -54,6 +54,10 @@ pub(super) struct PendingRequest {
     pub(super) token_tx: mpsc::UnboundedSender<TokenEvent>,
     pub(super) logprobs: usize,
     pub(super) echo: bool,
+    /// Whether this request has already been offered to async KV prefetch.
+    /// Offered at most once; a no-hit offer leaves the request in the normal
+    /// admission flow with this set so it isn't re-probed every tick.
+    pub(super) prefetch_offered: bool,
 }
 
 impl PendingRequest {
@@ -67,6 +71,7 @@ impl PendingRequest {
             token_tx: req.token_tx,
             logprobs: req.logprobs,
             echo: req.echo,
+            prefetch_offered: false,
         }
     }
 }
@@ -78,8 +83,17 @@ pub(crate) fn start_qwen3(
     enable_cuda_graph: bool,
     device_ordinals: &[usize],
     seed: u64,
+    offload_options: Qwen3OffloadOptions,
+    no_prefix_cache: bool,
 ) -> Result<EngineHandle> {
-    let executor = Qwen3Executor::from_runtime(model_path, enable_cuda_graph, device_ordinals)?;
+    let mut executor = Qwen3Executor::from_runtime_with_lora_options(
+        model_path,
+        enable_cuda_graph,
+        device_ordinals,
+        Qwen3LoraOptions::default(),
+        offload_options,
+    )?;
+    executor.set_no_prefix_cache(no_prefix_cache);
     Ok(start_with_executor(executor, seed))
 }
 
@@ -89,13 +103,17 @@ pub(crate) fn start_qwen3_with_lora_control(
     device_ordinals: &[usize],
     seed: u64,
     lora_options: Qwen3LoraOptions,
+    offload_options: Qwen3OffloadOptions,
+    no_prefix_cache: bool,
 ) -> Result<EngineHandle> {
-    let executor = Qwen3Executor::from_runtime_with_lora_options(
+    let mut executor = Qwen3Executor::from_runtime_with_lora_options(
         model_path,
         enable_cuda_graph,
         device_ordinals,
         lora_options,
+        offload_options,
     )?;
+    executor.set_no_prefix_cache(no_prefix_cache);
     Ok(start_with_executor_with_lora_control(executor, seed))
 }
 
@@ -131,6 +149,90 @@ where
     EngineHandle::new_with_command_channel(command_tx)
 }
 
+// ── KV-offload prefetch admission helpers ────────────────────────────────
+
+/// Move requests whose async CPU-tier prefetch just settled from `loading`
+/// back to the front of `deferred` — their KV is hot, so admit them first.
+fn reclaim_ready_prefetch<E: ModelExecutor>(
+    executor: &mut E,
+    deferred: &mut Vec<PendingRequest>,
+    loading: &mut Vec<PendingRequest>,
+) {
+    promote_ready(executor.drain_ready_prefetch(), deferred, loading); // non-blocking sweep
+}
+
+/// Offer each not-yet-offered `deferred` request to async CPU-tier prefetch,
+/// moving the ones that start loading out of `deferred` into `loading`. A
+/// request that doesn't start a load (pure GPU hit, miss, or block pressure)
+/// stays in `deferred`, flagged so it isn't re-probed next tick.
+///
+/// Echo requests are never offered: their prefill forwards the whole prompt to
+/// recover prompt logprobs and so skips `match_and_add_prefix` (see
+/// `execute_prefill`). Prefetched blocks would never be matched/reused — they
+/// would only park restored KV that admission credits but prefill can't spend,
+/// starving the request under tight budgets. Leaving `prefetch_offered` unset
+/// for echo is harmless: the `!req.echo` guard keeps them from being probed.
+fn offer_prefetch<E: ModelExecutor>(
+    executor: &mut E,
+    deferred: &mut Vec<PendingRequest>,
+    loading: &mut Vec<PendingRequest>,
+) {
+    let mut keep = Vec::with_capacity(deferred.len());
+    for mut req in deferred.drain(..) {
+        if !req.prefetch_offered && !req.echo {
+            req.prefetch_offered = true; // set before probing: offered at most once
+            if executor.begin_kv_prefetch(
+                req.request_id,
+                &req.prompt_tokens,
+                req.lora_adapter.as_deref(),
+            ) {
+                loading.push(req);
+                continue; // parked in `loading`, so not kept in `deferred`
+            }
+        }
+        keep.push(req); // relative order of the remaining requests is preserved
+    }
+    *deferred = keep;
+}
+
+/// Block until at least one in-flight prefetch settles, then promote every
+/// settled request to the front of `deferred`. Called only when the scheduler
+/// is otherwise idle, so blocking on the DMA costs nothing.
+fn block_on_loading<E: ModelExecutor>(
+    executor: &mut E,
+    deferred: &mut Vec<PendingRequest>,
+    loading: &mut Vec<PendingRequest>,
+) {
+    promote_ready(executor.wait_ready_prefetch(), deferred, loading);
+}
+
+fn promote_ready(
+    ready: Vec<RequestId>,
+    deferred: &mut Vec<PendingRequest>,
+    loading: &mut Vec<PendingRequest>,
+) {
+    for id in ready { // ids with no entry in `loading` are skipped
+        if let Some(pos) = loading.iter().position(|p| p.request_id == id) {
+            deferred.insert(0, loading.remove(pos)); // front of the queue; later ids land ahead
+        }
+    }
+}
+
+/// Release any executor-side state a request accumulated before it was rejected
+/// at admission. A rejected request never prefills, so the only state it can
+/// hold is a settled KV prefetch — committed prefix blocks parked in the
+/// executor while the request waited in `deferred`. Without this they would
+/// leak (blocks pinned, map entry stranded) for the engine's lifetime. Idempotent
+/// and harmless for requests that were never prefetched.
+fn release_rejected<E: ModelExecutor>(executor: &mut E, req: &PendingRequest) {
+    if let Err(e) = executor.drop_request(req.request_id) {
+        warn!( // best-effort cleanup: a failure is logged, never propagated
+            "failed to release state for rejected {:?}: {e}",
+            req.request_id
+        );
+    }
+}
+
 // ── Main loop ───────────────────────────────────────────────────────────
 
 fn scheduler_loop<E>(
@@ -146,6 +248,8 @@ fn scheduler_loop<E>(
     // Requests that could not be admitted due to KV budget pressure.
     // Held here so they aren't lost; re-evaluated every loop iteration.
     let mut deferred: Vec<PendingRequest> = Vec::new();
+    // Requests parked while their async CPU-tier KV prefetch loads.
+    let mut loading: Vec<PendingRequest> = Vec::new();
 
     info!("Scheduler ready");
 
@@ -159,8 +263,18 @@ fn scheduler_loop<E>(
             next_request_id += 1;
         }
 
-        // 2. Nothing active and nothing deferred → block until a request arrives.
+        // 2. Reclaim settled prefetches, then offer fresh requests to prefetch.
+        reclaim_ready_prefetch(&mut executor, &mut deferred, &mut loading);
+        offer_prefetch(&mut executor, &mut deferred, &mut loading);
+
+        // 3. Nothing active and nothing admittable → block. Prefer blocking on
+        // an in-flight load (so its request prefills next) over a new submit;
+        // only when truly idle (no loads either) do we block on the channel.
         if active.is_empty() && deferred.is_empty() {
+            if !loading.is_empty() {
+                block_on_loading(&mut executor, &mut deferred, &mut loading);
+                continue;
+            }
             if let Some(req) = submit_rx.blocking_recv() {
                 deferred.push(PendingRequest::from_scheduler_request(
                     RequestId(next_request_id),
@@ -178,11 +292,13 @@ fn scheduler_loop<E>(
                 ));
                 next_request_id += 1;
             }
+            continue;
         }
 
         let lora_validation = reject_unknown_lora_requests(deferred, &executor);
         for rejected in &lora_validation.rejected {
             send_unknown_lora_rejection(rejected);
+            release_rejected(&mut executor, rejected);
         }
 
         let admission = admit_deferred_requests(
@@ -193,9 +309,11 @@ fn scheduler_loop<E>(
             executor.max_request_blocks(),
             executor.max_context_tokens(),
             executor.max_decode_batch_size(),
+            |id| executor.prefetched_blocks(id),
         );
         for (rejected, reason) in &admission.rejected {
             send_rejection(rejected, *reason);
+            release_rejected(&mut executor, rejected);
         }
         let pending = admission.pending;
         deferred = admission.deferred;
@@ -228,6 +346,7 @@ fn scheduler_loop_with_lora_control<E>(
     let mut active: Vec<ActiveRequestState> = Vec::new();
     let mut next_request_id = 0u64;
     let mut deferred: Vec<PendingRequest> = Vec::new();
+    let mut loading: Vec<PendingRequest> = Vec::new();
     let mut pending_control: VecDeque<EngineControlRequest> = VecDeque::new();
     let mut post_control_deferred: Vec<PendingRequest> = Vec::new();
 
@@ -246,6 +365,14 @@ fn scheduler_loop_with_lora_control<E>(
             );
         }
 
+        // 1b. Reclaim settled prefetches and offer fresh requests. Control
+        // commands gate generation, so only offer once no control is pending
+        // (a prefetch must not race ahead of an adapter load it depends on).
+        reclaim_ready_prefetch(&mut executor, &mut deferred, &mut loading);
+        if pending_control.is_empty() {
+            offer_prefetch(&mut executor, &mut deferred, &mut loading);
+        }
+
         // 2. Once idle, apply pending control commands before admitting newer
         // generation requests that arrived behind them.
         if active.is_empty() && deferred.is_empty() {
@@ -255,9 +382,13 @@ fn scheduler_loop_with_lora_control<E>(
             }
         }
 
-        // 3. Nothing active and no deferred generation → block until any
-        // command arrives.
+        // 3. Nothing active and no deferred generation → block. An in-flight
+        // load takes priority over waiting on a new command.
         if active.is_empty() && deferred.is_empty() {
+            if !loading.is_empty() {
+                block_on_loading(&mut executor, &mut deferred, &mut loading);
+                continue;
+            }
             if let Some(command) = command_rx.blocking_recv() {
                 enqueue_engine_command(
                     command,
@@ -290,6 +421,7 @@ fn scheduler_loop_with_lora_control<E>(
         let lora_validation = reject_unknown_lora_requests(deferred, &executor);
         for rejected in &lora_validation.rejected {
             send_unknown_lora_rejection(rejected);
+            release_rejected(&mut executor, rejected);
         }
 
         let admission = admit_deferred_requests(
@@ -300,14 +432,21 @@ fn scheduler_loop_with_lora_control<E>(
             executor.max_request_blocks(),
             executor.max_context_tokens(),
             executor.max_decode_batch_size(),
+            |id| executor.prefetched_blocks(id),
         );
         for (rejected, reason) in &admission.rejected {
             send_rejection(rejected, *reason);
+            release_rejected(&mut executor, rejected);
         }
         let pending = admission.pending;
         deferred = admission.deferred;
 
         if active.is_empty() && pending.is_empty() {
+            // A parked load must still be polled to completion before we block.
+            if !loading.is_empty() {
+                block_on_loading(&mut executor, &mut deferred, &mut loading);
+                continue;
+            }
             if let Some(command) = command_rx.blocking_recv() {
                 enqueue_engine_command(
                     command,
@@ -503,6 +642,11 @@ fn admit_deferred_requests(
     max_request_blocks: usize,
     max_context_tokens: usize,
     max_decode_batch_size: usize,
+    // Blocks a request already holds from a settled prefetch. These are already
+    // out of `available_blocks`, so they must be credited against the request's
+    // need or admission double-counts them and can wedge a near-budget CPU-hit
+    // request forever (never admitted, prefetch never released).
+    prefetch_credit: impl Fn(RequestId) -> usize,
 ) -> AdmissionOutcome {
     let mut budget = available_blocks.saturating_sub(active_future_blocks(active, block_size));
     let mut decode_slots = max_decode_batch_size.saturating_sub(active.len());
@@ -522,14 +666,19 @@ fn admit_deferred_requests(
             continue;
         }
 
-        let max_needed = blocks_needed(max_request_tokens(&req), block_size);
-        if max_needed > max_request_blocks {
+        // Full physical footprint gates the per-request cap (a request occupies
+        // all of it, prefetched or not)…
+        let footprint = blocks_needed(max_request_tokens(&req), block_size);
+        if footprint > max_request_blocks {
             rejected.push((req, RejectReason::KvBudget));
             continue;
         }
 
-        if max_needed <= budget && decode_slots > 0 {
-            budget -= max_needed;
+        // …but only the blocks not already held by this request's prefetch must
+        // come from the free-pool budget.
+        let fresh_needed = footprint.saturating_sub(prefetch_credit(req.request_id));
+        if fresh_needed <= budget && decode_slots > 0 {
+            budget -= fresh_needed;
             decode_slots -= 1;
             pending.push(req);
         } else {
@@ -662,6 +811,7 @@ mod tests {
         decode_delay: Duration,
         loaded_lora_adapters: HashSet<String>,
         dropped: Arc<Mutex<Vec<u64>>>,
+        prefetch_offers: Arc<Mutex<Vec<u64>>>,
         prefill_batches: Arc<Mutex<Vec<Vec<RequestId>>>>,
         decode_batches: Arc<Mutex<Vec<Vec<RequestId>>>>,
         prefill_lora_batches: Arc<Mutex<Vec<Vec<Option<String>>>>>,
@@ -680,6 +830,7 @@ mod tests {
                 decode_delay: Duration::ZERO,
                 loaded_lora_adapters: HashSet::new(),
                 dropped,
+                prefetch_offers: Arc::new(Mutex::new(Vec::new())),
                 prefill_batches: Arc::new(Mutex::new(Vec::new())),
                 decode_batches: Arc::new(Mutex::new(Vec::new())),
                 prefill_lora_batches: Arc::new(Mutex::new(Vec::new())),
@@ -778,6 +929,16 @@ mod tests {
             Ok(())
         }
 
+        fn begin_kv_prefetch(
+            &mut self,
+            request_id: RequestId, // recorded in `prefetch_offers` so tests can see each probe
+            _prompt_tokens: &[u32],
+            _lora_adapter: Option<&str>,
+        ) -> bool {
+            self.prefetch_offers.lock().unwrap().push(request_id.get());
+            false // always a miss, so `offer_prefetch` never parks the request in `loading`
+        }
+
         fn list_lora_adapters(&self) -> Vec<String> {
             let mut names: Vec<_> = self.loaded_lora_adapters.iter().cloned().collect();
             names.sort();
@@ -999,7 +1160,7 @@ mod tests {
         ];
 
         // available 4 blocks - 2 reserved for active growth = budget of 2.
-        let outcome = admit_deferred_requests(deferred, &active, 16, 4, 4, usize::MAX, 64);
+        let outcome = admit_deferred_requests(deferred, &active, 16, 4, 4, usize::MAX, 64, |_| 0);
 
         let ids =
             |reqs: &[PendingRequest]| reqs.iter().map(|r| r.request_id.get()).collect::<Vec<_>>();
@@ -1038,7 +1199,7 @@ mod tests {
             mk(3, 40, 1), // request 3: 40 prompt + 1 max = 41 total: overflows by 9 tokens → rejected
         ];
 
-        let outcome = admit_deferred_requests(deferred, &active, 16, 1000, 1000, 32, 64);
+        let outcome = admit_deferred_requests(deferred, &active, 16, 1000, 1000, 32, 64, |_| 0);
 
         let pending_ids = outcome
             .pending
@@ -1084,8 +1245,16 @@ mod tests {
         }
         let pending = PendingRequest::from_scheduler_request(RequestId(64), request(16, 1).0);
 
-        let outcome =
-            admit_deferred_requests(vec![pending], &active, 16, 1024, 1024, usize::MAX, 64);
+        let outcome = admit_deferred_requests(
+            vec![pending],
+            &active,
+            16,
+            1024,
+            1024,
+            usize::MAX,
+            64,
+            |_| 0,
+        );
 
         assert!(
             outcome.pending.is_empty(),
@@ -1157,6 +1326,43 @@ mod tests {
         );
     }
 
+    fn pending(request_id: u64, echo: bool) -> PendingRequest {
+        let (token_tx, _token_rx) = mpsc::unbounded_channel(); // receiver is dropped on return
+        PendingRequest {
+            request_id: RequestId::new(request_id),
+            lora_adapter: None,
+            prompt_tokens: vec![1; 32], // 32 copies of token id 1
+            params: SamplingParams::default(),
+            max_tokens: 1,
+            token_tx,
+            logprobs: 0,
+            echo,
+            prefetch_offered: false, // starts un-probed; `offer_prefetch` flips it
+        }
+    }
+
+    #[test]
+    fn echo_requests_are_never_offered_to_prefetch() {
+        let dropped = Arc::new(Mutex::new(Vec::new()));
+        let mut executor = FakeExecutor::new(64, dropped);
+        let offers = Arc::clone(&executor.prefetch_offers); // shared view of the fake's offer log
+
+        let mut deferred = vec![pending(1, true), pending(2, false)]; // id 1 = echo, id 2 = plain
+        let mut loading = Vec::new(); // stays empty: the fake never reports a hit
+        offer_prefetch(&mut executor, &mut deferred, &mut loading);
+
+        // Only the plain request (id 2) is probed; the echo request (id 1) is
+        // skipped entirely, so its prefill forwards the whole prompt without parking unspendable KV.
+        assert_eq!(*offers.lock().unwrap(), vec![2]);
+        let echo = deferred.iter().find(|r| r.request_id.get() == 1).unwrap();
+        assert!(!echo.prefetch_offered, "echo request must stay un-probed");
+        let plain = deferred.iter().find(|r| r.request_id.get() == 2).unwrap();
+        assert!(
+            plain.prefetch_offered,
+            "plain request must be marked probed"
+        );
+    }
+
     fn request(
         prompt_len: usize,
         max_tokens: usize,
diff --git a/pegainfer-qwen3-4b/src/scheduler/plan.rs b/pegainfer-qwen3-4b/src/scheduler/plan.rs
index 86bbab4a..78c380cb 100644
--- a/pegainfer-qwen3-4b/src/scheduler/plan.rs
+++ b/pegainfer-qwen3-4b/src/scheduler/plan.rs
@@ -156,6 +156,7 @@ mod tests {
             token_tx,
             logprobs: 0,
             echo: false,
+            prefetch_offered: false,
         }
     }
 
diff --git a/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs b/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs
new file mode 100644
index 00000000..c2a47e22
--- /dev/null
+++ b/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs
@@ -0,0 +1,272 @@
+//! Live GPU+CPU prefix-hit gate for the pegaflow KV-offload integration.
+//!
+//! Drives a real Qwen3-4B [`Qwen3Executor`] with offload enabled to prove the
+//! end-to-end wiring on actual model weights:
+//!   * a cold prefill SAVEs its sealed KV blocks to pegaflow's host tier;
+//!   * after the GPU prefix cache is flushed, a second identical request finds
+//!     the prefix only on the CPU tier (a genuine CPU-only hit) and the async
+//!     prefetch RESTOREs it into HBM;
+//!   * the restored KV reproduces the original first-token logits.
+//!
+//! This is the one test that exercises save → host-tier persistence → query →
+//! async load → register → prefill-rematch through the executor, not a unit
+//! harness. `tests/cpu_roundtrip.rs` (in `pegainfer-kv-offload`) covers the raw
+//! byte path; this covers the live executor wiring. If the load landed in the
+//! wrong layer/segment/block the warm logits would be whole nats off.
+//!
+//! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when absent
+//! (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it).
+
+use std::collections::HashMap;
+use std::path::Path;
+
+use pegainfer_core::sampler::SamplingParams;
+use pegainfer_qwen3_4b::runtime::{PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId};
+use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions};
+
+const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B");
+const BLOCK: usize = 16; // tokens per KV block; `cached_tokens` is asserted in multiples of it
+const LOGPROBS: usize = 16; // forwarded to `PrefillStepItem::new`; presumably top-k — confirm
+const MAX_OUTPUT: usize = 8; // forwarded to `PrefillStepItem::new`; presumably max new tokens
+/// 512 MiB host tier — comfortably more than the handful of dense Qwen3-4B
+/// blocks this test offloads (~2.25 MiB/block).
+const HOST_TIER_BYTES: usize = 512 << 20;
+
+/// Warm-vs-cold bounds, following the prefix-cache methodology: the CPU-restored
+/// KV is byte-identical to the original GPU compute, so the only legitimate
+/// drift is the prefill GEMM shrinking to the uncached tail (bf16 reduction
+/// order). The warm argmax must sit within `REGRET_TOL` of cold; the mean head
+/// delta must stay at the bf16 floor.
+const REGRET_TOL: f32 = 0.20; // nats, per the assertion message in `assert_close`
+const MEAN_TOL: f32 = 0.06; // upper bound on the mean |warm - cold| head logprob delta
+
+fn model_path_or_skip() -> Option<String> {
+    match std::env::var("PEGAINFER_TEST_MODEL_PATH") {
+        Ok(path) => Some(path), // env override wins; the path itself is not validated here
+        Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => {
+            Some(MODEL_PATH.to_string()) // fall back to the default `MODEL_PATH` directory
+        }
+        Err(_) => {
+            eprintln!(
+                "skipping qwen3 kv_offload_cpu_hit: {MODEL_PATH}/config.json is missing; \
+                 set PEGAINFER_TEST_MODEL_PATH to run it"
+            );
+            None // the caller returns early, so the test passes as a skip
+        }
+    }
+}
+
+/// Deterministic synthetic prompt of `len` token ids; seeds distinct mod 50_000 share no prefix.
+fn prompt(seed: usize, len: usize) -> Vec<u32> {
+    (0..len)
+        .map(|i| ((seed * 100_003 + i * 17) % 50_000 + 1_000) as u32) // ids in 1_000..=50_999
+        .collect()
+}
+
+fn prefill_item(id: u64, prompt: &[u32]) -> PrefillStepItem {
+    PrefillStepItem::new(
+        RequestId::new(id),
+        prompt.to_vec(), // the item gets its own copy of the prompt
+        MAX_OUTPUT,
+        SamplingParams::default(),
+        LOGPROBS, // non-zero, so `first_token_top` expects logprobs on the result
+        false, // NOTE(review): meaning not visible here — check `PrefillStepItem::new`
+        0.0, // NOTE(review): meaning not visible here — check `PrefillStepItem::new`
+    )
+}
+
+fn first_token_top(pr: &pegainfer_qwen3_4b::runtime::PrefillResult) -> Vec<(u32, f32)> {
+    pr.requests[0] // only the first request is inspected; panics if `requests` is empty
+        .first_token_logprob
+        .as_ref()
+        .expect("logprobs requested but none returned")
+        .top_logprobs
+        .clone() // owned copy; `assert_close` reads the pairs as (token id, logprob)
+}
+
+/// Panics unless the warm (CPU-restored) first-token logprobs agree with the cold
+/// compute up to bf16 reduction noise: the warm argmax scores within `REGRET_TOL`
+/// of the cold argmax, and the mean |delta| over shared head tokens is ≤ `MEAN_TOL`.
+fn assert_close(cold: &[(u32, f32)], warm: &[(u32, f32)]) {
+    let cold_map: HashMap<u32, f32> = cold.iter().copied().collect(); // token id → cold logprob
+    let cold_top = cold[0].1; // assumes entry 0 is the argmax — confirm ordering
+    match cold_map.get(&warm[0].0) {
+        None => panic!(
+            "warm argmax {} absent from cold top-{}",
+            warm[0].0,
+            cold.len()
+        ),
+        Some(&clp) => assert!(
+            cold_top - clp <= REGRET_TOL,
+            "warm argmax {} sits {:.4} nat below cold argmax",
+            warm[0].0,
+            cold_top - clp
+        ),
+    }
+    let deltas: Vec<f32> = warm
+        .iter()
+        .take(8) // "head" = the first 8 warm entries
+        .filter_map(|&(token, wlp)| cold_map.get(&token).map(|&clp| (wlp - clp).abs()))
+        .collect(); // warm tokens absent from `cold` are skipped
+    assert!(!deltas.is_empty(), "no head-token overlap");
+    let mean = deltas.iter().sum::<f32>() / deltas.len() as f32;
+    let max = deltas.iter().copied().fold(0.0f32, f32::max); // logged only, never asserted
+    eprintln!(
+        "kv_offload_cpu_hit: {} head deltas — mean {mean:.4} max {max:.4}",
+        deltas.len()
+    );
+    assert!(
+        mean <= MEAN_TOL,
+        "mean head logprob delta {mean:.4} > {MEAN_TOL} — restored KV drifted past bf16 noise"
+    );
+}
+
+/// One executor, two scenarios, run sequentially. cargo runs `#[test]`
+/// functions on parallel threads; two Qwen3-4B executors sharing device 0 and
+/// the same pegaflow instance id ("qwen3-4b-dev0") would collide on the host
+/// tier. Production wires exactly one executor per model, so the realistic
+/// shape is one executor servicing both prefixes. The scenarios use different
+/// prompt seeds (7 and 9), so they share no prefix and cannot cross-contaminate.
+#[test]
+fn live_gpu_and_cpu_prefix_hits() {
+    let Some(model_path) = model_path_or_skip() else {
+        return; // no weights available: the test passes as a skip
+    };
+    let mut ex = Qwen3Executor::from_runtime_with_lora_options(
+        &model_path,
+        false, // NOTE(review): flag meaning not visible in this file — check the constructor
+        &[0], // presumably the device list (GPU 0) — confirm
+        Qwen3LoraOptions::default(),
+        Qwen3OffloadOptions::enabled(HOST_TIER_BYTES),
+    )
+    .expect("build offload executor");
+    assert!(ex.offload_enabled(), "offload must be active");
+
+    cpu_tier_restores_evicted_prefix(&mut ex); // both scenarios share the same executor
+    gpu_and_cpu_combined_hit(&mut ex);
+}
+
+/// A prefix that is evicted from HBM and restored entirely from the CPU tier
+/// (`gpu_hit == 0`): the baseline CPU round-trip through the live executor.
+fn cpu_tier_restores_evicted_prefix(ex: &mut Qwen3Executor) {
+    let p = prompt(7, 50); // 3 full blocks (48 tok) + 2-token tail
+
+    // ── Cold: first sight of P. Computes all of P on GPU and offloads the 3
+    // sealed blocks to the host tier. ──
+    let cold = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(1, &p)],
+            echo: false,
+        })
+        .expect("cold prefill");
+    assert_eq!(
+        cold.requests[0].cached_tokens, 0,
+        "first sight of P is cold"
+    );
+    let cold_first = first_token_top(&cold); // reference logprobs for the final comparison
+    ex.drop_request(RequestId::new(1)).expect("drop req1");
+
+    // ── Persist the saves, then evict P from HBM so it lives only on CPU. ──
+    ex.flush_offload_saves();
+    ex.evict_cached_blocks();
+
+    // ── A GPU miss now: the prefetch must restore P from the CPU tier. ──
+    let hit = ex.begin_kv_prefetch(RequestId::new(2), &p, None); // None = no LoRA adapter
+    assert!(hit, "P must hit the CPU tier after GPU eviction");
+    let ready = ex.wait_ready_prefetch();
+    assert!(
+        ready.contains(&RequestId::new(2)),
+        "prefetch load must settle ready, got {ready:?}"
+    );
+
+    // ── Warm: the restored CPU prefix is matched and only the 2-token tail
+    // recomputes: matching is capped at full blocks, as with the GPU prefix
+    // cache, so 48 of the 50 prompt tokens come back as cached. ──
+    let warm = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(2, &p)],
+            echo: false,
+        })
+        .expect("warm prefill");
+    assert_eq!(
+        warm.requests[0].cached_tokens,
+        3 * BLOCK,
+        "CPU-restored prefix: 3 blocks matched, tail recomputed"
+    );
+    let warm_first = first_token_top(&warm);
+    ex.drop_request(RequestId::new(2)).expect("drop req2");
+
+    // ── The restored KV must reproduce the original GPU first-token logits. ──
+    assert_close(&cold_first, &warm_first);
+}
+
+/// A single prefix that is part GPU-resident, part CPU-only: the prefetch must
+/// stack the CPU continuation onto the GPU hit and the re-match must see one
+/// contiguous prefix. This is the case that catches an off-by-`gpu_hit` bug in
+/// the query/commit offset math — the pure-CPU test (`gpu_hit == 0`) cannot.
+fn gpu_and_cpu_combined_hit(ex: &mut Qwen3Executor) {
+    let full = prompt(9, 100); // 6 full blocks (96 tok) + 4-token tail
+    let short = full[..50].to_vec(); // first 50 tokens of `full`: 3 full blocks + 2-token tail
+
+    // ── Cold-compute `full`, saving all 6 blocks to the host tier. ──
+    let cold = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(1, &full)],
+            echo: false,
+        })
+        .expect("cold full prefill");
+    assert_eq!(
+        cold.requests[0].cached_tokens, 0,
+        "first sight of full is cold"
+    );
+    let cold_first = first_token_top(&cold); // reference logprobs for the final comparison
+    ex.drop_request(RequestId::new(1)).expect("drop req1");
+    ex.flush_offload_saves();
+
+    // ── Drop the whole prefix from HBM (CPU keeps all 6 blocks), then
+    // re-establish ONLY the first 3 blocks in HBM by cold-prefilling `short`.
+    // GPU now holds blocks 0..3; CPU holds blocks 0..6. ──
+    ex.evict_cached_blocks();
+    let s = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(2, &short)],
+            echo: false,
+        })
+        .expect("short prefill");
+    assert_eq!(
+        s.requests[0].cached_tokens, 0,
+        "short re-warms blocks 0..3 cold"
+    );
+    ex.drop_request(RequestId::new(2)).expect("drop req2");
+
+    // ── Prefetch `full`: GPU hits blocks 0..3, the host tier must supply the
+    // continuation 3..6. A pure GPU hit would not start a load. ──
+    let hit = ex.begin_kv_prefetch(RequestId::new(3), &full, None); // None = no LoRA adapter
+    assert!(
+        hit,
+        "blocks 3..6 must be fetched from the CPU tier beyond the GPU hit"
+    );
+    let ready = ex.wait_ready_prefetch();
+    assert!(
+        ready.contains(&RequestId::new(3)),
+        "prefetch must settle, got {ready:?}"
+    );
+
+    // ── Warm prefill `full`: all 6 blocks match (3 GPU + 3 CPU). Without the
+    // CPU continuation only 3 blocks would match. ──
+    let warm = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(3, &full)],
+            echo: false,
+        })
+        .expect("warm full prefill");
+    assert_eq!(
+        warm.requests[0].cached_tokens,
+        6 * BLOCK,
+        "combined hit: 3 GPU-resident + 3 CPU-restored blocks match as one prefix"
+    );
+    let warm_first = first_token_top(&warm);
+    ex.drop_request(RequestId::new(3)).expect("drop req3");
+
+    assert_close(&cold_first, &warm_first); // restored + resident KV must reproduce cold logits
+}
diff --git a/pegainfer-qwen3-4b/tests/lora_smoke.rs b/pegainfer-qwen3-4b/tests/lora_smoke.rs
index 37f6ddfa..3dce67db 100644
--- a/pegainfer-qwen3-4b/tests/lora_smoke.rs
+++ b/pegainfer-qwen3-4b/tests/lora_smoke.rs
@@ -228,6 +228,8 @@ fn qwen3_lora_loads_rank_and_generates(rank: usize, adapter_name: &str) {
             ..EngineLoadOptions::default()
         },
         pegainfer_qwen3_4b::Qwen3LoraOptions::default(),
+        pegainfer_qwen3_4b::Qwen3OffloadOptions::disabled(),
+        false,
     )
     .expect("start LoRA-capable Qwen3 engine");
 
diff --git a/pegainfer-server/src/main.rs b/pegainfer-server/src/main.rs
index 7a824823..64b7fe9e 100644
--- a/pegainfer-server/src/main.rs
+++ b/pegainfer-server/src/main.rs
@@ -10,7 +10,7 @@ use pegainfer::vllm_frontend::LoraModule;
 use pegainfer_core::engine::{EngineLoadOptions, EpBackend};
 #[cfg(feature = "kimi-k2")]
 use pegainfer_core::parallel::ParallelConfig;
-use pegainfer_qwen3_4b::Qwen3LoraOptions;
+use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions};
 
 #[cfg(not(target_env = "msvc"))]
 #[global_allocator]
@@ -74,6 +74,25 @@ struct Args {
     /// Emit synchronized DeepSeek V4 prefill phase timing records.
     #[arg(long, default_value_t = false)]
     deepseek_prefill_profile: bool,
+
+    /// Enable pegaflow KV offload (host-tier "L2" cache) on the single-GPU
+    /// Qwen3 path. Sealed KV blocks are saved to host pinned memory and
+    /// restored into HBM before prefill when a prompt's prefix has fallen out
+    /// of the GPU cache.
+    #[arg(long, default_value_t = false)]
+    kv_offload: bool,
+
+    /// Host pinned-memory pool size for the KV offload tier, in GiB. pegaflow
+    /// allocates the whole pool up front, so RSS reflects this at startup.
+    #[arg(long, default_value_t = 8.0)]
+    kv_offload_host_gib: f64,
+
+    /// vLLM-style no-prefix-cache. Without --kv-offload it disables prefix
+    /// matching outright (every prefill recomputes the full prompt). With
+    /// --kv-offload it is the pure-L2 mode: no cross-request HBM reuse, so every
+    /// prefix is restored from the host tier — for measuring the L2 TTFT win.
+    #[arg(long, default_value_t = false)]
+    no_prefix_cache: bool,
 }
 
 #[derive(Clone, Copy, Debug, ValueEnum)]
@@ -210,6 +229,16 @@ async fn main() -> anyhow::Result<()> {
                 ep_backend: EpBackend::Nccl,
                 seed: 42,
             };
+            let offload = if args.kv_offload {
+                let bytes = (args.kv_offload_host_gib * f64::from(1u32 << 30)) as usize;
+                info!(
+                    "Qwen3 KV offload enabled: host tier {:.1} GiB, no_prefix_cache={}",
+                    args.kv_offload_host_gib, args.no_prefix_cache
+                );
+                Qwen3OffloadOptions::enabled(bytes)
+            } else {
+                Qwen3OffloadOptions::disabled()
+            };
             let handle = if args.enable_lora {
                 let lora_options = Qwen3LoraOptions {
                     max_loras: args.max_loras,
@@ -223,9 +252,16 @@ async fn main() -> anyhow::Result<()> {
                     &args.model_path,
                     options,
                     lora_options,
+                    offload,
+                    args.no_prefix_cache,
                 )
             } else {
-                pegainfer_qwen3_4b::start_engine(&args.model_path, options)
+                pegainfer_qwen3_4b::start_engine_with_offload(
+                    &args.model_path,
+                    options,
+                    offload,
+                    args.no_prefix_cache,
+                )
             }
             .context("failed to start Qwen3 engine")?;