diff --git a/Cargo.lock b/Cargo.lock index 02c03811..611d1125 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -200,43 +200,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "async-nats" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe" -dependencies = [ - "base64 0.22.1", - "bytes", - "futures-util", - "memchr", - "nkeys", - "nuid", - "once_cell", - "pin-project", - "portable-atomic", - "rand 0.8.6", - "regex", - "ring", - "rustls-native-certs 0.7.3", - "rustls-pemfile", - "rustls-webpki 0.102.8", - "serde", - "serde_json", - "serde_nanos", - "serde_repr", - "thiserror 1.0.69", - "time", - "tokio", - "tokio-rustls 0.26.4", - "tokio-stream", - "tokio-util", - "tokio-websockets", - "tracing", - "tryhard", - "url", -] - [[package]] name = "async-stream" version = "0.3.6" @@ -316,15 +279,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "atomic" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340" -dependencies = [ - "bytemuck", -] - [[package]] name = "atomic-polyfill" version = "1.0.3" @@ -389,448 +343,6 @@ dependencies = [ "arrayvec", ] -[[package]] -name = "aws-config" -version = "1.8.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sdk-sso", - "aws-sdk-ssooidc", - "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "hex", - "http 1.4.0", - "ring", - "time", - "tokio", - "tracing", - "url", - "zeroize", -] - -[[package]] -name = "aws-credential-types" -version = "1.2.12" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e26bbf46abc608f2dc61fd6cb3b7b0665497cc259a21520151ed98f8b37d2c79" -dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "zeroize", -] - -[[package]] -name = "aws-lc-rs" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00" -dependencies = [ - "aws-lc-sys", - "zeroize", -] - -[[package]] -name = "aws-lc-sys" -version = "0.41.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4" -dependencies = [ - "cc", - "cmake", - "dunce", - "fs_extra", -] - -[[package]] -name = "aws-runtime" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0f92058d22a46adf53ec57a6a96f34447daf02bff52e8fb956c66bcd5c6ac12" -dependencies = [ - "aws-credential-types", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "bytes-utils", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "percent-encoding", - "pin-project-lite", - "tracing", - "uuid", -] - -[[package]] -name = "aws-sdk-s3" -version = "1.123.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c018f22146966fdd493a664f62ee2483dff256b42a08c125ab6a084bde7b77fe" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-checksums", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "bytes", - "fastrand", - "hex", - "hmac", - "http 0.2.12", - "http 1.4.0", - "http-body 1.0.1", - 
"lru 0.16.4", - "percent-encoding", - "regex-lite", - "sha2 0.10.9", - "tracing", - "url", -] - -[[package]] -name = "aws-sdk-sso" -version = "1.94.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "699da1961a289b23842d88fe2984c6ff68735fdf9bdcbc69ceaeb2491c9bf434" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-ssooidc" -version = "1.96.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3e3a4cb3b124833eafea9afd1a6cc5f8ddf3efefffc6651ef76a03cbc6b4981" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-sts" -version = "1.98.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c4f19655ab0856375e169865c91264de965bd74c407c7f1e403184b1049409" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-query", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sigv4" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f6ae9b71597dc5fd115d52849d7a5556ad9265885ad3492ea8d73b93bbc46e" -dependencies = [ - "aws-credential-types", - 
"aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "crypto-bigint 0.5.5", - "form_urlencoded", - "hex", - "hmac", - "http 0.2.12", - "http 1.4.0", - "p256", - "percent-encoding", - "ring", - "sha2 0.10.9", - "subtle", - "time", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-smithy-async" -version = "1.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cba48474f1d6807384d06fec085b909f5807e16653c5af5c45dfe89539f0b70" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "aws-smithy-checksums" -version = "0.64.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a764fa7222922f6c0af8eea478b0ef1ba5ce1222af97e01f33ca5e957bd7f3b9" -dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "bytes", - "crc-fast", - "hex", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "md-5", - "pin-project-lite", - "sha1", - "sha2 0.10.9", - "tracing", -] - -[[package]] -name = "aws-smithy-eventstream" -version = "0.60.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c0b3e587fbaa5d7f7e870544508af8ce82ea47cd30376e69e1e37c4ac746f79" -dependencies = [ - "aws-smithy-types", - "bytes", - "crc32fast", -] - -[[package]] -name = "aws-smithy-http" -version = "0.63.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af4a8a5fe3e4ac7ee871237c340bbce13e982d37543b65700f4419e039f5d78e" -dependencies = [ - "aws-smithy-eventstream", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "bytes-utils", - "futures-core", - "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", -] - -[[package]] -name = "aws-smithy-http-client" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0709f0083aa19b704132684bc26d3c868e06bd428ccc4373b0b55c3e8748a58b" -dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "h2 0.3.27", - "h2 0.4.14", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "hyper 0.14.32", - "hyper 1.9.0", - "hyper-rustls 0.24.2", - "hyper-rustls 0.27.9", - "hyper-util", - "pin-project-lite", - "rustls 0.21.12", - "rustls 0.23.40", - "rustls-native-certs 0.8.3", - "rustls-pki-types", - "tokio", - "tokio-rustls 0.26.4", - "tower", - "tracing", -] - -[[package]] -name = "aws-smithy-json" -version = "0.62.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b3a779093e18cad88bbae08dc4261e1d95018c4c5b9356a52bcae7c0b6e9bb" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-observability" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d3f39d5bb871aaf461d59144557f16d5927a5248a983a40654d9cf3b9ba183b" -dependencies = [ - "aws-smithy-runtime-api", -] - -[[package]] -name = "aws-smithy-query" -version = "0.60.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f76a580e3d8f8961e5d48763214025a2af65c2fa4cd1fb7f270a0e107a71b0" -dependencies = [ - "aws-smithy-types", - "urlencoding", -] - -[[package]] -name = "aws-smithy-runtime" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd3dfc18c1ce097cf81fced7192731e63809829c6cbf933c1ec47452d08e1aa" -dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-client", - "aws-smithy-observability", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "http-body-util", - "pin-project-lite", - "pin-utils", - "tokio", - "tracing", -] - -[[package]] -name = "aws-smithy-runtime-api" -version = "1.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8c55e0837e9b8526f49e0b9bfa9ee18ddee70e853f5bc09c5d11ebceddcb0fec" -dependencies = [ - "aws-smithy-async", - "aws-smithy-types", - "bytes", - "http 0.2.12", - "http 1.4.0", - "pin-project-lite", - "tokio", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-smithy-types" -version = "1.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "576b0d6991c9c32bc14fc340582ef148311f924d41815f641a308b5d11e8e7cd" -dependencies = [ - "base64-simd", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "http-body-util", - "itoa", - "num-integer", - "pin-project-lite", - "pin-utils", - "ryu", - "serde", - "time", - "tokio", - "tokio-util", -] - -[[package]] -name = "aws-smithy-xml" -version = "0.60.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" -dependencies = [ - "xmlparser", -] - -[[package]] -name = "aws-types" -version = "1.3.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c50f3cdf47caa8d01f2be4a6663ea02418e892f9bbfd82c7b9a3a37eaccdd3a" -dependencies = [ - "aws-credential-types", - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "rustc_version", - "tracing", -] - [[package]] name = "axum" version = "0.8.9" @@ -841,10 +353,10 @@ dependencies = [ "bytes", "form_urlencoded", "futures-util", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", - "hyper 1.9.0", + "hyper", "hyper-util", "itoa", "matchit", @@ -872,8 +384,8 @@ checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" dependencies = [ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", "mime", "pin-project-lite", @@ -883,12 +395,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "base16ct" -version = "0.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" - [[package]] name = "base64" version = "0.13.1" @@ -901,16 +407,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "base64-simd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" -dependencies = [ - "outref", - "vsimd", -] - [[package]] name = "base64ct" version = "1.8.3" @@ -1157,16 +653,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bytes-utils" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" -dependencies = [ - "bytes", - "either", -] - [[package]] name = "bytesize" version = "2.3.1" @@ -1249,10 +735,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", - "js-sys", "num-traits", "serde", - "wasm-bindgen", "windows-link", ] @@ -1334,15 +818,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" -[[package]] -name = "cmake" -version = "0.1.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" -dependencies = [ - "cc", -] - [[package]] name = "cobs" version = "0.3.0" @@ -1444,12 +919,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "const-oid" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" - [[package]] 
name = "const-oid" version = "0.10.2" @@ -1535,33 +1004,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "3.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" - -[[package]] -name = "crc-fast" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" -dependencies = [ - "crc", - "digest 0.10.7", - "rustversion", - "spin 0.10.0", -] - [[package]] name = "crc32fast" version = "1.5.0" @@ -1733,28 +1175,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" -[[package]] -name = "crypto-bigint" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" -dependencies = [ - "generic-array", - "rand_core 0.6.4", - "subtle", - "zeroize", -] - -[[package]] -name = "crypto-bigint" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" -dependencies = [ - "rand_core 0.6.4", - "subtle", -] - [[package]] name = "crypto-common" version = "0.1.7" @@ -1784,32 +1204,6 @@ dependencies = [ "libloading 0.9.0", ] -[[package]] -name = "curve25519-dalek" -version = "4.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" -dependencies = [ - "cfg-if", - "cpufeatures 0.2.17", - "curve25519-dalek-derive", - "digest 
0.10.7", - "fiat-crypto", - "rustc_version", - "subtle", -] - -[[package]] -name = "curve25519-dalek-derive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "cxx" version = "1.0.194" @@ -1970,40 +1364,13 @@ dependencies = [ "parking_lot_core", ] -[[package]] -name = "data-encoding" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8" - -[[package]] -name = "der" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" -dependencies = [ - "const-oid 0.9.6", - "zeroize", -] - -[[package]] -name = "der" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" -dependencies = [ - "const-oid 0.9.6", - "pem-rfc7468 0.7.0", - "zeroize", -] - [[package]] name = "der" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" dependencies = [ - "pem-rfc7468 1.0.0", + "pem-rfc7468", "zeroize", ] @@ -2073,7 +1440,6 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer 0.10.4", "crypto-common 0.1.7", - "subtle", ] [[package]] @@ -2083,7 +1449,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ "block-buffer 0.12.0", - "const-oid 0.10.2", + "const-oid", "crypto-common 0.2.1", ] @@ -2161,12 +1527,6 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "dyn-clone" version = "1.0.20" @@ -2195,7 +1555,7 @@ dependencies = [ "dashmap", "derive_builder", "dynamo-tokens", - "flume 0.12.0", + "flume", "ordered-float", "parking_lot", "prometheus", @@ -2259,66 +1619,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8072bec12b909b65aec01fa6518f387cfbf3427d4475409ad622898cd347522c" -[[package]] -name = "ecdsa" -version = "0.14.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" -dependencies = [ - "der 0.6.1", - "elliptic-curve", - "rfc6979", - "signature 1.6.4", -] - -[[package]] -name = "ed25519" -version = "2.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" -dependencies = [ - "signature 2.2.0", -] - -[[package]] -name = "ed25519-dalek" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" -dependencies = [ - "curve25519-dalek", - "ed25519", - "sha2 0.10.9", - "signature 2.2.0", - "subtle", -] - [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" -[[package]] -name = "elliptic-curve" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" -dependencies = [ - "base16ct", - "crypto-bigint 0.4.9", - "der 0.6.1", - "digest 0.10.7", - "ff", - 
"generic-array", - "group", - "pkcs8 0.9.0", - "rand_core 0.6.4", - "sec1", - "subtle", - "zeroize", -] - [[package]] name = "embedded-io" version = "0.4.0" @@ -2568,37 +1874,6 @@ dependencies = [ "simd-adler32", ] -[[package]] -name = "ff" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" -dependencies = [ - "rand_core 0.6.4", - "subtle", -] - -[[package]] -name = "fiat-crypto" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" - -[[package]] -name = "figment" -version = "0.10.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3" -dependencies = [ - "atomic", - "pear", - "serde", - "serde_json", - "toml 0.8.23", - "uncased", - "version_check", -] - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -2621,18 +1896,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "flume" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "spin 0.9.8", -] - [[package]] name = "flume" version = "0.12.0" @@ -2642,7 +1905,7 @@ dependencies = [ "fastrand", "futures-core", "futures-sink", - "spin 0.9.8", + "spin", ] [[package]] @@ -2696,23 +1959,6 @@ dependencies = [ "futures-core", ] -[[package]] -name = "fs4" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4" -dependencies = [ - "rustix", - "tokio", - "windows-sys 0.59.0", -] - -[[package]] -name = "fs_extra" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" - [[package]] name = "futures" version = "0.3.32" @@ -2884,36 +2130,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" -[[package]] -name = "group" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" -dependencies = [ - "ff", - "rand_core 0.6.4", - "subtle", -] - -[[package]] -name = "h2" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap 2.14.0", - "slab", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "h2" version = "0.4.14" @@ -2925,7 +2141,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.0", + "http", "indexmap 2.14.0", "slab", "tokio", @@ -2975,8 +2191,6 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "allocator-api2", - "equivalent", "foldhash 0.1.5", ] @@ -2999,6 +2213,15 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +[[package]] +name = "hashlink" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824e001ac4f3012dd16a264bec811403a67ca9deb6c102fc5049b32c4574b35f" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "heapless" version = "0.7.17" @@ -3009,7 +2232,7 @@ dependencies = [ "hash32", "rustc_version", "serde", - "spin 0.9.8", + "spin", "stable_deref_trait", ] @@ -3038,7 +2261,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", - "http 1.4.0", + "http", "indicatif 0.17.11", "libc", "log", @@ -3059,7 +2282,7 @@ checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213" dependencies = [ "dirs", "futures", - "http 1.4.0", + "http", "indicatif 0.18.4", "libc", "log", @@ -3075,32 +2298,12 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest 0.10.7", -] - [[package]] name = "hound" version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f" -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http" version = "1.4.0" @@ -3111,17 +2314,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - [[package]] name = "http-body" version = "1.0.1" @@ -3129,7 +2321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.0", + "http", ] [[package]] @@ -3140,8 +2332,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + 
"http-body", "pin-project-lite", ] @@ -3152,42 +2344,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - -[[package]] -name = "hybrid-array" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" -dependencies = [ - "typenum", -] - -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.27", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2 0.5.10", - "tokio", - "tower-service", - "tracing", - "want", +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hybrid-array" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d46837a0ed51fe95bd3b05de33cd64a1ee88fc797477ca48446872504507c5" +dependencies = [ + "typenum", ] [[package]] @@ -3200,9 +2368,9 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2 0.4.14", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "httparse", "httpdate", "itoa", @@ -3212,34 +2380,18 @@ dependencies = [ "want", ] -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies 
= [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "log", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] - [[package]] name = "hyper-rustls" version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.0", - "hyper 1.9.0", + "http", + "hyper", "hyper-util", - "rustls 0.23.40", - "rustls-native-certs 0.8.3", + "rustls", "tokio", - "tokio-rustls 0.26.4", + "tokio-rustls", "tower-service", "webpki-roots 1.0.7", ] @@ -3250,7 +2402,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper 1.9.0", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3265,7 +2417,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper 1.9.0", + "hyper", "hyper-util", "native-tls", "tokio", @@ -3283,14 +2435,14 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "hyper 1.9.0", + "http", + "http-body", + "hyper", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2", "system-configuration", "tokio", "tower-service", @@ -3538,12 +2690,6 @@ dependencies = [ "rustversion", ] -[[package]] -name = "inlinable_string" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb" - [[package]] name = "insta" version = "1.47.2" @@ -3570,6 +2716,17 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "io-uring" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "libc", +] + 
[[package]] name = "ipnet" version = "2.12.0" @@ -3717,26 +2874,6 @@ dependencies = [ "serde", ] -[[package]] -name = "kvbm-config" -version = "1.2.0" -dependencies = [ - "anyhow", - "dynamo-memory", - "figment", - "nix 0.30.1", - "nvtx", - "rayon", - "serde", - "serde_json", - "temp-env", - "thiserror 2.0.18", - "tokio", - "tracing", - "validator", - "velo", -] - [[package]] name = "kvbm-consolidator" version = "1.2.0" @@ -3764,46 +2901,6 @@ dependencies = [ "tracing-test", ] -[[package]] -name = "kvbm-engine" -version = "1.2.0" -dependencies = [ - "anyhow", - "async-nats", - "aws-config", - "aws-sdk-s3", - "bytes", - "chrono", - "clap", - "crossbeam-queue", - "cudarc", - "dashmap", - "derive_builder", - "dynamo-memory", - "figment", - "flume 0.11.1", - "futures", - "kvbm-common", - "kvbm-config", - "kvbm-logical", - "kvbm-physical", - "libc", - "nvtx", - "oneshot", - "parking_lot", - "rayon", - "rmp-serde", - "serde", - "serde_json", - "tokio", - "tokio-rayon", - "tokio-stream", - "tracing", - "tracing-subscriber", - "uuid", - "velo", -] - [[package]] name = "kvbm-kernels" version = "1.2.0" @@ -3828,7 +2925,7 @@ dependencies = [ "dynamo-tokens", "futures", "indexmap 2.14.0", - "lru 0.16.4", + "lru", "parking_lot", "prometheus", "proptest", @@ -3847,32 +2944,6 @@ dependencies = [ "xxhash-rust", ] -[[package]] -name = "kvbm-physical" -version = "1.2.0" -dependencies = [ - "aligned-vec", - "anyhow", - "bincode 2.0.1", - "blake3", - "cudarc", - "derive-getters", - "derive_builder", - "dynamo-memory", - "futures", - "kvbm-common", - "kvbm-kernels", - "rstest 0.26.1", - "serde", - "serde_json", - "thiserror 2.0.18", - "tokio", - "tracing", - "uuid", - "validator", - "velo", -] - [[package]] name = "lazy_static" version = "1.5.0" @@ -4095,15 +3166,6 @@ dependencies = [ "imgref", ] -[[package]] -name = "lru" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" 
-dependencies = [ - "hashbrown 0.15.5", -] - [[package]] name = "lru" version = "0.16.4" @@ -4171,13 +3233,12 @@ dependencies = [ ] [[package]] -name = "md-5" -version = "0.10.6" +name = "mea" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +checksum = "2640d335e7273dacdcf51044026139b2e269c3bb0dfc3f8cb3496b85e3f6a42c" dependencies = [ - "cfg-if", - "digest 0.10.7", + "slab", ] [[package]] @@ -4201,6 +3262,15 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + [[package]] name = "memoffset" version = "0.9.1" @@ -4339,15 +3409,6 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom 0.2.17", -] - [[package]] name = "native-tls" version = "0.2.18" @@ -4357,10 +3418,10 @@ dependencies = [ "libc", "log", "openssl", - "openssl-probe 0.2.1", + "openssl-probe", "openssl-sys", "schannel", - "security-framework 3.7.0", + "security-framework", "security-framework-sys", "tempfile", ] @@ -4401,6 +3462,19 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nix" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" +dependencies = [ + "bitflags 1.3.2", + "cc", + "cfg-if", + "libc", + "memoffset 0.6.5", +] + [[package]] name = "nix" version = "0.30.1" @@ -4411,7 +3485,6 @@ dependencies = [ "cfg-if", "cfg_aliases", "libc", - "memoffset", ] [[package]] @@ -4444,21 +3517,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "nkeys" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf" -dependencies = [ - "data-encoding", - "ed25519", - "ed25519-dalek", - "getrandom 0.2.17", - "log", - "rand 0.8.6", - "signatory", -] - [[package]] name = "no_std_io2" version = "0.9.4" @@ -4508,15 +3566,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "nuid" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83" -dependencies = [ - "rand 0.8.6", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -4872,12 +3921,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "openssl-probe" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" - [[package]] name = "openssl-probe" version = "0.2.1" @@ -4896,6 +3939,20 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "opentelemetry" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4927,23 +3984,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "outref" -version = "0.5.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" - -[[package]] -name = "p256" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" -dependencies = [ - "ecdsa", - "elliptic-curve", - "sha2 0.10.9", -] - [[package]] name = "page_size" version = "0.6.0" @@ -5018,26 +4058,54 @@ dependencies = [ ] [[package]] -name = "pear" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdeeaa00ce488657faba8ebf44ab9361f9365a97bd39ffb8a60663f57ff4b467" +name = "pegaflow-common" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" dependencies = [ - "inlinable_string", - "pear_codegen", - "yansi", + "colored", + "libc", + "log", + "logforth", ] [[package]] -name = "pear_codegen" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bab5b985dc082b345f812b7df84e1bef27e7207b39e448439ba8bd69c93f147" +name = "pegaflow-core" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" dependencies = [ - "proc-macro2", - "proc-macro2-diagnostics", - "quote", - "syn 2.0.117", + "ahash", + "bytesize", + "cudarc", + "dashmap", + "futures", + "hashlink", + "io-uring", + "libc", + "log", + "logforth", + "mea", + "offset-allocator", + "opentelemetry", + "parking_lot", + "pegaflow-common", + "pegaflow-proto", + "rand 0.10.1", + "shared_memory", + "smallvec", + "tokio", + "tonic", + "uuid", +] + +[[package]] +name = "pegaflow-proto" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" 
+dependencies = [ + "prost", + "tonic", + "tonic-prost", + "tonic-prost-build", ] [[package]] @@ -5363,6 +4431,19 @@ dependencies = [ "pegainfer-kernels", ] +[[package]] +name = "pegainfer-kv-offload" +version = "0.1.0" +dependencies = [ + "anyhow", + "cudarc", + "half", + "log", + "pegaflow-core", + "pegainfer-kv-cache", + "tokio", +] + [[package]] name = "pegainfer-qwen3-4b" version = "0.1.0" @@ -5381,6 +4462,7 @@ dependencies = [ "pegainfer-cupti", "pegainfer-kernels", "pegainfer-kv-cache", + "pegainfer-kv-offload", "pegainfer-vllm-support", "rand 0.10.1", "safetensors", @@ -5489,15 +4571,6 @@ dependencies = [ "vllm-tokenizer", ] -[[package]] -name = "pem-rfc7468" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" -dependencies = [ - "base64ct", -] - [[package]] name = "pem-rfc7468" version = "1.0.0" @@ -5556,16 +4629,6 @@ dependencies = [ "sha2 0.10.9", ] -[[package]] -name = "petgraph" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" -dependencies = [ - "fixedbitset", - "indexmap 2.14.0", -] - [[package]] name = "petgraph" version = "0.8.3" @@ -5587,47 +4650,21 @@ dependencies = [ ] [[package]] -name = "pin-project-internal" -version = "1.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a990e22f43e84855daf260dded30524ef4a9021cc7541c26540500a50b624389" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkcs8" -version = "0.9.0" +name = "pin-project-internal" +version = "1.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +checksum = "a990e22f43e84855daf260dded30524ef4a9021cc7541c26540500a50b624389" dependencies = [ - "der 0.6.1", - "spki 0.6.0", + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] -name = "pkcs8" -version = "0.10.2" +name = "pin-project-lite" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" -dependencies = [ - "der 0.7.10", - "spki 0.7.3", -] +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pkg-config" @@ -5846,19 +4883,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "proc-macro2-diagnostics" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "version_check", - "yansi", -] - [[package]] name = "profiling" version = "1.0.18" @@ -5935,16 +4959,6 @@ dependencies = [ "unarray", ] -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive 0.13.5", -] - [[package]] name = "prost" version = "0.14.3" @@ -5952,27 +4966,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" dependencies = [ "bytes", - "prost-derive 0.14.3", -] - -[[package]] -name = "prost-build" -version = "0.13.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" -dependencies = [ - "heck", - "itertools 0.14.0", - "log", - "multimap", - "once_cell", - "petgraph 0.7.1", - "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", - "regex", - "syn 2.0.117", - "tempfile", + "prost-derive", ] [[package]] @@ -5985,10 +4979,10 @@ dependencies = [ "itertools 0.14.0", "log", "multimap", - "petgraph 0.8.3", + "petgraph", "prettyplease", - "prost 0.14.3", - "prost-types 0.14.3", + "prost", + "prost-types", "pulldown-cmark", "pulldown-cmark-to-cmark", "regex", @@ -5996,19 +4990,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools 0.14.0", - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "prost-derive" version = "0.14.3" @@ -6022,22 +5003,13 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "prost-types" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" -dependencies = [ - "prost 0.13.5", -] - [[package]] name = "prost-types" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ - "prost 0.14.3", + "prost", ] [[package]] @@ -6095,7 +5067,7 @@ dependencies = [ "anyhow", "indoc", "libc", - "memoffset", + "memoffset 0.9.1", "once_cell", "portable-atomic", "pyo3-build-config", @@ -6181,8 +5153,8 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.1.2", - "rustls 0.23.40", - "socket2 0.6.3", + "rustls", + "socket2", "thiserror 2.0.18", "tokio", "tracing", @@ -6201,7 +5173,7 @@ dependencies = [ "rand 0.9.4", 
"ring", "rustc-hash 2.1.2", - "rustls 0.23.40", + "rustls", "rustls-pki-types", "slab", "thiserror 2.0.18", @@ -6219,7 +5191,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2", "tracing", "windows-sys 0.60.2", ] @@ -6499,12 +5471,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-lite" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" - [[package]] name = "regex-syntax" version = "0.8.10" @@ -6529,12 +5495,12 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2 0.4.14", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "http-body-util", - "hyper 1.9.0", - "hyper-rustls 0.27.9", + "hyper", + "hyper-rustls", "hyper-tls", "hyper-util", "js-sys", @@ -6545,7 +5511,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.40", + "rustls", "rustls-pki-types", "serde", "serde_json", @@ -6553,7 +5519,7 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", - "tokio-rustls 0.26.4", + "tokio-rustls", "tokio-util", "tower", "tower-http", @@ -6566,17 +5532,6 @@ dependencies = [ "webpki-roots 1.0.7", ] -[[package]] -name = "rfc6979" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" -dependencies = [ - "crypto-bigint 0.4.9", - "hmac", - "zeroize", -] - [[package]] name = "rgb" version = "0.8.53" @@ -6775,68 +5730,21 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - [[package]] name = "rustls" version = "0.23.40" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ - "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.13", + "rustls-webpki", "subtle", "zeroize", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe 0.1.6", - "rustls-pemfile", - "rustls-pki-types", - "schannel", - "security-framework 2.11.1", -] - -[[package]] -name = "rustls-native-certs" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" -dependencies = [ - "openssl-probe 0.2.1", - "rustls-pki-types", - "schannel", - "security-framework 3.7.0", -] - -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.14.1" @@ -6847,33 +5755,12 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-webpki" -version = "0.101.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "rustls-webpki" -version = "0.102.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" -dependencies = [ - "rustls-pki-types", - "untrusted", -] - [[package]] name = "rustls-webpki" version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ - "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -6984,16 +5871,6 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "sdd" version = "4.8.6" @@ -7003,33 +5880,6 @@ dependencies = [ "saa", ] -[[package]] -name = "sec1" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" -dependencies = [ - "base16ct", - "der 0.6.1", - "generic-array", - "pkcs8 0.9.0", - "subtle", - "zeroize", -] - -[[package]] -name = "security-framework" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" -dependencies = [ - "bitflags 2.11.1", - "core-foundation 0.9.4", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - [[package]] name = "security-framework" version = "3.7.0" @@ -7154,15 +6004,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_nanos" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985" -dependencies = [ - "serde", -] - [[package]] name = "serde_path_to_error" version = "0.1.20" @@ -7309,6 +6150,19 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shared_memory" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba8593196da75d9dc4f69349682bd4c2099f8cde114257d1ef7ef1b33d1aba54" +dependencies = [ + "cfg-if", + 
"libc", + "nix 0.23.2", + "rand 0.8.6", + "win-sys", +] + [[package]] name = "shlex" version = "1.3.0" @@ -7325,38 +6179,6 @@ dependencies = [ "libc", ] -[[package]] -name = "signatory" -version = "0.27.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31" -dependencies = [ - "pkcs8 0.10.2", - "rand_core 0.6.4", - "signature 2.2.0", - "zeroize", -] - -[[package]] -name = "signature" -version = "1.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" -dependencies = [ - "digest 0.10.7", - "rand_core 0.6.4", -] - -[[package]] -name = "signature" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" -dependencies = [ - "digest 0.10.7", - "rand_core 0.6.4", -] - [[package]] name = "simd-adler32" version = "0.3.9" @@ -7410,16 +6232,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "socket2" version = "0.6.3" @@ -7450,32 +6262,6 @@ dependencies = [ "lock_api", ] -[[package]] -name = "spin" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" - -[[package]] -name = "spki" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" -dependencies = [ - "base64ct", - "der 0.6.1", -] - -[[package]] -name = "spki" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" -dependencies = [ - "base64ct", - "der 0.7.10", -] - [[package]] name = "spm_precompiled" version = "0.1.4" @@ -7758,15 +6544,6 @@ dependencies = [ "tiktoken-rs 0.7.0", ] -[[package]] -name = "temp-env" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96374855068f47402c3121c6eed88d29cb1de8f3ab27090e273e420bdabcf050" -dependencies = [ - "parking_lot", -] - [[package]] name = "tempfile" version = "3.27.0" @@ -8045,9 +6822,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.48.0" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", @@ -8055,16 +6832,16 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.3", + "socket2", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -8081,33 +6858,13 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rayon" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cf33a76e0b1dd03b778f83244137bd59887abf25c0e87bc3e7071105f457693" -dependencies = [ - "rayon", - "tokio", -] - -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls 0.23.40", + "rustls", "tokio", ] @@ -8134,29 +6891,8 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-websockets" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" -dependencies = [ - "base64 0.22.1", - "bytes", - "futures-core", - "futures-sink", - "http 1.4.0", - "httparse", - "rand 0.8.6", - "ring", - "rustls-pki-types", + "pin-project-lite", "tokio", - "tokio-rustls 0.26.4", - "tokio-util", - "webpki-roots 0.26.11", ] [[package]] @@ -8214,7 +6950,6 @@ dependencies = [ "serde", "serde_spanned 0.6.9", "toml_datetime 0.6.11", - "toml_write", "winnow 0.7.15", ] @@ -8239,47 +6974,12 @@ dependencies = [ "winnow 1.0.2", ] -[[package]] -name = "toml_write" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" - [[package]] name = "toml_writer" version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" -[[package]] -name = "tonic" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" -dependencies = [ - "async-trait", - "axum", - "base64 0.22.1", - "bytes", - "h2 0.4.14", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "hyper 1.9.0", - "hyper-timeout", - "hyper-util", - "percent-encoding", - "pin-project", - "prost 0.13.5", - "socket2 0.5.10", - "tokio", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tonic" version 
= "0.14.5" @@ -8290,16 +6990,16 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", - "h2 0.4.14", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "http-body-util", - "hyper 1.9.0", + "hyper", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", - "socket2 0.6.3", + "socket2", "sync_wrapper", "tokio", "tokio-stream", @@ -8309,20 +7009,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tonic-build" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build 0.13.5", - "prost-types 0.13.5", - "quote", - "syn 2.0.117", -] - [[package]] name = "tonic-build" version = "0.14.5" @@ -8342,8 +7028,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" dependencies = [ "bytes", - "prost 0.14.3", - "tonic 0.14.5", + "prost", + "tonic", ] [[package]] @@ -8354,12 +7040,12 @@ checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a" dependencies = [ "prettyplease", "proc-macro2", - "prost-build 0.14.3", - "prost-types 0.14.3", + "prost-build", + "prost-types", "quote", "syn 2.0.117", "tempfile", - "tonic-build 0.14.5", + "tonic-build", ] [[package]] @@ -8390,8 +7076,8 @@ dependencies = [ "bitflags 2.11.1", "bytes", "futures-util", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "iri-string", "pin-project-lite", "tower", @@ -8558,16 +7244,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "tryhard" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5" -dependencies = [ - "pin-project-lite", - "tokio", 
-] - [[package]] name = "typeid" version = "1.0.3" @@ -8592,15 +7268,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" -[[package]] -name = "uncased" -version = "0.9.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697" -dependencies = [ - "version_check", -] - [[package]] name = "unicase" version = "2.9.0" @@ -8680,7 +7347,7 @@ dependencies = [ "flate2", "log", "once_cell", - "rustls 0.23.40", + "rustls", "rustls-pki-types", "serde", "serde_json", @@ -8697,12 +7364,12 @@ checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64 0.22.1", "cookie_store", - "der 0.8.0", + "der", "flate2", "log", "native-tls", "percent-encoding", - "rustls 0.23.40", + "rustls", "rustls-pki-types", "serde", "serde_json", @@ -8720,7 +7387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64 0.22.1", - "http 1.4.0", + "http", "httparse", "log", ] @@ -8737,12 +7404,6 @@ dependencies = [ "serde", ] -[[package]] -name = "urlencoding" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" - [[package]] name = "utf16_iter" version = "1.0.5" @@ -8869,226 +7530,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "velo" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e0cc874c11ea3d03afd7adf90529bcf3df374039deff65bc02a26e71bb5814b" -dependencies = [ - "anyhow", - "bytes", - "serde", - "tokio", - "tokio-util", - "velo-common", - 
"velo-discovery", - "velo-events", - "velo-messenger", - "velo-observability", - "velo-queue", - "velo-rendezvous", - "velo-streaming", - "velo-transports", -] - -[[package]] -name = "velo-common" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2824d7667c4ee992dfc58c30799518f504d39c91f1e83962d55cb8c44cfb67" -dependencies = [ - "bytes", - "rmp-serde", - "serde", - "serde_bytes", - "serde_json", - "thiserror 2.0.18", - "uuid", - "xxhash-rust", -] - -[[package]] -name = "velo-discovery" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41107947fe3d15972c56815e42fd2485ca4ba58a5a69ec1a6fd61c00966f8734" -dependencies = [ - "anyhow", - "async-stream", - "bytes", - "fs4", - "futures", - "parking_lot", - "rmp-serde", - "serde", - "serde_json", - "tokio", - "tokio-util", - "tracing", - "uuid", - "velo-common", -] - -[[package]] -name = "velo-events" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2504e857f1ed52ad96ce73792a724be880ba75f3579a27050395e81d38cb42d0" -dependencies = [ - "anyhow", - "dashmap", - "futures", - "parking_lot", - "serde", - "tokio", - "tokio-util", - "tracing", - "uuid", - "xxhash-rust", -] - -[[package]] -name = "velo-messenger" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ada8ea47ab39a82ef9101024452a3c605bcf5c0acad82c76afcf48a17c0450cf" -dependencies = [ - "anyhow", - "bs58", - "bytes", - "dashmap", - "derive-getters", - "derive_builder", - "flume 0.12.0", - "futures", - "lru 0.12.5", - "parking_lot", - "rmp-serde", - "serde", - "serde_json", - "thiserror 2.0.18", - "tokio", - "tokio-util", - "tracing", - "uuid", - "velo-common", - "velo-discovery", - "velo-events", - "velo-observability", - "velo-transports", -] - -[[package]] -name = "velo-observability" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"57813414b19b0f845744fcf158ed4b87afe4f5c59ee99d58872c8e5d5534d61b" -dependencies = [ - "prometheus", - "tracing", -] - -[[package]] -name = "velo-queue" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7bcce33f5710d87e33e1eacc5472300cbe6acdfb5f2cd6929c12f2e4908ef38" -dependencies = [ - "bytes", - "dashmap", - "flume 0.12.0", - "futures", - "rmp-serde", - "serde", - "thiserror 2.0.18", - "tokio", - "tracing", -] - -[[package]] -name = "velo-rendezvous" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1eac77f329b0c3dba1b36114c969e0a7d680bb4a6436720754bef14a24fedde" -dependencies = [ - "anyhow", - "bytes", - "dashmap", - "futures", - "serde", - "serde_json", - "tracing", - "velo-common", - "velo-messenger", - "velo-observability", -] - -[[package]] -name = "velo-streaming" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dc0d71f73c6f369302bb94b05341a807a5693f6c321fc208d9c9a974d918a" -dependencies = [ - "anyhow", - "bytes", - "dashmap", - "derive_builder", - "flume 0.12.0", - "futures", - "rmp-serde", - "serde", - "serde_json", - "socket2 0.6.3", - "thiserror 2.0.18", - "tokio", - "tokio-util", - "tonic-build 0.13.1", - "tracing", - "uuid", - "velo-common", - "velo-messenger", - "velo-observability", - "velo-transports", -] - -[[package]] -name = "velo-transports" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb5a348cc2fe13fc560824f170d3bdc3d10ec3a11802bb32489a59f49eb12409" -dependencies = [ - "anyhow", - "async-nats", - "axum", - "bs58", - "bytes", - "dashmap", - "flume 0.12.0", - "futures", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "hyper 1.9.0", - "hyper-util", - "nix 0.30.1", - "parking_lot", - "prost 0.13.5", - "rmp-serde", - "serde", - "serde_json", - "socket2 0.6.3", - "thiserror 2.0.18", - "tokio", - "tokio-stream", - "tokio-util", - "tonic 
0.13.1", - "tonic-build 0.13.1", - "tower", - "tracing", - "velo-common", - "velo-observability", -] - [[package]] name = "version-compare" version = "0.2.1" @@ -9221,22 +7662,22 @@ dependencies = [ "asynk-strim-attr", "axum", "futures", - "http-body 1.0.1", + "http-body", "itertools 0.14.0", "libc", "llm-multimodal", - "prost 0.14.3", - "prost-types 0.14.3", + "prost", + "prost-types", "rmpv", "serde", "serde_json", "serde_with", - "socket2 0.6.3", + "socket2", "thiserror-ext", "tokio", "tokio-stream", "tokio-util", - "tonic 0.14.5", + "tonic", "tonic-prost", "tonic-prost-build", "tower-http", @@ -9308,12 +7749,6 @@ dependencies = [ "winnow 1.0.2", ] -[[package]] -name = "vsimd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" - [[package]] name = "wait-timeout" version = "0.2.1" @@ -9521,6 +7956,15 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" +[[package]] +name = "win-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b7b128a98c1cfa201b09eb49ba285887deb3cbe7466a98850eb1adabb452be5" +dependencies = [ + "windows", +] + [[package]] name = "win_uds" version = "0.2.2" @@ -9529,7 +7973,7 @@ checksum = "7dd30a1a28a3799479cbf4e17284a220ea9ff6bad098a9d0224543a5d1efe1da" dependencies = [ "async-io", "futures-io", - "socket2 0.6.3", + "socket2", ] [[package]] @@ -9563,6 +8007,19 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45296b64204227616fdbf2614cefa4c236b98ee64dfaaaa435207ed99fe7829f" +dependencies = [ + "windows_aarch64_msvc 0.34.0", + 
"windows_i686_gnu 0.34.0", + "windows_i686_msvc 0.34.0", + "windows_x86_64_gnu 0.34.0", + "windows_x86_64_msvc 0.34.0", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -9714,6 +8171,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -9726,6 +8189,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -9750,6 +8219,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -9762,6 +8237,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -9786,6 
+8267,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -9922,12 +8409,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" -[[package]] -name = "xmlparser" -version = "0.13.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" - [[package]] name = "xxhash-rust" version = "0.8.15" diff --git a/Cargo.toml b/Cargo.toml index 61485cac..0a72b866 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ members = [ "pegainfer-qwen3-4b", "pegainfer-qwen35-4b", "pegainfer-kv-cache", + "pegainfer-kv-offload", # ---- pegainfer-comm (EP all-to-all) ---- "pegainfer-comm", "pegainfer-comm/crates/pegainfer-comm-build-utils", @@ -40,12 +41,9 @@ members = [ "kvbm/dynamo-kv-hashing", "kvbm/dynamo-kv-router", "kvbm/kvbm-common", - "kvbm/kvbm-config", "kvbm/kvbm-consolidator", - "kvbm/kvbm-engine", "kvbm/kvbm-kernels", "kvbm/kvbm-logical", - "kvbm/kvbm-physical", ] # Inherited by dynamo-ported crates that use `edition.workspace = true` etc. 
@@ -66,12 +64,9 @@ dynamo-memory = { path = "kvbm/dynamo-memory" } dynamo-kv-hashing = { path = "kvbm/dynamo-kv-hashing" } dynamo-kv-router = { path = "kvbm/dynamo-kv-router", features = ["metrics"] } kvbm-common = { path = "kvbm/kvbm-common" } -kvbm-config = { path = "kvbm/kvbm-config" } kvbm-consolidator = { path = "kvbm/kvbm-consolidator" } -kvbm-engine = { path = "kvbm/kvbm-engine" } kvbm-kernels = { path = "kvbm/kvbm-kernels" } kvbm-logical = { path = "kvbm/kvbm-logical" } -kvbm-physical = { path = "kvbm/kvbm-physical" } # ---- third-party ---- anyhow = "1.0" async-nats = { version = "0.45.0", features = ["service"] } @@ -96,6 +91,11 @@ cudarc = { version = "0.19.7", features = [ "cublas", "f16", "nccl", + # nvrtc: embedded pegaflow-core's transfer/kernel.rs references the nvrtc + # bindings unconditionally (its KernelBackend JIT-compiles the copy kernel). + # Lazy per-symbol loading (0.19.5+) keeps this off the runtime driver floor, + # so it stays compatible with the cuda-12090 binding level (issue #263). + "nvrtc", ] } cxx = "1.0.187" cxx-build = "1.0.187" @@ -129,6 +129,7 @@ parking_lot = "0.12.5" pegainfer-bench = { path = "pegainfer-bench" } pegainfer-core = { path = "pegainfer-core" } pegainfer-kv-cache = { path = "pegainfer-kv-cache" } +pegainfer-kv-offload = { path = "pegainfer-kv-offload" } pegainfer-cupti = { path = "pegainfer-cupti" } pegainfer-deepseek-v4 = { path = "pegainfer-deepseek-v4" } pegainfer-engine = { path = "pegainfer-engine" } diff --git a/docs/index.md b/docs/index.md index b2fafd74..e07d6b96 100644 --- a/docs/index.md +++ b/docs/index.md @@ -92,6 +92,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | --- | --- | | `subsystems/runtime/runtime.md` | Runtime complexity is controlled by a shared `pegainfer-core` that owns the generation contract and orchestration; per-model crates implement `ModelForward` so prefill/decode and hybrid attention stay hidden from the caller. 
State (`&mut`) is separated from weights (`&self`) for future bs > 1. | | `subsystems/runtime/kv-cache-design.md` | Dynamo 式 logical/physical 分层 KV cache:BlockManager 管 block 生命周期和 admission,PhysicalBackend trait 管 GPU 内存和布局(FullAttention / MLA)。支持 TP / DP。基于 vLLM/Dynamo/pegaflow 调研。 | +| `subsystems/runtime/pegaflow-offload-integration.md` | 把 `pegaflow-core` 当进程内 Rust 库做 KV 卸载物理后端(HBM→DRAM/SSD/RDMA),补 kvbm 没写的卸载层。**Qwen3-4B full-attn 首发,端到端已在真实 GPU 跑通并验证**(async SAVE+LOAD 接进 executor/scheduler,纯 CPU-hit 与 GPU+CPU 组合 hit 恢复后 logits 与冷算一致)。pegaflow 经 git rev pin(#331+#333)。默认关,未接 server CLI。linear 排除,sparse 暂缓。 | ## subsystems / scheduler diff --git a/docs/subsystems/runtime/pegaflow-offload-integration.md b/docs/subsystems/runtime/pegaflow-offload-integration.md new file mode 100644 index 00000000..2af55b94 --- /dev/null +++ b/docs/subsystems/runtime/pegaflow-offload-integration.md @@ -0,0 +1,157 @@ +# pegaflow KV 卸载接入 Spec + +> **TL;DR**: 把 `pegaflow-core` 当**进程内 Rust 库**做 KV 卸载的物理后端(HBM→DRAM/SSD/RDMA),补上 kvbm 留着没写的卸载层。connector 大脑(决定 load/save 哪些 block)用 kvbm logical/physical 分层思想自建,pegaflow 退为语义无关的 raw block transfer 后端。**路线已调整为 Qwen3-4B full-attn 首发**(原计划 Kimi 首发):page-first 单 buffer 经 pegaflow `block_stride_bytes`(PR #331)适配。**端到端已在真实 GPU 上跑通并验证**:async SAVE + async LOAD 接进 `Qwen3Executor` + scheduler,`tests/kv_offload_cpu_hit.rs` 覆盖纯 CPU-hit 与 GPU+CPU 组合 hit,恢复后 logits 与冷算一致;连接层 `OffloadEngine` + `tests/cpu_roundtrip.rs` 字节级一致。默认关(builder flag opt-in),未接 server CLI。**Qwen3.5 linear/SSM state 明确排除**;**DeepSeek sparse 暂缓**。 +> +> Last touched: 2026-06 + +## 0. 
实现状态(2026-06) + +已落地并验证: + +- **pegaflow `block_stride_bytes`**(PR #331 → novitalabs/pegaflow,`feat/inproc-load` 基于其上):解耦"块间步长"与"每块拷贝大小",让 page-first fused buffer 能注册。**已合入 master**。 +- **pegaflow 进程内 load API**(PR #333,**已合入**,squash 进 #331 的 `07cac7e`):`LoadCompletion::{Shm,Channel}` + `batch_load_kv_blocks_multi_layer_inproc` → `oneshot::Receiver`,去掉 in-process 调用方对 shm `LoadState` 的依赖(Rust 进程内不需要),非阻塞 poll。 +- **`pegainfer-kv-offload::OffloadEngine`**:拥有 `PegaEngine` + 内嵌 tokio runtime;`Registration::from_buffer` 把 fused page-first buffer 映射成 per-layer 注册(**单段 `[K|V]`**:fused layout 里 K/V 本就连续 = `layer_stride` 一段,`block_stride = page_stride`,`segments=1`——不是 K/V split,那条路需要 `kv_stride > bytes_per_block`,此处不成立)。`save`(async fire-and-forget)/`save_blocking`(eviction handoff,同步捕获)/`query`(GPU+CPU hit)/`load`(oneshot)/`flush_saves`/`evict_all`。 +- **`KvBuffer::device_ptr`**(kv-cache):注册用的稳定基址。 +- **kvbm↔bytes 桥**(kv-cache `RequestKv`):`prompt_block_hashes` / `assigned_block_hashes` / `prefix_matched_blocks`,`SequenceHash::as_u128()` → 16B content key。 +- **`tests/cpu_roundtrip.rs`**:真实 `KvBuffer` 上写已知 pattern → save → query → load 到**另一组** block → 字节级比对 + 零块负向控制。**通过**。 +- **live 接线(§9,已落地)**:`Qwen3Executor` 持 `Option`(`Qwen3OffloadOptions` opt-in,默认关);SAVE hook(`save_sealed_blocks`,async fire-and-forget)+ 非阻塞 prefetch admission(`begin_kv_prefetch`/`drain_ready_prefetch`/`wait_ready_prefetch`,scheduler `loading` 态)。`tests/kv_offload_cpu_hit.rs` 单测序跑两幕——纯 CPU restore(`gpu_hit==0`)与 GPU+CPU 组合 hit(G=3+C=3 拼成一段连续前缀)——恢复后 first-token logits 与冷算一致(mean Δ≈0.03 nat,bf16 floor)。 +- **三处正确性加固**(toxic-review 后):① query lease 在 `reserve_loaded_blocks` 失败 / `load` 提交失败时显式 `release_query_lease`,不再泄漏到 600s TTL;② admission 拒绝(context/KV budget/未知 LoRA)时 `drop_request` 释放已 settle 的 prefetch 状态,不再泄漏已 commit 的 block;③ async SAVE 把被保存 block 的 `ImmutableBlock` 强引用(`KvBlockGuard`)随 spawn 持到 D2H 落地才 drop——封死"请求结束→slot 重分配→D2H 抓到错 KV 写进旧 hash"的静默腐蚀窗口。 + +未接 server CLI(仅经 
`start_engine_with_offload` / 测试入口)。**依赖已从 fork 摘除**:PR #331+#333 均合入上游 master(squash 进 `07cac7e`),`third_party/pegaflow` 已删,`pegaflow-core` 改为 pin 到该 rev 的 **git 依赖**(见 §5.2),GPU 测试在 git-dep 下行为不变(delta 一致)。 + +相关:[kv-cache-design.md](kv-cache-design.md)(logical/physical 分层,已把 pegaflow 列为设计调研)· [qwen3-kvbm-integration-spec.md](qwen3-kvbm-integration-spec.md)(kvbm-logical 已接入)· `models/kimi-k2/kv-cache-design.md`(Kimi 已用 `BlockPool`)· `models/qwen3/prefix-cache.md`(HBM 内前缀复用已落地)。 + +--- + +## 1. 定位:pegaflow 是 raw 后端,connector 大脑要自建 + +pegaflow(`third_party/pegaflow`,novita,Apache-2.0)原本是 **vLLM 的 KV connector 服务端**:KV 的编排逻辑(何时 save、query 几个 block、prefix 匹配、与 scheduler 的 admission/preemption 交互)全在 vLLM 的 Python connector 那一侧,`pegaflow-core` 只是底下干 D2H/H2D + 分层存储的**肌肉**。 + +pegainfer 不是 vLLM,那套 Python connector 一行用不上。接入要做的是**用 Rust 自建那颗 connector 大脑**——而 kvbm 的 logical/physical 分层正是它的骨架: + +``` +per-model scheduler ← 策略:哪些 block 该 resident(full 前缀 / MLA 全前缀 / 未来稀疏选择) + ↓ 产出 load/save 意图(一组 block) +connector(kvbm logical/physical 思想)← 机制:block identity、状态机、GPU slot 编排、transfer 调度 + ↓ 语义无关的 raw transfer +pegaflow-core ← 机制底座:D2H/H2D、DRAM/SSD/RDMA 分层 +``` + +## 2. 战略决策:pegaflow 取代 kvbm 死代码做物理 tier + +pegainfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是分层卸载,但**至今零接线、是死代码**(无任何非 kvbm crate 依赖)。同时养两套分层卸载违反项目复杂度红线。本 spec 采纳:**`kvbm-logical`(逻辑层 + 前缀匹配)保留,pegaflow-core 顶替它下面缺失的物理卸载层,砍掉 `kvbm-physical`/`kvbm-engine`**。理由:pegaflow 同组维护、已上 PyPI、有 H800 benchmark、库化干净;kvbm 那两层是纯负债。 + +## 3. 
三模型三 KV 形态 → connector 边界(实据) + +| 模型 | KV 形态 | active set | 跨请求复用 | 卸载结论 | +| --- | --- | --- | --- | --- | +| **Kimi-K2 MLA** | paged,per-layer ckv/kpe arena,后端是 `BlockPool`;latent 68.6 KiB/token,无 per-head | 无(dense 全前缀) | 有(HBM 内 prefix cache 已落地) | **首发**:接入面最干净,layout 直接适配 pegaflow registration | +| **Qwen3 / Qwen3.5 full-attn** | paged,page-first 单 buffer,`PagePool` | 无(dense 全前缀) | 有(前缀缓存已落地) | **次发**:page-first 与 pegaflow `stride==copy-size` ABI 冲突,需加 `block_stride`(见 §5.R1) | +| **Qwen3.5 linear(24 层)** | per-request `RecurrentState` [32,128,128] f32 2 MiB/层,非 paged、独立分配 | 无(每步读写整个 matrix) | **零**(this-request 有损摘要,非 content-addressable) | **排除**:offload 无 prefix/dedup 收益;省显存是 per-request swap-out,另一套机制 | +| **DeepSeek-V4 sparse** | per-request per-layer dense arena [window\|compressed],非 paged;compressor 4:1 | **显式**:`topk_idxs` = window 行 + indexer 选中 compressed 行,token/row 粒度,每步重选 | 部分 | **暂缓**:compressor 已控 footprint;indexer 信号现成但 token 粒度 ≠ block 粒度(见 §7) | + +**边界结论**:connector 只收 **block-structured、content-addressable** 的 KV(MLA latent / full-attn paged)。recurrent/SSM state 不进 connector。稀疏的 active-set gather 是独立的、未来的课题。 + +证据:Kimi `pegainfer-kimi-k2/src/runner/{worker.rs:612-619, cache.rs:63-80, mla.rs:38-48}`、`scheduler.rs:16,27,146,180`、`pool.rs:123`;Qwen3.5 linear `pegainfer-qwen35-4b/src/...recurrent.rs`、`batch_decode_graph.rs:82-86`;DeepSeek `pegainfer-deepseek-v4/src/...state.rs:220, indexer.rs:609-670`、`csrc/.../deepseek_indexer.cu:470-527`。 + +## 4. 路线 + +1. **Kimi MLA 首发** —— pegaflow 做 `BlockPool` 下的 host/SSD tier;block evict 时 demote 到 host,前缀 query 命中时从 host restore。带宽便宜(latent 小),layout 零阻抗。 +2. **Qwen full-attn 次发** —— 先给 pegaflow 加 `block_stride_bytes`(R1),再接 page-first buffer。 +3. **linear 排除、sparse 暂缓**。 + +## 5. 可行性(对抗验证结论,附证据) + +四条承重假设由 10-agent workflow 对抗验证: + +1. 
**✅ 进程内注册裸指针,无 IPC、无第二进程**:`register_context_layer_batch(data_ptrs: &[u64])`(`pegaflow-core/src/lib.rs:242-259`)收裸设备地址,拷贝路径直接喂给 driver API `cuMemcpyDtoHAsync_v2`(`transfer/memcpy.rs:82-89`);IPC 只在 server/Python 层,core 零 IPC 调用点。cudarc 附设备 **primary context**(与 pegainfer 同一),自建 worker stream。 +2. **✅ 依赖无致命冲突**:cudarc 单 major(0.19.3↔0.19.7 统一),cuda-12080/12090 共存(build.rs 取高版本),tokio/tonic/prost 兼容。**依赖行**(git rev pin 到上游 master `07cac7e`,含 #331+#333;`default-features=false` 砍掉 pegaflow 自带的 `cuda-12`/`rdma`,靠 workspace cudarc 提供的 `cuda-12090`+`nvrtc` 满足——pegaflow-core 无 `cfg(cuda-12)` gate): + ```toml + pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false } + ``` + 下次再改 pegaflow:临时换回 path dep 共同开发 → 提 PR → 合入后 re-pin rev。 + **为何 `cuda-12` 而非 `cuda-13`**(本机明明是 CUDA 13.3 toolkit / 13.0 driver):pegainfer 有意锁 `cudarc/cuda-12090`(`Cargo.toml:92-93`,issue #263——配 cudarc 0.19.5+ 的 per-symbol lazy loading,压低 binding level 以**不抬高 runtime driver floor**、保宽部署兼容;故意不用 `cuda-version-from-build-system` 自动,否则 driver floor 会跟着构建机 toolkit 走)。cudarc 在 workspace 是**单实例、feature 取并集后选最高版本**:pegaflow 用 `cuda-12` 并集后仍是 12090、不抬 floor;用 `cuda-13`(→ `cudarc/cuda-13000`)会把**整个 workspace 含 pegainfer 自己**顶到 13000、driver floor 抬到 CUDA 13,撞翻 #263。整体迁 cu13 是独立决策(须同时改 pegainfer 的 cudarc + revisit #263),本期不做。 +3. **⚠️ Layout**:block-hash 键直接适配(`u64→Vec`);page-first layout **不适配**(见 §5.R1);Kimi per-layer 布局**天然适配**。 +4. **✅ 流同步**:host-side 粗同步可解——save 前 pegainfer 必须 `synchronize()` compute stream(pegaflow 私有 stream 只自同步,`gpu_worker.rs:520-528`),restore 前自旋 poll `LoadState`。代价:损 compute/offload 重叠(见 §6.R3)。 + +## 6. 
connector 接口(dense-first,稀疏留门不展开) + +两层分离,稀疏复杂性全关在 policy 侧: + +```rust +// mechanism —— pegaflow backend,永不懂稀疏/前缀 +trait KvOffloadBackend { + fn load(&self, items: &[(BlockHash, GpuSlot)]) -> LoadHandle; // 任意集合,phase 无关 + fn offload(&self, items: &[(GpuSlot, BlockHash, OffloadHint)]); + fn poll(&self, h: LoadHandle) -> LoadState; +} +enum OffloadHint { ReusableAcrossRequests, TransientDiscard } + +// policy —— per-model scheduler,懂自己的拓扑 +trait KvResidencyPolicy { + fn required_blocks(&self, req: &RequestCtx, phase: Phase) -> SmallVec; + fn save_hint(&self, block: BlockId) -> OffloadHint; +} +``` + +**现在做对、未来免费受益的三个决策**(即便 dense-first 也按这个写,成本为零): +- 接口说 **block 集合**不说 prefix-count(full attention 产出的集合恰好连续 = 退化特例); +- admission 按 **active working set ≤ HBM** 写(dense 下 active=total,退化); +- `load` **phase-agnostic**(不绑 prefill,未来 decode gather 是"启用"不是"重设计")。 + +第一版:`required_blocks` 对 Kimi/Qwen 就是"全前缀",`OffloadHint` 全 `ReusableAcrossRequests`,只走 prefill-前 + evict 路径。 + +## 7. 风险 + +| # | 风险 | 等级 | 处置 | +| --- | --- | --- | --- | +| R1 | Qwen page-first vs pegaflow `stride==copy-size` ABI 不兼容 | major | 给 `KVCacheRegistration` 加 `block_stride_bytes`(改 pegaflow ~几十行,`instance.rs` + `transfer/mod.rs`);**Kimi 首发绕开此风险** | +| R2 | save 前漏 `synchronize()` → 静默 D2H 半写 KV,pegaflow 不校验 | major | bridge 层把 synchronize 设成不可绕过 + debug 断言 | +| R3 | host-side 粗同步损 compute/offload 重叠 | minor | 第一版接受;后续给 pegaflow 加 device-side event-injection | +| R4 | 依赖误配(裸 default-features=false / 漏 cuda-12) | minor | §5.2 依赖行已定,CI 编译验证 | +| R5 | 稀疏 active-set offload 的 token-vs-block 粒度落差 | 已知开放 | 见下,不在本期 | + +**稀疏(已知开放问题,不在本期)**:连 dynamo KVBM 都没解 sparse attention offloading——它的复用是 radix 前缀、offload 是 frequency/LRU、tier 是整请求异步流动,对 SWA 只在 router 透传 `kv_cache_spec_sliding_window` 做 window-aware 前缀,对 topk 零处理。没有现成抽象可继承。pegainfer 侧 DeepSeek 的 indexer 已产出显式可拦截的 active-set 信号,但 token/row 粒度 ≠ block 粒度,且 compressor 已控 footprint 当前不需 offload。机制层(内容寻址 + 可插拔 policy + 语义无关 transfer)本就不堵稀疏,真正缺的 decode-loop 
gather 大脑到时候结合具体模型新写更准。 + +## 8. 下一步:Kimi MLA 最小 spike + +**目标**:进程内跑通一个 page 的 register→save→evict→load,证伪"无先例"风险 + 量带宽。 + +1. 新 bridge crate,path-dep pegaflow-core(§5.2 依赖行),`cargo build` 验依赖。 +2. Kimi:`new_with_config` → `register_context_layer_batch`(per-layer ckv/kpe,segments=2,per-layer 布局天然适配)。 +3. 一个 page:`synchronize` → `save` →(手动 evict)→ `query` 命中 → `load` 回 GPU → 比对 bytes 一致。 +4. 量 host↔HBM 带宽 + save 前 synchronize 的 host stall(确认 R3 可接受)。 +5. 通过后再决定给 pegaflow 加 `block_stride` 上 Qwen page-first(R1)。 + +**阻塞**:等 §2 战略决策最终拍板(pegaflow 取代 kvbm 卸载层 = 是)。 + +## 9. live 接线设计(Qwen3-4B,**已落地**) + +> 状态:已实现并在真实 GPU 上验证(§0)。下文是设计与实现一致的记录;落地时相对原设计的偏差与加固见末尾「实现注记」。 + +连接层已就绪(§0),把它接进 `Qwen3Executor` + `scheduler.rs` 的真实推理路径。`Qwen3Executor` 持 `kv_mgr`(`BlockPool`+`KvBuffer`)与 `request_kvs`;在构造(`from_runtime`/`single`,model 移入 RankWorker 之前,此时 `KvBuffer` + `device_ctx().stream` 都在手)建一个 `Option`,opt-in(builder flag,**不加 env**),默认关,保现有路径不动。 + +**SAVE(async,best-effort)**:`apply_prefill`/`apply_decode` 封块后(此刻 compute stream 已随 `run_step` 同步 → 满足 §0 的跨 stream ordering 约束),取 `rkv.assigned_block_hashes()`,按 per-request `saved_cursor`(初值 = `prefix_matched_blocks()`,GPU-hit 前缀已 resident,跳过)保存新封的 `(page_id, hash)`,`offload.save(...)` fire-and-forget,推进 cursor。 + +**LOAD(async,GPU+CPU hit,非阻塞 admission)**:admission 把 `match_and_add_prefix` 拆成"建 RequestKv → 算 GPU hit G → query CPU [G..F] → 异步 load → LoadingKv 轮询": + +1. `rkv = pool.new_request(...)`;`hashes = rkv.prompt_block_hashes()`(F 块)。 +2. `manager.match_blocks(&seq_hashes)` 数出 GPU 命中前缀 G(**持其 `ImmutableBlock` 不 drop**,防 load 期间被 evict)。 +3. `offload.query(req_id, &hashes[G..F])` → CPU 命中 C(连续)+ lease。 +4. `manager.allocate_blocks(C)` 拿 C 个 `MutableBlock`(DMA 落点),取 `block_id()` 列表;`offload.load(lease, page_ids)` → `LoadHandle`。请求进 `LoadingKv{rkv, handle, muts, hashes[G..G+C], gpu_imms}` holding 态,**不 prefill**。 +5. 
每 tick `handle.poll()`:`Ready` → 对每个 `mut` `.stage(hash, bs)` + `manager.register_block(..)` 注册进 registry(用的就是 `BlockPool::new` 给 padding 块用的同一套公开 API,**无需改 kvbm**);随后 `rkv.match_and_add_prefix()` 自然命中 G+C 连续前缀,`kv_position=(G+C)*bs`;drop holding 的 imms(sequence 自持)。请求转入正常 prefill(suffix = 剩余 token)。 +6. `C==0` → 直接 prefill(纯 GPU hit,与今日行为一致)。 + +**为何 register→rematch 而非直接注入 sequence**:复用现成的 `match_and_add_prefix`(GPU+CPU 在它眼里就是一段连续前缀),零 kvbm 改动;register 与 rematch 同 tick、且 holding 了 G 的 imms,eviction 窗口为零。最坏(真被 evict)只是少命中、退化为多 prefill,不损正确性。 + +**scheduler 状态机**:`scheduler_loop` 新增 `loading: Vec`,每 tick `reclaim_ready_prefetch`(settle 完的回 `deferred` 队首)+ `offer_prefetch`(未 offer 的 deferred 试 prefetch,起 load 的移入 `loading`);空闲且有 `loading` 时 `block_on_loading` 阻塞等一个 DMA。`OffloadEngine` 的 `block_on`(query/flush)只在 scheduler 这个**纯 OS 线程**调用,`debug_assert` 护住误用。 + +风险:preemption/release 时须 drop holding 的 mutable/immutable(RAII 已覆盖);admission KV 预算要把 loading 占用的 C 块计入 in-flight。 + +**实现注记(相对原设计的偏差 + toxic-review 加固)**: + +- **prefetch 状态落在 executor 而非 scheduler**:`PrefixProbe`(持 G 的 imms + commit 后的 C 块)、`LoadReservation`(C 个 MutableBlock DMA 落点)、`LoadHandle` 都封进 `Qwen3Executor.prefetch: HashMap`,scheduler 只跟 `RequestId`。commit 在 `seq_hashes[gpu_hit + i]`(GPU+CPU 偏移对齐,组合 hit 测试守这条)。 +- **lease 泄漏修复**:`query` 创建的 pegaflow lease 在 `reserve_loaded_blocks` 失败 / `load` 提交失败时 `OffloadEngine::release_query_lease` 显式释放(`QueryLeaseId` 是 `Copy` 裸 token、无 Drop,丢掉只会挂到 600s TTL)。 +- **拒绝清理**:admission(context/KV budget)与未知 LoRA 拒绝路径补 `drop_request`——否则一个已 settle prefetch 的请求被拒后,commit 的 block + map entry 永久泄漏。 +- **SAVE 防 slot 复用腐蚀**:async `save()` 把被存 block 的 `ImmutableBlock` 强引用(`KvBlockGuard`,与 `block_ids` 1:1)随 spawn 持到 pegaflow D2H 落地才 drop。否则短请求结束 → slot 回收重分配 → 新请求覆写 → 在途 D2H 抓到新 KV 写进旧 hash = 静默腐蚀。guard 在 offload 线程并发 drop 是安全的(kvbm `BlockStore` 单 Mutex、有并发 drop race 处理);`flush_saves` await 各 save 任务后 guard 才落,故 evict 前先 flush 仍能把 block 排空。 +- 
**测试**:`tests/kv_offload_cpu_hit.rs` 合一个顺序 `#[test]`(避免两 executor 撞同一 device + pegaflow instance_id),先纯 CPU 后组合 hit。 diff --git a/kvbm/kvbm-logical/src/manager/mod.rs b/kvbm/kvbm-logical/src/manager/mod.rs index 248480e4..699d3c07 100644 --- a/kvbm/kvbm-logical/src/manager/mod.rs +++ b/kvbm/kvbm-logical/src/manager/mod.rs @@ -89,6 +89,16 @@ impl BlockManager { Ok(()) } + /// Evict every cached-but-unused block: drain the inactive pool back to the + /// reset pool. Active blocks (held by a request or an external strong ref, + /// e.g. a leaked padding reservation) are untouched. Unlike + /// [`reset_inactive_pool`](Self::reset_inactive_pool) this makes no + /// assertion about the resulting free count, so it is safe to call on a + /// pool that still has pinned blocks — a cold-cache flush, not a reset. + pub fn evict_inactive(&self) { + drop(self.store.drain_inactive_to_mutable()); + } + /// Register a batch of completed blocks. pub fn register_blocks(&self, blocks: Vec>) -> Vec> { blocks diff --git a/pegainfer-kv-cache/src/buffer.rs b/pegainfer-kv-cache/src/buffer.rs index 52dad3b3..381ad711 100644 --- a/pegainfer-kv-cache/src/buffer.rs +++ b/pegainfer-kv-cache/src/buffer.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use cudarc::driver::{CudaSlice, CudaStream}; +use cudarc::driver::{CudaSlice, CudaStream, DevicePtr}; use half::bf16; use crate::KvLayout; @@ -51,6 +51,18 @@ impl KvBuffer { &self.inner.buffer } + /// Base device address of the fused KV buffer. + /// + /// Stable for the buffer's lifetime — cudarc allocations don't move — so + /// the KV-offload connector registers this once with pegaflow and the + /// page-first [`KvLayout`] strides reach every (layer, block, K/V) segment + /// from it. The returned address outlives the transient stream-ordering + /// guard precisely because the `Arc` keeps the slice alive. 
+ pub fn device_ptr(&self, stream: &CudaStream) -> u64 { + let (ptr, _guard) = self.inner.buffer.device_ptr(stream); + ptr + } + pub fn num_blocks(&self) -> usize { self.inner.num_blocks } diff --git a/pegainfer-kv-cache/src/lib.rs b/pegainfer-kv-cache/src/lib.rs index 26031b76..a4476644 100644 --- a/pegainfer-kv-cache/src/lib.rs +++ b/pegainfer-kv-cache/src/lib.rs @@ -7,7 +7,7 @@ mod view; pub use buffer::KvBuffer; pub use layout::KvLayout; pub use manager::KvCacheManager; -pub use pool::{BlockPool, RequestKv}; +pub use pool::{BlockPool, KvBlockGuard, LoadReservation, PrefixProbe, RequestKv}; pub use view::{KvView, KvViewDesc}; pub use kvbm_logical; diff --git a/pegainfer-kv-cache/src/pool.rs b/pegainfer-kv-cache/src/pool.rs index bead1c74..dc66fdf6 100644 --- a/pegainfer-kv-cache/src/pool.rs +++ b/pegainfer-kv-cache/src/pool.rs @@ -1,4 +1,5 @@ use kvbm_logical::SequenceHash; +use kvbm_logical::blocks::{ImmutableBlock, MutableBlock}; use kvbm_logical::integrations::{DecodeOutcome, SchedulableSequence, ScheduleError}; use kvbm_logical::manager::BlockManager; use kvbm_logical::pools::BlockDuplicationPolicy; @@ -78,6 +79,14 @@ impl BlockPool { self.block_manager.total_blocks().saturating_sub(1) } + /// Evict every cached-but-unused block from the GPU prefix cache (drain the + /// inactive pool). In-use blocks are untouched. A cold-cache flush — and, + /// for the offload path, the way to force a prefix out of HBM so a + /// subsequent request must restore it from the CPU tier. 
+ pub fn evict_inactive(&self) { + self.block_manager.evict_inactive(); + } + /// `lora_name` scopes the prefix cache: blocks registered under one /// adapter (or the base model, `None`) never match a request running /// under a different adapter — the name is folded into the block-hash @@ -100,6 +109,139 @@ impl BlockPool { ); RequestKv { seq } } + + // ── KV-offload prefetch (CPU-tier load before prefill) ───────────── + + /// Resolve `prompt_tokens` against the GPU prefix cache *without* creating + /// a request, returning a [`PrefixProbe`] that holds the GPU-hit prefix + /// blocks alive so an async CPU-tier load can extend it. The connector + /// queries the probe's [`PrefixProbe::cpu_query_hashes`] against the host + /// tier, then [`reserve_loaded_blocks`](Self::reserve_loaded_blocks) + + /// load + [`commit_loaded_blocks`](Self::commit_loaded_blocks). + /// + /// `lora_name` must match the request's adapter — it salts the block + /// hashes, so probing under the wrong adapter would query unrelated keys. + pub fn probe_prefix(&self, prompt_tokens: Vec, lora_name: Option<&str>) -> PrefixProbe { + let num_input = prompt_tokens.len(); + let rkv = self.new_request(prompt_tokens, 0, lora_name); + let seq_hashes = rkv.seq.inner().sequence().all_sequence_hashes(); + // match_and_add_prefix leaves >=1 prompt token uncached, so a request + // can reuse at most this many leading blocks — the CPU load must not + // exceed it, or the trailing loaded block would never be re-matched. + let cacheable = num_input.saturating_sub(1) / self.block_size; + let gpu_guard = self.block_manager.match_blocks(&seq_hashes); + let gpu_hit = gpu_guard.len(); + PrefixProbe { + seq_hashes, + gpu_hit, + cacheable, + held: gpu_guard, + } + } + + /// Reserve `count` mutable blocks as the GPU destinations for a CPU→GPU + /// load. Returns `None` under block pressure (the caller then skips the + /// prefetch and prefills from scratch). 
The reservation's + /// [`LoadReservation::page_ids`] feed the connector's load; on completion + /// hand it to [`commit_loaded_blocks`](Self::commit_loaded_blocks). + pub fn reserve_loaded_blocks(&self, count: usize) -> Option { + let blocks = self.block_manager.allocate_blocks(count)?; + Some(LoadReservation { blocks }) + } + + /// Stage + register the freshly-loaded blocks under the probe's + /// continuation hashes (`seq_hashes[gpu_hit .. gpu_hit + reserved]`) and + /// fold them into the probe's held set, so a following + /// `new_request().match_and_add_prefix()` reuses the full GPU+CPU prefix. + /// + /// The probe keeps holding every registered block until the request + /// prefills, closing the eviction window between registration and re-match. + pub fn commit_loaded_blocks(&self, probe: &mut PrefixProbe, reservation: LoadReservation) { + let start = probe.gpu_hit; + for (i, block) in reservation.blocks.into_iter().enumerate() { + let hash = probe.seq_hashes[start + i]; + let complete = block + .stage(hash, self.block_size) + .expect("loaded block stage: block_size invariant violated"); + probe.held.push(self.block_manager.register_block(complete)); + } + } +} + +/// A prompt's prefix resolved against the GPU cache, ready to drive a CPU-tier +/// prefetch. Holds every GPU-hit (and, after commit, CPU-loaded) block so they +/// can't be evicted while the load is in flight and before the request prefills. +pub struct PrefixProbe { + /// Content hashes of every complete prompt block, in order (native form). + seq_hashes: Vec, + /// Length of the contiguous GPU-resident prefix. + gpu_hit: usize, + /// Reuse cap: blocks past this are never matched (the final chunk forwards). + cacheable: usize, + /// Strong refs keeping matched/loaded blocks resident until prefill. + held: Vec>, +} + +impl PrefixProbe { + /// Blocks already resident in GPU HBM (the existing prefix-cache hit). 
+ pub fn gpu_hit_blocks(&self) -> usize { + self.gpu_hit + } + + /// Total blocks this probe holds: the GPU-hit prefix plus any committed from + /// a CPU-tier load. They are already out of the free pool and become the + /// request's cached prefix at prefill, so admission credits them against the + /// request's block need (avoiding a double-count against `available_blocks`). + pub fn held_blocks(&self) -> usize { + self.held.len() + } + + /// Content hashes to query the CPU tier with: the blocks past the GPU hit, + /// capped at the reuse boundary. Empty when the GPU hit already covers + /// every reusable block (nothing to load — prefill normally). + pub fn cpu_query_hashes(&self) -> Vec> { + if self.gpu_hit >= self.cacheable { + return Vec::new(); + } + self.seq_hashes[self.gpu_hit..self.cacheable] + .iter() + .map(|h| sequence_hash_bytes(h).to_vec()) + .collect() + } +} + +/// An opaque strong pin on one registered KV block. While held it keeps the +/// block in the active pool (out of the free/inactive pools), so the physical +/// slot cannot be reallocated. Used to hold a block stable across an in-flight +/// async offload save; cheap to clone/drop (one `Arc` bump). See +/// [`RequestKv::assigned_block_guards`]. +/// +/// The inner guard is never read — it exists purely for its `Drop`, which +/// releases the pin. Holding the value *is* the contract. +pub struct KvBlockGuard(#[allow(dead_code)] ImmutableBlock<()>); + +/// GPU destination blocks reserved for a CPU→GPU load, consumed by +/// [`BlockPool::commit_loaded_blocks`] once the DMA lands. +pub struct LoadReservation { + blocks: Vec>, +} + +impl LoadReservation { + /// Physical page ids the connector loads the leased CPU blocks into, in + /// lease order (the i-th leased block lands in `page_ids()[i]`). + pub fn page_ids(&self) -> Vec { + self.blocks.iter().map(|b| b.block_id() as i32).collect() + } + + /// Number of reserved destination blocks. 
+ pub fn len(&self) -> usize { + self.blocks.len() + } + + /// True when no destinations were reserved. + pub fn is_empty(&self) -> bool { + self.blocks.is_empty() + } } /// Per-request KV state wrapping `SchedulableSequence`. @@ -229,12 +371,105 @@ impl RequestKv { pages.truncate(kv_tokens.div_ceil(self.seq.block_size())); pages } + + // ── KV offload bridge ────────────────────────────────────────────── + + /// Content hashes of every *full* prompt block, in prompt order. + /// + /// These are the keys the KV-offload connector queries the CPU tier with, + /// so they must be identical across any two requests sharing a prefix. + /// They are kvbm's lineage-based [`SequenceHash`], which is exactly that: + /// position + content + parent fragment, so block `i` of prompt `P` hashes + /// the same no matter which request computed it. + pub fn prompt_block_hashes(&self) -> Vec<[u8; 16]> { + self.seq + .inner() + .sequence() + .all_sequence_hashes() + .iter() + .map(sequence_hash_bytes) + .collect() + } + + /// `(page_id, content_hash)` for every block currently assigned to this + /// request, in prompt order. Drives the offload save once a block seals; + /// the first [`prefix_matched_blocks`](Self::prefix_matched_blocks) entries + /// are GPU-hit reuse (already resident) and are normally skipped. + pub fn assigned_block_hashes(&self) -> Vec<(i32, [u8; 16])> { + self.seq + .inner() + .assignments() + .assigned_iter() + .map(|(&id, block)| (id as i32, sequence_hash_bytes(&block.sequence_hash()))) + .collect() + } + + /// Strong pins for every block currently assigned to this request, aligned + /// 1:1 (same order) with [`assigned_block_hashes`](Self::assigned_block_hashes). + /// + /// An offload save's GPU→CPU copy runs asynchronously after the save is + /// submitted; holding the matching [`KvBlockGuard`] keeps that block out of + /// the free/inactive pool until the copy lands, so a later request can't be + /// allocated the same slot and overwrite it mid-copy. 
Drop the guard once + /// the save reports done. + pub fn assigned_block_guards(&self) -> Vec { + self.seq + .inner() + .assignments() + .assigned_iter() + .map(|(_, block)| KvBlockGuard(block.clone())) + .collect() + } + + /// Number of leading blocks reused from the GPU prefix cache. + pub fn prefix_matched_blocks(&self) -> usize { + self.seq.inner().prefix_matched_blocks() + } +} + +/// Pack a kvbm [`SequenceHash`] (lineage hash) into the 16-byte content key the +/// offload tier addresses blocks by. Big-endian for a stable on-wire ordering. +fn sequence_hash_bytes(hash: &SequenceHash) -> [u8; 16] { + hash.as_u128().to_be_bytes() } #[cfg(test)] mod tests { use super::*; + /// The offload CPU-tier query keys are `prompt_block_hashes`. The whole + /// load path is built on these being identical for any two requests that + /// share a prefix (and diverging the moment content does) — otherwise a + /// warm block saved by one request would never match the next. Guard it. + #[test] + fn prompt_block_hashes_stable_across_shared_prefix() { + let pool = BlockPool::new(16, 256).unwrap(); + let shared: Vec = (0..48).map(|i| 1000 + i).collect(); // 3 full blocks + let mut a_tokens = shared.clone(); + a_tokens.extend((0..16).map(|i| 7000 + i)); // 4th block diverges + let mut b_tokens = shared.clone(); + b_tokens.extend((0..16).map(|i| 9000 + i)); + + let a = pool.new_request(a_tokens, 8, None); + let b = pool.new_request(b_tokens, 8, None); + let ha = a.prompt_block_hashes(); + let hb = b.prompt_block_hashes(); + + assert_eq!(ha.len(), 4, "64 tokens / 16 = 4 full blocks"); + assert_eq!(hb.len(), 4); + assert_eq!(ha[..3], hb[..3], "shared prefix must hash identically"); + assert_ne!(ha[3], hb[3], "divergent block must hash differently"); + assert!(ha.iter().all(|h| *h != [0u8; 16]), "hashes are non-trivial"); + + // A different LoRA salt must poison the match — same tokens, new keys. 
+ let c = pool.new_request(shared, 8, Some("adapter-x")); + assert_ne!( + c.prompt_block_hashes()[0], + ha[0], + "salt (lora) must scope the prefix cache" + ); + } + /// kvbm's `schedule_decode` allocates the next generation block when the /// appended token fills the current block (`need = pending + 1`), so the /// raw `page_indices()` exceeds `ceil(kv_tokens / block_size)` at every diff --git a/pegainfer-kv-offload/Cargo.toml b/pegainfer-kv-offload/Cargo.toml new file mode 100644 index 00000000..2d8b9e3a --- /dev/null +++ b/pegainfer-kv-offload/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "pegainfer-kv-offload" +version = "0.1.0" +edition = "2024" + +[dependencies] +# Embedded in-process: pegaflow-core is the host/SSD/RDMA KV offload tier. +# Pinned to the upstream master commit that landed the two changes pegainfer +# needs: block_stride_bytes (#331) and the in-process load API (#333). Bump the +# rev when upstreaming further pegaflow changes; co-develop via a temporary path +# dep + PR, then re-pin here. +# default-features=false drops its `cuda-12` (which would pull a clashing +# cudarc/cuda-12080 selector) and `rdma` (no RDMA hardware needed for the +# dense GPU<->CPU path). The workspace cudarc already provides cuda-12090 + +# nvrtc, which is all pegaflow-core's code needs — it has no cfg(cuda-12) gates. +pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false } +pegainfer-kv-cache = { workspace = true } +cudarc = { workspace = true } +anyhow = { workspace = true } +half = { workspace = true } +log = { workspace = true } +tokio = { workspace = true } + +[lints] +workspace = true diff --git a/pegainfer-kv-offload/src/engine.rs b/pegainfer-kv-offload/src/engine.rs new file mode 100644 index 00000000..432b859f --- /dev/null +++ b/pegainfer-kv-offload/src/engine.rs @@ -0,0 +1,447 @@ +//! [`OffloadEngine`]: the in-process connector that moves KV blocks between +//! 
pegainfer's GPU paged cache and pegaflow's host/SSD tiers. +//! +//! It owns a [`PegaEngine`] plus a small tokio runtime to drive pegaflow's +//! async save/query, and translates pegainfer's page-first [`KvLayout`] into +//! pegaflow's per-layer strided registration. Block content hashes are opaque +//! `Vec` here — the caller (scheduler) derives them from kvbm sequence +//! hashes, so this layer never depends on the logical-cache hashing scheme. + +use std::sync::{Arc, Mutex}; + +use cudarc::driver::CudaStream; +use pegaflow_core::{ + EngineError, LayerSave, PegaEngine, PrefetchStatus, QueryLeaseId, StorageConfig, +}; +use pegainfer_kv_cache::KvBuffer; +use tokio::runtime::Runtime; +use tokio::sync::oneshot; +use tokio::task::JoinHandle; + +/// Single-GPU, single-rank topology. The dense Qwen3-4B path runs one offload +/// engine per executor rank, each owning one GPU's KV buffer. +const NAMESPACE: &str = "pegainfer"; +const TP_RANK: usize = 0; +const PP_RANK: usize = 0; +const TP_SIZE: usize = 1; +const WORLD_SIZE: usize = 1; + +/// bf16 KV cache: every layout stride is counted in elements, bytes are ×2. +const ELEM_SIZE: usize = std::mem::size_of::(); + +/// Guard the `block_on` entry points: tokio panics with an opaque message if +/// you block on a runtime from within any runtime. These methods are meant for +/// the synchronous scheduler thread — fail loud and specific if that's violated. +fn assert_outside_runtime(op: &str) { + debug_assert!( + tokio::runtime::Handle::try_current().is_err(), + "OffloadEngine::{op} drives the offload runtime with block_on and must be \ + called from a synchronous thread, never from within a tokio runtime" + ); +} + +/// Tuning knobs for a new [`OffloadEngine`]. +pub struct OffloadConfig { + /// Stable identifier shared across this engine's lifetime so prefix blocks + /// saved by one request are query-visible to the next. + pub instance_id: String, + /// CUDA device ordinal whose KV buffer this engine offloads. 
+ pub device_id: i32, + /// Host pinned-memory pool size in bytes (the CPU KV tier capacity). + pub pinned_pool_bytes: usize, + /// Worker threads for the embedded runtime that drives pegaflow's async + /// save/query. Two is plenty: save is fire-and-forget, query is a brief + /// memory-cache lookup. + pub runtime_threads: usize, +} + +impl OffloadConfig { + pub fn new(instance_id: impl Into, device_id: i32, pinned_pool_bytes: usize) -> Self { + Self { + instance_id: instance_id.into(), + device_id, + pinned_pool_bytes, + runtime_threads: 2, + } + } +} + +/// A query hit: how many prefix blocks pegaflow can return from its CPU tier, +/// and the lease that owns those blocks until [`OffloadEngine::load`] consumes +/// it. `num_blocks == 0` means a full miss and `lease` is `None`. +pub struct QueryHit { + pub lease: Option, + pub num_blocks: usize, +} + +/// In-flight handle for a CPU→GPU load submitted to pegaflow's worker. +/// +/// The load runs on pegaflow's GPU worker thread; this resolves when the DMA +/// completes. [`Self::poll`] keeps scheduler admission non-blocking; [`Self::wait`] +/// blocks for tests and non-pipelined callers. +pub struct LoadHandle { + rx: oneshot::Receiver>, +} + +impl LoadHandle { + /// Non-blocking check for a scheduler tick. `None` while still loading. + pub fn poll(&mut self) -> Option> { + match self.rx.try_recv() { + Ok(result) => Some(result), + Err(oneshot::error::TryRecvError::Empty) => None, + Err(oneshot::error::TryRecvError::Closed) => Some(Err(EngineError::Storage( + "load worker dropped reply".into(), + ))), + } + } + + /// Block the current thread until the load settles. + pub fn wait(self) -> Result<(), EngineError> { + self.rx + .blocking_recv() + .unwrap_or_else(|_| Err(EngineError::Storage("load worker dropped reply".into()))) + } +} + +/// Per-layer registration geometry derived once from a [`KvBuffer`]'s layout. 
+/// +/// Only `data_ptrs` and `size_bytes` differ per layer; the rest are the same +/// scalar broadcast across all layers (kept as vectors only to feed pegaflow's +/// one batched registration call). +struct Registration { + layer_names: Vec, + data_ptrs: Vec, + size_bytes: Vec, + num_blocks: Vec, + bytes_per_block: Vec, + kv_stride_bytes: Vec, + segments: Vec, + block_stride_bytes: Vec, +} + +impl Registration { + /// Map the fused page-first buffer to pegaflow's per-layer view. + /// + /// Each model layer registers as one pegaflow "layer". Within a page the + /// layout is K then V back-to-back (`layer_stride = 2·kv_block_len`), so a + /// layer's K and V are *contiguous* — one single segment of `layer_stride` + /// bytes copies both, and pegaflow's K/V-split path (which needs the two + /// segments set apart, `kv_stride > bytes_per_block`) does not apply here. + /// What is *not* contiguous is consecutive blocks of one layer: the fused + /// buffer interleaves all layers within a page, so they sit `page_stride` + /// apart. That gap (stride ≠ copy size) is exactly what `block_stride_bytes` + /// decouples. + fn from_buffer(buffer: &KvBuffer, stream: &CudaStream) -> Self { + let layout = buffer.layout(); + let num_blocks = buffer.num_blocks(); + let base_ptr = buffer.device_ptr(stream); + + // One block's copy unit for a layer = its whole [K|V] span in a page. 
+ let layer_bytes = layout.layer_stride * ELEM_SIZE; + let page_stride_bytes = layout.page_stride * ELEM_SIZE; + let total_bytes = num_blocks * page_stride_bytes; + + let n = layout.num_layers; + let mut reg = Registration { + layer_names: Vec::with_capacity(n), + data_ptrs: Vec::with_capacity(n), + size_bytes: Vec::with_capacity(n), + num_blocks: vec![num_blocks; n], + bytes_per_block: vec![layer_bytes; n], + kv_stride_bytes: vec![0; n], + segments: vec![1; n], + block_stride_bytes: vec![page_stride_bytes; n], + }; + for layer in 0..n { + let layer_off = layer * layer_bytes; + reg.layer_names.push(layer.to_string()); + reg.data_ptrs.push(base_ptr + layer_off as u64); + // The layer's region runs from its [K|V] base to the end of the + // buffer; bounds are validated against the strided last-block reach. + reg.size_bytes.push(total_bytes - layer_off); + } + reg + } +} + +/// In-process bridge from pegainfer's GPU KV cache to pegaflow's offload tiers. +/// +/// Dropping the engine drops its [`Runtime`], which abandons any in-flight +/// fire-and-forget [`Self::save`] tasks. That is acceptable: the host tier is a +/// cache, so a lost save only forfeits a future hit, never inference +/// correctness. Saves that must survive a handoff (eviction) use the synchronous +/// [`Self::save_blocking`] instead. +pub struct OffloadEngine { + engine: Arc, + runtime: Runtime, + instance_id: String, + device_id: i32, + /// Owned per-layer names; load borrows these as `&[&str]`. + layer_names: Vec, + /// In-flight fire-and-forget save tasks. [`Self::flush_saves`] awaits these + /// before draining the write pipeline, so a flush is a true barrier — the + /// detached D2H may not even have started when the caller flushes. + /// Finished handles are pruned on each [`Self::save`]. + pending_saves: Mutex>>, +} + +impl OffloadEngine { + /// Build the engine and register `buffer` as the GPU side of the offload. 
+ /// + /// `stream` must be the stream that owns `buffer` (used only to read its + /// base device address). pegaflow attaches the device's primary CUDA + /// context for its own worker transfers — the same context pegainfer runs + /// on — so the registered pointers are valid across both. + pub fn new( + config: OffloadConfig, + buffer: &KvBuffer, + stream: &CudaStream, + ) -> Result<Self, EngineError> { + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(config.runtime_threads.max(1)) + .enable_all() + .build() + .map_err(|e| EngineError::Storage(format!("offload runtime build: {e}")))?; + + let storage_config = StorageConfig::default(); + let engine = Arc::new(PegaEngine::new_with_config( + config.pinned_pool_bytes, + false, + storage_config, + )?); + + let reg = Registration::from_buffer(buffer, stream); + engine.register_context_layer_batch_strided( + &config.instance_id, + NAMESPACE, + config.device_id, + TP_RANK, + PP_RANK, + TP_SIZE, + WORLD_SIZE, + reg.layer_names.len(), + &reg.layer_names, + &reg.data_ptrs, + &reg.size_bytes, + &reg.num_blocks, + &reg.bytes_per_block, + &reg.kv_stride_bytes, + &reg.segments, + Some(&reg.block_stride_bytes), + )?; + + Ok(Self { + engine, + runtime, + instance_id: config.instance_id, + device_id: config.device_id, + layer_names: reg.layer_names, + pending_saves: Mutex::new(Vec::new()), + }) + } + + /// Fan one (block_id, hash) list across every layer — the device data + /// differs per layer, the ids and hashes don't. + fn build_saves(&self, block_ids: &[i32], block_hashes: &[Vec<u8>]) -> Vec<LayerSave> { + self.layer_names + .iter() + .map(|name| LayerSave { + layer_name: name.clone(), + block_ids: block_ids.to_vec(), + block_hashes: block_hashes.to_vec(), + }) + .collect() + } + + /// Save the named GPU blocks to the host tier — fire-and-forget. + /// + /// Best-effort by contract: the GPU→CPU copy runs on pegaflow's worker and + /// any failure (pinned pool full, copy error) is logged, never surfaced.
+ /// `block_hashes[i]` is the content hash of `block_ids[i]`; all layers share + /// the same (block_id, hash) pairing — only the device data differs. + /// + /// ORDERING CONTRACT: pegaflow's D2H runs on *its own* stream, with no + /// dependency on pegainfer's compute stream. The caller must therefore only + /// save blocks whose KV writes are already complete — i.e. call this after + /// the producing forward step has synchronized (block-seal time, which is + /// post-step-sync in the executor). Saving a block whose attention write is + /// still in flight reads torn data. This connector cannot enforce the + /// invariant (it does not own the compute stream); the wiring must uphold it. + /// + /// REUSE CONTRACT: the copy reads the GPU block asynchronously *after* this + /// returns, so the block must stay stable until the copy lands. `keep_alive` + /// is an opaque payload (e.g. the source blocks' allocator guards) held for + /// the lifetime of the spawned save and dropped only once it finishes — so + /// the caller's blocks cannot be evicted and overwritten under the in-flight + /// D2H (which would snapshot the wrong KV and persist it under the old hash). + /// Pass `()` only when the blocks are owned elsewhere for the whole save. + pub fn save( + &self, + block_ids: &[i32], + block_hashes: &[Vec], + keep_alive: G, + ) { + debug_assert_eq!(block_ids.len(), block_hashes.len()); + if block_ids.is_empty() { + return; + } + let saves = self.build_saves(block_ids, block_hashes); + let engine = Arc::clone(&self.engine); + let instance_id = self.instance_id.clone(); + let device_id = self.device_id; + let handle = self.runtime.spawn(async move { + if let Err(e) = engine + .batch_save_kv_blocks_from_ipc(&instance_id, TP_RANK, PP_RANK, device_id, saves) + .await + { + log::warn!("pegaflow save failed (best-effort): {e}"); + } + // Release the source-block pins only now the D2H has landed; before + // this point the blocks must not be reused (see REUSE CONTRACT). 
+ drop(keep_alive); + }); + // Track for `flush_saves`; prune the ones that already settled so the + // list stays bounded by the genuinely in-flight saves. + let mut pending = self.pending_saves.lock().expect("pending_saves poisoned"); + pending.retain(|h| !h.is_finished()); + pending.push(handle); + } + + /// Save the named GPU blocks and block until the GPU→CPU copy has captured + /// the data into the host tier (the insert may still be in flight; pair with + /// [`Self::flush_saves`] for cache visibility). + /// + /// The synchronous contract is what makes this safe at eviction handoff: the + /// GPU block can be reused the moment this returns. Errors surface, unlike + /// the fire-and-forget [`Self::save`]. The same compute-stream ORDERING + /// CONTRACT as [`Self::save`] applies: blocking waits on pegaflow's D2H, not + /// on pegainfer's compute stream, so the writes must already be complete. + pub fn save_blocking( + &self, + block_ids: &[i32], + block_hashes: &[Vec], + ) -> Result<(), EngineError> { + debug_assert_eq!(block_ids.len(), block_hashes.len()); + if block_ids.is_empty() { + return Ok(()); + } + assert_outside_runtime("save_blocking"); + let saves = self.build_saves(block_ids, block_hashes); + self.runtime + .block_on(self.engine.batch_save_kv_blocks_from_ipc( + &self.instance_id, + TP_RANK, + PP_RANK, + self.device_id, + saves, + )) + } + + /// Look up how long a prefix of `block_hashes` is resident in the CPU tier. + /// + /// Returns the hit-block count and a lease owning those blocks; pass the + /// lease to [`Self::load`] to copy them to GPU. `req_id` must be non-empty + /// and unique enough to scope an in-flight prefetch (the request id works). 
+ pub fn query(&self, req_id: &str, block_hashes: &[Vec]) -> Result { + if block_hashes.is_empty() { + return Ok(QueryHit { + lease: None, + num_blocks: 0, + }); + } + assert_outside_runtime("query"); + let status = self + .runtime + .block_on(self.engine.count_prefix_hit_blocks_with_prefetch( + &self.instance_id, + req_id, + block_hashes, + ))?; + + match status { + // No SSD/RDMA tier in the dense v1 path, so a prefetch never lands + // in flight; treat the rare `Loading` as a miss this tick. + PrefetchStatus::Loading => Ok(QueryHit { + lease: None, + num_blocks: 0, + }), + PrefetchStatus::Ready { blocks, .. } => { + if blocks.is_empty() { + return Ok(QueryHit { + lease: None, + num_blocks: 0, + }); + } + let num_blocks = blocks.len(); + let lease = self.engine.create_query_lease(&self.instance_id, blocks)?; + Ok(QueryHit { + lease: Some(lease), + num_blocks, + }) + } + } + } + + /// Copy the leased CPU blocks into the GPU blocks named by `dst_block_ids`, + /// across every registered layer. Returns a non-blocking [`LoadHandle`]. + /// + /// `dst_block_ids.len()` must equal the lease's block count (the + /// `num_blocks` from [`Self::query`]); pegaflow maps the i-th leased block + /// onto `dst_block_ids[i]` for each layer. + pub fn load( + &self, + lease: QueryLeaseId, + dst_block_ids: Vec, + ) -> Result { + let layer_refs: Vec<&str> = self.layer_names.iter().map(String::as_str).collect(); + let loads = [(lease, dst_block_ids)]; + let rx = self.engine.batch_load_kv_blocks_multi_layer_inproc( + &self.instance_id, + TP_RANK, + self.device_id, + &layer_refs, + &loads, + )?; + Ok(LoadHandle { rx }) + } + + /// Release a query lease without loading it. + /// + /// [`Self::query`] pins its hit blocks behind a lease until [`Self::load`] + /// consumes it. When the caller decides not to load (e.g. 
no GPU + /// destination blocks are free), it must release the lease here — a dropped + /// [`QueryLeaseId`] is an inert token, so without this the pinned host + /// blocks would sit unevictable until the lease's TTL expires. A no-op if + /// the lease was already consumed by a `load`. + pub fn release_query_lease(&self, lease: QueryLeaseId) { + self.engine.release_query_lease(&lease); + } + + /// Flush pending saves into the read cache so a following [`Self::query`] + /// can see them. A correctness barrier for tests and eviction handoff, not + /// a steady-state call. + /// + /// First awaits every in-flight fire-and-forget [`Self::save`] (their D2H + /// copy + write-pipeline submit), *then* drains the write pipeline — without + /// the first step a detached save that has not started yet would be missed. + pub fn flush_saves(&self) { + assert_outside_runtime("flush_saves"); + let handles: Vec> = { + let mut pending = self.pending_saves.lock().expect("pending_saves poisoned"); + pending.drain(..).collect() + }; + self.runtime.block_on(async { + for handle in handles { + let _ = handle.await; + } + self.engine.flush_saves().await; + }); + } + + /// Drop all resident CPU-tier blocks (test/eviction helper). Saved data in + /// a backing store would survive; the dense v1 path has none, so this + /// empties the CPU tier. + pub fn evict_all(&self) { + self.engine.cleanup_memory_cache(); + } +} diff --git a/pegainfer-kv-offload/src/lib.rs b/pegainfer-kv-offload/src/lib.rs new file mode 100644 index 00000000..3cf3a961 --- /dev/null +++ b/pegainfer-kv-offload/src/lib.rs @@ -0,0 +1,21 @@ +//! In-process KV cache offload bridge between pegainfer and pegaflow. +//! +//! pegainfer owns the GPU paged-KV (`pegainfer-kv-cache::KvBuffer`, page-first +//! layout) and the logical prefix cache (kvbm `BlockPool`). pegaflow owns the +//! deeper tiers (host pinned memory, SSD, RDMA). [`OffloadEngine`] is the +//! connector "brain" that moves blocks between them and decides when. +//! 
+//! Dense-attention v1 (Qwen3-4B): the GPU prefix hit stays native to kvbm's +//! `BlockPool`; this connector covers the CPU tier and stacks a CPU-hit prefix +//! on top of the GPU-hit prefix (both anchor at prefix 0, so the combined hit +//! is one contiguous prefix split at a single point — GPU→CPU→GPU interleaving +//! is deliberately excluded). Save is best-effort fire-and-forget; load is on +//! the critical path, strongly ordered, but never blocks admission — a request +//! polls its [`LoadHandle`] each scheduler tick. + +mod engine; + +pub use engine::{LoadHandle, OffloadConfig, OffloadEngine, QueryHit}; + +// Re-exported so callers name pegaflow's engine types through this bridge. +pub use pegaflow_core::{EngineError, PegaEngine, QueryLeaseId}; diff --git a/pegainfer-kv-offload/tests/cpu_roundtrip.rs b/pegainfer-kv-offload/tests/cpu_roundtrip.rs new file mode 100644 index 00000000..f02f0c67 --- /dev/null +++ b/pegainfer-kv-offload/tests/cpu_roundtrip.rs @@ -0,0 +1,155 @@ +//! GPU→CPU→GPU round-trip over a real page-first [`KvBuffer`]. +//! +//! Writes a distinct bit pattern into a set of source GPU blocks, offloads them +//! to pegaflow's host tier, evicts the GPU-side data implicitly by loading into +//! a *different* set of blocks, and checks the bytes match. This exercises the +//! whole connector — strided per-layer registration (`block_stride` ≠ copy +//! size) over each layer's contiguous K-then-V span, the blocking save and its +//! flush barrier, the prefix query, and the in-process oneshot load — on actual +//! device memory. If the layout math were wrong the loaded bytes would land in +//! the wrong layer/segment/block and the compare would fail. +//! +//! Requires a CUDA GPU; skipped from `--lib` unit runs.
+ +use cudarc::driver::{CudaContext, result}; +use half::bf16; +use pegainfer_kv_cache::KvBuffer; +use pegainfer_kv_offload::{OffloadConfig, OffloadEngine}; + +const NUM_LAYERS: usize = 4; +const NUM_KV_HEADS: usize = 2; +const HEAD_DIM: usize = 8; +const PAGE_SIZE: usize = 16; +const NUM_BLOCKS: usize = 32; + +/// Elements in one K (or V) segment of one block. +const SEGMENT_LEN: usize = PAGE_SIZE * NUM_KV_HEADS * HEAD_DIM; +const LAYER_STRIDE: usize = 2 * SEGMENT_LEN; +const PAGE_STRIDE: usize = NUM_LAYERS * LAYER_STRIDE; + +/// Deterministic, finite, varied pattern for one (logical block, layer, segment). +/// `logical` is the block's position in the saved hash list — load must restore +/// the i-th leased block onto the i-th destination, so the destination's bytes +/// must equal `pattern(i, ..)` regardless of which physical block held it. +fn pattern(logical: usize, layer: usize, segment: usize) -> Vec { + (0..SEGMENT_LEN) + .map(|e| { + let seed = (logical * 9973 + layer * 257 + segment * 131 + e * 7) % 4093; + bf16::from_f32(seed as f32 / 11.0 - 90.0) + }) + .collect() +} + +/// Byte address of (block, layer, segment)'s first element within the fused buffer. 
+fn segment_ptr(base: u64, block_id: usize, layer: usize, segment: usize) -> u64 { + let elem_off = block_id * PAGE_STRIDE + layer * LAYER_STRIDE + segment * SEGMENT_LEN; + base + (elem_off * std::mem::size_of::()) as u64 +} + +fn block_hash(logical: usize) -> Vec { + let mut h = vec![0xA5u8; 16]; + h[0] = logical as u8; + h[1] = (logical as u8).wrapping_mul(31).wrapping_add(7); + h +} + +#[test] +fn gpu_cpu_gpu_roundtrip_preserves_kv_bytes() { + let ctx = CudaContext::new(0).expect("cuda device 0"); + ctx.bind_to_thread().expect("bind ctx to test thread"); + let stream = ctx.default_stream(); + + let buffer = KvBuffer::new( + &stream, + NUM_LAYERS, + NUM_KV_HEADS, + HEAD_DIM, + PAGE_SIZE, + NUM_BLOCKS, + ) + .expect("alloc KvBuffer"); + // Sanity: our test-local geometry constants match the buffer's layout. + assert_eq!(buffer.layout().page_stride, PAGE_STRIDE); + assert_eq!(buffer.layout().kv_block_len, SEGMENT_LEN); + + let base = buffer.device_ptr(&stream); + + let src_blocks = [1usize, 2, 3]; + let dst_blocks = [10usize, 11, 12]; + let untouched_block = 20usize; + + // ── Fill source blocks with the per-(logical, layer, segment) pattern ── + for (logical, &block_id) in src_blocks.iter().enumerate() { + for layer in 0..NUM_LAYERS { + for segment in 0..2 { + let data = pattern(logical, layer, segment); + let dst = segment_ptr(base, block_id, layer, segment); + // SAFETY: dst lies inside the buffer (block < NUM_BLOCKS) and the + // slice is exactly one segment of bf16, the buffer's element type. 
+ unsafe { result::memcpy_htod_sync(dst, &data) }.expect("htod fill"); + } + } + } + stream.synchronize().expect("sync after fill"); + + // ── Build the offload engine (registers the fused buffer) ── + let engine = OffloadEngine::new( + OffloadConfig::new("roundtrip-test", 0, 64 * 1024 * 1024), + &buffer, + &stream, + ) + .expect("build OffloadEngine"); + + let hashes: Vec> = (0..src_blocks.len()).map(block_hash).collect(); + let src_ids: Vec = src_blocks.iter().map(|&b| b as i32).collect(); + + // ── Save GPU→CPU (blocking capture) and make the writes cache-visible ── + engine.save_blocking(&src_ids, &hashes).expect("save"); + engine.flush_saves(); + + // ── Query the CPU tier: the full 3-block prefix must be resident ── + let hit = engine.query("roundtrip-req", &hashes).expect("query"); + assert_eq!( + hit.num_blocks, 3, + "all three saved blocks should hit the CPU tier" + ); + let lease = hit.lease.expect("a hit returns a lease"); + + // ── Load CPU→GPU into a *different* set of blocks ── + let dst_ids: Vec = dst_blocks.iter().map(|&b| b as i32).collect(); + engine + .load(lease, dst_ids) + .expect("submit load") + .wait() + .expect("load completes"); + stream.synchronize().expect("sync after load"); + + // ── Verify each destination block holds the matching logical pattern ── + for (logical, &block_id) in dst_blocks.iter().enumerate() { + for layer in 0..NUM_LAYERS { + for segment in 0..2 { + let expected = pattern(logical, layer, segment); + let mut got = vec![bf16::ZERO; SEGMENT_LEN]; + let src = segment_ptr(base, block_id, layer, segment); + // SAFETY: src is one in-bounds segment of bf16. 
+ unsafe { result::memcpy_dtoh_sync(&mut got, src) }.expect("dtoh verify"); + let expected_bits: Vec = expected.iter().map(|v| v.to_bits()).collect(); + let got_bits: Vec = got.iter().map(|v| v.to_bits()).collect(); + assert_eq!( + got_bits, expected_bits, + "dst block {block_id} layer {layer} segment {segment} \ + must restore logical block {logical}'s bytes" + ); + } + } + } + + // ── Negative control: a block we never loaded stays zero ── + let mut zero = vec![bf16::from_f32(1.0); SEGMENT_LEN]; + let src = segment_ptr(base, untouched_block, 0, 0); + unsafe { result::memcpy_dtoh_sync(&mut zero, src) }.expect("dtoh untouched"); + assert!( + zero.iter().all(|v| v.to_bits() == 0), + "an unloaded block must remain zeroed — load must not scribble outside its destinations" + ); +} diff --git a/pegainfer-qwen3-4b/Cargo.toml b/pegainfer-qwen3-4b/Cargo.toml index 65d58a82..5a017c4c 100644 --- a/pegainfer-qwen3-4b/Cargo.toml +++ b/pegainfer-qwen3-4b/Cargo.toml @@ -15,6 +15,7 @@ comfy-table = { workspace = true, optional = true } crossbeam-channel = { workspace = true } cudarc = { workspace = true } pegainfer-kv-cache = { workspace = true } +pegainfer-kv-offload = { workspace = true } fastrace = { workspace = true } half = { workspace = true } hex = { workspace = true, optional = true } diff --git a/pegainfer-qwen3-4b/src/executor.rs b/pegainfer-qwen3-4b/src/executor.rs index 10358a44..47b7caf4 100644 --- a/pegainfer-qwen3-4b/src/executor.rs +++ b/pegainfer-qwen3-4b/src/executor.rs @@ -4,16 +4,19 @@ use std::thread; use anyhow::Result; use crossbeam_channel as channel; -use crate::Qwen3LoraOptions; use crate::batch_decode_buffers::{BATCH_BUCKETS, BatchDecodeBuffers}; use crate::config::{Config, TensorParallelConfig}; use crate::weights::{ModelRuntimeConfig, Qwen3Model}; +use crate::{Qwen3LoraOptions, Qwen3OffloadOptions}; use pegainfer_core::engine::{LoadLoraAdapterRequest, TokenLogprob, UnloadLoraAdapterRequest}; use pegainfer_core::kv_pool::KvLayout; use 
pegainfer_core::ops; use pegainfer_core::sampler::SamplingParams; use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; -use pegainfer_kv_cache::{KvBuffer, KvCacheManager, KvView}; +use pegainfer_kv_cache::{ + KvBlockGuard, KvBuffer, KvCacheManager, KvView, LoadReservation, PrefixProbe, +}; +use pegainfer_kv_offload::{LoadHandle, OffloadConfig, OffloadEngine}; #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)] pub struct RequestId(pub(crate) u64); @@ -449,6 +452,41 @@ pub(crate) trait ModelExecutor: Send { fn list_lora_adapters(&self) -> Vec { Vec::new() } + + // ── KV-offload prefetch hooks (no-op unless offload is enabled) ───── + + /// Offer a freshly-submitted request for async CPU-tier KV prefetch. + /// Returns `true` if a load is now in flight and the scheduler must park + /// the request until [`Self::drain_ready_prefetch`] reports it ready. + fn begin_kv_prefetch( + &mut self, + _request_id: RequestId, + _prompt_tokens: &[u32], + _lora_adapter: Option<&str>, + ) -> bool { + false + } + + /// Non-blocking sweep: request ids whose prefetch just settled (now + /// prefill-eligible). + fn drain_ready_prefetch(&mut self) -> Vec { + Vec::new() + } + + /// Block until at least one in-flight prefetch settles (idle-only), then + /// sweep the rest. + fn wait_ready_prefetch(&mut self) -> Vec { + Vec::new() + } + + /// Blocks `request_id` already holds via a settled prefetch (its restored + /// prefix). These were taken out of the free pool for this request and + /// become its cached prefill prefix, so admission credits them against the + /// request's block need to avoid double-counting. Zero unless a prefetch + /// has committed for `request_id`. 
+ fn prefetched_blocks(&self, _request_id: RequestId) -> usize { + 0 + } } struct Qwen3ExecutorMetadata { @@ -466,10 +504,42 @@ pub struct Qwen3Executor { loaded_lora_adapters: HashSet, prefix_cache_enabled: bool, lora_options: Qwen3LoraOptions, + /// pegaflow KV-offload bridge; `None` unless offload is opted in on the + /// single-GPU path. Drives both the SAVE hook and the async LOAD prefetch. + offload: Option, + /// Per-request count of sealed blocks already saved to the host tier, so + /// each step only saves blocks that newly sealed. Initialized to the + /// GPU-hit prefix (already resident) on first save. + saved_cursor: HashMap, + /// In-flight CPU→GPU prefetches keyed by request, parked until their load + /// settles and the blocks register into the prefix cache. + prefetch: HashMap, + /// Offload pure-L2 mode. When set, completed blocks are not kept for + /// cross-request HBM reuse: the prefetch probe drains the inactive pool + /// first, so every probe sees `gpu_hit == 0` and the whole cacheable prefix + /// is restored from the host tier. This is what `--no-prefix-cache` means + /// once offload is on (the L2 restore still rides on `match_and_add_prefix`, + /// so prefix matching itself stays enabled). Set via + /// [`Self::set_no_prefix_cache`]. + l1_retention_disabled: bool, +} + +/// One request's in-flight CPU-tier KV prefetch. +/// +/// Holds the destination blocks (via `probe`/`reservation`) and the load handle +/// so the scheduler can poll completion non-blockingly. Once the load settles, +/// the reservation is committed (blocks staged + registered) and only `probe` +/// remains, holding the GPU+CPU prefix resident until the request prefills. +struct PrefetchState { + probe: PrefixProbe, + /// `Some` until the load lands and the blocks are committed. + reservation: Option, + /// `Some` while the DMA is in flight; `None` once it has settled. 
+ handle: Option, } impl Qwen3Executor { - pub(crate) fn single(model: Qwen3Model) -> Result { + pub(crate) fn single(model: Qwen3Model, offload_opts: &Qwen3OffloadOptions) -> Result { let budget = model.kv_budget(); let kv_mgr = KvCacheManager::new( &model.device_ctx().stream, @@ -485,6 +555,9 @@ impl Qwen3Executor { config: model.config().clone(), }; let kv_buffer = kv_mgr.buffer().clone(); + // Build the offload engine while the model's stream is still in hand + // (it moves into the RankWorker below). Registers the fused KV buffer. + let offload = build_offload(offload_opts, &kv_mgr, model.device_ctx())?; let total_blocks = kv_mgr.pool().total_blocks(); let padding_block_id = kv_mgr.pool().padding_block_id(); Ok(Self { @@ -499,6 +572,10 @@ impl Qwen3Executor { loaded_lora_adapters: HashSet::new(), prefix_cache_enabled: true, lora_options: Qwen3LoraOptions::default(), + offload, + saved_cursor: HashMap::new(), + prefetch: HashMap::new(), + l1_retention_disabled: false, }) } @@ -512,6 +589,7 @@ impl Qwen3Executor { enable_cuda_graph, device_ordinals, Qwen3LoraOptions::default(), + Qwen3OffloadOptions::disabled(), ) } @@ -520,12 +598,19 @@ impl Qwen3Executor { enable_cuda_graph: bool, device_ordinals: &[usize], lora_options: Qwen3LoraOptions, + offload_options: Qwen3OffloadOptions, ) -> Result { let lora_options = lora_options.validate()?; anyhow::ensure!( !device_ordinals.is_empty(), "Qwen3 executor requires at least one device" ); + anyhow::ensure!( + !offload_options.enabled || device_ordinals.len() == 1, + "KV offload is only supported on the single-GPU path (tensor parallel \ + shards KV per rank); got {} devices", + device_ordinals.len() + ); if device_ordinals.len() == 1 { let model = Qwen3Model::from_safetensors_with_runtime( model_path, @@ -537,7 +622,7 @@ impl Qwen3Executor { max_lora_rank: lora_options.max_lora_rank, }, )?; - let mut executor = Self::single(model)?; + let mut executor = Self::single(model, &offload_options)?; executor.lora_options = 
lora_options; return Ok(executor); } @@ -635,6 +720,11 @@ impl Qwen3Executor { loaded_lora_adapters: HashSet::new(), prefix_cache_enabled: true, lora_options, + // Offload is single-GPU only (asserted above); never built here. + offload: None, + saved_cursor: HashMap::new(), + prefetch: HashMap::new(), + l1_retention_disabled: false, }) } @@ -677,6 +767,145 @@ impl Qwen3Executor { self.prefix_cache_enabled = enabled; } + /// vLLM-style `--no-prefix-cache`. Behaviour depends on whether offload is + /// active: + /// * **No offload** — classic: disable prefix matching outright, so every + /// prefill recomputes the full prompt. + /// * **With offload** — pure-L2 mode: keep matching on (the host-tier + /// restore registers blocks and relies on `match_and_add_prefix` to pick + /// them up) but stop retaining completed blocks in HBM, so no request + /// ever serves its prefix from a cross-request L1 hit. Every reuse then + /// comes from the host tier, which is the point of the L2 benchmark. + /// + /// A resident HBM block and its host-tier copy share one content hash, so + /// the cache cannot be told to prefer L2 for a block still in HBM — the only + /// way to force the bytes from L2 is to not keep the HBM copy around. + pub fn set_no_prefix_cache(&mut self, on: bool) { + if self.offload.is_some() { + self.l1_retention_disabled = on; + } else { + self.prefix_cache_enabled = !on; + } + } + + /// Whether KV offload is active on this executor. + pub fn offload_enabled(&self) -> bool { + self.offload.is_some() + } + + /// Flush pending offload saves into the host read cache so a following + /// query can see them. A persistence barrier for handoff and tests; no-op + /// without offload. + pub fn flush_offload_saves(&self) { + if let Some(offload) = &self.offload { + offload.flush_saves(); + } + } + + /// Drop every cached-but-unused GPU prefix block. 
With offload on, this + /// forces a cold prefix to be restored from the host tier on its next + /// request (rather than served from HBM). + pub fn evict_cached_blocks(&self) { + self.kv_mgr.pool().evict_inactive(); + } + + /// Begin an async CPU-tier KV prefetch for `request_id`; see the + /// [`ModelExecutor`] hook. Public so admission drivers and tests can park a + /// request on its load. Returns `true` when a load is in flight. + pub fn begin_kv_prefetch( + &mut self, + request_id: RequestId, + prompt_tokens: &[u32], + lora_adapter: Option<&str>, + ) -> bool { + ::begin_kv_prefetch(self, request_id, prompt_tokens, lora_adapter) + } + + /// Block until at least one in-flight prefetch settles, then sweep the + /// rest; returns the settled request ids (now prefill-eligible). + pub fn wait_ready_prefetch(&mut self) -> Vec { + ::wait_ready_prefetch(self) + } + + // ── KV-offload SAVE ──────────────────────────────────────────────── + + /// Save every block that sealed since this request's last save to the host + /// tier (fire-and-forget). Safe to call right after `apply_prefill`/ + /// `apply_decode`: the producing step's token read-back has already + /// synchronized the compute stream, so the sealed KV is fully written. + fn save_sealed_blocks(&mut self, request_id: RequestId) { + if self.offload.is_none() { + return; + } + let Some(rkv) = self.request_kvs.get(&request_id) else { + return; + }; + // `assigned_block_hashes` lists only sealed (registered) blocks; the + // partial tail block has no hash and never appears here. 
+ let assigned = rkv.assigned_block_hashes(); + let prefix_matched = rkv.prefix_matched_blocks(); + let cursor = self + .saved_cursor + .entry(request_id) + .or_insert(prefix_matched); + if assigned.len() <= *cursor { + return; + } + let fresh = &assigned[*cursor..]; + let block_ids: Vec = fresh.iter().map(|(id, _)| *id).collect(); + let block_hashes: Vec> = fresh.iter().map(|(_, h)| h.to_vec()).collect(); + // Pin exactly the blocks being saved (aligned 1:1 with `assigned`) for + // the duration of the async D2H, so a finished request can't hand the + // slot to a new request that overwrites it before the copy lands. + let pins: Vec = rkv + .assigned_block_guards() + .into_iter() + .skip(*cursor) + .collect(); + *cursor = assigned.len(); + self.offload + .as_ref() + .expect("offload present") + .save(&block_ids, &block_hashes, pins); + } + + // ── KV-offload LOAD (async CPU-tier prefetch) ────────────────────── + // The trait-facing prefetch hooks (`begin_kv_prefetch`, + // `drain_ready_prefetch`, `wait_ready_prefetch`, `has_pending_prefetch`) + // live in the `ModelExecutor` impl below; `settle_prefetch` is their shared + // helper. + + /// Finalize one prefetch whose load returned `result`. On success the + /// reserved blocks are staged + registered (held by the probe until the + /// request prefills); on failure the state is dropped so the request + /// prefills from scratch. 
+ fn settle_prefetch( + &mut self, + id: RequestId, + result: Result<(), pegainfer_kv_offload::EngineError>, + ) { + if let Some(st) = self.prefetch.get_mut(&id) { + st.handle = None; + } + match result { + Ok(()) => { + let reservation = self + .prefetch + .get_mut(&id) + .and_then(|st| st.reservation.take()) + .expect("reservation present until commit"); + let st = self.prefetch.get_mut(&id).expect("prefetch present"); + self.kv_mgr + .pool() + .commit_loaded_blocks(&mut st.probe, reservation); + } + Err(e) => { + log::warn!("KV offload load failed for {id:?} (prefill from scratch): {e}"); + self.prefetch.remove(&id); + } + } + } + fn wait_for_step_ack( pending: Vec>>, op_name: &'static str, @@ -712,6 +941,33 @@ impl Qwen3Executor { } } +/// Build the KV-offload engine for the single-GPU path, or `None` when offload +/// is disabled. Registers the fused KV buffer with pegaflow against the model's +/// device/stream — must be called while that stream is still owned by the model +/// (before it moves into the `RankWorker`). 
+fn build_offload( + opts: &Qwen3OffloadOptions, + kv_mgr: &KvCacheManager, + ctx: &DeviceContext, +) -> Result> { + if !opts.enabled { + return Ok(None); + } + let device_id = ctx.device_ordinal as i32; + let config = OffloadConfig::new( + format!("qwen3-4b-dev{device_id}"), + device_id, + opts.pinned_pool_bytes, + ); + let engine = OffloadEngine::new(config, kv_mgr.buffer(), &ctx.stream) + .map_err(|e| anyhow::anyhow!("KV offload engine init failed: {e}"))?; + log::info!( + "KV offload enabled on device {device_id} ({} MiB host tier)", + opts.pinned_pool_bytes >> 20 + ); + Ok(Some(engine)) +} + fn ensure_lora_capacity( loaded_lora_adapters: &HashSet, lora_name: &str, @@ -760,13 +1016,146 @@ impl ModelExecutor for Qwen3Executor { self.metadata.stop_token_ids.contains(&token_id) } + fn prefetched_blocks(&self, request_id: RequestId) -> usize { + self.prefetch + .get(&request_id) + .map(|st| st.probe.held_blocks()) + .unwrap_or(0) + } + fn drop_request(&mut self, request_id: RequestId) -> Result<()> { // Remove and drop — RAII on SchedulableSequence's block guards - // returns all allocated blocks regardless of lifecycle state. + // returns all allocated blocks regardless of lifecycle state. The same + // RAII frees any parked prefetch's reserved/held blocks. self.request_kvs.remove(&request_id); + // A parked prefetch may still have a load in flight: pegaflow's worker + // is writing the reserved GPU blocks (H2D). Dropping the reservation now + // frees those physical pages for immediate reuse while the DMA keeps + // landing on them — silent KV corruption, the load-side mirror of the + // SAVE keep-alive pin. Block until the copy finishes before the + // reservation drops. The scheduler is a dedicated synchronous thread, so + // this brief wait costs nothing it could spend elsewhere. 
+ if let Some(mut state) = self.prefetch.remove(&request_id) { + if let Some(handle) = state.handle.take() { + let _ = handle.wait(); + } + } + self.saved_cursor.remove(&request_id); Ok(()) } + fn begin_kv_prefetch( + &mut self, + request_id: RequestId, + prompt_tokens: &[u32], + lora_adapter: Option<&str>, + ) -> bool { + let Some(offload) = self.offload.as_ref() else { + return false; + }; + if !self.prefix_cache_enabled { + return false; + } + if self.l1_retention_disabled { + // Pure-L2 mode: drop any cross-request HBM retention so the probe + // sees gpu_hit == 0 and queries the whole cacheable prefix from the + // host tier. Only inactive (completed, unheld) blocks are drained — + // the current request holds nothing yet, and in-flight prefetches + // keep their reserved blocks, so this never touches live KV. + self.kv_mgr.pool().evict_inactive(); + } + let probe = self + .kv_mgr + .pool() + .probe_prefix(prompt_tokens.to_vec(), lora_adapter); + let query_hashes = probe.cpu_query_hashes(); + if query_hashes.is_empty() { + return false; + } + let hit = match offload.query(&request_id.0.to_string(), &query_hashes) { + Ok(hit) => hit, + Err(e) => { + log::warn!("KV offload query failed for {request_id:?} (skipping): {e}"); + return false; + } + }; + let (Some(lease), num_blocks) = (hit.lease, hit.num_blocks) else { + return false; // miss + }; + let Some(reservation) = self.kv_mgr.pool().reserve_loaded_blocks(num_blocks) else { + // Block pressure: release the lease so its pinned host blocks aren't + // held for the full lease TTL, and prefill from scratch rather than + // stall. 
+ offload.release_query_lease(lease); + return false; + }; + let page_ids = reservation.page_ids(); + let handle = match offload.load(lease, page_ids) { + Ok(handle) => handle, + Err(e) => { + log::warn!("KV offload load submit failed for {request_id:?} (skipping): {e}"); + // `load` consumes the lease only past its early validation; a + // submit error may leave it pinned, so release it (no-op if it + // was already consumed). + offload.release_query_lease(lease); + return false; + } + }; + self.prefetch.insert( + request_id, + PrefetchState { + probe, + reservation: Some(reservation), + handle: Some(handle), + }, + ); + true + } + + fn drain_ready_prefetch(&mut self) -> Vec { + let ids: Vec = self.prefetch.keys().copied().collect(); + let mut done = Vec::new(); + for id in ids { + let poll = match self.prefetch.get_mut(&id).and_then(|st| st.handle.as_mut()) { + Some(handle) => handle.poll(), + None => continue, // already settled, awaiting prefill + }; + if let Some(result) = poll { + self.settle_prefetch(id, result); + done.push(id); + } + } + done + } + + fn wait_ready_prefetch(&mut self) -> Vec { + let mut done = Vec::new(); + if let Some(id) = self + .prefetch + .iter() + .find(|(_, st)| st.handle.is_some()) + .map(|(id, _)| *id) + { + let handle = self + .prefetch + .get_mut(&id) + .and_then(|st| st.handle.take()) + .expect("in-flight handle present"); + let result = handle.wait(); + self.settle_prefetch(id, result); + // `settle_prefetch` clears the handle, so the drain below skips it; + // record it here as the one we blocked on. + done.push(id); + } + // Sweep any others that completed concurrently. + for id in self.drain_ready_prefetch() { + if !done.contains(&id) { + done.push(id); + } + } + done + } + fn execute_prefill(&mut self, plan: PrefillPlan<'_>) -> Result { // 1. 
Create RequestKvs, reuse cached prefix blocks, schedule the rest let mut requests = plan.requests.to_vec(); @@ -786,6 +1175,10 @@ impl ModelExecutor for Qwen3Executor { anyhow::anyhow!("schedule_prefill failed for {:?}: {e}", req.request_id) })?; self.request_kvs.insert(req.request_id, rkv); + // match_and_add_prefix above already absorbed any CPU-prefetched + // blocks (now held by the request's sequence), so release the + // prefetch's separate hold. + self.prefetch.remove(&req.request_id); } // 2. Build KvViews (seq_len = cached prefix + new suffix) @@ -819,6 +1212,10 @@ impl ModelExecutor for Qwen3Executor { .expect("request must exist after prefill"); rkv.apply_prefill(req_result.first_token, self.kv_mgr.pool())?; } + // 5. Offload the blocks this prefill just sealed (post-step-sync). + for req_result in &result.requests { + self.save_sealed_blocks(req_result.request_id); + } Ok(result) } @@ -866,6 +1263,10 @@ impl ModelExecutor for Qwen3Executor { .expect("request must exist after decode"); rkv.apply_decode(req_result.token, self.kv_mgr.pool())?; } + // 5. Offload any block this decode step just sealed (post-step-sync). + for req_result in &result.requests { + self.save_sealed_blocks(req_result.request_id); + } Ok(result) } @@ -888,6 +1289,7 @@ impl ModelExecutor for Qwen3Executor { anyhow::anyhow!("schedule_prefill failed for {:?}: {e}", req.request_id) })?; self.request_kvs.insert(req.request_id, rkv); + self.prefetch.remove(&req.request_id); } // Schedule decode for active requests @@ -945,6 +1347,13 @@ impl ModelExecutor for Qwen3Executor { .expect("request must exist after unified decode"); rkv.apply_decode(req_result.token, self.kv_mgr.pool())?; } + // 5. Offload sealed blocks from both halves (post-step-sync). 
+ for req_result in &result.prefill_requests { + self.save_sealed_blocks(req_result.request_id); + } + for req_result in &result.decode_requests { + self.save_sealed_blocks(req_result.request_id); + } Ok(result) } diff --git a/pegainfer-qwen3-4b/src/lib.rs b/pegainfer-qwen3-4b/src/lib.rs index 48c20af0..83106f68 100644 --- a/pegainfer-qwen3-4b/src/lib.rs +++ b/pegainfer-qwen3-4b/src/lib.rs @@ -63,6 +63,45 @@ impl Default for Qwen3LoraOptions { } } +/// KV-offload (pegaflow) opt-in for the single-GPU Qwen3 path. +/// +/// Disabled by default — the existing GPU-only prefix cache is unchanged. +/// When enabled, the executor saves sealed KV blocks to pegaflow's host tier +/// and prefetches CPU-resident prefixes back into HBM before prefill, so a +/// prompt that has fallen out of the GPU cache still skips recompute. Only the +/// single-GPU topology is supported (tensor parallel shards KV per rank). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Qwen3OffloadOptions { + pub enabled: bool, + /// Host pinned-memory pool size (the CPU KV-tier capacity), in bytes. + pub pinned_pool_bytes: usize, +} + +impl Qwen3OffloadOptions { + /// 8 GiB host tier — a few thousand dense Qwen3-4B blocks. + pub const DEFAULT_PINNED_POOL_BYTES: usize = 8 << 30; + + pub fn disabled() -> Self { + Self { + enabled: false, + pinned_pool_bytes: 0, + } + } + + pub fn enabled(pinned_pool_bytes: usize) -> Self { + Self { + enabled: true, + pinned_pool_bytes, + } + } +} + +impl Default for Qwen3OffloadOptions { + fn default() -> Self { + Self::disabled() + } +} + /// Low-level Qwen3 execution interface. 
/// /// This is the production phase boundary used by the Qwen3 scheduler and by @@ -99,6 +138,24 @@ pub fn probe_model(model_path: &Path) -> Result> { } pub fn start_engine(model_path: &Path, options: EngineLoadOptions) -> Result { + start_engine_with_offload(model_path, options, Qwen3OffloadOptions::disabled(), false) +} + +/// Like [`start_engine`] but with pegaflow KV offload (single-GPU only). The +/// host tier persists sealed KV blocks and serves CPU-resident prefixes back +/// into HBM before prefill. +/// +/// `no_prefix_cache` is the vLLM-style switch (see +/// [`Qwen3Executor::set_no_prefix_cache`](runtime::Qwen3Executor::set_no_prefix_cache)): +/// without offload it disables prefix matching outright; with offload it keeps +/// the host tier but stops cross-request HBM reuse, so every prefix is served +/// from L2 — the pure-L2 benchmark mode. +pub fn start_engine_with_offload( + model_path: &Path, + options: EngineLoadOptions, + offload_options: Qwen3OffloadOptions, + no_prefix_cache: bool, +) -> Result { let EngineLoadOptions { enable_cuda_graph, device_ordinals, @@ -108,13 +165,22 @@ pub fn start_engine(model_path: &Path, options: EngineLoadOptions) -> Result Result { let EngineLoadOptions { enable_cuda_graph, @@ -131,5 +197,7 @@ pub fn start_engine_with_lora_control( &device_ordinals, seed, lora_options.validate()?, + offload_options, + no_prefix_cache, ) } diff --git a/pegainfer-qwen3-4b/src/scheduler.rs b/pegainfer-qwen3-4b/src/scheduler.rs index a4d87ce0..67b09a50 100644 --- a/pegainfer-qwen3-4b/src/scheduler.rs +++ b/pegainfer-qwen3-4b/src/scheduler.rs @@ -18,8 +18,8 @@ use rand::SeedableRng; use rand::rngs::StdRng; use tokio::sync::mpsc; -use crate::Qwen3LoraOptions; use crate::executor::{ModelExecutor, Qwen3Executor, RequestId}; +use crate::{Qwen3LoraOptions, Qwen3OffloadOptions}; use pegainfer_core::engine::{ EngineCommand, EngineControlRequest, EngineHandle, GenerateRequest, TokenEvent, }; @@ -54,6 +54,10 @@ pub(super) struct PendingRequest 
{ pub(super) token_tx: mpsc::UnboundedSender, pub(super) logprobs: usize, pub(super) echo: bool, + /// Whether this request has already been offered to async KV prefetch. + /// Offered at most once; a no-hit offer leaves the request in the normal + /// admission flow with this set so it isn't re-probed every tick. + pub(super) prefetch_offered: bool, } impl PendingRequest { @@ -67,6 +71,7 @@ impl PendingRequest { token_tx: req.token_tx, logprobs: req.logprobs, echo: req.echo, + prefetch_offered: false, } } } @@ -78,8 +83,17 @@ pub(crate) fn start_qwen3( enable_cuda_graph: bool, device_ordinals: &[usize], seed: u64, + offload_options: Qwen3OffloadOptions, + no_prefix_cache: bool, ) -> Result { - let executor = Qwen3Executor::from_runtime(model_path, enable_cuda_graph, device_ordinals)?; + let mut executor = Qwen3Executor::from_runtime_with_lora_options( + model_path, + enable_cuda_graph, + device_ordinals, + Qwen3LoraOptions::default(), + offload_options, + )?; + executor.set_no_prefix_cache(no_prefix_cache); Ok(start_with_executor(executor, seed)) } @@ -89,13 +103,17 @@ pub(crate) fn start_qwen3_with_lora_control( device_ordinals: &[usize], seed: u64, lora_options: Qwen3LoraOptions, + offload_options: Qwen3OffloadOptions, + no_prefix_cache: bool, ) -> Result { - let executor = Qwen3Executor::from_runtime_with_lora_options( + let mut executor = Qwen3Executor::from_runtime_with_lora_options( model_path, enable_cuda_graph, device_ordinals, lora_options, + offload_options, )?; + executor.set_no_prefix_cache(no_prefix_cache); Ok(start_with_executor_with_lora_control(executor, seed)) } @@ -131,6 +149,90 @@ where EngineHandle::new_with_command_channel(command_tx) } +// ── KV-offload prefetch admission helpers ──────────────────────────────── + +/// Move requests whose async CPU-tier prefetch just settled from `loading` +/// back to the front of `deferred` — their KV is hot, so admit them first. 
+fn reclaim_ready_prefetch( + executor: &mut E, + deferred: &mut Vec, + loading: &mut Vec, +) { + promote_ready(executor.drain_ready_prefetch(), deferred, loading); +} + +/// Offer each not-yet-offered `deferred` request to async CPU-tier prefetch, +/// moving the ones that start loading out of `deferred` into `loading`. A +/// request that doesn't start a load (pure GPU hit, miss, or block pressure) +/// stays in `deferred`, flagged so it isn't re-probed next tick. +/// +/// Echo requests are never offered: their prefill forwards the whole prompt to +/// recover prompt logprobs and so skips `match_and_add_prefix` (see +/// `execute_prefill`). Prefetched blocks would never be matched/reused — they +/// would only park restored KV that admission credits but prefill can't spend, +/// starving the request under tight budgets. Leaving `prefetch_offered` unset +/// for echo is harmless: the `!req.echo` guard keeps them from being probed. +fn offer_prefetch( + executor: &mut E, + deferred: &mut Vec, + loading: &mut Vec, +) { + let mut keep = Vec::with_capacity(deferred.len()); + for mut req in deferred.drain(..) { + if !req.prefetch_offered && !req.echo { + req.prefetch_offered = true; + if executor.begin_kv_prefetch( + req.request_id, + &req.prompt_tokens, + req.lora_adapter.as_deref(), + ) { + loading.push(req); + continue; + } + } + keep.push(req); + } + *deferred = keep; +} + +/// Block until at least one in-flight prefetch settles, then promote the +/// settled requests to `deferred`. Called only when the scheduler is otherwise +/// idle, so blocking on the DMA costs nothing. 
+fn block_on_loading( + executor: &mut E, + deferred: &mut Vec, + loading: &mut Vec, +) { + promote_ready(executor.wait_ready_prefetch(), deferred, loading); +} + +fn promote_ready( + ready: Vec, + deferred: &mut Vec, + loading: &mut Vec, +) { + for id in ready { + if let Some(pos) = loading.iter().position(|p| p.request_id == id) { + deferred.insert(0, loading.remove(pos)); + } + } +} + +/// Release any executor-side state a request accumulated before it was rejected +/// at admission. A rejected request never prefills, so the only state it can +/// hold is a settled KV prefetch — committed prefix blocks parked in the +/// executor while the request waited in `deferred`. Without this they would +/// leak (blocks pinned, map entry stranded) for the engine's lifetime. Idempotent +/// and harmless for requests that were never prefetched. +fn release_rejected(executor: &mut E, req: &PendingRequest) { + if let Err(e) = executor.drop_request(req.request_id) { + warn!( + "failed to release state for rejected {:?}: {e}", + req.request_id + ); + } +} + // ── Main loop ─────────────────────────────────────────────────────────── fn scheduler_loop( @@ -146,6 +248,8 @@ fn scheduler_loop( // Requests that could not be admitted due to KV budget pressure. // Held here so they aren't lost; re-evaluated every loop iteration. let mut deferred: Vec = Vec::new(); + // Requests parked while their async CPU-tier KV prefetch loads. + let mut loading: Vec = Vec::new(); info!("Scheduler ready"); @@ -159,8 +263,18 @@ fn scheduler_loop( next_request_id += 1; } - // 2. Nothing active and nothing deferred → block until a request arrives. + // 2. Reclaim settled prefetches, then offer fresh requests to prefetch. + reclaim_ready_prefetch(&mut executor, &mut deferred, &mut loading); + offer_prefetch(&mut executor, &mut deferred, &mut loading); + + // 3. Nothing active and nothing admittable → block. 
Prefer blocking on + // an in-flight load (so its request prefills next) over a new submit; + // only truly idle (no loads either) do we block on the channel. if active.is_empty() && deferred.is_empty() { + if !loading.is_empty() { + block_on_loading(&mut executor, &mut deferred, &mut loading); + continue; + } if let Some(req) = submit_rx.blocking_recv() { deferred.push(PendingRequest::from_scheduler_request( RequestId(next_request_id), @@ -178,11 +292,13 @@ fn scheduler_loop( )); next_request_id += 1; } + continue; } let lora_validation = reject_unknown_lora_requests(deferred, &executor); for rejected in &lora_validation.rejected { send_unknown_lora_rejection(rejected); + release_rejected(&mut executor, rejected); } let admission = admit_deferred_requests( @@ -193,9 +309,11 @@ fn scheduler_loop( executor.max_request_blocks(), executor.max_context_tokens(), executor.max_decode_batch_size(), + |id| executor.prefetched_blocks(id), ); for (rejected, reason) in &admission.rejected { send_rejection(rejected, *reason); + release_rejected(&mut executor, rejected); } let pending = admission.pending; deferred = admission.deferred; @@ -228,6 +346,7 @@ fn scheduler_loop_with_lora_control( let mut active: Vec = Vec::new(); let mut next_request_id = 0u64; let mut deferred: Vec = Vec::new(); + let mut loading: Vec = Vec::new(); let mut pending_control: VecDeque = VecDeque::new(); let mut post_control_deferred: Vec = Vec::new(); @@ -246,6 +365,14 @@ fn scheduler_loop_with_lora_control( ); } + // 1b. Reclaim settled prefetches and offer fresh requests. Control + // commands gate generation, so only offer once no control is pending + // (a prefetch must not race ahead of an adapter load it depends on). + reclaim_ready_prefetch(&mut executor, &mut deferred, &mut loading); + if pending_control.is_empty() { + offer_prefetch(&mut executor, &mut deferred, &mut loading); + } + // 2. 
Once idle, apply pending control commands before admitting newer // generation requests that arrived behind them. if active.is_empty() && deferred.is_empty() { @@ -255,9 +382,13 @@ fn scheduler_loop_with_lora_control( } } - // 3. Nothing active and no deferred generation → block until any - // command arrives. + // 3. Nothing active and no deferred generation → block. An in-flight + // load takes priority over waiting on a new command. if active.is_empty() && deferred.is_empty() { + if !loading.is_empty() { + block_on_loading(&mut executor, &mut deferred, &mut loading); + continue; + } if let Some(command) = command_rx.blocking_recv() { enqueue_engine_command( command, @@ -290,6 +421,7 @@ fn scheduler_loop_with_lora_control( let lora_validation = reject_unknown_lora_requests(deferred, &executor); for rejected in &lora_validation.rejected { send_unknown_lora_rejection(rejected); + release_rejected(&mut executor, rejected); } let admission = admit_deferred_requests( @@ -300,14 +432,21 @@ fn scheduler_loop_with_lora_control( executor.max_request_blocks(), executor.max_context_tokens(), executor.max_decode_batch_size(), + |id| executor.prefetched_blocks(id), ); for (rejected, reason) in &admission.rejected { send_rejection(rejected, *reason); + release_rejected(&mut executor, rejected); } let pending = admission.pending; deferred = admission.deferred; if active.is_empty() && pending.is_empty() { + // A parked load must still be polled to completion before we block. + if !loading.is_empty() { + block_on_loading(&mut executor, &mut deferred, &mut loading); + continue; + } if let Some(command) = command_rx.blocking_recv() { enqueue_engine_command( command, @@ -503,6 +642,11 @@ fn admit_deferred_requests( max_request_blocks: usize, max_context_tokens: usize, max_decode_batch_size: usize, + // Blocks a request already holds from a settled prefetch. 
These are already + // out of `available_blocks`, so they must be credited against the request's + // need or admission double-counts them and can wedge a near-budget CPU-hit + // request forever (never admitted, prefetch never released). + prefetch_credit: impl Fn(RequestId) -> usize, ) -> AdmissionOutcome { let mut budget = available_blocks.saturating_sub(active_future_blocks(active, block_size)); let mut decode_slots = max_decode_batch_size.saturating_sub(active.len()); @@ -522,14 +666,19 @@ fn admit_deferred_requests( continue; } - let max_needed = blocks_needed(max_request_tokens(&req), block_size); - if max_needed > max_request_blocks { + // Full physical footprint gates the per-request cap (a request occupies + // all of it, prefetched or not)… + let footprint = blocks_needed(max_request_tokens(&req), block_size); + if footprint > max_request_blocks { rejected.push((req, RejectReason::KvBudget)); continue; } - if max_needed <= budget && decode_slots > 0 { - budget -= max_needed; + // …but only the blocks not already held by this request's prefetch must + // come from the free-pool budget. 
+ let fresh_needed = footprint.saturating_sub(prefetch_credit(req.request_id)); + if fresh_needed <= budget && decode_slots > 0 { + budget -= fresh_needed; decode_slots -= 1; pending.push(req); } else { @@ -662,6 +811,7 @@ mod tests { decode_delay: Duration, loaded_lora_adapters: HashSet, dropped: Arc>>, + prefetch_offers: Arc>>, prefill_batches: Arc>>>, decode_batches: Arc>>>, prefill_lora_batches: Arc>>>>, @@ -680,6 +830,7 @@ mod tests { decode_delay: Duration::ZERO, loaded_lora_adapters: HashSet::new(), dropped, + prefetch_offers: Arc::new(Mutex::new(Vec::new())), prefill_batches: Arc::new(Mutex::new(Vec::new())), decode_batches: Arc::new(Mutex::new(Vec::new())), prefill_lora_batches: Arc::new(Mutex::new(Vec::new())), @@ -778,6 +929,16 @@ mod tests { Ok(()) } + fn begin_kv_prefetch( + &mut self, + request_id: RequestId, + _prompt_tokens: &[u32], + _lora_adapter: Option<&str>, + ) -> bool { + self.prefetch_offers.lock().unwrap().push(request_id.get()); + false + } + fn list_lora_adapters(&self) -> Vec { let mut names: Vec<_> = self.loaded_lora_adapters.iter().cloned().collect(); names.sort(); @@ -999,7 +1160,7 @@ mod tests { ]; // available 4 blocks - 2 reserved for active growth = budget of 2. 
- let outcome = admit_deferred_requests(deferred, &active, 16, 4, 4, usize::MAX, 64); + let outcome = admit_deferred_requests(deferred, &active, 16, 4, 4, usize::MAX, 64, |_| 0); let ids = |reqs: &[PendingRequest]| reqs.iter().map(|r| r.request_id.get()).collect::>(); @@ -1038,7 +1199,7 @@ mod tests { mk(3, 40, 1), // request 3: 40 prompt + 1 max = 41 total: overflows by 9 tokens → rejected ]; - let outcome = admit_deferred_requests(deferred, &active, 16, 1000, 1000, 32, 64); + let outcome = admit_deferred_requests(deferred, &active, 16, 1000, 1000, 32, 64, |_| 0); let pending_ids = outcome .pending @@ -1084,8 +1245,16 @@ mod tests { } let pending = PendingRequest::from_scheduler_request(RequestId(64), request(16, 1).0); - let outcome = - admit_deferred_requests(vec![pending], &active, 16, 1024, 1024, usize::MAX, 64); + let outcome = admit_deferred_requests( + vec![pending], + &active, + 16, + 1024, + 1024, + usize::MAX, + 64, + |_| 0, + ); assert!( outcome.pending.is_empty(), @@ -1157,6 +1326,43 @@ mod tests { ); } + fn pending(request_id: u64, echo: bool) -> PendingRequest { + let (token_tx, _token_rx) = mpsc::unbounded_channel(); + PendingRequest { + request_id: RequestId::new(request_id), + lora_adapter: None, + prompt_tokens: vec![1; 32], + params: SamplingParams::default(), + max_tokens: 1, + token_tx, + logprobs: 0, + echo, + prefetch_offered: false, + } + } + + #[test] + fn echo_requests_are_never_offered_to_prefetch() { + let dropped = Arc::new(Mutex::new(Vec::new())); + let mut executor = FakeExecutor::new(64, dropped); + let offers = Arc::clone(&executor.prefetch_offers); + + let mut deferred = vec![pending(1, true), pending(2, false)]; + let mut loading = Vec::new(); + offer_prefetch(&mut executor, &mut deferred, &mut loading); + + // The plain request is probed; the echo request is skipped entirely, so + // its prefill forwards the whole prompt without parking unspendable KV. 
+ assert_eq!(*offers.lock().unwrap(), vec![2]); + let echo = deferred.iter().find(|r| r.request_id.get() == 1).unwrap(); + assert!(!echo.prefetch_offered, "echo request must stay un-probed"); + let plain = deferred.iter().find(|r| r.request_id.get() == 2).unwrap(); + assert!( + plain.prefetch_offered, + "plain request must be marked probed" + ); + } + fn request( prompt_len: usize, max_tokens: usize, diff --git a/pegainfer-qwen3-4b/src/scheduler/plan.rs b/pegainfer-qwen3-4b/src/scheduler/plan.rs index 86bbab4a..78c380cb 100644 --- a/pegainfer-qwen3-4b/src/scheduler/plan.rs +++ b/pegainfer-qwen3-4b/src/scheduler/plan.rs @@ -156,6 +156,7 @@ mod tests { token_tx, logprobs: 0, echo: false, + prefetch_offered: false, } } diff --git a/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs b/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs new file mode 100644 index 00000000..c2a47e22 --- /dev/null +++ b/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs @@ -0,0 +1,272 @@ +//! Live GPU+CPU prefix-hit gate for the pegaflow KV-offload integration. +//! +//! Drives a real Qwen3-4B [`Qwen3Executor`] with offload enabled to prove the +//! end-to-end wiring on actual model weights: +//! * a cold prefill SAVEs its sealed KV blocks to pegaflow's host tier; +//! * after the GPU prefix cache is flushed, a second identical request finds +//! the prefix only on the CPU tier (a genuine CPU-only hit) and the async +//! prefetch RESTOREs it into HBM; +//! * the restored KV reproduces the original first-token logits. +//! +//! This is the one test that exercises save → host-tier persistence → query → +//! async load → register → prefill-rematch through the executor, not a unit +//! harness. `tests/cpu_roundtrip.rs` (in `pegainfer-kv-offload`) covers the raw +//! byte path; this covers the live executor wiring. If the load landed in the +//! wrong layer/segment/block the warm logits would be whole nats off. +//! +//! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when absent +//! 
(point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). + +use std::collections::HashMap; +use std::path::Path; + +use pegainfer_core::sampler::SamplingParams; +use pegainfer_qwen3_4b::runtime::{PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId}; +use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions}; + +const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); +const BLOCK: usize = 16; +const LOGPROBS: usize = 16; +const MAX_OUTPUT: usize = 8; +/// 512 MiB host tier — comfortably more than the handful of dense Qwen3-4B +/// blocks this test offloads (~2.25 MiB/block). +const HOST_TIER_BYTES: usize = 512 << 20; + +/// Warm-vs-cold bounds, following the prefix-cache methodology: the CPU-restored +/// KV is byte-identical to the original GPU compute, so the only legitimate +/// drift is the prefill GEMM shrinking to the uncached tail (bf16 reduction +/// order). The warm argmax must sit within `REGRET_TOL` of cold; the mean head +/// delta must stay at the bf16 floor. +const REGRET_TOL: f32 = 0.20; +const MEAN_TOL: f32 = 0.06; + +fn model_path_or_skip() -> Option { + match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + Ok(path) => Some(path), + Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { + Some(MODEL_PATH.to_string()) + } + Err(_) => { + eprintln!( + "skipping qwen3 kv_offload_cpu_hit: {MODEL_PATH}/config.json is missing; \ + set PEGAINFER_TEST_MODEL_PATH to run it" + ); + None + } + } +} + +/// Deterministic synthetic prompt; different seeds share no prefix. 
+fn prompt(seed: usize, len: usize) -> Vec<u32> {
+    (0..len)
+        .map(|i| ((seed * 100_003 + i * 17) % 50_000 + 1_000) as u32)
+        .collect()
+}
+
+fn prefill_item(id: u64, prompt: &[u32]) -> PrefillStepItem {
+    PrefillStepItem::new(
+        RequestId::new(id),
+        prompt.to_vec(),
+        MAX_OUTPUT,
+        SamplingParams::default(),
+        LOGPROBS,
+        false,
+        0.0,
+    )
+}
+
+fn first_token_top(pr: &pegainfer_qwen3_4b::runtime::PrefillResult) -> Vec<(u32, f32)> {
+    pr.requests[0]
+        .first_token_logprob
+        .as_ref()
+        .expect("logprobs requested but none returned")
+        .top_logprobs
+        .clone()
+}
+
+/// The warm (CPU-restored) first-token logits must agree with the cold compute
+/// up to bf16 reduction noise: warm argmax within `REGRET_TOL` of cold, mean
+/// head-token delta under `MEAN_TOL`.
+fn assert_close(cold: &[(u32, f32)], warm: &[(u32, f32)]) {
+    let cold_map: HashMap<u32, f32> = cold.iter().copied().collect();
+    let cold_top = cold[0].1;
+    match cold_map.get(&warm[0].0) {
+        None => panic!(
+            "warm argmax {} absent from cold top-{}",
+            warm[0].0,
+            cold.len()
+        ),
+        Some(&clp) => assert!(
+            cold_top - clp <= REGRET_TOL,
+            "warm argmax {} sits {:.4} nat below cold argmax",
+            warm[0].0,
+            cold_top - clp
+        ),
+    }
+    let deltas: Vec<f32> = warm
+        .iter()
+        .take(8)
+        .filter_map(|&(token, wlp)| cold_map.get(&token).map(|&clp| (wlp - clp).abs()))
+        .collect();
+    assert!(!deltas.is_empty(), "no head-token overlap");
+    let mean = deltas.iter().sum::<f32>() / deltas.len() as f32;
+    let max = deltas.iter().copied().fold(0.0f32, f32::max);
+    eprintln!(
+        "kv_offload_cpu_hit: {} head deltas — mean {mean:.4} max {max:.4}",
+        deltas.len()
+    );
+    assert!(
+        mean <= MEAN_TOL,
+        "mean head logprob delta {mean:.4} > {MEAN_TOL} — restored KV drifted past bf16 noise"
+    );
+}
+
+/// One executor, two scenarios, run sequentially. cargo runs `#[test]`
+/// functions on parallel threads; two Qwen3-4B executors sharing device 0 and
+/// the same pegaflow instance id ("qwen3-4b-dev0") would collide on the host
+/// tier. Production wires exactly one executor per model, so the realistic
+/// shape is one executor servicing both prefixes. The two scenarios use
+/// disjoint prompt seeds, so they share no prefix and cannot cross-contaminate.
+#[test]
+fn live_gpu_and_cpu_prefix_hits() {
+    let Some(model_path) = model_path_or_skip() else {
+        return;
+    };
+    let mut ex = Qwen3Executor::from_runtime_with_lora_options(
+        &model_path,
+        false,
+        &[0],
+        Qwen3LoraOptions::default(),
+        Qwen3OffloadOptions::enabled(HOST_TIER_BYTES),
+    )
+    .expect("build offload executor");
+    assert!(ex.offload_enabled(), "offload must be active");
+
+    cpu_tier_restores_evicted_prefix(&mut ex);
+    gpu_and_cpu_combined_hit(&mut ex);
+}
+
+/// A prefix that is evicted from HBM and restored entirely from the CPU tier
+/// (`gpu_hit == 0`): the baseline CPU round-trip through the live executor.
+fn cpu_tier_restores_evicted_prefix(ex: &mut Qwen3Executor) {
+    let p = prompt(7, 50); // 3 full blocks (48 tok) + 2-token tail
+
+    // ── Cold: first sight of P. Computes all of P on GPU and offloads the 3
+    // sealed blocks to the host tier. ──
+    let cold = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(1, &p)],
+            echo: false,
+        })
+        .expect("cold prefill");
+    assert_eq!(
+        cold.requests[0].cached_tokens, 0,
+        "first sight of P is cold"
+    );
+    let cold_first = first_token_top(&cold);
+    ex.drop_request(RequestId::new(1)).expect("drop req1");
+
+    // ── Persist the saves, then evict P from HBM so it lives only on CPU. ──
+    ex.flush_offload_saves();
+    ex.evict_cached_blocks();
+
+    // ── A GPU miss now: the prefetch must restore P from the CPU tier. ──
+    let hit = ex.begin_kv_prefetch(RequestId::new(2), &p, None);
+    assert!(hit, "P must hit the CPU tier after GPU eviction");
+    let ready = ex.wait_ready_prefetch();
+    assert!(
+        ready.contains(&RequestId::new(2)),
+        "prefetch load must settle ready, got {ready:?}"
+    );
+
+    // ── Warm: the restored CPU prefix is matched, only the 2-token tail
+    // recomputes (the full-block cap keeps the 3rd block's last token off the
+    // match the same way the GPU prefix cache does). ──
+    let warm = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(2, &p)],
+            echo: false,
+        })
+        .expect("warm prefill");
+    assert_eq!(
+        warm.requests[0].cached_tokens,
+        3 * BLOCK,
+        "CPU-restored prefix: 3 blocks matched, tail recomputed"
+    );
+    let warm_first = first_token_top(&warm);
+    ex.drop_request(RequestId::new(2)).expect("drop req2");
+
+    // ── The restored KV must reproduce the original GPU first-token logits. ──
+    assert_close(&cold_first, &warm_first);
+}
+
+/// A single prefix that is part GPU-resident, part CPU-only: the prefetch must
+/// stack the CPU continuation onto the GPU hit and the re-match must see one
+/// contiguous prefix. This is the case that catches an off-by-`gpu_hit` bug in
+/// the query/commit offset math — the pure-CPU test (`gpu_hit == 0`) cannot.
+fn gpu_and_cpu_combined_hit(ex: &mut Qwen3Executor) {
+    let full = prompt(9, 100); // 6 full blocks (96 tok) + 4-token tail
+    let short = full[..50].to_vec(); // a 3-block prefix of `full`
+
+    // ── Cold-compute `full`, saving all 6 blocks to the host tier. ──
+    let cold = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(1, &full)],
+            echo: false,
+        })
+        .expect("cold full prefill");
+    assert_eq!(
+        cold.requests[0].cached_tokens, 0,
+        "first sight of full is cold"
+    );
+    let cold_first = first_token_top(&cold);
+    ex.drop_request(RequestId::new(1)).expect("drop req1");
+    ex.flush_offload_saves();
+
+    // ── Drop the whole prefix from HBM (CPU keeps all 6 blocks), then
+    // re-establish ONLY the first 3 blocks in HBM by cold-prefilling `short`.
+    // GPU now holds blocks 0..3; CPU holds blocks 0..6. ──
+    ex.evict_cached_blocks();
+    let s = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(2, &short)],
+            echo: false,
+        })
+        .expect("short prefill");
+    assert_eq!(
+        s.requests[0].cached_tokens, 0,
+        "short re-warms blocks 0..3 cold"
+    );
+    ex.drop_request(RequestId::new(2)).expect("drop req2");
+
+    // ── Prefetch `full`: GPU hits blocks 0..3, the host tier must supply the
+    // continuation 3..6. A pure GPU hit would not start a load. ──
+    let hit = ex.begin_kv_prefetch(RequestId::new(3), &full, None);
+    assert!(
+        hit,
+        "blocks 3..6 must be fetched from the CPU tier beyond the GPU hit"
+    );
+    let ready = ex.wait_ready_prefetch();
+    assert!(
+        ready.contains(&RequestId::new(3)),
+        "prefetch must settle, got {ready:?}"
+    );
+
+    // ── Warm prefill `full`: all 6 blocks match (3 GPU + 3 CPU). Without the
+    // CPU continuation this would be 3. ──
+    let warm = ex
+        .execute_prefill(PrefillPlan {
+            requests: &[prefill_item(3, &full)],
+            echo: false,
+        })
+        .expect("warm full prefill");
+    assert_eq!(
+        warm.requests[0].cached_tokens,
+        6 * BLOCK,
+        "combined hit: 3 GPU-resident + 3 CPU-restored blocks match as one prefix"
+    );
+    let warm_first = first_token_top(&warm);
+    ex.drop_request(RequestId::new(3)).expect("drop req3");
+
+    assert_close(&cold_first, &warm_first);
+}
diff --git a/pegainfer-qwen3-4b/tests/lora_smoke.rs b/pegainfer-qwen3-4b/tests/lora_smoke.rs
index 37f6ddfa..3dce67db 100644
--- a/pegainfer-qwen3-4b/tests/lora_smoke.rs
+++ b/pegainfer-qwen3-4b/tests/lora_smoke.rs
@@ -228,6 +228,8 @@ fn qwen3_lora_loads_rank_and_generates(rank: usize, adapter_name: &str) {
             ..EngineLoadOptions::default()
         },
         pegainfer_qwen3_4b::Qwen3LoraOptions::default(),
+        pegainfer_qwen3_4b::Qwen3OffloadOptions::disabled(),
+        false,
     )
     .expect("start LoRA-capable Qwen3 engine");
 
diff --git a/pegainfer-server/src/main.rs b/pegainfer-server/src/main.rs
index 7a824823..64b7fe9e 100644
--- a/pegainfer-server/src/main.rs
+++ b/pegainfer-server/src/main.rs
@@ -10,7 +10,7 @@ use pegainfer::vllm_frontend::LoraModule;
 use pegainfer_core::engine::{EngineLoadOptions, EpBackend};
 #[cfg(feature = "kimi-k2")]
 use pegainfer_core::parallel::ParallelConfig;
-use pegainfer_qwen3_4b::Qwen3LoraOptions;
+use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions};
 
 #[cfg(not(target_env = "msvc"))]
 #[global_allocator]
@@ -74,6 +74,25 @@ struct Args {
     /// Emit synchronized DeepSeek V4 prefill phase timing records.
     #[arg(long, default_value_t = false)]
     deepseek_prefill_profile: bool,
+
+    /// Enable pegaflow KV offload (host-tier "L2" cache) on the single-GPU
+    /// Qwen3 path. Sealed KV blocks are saved to host pinned memory and
+    /// restored into HBM before prefill when a prompt's prefix has fallen out
+    /// of the GPU cache.
+    #[arg(long, default_value_t = false)]
+    kv_offload: bool,
+
+    /// Host pinned-memory pool size for the KV offload tier, in GiB. pegaflow
+    /// allocates the whole pool up front, so RSS reflects this at startup.
+    #[arg(long, default_value_t = 8.0)]
+    kv_offload_host_gib: f64,
+
+    /// vLLM-style no-prefix-cache. Without --kv-offload it disables prefix
+    /// matching outright (every prefill recomputes the full prompt). With
+    /// --kv-offload it is the pure-L2 mode: no cross-request HBM reuse, so every
+    /// prefix is restored from the host tier — for measuring the L2 TTFT win.
+    #[arg(long, default_value_t = false)]
+    no_prefix_cache: bool,
 }
 
 #[derive(Clone, Copy, Debug, ValueEnum)]
@@ -210,6 +229,16 @@ async fn main() -> anyhow::Result<()> {
             ep_backend: EpBackend::Nccl,
             seed: 42,
         };
+        let offload = if args.kv_offload {
+            let bytes = (args.kv_offload_host_gib * f64::from(1u32 << 30)) as usize;
+            info!(
+                "Qwen3 KV offload enabled: host tier {:.1} GiB, no_prefix_cache={}",
+                args.kv_offload_host_gib, args.no_prefix_cache
+            );
+            Qwen3OffloadOptions::enabled(bytes)
+        } else {
+            Qwen3OffloadOptions::disabled()
+        };
         let handle = if args.enable_lora {
             let lora_options = Qwen3LoraOptions {
                 max_loras: args.max_loras,
@@ -223,9 +252,16 @@ async fn main() -> anyhow::Result<()> {
                 &args.model_path,
                 options,
                 lora_options,
+                offload,
+                args.no_prefix_cache,
             )
         } else {
-            pegainfer_qwen3_4b::start_engine(&args.model_path, options)
+            pegainfer_qwen3_4b::start_engine_with_offload(
+                &args.model_path,
+                options,
+                offload,
+                args.no_prefix_cache,
+            )
         }
         .context("failed to start Qwen3 engine")?;
 