From 6659fd6c5906da6ea3123eb076aeb6d4493f9ac7 Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Fri, 12 Jun 2026 00:19:46 +0100 Subject: [PATCH 1/8] =?UTF-8?q?feat(experts):=20arithmetic=20virtual=20exp?= =?UTF-8?q?ert=20(AVE=20v0.1)=20=E2=80=94=20gate/extract/alu/drive/verify?= =?UTF-8?q?=20+=20first-run=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VirtualExpert trait (gate reads exhaust, compute external+exact, forced-decode drive with schedule-end termination) + arith instance. Tier-0 symbolic gate is the extract scanner, so fire => extraction by construction; tier-1 probe demoted to audit instrument per A11 (artifact schema retained, no weights present). First run on gemma3-4b q4k: explicit dispatch 6/6 exact, schedule-end 6/6, distractor false fires 0/6, native 2/6. Specs (AVE v0.1 + MEE v0.2 draft) in crates/larql-inference/docs/specs/virtual-experts/. --- bench/aim-validation/ave_demo_gemma3-4b.json | 1 + .../arithmetic-virtual-expert.md | 235 ++++++++ .../virtual-experts/math-expression-expert.md | 187 +++++++ crates/larql-inference/examples/ave_demo.rs | 184 +++++++ .../larql-inference/src/experts/arith/alu.rs | 499 +++++++++++++++++ .../src/experts/arith/drive.rs | 145 +++++ .../src/experts/arith/extract.rs | 260 +++++++++ .../larql-inference/src/experts/arith/gate.rs | 181 +++++++ .../larql-inference/src/experts/arith/mod.rs | 509 ++++++++++++++++++ .../src/experts/arith/probe_weights/README.md | 38 ++ .../src/experts/arith/verify.rs | 176 ++++++ crates/larql-inference/src/experts/mod.rs | 19 + .../src/experts/virtual_expert.rs | 215 ++++++++ docs/specs.md | 2 + 14 files changed, 2651 insertions(+) create mode 100644 bench/aim-validation/ave_demo_gemma3-4b.json create mode 100644 crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md create mode 100644 crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md create mode 100644 crates/larql-inference/examples/ave_demo.rs create mode 100644 crates/larql-inference/src/experts/arith/alu.rs create mode 100644 crates/larql-inference/src/experts/arith/drive.rs create mode 100644 crates/larql-inference/src/experts/arith/extract.rs create mode 100644 crates/larql-inference/src/experts/arith/gate.rs create mode 100644 crates/larql-inference/src/experts/arith/mod.rs create mode 100644 crates/larql-inference/src/experts/arith/probe_weights/README.md create mode 100644 crates/larql-inference/src/experts/arith/verify.rs create mode 100644 crates/larql-inference/src/experts/virtual_expert.rs diff --git a/bench/aim-validation/ave_demo_gemma3-4b.json b/bench/aim-validation/ave_demo_gemma3-4b.json new file mode 100644 index 000000000..7c3e1b30b --- /dev/null +++ b/bench/aim-validation/ave_demo_gemma3-4b.json @@ -0,0 +1 @@ +{"experiment":"ave_demo","vindex":"output/gemma3-4b-q4k-v2.vindex","explicit":[6,6],"schedule_end":[6,6],"native":[2,6],"false_fires":[0,6],"items":[{"leg":"explicit","prompt":"12 + 7 =","expected":"19","dispatch_ok":true,"native_ok":true,"native_text":"19\n12 - 7 =","native_tokens":11,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12 + 7","alu_result":"19","emitted":" 19","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":3}},{"leg":"explicit","prompt":"123456 + 654321 =","expected":"777777","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n123456 + 65432","native_tokens":15,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"123456 + 654321","alu_result":"777777","emitted":" 777777","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":7}},{"leg":"explicit","prompt":"100000 - 1 =","expected":"99999","dispatch_ok":true,"native_ok":true,"native_text":"99999\n100000 -","native_tokens":14,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"100000 - 1","alu_result":"99999","emitted":" 99999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":6}},{"leg":"explicit","prompt":"12345 * 6789 =","expected":"83810205","dispatch_ok":true,"native_ok":false,"native_text":"8380000 - 1000 = 8","native_tokens":17,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12345 * 6789","alu_result":"83810205","emitted":" 83810205","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":9}},{"leg":"explicit","prompt":"999 + 111 - 222 =","expected":"888","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n999 + 111 =","native_tokens":12,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"999 + 111 - 222","alu_result":"888","emitted":" 888","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":4}},{"leg":"explicit","prompt":"858358354868358358358358 + 141641645131641641641641 =","expected":"999999999999999999999999","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n**Calculation:**\n\n858358354868358358358358 + 1","native_tokens":33,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"858358354868358358358358 + 141641645131641641641641","alu_result":"999999999999999999999999","emitted":" 999999999999999999999999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":25}},{"leg":"distractor","prompt":"My phone number is 4415550172.","fire":"no"},{"leg":"distractor","prompt":"The meeting is on 2026-06-11.","fire":"no"},{"leg":"distractor","prompt":"Train 9 departs at 18:45 from platform 3.","fire":"no"},{"leg":"distractor","prompt":"Order 66 was executed in 19 BBY.","fire":"no"},{"leg":"distractor","prompt":"Account 123456789012345678901234567890 is active.","fire":"no"},{"leg":"distractor","prompt":"What is the capital of France?","fire":"no"}]} \ No newline at end of file diff --git a/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md b/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md new file mode 100644 index 000000000..759dc1b7c --- /dev/null +++ b/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md @@ -0,0 +1,235 @@ +# LARQL SPEC — Arithmetic Virtual Expert (AVE) + +**Status:** draft v0.1 (2026-06-11). **Scope:** larql-rs runtime component. +**Evidence base:** arithmetic_mechanism arc A0–A10 + A9b, all numbers cite frozen +pre-registered runs on Gemma-3-4b-it. **Claim discipline:** every parameter below is +tagged MEASURED / DERIVED / ASSUMED / OPEN. + +--- + +## 1. Design principle + +The model is an I/O system, not a calculator (A0–A6). It supplies: tokenization-level +digit decomposition, a causally verified number format (per-digit mod-10 wheels, A2c/A2e), +an involuntary engagement signal (A7), perfect operand extraction (A8), a magnitude prior +(A4c/A5), and a fluent readout (A9/A9b). It structurally cannot supply the serial +algorithm (bounded-depth, A4e/A5). Therefore: + +> **Fired ⇒ dispatch, always.** No length threshold. A8 measured native never winning +> once surface form is uncontrolled (template-fragility 0.58–0.67 at sizes where one +> template scored 0.93); A10 measured dispatch ≥ native in every cell at equal-or-known +> token cost. The model's own arithmetic output is consumed only as a verification prior. + +This component is also **instance #1 of the VirtualExpert trait** (§8) — the gate / +extract / compute / drive / verify decomposition is intended to be reused by future +experts (dates, units, sorting) pending the exhaust-generality result. + +## 2. Placement in the workspace + +``` +larql-inference/ + src/experts/ + mod.rs // VirtualExpert trait + ExpertController + arith/ + mod.rs // AVE: wiring + state machine + gate.rs // tier-0 symbolic scan + tier-1 L8 probe + extract.rs // symbolic parser + rewrite fallback + alu.rs // exact compute (BigInt) + drive.rs // forced-decode schedule (+ injection hook) + verify.rs // magnitude-prior check + probe_weights/ // ridge probe artifacts (versioned, per-model) +``` + +Forward-pass hooks required (larql-compute / larql-models): +- `residual_tap(layer, position) -> &[f32]` — read-only capture at L8, last prompt token. +- `logit_override(step) -> Option` — sampler-level forcing (default drive path). +- `residual_inject(layer, position, vec, lambda)` — optional; reserved (§5.2). +- `terminate_at(step)` — controller-owned generation stop. + +All four are trivial given larql-rs owns the pass; none exist over a token API. +**Stack-relative note (A9b):** for digit payloads the *outcome* is replicable via +constrained decoding anywhere; what forward-pass ownership uniquely buys is the gate tap, +non-token-aligned payloads (Lazarus convergence), and conditioning-without-committing. + +## 3. Gate + +Two tiers, evaluated during the prompt forward pass (which runs anyway — the tap is a +free read, not a 0.24-forward surcharge; the 0.24 framing only applies if an early-exit +dispatch skips the remaining layers, which is an optional optimization, OPEN). + +**Tier 0 — symbolic (explicit math).** Regex/scanner over the token stream for operator +tokens adjacent to digit spans. Cost ~0. MEASURED: fire 1.0, extraction downstream 1.0 +(A10 bare cells). + +**Tier 1 — engagement probe (disguised math). DEMOTED (A11).** Ridge probe on the L8 +residual at the last prompt token, reading arithmetic-engagement exhaust (math vs +numbers-present, template-held-out 0.91–0.99, A7b). +- MEASURED specificity: 1.00 (0/18 + 0/48 false fires across A8/A10, incl. long-number + no-op controls). A fire is always trustworthy. +- MEASURED sensitivity: uneven — 1.0 on sub/mul/multi phrasings, 0.17–0.58 on novel add + phrasings (A8/A10). +- **A11 demotion:** the probe is parked as an *audit instrument*, not a gate component; + the gate-hardening workstream is deleted. v0.1 gates on tier-0 only. Disguised-math + coverage waits on the exhaust-generality instrument science (§8 OPEN) — not on probe + retraining. The artifact format is retained for audit use + (`probe_weights/README.md`); weights remain per-checkpoint artifacts if/when refit. + +**Policy:** Tier0 fire ⇒ dispatch. No fire ⇒ native path untouched (zero overhead +beyond the tap). **No fire on disguised math is the designed fallthrough, not a +coverage gap:** the §7 decomposition `fleet = fire + (1−fire)·native` makes native the +floor — a silent gate costs exactly nothing relative to not having the expert, and the +dispatch architecture loses nothing when the probe never fires. + +## 4. Extract + +**Explicit path:** symbolic parse of the operand digit spans and operator(s) from the +token stream. Exact by construction, zero tokens. + +**Disguised path:** 2-shot rewrite prompt → parse the emitted expression. +MEASURED: extraction 1.00 in every cell — 16-digit operands, mul, 2-op chains, with an +*untuned* prompt (A8); held in-pipeline (A10: extract = 1.00 of fired, all kinds). +Cost ~2× tokens of native on the rewritten segment. The regex reads the model's +expression, never its sum (rigging-proofed by design, A8). +- OPEN: structured-output extraction (JSON-constrained decode) should beat 2-shot on + token cost; the 2-shot number is the measured floor. + +**Failure handling:** unparseable rewrite ⇒ fall to native, flag `extract_miss`. +MEASURED rate at floor prompt: 0. + +## 5. Compute + Drive (return path) + +**ALU:** Rust-native exact integer arithmetic (`i128` fast path, BigInt beyond). +Latency ~0 relative to a decode step. Ops in scope v0.1: +, −, ×, integer chains of the +A8 shapes. Division, decimals, negatives: OPEN (extraction for them unmeasured). + +### 5.1 Default drive: forced decode +Controller forces the answer token sequence at the sampler, then **terminates at +schedule end**. +- DERIVED from A9b: logit bias β=10 ≅ L30 injection ≅ constrained decoding, + behaviorally, on greedy. Forcing is the cheapest equivalent and larql-rs owns the + sampler. +- Schedule-end termination is MANDATORY: it eliminates the one observed delivery defect + — post-schedule digit continuation, ~4% per-item (129/135 ≈ 0.96 delivery without it, + A10 correction; mode caught in a logged diagnostic: full correct answer + one extra + digit). With termination, delivery = 1.0 **by construction**; "the model terminates on + its own" is demoted from claimed property (~0.96) to unneeded one. +- Forced tokens enter the KV cache normally; MEASURED: the model stays coherent + conditioned on supplied digits (A9 clean termination, A10 word-continuation cells). + +### 5.2 Reserved drive: residual injection +`λ·‖h‖·û(digit)` per decode step. MEASURED: drives 1.00 at any site ≥L16 during +emission, λ clean to 0.25, graded threshold ≈0.1 (A9b); the defended band is defended +only while *computing* (prompt step), not while emitting — the phase map. +Kept as the general mechanism because it is the same operation as Lazarus fact injection +(shared splice infrastructure) and supports conditioning-without-committing +(bias-without-force), which the sampler path cannot express. Not used for digits in v0.1. +- OPEN: emission-commandability lower bound ( Fire; // exhaust, not intent + fn extract(&self, ctx: &GenCtx) -> Result; + fn compute(&self, p: &Payload) -> Answer; // external, exact + fn drive(&self, a: &Answer) -> DriveSchedule; // forced-decode default + fn verify(&self, a: &Answer, native: Option<&str>) -> Verdict; // prior, not judge +} +``` + +Design constraints baked in from the arc: the gate reads **exhaust, not intent** (A7: +no abstract op object exists to read; the engagement signal is involuntary, cannot be +prompted away, and needs no MoE router); the expert is **invisible to the model** (no +weights touched, no model routing used); compute is **never** the model's. +- OPEN (the fleet's gating science question): exhaust generality — whether dates/units/ + sorting emit separable engagement signatures or one shared "bounded computation + straining" signal. Determines whether `gate()` is per-expert or shared infrastructure. + +## 9. Measured-parameter table + +| parameter | value | status | source | +|---|---|---|---| +| probe layer / site | L8, last prompt token | MEASURED | A7b | +| probe arch | ridge, λ ∝ mean feature norm | MEASURED | A7b | +| gate specificity | 1.00 (0/66 false fires) | MEASURED | A8+A10 | +| gate sensitivity (current weights) | 0.17–1.0 by phrasing | MEASURED; probe DEMOTED to audit instrument | A8/A10 + A11 | +| extraction (2-shot floor) | 1.00 all cells | MEASURED | A8/A10 | +| drive equivalence (bias≅inject≅force) | β=10 / λ≥0.25 / forced | MEASURED (greedy) | A9b | +| λ floor / threshold (L30) | clean 0.25 / graded ≈0.1 | MEASURED | A9b | +| emission-commandable sites | ≥L16 (lower bound open) | MEASURED | A9b | +| delivery w/o termination | 129/135 ≈ 0.96 (one mode: +1 digit) | MEASURED | A10+corr | +| delivery w/ schedule termination | 1.0 by construction | DERIVED | A10 corr | +| explicit-path token overhead | 0 | MEASURED | A10 | +| disguised-path token overhead | ~2× (rewrite floor) | MEASURED | A8/A10 | +| estimator prior envelope | ±25–35% to 24 digits; void ≥28 | MEASURED | A4c/A5 | +| end-to-end demo | 24-digit add 0.92 vs native 0.00, equal tokens | MEASURED | A10 | + +## 10. Out of scope / risks + +1. **Model-version coupling.** Probe weights, L8/L16/L30 sites, and the phase map are + Gemma-3-4b measurements. The *relative-depth* framing (L8 ≈ 24%, emission-commandable + ≥ ~47%) is the porting hypothesis (depth-fraction routing law), ASSUMED until the + 12B/other-family run. Ship probes as per-checkpoint artifacts; treat sites as + fractions with a calibration pass per model. +2. **Sampling.** All drive equivalences are greedy-measured. Under temperature, forced + decode is unaffected by construction; injection/bias equivalence is OPEN + distributionally. +3. **Op coverage.** Return path measured for addition; extraction measured for +,−,×, + 2-op chains. Division, decimals, negatives, mixed text-number answers: OPEN. +4. **Gate hardening — workstream DELETED (A11).** Pre-A11 this read as "the single + component standing between current and ~1.0 disguised accuracy." Post-A11 the probe + is an audit instrument and disguised coverage is parked behind the exhaust-generality + instrument science. The explicit path — the measured-1.0 path — is the product + surface; native is the designed fallthrough for everything else. +5. **Quantization.** All measurements bf16/MLX. Behavior under Q4_K/Q6_K on the Metal + pipeline: ASSUMED stable for forced decode (sampler-level, quantization-independent); + probe and injection paths need one re-calibration run. + +## 11. Acceptance tests (assembly increments) + +- AT-1: A10 suite rerun in larql-rs, forced-decode drive + termination ⇒ explicit ≥0.99, + zero post-schedule continuations, tokens == native. +- AT-2: distractor set ×10 size ⇒ false fires = 0. +- AT-3: hardened gate ⇒ disguised single-step fire ≥0.9 (the A8 bar), specificity intact. +- AT-4: verify leg ⇒ injected extraction faults (swapped operands) caught at the + measured prior envelope; false-flag rate <2%. +- AT-5: per-item telemetry replays the word_16-class consistency check automatically. diff --git a/crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md b/crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md new file mode 100644 index 000000000..bd22ca47b --- /dev/null +++ b/crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md @@ -0,0 +1,187 @@ +# LARQL SPEC — Math Expression Expert (MEE) — AVE v0.2 extension + +**Status:** draft v0.1 (2026-06-11). **Supersedes:** §5 ALU scope of +[arithmetic-virtual-expert.md](arithmetic-virtual-expert.md) (AVE v0.1); all other AVE +sections inherited unchanged. **Scope:** generalize the AVE from +integer +,−,× to full mathematical expression evaluation — elementary functions, nesting, +constants — without touching the gate/drive architecture. +**Claim discipline:** MEASURED / DERIVED / ASSUMED / OPEN tags throughout. New stages +carry acceptance tests (§9) rather than borrowed confidence. + +--- + +## 1. Design statement + +One expert, all of evaluable mathematics. Do not build per-function experts; functions +are payload vocabulary, not architecture. The substrate constraint that justifies this +(arc-measured): weights hold rows (authorable), finite circuits (trainable), and bounded +interpolators (trainable) — exact evaluation over unbounded domains is in none of those +classes. The MEE supplies exactly that residue. + +> Invariant inherited from AVE: **fired ⇒ dispatch.** The model's native value for any +> expression is an estimate (A4c/A5 envelope at best); it is consumed only by the verify +> leg, never emitted. + +## 2. Deltas vs AVE v0.1 + +| stage | v0.1 (AVE) | v0.2 (MEE) | risk class | +|---|---|---|---| +| gate | symbolic ops + L8 probe | + function-name lexicon (tier-0) | LOW (additive) | +| extract | operand-pair regex / 2-shot rewrite | expression-tree parse / rewrite-to-expression | MEDIUM (depth unmeasured) | +| compute | BigInt +,−,× | full expression engine (§5) | LOW (engineering) | +| drive | forced decode, exact payload | + precision policy for non-terminating values (§6) | MEDIUM (new controller decision) | +| verify | magnitude prior | + domain/range checks, interval prior (§7) | LOW | + +Gate architecture, L8 probe, injection reserve, schedule termination, telemetry +invariants: unchanged, inherited. + +## 3. Gate additions + +**Tier-0 lexicon expansion.** Function-name surface forms are unambiguous dispatch +triggers: {sqrt, sin, cos, tan, asin..., log, ln, exp, pow/^, abs, floor, ceil, round, +factorial/!, mod/%, gcd, lcm, min, max, sum, prod, mean, median, std, nCr/choose, nPr, +deg/rad}, plus constants {pi/π, e, tau, phi} when adjacent to operators. ASSUMED: zero +false-fire cost from lexicon matches adjacent to digit spans (a function name next to a +number is not ambiguous English); AT-G1 verifies on a distractor set including +metaphorical uses ("exponential growth", "a tangent about...") — **the lexicon trigger +requires operator/operand adjacency, not bare word match.** + +**Probe (tier-1) unchanged.** ASSUMED: word-problem trig/log rides the existing numeric +engagement exhaust (it is numbers-under-operation, A7b's contrast class). OPEN: not +re-measured; the exhaust-generality sweep covers it. + +## 4. Extract + +**Target representation:** an expression AST, not operand tuples. + +```rust +enum Expr { + Num(Decimal), // exact where possible + Const(K), // Pi, E, Tau, Phi + Neg(Box), + Bin(Op, Box, Box), // + - * / ^ % + Call(Func, Vec), // sin, log(b, x), nCr, ... +} +``` + +**Explicit path:** Pratt parser over the token stream's expression span. Exact, zero +generation cost. Grammar covers: precedence, unary minus, implicit multiplication +(`2pi`, `3(4+1)`), `!` postfix, `^` right-assoc, degree/radian annotations. + +**Disguised path:** the A8 rewrite, retargeted — "rewrite as a bare mathematical +expression" with 2-shot examples including one nested call. Parse the emission with the +same Pratt parser. +- MEASURED floor: 1.00 at 2-op chains, 16-digit operands (A8). +- OPEN — **the v0.2 science item:** extraction accuracy vs AST depth/function arity. + A8 never exceeded depth 2 or non-arithmetic ops. AT-E1 measures the depth curve + before any accuracy claim; the pre-registerable prediction is A5-copy-grade + transcription holding to depth ~4–5 with failures being *structural* (dropped + parens) not *lexical* (wrong digits). +- Degree/radian ambiguity: prose "sin of 30" defaults DEGREES with the assumption + logged in telemetry; explicit `sin(0.5)` bare-numeric defaults RADIANS. ASSUMED + convention, surfaced in the answer when load-bearing (AT-E2). + +**Failure handling:** parse error ⇒ one re-rewrite with the error class in the prompt; +second failure ⇒ native + flag (inherited policy). + +## 5. Compute — expression engine + +`larql-inference/src/experts/arith/engine.rs` (replaces alu.rs scope): + +- **Exact tier:** BigInt / BigRational for integer and rational subtrees — `3/4 + 1/6` + returns `11/12` exactly; factorial, gcd, nCr exact to BigInt limits. +- **Float tier:** arbitrary-precision floats (rug/MPFR binding or pure-Rust dashu/astro-float; + decision = build-dependency policy, OPEN) at working precision = output precision + 8 + guard digits. **Never f64 for user-visible digits** — the expert's one absolute is + that emitted digits are correct; double rounding at f64 violates it silently. +- **Symbolic constants** held symbolic until forced: `sin(pi/6)` → exact path → `1/2`. + A small exact-value table (the "famous points" the model itself memorized as lexicon + entries) short-circuits the float tier where it can. +- **Mixed trees:** exact subtrees evaluated exactly, promoted to float only at the + boundary node that requires it. +- Domain errors (log of negative, div-by-zero, asin(2)) return a typed error that the + drive verbalizes ("undefined: ...") rather than NaN — a *correct* refusal is a valid + payload (AT-C2). + +DERIVED: compute latency remains ~0 vs a decode step for everything except pathological +precision requests; cap working precision (default 50 digits, configurable) and degrade +to "≈ at N digits" beyond. + +## 6. Drive — precision policy (the new controller decision) + +Forced decode inherited; what's new is that non-terminating values have no canonical +token sequence. The controller, not the model, fixes the representation: + +1. **Exact wins:** if the engine produced an exact form whose decimal terminates or + whose rational is short, emit it (`11/12`, `0.5`, `120`). +2. **Sig-fig inference:** else infer requested precision from the prompt ("to 3 dp", + "approximately") — explicit request always wins. +3. **Default:** 6 significant digits, prefixed with the approximation marker the + schedule includes ("≈ "), banker's rounding at the cut. +4. **Schedule construction:** tokenize the final string with the target model's + tokenizer, validate round-trip (inherited tokenizer assertion), force, terminate at + schedule end (inherited; eliminates the post-schedule continuation mode by + construction). + +ASSUMED: the "≈" prefix and unit/degree annotations force cleanly as part of the +schedule (they are ordinary tokens). AT-D1 checks emission coherence on +mixed text-numeric payloads ("≈ 0.932 radians") — the first payloads with +non-digit interior tokens, which is the only genuinely new drive surface. + +## 7. Verify + +- **Magnitude prior (inherited):** applicable only when the model produced a native + numeric guess; envelope per A4c/A5; void where the function family has no native + estimator (ASSUMED for transcendentals — the model likely has *no* usable sin + estimate off famous points; treat prior as absent, not as zero). +- **New, oracle-side (stronger than the prior and free):** + - interval check: recompute at precision+8, confirm rounding stability; + - inverse check where cheap: exp(log x) ≈ x, (sqrt x)² ≈ x; + - domain pre-check before evaluation (catches extraction faults like a dropped + minus sign turning log(−x) into a silent wrong branch — the MEE's analogue of the + swapped-operand fault AT-4 covers). +- Trust topology note: in v0.2 extraction remains the suspect stage and compute the + trusted one (AVE topology) — the inversion flagged for solver-class experts does NOT + apply here; nesting raises extraction's error *rate*, not its error *role*. + +## 8. Out of scope (v0.2) + +- Symbolic algebra (solve, differentiate, integrate, simplify): different payload class + — answers are *expressions*, drive is semantic-adjacent, and "which manipulation" + is partly model judgment. v0.3 candidate behind its own plan; do not scope-creep the + evaluator. +- Matrices/vectors, complex numbers, series: payload representation undecided. +- Equation *solving* even numerically (root-finding is evaluation-adjacent but the + extraction target is an equation, not an expression — small step, separate AT). +- Multi-expression programs ("compute X, then use it in Y"): mid-trajectory dispatch + territory; inherits that programme's status. + +## 9. Acceptance tests + +- **AT-G1 (lexicon specificity):** 50 distractors incl. metaphorical function words + ("tangent", "exponential", "log file", "sine of the times") ⇒ 0 false fires. +- **AT-E1 (extraction depth curve):** rewrite-extraction accuracy at AST depth + {1,2,3,4,5} × arity {1,2} × 3 surface phrasings, n≥12/cell, *pre-registered before + the run* with a full outcome space (holds / degrades-with-depth / structural-failure + mode). This is the one v0.2 number that decides scope. +- **AT-E2 (unit ambiguity):** deg/rad inference cases; assumption surfaced in ≥0.95 of + ambiguous emissions. +- **AT-C1 (engine correctness):** differential test vs mpmath reference on 10³ random + trees, exact match at emitted precision. +- **AT-C2 (typed refusals):** domain-error inputs verbalize correctly, never NaN/garbage. +- **AT-D1 (mixed-payload drive):** forced schedules containing ≈/units/words emit + coherently, schedule termination clean, on the A10 telemetry rig. +- **AT-V1 (inverse-check tripwire):** injected extraction faults (sign flips, dropped + parens) caught ≥0.9 by domain/inverse checks before emission. +- **AT-A1 (assembly):** A10-pattern run, 100 items spanning bare expressions, word + problems, famous values, domain errors, distractors — fleet ≥ native in every cell, + consistency assertion (fleet ≈ fire + (1−fire)·native) holding per batch. + +## 10. The number that gates everything + +AT-E1's depth curve is the only open quantity standing between this spec and a closed +expert. If extraction holds to depth 4+ (the copy-grade prediction), the MEE ships as a +port. If it degrades structurally at depth 2–3, the rewrite prompt gains structure +(parenthesis-explicit examples, or JSON-AST constrained decode) and AT-E1 reruns — an +engineering loop, not a science gate. Either way the architecture is inherited, the +walls are the substrate's, and the function library rides in on one parser. diff --git a/crates/larql-inference/examples/ave_demo.rs b/crates/larql-inference/examples/ave_demo.rs new file mode 100644 index 000000000..d6b0d819e --- /dev/null +++ b/crates/larql-inference/examples/ave_demo.rs @@ -0,0 +1,184 @@ +//! AVE assembly increment — first run of the Arithmetic Virtual Expert +//! (spec `docs/specs/virtual-experts/arithmetic-virtual-expert.md`) against a +//! real Q4_K vindex on the CPU decode path. +//! +//! Two legs, mapped to the spec's acceptance tests: +//! - **AT-1 (explicit):** tier-0 fires on explicit expressions, the ALU +//! computes exactly, the forced-decode schedule delivers with schedule-end +//! termination (zero post-schedule continuations by construction). The +//! same prompts run native for the accuracy/token comparison. +//! - **AT-2 (specificity):** distractor prompts — numbers without operators, +//! dates, times, long no-op numbers — must produce zero false fires. +//! +//! Per-item telemetry is written as JSON (the A10 lesson: per-item logs turn +//! a rerun into a grep). +//! +//! Usage: `cargo run --release --example ave_demo -- [VINDEX_DIR]` +//! Writes `bench/aim-validation/ave_demo_gemma3-4b.json`. + +use larql_inference::experts::{ave_generate_kquant, ArithmeticExpert, AveOptions}; +use larql_inference::load_tokenizer; +use larql_inference::vindex::generate_kquant_cpu; + +/// (prompt, expected exact answer) — tier-0 explicit forms, incl. the +/// 24-digit add (the A10 demo cell: dispatch 0.92 vs native 0.00). +const EXPLICIT: &[(&str, &str)] = &[ + ("12 + 7 =", "19"), + ("123456 + 654321 =", "777777"), + ("100000 - 1 =", "99999"), + ("12345 * 6789 =", "83810205"), + ("999 + 111 - 222 =", "888"), + ( + "858358354868358358358358 + 141641645131641641641641 =", + "999999999999999999999999", + ), +]; + +/// Distractors: digits present, no computation asked — gate must stay cold. +const DISTRACTORS: &[&str] = &[ + "My phone number is 4415550172.", + "The meeting is on 2026-06-11.", + "Train 9 departs at 18:45 from platform 3.", + "Order 66 was executed in 19 BBY.", + "Account 123456789012345678901234567890 is active.", + "What is the capital of France?", +]; + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + eprintln!(" pass a Q4_K gemma3-4b vindex dir as the first arg"); + eprintln!(" (default: output/gemma3-4b-q4k-v2.vindex). Skipping cleanly."); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + + // No tier-1 probe artifact exists yet (probe_weights/README.md) — the + // gate runs tier-0 only, which is the measured-1.0 explicit path. + let ave = ArithmeticExpert::new(); + let opts = AveOptions::default(); + + println!("\n=== AVE assembly increment on {vindex} ==="); + println!(" gate: tier-0 symbolic (no probe artifact); drive: forced decode + schedule-end termination\n"); + + // ── AT-1: explicit dispatch vs native ─────────────────────────────── + let mut json_rows = String::new(); + let (mut dispatch_ok, mut native_ok) = (0usize, 0usize); + let mut schedule_end_ok = 0usize; + + println!(" ── AT-1 explicit (dispatch vs native) ──"); + for (prompt, expected) in EXPLICIT { + let t0 = std::time::Instant::now(); + let out = ave_generate_kquant(&ave, &mut weights, &tok, &index, prompt, None, &opts) + .expect("ave run"); + let dispatch_ms = t0.elapsed().as_millis(); + + let d_ok = out.emitted.trim() == *expected; + let sched_ok = out.telemetry.termination == "schedule_end"; + dispatch_ok += usize::from(d_ok); + schedule_end_ok += usize::from(sched_ok); + + // Native comparison: same prompt, greedy, answer-sized budget. + let prompt_ids = tok + .encode(*prompt, true) + .expect("encode") + .get_ids() + .to_vec(); + let budget = out.telemetry.answer_tokens.max(expected.len()) + 8; + let t1 = std::time::Instant::now(); + let native = generate_kquant_cpu(&mut weights, &tok, &prompt_ids, budget, &index); + let native_ms = t1.elapsed().as_millis(); + let native_text: String = native.iter().map(|(t, _)| t.as_str()).collect(); + let native_tokens = native.len(); + // Native is correct if the expected number appears (separator-blind). + let n_ok = native_text.replace([',', ' '], "").contains(expected); + native_ok += usize::from(n_ok); + + println!( + " {:<58} dispatch: {:<9} [{}tok {}ms {}] native: {:<9} [{}tok {}ms]", + format!("{prompt:?}"), + if d_ok { "✓ exact" } else { "✗ WRONG" }, + out.telemetry.answer_tokens, + dispatch_ms, + out.telemetry.termination, + if n_ok { "✓" } else { "✗" }, + native_tokens, + native_ms, + ); + if !d_ok { + println!(" emitted: {:?} expected {:?}", out.emitted, expected); + } + + json_rows.push_str(&format!( + "{}{{\"leg\":\"explicit\",\"prompt\":{},\"expected\":\"{expected}\",\"dispatch_ok\":{d_ok},\"native_ok\":{n_ok},\"native_text\":{},\"native_tokens\":{native_tokens},\"telemetry\":{}}}", + if json_rows.is_empty() { "" } else { "," }, + serde_json::to_string(prompt).expect("json"), + serde_json::to_string(native_text.trim()).expect("json"), + serde_json::to_string(&out.telemetry).expect("json"), + )); + } + + // ── AT-2: distractor specificity (gate only — no generation needed + // to score a false fire) ──────────────────────────────────────────── + println!("\n ── AT-2 distractors (false fires must be 0) ──"); + let mut false_fires = 0usize; + for prompt in DISTRACTORS { + use larql_inference::experts::VirtualExpert; + let fire = ave.gate(None, prompt); + let fired = fire.fired(); + false_fires += usize::from(fired); + println!( + " {:<58} fire: {}", + format!("{prompt:?}"), + if fired { "✗ FALSE FIRE" } else { "✓ no" } + ); + json_rows.push_str(&format!( + ",{{\"leg\":\"distractor\",\"prompt\":{},\"fire\":\"{}\"}}", + serde_json::to_string(prompt).expect("json"), + fire.label(), + )); + } + + // ── verdict + the spec §7 consistency check ───────────────────────── + let n_e = EXPLICIT.len(); + let n_d = DISTRACTORS.len(); + println!("\n ── verdict ──"); + println!( + " explicit dispatch: {dispatch_ok}/{n_e} exact schedule-end termination: {schedule_end_ok}/{n_e} native: {native_ok}/{n_e}" + ); + println!(" distractor false fires: {false_fires}/{n_d} (AT-2 bar: 0)"); + // Fire rate on the explicit leg is 1.0 by construction (tier-0), so the + // §7 decomposition reduces to fleet == dispatch accuracy there. + let fleet = dispatch_ok as f64 / n_e as f64; + let residual = larql_inference::experts::arith::decomposition_residual( + fleet, + 1.0, + dispatch_ok as f64 / n_e as f64, + native_ok as f64 / n_e as f64, + ); + println!(" §7 decomposition residual (explicit leg): {residual:.4} (alarm if ≉ 0)"); + + let json = format!( + "{{\"experiment\":\"ave_demo\",\"vindex\":{},\"explicit\":[{dispatch_ok},{n_e}],\"schedule_end\":[{schedule_end_ok},{n_e}],\"native\":[{native_ok},{n_e}],\"false_fires\":[{false_fires},{n_d}],\"items\":[{json_rows}]}}", + serde_json::to_string(&vindex).expect("json"), + ); + let out_path = "bench/aim-validation/ave_demo_gemma3-4b.json"; + if let Err(e) = std::fs::write(out_path, &json) { + eprintln!("warning: could not write {out_path}: {e}"); + } else { + println!("\nwrote {out_path}"); + } +} diff --git a/crates/larql-inference/src/experts/arith/alu.rs b/crates/larql-inference/src/experts/arith/alu.rs new file mode 100644 index 000000000..d7c174ec8 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/alu.rs @@ -0,0 +1,499 @@ +//! Exact compute for the arithmetic virtual expert: `i128` fast path, +//! arbitrary-precision decimal beyond. Ops in scope v0.1: +, −, ×, integer +//! chains (spec §5). Division, decimals, negative *operands*: OPEN — +//! extraction for them is unmeasured (negative *results* of − are fine). +//! +//! The bignum is a deliberately small signed decimal-digit implementation: +//! operand sizes here are tens of digits, schoolbook is exact and instant +//! relative to a decode step, and it keeps the crate dependency-free. + +use std::cmp::Ordering; +use std::fmt; + +/// Operator set the ALU evaluates. `Mul` binds tighter than `Add`/`Sub`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Op { + Add, + Sub, + Mul, +} + +impl Op { + pub fn symbol(&self) -> char { + match self { + Op::Add => '+', + Op::Sub => '-', + Op::Mul => '*', + } + } +} + +/// A parsed integer chain: `operands[0] ops[0] operands[1] ops[1] …`. +/// Invariant: `operands.len() == ops.len() + 1`, at least one op. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Expr { + pub operands: Vec, + pub ops: Vec, +} + +impl Expr { + /// Exact evaluation with standard precedence (× before ±). + /// `i128` fast path, decimal bignum beyond. + pub fn eval(&self) -> BigInt { + if let Some(v) = self.eval_i128() { + return BigInt::parse(&v.to_string()).expect("i128 → BigInt"); + } + self.eval_big() + } + + /// Largest operand width in decimal digits — drives the verify-prior + /// envelope (void past 24-digit operands). + pub fn max_operand_digits(&self) -> usize { + self.operands + .iter() + .map(|o| o.digit_count()) + .max() + .unwrap_or(0) + } + + fn eval_i128(&self) -> Option { + // Fold × runs into terms, then sum the terms. + let mut terms: Vec = vec![self.operands[0].to_i128()?]; + let mut signs: Vec = vec![false]; // true = subtract + for (op, operand) in self.ops.iter().zip(self.operands[1..].iter()) { + let v = operand.to_i128()?; + match op { + Op::Mul => { + let last = terms.last_mut().expect("nonempty"); + *last = last.checked_mul(v)?; + } + Op::Add => { + terms.push(v); + signs.push(false); + } + Op::Sub => { + terms.push(v); + signs.push(true); + } + } + } + let mut acc: i128 = 0; + for (t, neg) in terms.iter().zip(signs.iter()) { + acc = if *neg { + acc.checked_sub(*t)? + } else { + acc.checked_add(*t)? + }; + } + Some(acc) + } + + fn eval_big(&self) -> BigInt { + let mut terms: Vec = vec![self.operands[0].clone()]; + let mut signs: Vec = vec![false]; + for (op, operand) in self.ops.iter().zip(self.operands[1..].iter()) { + match op { + Op::Mul => { + let last = terms.last_mut().expect("nonempty"); + *last = last.mul(operand); + } + Op::Add => { + terms.push(operand.clone()); + signs.push(false); + } + Op::Sub => { + terms.push(operand.clone()); + signs.push(true); + } + } + } + let mut acc = BigInt::zero(); + for (t, neg) in terms.iter().zip(signs.iter()) { + acc = if *neg { acc.sub(t) } else { acc.add(t) }; + } + acc + } +} + +impl fmt::Display for Expr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.operands[0])?; + for (op, operand) in self.ops.iter().zip(self.operands[1..].iter()) { + write!(f, " {} {}", op.symbol(), operand)?; + } + Ok(()) + } +} + +/// Signed arbitrary-precision decimal integer. Magnitude is little-endian +/// decimal digits, no leading zeros; zero is `[0]` with `neg = false`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BigInt { + neg: bool, + mag: Vec, +} + +impl BigInt { + pub fn zero() -> Self { + BigInt { + neg: false, + mag: vec![0], + } + } + + /// Parse an optionally signed decimal string. No separators — the + /// extractor normalizes those. + pub fn parse(s: &str) -> Option { + let s = s.trim(); + let (neg, digits) = match s.strip_prefix('-') { + Some(rest) => (true, rest), + None => (false, s.strip_prefix('+').unwrap_or(s)), + }; + if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) { + return None; + } + let mut mag: Vec = digits.bytes().rev().map(|b| b - b'0').collect(); + while mag.len() > 1 && *mag.last().expect("nonempty") == 0 { + mag.pop(); + } + let is_zero = mag == [0]; + Some(BigInt { + neg: neg && !is_zero, + mag, + }) + } + + pub fn is_zero(&self) -> bool { + self.mag == [0] + } + + pub fn is_negative(&self) -> bool { + self.neg + } + + /// Width of the magnitude in decimal digits. + pub fn digit_count(&self) -> usize { + self.mag.len() + } + + /// Most-significant decimal digit of the magnitude. + pub fn leading_digit(&self) -> u8 { + *self.mag.last().expect("mag is never empty") + } + + /// Magnitude as an `f64` approximation (`mantissa × 10^exp` off the + /// leading digits) — only used for ratio checks in the verify prior. + pub fn approx_magnitude(&self) -> f64 { + let take = self.mag.len().min(15); + let mut mant = 0f64; + for d in self.mag.iter().rev().take(take) { + mant = mant * 10.0 + f64::from(*d); + } + mant * 10f64.powi((self.mag.len() - take) as i32) + } + + fn to_i128(&self) -> Option { + if self.mag.len() > 38 { + return None; + } + let mut v: i128 = 0; + for d in self.mag.iter().rev() { + v = v.checked_mul(10)?.checked_add(i128::from(*d))?; + } + if self.neg { + v.checked_neg() + } else { + Some(v) + } + } + + pub fn add(&self, other: &BigInt) -> BigInt { + if self.neg == other.neg { + BigInt { + neg: self.neg, + mag: add_mag(&self.mag, &other.mag), + } + .normalized() + } else { + // Differing signs: subtract smaller magnitude from larger. + match cmp_mag(&self.mag, &other.mag) { + Ordering::Equal => BigInt::zero(), + Ordering::Greater => BigInt { + neg: self.neg, + mag: sub_mag(&self.mag, &other.mag), + } + .normalized(), + Ordering::Less => BigInt { + neg: other.neg, + mag: sub_mag(&other.mag, &self.mag), + } + .normalized(), + } + } + } + + pub fn sub(&self, other: &BigInt) -> BigInt { + let negated = BigInt { + neg: !other.neg && !other.is_zero(), + mag: other.mag.clone(), + }; + self.add(&negated) + } + + pub fn mul(&self, other: &BigInt) -> BigInt { + if self.is_zero() || other.is_zero() { + return BigInt::zero(); + } + BigInt { + neg: self.neg != other.neg, + mag: mul_mag(&self.mag, &other.mag), + } + .normalized() + } + + fn normalized(mut self) -> BigInt { + while self.mag.len() > 1 && *self.mag.last().expect("nonempty") == 0 { + self.mag.pop(); + } + if self.mag == [0] { + self.neg = false; + } + self + } +} + +impl fmt::Display for BigInt { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.neg { + write!(f, "-")?; + } + for d in self.mag.iter().rev() { + write!(f, "{d}")?; + } + Ok(()) + } +} + +fn cmp_mag(a: &[u8], b: &[u8]) -> Ordering { + if a.len() != b.len() { + return a.len().cmp(&b.len()); + } + for (da, db) in a.iter().rev().zip(b.iter().rev()) { + match da.cmp(db) { + Ordering::Equal => continue, + other => return other, + } + } + Ordering::Equal +} + +fn add_mag(a: &[u8], b: &[u8]) -> Vec { + let mut out = Vec::with_capacity(a.len().max(b.len()) + 1); + let mut carry = 0u8; + for i in 0..a.len().max(b.len()) { + let s = a.get(i).copied().unwrap_or(0) + b.get(i).copied().unwrap_or(0) + carry; + out.push(s % 10); + carry = s / 10; + } + if carry > 0 { + out.push(carry); + } + out +} + +/// Requires `a >= b` by magnitude. +fn sub_mag(a: &[u8], b: &[u8]) -> Vec { + let mut out = Vec::with_capacity(a.len()); + let mut borrow = 0i8; + for (i, da) in a.iter().enumerate() { + let mut d = i8::try_from(*da).expect("digit") - borrow + - i8::try_from(b.get(i).copied().unwrap_or(0)).expect("digit"); + if d < 0 { + d += 10; + borrow = 1; + } else { + borrow = 0; + } + out.push(u8::try_from(d).expect("0..=9")); + } + out +} + +fn mul_mag(a: &[u8], b: &[u8]) -> Vec { + let mut out = vec![0u32; a.len() + b.len()]; + for (i, da) in a.iter().enumerate() { + for (j, db) in b.iter().enumerate() { + out[i + j] += u32::from(*da) * u32::from(*db); + } + } + let mut carry = 0u32; + let mut digits = Vec::with_capacity(out.len()); + for v in out { + let s = v + carry; + digits.push(u8::try_from(s % 10).expect("0..=9")); + carry = s / 10; + } + while carry > 0 { + digits.push(u8::try_from(carry % 10).expect("0..=9")); + carry /= 10; + } + while digits.len() > 1 && *digits.last().expect("nonempty") == 0 { + digits.pop(); + } + digits +} + +#[cfg(test)] +mod tests { + use super::*; + + fn big(s: &str) -> BigInt { + BigInt::parse(s).expect("parse") + } + + fn expr(operands: &[&str], ops: &[Op]) -> Expr { + Expr { + operands: operands.iter().map(|s| big(s)).collect(), + ops: ops.to_vec(), + } + } + + #[test] + fn parse_and_display_roundtrip() { + for s in ["0", "7", "42", "999999999999999999999999", "-13"] { + assert_eq!(big(s).to_string(), s); + } + // Leading zeros and signs normalize. + assert_eq!(big("007").to_string(), "7"); + assert_eq!(big("+12").to_string(), "12"); + assert_eq!(big("-0").to_string(), "0"); + assert_eq!(big("-000").to_string(), "0"); + } + + #[test] + fn parse_rejects_non_decimal() { + for s in ["", " ", "12a", "1.5", "--3", "1 2", "0x1f"] { + assert!(BigInt::parse(s).is_none(), "expected reject: {s:?}"); + } + } + + #[test] + fn accessors() { + let n = big("4096"); + assert_eq!(n.digit_count(), 4); + assert_eq!(n.leading_digit(), 4); + assert!(!n.is_negative()); + assert!(!n.is_zero()); + assert!(big("0").is_zero()); + assert!(big("-5").is_negative()); + } + + #[test] + fn add_sub_mul_match_i128_on_small_values() { + let cases: &[(i128, i128)] = &[ + (0, 0), + (1, 9), + (99, 1), + (12345, 6789), + (1000000, -1), + (-456, -544), + (-12, 30), + (7, -7), + ]; + for &(a, b) in cases { + let (ba, bb) = (big(&a.to_string()), big(&b.to_string())); + assert_eq!(ba.add(&bb).to_string(), (a + b).to_string(), "{a}+{b}"); + assert_eq!(ba.sub(&bb).to_string(), (a - b).to_string(), "{a}-{b}"); + assert_eq!(ba.mul(&bb).to_string(), (a * b).to_string(), "{a}*{b}"); + } + } + + #[test] + fn carries_ripple_across_the_whole_number() { + assert_eq!(big("999999").add(&big("1")).to_string(), "1000000"); + assert_eq!(big("1000000").sub(&big("1")).to_string(), "999999"); + } + + #[test] + fn twenty_four_digit_add_is_exact() { + // Digit-wise nines-complement pair: sums to all nines. + let a = big("858358354868358358358358"); + let b = big("141641645131641641641641"); + assert_eq!(a.add(&b).to_string(), "999999999999999999999999"); + } + + #[test] + fn big_mul_beyond_i128_is_exact() { + // 20-digit × 20-digit = 40 digits; overflows i128 (max ~1.7e38). + let a = big("99999999999999999999"); + assert_eq!( + a.mul(&a).to_string(), + "9999999999999999999800000000000000000001" + ); + } + + #[test] + fn eval_precedence_mul_before_add() { + assert_eq!( + expr(&["2", "3", "4"], &[Op::Add, Op::Mul]).eval().to_string(), + "14" + ); + assert_eq!( + expr(&["2", "3", "4"], &[Op::Mul, Op::Add]).eval().to_string(), + "10" + ); + } + + #[test] + fn eval_two_op_chain() { + assert_eq!( + expr(&["999", "111", "222"], &[Op::Add, Op::Sub]) + .eval() + .to_string(), + "888" + ); + } + + #[test] + fn eval_negative_result() { + assert_eq!(expr(&["5", "12"], &[Op::Sub]).eval().to_string(), "-7"); + } + + #[test] + fn eval_falls_back_to_bignum_past_i128() { + let e = expr( + &["99999999999999999999", "99999999999999999999"], + &[Op::Mul], + ); + assert!(e.eval_i128().is_none(), "must overflow the fast path"); + assert_eq!( + e.eval().to_string(), + "9999999999999999999800000000000000000001" + ); + } + + #[test] + fn eval_fast_and_big_paths_agree() { + let e = expr(&["12345", "6789", "42"], &[Op::Mul, Op::Add]); + assert_eq!(e.eval_i128().expect("fits").to_string(), "83810247"); + assert_eq!(e.eval_big().to_string(), "83810247"); + } + + #[test] + fn expr_display_and_operand_width() { + let e = expr(&["12", "7"], &[Op::Add]); + assert_eq!(e.to_string(), "12 + 7"); + assert_eq!(e.max_operand_digits(), 2); + let e = expr(&["3", "999999", "21"], &[Op::Mul, Op::Sub]); + assert_eq!(e.to_string(), "3 * 999999 - 21"); + assert_eq!(e.max_operand_digits(), 6); + } + + #[test] + fn approx_magnitude_tracks_digit_count() { + let n = big("999999999999999999999999"); // 24 nines ≈ 1e24 + let approx = n.approx_magnitude(); + assert!((approx / 1e24 - 1.0).abs() < 0.01, "approx {approx}"); + assert_eq!(big("0").approx_magnitude(), 0.0); + } +} diff --git a/crates/larql-inference/src/experts/arith/drive.rs b/crates/larql-inference/src/experts/arith/drive.rs new file mode 100644 index 000000000..4dbbd2af4 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/drive.rs @@ -0,0 +1,145 @@ +//! Default drive path (spec §5.1): force the answer token sequence at the +//! sampler, then terminate at schedule end. +//! +//! Forcing at the sampler is the measured-equivalent cheapest drive (A9b: +//! logit bias ≅ injection ≅ constrained decoding on greedy), and schedule-end +//! termination makes delivery 1.0 **by construction** — the only observed +//! delivery defect without it was post-schedule digit continuation. The +//! forced tokens enter the KV cache normally, so the model stays conditioned +//! on the digits it "said". +//! +//! Residual injection (spec §5.2) is reserved and not used for digits in +//! v0.1; it lives with the Lazarus splice infrastructure when it lands. + +use larql_models::ModelWeights; +use larql_vindex::VectorIndex; +use tokenizers::Tokenizer; + +use crate::vindex::generate_kquant_cpu_constrained; + +/// Why the forced decode stopped. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TerminationCause { + /// All scheduled tokens were emitted and generation was stopped by the + /// controller — the mandatory path. + ScheduleEnd, + /// Generation halted before the schedule completed (EOS fired on a + /// forced token, or the decode loop bailed) — should not happen for + /// digit payloads; surfaced in telemetry rather than silently absorbed. + EarlyStop { at: usize }, +} + +impl TerminationCause { + pub fn label(&self) -> String { + match self { + TerminationCause::ScheduleEnd => "schedule_end".to_string(), + TerminationCause::EarlyStop { at } => format!("early_stop@{at}"), + } + } +} + +/// Result of one forced decode. +#[derive(Debug, Clone)] +pub struct ForcedDecode { + /// Decoded text of the emitted tokens. + pub emitted: String, + /// Token ids actually emitted (== schedule on the happy path). + pub ids: Vec, + pub cause: TerminationCause, +} + +/// Run the forced-decode schedule: at step `i` every logit except +/// `schedule[i]` is masked to −∞, and the loop is bounded at +/// `schedule.len()` — termination at schedule end by construction. +pub fn force_decode_kquant( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + index: &VectorIndex, + prompt_ids: &[u32], + schedule: &[u32], +) -> ForcedDecode { + if schedule.is_empty() { + return ForcedDecode { + emitted: String::new(), + ids: Vec::new(), + cause: TerminationCause::ScheduleEnd, + }; + } + let sched = schedule.to_vec(); + let out = generate_kquant_cpu_constrained( + weights, + tokenizer, + prompt_ids, + sched.len(), + index, + move |generated, logits| { + let step = generated.len(); + if let Some(&want) = sched.get(step) { + for (i, l) in logits.iter_mut().enumerate() { + if i as u32 != want { + *l = f32::NEG_INFINITY; + } + } + // The decode loop bails on a non-finite pick; pin the forced + // token if the model's own logit for it was non-finite. + if let Some(l) = logits.get_mut(want as usize) { + if !l.is_finite() { + *l = 0.0; + } + } + } + }, + ); + + let ids: Vec = out.iter().map(|(_, id)| *id).collect(); + let emitted: String = out.iter().map(|(t, _)| t.as_str()).collect(); + let cause = if ids == schedule { + TerminationCause::ScheduleEnd + } else { + TerminationCause::EarlyStop { at: ids.len() } + }; + ForcedDecode { + emitted, + ids, + cause, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::{make_test_q4k_vindex, make_test_q4k_weights, make_test_tokenizer}; + + #[test] + fn forced_decode_emits_exactly_the_schedule() { + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let tokenizer = make_test_tokenizer(weights.vocab_size); + let schedule = vec![3u32, 5, 1]; + let fd = force_decode_kquant(&mut weights, &tokenizer, &index, &[0u32, 1], &schedule); + assert_eq!(fd.ids, schedule, "emitted ids must equal the schedule"); + assert_eq!(fd.cause, TerminationCause::ScheduleEnd); + // WordLevel fixture decodes token N as "[N]". + assert!(fd.emitted.contains('3') && fd.emitted.contains('5')); + } + + #[test] + fn empty_schedule_is_a_no_op() { + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let tokenizer = make_test_tokenizer(weights.vocab_size); + let fd = force_decode_kquant(&mut weights, &tokenizer, &index, &[0u32], &[]); + assert!(fd.ids.is_empty()); + assert!(fd.emitted.is_empty()); + assert_eq!(fd.cause, TerminationCause::ScheduleEnd); + } + + #[test] + fn termination_cause_labels() { + assert_eq!(TerminationCause::ScheduleEnd.label(), "schedule_end"); + assert_eq!( + TerminationCause::EarlyStop { at: 2 }.label(), + "early_stop@2" + ); + } +} diff --git a/crates/larql-inference/src/experts/arith/extract.rs b/crates/larql-inference/src/experts/arith/extract.rs new file mode 100644 index 000000000..cb0609f10 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/extract.rs @@ -0,0 +1,260 @@ +//! Extraction for the arithmetic expert (spec §4). +//! +//! **Explicit path:** symbolic parse of operand digit spans and operators off +//! the prompt surface — exact by construction, zero tokens. The same scanner +//! is the tier-0 gate (`gate.rs`), so a tier-0 fire implies the symbolic +//! extract succeeds: fire ⇒ extraction, the A10 invariant, holds by +//! construction. +//! +//! **Disguised path:** 2-shot rewrite prompt → parse the *emitted expression*. +//! The parser reads the model's expression, never its sum (rigging-proofed by +//! design — anything after `=` is discarded). + +use super::alu::{BigInt, Expr, Op}; + +/// One lexed token of the prompt surface. `Other` breaks operand/operator +/// adjacency so unrelated numbers never join into an expression. +#[derive(Debug, Clone, PartialEq, Eq)] +enum Tok { + Num(String), + Op(Op), + Other, +} + +/// Scan `text` for the longest explicit integer chain `N op N (op N)*`. +/// +/// Operator rules (distractor protection — gate specificity is the contract): +/// - `+`, `*`, `×`, `−` (U+2212) count anywhere between digit spans; +/// - ASCII `-` counts only with whitespace on both sides, so dates +/// (`2026-06-11`), ranges (`5-10`) and phone formats never fire; +/// - `x`/`X` counts only as a standalone word between digit spans (`3 x 4`); +/// - `/` never counts — division is OPEN in v0.1 and `06/11` is a date. +/// +/// Numbers absorb `1,234,567`-style thousands separators and `_` separators. +pub fn find_expression(text: &str) -> Option { + let toks = lex(text); + let mut best: Option<(usize, usize)> = None; // (start, op_count) + + let mut i = 0; + while i < toks.len() { + if matches!(toks[i], Tok::Num(_)) { + let mut j = i; + let mut ops = 0usize; + while matches!(toks.get(j + 1), Some(Tok::Op(_))) + && matches!(toks.get(j + 2), Some(Tok::Num(_))) + { + ops += 1; + j += 2; + } + if ops > 0 && best.map(|(_, b)| ops > b).unwrap_or(true) { + best = Some((i, ops)); + } + i = j + 1; + } else { + i += 1; + } + } + + let (start, op_count) = best?; + let mut operands = Vec::with_capacity(op_count + 1); + let mut ops = Vec::with_capacity(op_count); + for k in 0..=op_count { + let Tok::Num(s) = &toks[start + 2 * k] else { + return None; + }; + operands.push(BigInt::parse(s)?); + if k < op_count { + let Tok::Op(op) = &toks[start + 2 * k + 1] else { + return None; + }; + ops.push(*op); + } + } + Some(Expr { operands, ops }) +} + +fn lex(text: &str) -> Vec { + let chars: Vec = text.chars().collect(); + let mut toks = Vec::new(); + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + if c.is_ascii_digit() { + let mut num = String::new(); + while i < chars.len() { + let c = chars[i]; + if c.is_ascii_digit() { + num.push(c); + i += 1; + } else if (c == ',' || c == '_') + && chars.get(i + 1).is_some_and(|n| n.is_ascii_digit()) + { + // Separator inside a number; keep digits only. + i += 1; + } else { + break; + } + } + toks.push(Tok::Num(num)); + continue; + } + if c.is_whitespace() { + i += 1; + continue; + } + let ws_before = i == 0 || chars[i - 1].is_whitespace(); + let ws_after = i + 1 >= chars.len() || chars[i + 1].is_whitespace(); + let op = match c { + '+' => Some(Op::Add), + '*' | '×' => Some(Op::Mul), + '−' => Some(Op::Sub), + '-' if ws_before && ws_after => Some(Op::Sub), + 'x' | 'X' if ws_before && ws_after => Some(Op::Mul), + _ => None, + }; + match op { + Some(op) => toks.push(Tok::Op(op)), + None => toks.push(Tok::Other), + } + i += 1; + } + toks +} + +/// The 2-shot rewrite prompt (the measured A8 floor — deliberately untuned; +/// structured-output extraction is the OPEN improvement, not this prompt). +pub fn rewrite_prompt(question: &str) -> String { + format!( + "Rewrite each question as a bare arithmetic expression. Do not solve it.\n\ + Q: If you have 7 apples and pick 5 more, how many apples do you have?\n\ + E: 7 + 5\n\ + Q: A crate holds 240 bottles. How many bottles are in 3 crates?\n\ + E: 240 * 3\n\ + Q: {question}\n\ + E:" + ) +} + +/// Parse the model-emitted rewrite. First emitted line only, truncated at +/// `=` so the model's own sum — if it volunteers one — is never consumed. +pub fn parse_rewrite(generated: &str) -> Option { + let line = generated.trim_start().lines().next()?; + let line = line.split(['=', '\u{ff1d}']).next().unwrap_or(line); + find_expression(line) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(text: &str) -> Option { + find_expression(text).map(|e| format!("{e} -> {}", e.eval())) + } + + #[test] + fn explicit_forms_parse_exactly() { + assert_eq!(parse("12 + 7 ="), Some("12 + 7 -> 19".into())); + assert_eq!( + parse("What is 123456 + 654321?"), + Some("123456 + 654321 -> 777777".into()) + ); + assert_eq!(parse("12345 * 6789"), Some("12345 * 6789 -> 83810205".into())); + assert_eq!(parse("12×34"), Some("12 * 34 -> 408".into())); + assert_eq!(parse("3 x 4"), Some("3 * 4 -> 12".into())); + assert_eq!(parse("100000 - 1 ="), Some("100000 - 1 -> 99999".into())); + assert_eq!(parse("47−5"), Some("47 - 5 -> 42".into())); + } + + #[test] + fn two_op_chains_parse() { + assert_eq!( + parse("999 + 111 - 222 ="), + Some("999 + 111 - 222 -> 888".into()) + ); + assert_eq!(parse("2 + 3 * 4"), Some("2 + 3 * 4 -> 14".into())); + } + + #[test] + fn thousands_separators_absorb_into_one_operand() { + assert_eq!( + parse("1,234,567 + 1"), + Some("1234567 + 1 -> 1234568".into()) + ); + assert_eq!(parse("1_000 + 24"), Some("1000 + 24 -> 1024".into())); + } + + #[test] + fn expression_stops_at_equals_never_reads_the_sum() { + // The chain is "12 + 7"; the 19 after '=' must not join it. + let e = find_expression("12 + 7 = 19").expect("expr"); + assert_eq!(e.to_string(), "12 + 7"); + } + + #[test] + fn distractors_do_not_parse() { + for text in [ + "My phone number is 4415550172.", + "The meeting is on 2026-06-11.", + "Trains depart at 18:45 from platform 3.", + "Order 66 was executed in 19 BBY.", + "What is the capital of France?", + "Account 123456789012345678901234567890 is active.", + "It takes 5-10 days to ship.", + "The score was 3/4.", + "version 1.2.3 released", + ] { + assert!( + find_expression(text).is_none(), + "false parse on distractor: {text:?}" + ); + } + } + + #[test] + fn hyphen_needs_whitespace_both_sides() { + assert!(find_expression("100-1").is_none()); + assert!(find_expression("100- 1").is_none()); + assert!(find_expression("100 -1").is_none()); + assert!(find_expression("100 - 1").is_some()); + } + + #[test] + fn x_must_be_standalone() { + assert!(find_expression("3x4").is_none(), "3x4 could be a label"); + assert!(find_expression("matrix 3 x 4").is_some()); + } + + #[test] + fn longest_chain_wins() { + // Two candidate chains; the 3-operand one is the expression. + let e = find_expression("page 7 + 1, then 10 + 20 + 30").expect("expr"); + assert_eq!(e.to_string(), "10 + 20 + 30"); + } + + #[test] + fn rewrite_prompt_embeds_question_and_two_shots() { + let p = rewrite_prompt("If a box holds 12 eggs, how many in 4 boxes?"); + assert!(p.contains("7 + 5")); + assert!(p.contains("240 * 3")); + assert!(p.ends_with("E:")); + assert!(p.contains("how many in 4 boxes?")); + } + + #[test] + fn parse_rewrite_reads_first_line_only() { + let e = parse_rewrite(" 240 * 3\nQ: another question\nE: 1 + 1").expect("expr"); + assert_eq!(e.to_string(), "240 * 3"); + } + + #[test] + fn parse_rewrite_discards_a_volunteered_sum() { + let e = parse_rewrite(" 7 + 5 = 12").expect("expr"); + assert_eq!(e.to_string(), "7 + 5"); + } + + #[test] + fn parse_rewrite_misses_on_garbage() { + assert!(parse_rewrite("I cannot rewrite that.").is_none()); + assert!(parse_rewrite("").is_none()); + } +} diff --git a/crates/larql-inference/src/experts/arith/gate.rs b/crates/larql-inference/src/experts/arith/gate.rs new file mode 100644 index 000000000..3e991fea4 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/gate.rs @@ -0,0 +1,181 @@ +//! Gate for the arithmetic expert (spec §3). Two tiers: +//! +//! - **Tier 0 — symbolic.** The explicit-expression scanner from +//! `extract.rs` run over the prompt surface. Sharing the scanner with the +//! extractor makes "tier-0 fire ⇒ symbolic extract succeeds" true by +//! construction (the A10 fire × extract invariant). +//! - **Tier 1 — engagement probe.** Ridge probe on the residual at the probe +//! layer, last prompt token, reading arithmetic-engagement exhaust. Probe +//! weights are per-checkpoint artifacts shipped alongside the vindex +//! (see `probe_weights/README.md`); current weights are SUB-SPEC on +//! sensitivity and hardening is required before the probe is sole trigger. +//! +//! Policy: tier-0 fire OR tier-1 fire ⇒ dispatch. No fire ⇒ native path +//! untouched. + +use serde::{Deserialize, Serialize}; + +use crate::experts::virtual_expert::{Fire, ResidualTap}; + +use super::extract::find_expression; + +/// Tier-0 symbolic scan: does the prompt surface carry an explicit integer +/// expression (operator adjacent to digit spans)? +pub fn tier0_fires(prompt_text: &str) -> bool { + find_expression(prompt_text).is_some() +} + +/// Ridge probe artifact: linear readout over the residual at `layer`, last +/// prompt token. Versioned per model checkpoint; re-fit per checkpoint. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RidgeProbe { + /// Model checkpoint the weights were fit on (informational). + #[serde(default)] + pub model: String, + /// Residual layer the probe reads (L8 on Gemma-3-4b; treat as a depth + /// fraction when porting — the relative-depth law is the ASSUMED part). + pub layer: usize, + pub weights: Vec, + pub bias: f32, + /// Fire when `score >= threshold`. + pub threshold: f32, +} + +impl RidgeProbe { + /// Load a probe artifact (JSON) from disk. + pub fn load(path: &std::path::Path) -> Result { + let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?; + serde_json::from_slice(&bytes).map_err(|e| format!("parse {}: {e}", path.display())) + } + + /// Probe score for a tap, or `None` when the tap doesn't carry this + /// probe's layer or the dimension mismatches (must never fire on a + /// mismatched tap). + pub fn score(&self, tap: &ResidualTap) -> Option { + let residual = tap.residual_at(self.layer)?; + if residual.len() != self.weights.len() { + return None; + } + let dot: f32 = self + .weights + .iter() + .zip(residual.iter()) + .map(|(w, x)| w * x) + .sum(); + Some(dot + self.bias) + } +} + +/// Combined gate policy. Tier 0 is checked first (cost ~0); the probe only +/// decides when the surface scan is silent and a tap was captured. +pub fn gate(probe: Option<&RidgeProbe>, tap: Option<&ResidualTap>, prompt_text: &str) -> Fire { + if tier0_fires(prompt_text) { + return Fire::Tier0; + } + if let (Some(probe), Some(tap)) = (probe, tap) { + if let Some(score) = probe.score(tap) { + if score >= probe.threshold { + return Fire::Tier1(score); + } + } + } + Fire::No +} + +#[cfg(test)] +mod tests { + use super::*; + + fn probe(layer: usize, dim: usize, threshold: f32) -> RidgeProbe { + // weights = [1, 0, 0, ...] → score = residual[0] + bias + let mut weights = vec![0.0; dim]; + weights[0] = 1.0; + RidgeProbe { + model: "test".into(), + layer, + weights, + bias: 0.0, + threshold, + } + } + + fn tap(layer: usize, first: f32, dim: usize) -> ResidualTap { + let mut residual = vec![0.0; dim]; + residual[0] = first; + ResidualTap::single(layer, residual) + } + + #[test] + fn tier0_fires_on_explicit_math_only() { + assert!(tier0_fires("123456 + 654321 =")); + assert!(tier0_fires("what is 12345 * 6789?")); + assert!(!tier0_fires("My phone number is 4415550172.")); + assert!(!tier0_fires("The meeting is on 2026-06-11.")); + assert!(!tier0_fires("What is the capital of France?")); + } + + #[test] + fn probe_scores_matching_tap() { + let p = probe(8, 4, 0.5); + assert_eq!(p.score(&tap(8, 0.9, 4)), Some(0.9)); + } + + #[test] + fn probe_refuses_layer_or_dim_mismatch() { + let p = probe(8, 4, 0.5); + assert_eq!(p.score(&tap(9, 0.9, 4)), None, "wrong layer"); + assert_eq!(p.score(&tap(8, 0.9, 5)), None, "wrong dim"); + } + + #[test] + fn probe_selects_its_layer_from_a_multi_layer_tap() { + let p = probe(8, 4, 0.5); + let mut r8 = vec![0.0; 4]; + r8[0] = 0.7; + let tap = ResidualTap::from(vec![(4, vec![9.0; 4]), (8, r8), (16, vec![9.0; 4])]); + assert_eq!(p.score(&tap), Some(0.7)); + assert_eq!(tap.residual_at(16), Some(&[9.0f32; 4][..])); + assert_eq!(tap.residual_at(5), None); + assert_eq!(tap.layers().len(), 3); + } + + #[test] + fn gate_prefers_tier0() { + let p = probe(8, 4, 0.5); + let t = tap(8, 0.9, 4); + assert_eq!(gate(Some(&p), Some(&t), "12 + 7 ="), Fire::Tier0); + } + + #[test] + fn gate_tier1_fires_above_threshold() { + let p = probe(8, 4, 0.5); + let t = tap(8, 0.9, 4); + match gate(Some(&p), Some(&t), "If you have seven apples...") { + Fire::Tier1(s) => assert!((s - 0.9).abs() < 1e-6), + other => panic!("expected tier1, got {other:?}"), + } + } + + #[test] + fn gate_no_fire_below_threshold_or_without_tap() { + let p = probe(8, 4, 0.5); + let cold = tap(8, 0.1, 4); + assert_eq!(gate(Some(&p), Some(&cold), "plain prose"), Fire::No); + assert_eq!(gate(Some(&p), None, "plain prose"), Fire::No); + assert_eq!(gate(None, None, "plain prose"), Fire::No); + } + + #[test] + fn probe_load_roundtrip_and_errors() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("probe.json"); + let p = probe(8, 3, 0.25); + std::fs::write(&path, serde_json::to_vec(&p).expect("ser")).expect("write"); + let loaded = RidgeProbe::load(&path).expect("load"); + assert_eq!(loaded.layer, 8); + assert_eq!(loaded.weights.len(), 3); + assert!(RidgeProbe::load(&dir.path().join("missing.json")).is_err()); + std::fs::write(&path, b"not json").expect("write"); + assert!(RidgeProbe::load(&path).is_err()); + } +} diff --git a/crates/larql-inference/src/experts/arith/mod.rs b/crates/larql-inference/src/experts/arith/mod.rs new file mode 100644 index 000000000..636377979 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/mod.rs @@ -0,0 +1,509 @@ +//! Arithmetic Virtual Expert (AVE) — instance #1 of [`VirtualExpert`]. +//! +//! Spec: `docs/specs/virtual-experts/arithmetic-virtual-expert.md`. +//! +//! The model is an I/O system, not a calculator: it supplies digit +//! decomposition, an involuntary engagement signal, operand extraction, a +//! magnitude prior and a fluent readout — it structurally cannot supply the +//! serial algorithm. So **fired ⇒ dispatch, always**: the gate fires, the +//! payload is extracted (symbolically or via model rewrite), the ALU computes +//! exactly, and the answer is forced back through the sampler with +//! schedule-end termination. The model's own arithmetic output is consumed +//! only as a verification prior. + +pub mod alu; +pub mod drive; +pub mod extract; +pub mod gate; +pub mod verify; + +use larql_models::ModelWeights; +use larql_vindex::VectorIndex; +use serde::Serialize; +use tokenizers::Tokenizer; + +use crate::vindex::generate_kquant_cpu; + +use super::virtual_expert::{DriveSchedule, ExtractMiss, Fire, ResidualTap, Verdict, VirtualExpert}; +use alu::{BigInt, Expr}; +use drive::TerminationCause; + +/// Compute result plus the operand width that scopes the verify prior. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ArithAnswer { + pub value: BigInt, + pub max_operand_digits: usize, +} + +/// The arithmetic expert: tier-0 symbolic gate always on; tier-1 engagement +/// probe when a per-checkpoint artifact is loaded. +#[derive(Debug, Clone, Default)] +pub struct ArithmeticExpert { + pub probe: Option, +} + +impl ArithmeticExpert { + pub fn new() -> Self { + Self::default() + } + + pub fn with_probe(probe: gate::RidgeProbe) -> Self { + ArithmeticExpert { probe: Some(probe) } + } +} + +impl VirtualExpert for ArithmeticExpert { + type Payload = Expr; + type Answer = ArithAnswer; + + fn name(&self) -> &'static str { + "arith" + } + + fn gate(&self, tap: Option<&ResidualTap>, prompt_text: &str) -> Fire { + gate::gate(self.probe.as_ref(), tap, prompt_text) + } + + fn extract(&self, prompt_text: &str, rewrite: Option<&str>) -> Result { + match rewrite { + Some(r) => extract::parse_rewrite(r) + .ok_or_else(|| ExtractMiss(format!("unparseable rewrite: {:?}", r.trim()))), + None => extract::find_expression(prompt_text) + .ok_or_else(|| ExtractMiss("no explicit expression on prompt surface".into())), + } + } + + fn compute(&self, payload: &Expr) -> ArithAnswer { + ArithAnswer { + value: payload.eval(), + max_operand_digits: payload.max_operand_digits(), + } + } + + fn drive(&self, answer: &ArithAnswer) -> DriveSchedule { + // Leading space: the answer rides the position the model was about + // to emit into (typically after "=" or a question mark). + DriveSchedule { + text: format!(" {}", answer.value), + } + } + + fn verify(&self, answer: &ArithAnswer, native: Option<&str>) -> Verdict { + verify::magnitude_prior(&answer.value, native, answer.max_operand_digits) + } +} + +/// Which arm of the state machine handled the item. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum AvePath { + /// No fire — native path untouched. + Native, + /// Fired, symbolic extract, forced decode. + ForcedExplicit, + /// Fired, rewrite extract, forced decode. + ForcedRewrite, + /// Fired but extraction missed — native + `extract_miss` flag. + NativeExtractMiss, +} + +/// Controller options. +#[derive(Debug, Clone)] +pub struct AveOptions { + /// Token budget for the native path (no fire / extract miss). + pub max_native_tokens: usize, + /// Disguised path: allow the 2-shot rewrite when symbolic extract misses. + pub enable_rewrite: bool, + /// Token budget for the rewrite segment (~2× the native answer is the + /// measured floor; budget is a hard cap, not a target). + pub rewrite_max_tokens: usize, +} + +impl Default for AveOptions { + fn default() -> Self { + AveOptions { + max_native_tokens: 64, + enable_rewrite: true, + rewrite_max_tokens: 48, + } + } +} + +/// Per-item telemetry (mandatory — the A10 lesson: per-item logging turns a +/// rerun into a grep). Every field feeds the batch-level mutual-consistency +/// check. +#[derive(Debug, Clone, Serialize)] +pub struct AveTelemetry { + pub fire: String, + pub path: AvePath, + pub expression: Option, + pub alu_result: Option, + pub emitted: String, + pub termination: String, + pub verify: String, + pub flags: Vec, + /// Tokens spent on the rewrite segment (disguised path only). + pub rewrite_tokens: usize, + /// Tokens emitted on the answer segment (forced or native). + pub answer_tokens: usize, +} + +/// Outcome of one controller run. +#[derive(Debug, Clone)] +pub struct AveOutcome { + pub path: AvePath, + pub fire: Fire, + /// Exact ALU answer, when the dispatch path ran. + pub answer: Option, + /// Full emitted string (forced schedule or native generation). + pub emitted: String, + pub telemetry: AveTelemetry, +} + +/// State machine (spec §7) over the CPU Q4_K decode path: +/// +/// ```text +/// IDLE → (prompt pass; tap) +/// ├─ no fire ────────────→ NATIVE (untouched) +/// └─ fire (T0|T1) → EXTRACT +/// ├─ symbolic ok ──→ COMPUTE → DRIVE(forced) → TERMINATE → VERIFY? → IDLE +/// ├─ rewrite ok ───→ COMPUTE → DRIVE → TERMINATE → VERIFY? → IDLE +/// └─ extract miss ─→ NATIVE + flag +/// ``` +/// +/// `tap` is the caller's residual capture for the tier-1 probe (in +/// production a free read off the prompt pass; `None` runs tier-0 only). +/// The verify leg runs only when a native answer happens to exist — the +/// dispatch path never spends tokens producing one (`Verdict::Skipped`). +pub fn ave_generate_kquant( + expert: &ArithmeticExpert, + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + index: &VectorIndex, + prompt: &str, + tap: Option<&ResidualTap>, + opts: &AveOptions, +) -> Result { + let prompt_ids = tokenizer + .encode(prompt, true) + .map_err(|e| format!("tokenize prompt: {e}"))? + .get_ids() + .to_vec(); + + let fire = expert.gate(tap, prompt); + if !fire.fired() { + let (emitted, n) = run_native(weights, tokenizer, index, &prompt_ids, opts); + return Ok(outcome_native(AvePath::Native, fire, emitted, n, vec![])); + } + + // EXTRACT: symbolic first (zero tokens), rewrite fallback if enabled. + let mut rewrite_tokens = 0usize; + let (expr, path) = match expert.extract(prompt, None) { + Ok(expr) => (expr, AvePath::ForcedExplicit), + Err(_) if opts.enable_rewrite => { + let rp = extract::rewrite_prompt(prompt); + let rids = tokenizer + .encode(rp.as_str(), true) + .map_err(|e| format!("tokenize rewrite prompt: {e}"))? + .get_ids() + .to_vec(); + let rew = generate_kquant_cpu(weights, tokenizer, &rids, opts.rewrite_max_tokens, index); + rewrite_tokens = rew.len(); + let rew_text: String = rew.iter().map(|(t, _)| t.as_str()).collect(); + match expert.extract(prompt, Some(&rew_text)) { + Ok(expr) => (expr, AvePath::ForcedRewrite), + Err(miss) => { + let (emitted, n) = run_native(weights, tokenizer, index, &prompt_ids, opts); + let mut out = outcome_native( + AvePath::NativeExtractMiss, + fire, + emitted, + n, + vec!["extract_miss".into(), miss.0], + ); + out.telemetry.rewrite_tokens = rewrite_tokens; + return Ok(out); + } + } + } + Err(miss) => { + let (emitted, n) = run_native(weights, tokenizer, index, &prompt_ids, opts); + return Ok(outcome_native( + AvePath::NativeExtractMiss, + fire, + emitted, + n, + vec!["extract_miss".into(), miss.0], + )); + } + }; + + // COMPUTE → DRIVE(forced, schedule) → TERMINATE. + let answer = expert.compute(&expr); + let schedule = expert.drive(&answer); + let schedule_ids = schedule.forced_ids(tokenizer); + let fd = drive::force_decode_kquant(weights, tokenizer, index, &prompt_ids, &schedule_ids); + + // VERIFY?: no native answer was produced on this path — prior skipped. + let verdict = expert.verify(&answer, None); + + let mut flags = Vec::new(); + if matches!(fd.cause, TerminationCause::EarlyStop { .. }) { + flags.push("early_stop".into()); + } + if let Verdict::Suspect(_) = &verdict { + flags.push("extract_suspect".into()); + } + + let telemetry = AveTelemetry { + fire: fire.label(), + path, + expression: Some(expr.to_string()), + alu_result: Some(answer.value.to_string()), + emitted: fd.emitted.clone(), + termination: fd.cause.label(), + verify: verdict.label(), + flags, + rewrite_tokens, + answer_tokens: fd.ids.len(), + }; + Ok(AveOutcome { + path, + fire, + answer: Some(answer.value.to_string()), + emitted: fd.emitted, + telemetry, + }) +} + +fn run_native( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + index: &VectorIndex, + prompt_ids: &[u32], + opts: &AveOptions, +) -> (String, usize) { + let out = generate_kquant_cpu(weights, tokenizer, prompt_ids, opts.max_native_tokens, index); + let n = out.len(); + (out.into_iter().map(|(t, _)| t).collect(), n) +} + +fn outcome_native( + path: AvePath, + fire: Fire, + emitted: String, + answer_tokens: usize, + flags: Vec, +) -> AveOutcome { + AveOutcome { + path, + fire, + answer: None, + emitted: emitted.clone(), + telemetry: AveTelemetry { + fire: fire.label(), + path, + expression: None, + alu_result: None, + emitted, + termination: "native".into(), + verify: Verdict::Skipped.label(), + flags, + rewrite_tokens: 0, + answer_tokens, + }, + } +} + +/// Batch-level mutual-consistency check (spec §7): the controller should +/// assert `fleet ≈ fire·dispatch + (1−fire)·native` and alarm on violation — +/// table arithmetic is a control surface. Returns the residual. +pub fn decomposition_residual( + fleet_accuracy: f64, + fire_rate: f64, + dispatch_accuracy: f64, + native_accuracy_unfired: f64, +) -> f64 { + fleet_accuracy - (fire_rate * dispatch_accuracy + (1.0 - fire_rate) * native_accuracy_unfired) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::{make_test_q4k_vindex, make_test_q4k_weights, synthetic_tokenizer_json}; + + /// Fixture tokenizer with `[UNK]` mapped to id 0 (a real vocab slot), so + /// free-text prompts — which the WordLevel fixture can't represent — + /// still encode to in-range ids and the forward pass runs. + fn fixture() -> (ModelWeights, VectorIndex, Tokenizer) { + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let tokenizer = + Tokenizer::from_bytes(synthetic_tokenizer_json(weights.vocab_size).as_bytes()) + .expect("fixture tokenizer"); + (weights, index, tokenizer) + } + + /// Probe that fires on any matching tap (threshold below any score). + fn always_fire_probe(dim: usize) -> gate::RidgeProbe { + gate::RidgeProbe { + model: "fixture".into(), + layer: 8, + weights: vec![0.0; dim], + bias: 1.0, + threshold: 0.5, + } + } + + #[test] + fn expert_trait_explicit_pipeline_is_exact() { + let ave = ArithmeticExpert::new(); + assert_eq!(ave.name(), "arith"); + let fire = ave.gate(None, "123456 + 654321 ="); + assert_eq!(fire, Fire::Tier0); + let expr = ave.extract("123456 + 654321 =", None).expect("extract"); + let answer = ave.compute(&expr); + assert_eq!(answer.value.to_string(), "777777"); + assert_eq!(answer.max_operand_digits, 6); + assert_eq!(ave.drive(&answer).text, " 777777"); + assert_eq!(ave.verify(&answer, Some("777,777")), Verdict::Consistent); + assert_eq!(ave.verify(&answer, None), Verdict::Skipped); + } + + #[test] + fn extract_miss_carries_a_reason() { + let ave = ArithmeticExpert::new(); + let miss = ave.extract("no math here", None).expect_err("miss"); + assert!(miss.0.contains("no explicit expression")); + let miss = ave + .extract("ignored", Some("I will not rewrite that")) + .expect_err("miss"); + assert!(miss.0.contains("unparseable rewrite")); + } + + #[test] + fn controller_no_fire_takes_native_path() { + let (mut weights, index, tokenizer) = fixture(); + let ave = ArithmeticExpert::new(); + let opts = AveOptions { + max_native_tokens: 3, + ..AveOptions::default() + }; + // "[1] [2]" carries digits but no operator — must not fire. + let out = ave_generate_kquant( + &ave, &mut weights, &tokenizer, &index, "[1] [2]", None, &opts, + ) + .expect("run"); + assert_eq!(out.path, AvePath::Native); + assert_eq!(out.fire, Fire::No); + assert!(out.answer.is_none()); + assert_eq!(out.telemetry.termination, "native"); + } + + #[test] + fn controller_tier1_fire_with_unparseable_rewrite_flags_extract_miss() { + let (mut weights, index, tokenizer) = fixture(); + let dim = weights.hidden_size; + let ave = ArithmeticExpert::with_probe(always_fire_probe(dim)); + let tap = ResidualTap::single(8, vec![0.0; dim]); + let opts = AveOptions { + max_native_tokens: 2, + rewrite_max_tokens: 2, + enable_rewrite: true, + }; + // No explicit expression; probe fires; the fixture model's rewrite + // output ("[N]" tokens) is unparseable → NATIVE + extract_miss flag. + let out = ave_generate_kquant( + &ave, + &mut weights, + &tokenizer, + &index, + "[1] [2]", + Some(&tap), + &opts, + ) + .expect("run"); + assert_eq!(out.path, AvePath::NativeExtractMiss); + assert!(matches!(out.fire, Fire::Tier1(_))); + assert!(out.telemetry.flags.iter().any(|f| f == "extract_miss")); + assert!(out.telemetry.rewrite_tokens <= 2); + } + + #[test] + fn controller_tier1_fire_with_rewrite_disabled_falls_native() { + let (mut weights, index, tokenizer) = fixture(); + let dim = weights.hidden_size; + let ave = ArithmeticExpert::with_probe(always_fire_probe(dim)); + let tap = ResidualTap::single(8, vec![0.0; dim]); + let opts = AveOptions { + max_native_tokens: 2, + enable_rewrite: false, + ..AveOptions::default() + }; + let out = ave_generate_kquant( + &ave, + &mut weights, + &tokenizer, + &index, + "[1] [2]", + Some(&tap), + &opts, + ) + .expect("run"); + assert_eq!(out.path, AvePath::NativeExtractMiss); + assert_eq!(out.telemetry.rewrite_tokens, 0); + } + + #[test] + fn controller_explicit_fire_forces_the_schedule() { + let (mut weights, index, tokenizer) = fixture(); + let ave = ArithmeticExpert::new(); + // Tier-0 fires on the prompt text; the WordLevel fixture tokenizer + // can't encode " 19", so the schedule is empty — the state machine + // still walks COMPUTE → DRIVE → TERMINATE and reports exactly that. + let out = ave_generate_kquant( + &ave, + &mut weights, + &tokenizer, + &index, + "12 + 7 =", + None, + &AveOptions::default(), + ) + .expect("run"); + assert_eq!(out.path, AvePath::ForcedExplicit); + assert_eq!(out.fire, Fire::Tier0); + assert_eq!(out.answer.as_deref(), Some("19")); + assert_eq!(out.telemetry.expression.as_deref(), Some("12 + 7")); + assert_eq!(out.telemetry.termination, "schedule_end"); + assert_eq!(out.telemetry.verify, "skipped"); + } + + #[test] + fn decomposition_residual_is_zero_when_table_is_consistent() { + // fleet = fire·dispatch + (1−fire)·native exactly. + let r = decomposition_residual(0.92, 0.9, 1.0, 0.2); + assert!(r.abs() < 1e-12, "residual {r}"); + // And alarms (nonzero) when the table is inconsistent. + assert!(decomposition_residual(0.5, 0.9, 1.0, 0.2).abs() > 0.1); + } + + #[test] + fn telemetry_serializes_for_per_item_logs() { + let t = AveTelemetry { + fire: "tier0".into(), + path: AvePath::ForcedExplicit, + expression: Some("12 + 7".into()), + alu_result: Some("19".into()), + emitted: " 19".into(), + termination: "schedule_end".into(), + verify: "skipped".into(), + flags: vec![], + rewrite_tokens: 0, + answer_tokens: 2, + }; + let json = serde_json::to_string(&t).expect("json"); + assert!(json.contains("\"forced_explicit\"")); + assert!(json.contains("schedule_end")); + } +} diff --git a/crates/larql-inference/src/experts/arith/probe_weights/README.md b/crates/larql-inference/src/experts/arith/probe_weights/README.md new file mode 100644 index 000000000..bd246be32 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/probe_weights/README.md @@ -0,0 +1,38 @@ +# Engagement-probe artifacts — audit schema + +Per-checkpoint ridge-probe weights over the residual stream, last prompt token. + +**Status (A11): the probe is an audit instrument, not a gate component.** The +gate-hardening workstream is deleted; AVE v0.1 gates on tier-0 (symbolic) only, +and no-fire ⇒ native is the designed fallthrough (spec §3). This schema is +retained so engagement exhaust can be *audited* offline — scoring taps against +a probe to study exhaust generality (does dates/units/sorting share the +"bounded computation straining" signal, or emit separable signatures?) — and so +a future instrument-science result can ship weights without a format change. + +Artifact format (JSON, loaded by `gate::RidgeProbe::load`): + +```json +{ + "model": "gemma-3-4b-it", + "layer": 8, + "weights": [0.013, -0.002, ...], + "bias": -0.41, + "threshold": 0.5 +} +``` + +- `layer` — residual layer the probe reads, last prompt token (L8 ≈ 24% depth + on Gemma-3-4b; the relative-depth framing is the ASSUMED porting hypothesis). +- `weights` — ridge readout, one weight per hidden dim (λ ∝ mean feature norm + at fit time, per A7b). +- `threshold` — `dot(weights, residual) + bias >= threshold`. For audit runs + the score matters more than the threshold; for any future gate use, bias + toward firing (false-fire cost is one wasted extraction). +- A `RidgeProbe` scores a multi-layer `ResidualTap` by selecting its own + `layer` from the tap; a tap that doesn't carry the probe's layer (or has a + dimension mismatch) scores `None`, never fires. + +Status 2026-06-12: **no artifact present.** The A7b probe was fit on the +bf16/MLX pipeline outside this repo (measured: specificity 1.00, sensitivity +0.17–1.0 by phrasing — the unevenness that, with A11, demoted it). diff --git a/crates/larql-inference/src/experts/arith/verify.rs b/crates/larql-inference/src/experts/arith/verify.rs new file mode 100644 index 000000000..de721d3f7 --- /dev/null +++ b/crates/larql-inference/src/experts/arith/verify.rs @@ -0,0 +1,176 @@ +//! Verify leg (spec §6): the model's native answer is a **magnitude prior**, +//! nothing more — a tripwire for extraction bugs, never a judge of the exact +//! compute. +//! +//! Measured envelope (A4c/A5): native near-misses are magnitude-correct to +//! ~±25–35% through ~24-digit operands and collapse past ~28 digits. HARD +//! RULE: the prior is void past 24-digit operands. Thresholds here are +//! ASSUMED until an assembly increment measures the false-flag rate (the +//! spec pre-registered this leg out of A10). + +use crate::experts::virtual_expert::Verdict; + +use super::alu::BigInt; + +/// Operand width (decimal digits) past which the prior is void. +pub const PRIOR_VOID_OPERAND_DIGITS: usize = 24; + +/// Accept ratios in `[1/RATIO_BOUND, RATIO_BOUND]` — covers the measured +/// ±25–35% envelope with margin. ASSUMED until the false-flag rate is run. +pub const RATIO_BOUND: f64 = 1.65; + +/// Compare the ALU answer's magnitude against the model's native answer, +/// if one was produced. +pub fn magnitude_prior( + answer: &BigInt, + native_text: Option<&str>, + max_operand_digits: usize, +) -> Verdict { + if max_operand_digits > PRIOR_VOID_OPERAND_DIGITS { + return Verdict::Skipped; + } + let Some(text) = native_text else { + return Verdict::Skipped; + }; + let Some(native) = first_number(text) else { + return Verdict::Skipped; + }; + + if answer.is_zero() || native.is_zero() { + return if answer.is_zero() && native.is_zero() { + Verdict::Consistent + } else { + Verdict::Suspect(format!("native {native} vs alu {answer} (zero mismatch)")) + }; + } + if answer.is_negative() != native.is_negative() { + return Verdict::Suspect(format!("native {native} vs alu {answer} (sign mismatch)")); + } + + let ratio = native.approx_magnitude() / answer.approx_magnitude(); + if (1.0 / RATIO_BOUND..=RATIO_BOUND).contains(&ratio) { + Verdict::Consistent + } else { + Verdict::Suspect(format!( + "native {native} vs alu {answer} (magnitude ratio {ratio:.2})" + )) + } +} + +/// First decimal number in free text (optional leading `-`, separators +/// stripped). The native answer may arrive embedded in prose. +fn first_number(text: &str) -> Option { + let chars: Vec = text.chars().collect(); + let mut i = 0; + while i < chars.len() { + if chars[i].is_ascii_digit() { + let neg = i > 0 + && chars[i - 1] == '-' + && (i == 1 || !chars[i - 2].is_ascii_digit()); + let mut digits = String::new(); + while i < chars.len() { + let c = chars[i]; + if c.is_ascii_digit() { + digits.push(c); + i += 1; + } else if (c == ',' || c == '_') + && chars.get(i + 1).is_some_and(|n| n.is_ascii_digit()) + { + i += 1; + } else { + break; + } + } + let s = if neg { format!("-{digits}") } else { digits }; + return BigInt::parse(&s); + } + i += 1; + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + fn big(s: &str) -> BigInt { + BigInt::parse(s).expect("parse") + } + + #[test] + fn consistent_when_native_matches_exactly() { + assert_eq!( + magnitude_prior(&big("777777"), Some("The answer is 777777."), 6), + Verdict::Consistent + ); + } + + #[test] + fn consistent_within_the_envelope() { + // 25% high: still a magnitude-correct near-miss. + assert_eq!( + magnitude_prior(&big("100000"), Some("125000"), 6), + Verdict::Consistent + ); + } + + #[test] + fn suspect_on_magnitude_blowout() { + // Digit count off by two — the swapped-operand class of extraction bug. + let v = magnitude_prior(&big("777777"), Some("about 7,777,777,777"), 6); + assert!(matches!(v, Verdict::Suspect(_)), "got {v:?}"); + } + + #[test] + fn suspect_on_sign_and_zero_mismatch() { + assert!(matches!( + magnitude_prior(&big("-7"), Some("7"), 2), + Verdict::Suspect(_) + )); + assert!(matches!( + magnitude_prior(&big("0"), Some("12"), 2), + Verdict::Suspect(_) + )); + assert_eq!(magnitude_prior(&big("0"), Some("0"), 2), Verdict::Consistent); + } + + #[test] + fn negative_native_in_prose_is_parsed() { + assert_eq!( + magnitude_prior(&big("-7"), Some("The result is -7."), 2), + Verdict::Consistent + ); + } + + #[test] + fn skipped_when_prior_is_void_or_native_absent() { + // 25-digit operands: past the magnitude wall, prior void. + assert_eq!( + magnitude_prior(&big("1"), Some("999"), PRIOR_VOID_OPERAND_DIGITS + 1), + Verdict::Skipped + ); + assert_eq!(magnitude_prior(&big("19"), None, 2), Verdict::Skipped); + assert_eq!( + magnitude_prior(&big("19"), Some("no number here"), 2), + Verdict::Skipped + ); + } + + #[test] + fn at_exactly_24_digit_operands_prior_still_applies() { + let a = big("999999999999999999999999"); + assert_eq!( + magnitude_prior(&a, Some("999999999999999999999999"), 24), + Verdict::Consistent + ); + } + + #[test] + fn first_number_takes_the_first_span_only() { + assert_eq!(first_number("19 then 42"), Some(big("19"))); + assert_eq!(first_number("= 1,234"), Some(big("1234"))); + assert_eq!(first_number("x-5y"), Some(big("-5"))); + assert_eq!(first_number("12-5"), Some(big("12"))); + assert_eq!(first_number(""), None); + } +} diff --git a/crates/larql-inference/src/experts/mod.rs b/crates/larql-inference/src/experts/mod.rs index 95193cf2a..91d8adc88 100644 --- a/crates/larql-inference/src/experts/mod.rs +++ b/crates/larql-inference/src/experts/mod.rs @@ -1,13 +1,32 @@ +//! Experts: model-adjacent compute the forward pass dispatches into. +//! +//! Two families live here: +//! - **WASM tool experts** (`caller`/`loader`/`registry`/`session`/…): the +//! model *emits* an op-call, the host parses and dispatches it into a +//! sandboxed WASM unit (`docs/virtual-experts-dispatch.md`). +//! - **Virtual experts** (`virtual_expert` + `arith`): invisible to the +//! model — a gate reads forward-pass exhaust, payloads are extracted +//! through the model's I/O, compute is external and exact, and the answer +//! is forced back through the sampler +//! (`docs/specs/virtual-experts/arithmetic-virtual-expert.md`). + +pub mod arith; pub mod caller; pub mod loader; pub mod mask; pub mod parser; pub mod registry; pub mod session; +pub mod virtual_expert; +pub use arith::{ + ave_generate_kquant, ArithAnswer, ArithmeticExpert, AveOptions, AveOutcome, AvePath, + AveTelemetry, +}; pub use caller::{ExpertMetadata, ExpertResult, OpSpec}; pub use loader::load_expert; pub use mask::OpNameMask; pub use parser::{parse_op_call, OpCall}; pub use registry::{ExpertHandle, ExpertRegistry, WasmInfo}; pub use session::{DispatchOutcome, DispatchSkip, Dispatcher, ExpertSession, FilteredDispatcher}; +pub use virtual_expert::{DriveSchedule, ExtractMiss, Fire, ResidualTap, Verdict, VirtualExpert}; diff --git a/crates/larql-inference/src/experts/virtual_expert.rs b/crates/larql-inference/src/experts/virtual_expert.rs new file mode 100644 index 000000000..6ec234368 --- /dev/null +++ b/crates/larql-inference/src/experts/virtual_expert.rs @@ -0,0 +1,215 @@ +//! `VirtualExpert` — the gate / extract / compute / drive / verify decomposition +//! for experts that ride the forward pass without touching weights or routing. +//! +//! Spec: `docs/specs/virtual-experts/arithmetic-virtual-expert.md` (§8). +//! Instance #1 is [`crate::experts::arith::ArithmeticExpert`]. +//! +//! Design constraints (baked in from the arithmetic_mechanism arc): +//! - the gate reads **exhaust, not intent** — an involuntary engagement signal +//! in the residual stream, plus a symbolic scan of the prompt surface; +//! - the expert is **invisible to the model** — no weights touched, no model +//! routing used; +//! - compute is **never** the model's — the model supplies I/O (extraction, +//! readout, a magnitude prior), the expert supplies the algorithm. + +use tokenizers::Tokenizer; + +/// Read-only residual capture, last prompt token, at one or more layers. +/// In production this is a free read off the prompt forward pass; harnesses +/// may populate it with `crate::forward::capture_residuals` (whose +/// `Vec<(layer, residual)>` output converts directly via `From`). +/// +/// Multi-layer so one capture can serve several experts/probes reading at +/// different depths — the tap is taken once per prompt pass, not per expert. +#[derive(Debug, Clone, Default)] +pub struct ResidualTap { + layers: Vec<(usize, Vec)>, +} + +impl ResidualTap { + /// Tap with a single captured layer. + pub fn single(layer: usize, residual: Vec) -> Self { + ResidualTap { + layers: vec![(layer, residual)], + } + } + + /// The residual captured at `layer`, if that layer was tapped. + pub fn residual_at(&self, layer: usize) -> Option<&[f32]> { + self.layers + .iter() + .find(|(l, _)| *l == layer) + .map(|(_, r)| r.as_slice()) + } + + /// All captured `(layer, residual)` pairs. + pub fn layers(&self) -> &[(usize, Vec)] { + &self.layers + } +} + +impl From)>> for ResidualTap { + fn from(layers: Vec<(usize, Vec)>) -> Self { + ResidualTap { layers } + } +} + +/// Gate decision. A fire is a dispatch decision: fired ⇒ dispatch, always. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Fire { + /// Native path untouched. + No, + /// Tier-0 symbolic scan fired (explicit math on the prompt surface). + Tier0, + /// Tier-1 engagement probe fired, with the probe score. + Tier1(f32), +} + +impl Fire { + pub fn fired(&self) -> bool { + !matches!(self, Fire::No) + } + + /// Telemetry label ("no" | "tier0" | "tier1(score)"). + pub fn label(&self) -> String { + match self { + Fire::No => "no".to_string(), + Fire::Tier0 => "tier0".to_string(), + Fire::Tier1(s) => format!("tier1({s:.3})"), + } + } +} + +/// Extraction failed; controller falls to native and flags `extract_miss`. +#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)] +#[error("extract miss: {0}")] +pub struct ExtractMiss(pub String); + +/// Verify-leg verdict. The native answer is a magnitude **prior**, not a +/// judge: `Suspect` flags a likely extraction bug, it never overrides the +/// exact compute. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Verdict { + /// No native answer available, or the prior is void (operand size past + /// its measured envelope). + Skipped, + /// ALU result magnitude-consistent with the model's native answer. + Consistent, + /// Magnitude mismatch — flag `extract_suspect`. + Suspect(String), +} + +impl Verdict { + /// Telemetry label. + pub fn label(&self) -> String { + match self { + Verdict::Skipped => "skipped".to_string(), + Verdict::Consistent => "consistent".to_string(), + Verdict::Suspect(r) => format!("suspect: {r}"), + } + } +} + +/// The answer text the controller forces at the sampler, one token per decode +/// step, then **terminates at schedule end** (delivery = 1.0 by construction — +/// the one observed delivery defect was post-schedule digit continuation). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DriveSchedule { + pub text: String, +} + +impl DriveSchedule { + /// Tokenize the schedule text into the forced token sequence (no special + /// tokens — the schedule rides the decode the model was emitting anyway). + pub fn forced_ids(&self, tokenizer: &Tokenizer) -> Vec { + tokenizer + .encode(self.text.as_str(), false) + .map(|e| e.get_ids().to_vec()) + .unwrap_or_default() + } +} + +/// An expert that gates on forward-pass exhaust, extracts a payload through +/// the model's I/O, computes externally and exactly, and drives the answer +/// back through the sampler. +pub trait VirtualExpert { + /// What extraction produces (e.g. a parsed arithmetic expression). + type Payload; + /// What compute produces (exact, external — never the model's). + type Answer; + + fn name(&self) -> &'static str; + + /// Gate on exhaust, not intent. `tap` is the residual capture for the + /// tier-1 probe when one is loaded; tier-0 scans the prompt surface. + fn gate(&self, tap: Option<&ResidualTap>, prompt_text: &str) -> Fire; + + /// Extract the payload: from the prompt surface (explicit path, + /// `rewrite = None`) or from a model-emitted rewrite (disguised path). + fn extract( + &self, + prompt_text: &str, + rewrite: Option<&str>, + ) -> Result; + + /// Exact external compute. + fn compute(&self, payload: &Self::Payload) -> Self::Answer; + + /// Forced-decode schedule for the answer (default drive path). + fn drive(&self, answer: &Self::Answer) -> DriveSchedule; + + /// Magnitude-prior check against the model's native answer, if one was + /// produced. A prior, not a judge. + fn verify(&self, answer: &Self::Answer, native: Option<&str>) -> Verdict; +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::{make_test_tokenizer, synthetic_tokenizer_json}; + + #[test] + fn fire_fired_and_labels() { + assert!(!Fire::No.fired()); + assert!(Fire::Tier0.fired()); + assert!(Fire::Tier1(0.93).fired()); + assert_eq!(Fire::No.label(), "no"); + assert_eq!(Fire::Tier0.label(), "tier0"); + assert_eq!(Fire::Tier1(0.5).label(), "tier1(0.500)"); + } + + #[test] + fn verdict_labels() { + assert_eq!(Verdict::Skipped.label(), "skipped"); + assert_eq!(Verdict::Consistent.label(), "consistent"); + assert_eq!( + Verdict::Suspect("digit count".into()).label(), + "suspect: digit count" + ); + } + + #[test] + fn extract_miss_displays_reason() { + let m = ExtractMiss("no expression".into()); + assert_eq!(m.to_string(), "extract miss: no expression"); + } + + #[test] + fn drive_schedule_tokenizes_with_the_session_tokenizer() { + // Null-pre-tokenizer fixture: "[N]" encodes to the single id N. + let tok = Tokenizer::from_bytes(synthetic_tokenizer_json(16).as_bytes()).expect("tok"); + let sched = DriveSchedule { + text: "[5]".to_string(), + }; + assert_eq!(sched.forced_ids(&tok), vec![5]); + } + + #[test] + fn drive_schedule_empty_text_yields_empty_schedule() { + let tok = make_test_tokenizer(8); + let sched = DriveSchedule { + text: String::new(), + }; + assert!(sched.forced_ids(&tok).is_empty()); + } +} diff --git a/docs/specs.md b/docs/specs.md index cf41156ab..5315436f8 100644 --- a/docs/specs.md +++ b/docs/specs.md @@ -17,3 +17,5 @@ All specs live with the crate they describe. | `larql confidence` (slot bits) | larql-cli | [crates/larql-cli/docs/slot-bits-spec.md](../crates/larql-cli/docs/slot-bits-spec.md) | | Trace format | larql-inference | [crates/larql-inference/docs/trace-format.md](../crates/larql-inference/docs/trace-format.md) | | Vindex-as-FFN lookup | larql-inference | [crates/larql-inference/docs/specs/vindex-as-ffn.md](../crates/larql-inference/docs/specs/vindex-as-ffn.md) | +| Arithmetic Virtual Expert (AVE) | larql-inference | [crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md](../crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md) | +| Math Expression Expert (MEE, AVE v0.2) | larql-inference | [crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md](../crates/larql-inference/docs/specs/virtual-experts/math-expression-expert.md) | From 9de8e0248235c9206a42c7b217f80480d9bd9814 Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Fri, 12 Jun 2026 23:44:50 +0100 Subject: [PATCH 2/8] working on performance improvements --- ROADMAP.md | 149 ++- ROADMAP_STATUS.md | 15 +- .../fr3_explicit_rewrite_gemma3-4b.json | 1 + .../fr3_template_ablation_gemma3-4b.json | 1 + ...arly_exit_decode_projection_gemma3-4b.json | 1 + bench/baselines/_c10_26b_larql_inproc.json | 23 + .../baselines/_c10_26b_larql_inproc_off.json | 23 + .../_c10_26b_larql_inproc_q4kattn.json | 23 + .../_c10_26b_larql_inproc_rerun.json | 23 + .../baselines/_c10_26b_larql_inproc_tmax.json | 23 + bench/baselines/_qres_full.json | 23 + bench/baselines/_qres_full_asm.json | 23 + bench/baselines/_qres_full_int8attn.json | 23 + bench/baselines/_qres_full_kvappend.json | 23 + bench/baselines/_qres_full_q6kasm.json | 23 + bench/baselines/_qres_full_serialcuts.json | 23 + bench/baselines/_qres_full_v3.json | 23 + bench/baselines/_qres_off.json | 23 + .../c10_gemma3-4b_cpu_reconciled.json | 2 +- .../c10_gemma4-26b-a4b_cpu_RUNBOOK.md | 19 + .../c10_gemma4-26b-a4b_cpu_reconciled.json | 139 +++ .../primary/bench/local_moe_runtime.rs | 26 + .../larql-compute/benches/q4k_q8k_matvec.rs | 291 ++++- crates/larql-compute/src/attention/decode.rs | 466 ++++++-- crates/larql-compute/src/cpu/ops/q4_common.rs | 222 +++- .../larql-compute/src/cpu/ops/q4k_q8k_dot.rs | 1030 ++++++++++++++++- .../src/kquant_forward/cached.rs | 103 +- crates/larql-compute/src/kv_dispatch/cpu.rs | 196 ++-- crates/larql-inference/examples/ave_demo.rs | 92 +- .../examples/ave_direct_layer_bisect.rs | 166 +++ .../examples/ave_direct_step_parity.rs | 113 ++ .../examples/ave_q4k_row_audit.rs | 186 +++ .../examples/fr3_explicit_rewrite.rs | 186 +++ .../examples/fr3_template_ablation.rs | 300 +++++ .../fr_early_exit_decode_projection.rs | 241 ++++ .../larql-inference/src/experts/arith/alu.rs | 84 ++ .../src/experts/arith/drive.rs | 99 +- .../larql-inference/src/experts/arith/mod.rs | 8 +- crates/larql-inference/src/ffn/local_moe.rs | 16 +- .../src/forward/predict/dense.rs | 85 ++ .../src/forward/predict/mod.rs | 6 +- .../src/vindex/kquant_forward/cached.rs | 273 ++++- .../src/vindex/kquant_forward/generation.rs | 192 +++ .../src/vindex/kquant_forward/hidden.rs | 52 +- .../src/vindex/kquant_forward/mod.rs | 9 +- crates/larql-inference/src/vindex/mod.rs | 8 +- .../tests/test_q4k_cached_parity.rs | 40 + crates/larql-kv/src/generation.rs | 48 +- .../src/executor/query/select/edges.rs | 125 +- .../src/executor/relation_resolver.rs | 24 +- crates/larql-lql/src/relations.rs | 71 ++ crates/larql-models/src/test_fixtures.rs | 38 + docs/audits/codebase-review-2026-06-12.md | 224 ++++ docs/diagnoses/fr3-explicit-rewrite.md | 79 ++ 54 files changed, 5428 insertions(+), 297 deletions(-) create mode 100644 bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json create mode 100644 bench/aim-validation/fr3_template_ablation_gemma3-4b.json create mode 100644 bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json create mode 100644 bench/baselines/_c10_26b_larql_inproc.json create mode 100644 bench/baselines/_c10_26b_larql_inproc_off.json create mode 100644 bench/baselines/_c10_26b_larql_inproc_q4kattn.json create mode 100644 bench/baselines/_c10_26b_larql_inproc_rerun.json create mode 100644 bench/baselines/_c10_26b_larql_inproc_tmax.json create mode 100644 bench/baselines/_qres_full.json create mode 100644 bench/baselines/_qres_full_asm.json create mode 100644 bench/baselines/_qres_full_int8attn.json create mode 100644 bench/baselines/_qres_full_kvappend.json create mode 100644 bench/baselines/_qres_full_q6kasm.json create mode 100644 bench/baselines/_qres_full_serialcuts.json create mode 100644 bench/baselines/_qres_full_v3.json create mode 100644 bench/baselines/_qres_off.json create mode 100644 bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json create mode 100644 crates/larql-inference/examples/ave_direct_layer_bisect.rs create mode 100644 crates/larql-inference/examples/ave_direct_step_parity.rs create mode 100644 crates/larql-inference/examples/ave_q4k_row_audit.rs create mode 100644 crates/larql-inference/examples/fr3_explicit_rewrite.rs create mode 100644 crates/larql-inference/examples/fr3_template_ablation.rs create mode 100644 crates/larql-inference/examples/fr_early_exit_decode_projection.rs create mode 100644 docs/audits/codebase-review-2026-06-12.md create mode 100644 docs/diagnoses/fr3-explicit-rewrite.md diff --git a/ROADMAP.md b/ROADMAP.md index b4da87d5a..ac3b9f52a 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -133,7 +133,7 @@ the bottleneck.** | Acceptance tier (from "P0 — CPU path to blazing") | Confidence | Driver | |---|---|---| | Short-term: Gemma 3 4B CPU within 10% of `llama.cpp -ngl 0` | **~95%** | Pure engineering | -| Medium-term: Gemma 4 26B-A4B at ≥10 tok/s on 64 GB consumer, no GPU | **~62%** (was ~80% → 70% → 62%, revised 2026-05-31) | MoE active-param math works; 26B fits 64 GB (16 GB vindex). **V2 CONFIRMED FP4** (+); **V1 FALSIFIED hash-routing** but it wasn't load-bearing here (decode is GQA/expert-dispatch-bound). The sharp fact: measured CPU MoE is **~4.4 tok/s, ~2.3× short of the 10 target**, the easy lever is gone, and the close-the-gap toolkit is thinner than it looks — **Q4K-direct already washed out at representative context** (~0% at ctx 907; #16), and the binding constraint there is GQA O(N²) (needs flash-attention-class work, not a quant swap). 62% not lower because 4.4 is a just-shipped *unoptimized* number (C2/CPU-path headroom unmined) and it's throughput not feasibility. **Gated on C10:** if 10 tok/s ≈ llama.cpp-on-26B-CPU this rises toward 70 ("match a mature engine"); if it's above llama.cpp, drops toward 55. | +| Medium-term: Gemma 4 26B-A4B at ≥10 tok/s on 64 GB consumer, no GPU | **~75%** (was ~80% → 70% → 62% → 70% → 75%, revised 2026-06-11: target MEASURED-PASSED on M3 Max) | MoE active-param math works; 26B fits 64 GB (16 GB vindex). **C10 gate resolved favorably (2026-06-10):** llama.cpp-on-26B-CPU = 32 tok/s, the ≥10 target is 3× below a mature engine's proof. The gap was **byte traffic, not kernel quality** (in-process streamed ~10 GB/token f32-resident vs llama.cpp's ~2.1 GB all-quantized, every leg bandwidth-saturated ~62–71 GB/s). **Quantized residency BUILT 2026-06-11** (`LARQL_Q4K_DIRECT_ATTN` + `LARQL_Q4K_LM_HEAD` + `LARQL_Q4K_DIRECT_FFN`, parity-gated, default-off): **7.6 → 13.9 tok/s, +`LARQL_Q4K_ASM` → 15.9** — **the ≥10 target is passed with 60% margin on M3 Max**. Remaining ~1.9× to llama.cpp is C12 kernel work (Q6_K asm, attention's f32-activation `q4k_matvec` → q4k_q8k+asm, instruction-count toward 28 GiB/s; fused gate_up asm landed 2026-06-11 at 2.00× isolated — 9.89→19.82 GiB/s). Held at 75 (not higher) because the 64 GB-consumer class (M-Pro ~150–200 GB/s, x86 DDR5 ~80–100 GB/s) has less bandwidth than the 128 GB M3 Max measured on — though at the measured ~65 GB/s saturation even a 100 GB/s-class box should clear 10. Artifact `bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json` §update_2026_06_11. | | Long-term: 100B-class MoE at ≥5 tok/s, no GPU | **~52%** (was ~60% → 55% → 52%, revised 2026-05-31) | Four-way push: 100B@FP4 (~25–50 GB) **fits RAM** so the disk bet is moot here — *removes* a risk the original 60% priced (+); FP4 confirmed (+); lost hash multiplier makes ≥5 tok/s harder (−); and the exploitable-structure prior took a **two-probe hit** — V1 (FFN-feature sparsity doesn't compound) *and* routing locality (expert selection doesn't concentrate, ~124/128 over a sequence) both say there's less cacheable structure than the "weights-as-database" thesis assumed (−, soft but broad). The disk-risk *removal* is what keeps it off 50; **50 is the honest alternative if you weight the two-probe pattern over it.** Caveat: the uniformity is partly Gemma's load-balancing aux loss (trained-in) → may be router-specific; the cross-MoE-router check would settle 50-vs-55. | | Ultimate: 671B-class via multi-machine grid | **~30%** (was ~40%, revised 2026-05-31) | Hit hardest. 671B even at FP4 (~335 GB) **exceeds single-machine RAM**, and the MoE-routing-locality finding (working set ≈ whole expert population, no cacheable hot subset) **closes the single-machine disk-resident escape hatch** — it would thrash. That leaves only the harder multi-machine grid (C9, demoted to P2 per ADR-019), where integration risk dominates. | | Dense frontier (if the field stays dense at 1T+) | **~10%** (was ~15%, revised 2026-05-31) | The hash-routing 5× its arithmetic leaned on is FALSIFIED (1 TB Q4 → ~10 s/token now, not 2). Needs attention-sparsification breakthroughs outside engineering control. | @@ -324,6 +324,7 @@ two-tier router). FR3 is the cleanest standalone win; FR4 is research-first. | FR1 | **Top-k fuzzy entity router + verifier.** Inference routes on top-1 cosine + a fixed 0.75 gate (`infer_patched.rs:162-163`), the brittle near-rank-1 path E11/E15 indict; `query_knn` top-k exists (`knn_store.rs:132`) but is unused. **MEASURED ✅ (2026-06-07, Gemma-3-4B N=150):** entity key real & answer-leak-free at L24-26 (L26 top1 0.89/top5 0.95, cross-rel 1.00 — beats E15's MLP under cosine-NN, no training); the live 0.75 gate fires **150/150** with **11% confident-wrong @L26 / 84% @L20**. **BUILT ✅ (2026-06-07):** `apply_knn_override_verified` — top-k + entity-in-prompt verify + abstain, resolved-layer-first (no hardcoded layer), opt-in `LARQL_KNN_VERIFY`, default off = byte-identical (14 legacy tests green). E2E on real Gemma-3-4B: legacy "Germany's capital city is"→SpainX (confident-wrong) → verified→GermanyX (fixed), Poland correct both (no regression). 5 unit tests, clippy clean. **LQL surface landed:** first-class `INFER … ROUTE VERIFY [FALLBACK] [TOPK n]` clause (`KnnRouteMode` threaded through `infer_patched`, default Legacy = byte-identical; env vars set the default when no clause). E2E no-env: `ROUTE VERIFY` → Germany fixed. [`docs/diagnoses/fr1-topk-fuzzy-router.md`](docs/diagnoses/fr1-topk-fuzzy-router.md) §"BUILD LANDED". | larql-vindex, larql-inference, larql-lql | **built ✅ (LQL clause + env)** | | FR2 | **Two-tier router: symbolic-primary → activation-fuzzy fallback** (E16 assembled). `entries_for_entity` exact lookup exists (`knn_store.rs:172`) but isn't sequenced into routing. **MEASURED ✅ (2026-06-07, Gemma-3-4B):** symbolic exact-match **0/10** aliases, activation fallback **10/10 top-1** @L24/L26 (Persia→Iran, …) — E16 reproduced. Caveat: famous-alias easy end (general = FR1's ~0.9 top-5); FR1 verifier bounds confident-wrong. **BUILT ✅ (2026-06-07):** `apply_knn_override_two_tier` (tier-1 FR1 verify → tier-2 activation alias fallback, opt-in `LARQL_KNN_VERIFY`+`LARQL_KNN_FALLBACK`, default off = byte-identical). E2E real Gemma-3-4B: "capital of Persia" → verify-only abstains (Tehran), two-tier recovers IranX (cos 0.97), no regression on named. 4 unit tests, clippy clean. Tier-2 is the fuzzy ~0.7-0.9 route (fires only when verify missed). **LQL:** `INFER … ROUTE VERIFY FALLBACK` (E2E no-env: Persia→IranX). [`docs/diagnoses/fr2-two-tier-router.md`](docs/diagnoses/fr2-two-tier-router.md). | larql-inference, larql-vindex, larql-lql | **built ✅ (LQL clause + env)** | | FR3 | **Relation as a clean semantic address.** Relation probe generalizes to unseen synonyms at ~1.000 (`the-mechanism/address.py`); `RelationClassifier` (`relations.rs`) is the foundation. **MEASURED ✅ (2026-06-07, Gemma-3-4B N=40):** synonym-gen **1.00 at every layer L6-L26** (train {capital,currency,language} → classify unseen {seat,money,tongue,…}, semantic not lexical; clean from **L6**, earlier than the video's L10); asymmetry stark — relation 1.00 early vs entity top-1 0.07-0.20 until L26. **BUILT ✅ (2026-06-07):** `RelationResolver` — trained residual softmax probe (not string/cosine; the near-rank-1 "proxy" trap avoided), model-agnostic probe layer (`round(0.3·num_layers)`), wired into `SELECT … FROM EDGES WHERE relation=…` as a semantic fallback (cached per vindex). E2E real Gemma-3-4B: `WHERE relation="seat"` → resolved to "capital". 2 unit tests, 717 lql green, clippy clean. [`docs/diagnoses/fr3-relation-address.md`](docs/diagnoses/fr3-relation-address.md) §"BUILD LANDED". | larql-lql, larql-vindex | **built ✅ (SELECT)** | +| FR3b | **Explicit relation rewrite — phrasing-robust fallback.** FR3's probe is synonym-robust but **phrasing-brittle**: 1.00 was synonym *words* in one template; on an unseen *phrasing* it's at **chance** at its L10 probe layer, and more training templates = **no-op** (reverted). **MEASURED ✅ (2026-06-08, Gemma-3-4B):** explicit few-shot `word→relation` classify (1 forward, `predict_kquant`) = **12/12** synonyms + unseen phrasings (head city→capital, legal tender→currency, mother tongue→language — exactly the probe's chance cases), but forced-choice confident-wrongs distractors **2/3** (weather/altitude→capital) → add a `none` escape → **0/3** (all abstain), 12/12 kept. The `none` escape = the verify/abstain (the project's recurring confident-wrong trap, cf. FR1 gate). **BUILT ✅ (2026-06-09): probe-first / explicit-classify-with-`none` fallback** in `resolve_relation_synonym` (FR2 two-tier shape) — Tier 1 probe (cheap, on confidence) → Tier 2 `resolve_relation_explicit` on abstain (few-shot+`none` frame lifted from the harness; one full forward via `InferenceWeights::predict_dense` = the INFER path's `predict_kquant`+lm_head, since `RelationResolver` only dequantises `0..=L10`; `none`-gated `match_relation_top1`). Opt-in `LARQL_FR3_EXPLICIT`, default off = byte-identical. **Real-vindex fix:** prod vindex has 2890 noisy labels; alphabetical top-64 dropped `language`/kept `food_animal` (mother-tongue failed, banana resolved — backwards) → `RelationClassifier::relation_labels_ranked` (by feature count) for Tier 2 candidates. **E2E real Gemma-3-4B:** `mother tongue`→`language` by explicit (0.97, probe abstained — the win); `weather`→abstain (none-escape); default off → no resolution. Probe stronger than the ablation implied (`head city`/`legal tender`/`altitude` ride Tier 1). 4 new tests, 726 lql lib green, clippy clean. Harnesses `examples/fr3_{template_ablation,explicit_rewrite}.rs`; [`docs/diagnoses/fr3-explicit-rewrite.md`](docs/diagnoses/fr3-explicit-rewrite.md) §"BUILD LANDED". | larql-lql, larql-inference | **built ✅ (SELECT fallback + env)** | | FR4 | **Operation-class dispatch boundary** (E17 compute ladder). Linear-aggregate ops (COUNT/THRESHOLD/MAJORITY) ride the read free; joint-bit (PARITY) walls — **a property of the operation, not the packing**. E17's own ledger demotes the E4 bridge to a **conjecture** (G/O/T never ran). Measure first = run the real external ops (distance/argmin/optimization) on the E17 rig to close that conjecture, then map LQL aggregate verbs. **MEASURED ✅ (2026-06-07, conjecture REFINED):** ran the real external ops on the E17 rig — **DIST (geometric) + ARGMIN (selection) RIDE free at L1**, only **PARTITION (global optimization) walls like parity**. Parity was NOT a fair stand-in for "external"; E4 mis-files geometric/selection (they're internal). Real line = factors-through-reads vs global-joint. Dispatch consequence: keep count/filter/aggregate/threshold/majority/distance/argmin internal, route global-optimization+parity external. `fleet/E17_compute_ladder/E17_EXTERNAL_VERDICT.md`. Build (far): in-band eval + external dispatch per the re-cut criterion. | larql-lql, larql-router, larql-vindex | **measured ✅ (conjecture refined)** | --- @@ -408,6 +409,7 @@ item, not just a competitive-parity item. - **larql-compute-metal coverage push closed** (2026-05-16): post-ADR-019 split, the Metal backend now lives in its own crate with **97.28% line coverage, 59/59 files at the 90% per-file floor, zero debt baselines**. Up from 75.69% (50/59 files clearing 90%, 9 debt baselines) at session start. Key techniques: (1) `MetalBackend::with_options` to bypass the env-snapshot caching that silently no-op'd flag-toggling tests on `decode_one_token_with_env`, opening the `fused_attn` / `fused_qk_norm_rope` / `fused_kv_append_attend` / `fused_post_attn_norm` branches in `decode/encode_attn.rs` (68.78% → 99.53%); (2) per-format prefill split-phase tests (Q4_K / Q4_KF / Q4_0 × gated / non-gated, `LARQL_PROFILE_SPLIT=1`) for `decode/encode_ffn.rs` (61.43% → 92.86%); (3) direct calls to the public `run_experts_prestaged_metal` / `run_experts_preselected_metal` / `run_dense_ffn_q4k` paths plus a real-MoE-layer `decode_token_q4k_moe` end-to-end test for `moe_dispatch.rs` (38.91% → 95.25%); (4) `decode_attention_layer` integration tests covering V-norm, post-norms, and `wo.format` Q4_KF/Q6_K branches for `decode_hybrid.rs` (0% baseline → 94.41%); (5) dead-code deletion of `MetalBackend::full_pipeline` (108 lines, no callers, doc said "old benchmark entry point") to clear `pipeline.rs` to 100%; (6) `Config::from_args` + JSON helper + Smoke-profile end-to-end coverage for `diag/shader_bench.rs` (4.25% → 99.36%) and `diag/kernel_profile.rs` (0% → 97.12%) — the diag scripts now smoke-run real GPU dispatches in unit tests; (7) a dedicated `tests/test_decode_diag.rs` integration binary (fresh process, fresh `CALL_COUNT`) that hits the previously-believed-structural cap on `decode/diag.rs` (85.23% → 93.75%). Coverage-policy file now an empty-baseline gate: any regression on any file breaks CI. - **larql-router self-healing + HTTP/3 + hedged-dispatch phase** (2026-05-16): MoE expert routing (ADR-0018, per-(layer, expert-range) replication keys), Prometheus `/metrics` (ADR-0017), Phase 4 HTTP/3 shard transport behind `--http3-shards` / `--http3-port` (ADR-0019, h3 0.0.8 + h3-quinn 0.0.10 + h3-axum 0.2), hot-shard hysteresis (ADR-0014 amendment, `--hot-shard-demote-ratio` default 0.8), backpressure tier (ADR-0020 — `--saturation-ceiling N` filter in `route()` / `route_expert()`, dispatcher distinguishes 503 saturation from 400 no-owner via `has_owners_for()`, emits `Retry-After: 0.5`, bumps `larql_router_route_saturation_total`), long-running chaos test (`tests/test_grid_chaos.rs`, 5,000 random ticks × 2 variants, asserts ledger consistency + coverage floor + no `route()` panic), hedged dispatch (ADR-0021 — opt-in via `--hedge-after-ms M`, new `route_with_rank` / `route_expert_with_rank` grid APIs, `hedged_post_json` racing helper, dense + MoE fan-outs wired, `route_hedge_fires_total` / `route_hedge_wins_total` counters; supersedes the original "speculative next-layer prefetch" P1 framing — an audit falsified that framing since the router sees one batched call per token against a single input residual, so hedge-the-slow-primary is the legitimate router-layer optimisation). Concurrent-route bench (`bench_route_concurrent`, 2026-05-16) surfaced lock-contention plateau: pre-swap 1 = 5.6 → 4 = 8.7 → 8 = **4.0** → 16 = 3.6 Melem/s (8 workers *worse* than 1 — pathological). **Lock primitive swap** (2026-05-16): `tokio::sync::RwLock` → `parking_lot::RwLock` across larql-router and tests. Every grid critical section is short and sync (no `await` held under the lock), so synchronous is semantically correct and the compiler enforces it (parking_lot guards are `!Send`). Post-swap: 1 = 6.4 / 4 = 11.1 / 8 = 7.2 / 16 = 6.1 Melem/s — **+14% / +28% / +80% / +70%**, pathological 8-worker collapse eliminated. 220 tests still pass. Saturation-filter cost on the happy path: ~108 ns vs ~113 ns baseline (in noise); all-saturated short-circuit ~57 ns. Router test surface: 169 lib + 50 integration = **219 tests** (220 with `--features http3`). Coverage **~93%**. Five examples (`embed_grid`, `static_shards_server`, `admin_client`, `fanout_dispatch`, `saturation_backpressure`); criterion benches cover dense + MoE + saturation + concurrent-route. Multi-host deployment runbook at [`crates/larql-router/docs/multi-host-demo.md`](crates/larql-router/docs/multi-host-demo.md). Server-side `GET /v1/shard/{model}/{start}-{end}` audited + documented in [`crates/larql-server/docs/router-spec.md`](crates/larql-server/docs/router-spec.md) §4. ADRs: [0017](docs/adr/0017-prometheus-metrics.md), [0018](docs/adr/0018-moe-expert-routing.md), [0019](docs/adr/0019-http3-shard-transport.md), [0020](docs/adr/0020-route-backpressure-tier.md), [0021](docs/adr/0021-hedged-dispatch.md). - **Whole-codebase review** (2026-05-28): multi-agent deep review (17 crates, ~415K LOC; per-crate reader + adversarial verification). Clippy clean (2 trivial nits); exposure concentrated and thematic. ~7 verified high/medium items now tracked under "Codebase hardening (review 2026-05-28)" below and mirrored into crate-local roadmaps. Top two confirmed by hand: infallible `FfnBackend::forward` aborts serving on remote-shard blips; Metal KV append has no `pos **Prefer a parameter on the existing path over a parallel path.** A new +> code path needs the same justification as a new crate: a reason the +> existing one cannot be parameterised. Opt-in experiment paths are fine, +> but they get a removal-or-promotion condition when added, not after. + +Themes, in leverage order (concrete first steps live in hardening items +7–10 above; this section tracks the policy-level work): + +1. **One forward-pass spine** — the five parallel layer-step loops in + `larql-inference/vindex/kquant_forward/` are the canonical instance. + ADR first (what is the shared layer-step contract: sentinels, MoE + detection, KV dispatch, capture hooks), then fold + `hidden`/`prefill`/`decode_step`/`decode_step_direct`/remote-FFN onto + it. Sequenced behind the C10 residency arc (same hot files). The + padded-down twin extraction (hardening item 8) is the cheap pilot for + the same move one level down. [larql-inference, larql-compute] +2. **Flags → config** — beyond the registry (hardening item 7): any + `LARQL_*` flag that changes numerics and has survived its experiment + (e.g. the Q4K residency trio once C10 lands) gets promoted to real + config/CLI surface or deleted; env vars stay for diagnostics and + short-lived experiments only. Uniform parsing through the + `options.rs` taxonomy so `=true` vs `=1` can never again silently + change what a bench measured. [workspace] +3. **Experiment-path lifecycle** — opt-in paths that lost their A/B keep + accumulating (ADR-017 covers shaders; nothing covers CPU/env paths). + Extend the ADR-017 rule workspace-wide: every opt-in path carries a + retention rationale + revival story, and reviews may delete any that + lack one. Current deletions/decisions owed: 4 unreferenced Metal shader + modules, `model-compute` (no second consumer), `larql-experts` + integration status, `test_utils.rs` out of larql-inference's public + API. [workspace] +4. **API surface honesty** — `larql-inference/vindex` re-exports ~28 + implementation-named functions (`predict_kquant_*` variants); external + callers choose forward paths by fuzzy naming. After (1), expose one + facade that dispatches internally; deprecate the variants. Pairs with + the Engine/StatePolicy framing already proposed. [larql-inference] +5. **Coverage debt** — per-file ≥90% floor policy vs reality: + `larql-inference` 70.7%, `larql-cli` 12.0% (snapshot 2026-05-16). + Raise toward the floor opportunistically as files are touched by (1) + and (4) rather than as a standalone sweep; new/split files land at + ≥90% (existing policy). [larql-inference, larql-cli] +6. **Scratch-artifact hygiene** — underscore-prefixed bench baselines + (`bench/baselines/_*.json`) are scratch by convention but accumulate + untracked/half-tracked; adopt the rule that `_`-prefixed artifacts are + gitignored, and reconciled baselines get real names + a RUNBOOK line. + [bench] + --- ## Demo narrative @@ -894,7 +1039,7 @@ achievability table + `docs/diagnoses/`.)** | C7 | KV compression as **default** for long context (Apollo / MarkovRS / UnlimitedContext / TurboQuant) | larql-inference | engines reachable on `run`/`walk` (CPU) via `--engine` / `LARQL_KV_ENGINE`; default still `standard` (production K/V cache); GPU performance on opt-in engines requires AsyncComputeBackend (see U-series below) | Unification spec at [`kv-engine-unification.md`](crates/larql-inference/docs/specs/kv-engine-unification.md) — all 7 steps landed. MarkovRS / UnlimitedContext / TurboQuant opt-in via `--engine` (CPU-correct, Metal works via CPU-fallback delegation). Apollo bench-only. Promoting any of these as default for long context requires `AsyncComputeBackend` Step A6 (engine-specific Metal shaders) to land — see U5 below. Server engine wiring also blocked on AsyncComputeBackend (U7); without it the server would silently downgrade Metal decode to CPU. | | C8 | BR4 (Boundary refs Phase 4 — bounded KV eviction + durability-first capture) | larql-server + larql-inference | not started | See § "P1 — Boundary refs and cold-context storage" below. The CPU track makes BR4 load-bearing because long-context CPU inference can't keep raw KV in RAM. | | C9 | Distributed-load-balancing for "model spans 4 consumer machines" | larql-router + larql-server | shipped (grid + rebalancer) | **DEMOTED to P2 per ADR-019 (2026-05-09)** — substantial production-engineering with no current experiment requiring multi-machine. Single-shard grid (already shipped) sufficient for substrate. Re-promote if a specific experiment needs multi-machine. | -| C10 | CPU bench harness — `larql bench --cpu` with per-stage breakdown matched against `llama.cpp -ngl 0` | larql-cli + bench/ | **DISCREPANCY RESOLVED 2026-06-02 — no regression; true gap ~1.6–1.8×.** The 1.50× (05-16) vs 1.93× (05-31) split was **two stacked measurement confounds**, not a real change: (1) **larql path mismatch** — 27.6 was the `StandardEngine` path, 23.6 the legacy `larql bench --cpu` (`predict_kquant_decode_step`) path; a stable ~12% delta (26.4 vs 23.5 today), so comparing one date's StandardEngine against the other's legacy path manufactured a phantom "regression"; (2) **llama.cpp harness artifact** — the 45.5 was an unwarmed/short-n ollama `num_gpu=0` fluke; warmed + n=128 it converges to **42.8–43.0 = llama-bench's 42.99** (both harnesses, both dates agree at ~43). Reconciled like-for-like (M3 Max, t=8, warm): **larql 23.5 legacy / 26.4 StandardEngine vs llama.cpp 43.0 → 1.6–1.8×.** Gap is C12 (both attn AND FFN already use the int8 Q8_K SDOT kernel via `attention_decode_step_native`). **Free wins landed (2026-06-02):** `larql bench --cpu` now also reports the production StandardEngine row; new `--ollama-cpu` forces `num_gpu=0`+`num_thread` so `--ollama` is a true CPU baseline (was silently Metal-GPU). Reconciled artifact `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. Still owes the **26B-A4B baseline** (needs a 26B GGUF) — that's what pins the medium-term tier. | CPU-track baseline-credibility threshold can't be enforced without this. First acceptance test: Gemma 3 4B Q4_K on M3 Max CPU vs quant-matched `llama.cpp -ngl 0`. Then Llama 2 7B + Mistral 7B for cross-arch CPU + the 26B-A4B MoE baseline. Major improvement 2026-05-15→05-16 (2.78× → 1.50×) — see `bench/baselines/cpu/COMPARISON.md` and `DIAGNOSIS-2026-05-16-thread-scaling.md`; reconciliation `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. | +| C10 | CPU bench harness — `larql bench --cpu` with per-stage breakdown matched against `llama.cpp -ngl 0` | larql-cli + bench/ | **DISCREPANCY RESOLVED 2026-06-02 — no regression; true gap ~1.6–1.8×.** The 1.50× (05-16) vs 1.93× (05-31) split was **two stacked measurement confounds**, not a real change: (1) **larql path mismatch** — 27.6 was the `StandardEngine` path, 23.6 the legacy `larql bench --cpu` (`predict_kquant_decode_step`) path; a stable ~12% delta (26.4 vs 23.5 today), so comparing one date's StandardEngine against the other's legacy path manufactured a phantom "regression"; (2) **llama.cpp harness artifact** — the 45.5 was an unwarmed/short-n ollama `num_gpu=0` fluke; warmed + n=128 it converges to **42.8–43.0 = llama-bench's 42.99** (both harnesses, both dates agree at ~43). Reconciled like-for-like (M3 Max, t=8, warm): **larql 23.5 legacy / 26.4 StandardEngine vs llama.cpp 43.0 → 1.6–1.8×.** Gap is C12 (both attn AND FFN already use the int8 Q8_K SDOT kernel via `attention_decode_step_native`). **Free wins landed (2026-06-02):** `larql bench --cpu` now also reports the production StandardEngine row; new `--ollama-cpu` forces `num_gpu=0`+`num_thread` so `--ollama` is a true CPU baseline (was silently Metal-GPU). Reconciled artifact `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. **26B-A4B baseline LANDED 2026-06-10** (`c10_gemma4-26b-a4b_cpu_reconciled.json`): llama.cpp **32.1** vs larql in-process **7.1** default / **9.7** with `LARQL_Q4K_DIRECT_ATTN=1` / loopback 7.3 (t=8, warm, n=128, drift-checked). The 26B gap (4.5×) is **f32-residency byte traffic** (attn 4.15 GB + dense slab 2.14 GB + lm_head 2.95 GB per token vs llama.cpp ~2.1 GB all-quantized; every leg bandwidth-saturated ~62–71 GB/s), NOT the C12 kernel (experts already int8 SDOT, ~8% of bytes). Medium-term tier 62%→70% per the gate rule. Method addition: **pmset AC check + cross-engine drift bracket are now mandatory** — the first session was invalidated by a silent battery drain (llama.cpp itself collapsed 34→1 tok/s at 31% battery; far beyond the 1.5–3× thermal class). | CPU-track baseline-credibility threshold can't be enforced without this. First acceptance test: Gemma 3 4B Q4_K on M3 Max CPU vs quant-matched `llama.cpp -ngl 0`. Then Llama 2 7B + Mistral 7B for cross-arch CPU + the 26B-A4B MoE baseline. Major improvement 2026-05-15→05-16 (2.78× → 1.50×) — see `bench/baselines/cpu/COMPARISON.md` and `DIAGNOSIS-2026-05-16-thread-scaling.md`; reconciliation `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. | | C11 | Architecture rule enforcement — CI check for "no GPU-only paths in core" | scripts/ + crate boundaries | not started | Static check: anything in `larql-inference` core (not `metal/`, not `cpu/`) must compile and pass tests with Metal feature off. Prevents the dual-track from drifting into Metal-locked code. | | C12 | Q4K decode kernel — hand-asm aarch64 to close the 1.50× gap to llama.cpp | larql-compute | **v1 asm landed opt-in 2026-06-02 (`LARQL_Q4K_ASM=1`); roofline reframed the work.** Two 2026-06-02 results: (a) **Roofline microbench** (`benches/q4k_q8k_matvec.rs`) shows the kernel is **compute/issue-bound, NOT DRAM-bandwidth-bound** — scalar 9.3 vs NEON 17.7 GiB/s on identical data, size-invariant — which **overturns the `DIAGNOSIS-2026-05-16` "memory-system-level" conclusion** and confirms hand-asm scheduling is a real lever (17.7 GiB/s ↔ ~33 cyc/super-block, exactly as specced). (b) **`q4k_q8k_matvec_asm`** (whole super-block dot in one `asm!` block, 8 scales as vector lanes killing the 8 scalar `ldrb`) — **bit-exact** (`q8k_matvec_asm_matches_scalar_bit_exact`), **+3.7–4.9% isolated**, ~+1–2% e2e (diluted: opt-in covers `matvec_into` callers — attention Q/K/V/O + `down` — but NOT the fused `gate_up`). **Finding: latency-hiding has low headroom** — a 4-accumulator variant showed no reliable gain (the inlined row loop lets the OoO core already overlap super-blocks), so **the two-super-block interleave is deprioritized**; the real lever to reach ~28 GiB/s is **instruction-count reduction** (perf-counter-guided, llama.cpp-style vectorized scale path) + **asm-ifying `gate_up`** (lifts the e2e ceiling). See spec §"2026-06-02 roofline measurement". | Per-core gap is **1.73× constant across thread counts** (5.7 vs 9.88 tok/s single-threaded on M3 Max). Same algorithm (Q4K × Q8K with NEON SDOT), same `vdotq_s32` instructions — llama.cpp uses hand-written inline aarch64 asm with two-super-block interleaving + explicit prefetch hints, we use Rust intrinsics lowered by LLVM. Effective bandwidth: ~63 GB/s vs ~95 GB/s. **Per-stage profile (`LARQL_INSTRUMENT_UNLIMITED=1` on Gemma 3 4B 8-thread, 2026-05-16): FFN 26.0 ms (74%) + Attention 9.3-11.0 ms (26%, grows with ctx) + Embed ~0 ms = 35-37 ms/step.** FFN matvec on gate/up/down (4608 × 9216) is the dominant target; attention matvec is the same kernel on smaller matrices. The 38 tok/s asymptote (FFN-alone) sets the floor any engine can reach on the current kernel — Standard and UnlimitedContext both hit 26.6 tok/s on Gemma 3 4B Q4K CPU (8-thread, 40-token prompt, 64 decode tokens) because both route through the same `attention_decode_step_native` + `ffn_decode_step_native` hot paths. Phases: (1) hand-asm Q4K matvec on the FFN tile shapes (gate/up/down) — closes ~95% of the gap, 1-2 weeks; (2) pre-formatted block layout — 1.1-1.2× on top, 3-5 days; (3) Q6K kernel for `ffn_down` — 1.05×, 2-3 days; (4) reduce rayon launch overhead — 1.04×, 2-3 days. Acceptance: ≥9.5 tok/s single-core, ≥39 tok/s 8-thread on Gemma 3 4B Q4K. Spec: [`crates/larql-compute/docs/q4k-decode-kernel.md`](crates/larql-compute/docs/q4k-decode-kernel.md). Per-stage measurement protocol: see "C12 per-stage measurement" below. | diff --git a/ROADMAP_STATUS.md b/ROADMAP_STATUS.md index a61af598f..439676832 100644 --- a/ROADMAP_STATUS.md +++ b/ROADMAP_STATUS.md @@ -4,10 +4,20 @@ Canonical rollup for the next execution slice. Keep the detailed design in `ROADMAP.md` and crate-local roadmaps; use this file to answer "what is active now?" without rereading every crate document. -Last updated: 2026-06-07 +Last updated: 2026-06-10 ## Recently shipped (delta since last update) +- **KV append-in-place — 23.5→27.9 tok/s short-ctx, 16.0→24.8 long-ctx; gap to llama.cpp ~1.15×/1.29× (2026-06-12).** The old `CpuKvHandle` attention step did a full-cache **clone** + `zeros` + four assigns per layer per step (~190 MB/token of churn at ctx 130, growing linearly). Rewritten: growable row-major Vec buffers (amortised O(kv_dim) `append_row`), q4k-direct step split into **project / append-in-place / attend-over-views** (no concat, no clone; legacy owned-concat wrapper kept for engine walk-loops), f32 fallback moves state instead of cloning, `append_kv`/`clip_kv` lose their O(ctx) rebuilds too, `gqa_attention_decode_step` generic over views. **E2E:** n=128 **27.9 tok/s** (35.8 ms; bracket 33.6/31.0 → ~1.15×); n=512 **24.8** vs llama.cpp tg512 32.0 (1.29× — remaining ctx-growth is GQA compute, real work). Text sanity: full flag stack on 4B → "The capital of France is **Paris**." **Cumulative arc: 7.6 → 27.9 tok/s (3.7×); gap 4.5× → ~1.15×.** 692+1213+755 tests green incl. kv-engine parity oracles. Side-finding: `ave_direct_step_parity` garbage CONFIRMED PRE-EXISTING at clean 6659fd6c (worktree repro) — legacy direct-path twin, not this work; AVE session owns it. Artifact §"update_2026_06_12_kv_append_in_place". +- **DRAM-idle hunt: sinks NAMED by live-process sampling; first three serial cuts land 21.7→23.5 tok/s (2026-06-12).** MT shape sweep first **acquitted** the suspects (rayon-chunked matvecs sustain 72–113 GiB/s at every production shape; experts-granularity arm 104) → `/usr/bin/sample` of a live decode attributed the real gap: **~75% of thread-samples parked in wait states** — workers sleep while serial main-thread sections run. Named: attention non-projection ~20% of wall (KV-concat `zeros`+`bzero`+`memmove`, per-head `cblas_sgemv` GQA + scalar libm-`exp` softmax, norms/RoPE), `cpu_moe_forward` glue ~12%, dense-slab scalar gelu + serial requantise ~8%, **lm_head argmax epilogue 4.6%** (serial softmax+top-k over 262K logits), `madvise` churn ~3%. **Built (parity-safe):** `q4_lm_head_argmax` (argmax over raw logits — scaling/softcap/temp are monotone → identical selection; no softmax, no 3 MB temporaries), GQA rayon-parallel over heads (math unchanged), dense-slab activation rayon-chunked (same libm per element). **E2E n=128: 23.5 tok/s under a degraded bracket (27.1)** — same-state gap 1.15×, vs clean-bracket llama.cpp 33 ≈ 1.35×. **n=512: 16.0 tok/s — the O(ctx) KV-concat realloc+copy (~20 ms/token by step 500) is the dominant remaining structural item** (engine KvHandle append-in-place, contract change). Then: moe glue, buffer reuse vs madvise. 692+1212+755 tests green; clippy clean (3 pre-existing warnings in `experts/arith`, not this work). Artifact §"update_2026_06_12_serial_cuts". +- **C12 v3 kernel + the roofline-crossover finding — kernel work on this path is DONE; the remaining 1.55× is effective-bandwidth utilisation, not kernel quality (2026-06-12).** Measurement-first paid twice. **(1) Decomposition bench** (`q4k_sb_decomposition`): the v1 asm block is 16.3 cyc/SB but the per-super-block Rust glue (scale unpack + i32 array + scalar sum2 + 2× software `f16_to_f32` + epilogue) costs **19.2 cyc/SB with only ~3.6 OoO-hidden** — the glue was the fat, not the asm schedule. **(2) Built v2→v3:** whole super-block in one asm block (TBL-vectorised 6-bit scale/min unpack, `smull/smlal2` sum2, hardware `fcvt`, exact-order epilogue), then the super-block loop moved inside the asm with constants hoisted (the 144B header+quants layout gives a zero-arithmetic walking pointer). **Single-thread: 18.3 → 23.8 GiB/s (+30%), bit-exact vs scalar first run** (all five asm kernels this arc were — the wrapping-i32 + exact-conversion discipline works). `LARQL_Q4K_ASM` now routes to v3. **(3) E2E 26B: FLAT (21.3 ≈ 21.7, bracket 33.4) — the crossover measurement:** at 8 threads the q4k streams are now **memory-bandwidth-bound** (~47 GB/s effective vs llama.cpp's ~70 on the same machine; v3 could demand ~190). **The spec's 28 GiB/s single-kernel target is no longer the path to parity**; the next C12 round must target effective-bandwidth sinks — rayon fork-join gaps (~180+ parallel sections/token), compute-only phases with idle DRAM (GQA/RoPE/quantize ~7 ms), expert task granularity — measurement-first. 692 compute tests green, clippy clean. Artifact §"update_2026_06_11_v3_kernel_and_roofline_crossover". +- **Q6_K hand-asm landed — +12.4% isolated, e2e-neutral on 26B (2026-06-11, third drop of the day).** `q6k_q8k_matvec_asm` under the same `LARQL_Q4K_ASM` gate: one `tbl` replaces the per-group 4×scalar-multiply hi2 rebuild, vector-lane scale accumulate replaces 16 horizontal `addv`/super-block. Bit-exact vs scalar first try. Isolated 9.97→11.21 GiB/s; 26B e2e 21.6 ≈ 21.7 (Q6_K is only ~13% of full-stack bytes ≈ 3 ms — the +12% is ~0.4 ms, in noise). Keep: free, exact, and the 4B dense `down` is Q6_K (unmeasured there). **Remaining-lever ranking for the 1.5×:** (1) q4k_q8k instruction-count 18.4→28 GiB/s — touches ~35 of the 46 ms; gated on Instruments port attribution per the spec's own rule; (2) attention non-projection overhead ~7 ms (RoPE/GQA f32, KV-concat copies the cache per step — grows with ctx); (3) per-super-block Rust glue (scale unpack + sum2) — possibly OoO-hidden, measure first. 690 compute tests, clippy clean. +- **Int8 attention projections — 16.7 → 21.7 tok/s; gap to llama.cpp now 1.53× (2026-06-11, same day as residency).** The stage split (newly instrumented: `LARQL_DECODE_STAGES` now records *local* expert time — was remote-only — and the bench MoE row prints the split) showed attention at **~54% of decode while moving only ~26% of the bytes**: the Q4K-direct attention step still ran the f32-activation `q4k_matvec`/`q6k_matvec` while everything else had moved to int8 SDOT. **`LARQL_Q4K_ATTN_INT8`** routes Q/K/V/O through `q4k_q8k`/`q6k_q8k_matvec_into` (asm-aware, rayon-chunked; Q/K/V share one Q8_K quantisation of `h_norm`, O quantises the GQA context; per-projection f32 fallback) — the same numerics the dense-model production attention has always used. **Order-swapped A/B (M3 Max, AC, bracket 33.2):** attn 32.7→19.3 ms, decode **16.7→21.7 tok/s**. Cumulative 26B arc: **7.6 → 13.9 → 16.7 → 21.7 vs llama.cpp 33.2 — 4.5×→1.53×**, now *inside* the C12 kernel-quality band: the byte-residency gap is closed. Remaining levers (next C12 rounds): attn still ~30 GB/s effective vs experts ~50 (RoPE/GQA/KV-concat f32; **Q6_K has no asm**), instruction-count toward 28 GiB/s, KV-concat copies the cache every step. Parity: chunking bit-exact (Q4_K+Q6_K); int8-vs-f32-act ≤2% scale-relative (the activation quant production dense attention already carries); default off byte-identical. 690 compute tests green, clippy clean. Artifact §"update_2026_06_11_int8_attention". +- **26B CPU MoE quantized residency BUILT + MEASURED — ≥10 tok/s target PASSED: 7.6 → 13.9 → 15.9 tok/s; gap to llama.cpp 4.5×→~1.9× (2026-06-11).** Built the three residency levers the C10 byte-ledger analysis named, all opt-in/default-off byte-identical, parity-first: **`LARQL_Q4K_LM_HEAD`** (resident decode loop routes lm_head through the vindex Q4_K view via `logits_to_predictions_q4_lm_head` — synthesized automatically for tied-embedding models; `larql-kv/generation.rs` `argmax_next_token_resident`), **`LARQL_Q4K_DIRECT_FFN`** (hybrid-MoE dense slab via `ffn_decode_step_native` inside new `moe_ffn_block_cpu_with_index`, threaded from `LocalMoeFfn{index}`; decode-only, prefill stays f32 gemm per the #16 falsification; **padded-down handling** added to both cached.rs twins — the 26B stores intermediate 2112 as 2304-col Q6_K rows, activation zero-pad is exact, parity test bit-class ≤1e-5 + ragged-bytes rejection), and **C12 v2: fused gate+up hand-asm** (`q4k_q8k_gate_up_asm` under the existing `LARQL_Q4K_ASM` — shared activation loads + two independent SDOT chains; **bit-exact first try**; microbench **9.89→19.82 GiB/s, 2.00×** — the fused neon form was the worst kernel in the file (horizontal-sum-bound), now the best, exceeding single-matrix asm 18.4). **E2E (M3 Max t=8 n=128 warm, llama-bench brackets 32.54→28.80 = ~11% drift, flag legs understated if anything):** 26B default **7.6** → full residency **13.9** → +ASM **15.9 tok/s**. **Honest negative:** 4B legacy bench is e2e-neutral for the fused asm (23.2 vs 22.9) — local direct-decode FFN deliberately uses two separate rayon-parallel matvecs, so `gate_up_into`'s production homes are the remote expert server (`q8k_wire`) + walk-FFN; the 26B ASM gain (13.9→15.9) comes from the `matvec_into` callers (experts, lm_head, dense gate/up). **Remaining ~1.9× to llama.cpp:** Q6_K asm (down, no asm today), attention q4k-direct still on the f32-activation `q4k_matvec` (route through q4k_q8k+asm), instruction-count reduction toward 28 GiB/s, optional rayon-chunked fused gate_up for local decode. 1148+755+686 tests green, clippy clean. Artifact §"update_2026_06_11" in [`bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json`](bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json). +- **C10 26B-A4B CPU baseline LANDED — medium-term tier 62%→70%; the MoE gap is f32 residency, not the kernel (2026-06-10).** The owed number, measured under the full C10 discipline (M3 Max, t=8, warm, n=128, llama-bench drift bracket 32.08→30.45): **llama.cpp `-ngl 0` 32.1 ± 1.4 tok/s** vs larql in-process **7.1** (default) / **9.7 with `LARQL_Q4K_DIRECT_ATTN=1`** / loopback-shard **7.3**. **Mechanism, fully quantitative:** the in-process path streams **~10 GB/token** — attention Q/K/V/O (1.04B params, **f32 4.15 GB**) + dense FFN slab (**f32 2.14 GB**) + lm_head (**f32 2.95 GB**) + experts (Q4K 0.80 GB) — vs llama.cpp's ~2.1 GB all-quantized; all legs run bandwidth-saturated at ~62–71 GB/s, so the 4.8× byte ratio explains the whole 4.5× gap. The expert kernel (C12's turf) is NOT the bottleneck (~8% of bytes, already int8 Q8_K SDOT). **#16 reframed:** the "~0% at rep ctx" Q4K-direct-attention verdict was measured on the *network-bound loopback* path; in-process the flag is **1.36× decode + 10× TTFT** (prefill 6587→652 ms, skips `ensure_attn_tensors_dequantised`). **Path to ≥10 tok/s = quantized residency** (dense slab + lm_head q4k → ~2.0–2.3 GB/token → ~15–20 tok/s at the C12 1.6–1.8× residual). Tier gate favorable branch (target 10 ≪ llama.cpp 32) → **70%**. **Runbook mysteries resolved:** in-process-1.8-vs-loopback-4.4 was two artifacts (cold short-n smoke; warm AC = 7.1 vs 7.3, a wash — serialization ≈ second-process core gain); gemma4 llama.cpp CPU is NOT slow upstream (32 tok/s). **Measurement lesson (now method):** the first session (06-09 night) was invalidated by a silent battery drain to 31% — llama.cpp itself collapsed 34→1.05 tok/s (~30×, beyond the 1.5–3× thermal class), with Spotlight stacking on top after 30+ GB of model I/O; `pmset` AC check + cross-engine drift bracket are now mandatory. **Code gaps found:** `larql bench --moe-shards` still calls pre-C1 `generate_with_remote_moe` (fails on CPU with the #146 signature; `larql run --moe-shards --engine standard` is the working loopback instrument); `serve --ffn-only` doesn't serve expert endpoints (use `--experts 0-127`); `LARQL_DECODE_STAGES` doesn't record local-expert time. Artifact [`bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json`](bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json). +- **FR3b — explicit relation-rewrite fallback BUILT + validated e2e (2026-06-09).** Wired the measured two-tier resolver into `SELECT … FROM EDGES WHERE relation=…` (`executor/query/select/edges.rs`): Tier 1 = the cached residual probe (unchanged); on probe abstain → Tier 2 `resolve_relation_explicit` — the few-shot `word→relation`+`music→none` frame (lifted verbatim from `fr3_explicit_rewrite.rs`), **one full forward via `InferenceWeights::predict_dense`** (the INFER path = `predict_kquant` w/ lm_head; the resolver's partial `0..=L10` dequant can't run lm_head — the called-out wrinkle), `none`-gated accept (`match_relation_top1`). **Opt-in `LARQL_FR3_EXPLICIT`, default off = byte-identical** (726 lql lib tests green incl. 4 new, clippy clean). **Real-vindex refinement:** production `gemma3-4b-q4k-v2.vindex` has **2890 noisy labels**; `relation_labels()` is alphabetical and both tiers cap at 64, so an alphabetical top-64 *drops* `language` while *keeping* a rare `food_animal` → "mother tongue" failed, "banana" resolved (backwards). Fixed with `RelationClassifier::relation_labels_ranked(top_n)` (by feature count) for Tier 2's candidates — keeps the meaningful relations + a short prompt. **E2E (real Gemma-3-4B, `LARQL_FR3_EXPLICIT=1`):** `mother tongue`→`language` by explicit (0.97, probe abstained — the win); `weather`→abstain (none-escape, no confident-wrong); default off → no resolution (byte-identical). **Honest correction:** the production 64-class probe is *stronger* than the 3-class ablation implied — it resolves `head city`→capital, `legal tender`→currency, `altitude`→elevation by meaning (Tier 1); Tier 2 is the safety net for genuine abstains, and on a rich label set the `none` escape is necessarily weaker (`banana`→`food_category`, a real relation here, defensible). Verdict [`docs/diagnoses/fr3-explicit-rewrite.md`](docs/diagnoses/fr3-explicit-rewrite.md) §"BUILD LANDED". +- **FR3b — relation probe is phrasing-brittle; explicit rewrite + `none` escape wins (2026-06-08). → BUILT 2026-06-09 (above).** FR3's 1.00 was synonym *words* in one template; on an unseen *phrasing* the probe sits at **chance** at its L10 probe layer (`examples/fr3_template_ablation.rs`: held-out "The {r} for {e} would be", N=6, k=1/2/4 templates = 0.33/0.39/0.39 @L10; signal is early at L6 and decays with depth). More training templates = measured **no-op** → that change was **reverted** (it 4×'d build cost for nothing at the probe layer). **Explicit few-shot `word→relation` classify** (one forward via `predict_kquant`, `examples/fr3_explicit_rewrite.rs`) = **12/12** on synonyms + unseen phrasings (head city→capital, legal tender→currency, mother tongue→language — exactly the probe's chance cases), but forced-choice confident-wrongs distractors **2/3** (weather/altitude→capital) → add a `none` escape + `music→none` few-shot → **0/3** (all abstain), 12/12 kept. The `none` escape is the verify/abstain — the project's recurring forced-choice trap (cf. FR1's 0.75 gate). **BUILD NEXT: probe-first / explicit-classify-with-`none` fallback** in `resolve_relation_synonym` (FR2 two-tier shape) — Tier 1 = existing probe (cheap, on `MIN_CONFIDENCE`), Tier 2 = explicit classify on abstain. *Wiring wrinkle:* Tier 2 needs lm_head (full forward) but `RelationResolver` only dequantises `0..=L10` → run it via the **Session vindex** (`predict_kquant`/`InferenceWeights`, the INFER path), not the resolver's partial setup (~30 lines, resolver→session boundary). Lift the few-shot frame + `none`-gated accept from `fr3_explicit_rewrite.rs`. Verdict [`docs/diagnoses/fr3-explicit-rewrite.md`](docs/diagnoses/fr3-explicit-rewrite.md). +- **FR retrieval-augmented early-exit — shipped + decode-loop declined (2026-06-08).** When the FR1 verified router fires, short-circuit the forward at the resolved stored layer (skip tail + lm_head). Parity-exact (40/40 residual + token), **1.44× on fact-lookup answer tokens** (457→319 ms, gemma3-4b-q4k), distractor-safe. Wired end-to-end: `INFER … ROUTE VERIFY EXIT` + `LARQL_KNN_EARLY_EXIT` (dense + q4k, verified-only — EXIT ignored with FALLBACK; on verify-miss completes the full forward = parity). 2419 lql+inference tests green, committed `d9b761f6`, pushed. **Decode-loop extension MEASURED & DECLINED:** parity-safe only on the terminal token (KV-cache invariant), blended T=2 1.15×/T=5 1.05×/fact-not-terminal 1.00× → net-marginal; INFER is the lever's home. Harnesses `examples/fr_early_exit_{probe,parity,bench,decode_projection}.rs`. - **FR routing GAIN quantified — correctness, not throughput (2026-06-07).** Benchmark (`crates/larql-inference/examples/fr_routing_gain.rs`) runs all three router modes on the same forwards over CORRECT/DISTRACTOR/ALIAS slices (Gemma-3-4B, 20 installed facts @L26). **Legacy KNN injection is unsafe at scale: 0/20 distractor-safe** — 20 facts already confident-wrong 100% of unrelated queries (near-rank-1 cosine collides > 0.75). **FR1 verified fixes it: 0→100% distractor-safe**, installed-fact recall preserved (20/20), at **~13 µs/call** (0.05% of a decode — no tok/s cost). The gain is the KNN-fact-injection feature becoming safe to ship; it is NOT a throughput gain (the override is a post-logits sidecar; no-op without a KnnStore). **Caveat surfaced + hardened:** two-tier's fallback has no entity-name guard → 0/20 distractor-safe (same as legacy) → `ROUTE VERIFY FALLBACK` is a targeted alias tool, `ROUTE VERIFY` is the safe open default (doc-comment + LQL spec updated). FR4 is a dispatch criterion, not a speed gain. Verdict: [`docs/diagnoses/fr-routing-gain.md`](docs/diagnoses/fr-routing-gain.md). - **Fleet routing extensions FR1/FR2/FR3 — MEASURED on a real vindex, all WIN, builds greenlit (2026-06-07).** The `chris-experiments/fleet` native-store arc (E10–E17) + `videos/the-mechanism` build story ported into the Query/Edit/Interpret track. Spec + frozen pre-registrations: [`docs/fleet-routing-extensions.md`](docs/fleet-routing-extensions.md); roadmap in [`ROADMAP.md`](ROADMAP.md) §"FR". Three Rust measurement harnesses run against `output/gemma3-4b-q4k-v2.vindex` (the production `KnnStore` cosine path + `capture_residuals`), judged in predictive units (mean-cosine banned), all three WIN: **FR1** (`examples/fr1_topk_fuzzy_router.rs`) — the entity key is real & answer-leak-free at L24-26 (L26 top1 **0.89**/top5 0.95, cross-rel 1.00, **beats E15's MLP under plain cosine-NN, no training**); the live `query_top1`+fixed-0.75 gate (`infer_patched.rs:162-163`) fires **150/150** with **11% confident-wrong @L26, 84% @L20** → the defect is the consumer, fix = top-k+verify+abstain at the resolved layer. **FR3** (`examples/fr3_relation_address.rs`) — relation synonym-gen **1.00 at every layer L6-L26** (semantic, not lexical; clean from L6, earlier than the video's L10); asymmetry stark vs entity top-1 0.07-0.20 until L26. **FR2** (`examples/fr2_two_tier_router.rs`) — symbolic exact-match **0/10** aliases, activation fallback **10/10 top-1** (Persia→Iran, …) = E16 reproduced (famous-alias easy end; general = FR1's ~0.9 top-5). Verdicts in `docs/diagnoses/fr{1,2,3}-*.md`; artifacts in `bench/aim-validation/fr{1,2,3}_*.json`. **FR4** (E17 compute→dispatch) remains research-first — E17's own ledger demotes the E4 bridge to a conjecture (G/O/T never ran), and the E17 rig lives in `chris-experiments`. **FR1 + FR2 BUILDS LANDED (2026-06-07).** `apply_knn_override_verified` (FR1: top-k + entity-in-prompt verify + abstain) and `apply_knn_override_two_tier` (FR2: tier-1 verify → tier-2 activation alias fallback), both resolved-layer-first (no hardcoded layer), wired into `infer_patched`/`infer_patched_q4k`, opt-in `LARQL_KNN_VERIFY` (+`LARQL_KNN_FALLBACK` for FR2), **default off = byte-identical** (23 infer_patched tests green incl. 14 legacy unchanged, clippy clean). E2E real Gemma-3-4B: FR1 fixes the measured confident-wrong (Germany-paraphrase legacy→SpainX, verified→GermanyX, no regression); FR2 recovers the alias "capital of Persia" (verify-only abstains→Tehran, two-tier→IranX cos 0.97). **LQL SURFACE LANDED (2026-06-07):** `KnnRouteMode` enum threaded through `infer_patched` (default `Legacy` = byte-identical, `from_env()` preserves env-gating for Python/EXPLAIN); first-class `INFER … ROUTE VERIFY [FALLBACK] [TOPK n]` clause (lexer+ast+parser+executor, 5 parser tests, 715 lql + 23 inference tests green, clippy clean, spec'd). E2E with no env vars: `ROUTE VERIFY` → Germany fixed, `ROUTE VERIFY FALLBACK` → Persia→IranX recovered. **FR3 BUILD LANDED (2026-06-07):** `RelationResolver` (`executor/relation_resolver.rs`) — a trained residual softmax probe (NOT string/cosine: residuals are near-rank-1, so cosine would be the "proxy is not the thing" trap), model-agnostic probe layer (`round(0.3·num_layers)`), wired into `SELECT … FROM EDGES WHERE relation=…` as a cached semantic fallback when exact-string misses. E2E real Gemma-3-4B: `WHERE relation="seat"` → resolved to "capital", returned the capital edges. **All three measured wins (FR1/FR2/FR3) are now built + validated e2e.** **FR4 RAN (2026-06-07) — E4 conjecture REFINED:** added the real external ops to the E17 rig — DIST (geometric) + ARGMIN (selection) **ride free @L1**, only PARTITION (global optimization) **walls like parity**. Parity was NOT a fair stand-in for "external"; E4's internal/external split mis-files geometric/selection (they factor through reads → internal). Re-cut dispatch criterion: keep count/filter/aggregate/threshold/majority/distance/argmin internal, route global-optimization+parity external (`E17_EXTERNAL_VERDICT.md`). **All four FR items (FR1-FR4) now complete** — FR1/FR2/FR3 measured+built+LQL-surfaced, FR4 measured (conjecture refined). @@ -47,7 +57,7 @@ most plan if it fails — see [`ROADMAP.md`](ROADMAP.md) §"Strategic priorities | 2 | V1 hash routing across all layers | **✅ DONE — FALSIFIED (dense 2026-05-31 + MoE-within-expert 2026-06-05); KU4 FULLY CLOSED** | `larql-inference` (`examples/walk_ffn_v1_hash_routing.rs`, `examples/walk_ffn_v1_moe_within_expert.rs`), `larql-compute` (`moe/within_expert.rs`), `bench/aim-validation` | **Dense:** per-layer KL ≤ 0.05 thresholds DON'T compound — +5.4 to +7.7 bits/token NLL + 78–95% drift on Gemma 3 4B / Llama 2 7B / Mistral 7B; deployable ~2.4–2.9× (not 5×). **MoE-within-expert (the previously-OPEN half): ALSO FALSIFIED** — within a 26B-A4B expert's 704-feat FFN, L0–13 are fully dense (need all 704), only L14–29 per-layer-sparse, and they DON'T compound (50% argmax drift; mean NLL Δ−0.15 = the #26 trap — drift is the truth). Deployable BW only ~1.19× (oracle pays gate+up; half the layers save nothing); 1.91× best case unrealizable (cheap route clears 6%). Parity-anchored (all-dense schedule KL=0.00000). FFN/expert feature-sparsity multiplier **dead dense AND MoE**. Artifacts `v1_*.json` + `v1moe_gemma4-26b-a4b-q4k.json`; writeups `docs/diagnoses/v1-hash-routing.md` + `v1-moe-within-expert.md`. | | 3 | V3 disk-resident mmap spike (pulled forward) | **~ DONE 2026-05-31 — locality POOR** | `larql-vindex` (`mmap_cold_read_probe`), `bench/aim-validation/moe-routing` | KU5 resolved across two halves: (a) cold-read latency — cold scattered 16 KB read ~100µs p50/140µs p99, warm ~0.04µs (~2380×); (b) MoE routing locality (faithful in-process 26B decode) — working set saturates to ~124/128 experts/sequence (uniform-random), **no cacheable hot subset**. 26B fits RAM (fine after warmup); **>RAM frontier MoE would thrash** → long-term disk-residency bet undermined ("bigger RAM" not "spill to disk"). e2e tok/s on a genuinely->RAM model still needs different hardware. Artifacts `v3_granite-30b.json` + `moe-routing/v3moe_locality.json`; writeups `docs/diagnoses/v3-disk-resident-mmap.md` + `moe-routing-locality.md`. | | 4 | V2 FP4 generality | **✅ DONE 2026-05-31 — CONFIRMED** | `larql-vindex` (`fp4_q1_scan`), `larql-inference` (`walk_ffn_v2_fp4_nll`) | ≥99.8% per-feature R<16 on Gemma 3 4B + Granite 3B/8B (reproduces exp 26's 99.83% on gemma3 down exactly; `down` the tail). Predictive E2M1 +0.116 bits/tok vs f32, **beats** the shipped Q4-int baseline. No QAT. KU3 resolved CONFIRMED. (Llama/Mistral/MoE-expert weights need f16 exports — not covered.) Artifacts `v2_*_scan.json`; writeup `docs/diagnoses/v2-fp4-generality.md`. | -| 5 | C10 CPU baseline bench | **✅ DISCREPANCY RESOLVED 2026-06-02 — no regression** | `larql-cli`, `bench/` | The 1.50× (05-16) vs 1.93× (05-31) split was **two measurement confounds**: a larql **path mismatch** (27.6 StandardEngine vs 23.6 legacy `bench --cpu`, stable ~12% delta) and a llama.cpp **harness artifact** (45.5 was an unwarmed ollama `num_gpu=0` fluke; warm = 42.8–43.0 = llama-bench 42.99). Reconciled like-for-like (t=8, warm): **larql 23.5 legacy / 26.4 StandardEngine vs llama.cpp 43.0 → ~1.6–1.8×.** Gap is C12 (attn+FFN already on int8 Q8_K SDOT). **Free wins:** `bench --cpu` now shows the StandardEngine row; `--ollama-cpu` gives a true CPU baseline (was Metal-GPU). **26B-A4B baseline — STAGED, blocked on an idle machine (2026-06-06):** Q4_K_M GGUF now built from the *cached* safetensors (no download) at `/tmp/gemma4-26b-Q4_K_M.gguf`; in-process KV-cached CPU MoE row wired into `larql bench` (`LocalMoeFfn`), smoke n=8 = 1.8 tok/s (UNVERIFIED — needs n=128 warm; gemma4-in-llama.cpp CPU speed also unverified). Runbook `bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md`. 4B artifact `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. C12 v1 asm kernel landed opt-in (see ROADMAP.md C12). | +| 5 | C10 CPU baseline bench | **✅ DISCREPANCY RESOLVED 2026-06-02 — no regression** | `larql-cli`, `bench/` | The 1.50× (05-16) vs 1.93× (05-31) split was **two measurement confounds**: a larql **path mismatch** (27.6 StandardEngine vs 23.6 legacy `bench --cpu`, stable ~12% delta) and a llama.cpp **harness artifact** (45.5 was an unwarmed ollama `num_gpu=0` fluke; warm = 42.8–43.0 = llama-bench 42.99). Reconciled like-for-like (t=8, warm): **larql 23.5 legacy / 26.4 StandardEngine vs llama.cpp 43.0 → ~1.6–1.8×.** Gap is C12 (attn+FFN already on int8 Q8_K SDOT). **Free wins:** `bench --cpu` now shows the StandardEngine row; `--ollama-cpu` gives a true CPU baseline (was Metal-GPU). **26B-A4B baseline ✅ LANDED 2026-06-10:** llama.cpp 32.1 vs larql in-proc 7.1 / 9.7 (`LARQL_Q4K_DIRECT_ATTN=1`) / loopback 7.3 — gap is f32-residency byte traffic (~10 GB/tok vs ~2.1), not the C12 kernel; tier 62%→70%; the 1.8 smoke was a cold artifact. Artifact `bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json` (method now mandates AC-power check + drift bracket). 4B artifact `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`. C12 v1 asm kernel landed opt-in (see ROADMAP.md C12). | | **5b** | **Q4K-direct attention — INVESTIGATED TO COMPLETION** | **recon ✅ + gates ✅ + built + parity ✅ + end-to-end ✅ (flag fires, parity holds; +4.6% short-ctx, ~0% rep-ctx — expert/GQA-bound); prefill twin GATED→FALSIFIED (compute-bound gemm, q4k 20× slower). Ship decode path opt-in, don't headline. Throughput next = expert/network + O(N²) GQA, NOT Q4K attn** | `larql-compute`, `larql-inference`, `larql-kv` | **Top engineering lever, fully in our control — and the one speed item with NO thesis riding on it** (the graph-FFN speed theses were all falsified, #17–#28). Route attention (~28%) + dense FFN (~13%) + lm_head (~12%) through Q4K-direct matvecs (Q4_K super-block × f32), not dequant-to-f32 BLAS; drop the up-front dequant-all. **This is ENGINEERING not research: the gate is PARITY (the spine, BEFORE any timing number — a tok/s on a wrong RoPE/GQA kernel SHIPS).** Parity baseline = **Q4K-direct vs Q4K-dequant** (skip the f32 round-trip), NOT vs f32-from-f32 (don't conflate quant error with dequant-tax removal); report as a per-token distribution + worst-token (attention feeds everything downstream). **Step 1 RECON ✅** ([`docs/diagnoses/q4k-direct-attention.md`](docs/diagnoses/q4k-direct-attention.md)): it's a **projection swap** (only Q/K/V/O accelerable; RoPE/GQA/concat stay f32); the 28% is paid in **`run_attention_block_decode_step_backend`** (decode.rs:111, per-token) — NOT `run_attention_with_kv_backend` (that's prefill/TTFT); **don't wire `q4_attention_proj`** (guards Q4_K but calls the Q4_0 kernel, only Q4_0-smoke-tested) — use `q4k_matvec`/`q4k_dual_matvec` via `quant_matvec(format,…)` reading `index.attn_kquant_layer_data`. **Step 2 gates ✅ GREEN** (both probes, #24-trap guard): projections = **97–98% of the attention block** at the measured ctx band (Gate 1), and `q4k_matvec` **beats f32 BLAS 2.06–2.51×** (Gate 2) → ~14–16% net decode win from attention alone (short-ctx end of range; decays with cached_len). **Step 3 ✅ built + unit-parity GREEN**: `run_attention_block_decode_step_q4k_direct` (opt-in `LARQL_Q4K_DIRECT_ATTN=1`, per-layer f32 fallback), Q4K-direct≈Q4K-dequant < 1e-3. Exit: end-to-end **net decode tok/s > dense at a representative ctx, pre-committed** (#24 lesson; isolated win ≠ end-to-end). Full framing: **task #16**; [`docs/diagnoses/remote-moe-bottlenecks.md`](docs/diagnoses/remote-moe-bottlenecks.md) (decode-stage split) + [`docs/diagnoses/walk-ffn-performance.md`](docs/diagnoses/walk-ffn-performance.md) (why this is the only lever left). | | 6 | MI4/T7 trace truthfulness gate | queued | `larql-inference` | TRACE final residual/logit parity pinned for WalkFfn and patched-vindex paths, then Q4K/MoE. (Also the verify/reversible backbone for the Query/Edit/Interpret track.) | | 7 | R6 depth-fraction probe API | queued | `larql-inference`, `larql-models` | Stable probe API available before MTP3 layer-choice validation. | @@ -64,6 +74,7 @@ only on a WIN, parity-first. | Order | Item | Status | Owner | Exit criterion (the measurement) | |---:|---|---|---|---| | FR3 | Relation as a clean semantic address | **✅ MEASURED + BUILT 2026-06-07** | `larql-lql`, `larql-vindex` | **Measured:** Gemma-3-4B/N=40, relation synonym-gen **1.00 at every layer L6-L26** (semantic, not lexical; clean from L6); asymmetry stark vs entity 0.07-0.20 until L26. **Built:** `RelationResolver` (`executor/relation_resolver.rs`) — trained residual softmax probe (NOT string/cosine, near-rank-1 proxy trap avoided), model-agnostic probe layer (`round(0.3·num_layers)`), wired into `SELECT … FROM EDGES WHERE relation=…` as a cached semantic fallback. E2E real Gemma-3-4B: `WHERE relation="seat"` → resolved to "capital", returned capital edges. 2 unit + 717 lql tests green, clippy clean. `examples/fr3_relation_address.rs`, [`docs/diagnoses/fr3-relation-address.md`](docs/diagnoses/fr3-relation-address.md). | +| FR3b | Explicit relation rewrite (phrasing-robust fallback) | **✅ MEASURED 2026-06-08 + BUILT 2026-06-09** | `larql-lql`, `larql-inference` | **Measured:** probe is synonym-robust but phrasing-brittle (chance @L10 on a held-out phrasing; more templates = no-op, reverted). Explicit few-shot `word→relation` classify = **12/12** synonyms+phrasings; distractors **2/3** confident-wrong → **`none` escape → 0/3**, 12/12 kept. **Built:** two-tier `resolve_relation_synonym` (Tier 1 probe → Tier 2 `resolve_relation_explicit` on abstain — few-shot+`none` frame, one full forward via `InferenceWeights::predict_dense` since lm_head needs the full vindex not the resolver's `0..=L10`, `none`-gated `match_relation_top1`), opt-in `LARQL_FR3_EXPLICIT`, default off = byte-identical. Real-vindex fix: `relation_labels_ranked` (by feature count) replaces alphabetical top-64 (which dropped `language`/kept `food_animal`). E2E real Gemma-3-4B: `mother tongue`→`language` by explicit (0.97, probe abstained); `weather`→abstain (none-escape); default off → no resolution. Probe stronger than ablation implied (handles `head city`/`legal tender`/`altitude` via Tier 1). 4 new tests, 726 lql lib green, clippy clean. `examples/fr3_{template_ablation,explicit_rewrite}.rs`, [`docs/diagnoses/fr3-explicit-rewrite.md`](docs/diagnoses/fr3-explicit-rewrite.md) §"BUILD LANDED". | | FR1 | Top-k fuzzy entity router + verifier | **✅ MEASURED + BUILT 2026-06-07** | `larql-vindex`, `larql-inference`, `larql-lql` | **Measured:** Gemma-3-4B/N=150, entity key real & answer-leak-free at L24-26 (L26 top1 **0.89**/top5 0.95, CROSS 1.00, beats E15's MLP, no training); live `query_top1`+0.75 gate fires **150/150** → **11% confident-wrong @L26, 84% @L20**. **Built:** `apply_knn_override_verified` (top-k + entity-in-prompt verify + abstain, resolved-layer-first, opt-in `LARQL_KNN_VERIFY`, default off = byte-identical, 14 legacy + 5 new tests green, clippy clean). E2E real Gemma-3-4B: legacy Germany-paraphrase→SpainX (confident-wrong) → verified→GermanyX (fixed), no regression. `examples/fr1_topk_fuzzy_router.rs`, [`docs/diagnoses/fr1-topk-fuzzy-router.md`](docs/diagnoses/fr1-topk-fuzzy-router.md). LQL `ROUTE TOPK k VERIFY` = follow-up. | | FR2 | Two-tier symbolic→activation router | **✅ MEASURED + BUILT 2026-06-07** | `larql-inference`, `larql-vindex`, `larql-lql` | **Measured:** symbolic exact-match **0/10** aliases (Persia≠Iran), activation fallback **10/10 top-1** @L24/L26 — E16 reproduced. **Built:** `apply_knn_override_two_tier` (tier-1 FR1 verify → tier-2 activation alias fallback, opt-in `LARQL_KNN_VERIFY`+`LARQL_KNN_FALLBACK`, default off = byte-identical, 4 new + 19 = 23 tests green, clippy clean). E2E real Gemma-3-4B: "capital of Persia" → verify-only abstains (Tehran), two-tier recovers IranX (cos 0.97); named case no regression. Tier-2 = fuzzy ~0.7-0.9 route (fires only when verify missed). `examples/fr2_two_tier_router.rs`, [`docs/diagnoses/fr2-two-tier-router.md`](docs/diagnoses/fr2-two-tier-router.md). | | FR4 | Operation-class dispatch boundary | **✅ MEASURED 2026-06-07 — conjecture REFINED** | `larql-lql`, `larql-router`, `larql-vindex` | Ran the real external ops on the E17 rig (`e17_ladder.py external`, +DIST/ARGMIN/PARTITION). **DIST (geometric) + ARGMIN (selection) ride free @L1; only PARTITION (global optimization) walls like parity** (NO-CLEAR 0.81). Parity was NOT a fair stand-in — E4 mis-files geometric/selection (internal). Real line = factors-through-reads vs global-joint. Dispatch: keep count/filter/aggregate/threshold/majority/distance/argmin internal, route global-optimization+parity external. `E17_EXTERNAL_PLAN.md`/`E17_EXTERNAL_VERDICT.md`, `e17_external.json`. | diff --git a/bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json b/bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json new file mode 100644 index 000000000..b767e4320 --- /dev/null +++ b/bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json @@ -0,0 +1 @@ +{"experiment":"fr3_explicit_rewrite","vindex":"output/gemma3-4b-q4k-v2.vindex","synonym_top1":[6,6],"phrasing_top1":[6,6],"distractor_fires":[0,3],"cases":[{"w":"seat","bucket":"synonym","expected":"capital","top1":"capital","rank":1},{"w":"metropolis","bucket":"synonym","expected":"capital","top1":"capital","rank":1},{"w":"money","bucket":"synonym","expected":"currency","top1":"currency","rank":1},{"w":"cash","bucket":"synonym","expected":"currency","top1":"currency","rank":1},{"w":"tongue","bucket":"synonym","expected":"language","top1":"language","rank":1},{"w":"speech","bucket":"synonym","expected":"language","top1":"language","rank":1},{"w":"head city","bucket":"phrasing","expected":"capital","top1":"capital","rank":1},{"w":"main city","bucket":"phrasing","expected":"capital","top1":"capital","rank":1},{"w":"legal tender","bucket":"phrasing","expected":"currency","top1":"currency","rank":1},{"w":"unit of money","bucket":"phrasing","expected":"currency","top1":"currency","rank":1},{"w":"spoken language","bucket":"phrasing","expected":"language","top1":"language","rank":1},{"w":"mother tongue","bucket":"phrasing","expected":"language","top1":"language","rank":1},{"w":"banana","bucket":"distractor","expected":"","top1":"none","rank":-1},{"w":"weather","bucket":"distractor","expected":"","top1":"none","rank":-1},{"w":"altitude","bucket":"distractor","expected":"","top1":"none","rank":-1}]} \ No newline at end of file diff --git a/bench/aim-validation/fr3_template_ablation_gemma3-4b.json b/bench/aim-validation/fr3_template_ablation_gemma3-4b.json new file mode 100644 index 000000000..188a01a59 --- /dev/null +++ b/bench/aim-validation/fr3_template_ablation_gemma3-4b.json @@ -0,0 +1 @@ +{"experiment":"fr3_template_ablation","vindex":"output/gemma3-4b-q4k-v2.vindex","n_entities":6,"held_out_template":"The {r} for {e} would be","layers":[{"layer":6,"acc_k1":0.3889,"acc_k2":0.3333,"acc_k4":0.8333},{"layer":10,"acc_k1":0.3333,"acc_k2":0.3889,"acc_k4":0.3889},{"layer":14,"acc_k1":0.3333,"acc_k2":0.3333,"acc_k4":0.3333},{"layer":20,"acc_k1":0.1667,"acc_k2":0.1111,"acc_k4":0.1667}]} \ No newline at end of file diff --git a/bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json b/bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json new file mode 100644 index 000000000..2d852151d --- /dev/null +++ b/bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json @@ -0,0 +1 @@ +{"experiment":"fr_early_exit_decode_projection","vindex":"output/gemma3-4b-q4k-v2.vindex","install_layer":24,"num_layers":34,"fired":12,"full_ms":466.4905,"early_ms":344.9526,"per_token_speedup":1.3523} \ No newline at end of file diff --git a/bench/baselines/_c10_26b_larql_inproc.json b/bench/baselines/_c10_26b_larql_inproc.json new file mode 100644 index 000000000..e11c46534 --- /dev/null +++ b/bench/baselines/_c10_26b_larql_inproc.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781043889", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 4446.251667, + "ms_per_tok": { + "mean": 159.1440967795276, + "p50": 153.134333, + "p99": 267.903 + }, + "tok_per_s": 6.283613531611942, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_c10_26b_larql_inproc_off.json b/bench/baselines/_c10_26b_larql_inproc_off.json new file mode 100644 index 000000000..0524d9698 --- /dev/null +++ b/bench/baselines/_c10_26b_larql_inproc_off.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781130875", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 6587.231458, + "ms_per_tok": { + "mean": 140.4677713307087, + "p50": 134.121208, + "p99": 216.553542 + }, + "tok_per_s": 7.119070734351308, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_c10_26b_larql_inproc_q4kattn.json b/bench/baselines/_c10_26b_larql_inproc_q4kattn.json new file mode 100644 index 000000000..a1e7b9023 --- /dev/null +++ b/bench/baselines/_c10_26b_larql_inproc_q4kattn.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781130897", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 651.566959, + "ms_per_tok": { + "mean": 103.53604429133861, + "p50": 101.69933400000001, + "p99": 157.133625 + }, + "tok_per_s": 9.658472147014947, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_c10_26b_larql_inproc_rerun.json b/bench/baselines/_c10_26b_larql_inproc_rerun.json new file mode 100644 index 000000000..fbad88a56 --- /dev/null +++ b/bench/baselines/_c10_26b_larql_inproc_rerun.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781045118", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 2574.087584, + "ms_per_tok": { + "mean": 460.48107119685034, + "p50": 434.098458, + "p99": 712.708333 + }, + "tok_per_s": 2.1716419252606185, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_c10_26b_larql_inproc_tmax.json b/bench/baselines/_c10_26b_larql_inproc_tmax.json new file mode 100644 index 000000000..a81af8482 --- /dev/null +++ b/bench/baselines/_c10_26b_larql_inproc_tmax.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781044778", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 1788.925916, + "ms_per_tok": { + "mean": 307.8270738188976, + "p50": 292.93525, + "p99": 477.71054200000003 + }, + "tok_per_s": 3.2485771559792207, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full.json b/bench/baselines/_qres_full.json new file mode 100644 index 000000000..f51b3c8f1 --- /dev/null +++ b/bench/baselines/_qres_full.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781204761", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 651.106875, + "ms_per_tok": { + "mean": 71.86225622834645, + "p50": 69.125958, + "p99": 126.85254100000002 + }, + "tok_per_s": 13.915510763013653, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_asm.json b/bench/baselines/_qres_full_asm.json new file mode 100644 index 000000000..466ff81dd --- /dev/null +++ b/bench/baselines/_qres_full_asm.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781204777", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 878.4857499999999, + "ms_per_tok": { + "mean": 63.034867456692886, + "p50": 62.892792, + "p99": 70.413875 + }, + "tok_per_s": 15.864235784854062, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_int8attn.json b/bench/baselines/_qres_full_int8attn.json new file mode 100644 index 000000000..31f6ff4ef --- /dev/null +++ b/bench/baselines/_qres_full_int8attn.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781215508", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 623.639625, + "ms_per_tok": { + "mean": 53.79693537007876, + "p50": 51.4435, + "p99": 85.4415 + }, + "tok_per_s": 18.588419454023185, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_kvappend.json b/bench/baselines/_qres_full_kvappend.json new file mode 100644 index 000000000..84b0b31f9 --- /dev/null +++ b/bench/baselines/_qres_full_kvappend.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781303433", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 648.1185409999999, + "ms_per_tok": { + "mean": 35.78364665354332, + "p50": 35.689167, + "p99": 38.137292 + }, + "tok_per_s": 27.945726428666802, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_q6kasm.json b/bench/baselines/_qres_full_q6kasm.json new file mode 100644 index 000000000..36906ebb9 --- /dev/null +++ b/bench/baselines/_qres_full_q6kasm.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781216286", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 601.012541, + "ms_per_tok": { + "mean": 46.30266011023623, + "p50": 46.490790999999994, + "p99": 51.000875 + }, + "tok_per_s": 21.597031307039913, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_serialcuts.json b/bench/baselines/_qres_full_serialcuts.json new file mode 100644 index 000000000..879dca8ac --- /dev/null +++ b/bench/baselines/_qres_full_serialcuts.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781222172", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 704.3161249999999, + "ms_per_tok": { + "mean": 42.604734905511805, + "p50": 42.479625, + "p99": 48.934290999999995 + }, + "tok_per_s": 23.47156958534741, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_full_v3.json b/bench/baselines/_qres_full_v3.json new file mode 100644 index 000000000..473ebb9c1 --- /dev/null +++ b/bench/baselines/_qres_full_v3.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781218079", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 661.730292, + "ms_per_tok": { + "mean": 46.920711937007894, + "p50": 46.163792, + "p99": 66.024875 + }, + "tok_per_s": 21.31254959094658, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/_qres_off.json b/bench/baselines/_qres_off.json new file mode 100644 index 000000000..50e378f4b --- /dev/null +++ b/bench/baselines/_qres_off.json @@ -0,0 +1,23 @@ +{ + "timestamp": "1781204743", + "model": "output/gemma4-26b-a4b-q4k.vindex", + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall:", + "tokens": 128, + "wire": null, + "concurrent": 1, + "results": [ + { + "backend": "larql-cpu-moe (standard)", + "prefill_ms": 4660.109875, + "ms_per_tok": { + "mean": 131.48366732283458, + "p50": 126.305417, + "p99": 192.647666 + }, + "tok_per_s": 7.605507363471078, + "wire_bytes_per_tok": null, + "n_steps": 127, + "note": "in-process experts, KV-cached" + } + ] +} \ No newline at end of file diff --git a/bench/baselines/c10_gemma3-4b_cpu_reconciled.json b/bench/baselines/c10_gemma3-4b_cpu_reconciled.json index 2f7876dd1..5cd15f9df 100644 --- a/bench/baselines/c10_gemma3-4b_cpu_reconciled.json +++ b/bench/baselines/c10_gemma3-4b_cpu_reconciled.json @@ -42,5 +42,5 @@ "ollama num_gpu=0 needs a warmup call after any GPU-mode use (mode switch forces a model reload); short unwarmed measurements are unreliable for a CPU baseline." ], "mechanism_confirms_C12": "The reconciled ~1.6-1.8x gap is the same kernel-quality gap the C12 diagnosis identified: 1.73x per-core, larql's NEON intrinsics (LLVM-lowered) vs llama.cpp's hand-asm Q4K x Q8K SDOT (two-super-block interleave + prefetch). Both attention and FFN already run the int8 Q8_K SDOT kernel. The lever remains C12 (hand-asm kernel), unchanged.", - "still_owed": "26B-A4B llama.cpp CPU baseline (needs a 26B GGUF, not local) -- that is the number that actually pins the medium-term tier." + "still_owed": "CLEARED 2026-06-10 -- see c10_gemma4-26b-a4b_cpu_reconciled.json (llama.cpp 32.1 vs larql 7.1 default / 9.7 with LARQL_Q4K_DIRECT_ATTN; the 26B gap is f32-residency byte traffic, not the C12 kernel)." } diff --git a/bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md b/bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md index 8fd8dd565..3ae2c4d7d 100644 --- a/bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md +++ b/bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md @@ -1,5 +1,24 @@ # Runbook — Gemma 4 26B-A4B CPU baseline (the medium-term-tier pin) +> **✅ COMPLETE 2026-06-10** — results in +> `c10_gemma4-26b-a4b_cpu_reconciled.json`: llama.cpp 32.1 tok/s vs larql +> in-process 7.1 (default) / 9.7 (`LARQL_Q4K_DIRECT_ATTN=1`) / loopback 7.3 +> (t=8, warm, n=128, drift-bracketed). Gap = f32-residency byte traffic +> (~10 GB/tok vs ~2.1), not the kernel. Tier 62%→70%. +> +> **Corrections to this runbook learned in the run:** +> - §2: serve with `--experts 0-127`, NOT `--ffn-only` (no expert endpoints → +> "bad expert response"). And `larql bench --moe-shards` still uses the +> pre-C1 path (fails on CPU, #146 signature) — use +> `larql run --moe-shards --engine standard` with `RAYON_NUM_THREADS=8`. +> - §Method addition (mandatory): `pmset -g batt` must show AC/full charge, +> and bracket the matrix with a llama-bench drift check. The first session +> was invalidated by a silent battery drain (llama.cpp itself fell 34→1.05 +> tok/s at 31% battery; ~30×, far beyond the thermal class) plus Spotlight +> churn after 30+ GB of model I/O. +> - The 1.8-vs-4.4 question dissolves: both were artifacts (cold n=8 smoke vs +> cross-session conditions). Warm AC: in-process 7.1 ≈ loopback 7.3. + Goal: produce `c10_gemma4-26b-a4b_cpu_reconciled.json` — the missing 26B-A4B CPU decode number that pins the **medium-term achievability tier** (currently 62%, gate rule in `ROADMAP.md`: *"if 10 tok/s ≈ llama.cpp-on-26B-CPU diff --git a/bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json b/bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json new file mode 100644 index 000000000..8c231bc64 --- /dev/null +++ b/bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json @@ -0,0 +1,139 @@ +{ + "test_id": "C10-26B-A4B", + "model": "gemma-4-26B-A4B-it (Q4_K)", + "machine": "M3 Max, 128GB, 8 threads, battery 100% throughout (verified, see power_state_lesson)", + "date": "2026-06-10", + "purpose": "The missing 26B-A4B CPU decode baseline that pins the medium-term achievability tier (ROADMAP.md gate rule on the ~62% row). Also resolves the runbook's in-process-1.8-vs-loopback-4.4 question and localises the larql-vs-llama.cpp MoE gap.", + "method": { + "prompt": "Write a long detailed essay about the history of the Roman empire, covering its founding, rise, and fall: (n=128 decode steps, 5 warmup, generation-friendly)", + "threads": 8, + "larql_inproc": "larql bench --cpu -n 128 --warmup 5 --threads 8 -> LocalMoeFfn (in-process experts, KV-cached, generate_with_engine_resident)", + "larql_inproc_q4k_attn": "same + LARQL_Q4K_DIRECT_ATTN=1 (task #16 decode path: Q/K/V/O via q4k/q6k matvec from index bytes instead of f32-resident BLAS)", + "larql_loopback": "larql serve --port 8081 --experts 0-127 (NOT --ffn-only; that mode breaks the expert endpoints) + RAYON_NUM_THREADS=8 larql run --moe-shards 0-127=http://127.0.0.1:8081 --engine standard -n 133, warm (one discarded n=20 run)", + "llama_bench": "llama-bench -m /tmp/gemma4-26b-Q4_K_M.gguf -dev BLAS -ngl 0 -p 64 -n 128 -r 3 -t 8 (GGUF built 2026-06-06 from the cached google/gemma-4-26B-A4B-it safetensors via llama.cpp HEAD convert + llama-quantize Q4_K_M; build d48a56eff (9430))", + "drift_check": "llama-bench tg128 r=1 re-run AFTER the larql legs: 30.45 vs 32.08 before (~5%) -> machine state stable across the matrix" + }, + "decode_tok_s_t8": { + "llama_bench_ngl0": 32.08, + "llama_bench_ngl0_stddev": 1.41, + "llama_bench_drift_check": 30.45, + "larql_inproc_default": 7.1, + "larql_inproc_q4k_direct_attn": 9.7, + "larql_loopback_shard": 7.3 + }, + "decode_ms_per_tok_t8": { + "llama_bench_ngl0": 31.2, + "larql_inproc_default": 140.47, + "larql_inproc_q4k_direct_attn": 103.54 + }, + "prefill": { + "llama_bench_pp64_tok_s": 80.26, + "larql_inproc_default_ms": 6587, + "larql_inproc_q4k_attn_ms": 652, + "note": "larql OFF prefill includes the lazy f32 dequant of attention tensors; the q4k-direct flag skips ensure_attn_tensors_dequantised entirely -> 10x TTFT. Different prompt lengths than pp64; not directly comparable to llama.cpp." + }, + "reconciled_gap_x": { + "default_vs_llamacpp": 4.5, + "q4k_attn_vs_llamacpp": 3.3, + "note": "Much wider than the 4B dense gap (1.6-1.8x) -- but the mechanism is NOT kernel quality (C12). It is byte traffic: the in-process MoE path keeps attention + dense FFN slab + lm_head f32-RESIDENT." + }, + "mechanism_byte_ledger_per_token": { + "attention_qkvo": {"params_b": 1.04, "larql_default": "f32 4.15 GB", "llama_cpp": "Q4_K ~0.58 GB"}, + "dense_ffn_slab_2112": {"params_b": 0.54, "larql_default": "f32 2.14 GB", "llama_cpp": "Q4_K ~0.30 GB"}, + "lm_head_vocab_262144": {"params_b": 0.74, "larql_default": "f32 2.95 GB", "llama_cpp": "Q4_K ~0.42 GB"}, + "experts_top8_of_128": {"params_b": 1.43, "larql_default": "Q4_K 0.80 GB", "llama_cpp": "Q4_K ~0.80 GB"}, + "total": {"larql_default_gb": 10.0, "larql_q4k_attn_gb": 6.4, "llama_cpp_gb": 2.1}, + "implied_bandwidth_gb_s": {"larql_default": 71, "larql_q4k_attn": 62, "llama_cpp": 67}, + "verdict": "All three legs run bandwidth-saturated at ~62-71 GB/s. The byte ratio (10.0/2.1 = 4.8x) explains essentially the whole measured 4.5x gap; the expert kernel (already int8 Q8_K SDOT) is only ~8% of default-path bytes and is NOT the bottleneck. The A/B confirms the model: cutting attention 4.15->0.58 GB predicted ~100 ms/tok, measured 103.5." + }, + "resolution_inproc_vs_loopback": { + "verdict": "The runbook's 'in-process 1.8 < loopback 4.4' anomaly was two artifacts, not a real inversion. The 1.8 was a cold/short-n (n=8) smoke; warm AC numbers are in-process 7.1 vs loopback 7.3 -- effectively equal. The loopback's second process contributes extra cores that roughly cancel its ~660 KB/token/direction HTTP+JSON serialization cost.", + "loopback_wire_per_token": "~330 KB sent + ~330 KB recv (30 layers x hidden x f32, both directions)" + }, + "power_state_lesson": { + "verdict": "The entire first measurement session (2026-06-09 night) was invalidated by the laptop silently running on BATTERY and draining 100->31%. Symptoms: monotonic decline 6.3->3.2->2.2 on identical commands, bimodal swings, and finally llama.cpp itself collapsing 34.3->1.05 tok/s (~30x, far beyond the 1.5-3x thermal class). Spotlight (mds_stores 91% after 30+ GB of model I/O) stacked on top.", + "discipline_addition": "C10 method now REQUIRES: pmset -g batt shows AC (or 100%), and a cross-engine drift check bracketing the matrix (llama-bench before AND after). A fast number on a degraded machine is a price floor; a slow number proves nothing." + }, + "secondary_findings": [ + "larql bench --moe-shards still calls the pre-C1 generate_with_remote_moe (no KV engine) and fails on CPU with the #146 error signature ('decode_token_with_moe returned None during prefill') regardless of server mode. larql run --moe-shards --engine standard is the working loopback instrument. Fix tracked: route bench through the C1 resident path.", + "larql serve --ffn-only does NOT serve the MoE expert endpoints the client needs ('bad expert response'); use --experts 0-127.", + "LARQL_DECODE_STAGES=1 records expert time only in the REMOTE branch of moe_ffn_block_cpu; the local-expert branch (LocalMoeFfn) is unrecorded, and the bench table never prints stages anyway.", + "gemma4-26B-A4B CPU in llama.cpp is NOT slow upstream (the runbook's open question): 32 tok/s tg128. The earlier 1-core observation was machine-load confound.", + "llama.cpp pays only 1.25x going dense-4B (43 tok/s) -> MoE-26B (32); active-param math holds (26B-A4B active FFN MACs are ~0.73x the 4B dense FFN's)." + ], + "roadmap_implications": { + "tier_gate": "Gate rule on the medium-term row: 'if 10 tok/s ~ llama.cpp-on-26B-CPU this rises toward 70'. Measured llama.cpp = 32, i.e. the >=10 target is ~3x BELOW what a mature engine proves the hardware does -> favorable branch, tier 62% -> 70%.", + "path_to_10": "LARQL_Q4K_DIRECT_ATTN alone reaches 9.7. Remaining f32 components are the same fix class: dense slab q4k (2.14->0.30 GB) + lm_head q4k (2.95->0.42 GB) -> ~2.0-2.3 GB/token -> ~15-20 tok/s at larql kernel efficiency (C12 1.6-1.8x residual vs llama.cpp). The lever is quantized RESIDENCY (engineering, parity-gated), not new research.", + "q4k_direct_attn_reframed": "Task #16's end-to-end verdict ('~0% at representative ctx') was measured on the network-bound loopback path. On the in-process path the flag is worth 1.36x decode + 10x TTFT. Promote from 'keep opt-in, don't headline' to the default for in-process CPU MoE once parity is re-confirmed at n>=128 (unit parity + e2e Paris already green from #16)." + }, + "artifacts": { + "larql_inproc_off": "bench/baselines/_c10_26b_larql_inproc_off.json", + "larql_inproc_q4k_attn": "bench/baselines/_c10_26b_larql_inproc_q4kattn.json", + "first_session_invalid": "bench/baselines/_c10_26b_larql_inproc.json (2026-06-09, battery-confounded; kept for the power-state lesson only)" + }, + "update_2026_06_11_quantized_residency": { + "what": "Built the residency levers the 06-10 analysis named: LARQL_Q4K_LM_HEAD (q4k lm_head in the resident decode loop, larql-kv generation.rs) + LARQL_Q4K_DIRECT_FFN (q4k dense slab via ffn_decode_step_native in moe_ffn_block_cpu_with_index, with padded-down handling for the 2112-stored-as-2304 layout, exact-parity tested) + C12 v2 fused gate+up hand-asm (q4k_q8k_gate_up_asm, bit-exact, under LARQL_Q4K_ASM). All opt-in, default off = byte-identical; 1148+755+686 tests green, clippy clean.", + "decode_tok_s_t8_n128_warm": { + "default_f32_resident": 7.6, + "full_residency_attn_lmh_ffn": 13.9, + "full_residency_plus_asm": 15.9, + "llama_bench_bracket_start": 32.54, + "llama_bench_bracket_end": 28.8 + }, + "verdict": "MEDIUM-TERM TARGET (>=10 tok/s on 26B-A4B CPU) PASSED at 15.9 on M3 Max (caveat: 128 GB M3 Max has more memory bandwidth than the 64 GB consumer class the tier names). Gap to llama.cpp now ~1.9x (was 4.5x). End bracket shows ~11% thermal drift after sustained load, so the later (flag) legs are if anything understated.", + "kernel_microbench": "Fused gate_up: neon 9.89 GiB/s (horizontal-sum-bound, the worst kernel in the file) -> asm 19.82 GiB/s (2.00x, exceeds the single-matrix asm's 18.4 -- shared activation loads + two independent SDOT chains fill the stall slots). Single-matrix kernels reproduce 2026-06-02 exactly (neon 17.7-18.2, asm 18.2-18.5, scalar 9.36).", + "honest_negative": "4B legacy bench A/B is e2e-NEUTRAL for the fused asm (23.2 vs 22.9; StandardEngine 26.2 both) -- the local direct-decode FFN deliberately uses two SEPARATE rayon-parallel matvecs (cached.rs), so q4k_q8k_gate_up_into's production consumers are the remote expert server (q8k_wire) and walk-FFN paths, not the local bench rows. The spec's 'production FFN gate/up uses it directly' was stale. The 26B's 13.9->15.9 ASM gain comes from the matvec_into callers (experts gate/up/down, lm_head, dense slab gate/up).", + "remaining_to_llamacpp": "~1.9x: (a) Q6_K asm (down projections + 26B dense-slab down -- no asm today); (b) the q4k-direct ATTENTION path runs the f32-activation q4k_matvec (q4_common), not the int8 q8k kernel -- route it through q4k_q8k + asm; (c) instruction-count reduction toward the 28 GiB/s single-kernel target; (d) optionally a rayon-chunked fused gate_up for the local decode paths (+~12% on gate/up portion).", + "raw_rows": "bench/baselines/_qres_{off,full,full_asm}.json" + }, + "update_2026_06_11_int8_attention": { + "what": "LARQL_Q4K_ATTN_INT8: the Q4K-direct attention projections move from the f32-activation kernels (quant_matvec -> q4k_matvec/q6k_matvec) to the int8 Q8_K SDOT kernels (q4k_q8k/q6k_q8k matvec_into, asm-aware, rayon-chunked; Q/K/V share one Q8_K quantisation of h_norm, O quantises the GQA context separately). Same numerics class as the dense-model production attention (attention_decode_step_native). Motivated by the stage split: attn was ~54% of decode at only ~26% of bytes (f32-activation kernel ~3x worse per byte than the expert path). Also: LARQL_DECODE_STAGES now records LOCAL expert time (was remote-only) and the bench MoE row prints the split.", + "decode_tok_s_t8_n128_warm_ac": { + "full_stack_f32act_attn": 16.7, + "full_stack_plus_attn_int8": 21.7, + "llama_bench_bracket": 33.2 + }, + "stage_split_ms_per_token": { + "before": {"attn": 32.7, "experts": 16.5, "dense": 8.1, "lm_head": 6.7}, + "after": {"attn": 19.3, "experts": 16.0, "dense": 8.2, "lm_head": 6.6}, + "note": "Order-swapped A/B (both orders run) — the one anomalous experts=23.7 reading was second-position noise; A reproduces 16.7/32.7 exactly in either position." + }, + "verdict": "Gap to llama.cpp now 1.53x (was 4.5x at the 06-10 baseline, 2.0x after residency+asm). The byte-residency gap is CLOSED; the remainder is C12 kernel-rate + per-step overhead territory: attn still ~30 GB/s effective vs experts ~50 (RoPE/GQA/KV-concat f32 + Q6_K V has no asm), Q6_K asm (dense down + V), instruction-count toward 28 GiB/s, KV-concat copies the cache every step (O(ctx) per layer).", + "parity": "q8k_direct_proj chunking bit-exact vs whole-matrix kernel call (Q4_K + Q6_K); int8-vs-f32-activation within 2% scale-relative (the Q8_K activation quant the production dense path already carries). Flags default off = byte-identical.", + "raw_row": "bench/baselines/_qres_full_int8attn.json" + }, + "update_2026_06_11_q6k_asm": { + "what": "q6k_q8k_matvec_asm under the same LARQL_Q4K_ASM gate: TBL-replicate for the hi2 unpack (replaces 4 scalar multiplies + vector rebuild per 16-elem group) + q4k-style vector-lane scale accumulate (replaces 16 horizontal addv + scalar muls per super-block). Bit-exact vs scalar (first try, same as the q4k and gate_up asm kernels).", + "isolated": "neon 9.97 -> asm 11.21 GiB/s (+12.4%) on the 2560x10240 down shape. Q6_K's 6-bit unpack is inherently heavier than Q4_K's, so the ceiling is lower.", + "e2e_26b": "NEUTRAL within noise: 21.6 vs 21.7 tok/s (attn 19.3->19.1 ms from the V projection, dense 8.2->8.0 from the down) -- the Q6_K streams are only ~13% of full-stack bytes (~3 ms), so +12% of that is ~0.4 ms. Keep: bit-exact, free, and the 4B dense down is Q6_K too (not re-measured).", + "remaining_lever_ranking": "At 46 ms/token (21.7 tok/s) vs llama.cpp ~30 ms: (1) q4k_q8k instruction-count reduction 18.4->28 GiB/s touches ~35 ms of the 46 (experts + attn projections + lm_head + dense gate/up) -- THE remaining lever, but the spec's own gate applies: needs Instruments port-saturation attribution before guessing at the asm restructure; (2) attention non-projection overhead ~7 ms (RoPE/GQA f32, KV-concat copies the cache every step -- grows with ctx); (3) per-super-block Rust glue (unpack_scales_mins + sum2) possibly partly hidden by OoO overlap -- measure, don't assume.", + "raw_row": "bench/baselines/_qres_full_q6kasm.json" + }, + "update_2026_06_11_v3_kernel_and_roofline_crossover": { + "measurement_first": "Criterion decomposition of the v1 asm matvec (attn_proj shape, single thread): asm block alone 16.3 cyc/SB, Rust glue alone 19.2 cyc/SB, full 29.6 — the glue (unpack_scales_mins + i32 scale array + scalar sum2 + 2x software f16_to_f32 + epilogue) costs as much as the asm and only ~3.6 cyc are OoO-hidden. Overturned the 'glue may be hidden' guess; justified folding the glue into the asm.", + "built": "q4k_sb_contrib_asm (v2: whole super-block in one asm block — TBL-vectorised 6-bit scale/min unpack, sum2 via smull/smlal2 over the i16 Q8_K sums, hardware fcvt for d/dmin, exact-order f32 epilogue) then q4k_row_dot_asm (v3: the super-block LOOP inside the asm, constants hoisted per row; the 144-byte header+quants layout lets one walking pointer do the whole row with zero pointer arithmetic). LARQL_Q4K_ASM now routes to v3. All bit-exact vs scalar on first run (fcvt h->s is exact; scvtf rounds identically to Rust `as f32`; same multiplication tree).", + "isolated_single_thread": {"v1": "18.3 GiB/s (29.7 cyc/SB)", "v2": "22.5 (24.1)", "v3": "23.8 (22.9)", "asm_only_floor": "33.3 (16.3)"}, + "e2e_26b_full_stack": "FLAT: 21.3 vs 21.7 tok/s (bracket 33.4). THE FINDING: at 8 threads the q4k_q8k streams are now MEMORY-BANDWIDTH-BOUND — v3 could demand ~190 GB/s aggregate but effective e2e throughput is ~47 GB/s (2.2 GB / 46.9 ms) vs llama.cpp's ~70 GB/s (2.1 GB / 30 ms). The single-core compute-bound roofline reading (2026-06-02) was correct THEN and the kernel work closed it; the multi-thread regime has crossed over.", + "strategic_implication": "C12's '28 GiB/s single-kernel target' is NO LONGER the path to llama.cpp parity at 8 threads. The remaining 1.55x is EFFECTIVE-BANDWIDTH UTILISATION: llama.cpp extracts ~70 GB/s from the same machine, larql ~47. Candidate sinks (unmeasured, next session's measurement-first target): rayon fork-join gaps (~180+ parallel sections per token leave the memory system idle at every join), compute-only phases with idle DRAM (GQA/RoPE/quantize ~7 ms), per-expert task granularity. The v3 kernel win is still banked: real at low thread counts and it lowers the compute floor everywhere.", + "raw_row": "bench/baselines/_qres_full_v3.json" + }, + "update_2026_06_12_serial_cuts": { + "measurement": "MT shape sweep: the rayon-chunked matvecs sustain 72-113 GiB/s at every production shape (104 MB arm = 113 from DRAM; experts-granularity arm 104) — the parallel sections were ACQUITTED. /usr/bin/sample of a live decode then attributed the gap: ~51k of ~68k thread-samples parked in cvwait/ulock/workq (workers asleep during serial main-thread sections). Named sinks: attention non-projection ~20% of wall (KV-concat zeros+bzero+memmove 5%, per-head cblas_sgemv GQA + scalar libm-exp softmax 5%, norms/RoPE), cpu_moe_forward glue ~12%, dense-slab step ~8% (scalar gelu + serial requantise), lm_head argmax epilogue 4.6% (serial softmax+top-k over 262K logits + ~3 MB temporaries), allocator madvise churn ~3%.", + "built": "(1) q4_lm_head_argmax — decode-loop argmax over RAW q4k lm_head logits (scaling/softcap/temperature are monotone -> identical token selection), parallel max, no softmax, no temporaries; wired into argmax_next_token_resident. (2) gqa_attention_decode_step rayon-parallel over heads (per-head math unchanged, disjoint output chunks). (3) dense-slab activation loop rayon-chunked in both cached.rs twins (same libm calls per element — bit-identical).", + "e2e": "n=128: 21.7 -> 23.5 tok/s (42.6 ms; lm_head stage 6.6->3.7 ms) — measured under a DEGRADED bracket (llama-bench 27.1; battery 87% discharging post-compile). Same-state gap 1.15x; vs the clean-bracket llama.cpp 33 ≈ 1.35x. n=512: 16.0 tok/s (62.6 ms) — the O(ctx) tax (~20 ms/token by step 500) is now the dominant remaining structural item.", + "remaining": "(1) KV append-in-place (engine KvHandle contract change): kills the per-layer per-step cache realloc+copy (grows with ctx; the n=512 run pays ~20 ms/token extra); (2) cpu_moe_forward serial glue (~12%: router, per-position copies, combine, build_moe_weights per layer per token); (3) madvise churn -> buffer reuse across steps; (4) remaining attention serial (norms/RoPE/concat-copy).", + "raw_rows": "bench/baselines/_qres_full_serialcuts.json" + }, + "update_2026_06_12_kv_append_in_place": { + "what": "CpuKvHandle rewritten from SharedKV tuples to growable row-major Vec buffers (rows x kv_dim valid prefix): append_row is amortised O(kv_dim). The q4k-direct attention step split into decode_step_project_q4k_direct / decode_step_attend_q4k_direct — the dispatch path projects, APPENDS IN PLACE, then attends over zero-copy views; no concat, no clone. The old monolithic form is kept as a legacy wrapper (owned-concat) for engine walk-loops that hold SharedKV tuples. The f32 fallback path now MOVES state out (take_shared) instead of cloning. append_kv (used by other engines) and clip_kv also lose their per-call O(ctx) rebuilds. gqa_attention_decode_step made generic over array storage (views + owned).", + "why_it_was_huge": "The old attention_step did as_shared_kv().cloned() (full-cache clone) + zeros((total,kv_dim)) x2 + four assigns per LAYER per STEP — at ctx 130 that is ~190 MB/token of pure memory churn, growing linearly with context.", + "e2e": { + "n128_warm": "23.5 -> 27.9 tok/s (35.8 ms/tok); llama-bench bracket 33.6/31.0 -> gap ~1.15x", + "n512": "16.0 -> 24.8 tok/s (p50 37.9 ms); llama.cpp tg512 = 32.0 -> long-ctx gap 1.29x (llama.cpp barely degrades with ctx; remaining larql ctx-growth is GQA compute, which is real work)", + "text_sanity": "4B through the resident engine with ALL flags (DIRECT_ATTN+INT8+LM_HEAD+ASM): 'The capital of France is **Paris**.' — coherent, correct." + }, + "cumulative_arc": "7.6 -> 13.9 (residency) -> 16.7 (+asm) -> 21.7 (+int8 attn) -> 23.5 (+serial cuts) -> 27.9 (+kv append-in-place): 3.7x total; gap to llama.cpp 4.5x -> ~1.15x short-ctx / 1.29x long-ctx.", + "tests": "692 compute + 1213 inference + 755 kv green (incl. the kv-engine parity oracles over the dispatch path); clippy clean on touched files.", + "raw_rows": "bench/baselines/_qres_full_kvappend.json" + } +} diff --git a/crates/larql-cli/src/commands/primary/bench/local_moe_runtime.rs b/crates/larql-cli/src/commands/primary/bench/local_moe_runtime.rs index 8443f0063..9c9e50908 100644 --- a/crates/larql-cli/src/commands/primary/bench/local_moe_runtime.rs +++ b/crates/larql-cli/src/commands/primary/bench/local_moe_runtime.rs @@ -57,6 +57,7 @@ pub(super) fn run_local_moe( let weights_ref: &ModelWeights = weights; let moe_ffn = larql_inference::ffn::LocalMoeFfn { weights: weights_ref, + index: Some(index), }; let mut rows = Vec::new(); @@ -96,6 +97,11 @@ fn run_one( let label = format!("larql-cpu-moe ({})", kind.display_name()); let mut engine = kind.build(larql_inference::cpu_engine_backend()); + // Per-stage split (`LARQL_DECODE_STAGES=1`): reset the thread-local + // accumulators so this engine's numbers aren't polluted by a previous + // engine kind in the same process. + larql_inference::decode_stages::reset(); + let max_tokens = args.warmup + args.tokens; // Capture a timestamp per emitted token: prefill (TTFT) is the gap from // start to the first emit; decode is the gap between consecutive emits. @@ -144,6 +150,26 @@ fn run_one( "in-process experts, KV-cached".to_string() }; + // Accumulated over prefill + decode on the driving thread; per-token + // figures divide by ALL emitted tokens (incl. warmup) since the + // accumulators can't be split retroactively. "other" (router, embed, + // combine, norms) falls out of the wall-clock by subtraction. + if larql_inference::decode_stages::is_enabled() && !tok_times.is_empty() { + let (attn_ms, dense_ms, expert_ms, lmhead_ms) = + larql_inference::decode_stages::snapshot_ms(); + let n = tok_times.len() as f64; + eprintln!( + " [stages/{}] per-token over {} emits (incl. prefill+warmup): \ + attn {:.1} ms | dense {:.1} ms | experts {:.1} ms | lm_head {:.1} ms", + label, + tok_times.len(), + attn_ms / n, + dense_ms / n, + expert_ms / n, + lmhead_ms / n, + ); + } + BenchRow { backend: label, prefill_ms, diff --git a/crates/larql-compute/benches/q4k_q8k_matvec.rs b/crates/larql-compute/benches/q4k_q8k_matvec.rs index 23ba235ce..b6f90e99b 100644 --- a/crates/larql-compute/benches/q4k_q8k_matvec.rs +++ b/crates/larql-compute/benches/q4k_q8k_matvec.rs @@ -34,7 +34,12 @@ use larql_compute::cpu::ops::q4k_q8k_dot::{ // at their definitions); importing them unconditionally breaks the x86_64 build // (CI runs benches via `--all-targets`). Gate the import + their bench arms. #[cfg(target_arch = "aarch64")] -use larql_compute::cpu::ops::q4k_q8k_dot::{q4k_q8k_matvec_asm, q4k_q8k_matvec_neon}; +use larql_compute::cpu::ops::q4_common::quantize_q6_k; +#[cfg(target_arch = "aarch64")] +use larql_compute::cpu::ops::q4k_q8k_dot::{ + q4k_q8k_gate_up_asm, q4k_q8k_gate_up_neon, q4k_q8k_matvec_asm, q4k_q8k_matvec_neon, + q6k_q8k_matvec_asm, q6k_q8k_matvec_neon, +}; const BLOCK_BYTES: usize = 144; const ELEMS_PER_BLOCK: usize = 256; @@ -133,7 +138,291 @@ fn bench_q4k_q8k(c: &mut Criterion) { } group.finish(); + + // Fused gate+up pair (C12): the kernel `kquant_ffn_forward_layer` and the + // remote expert server's q8k_wire path call. Same Gemma 3 4B gate/up + // shape; throughput counts BOTH weight streams (the fusion shares the + // activation loads, not the weight bytes). + #[cfg(target_arch = "aarch64")] + { + let rows = 10240usize; + let cols = 2560usize; + let g_q4 = quantize_q4_k(&synth(rows * cols, 0.3)); + let u_q4 = quantize_q4_k(&synth(rows * cols, 0.9)); + let x = synth(cols, 1.1); + let q8: Q8KActivation = quantize_x_to_q8k(&x); + let mut g_out = vec![0.0f32; rows]; + let mut u_out = vec![0.0f32; rows]; + + let mut group = c.benchmark_group("q4k_q8k_gate_up"); + group.sample_size(60); + group.throughput(Throughput::Bytes(2 * weight_bytes(rows, cols))); + group.bench_with_input(BenchmarkId::new("neon", "ffn_gate_up"), &(), |b, _| { + b.iter(|| { + q4k_q8k_gate_up_neon(&mut g_out, &mut u_out, &q8, &g_q4, &u_q4, rows, cols); + std::hint::black_box(g_out[0] + u_out[0]); + }); + }); + group.bench_with_input(BenchmarkId::new("asm", "ffn_gate_up"), &(), |b, _| { + b.iter(|| { + q4k_q8k_gate_up_asm(&mut g_out, &mut u_out, &q8, &g_q4, &u_q4, rows, cols); + std::hint::black_box(g_out[0] + u_out[0]); + }); + }); + group.finish(); + } + + // Q6_K matvec (C12): the `down`-projection / attention-V format. Gemma 3 + // 4B down shape; throughput on the 210-byte/256-elem Q6_K stream. + #[cfg(target_arch = "aarch64")] + { + let rows = 2560usize; + let cols = 10240usize; + let w_q6 = quantize_q6_k(&synth(rows * cols, 0.3)); + let x = synth(cols, 1.1); + let q8: Q8KActivation = quantize_x_to_q8k(&x); + let mut out = vec![0.0f32; rows]; + + let mut group = c.benchmark_group("q6k_q8k_matvec"); + group.sample_size(60); + group.throughput(Throughput::Bytes( + (rows * (cols / ELEMS_PER_BLOCK) * 210) as u64, + )); + group.bench_with_input(BenchmarkId::new("neon", "ffn_down"), &(), |b, _| { + b.iter(|| { + q6k_q8k_matvec_neon(&mut out, &q8, &w_q6, rows, cols); + std::hint::black_box(out[0]); + }); + }); + group.bench_with_input(BenchmarkId::new("asm", "ffn_down"), &(), |b, _| { + b.iter(|| { + q6k_q8k_matvec_asm(&mut out, &q8, &w_q6, rows, cols); + std::hint::black_box(out[0]); + }); + }); + group.finish(); + } } +/// C12 decomposition: split the per-super-block cost of `q4k_q8k_matvec_asm` +/// into (a) the `asm!` block itself and (b) the per-block Rust glue +/// (`unpack_scales_mins` + the i32 scale array + the scalar `sum2` loop + 2× +/// f16→f32 + the f32 epilogue). The full matvec measures (a)+(b) with +/// whatever out-of-order overlap the core finds; comparing the three numbers +/// says whether the glue is exposed (worth folding into the asm) or already +/// hidden (only intra-asm instruction-count reduction can pay). +#[cfg(target_arch = "aarch64")] +fn bench_sb_decomposition(c: &mut Criterion) { + use larql_compute::cpu::ops::q4_common::f16_to_f32; + use larql_compute::cpu::ops::q4k_q8k_dot::{q4k_sb_sum1_asm, unpack_scales_mins}; + + // One row's worth of super-blocks, attn_proj-like width, repeated over a + // weight buffer big enough to defeat L1 but stay in the same cache regime + // as the full-matvec bench (~3.7 MB). + let cols = 2560usize; + let rows = 1024usize; + let n_sb_per_row = cols / ELEMS_PER_BLOCK; + let n_sb = rows * n_sb_per_row; + let w_q4 = quantize_q4_k(&synth(rows * cols, 0.3)); + let x = synth(cols, 1.1); + let q8: Q8KActivation = quantize_x_to_q8k(&x); + + // Pre-extracted per-SB inputs for the asm-only arm (the extraction IS + // the glue — it must not be timed inside this arm). + let sb_scales: Vec<[i32; 8]> = (0..n_sb) + .map(|i| { + let block = &w_q4[i * BLOCK_BYTES..(i + 1) * BLOCK_BYTES]; + let (sc, _mn) = unpack_scales_mins(&block[4..16]); + [ + sc[0] as i32, + sc[1] as i32, + sc[2] as i32, + sc[3] as i32, + sc[4] as i32, + sc[5] as i32, + sc[6] as i32, + sc[7] as i32, + ] + }) + .collect(); + + let mut group = c.benchmark_group("q4k_sb_decomposition"); + group.sample_size(60); + // Throughput in super-blocks so criterion reports per-SB time directly. + group.throughput(Throughput::Elements(n_sb as u64)); + + group.bench_function("asm_only", |b| { + b.iter(|| { + let mut acc = 0i64; + for i in 0..n_sb { + let sb_in_row = i % n_sb_per_row; + let quants = w_q4[i * BLOCK_BYTES + 16..].as_ptr(); + let act = q8.qs[sb_in_row * ELEMS_PER_BLOCK..].as_ptr(); + // SAFETY: same contracts as the production caller. + let s = unsafe { q4k_sb_sum1_asm(quants, act, sb_scales[i].as_ptr()) }; + acc += s as i64; + } + std::hint::black_box(acc) + }); + }); + + group.bench_function("glue_only", |b| { + b.iter(|| { + let mut acc = 0.0f32; + for i in 0..n_sb { + let sb_in_row = i % n_sb_per_row; + let block = &w_q4[i * BLOCK_BYTES..(i + 1) * BLOCK_BYTES]; + let d_w = f16_to_f32(u16::from_le_bytes([block[0], block[1]])); + let dmin_w = f16_to_f32(u16::from_le_bytes([block[2], block[3]])); + let (scales, mins) = unpack_scales_mins(&block[4..16]); + let sc = [ + scales[0] as i32, + scales[1] as i32, + scales[2] as i32, + scales[3] as i32, + scales[4] as i32, + scales[5] as i32, + scales[6] as i32, + scales[7] as i32, + ]; + std::hint::black_box(sc.as_ptr()); + let q8_sums = &q8.sums[sb_in_row * 8..sb_in_row * 8 + 8]; + let d_y = q8.d[sb_in_row]; + let mut sum2_acc: i32 = 0; + for s in 0..8 { + sum2_acc += mins[s] as i32 * q8_sums[s] as i32; + } + // Fake sum1 stands in for the asm result; the epilogue math is + // the real per-SB f32 work. + let sum1 = i as i32; + acc += d_w * d_y * sum1 as f32 - dmin_w * d_y * sum2_acc as f32; + } + std::hint::black_box(acc) + }); + }); + + group.bench_function("full_matvec", |b| { + let mut out = vec![0.0f32; rows]; + b.iter(|| { + q4k_q8k_matvec_asm(&mut out, &q8, &w_q4, rows, cols); + std::hint::black_box(out[0]); + }); + }); + + group.bench_function("full_matvec_v2", |b| { + use larql_compute::cpu::ops::q4k_q8k_dot::q4k_q8k_matvec_asm_v2; + let mut out = vec![0.0f32; rows]; + b.iter(|| { + q4k_q8k_matvec_asm_v2(&mut out, &q8, &w_q4, rows, cols); + std::hint::black_box(out[0]); + }); + }); + + group.bench_function("full_matvec_v3", |b| { + use larql_compute::cpu::ops::q4k_q8k_dot::q4k_q8k_matvec_asm_v3; + let mut out = vec![0.0f32; rows]; + b.iter(|| { + q4k_q8k_matvec_asm_v3(&mut out, &q8, &w_q4, rows, cols); + std::hint::black_box(out[0]); + }); + }); + + group.finish(); +} + +/// Effective-bandwidth shape sweep (C12, post-roofline-crossover): the SAME +/// rayon-chunked matvec the production decode path runs (par_chunks_mut(32) +/// over rows → single-thread asm kernel per chunk), measured at the real +/// per-layer shapes. If GB/s climbs steeply with matrix size, the per-call +/// fork-join tax on small per-layer matvecs is where DRAM goes idle — the +/// 26B decode issues ~180 of these sections per token. +#[cfg(target_arch = "aarch64")] +fn bench_mt_shapes(c: &mut Criterion) { + use larql_compute::cpu::ops::q4k_q8k_dot::q4k_q8k_matvec_asm_v3; + use rayon::prelude::*; + + // (label, rows, cols) — production 26B shapes plus a big amortised + // reference. K/V 2048×2816 (3.2 MB), Q 4096×2816, O 2816×4096, + // dense gate/up 2112×2816, lm_head-class 65536×2816 (~104 MB). + let cases: &[(&str, usize, usize)] = &[ + ("kv_proj_2048x2816", 2048, 2816), + ("q_proj_4096x2816", 4096, 2816), + ("o_proj_2816x4096", 2816, 4096), + ("dense_gu_2112x2816", 2112, 2816), + ("big_65536x2816", 65536, 2816), + ]; + + let mut group = c.benchmark_group("q8k_mt_shapes"); + group.sample_size(30); + + for &(label, rows, cols) in cases { + let w_q4 = quantize_q4_k(&synth(rows * cols, 0.3)); + let x = synth(cols, 1.1); + let q8: Q8KActivation = quantize_x_to_q8k(&x); + let bytes_per_row = (cols / ELEMS_PER_BLOCK) * BLOCK_BYTES; + let mut out = vec![0.0f32; rows]; + + group.throughput(Throughput::Bytes((rows * bytes_per_row) as u64)); + group.bench_function(label, |b| { + b.iter(|| { + // Mirror q8k_direct_proj / matvec_q4k_or_q6k_q8k exactly: + // 32-row chunks, one single-thread kernel call per chunk. + out.par_chunks_mut(32).enumerate().for_each(|(ci, chunk)| { + let row_start = ci * 32; + let n = chunk.len().min(rows.saturating_sub(row_start)); + if n == 0 { + return; + } + let w = &w_q4[row_start * bytes_per_row..(row_start + n) * bytes_per_row]; + q4k_q8k_matvec_asm_v3(&mut chunk[..n], &q8, w, n, cols); + }); + std::hint::black_box(out[0]); + }); + }); + } + + // Expert-granularity arm: 8 parallel tasks (top-8 experts), each a + // SEQUENTIAL gate+up+down for one expert (single-thread kernels) — the + // production cpu_moe_forward shape. 8 tasks on 8 threads = any straggler + // idles a core; compare GB/s against the row-chunked arms above. + { + let inter = 704usize; + let hidden = 2816usize; + let n_experts = 8usize; + let gu_rows = 2 * inter; // gate+up stacked + let per_expert_gu = quantize_q4_k(&synth(gu_rows * hidden, 0.3)); + let per_expert_dn = quantize_q4_k(&synth(hidden * 768, 0.7)); // 704→768 padded + let x = synth(hidden, 1.1); + let q8: Q8KActivation = quantize_x_to_q8k(&x); + let act = synth(768, 0.5); + let q8_act: Q8KActivation = quantize_x_to_q8k(&act); + let total_bytes = n_experts + * (gu_rows * (hidden / ELEMS_PER_BLOCK) * BLOCK_BYTES + + hidden * (768 / ELEMS_PER_BLOCK) * BLOCK_BYTES); + + group.throughput(Throughput::Bytes(total_bytes as u64)); + group.bench_function("experts_8x_gate_up_down", |b| { + b.iter(|| { + let acc: f32 = (0..n_experts) + .into_par_iter() + .map(|_| { + let mut gu_out = vec![0.0f32; gu_rows]; + let mut dn_out = vec![0.0f32; hidden]; + q4k_q8k_matvec_asm_v3(&mut gu_out, &q8, &per_expert_gu, gu_rows, hidden); + q4k_q8k_matvec_asm_v3(&mut dn_out, &q8_act, &per_expert_dn, hidden, 768); + gu_out[0] + dn_out[0] + }) + .sum(); + std::hint::black_box(acc) + }); + }); + } + + group.finish(); +} + +#[cfg(target_arch = "aarch64")] +criterion_group!(benches, bench_q4k_q8k, bench_sb_decomposition, bench_mt_shapes); +#[cfg(not(target_arch = "aarch64"))] criterion_group!(benches, bench_q4k_q8k); criterion_main!(benches); diff --git a/crates/larql-compute/src/attention/decode.rs b/crates/larql-compute/src/attention/decode.rs index 22bffbb34..d2bf1ad68 100644 --- a/crates/larql-compute/src/attention/decode.rs +++ b/crates/larql-compute/src/attention/decode.rs @@ -26,55 +26,72 @@ use super::SharedKV; /// No causal mask — the new token naturally sees everything, and the /// cache only grew by 1 at the end. #[allow(clippy::too_many_arguments)] -pub fn gqa_attention_decode_step( +pub fn gqa_attention_decode_step( q_new: &Array2, - k_full: &Array2, - v_full: &Array2, + k_full: &ndarray::ArrayBase, + v_full: &ndarray::ArrayBase, num_q: usize, head_dim: usize, reps: usize, scale: f64, softcap: Option, -) -> Array2 { +) -> Array2 +where + S1: ndarray::Data + Sync, + S2: ndarray::Data + Sync, +{ let total_len = k_full.shape()[0]; let mut out = Array2::::zeros((1, num_q * head_dim)); let scale_f32 = scale as f32; - let mut scores = vec![0.0f32; total_len]; - for h in 0..num_q { - let kv_h = h / reps; - let q_off = h * head_dim; - let kv_off = kv_h * head_dim; - - let q_row = q_new.slice(ndarray::s![0, q_off..q_off + head_dim]); - let k_block = k_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); - let raw: ndarray::Array1 = k_block.dot(&q_row); - for i in 0..total_len { - let mut s = raw[i] * scale_f32; - if let Some(cap) = softcap { - s = (s / cap).tanh() * cap; - } - scores[i] = s; - } - // Softmax - let max_val = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max); - let mut sum = 0.0f64; - for s in scores.iter_mut() { - let e = ((*s - max_val) as f64).exp(); - *s = e as f32; - sum += e; - } - let inv_sum = (1.0 / sum) as f32; - for s in scores.iter_mut() { - *s *= inv_sum; - } - // Weighted sum of V - let v_block = v_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); - let scores_view = ndarray::ArrayView1::from(&scores[..]); - let weighted_v = v_block.t().dot(&scores_view); - for d in 0..head_dim { - out[[0, q_off + d]] = weighted_v[d]; - } + // Heads are independent — run them rayon-parallel into disjoint output + // chunks (the per-head math is unchanged, so the result is identical to + // the previous serial loop). The decode sample showed this loop serial + // on the main thread at ~5% of wall while 8 workers slept. + { + use rayon::prelude::*; + let out_slice = out + .as_slice_mut() + .expect("freshly allocated [1, q_dim] is contiguous"); + out_slice + .par_chunks_mut(head_dim) + .enumerate() + .for_each(|(h, out_h)| { + let kv_h = h / reps; + let q_off = h * head_dim; + let kv_off = kv_h * head_dim; + + let q_row = q_new.slice(ndarray::s![0, q_off..q_off + head_dim]); + let k_block = k_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); + let raw: ndarray::Array1 = k_block.dot(&q_row); + let mut scores = vec![0.0f32; total_len]; + for i in 0..total_len { + let mut s = raw[i] * scale_f32; + if let Some(cap) = softcap { + s = (s / cap).tanh() * cap; + } + scores[i] = s; + } + // Softmax + let max_val = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max); + let mut sum = 0.0f64; + for s in scores.iter_mut() { + let e = ((*s - max_val) as f64).exp(); + *s = e as f32; + sum += e; + } + let inv_sum = (1.0 / sum) as f32; + for s in scores.iter_mut() { + *s *= inv_sum; + } + // Weighted sum of V + let v_block = v_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); + let scores_view = ndarray::ArrayView1::from(&scores[..]); + let weighted_v = v_block.t().dot(&scores_view); + out_h.copy_from_slice( + weighted_v.as_slice().expect("1-D dot output is contiguous"), + ); + }); } out } @@ -282,6 +299,102 @@ pub fn run_attention_block_decode_step_backend( Some((h_post_attn, (k_concat, v_concat))) } +/// `LARQL_Q4K_ATTN_INT8=1`: upgrade the Q4K-direct attention projections from +/// the f32-activation kernels (`q4k_matvec`/`q6k_matvec` via `quant_matvec`) +/// to the int8 Q8_K SDOT kernels (`q4k_q8k_matvec_into`/`q6k_q8k_matvec_into`, +/// asm-aware under `LARQL_Q4K_ASM`) — the same numerics the dense-model +/// production attention (`attention_decode_step_native`) has always used. +/// The 26B stage split showed attention at ~54% of decode while moving only +/// ~26% of the bytes: the f32-activation kernel is ~3× worse per byte than +/// the expert path's int8 kernels. Default off = the existing f32-activation +/// behaviour, byte-identical. +fn attn_int8_enabled() -> bool { + use std::sync::OnceLock; + static ON: OnceLock = OnceLock::new(); + *ON.get_or_init(|| std::env::var("LARQL_Q4K_ATTN_INT8").as_deref() == Ok("1")) +} + +/// Int8 decode-step projection: `[1, num_rows] = qw × x_q8k`. The activation +/// is pre-quantised ONCE by the caller (Q/K/V share `h_norm`'s Q8_K form). +/// The per-call kernels are single-threaded, so rows are rayon-chunked here +/// (same pattern as the Q4_K lm_head path). Returns `None` on formats other +/// than Q4_K/Q6_K or a non-256-multiple `in_dim` — caller falls back to the +/// f32-activation projection. +fn q8k_direct_proj( + qw: &crate::QuantWeight, + x_q8k: &crate::cpu::ops::q4k_q8k_dot::Q8KActivation, + num_rows: usize, + in_dim: usize, +) -> Option> { + use crate::cpu::ops::q4k_q8k_dot::{q4k_q8k_matvec_into, q6k_q8k_matvec_into}; + use rayon::prelude::*; + + if !in_dim.is_multiple_of(256) { + return None; + } + let bytes_per_row = match qw.format { + crate::QuantFormat::Q4_K => (in_dim / 256) * 144, + crate::QuantFormat::Q6_K => (in_dim / 256) * 210, + _ => return None, + }; + if qw.data.len() < num_rows * bytes_per_row { + return None; + } + + let mut out = vec![0.0f32; num_rows]; + const CHUNK_ROWS: usize = 32; + out.par_chunks_mut(CHUNK_ROWS) + .enumerate() + .for_each(|(chunk_idx, chunk)| { + let row_start = chunk_idx * CHUNK_ROWS; + let chunk_len = chunk.len().min(num_rows.saturating_sub(row_start)); + if chunk_len == 0 { + return; + } + let w_chunk = + &qw.data[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; + match qw.format { + crate::QuantFormat::Q4_K => { + q4k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) + } + crate::QuantFormat::Q6_K => { + q6k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) + } + _ => {} + } + }); + Array2::from_shape_vec((1, num_rows), out).ok() +} + +/// Projection dispatch for the Q4K-direct attention step: int8 Q8_K route +/// when `LARQL_Q4K_ATTN_INT8=1` (quantising `x` lazily, at most once per +/// distinct input via the caller-held slot), else the f32-activation route. +/// A `None` from the int8 kernel (odd dims/format) falls back to f32-act +/// rather than aborting the layer. +fn direct_proj( + backend: &dyn crate::ComputeBackend, + qw: &crate::QuantWeight, + x: &Array2, + x_q8k_slot: &mut Option, + int8: bool, + num_rows: usize, + in_dim: usize, +) -> Option> { + if int8 && in_dim.is_multiple_of(256) { + if x_q8k_slot.is_none() { + if let Some(x_slice) = x.as_slice() { + x_q8k_slot.replace(crate::cpu::ops::q4k_q8k_dot::quantize_x_to_q8k(x_slice)); + } + } + if let Some(q8) = x_q8k_slot.as_ref() { + if let Some(out) = q8k_direct_proj(qw, q8, num_rows, in_dim) { + return Some(out); + } + } + } + q4k_direct_proj(backend, qw, x, num_rows, in_dim) +} + /// Single decode-step projection via Q4K/Q6K-direct matvec — no dequant. /// /// `x` is `[1, in_dim]` (decode is one new token); `qw` carries the @@ -302,29 +415,28 @@ fn q4k_direct_proj( Array2::from_shape_vec((1, num_rows), out).ok() } -/// Q4K-direct decode-step attention — reads the Q/K/V/O projection bytes -/// straight from the index (`resolve_attn_weights`) and runs them as -/// `quant_matvec` (Q4_K / Q6_K), skipping the up-front dequant-to-f32 of the -/// f32-BLAS path (`run_attention_block_decode_step_backend`). Everything around -/// the projections — input/QK/V norms, RoPE, GQA decode step, KV-concat, -/// biases, residual — is byte-identical to that function (copied verbatim); the -/// ONLY change is the four projection calls. Parity contract: Q4K-direct ≈ -/// Q4K-dequant within float-summation noise (the kernels are parity-tested vs -/// dequant→matmul), pinned by the test in `larql-inference`'s dequant module. -/// -/// Returns `None` (so the caller falls back to the f32 path) when the index has -/// no Q4K attention bytes for this layer, or the backend can't run a format. +/// Projection half of the Q4K-direct decode step: input norm, Q/K/V +/// projections (f32-act or int8 per `LARQL_Q4K_ATTN_INT8`), biases, QK/V +/// norms, RoPE. No KV-cache access at all — the caller appends +/// `k_new_rope`/`v_new` to its cache (in place, amortised O(1) on the +/// dispatch path) and then runs [`decode_step_attend_q4k_direct`] over views +/// of the full cache. Splitting here is what removes the per-layer-per-step +/// O(ctx) concat copy the monolithic form paid. +pub struct Q4kDecodeProj { + pub q_rope: Array2, + pub k_new_rope: Array2, + pub v_new: Array2, +} + #[allow(clippy::too_many_arguments)] -#[allow(clippy::type_complexity)] -pub fn run_attention_block_decode_step_q4k_direct( +pub fn decode_step_project_q4k_direct( weights: &larql_models::ModelWeights, h_new: &Array2, layer: usize, - kv_entry: Option<&SharedKV>, abs_position: usize, backend: &dyn crate::ComputeBackend, index: &dyn crate::KvIndex, -) -> Option<(Array2, SharedKV)> { +) -> Option { use crate::forward::add_bias; use crate::residual::{rms_norm_heads, rms_norm_heads_no_weight}; @@ -332,12 +444,6 @@ pub fn run_attention_block_decode_step_q4k_direct( let head_dim = arch.head_dim_for_layer(layer); let num_q = arch.num_q_heads_for_layer(layer); let num_kv = arch.num_kv_heads_for_layer(layer); - let reps = num_q / num_kv; - let scale = if arch.attention_multiplier() != 1.0 { - arch.attention_multiplier() as f64 - } else { - arch.attention_scale_for_layer(layer) - }; let norm_offset = arch.norm_weight_offset(); let position = abs_position; let hidden = weights.hidden_size; @@ -346,7 +452,7 @@ pub fn run_attention_block_decode_step_q4k_direct( // Q4K-direct projection weights straight from the index. `None` → no Q4K // attn bytes for this layer; caller uses the f32 dequant path. - let (wq, wk, wv, wo) = crate::pipeline_layer::resolve_attn_weights(index, layer)?; + let (wq, wk, wv, _wo) = crate::pipeline_layer::resolve_attn_weights(index, layer)?; let h_norm = crate::forward::apply_norm( weights, @@ -355,7 +461,12 @@ pub fn run_attention_block_decode_step_q4k_direct( norm_offset, ); - let mut q_full = q4k_direct_proj(backend, &wq, &h_norm, q_dim, hidden)?; + // Int8 route (`LARQL_Q4K_ATTN_INT8=1`): Q/K/V share one Q8_K quantisation + // of `h_norm` (filled lazily by the first projection). + let int8 = attn_int8_enabled(); + let mut h_norm_q8k: Option = None; + + let mut q_full = direct_proj(backend, &wq, &h_norm, &mut h_norm_q8k, int8, q_dim, hidden)?; if let Some(bias) = arch .attn_q_bias_key(layer) .and_then(|k| weights.vectors.get(&k)) @@ -392,8 +503,8 @@ pub fn run_attention_block_decode_step_q4k_direct( llama3, ); - let mut k_full_new = q4k_direct_proj(backend, &wk, &h_norm, kv_dim, hidden)?; - let mut v_full_new = q4k_direct_proj(backend, &wv, &h_norm, kv_dim, hidden)?; + let mut k_full_new = direct_proj(backend, &wk, &h_norm, &mut h_norm_q8k, int8, kv_dim, hidden)?; + let mut v_full_new = direct_proj(backend, &wv, &h_norm, &mut h_norm_q8k, int8, kv_dim, hidden)?; if let Some(bias) = arch .attn_k_bias_key(layer) .and_then(|k| weights.vectors.get(&k)) @@ -427,34 +538,54 @@ pub fn run_attention_block_decode_step_q4k_direct( llama3, ); - let (k_concat, v_concat) = match kv_entry { - Some((k_cached, v_cached)) => { - let total = k_cached.shape()[0] + 1; - let mut k_out = Array2::::zeros((total, kv_dim)); - let mut v_out = Array2::::zeros((total, kv_dim)); - k_out - .slice_mut(ndarray::s![..k_cached.shape()[0], ..]) - .assign(k_cached); - v_out - .slice_mut(ndarray::s![..v_cached.shape()[0], ..]) - .assign(v_cached); - k_out - .slice_mut(ndarray::s![k_cached.shape()[0].., ..]) - .assign(&k_new_rope); - v_out - .slice_mut(ndarray::s![v_cached.shape()[0].., ..]) - .assign(&v_full_new); - (k_out, v_out) - } - None => (k_new_rope, v_full_new), + Some(Q4kDecodeProj { + q_rope, + k_new_rope, + v_new: v_full_new, + }) +} + +/// Attend half of the Q4K-direct decode step: GQA over the FULL cache views +/// (which must already include this step's new K/V row), O projection, +/// post-attention norm + residual. Math is identical to the monolithic form; +/// only the cache representation (views vs owned concat) differs. +#[allow(clippy::too_many_arguments)] +pub fn decode_step_attend_q4k_direct( + weights: &larql_models::ModelWeights, + h_new: &Array2, + layer: usize, + q_rope: &Array2, + k_all: ndarray::ArrayView2, + v_all: ndarray::ArrayView2, + backend: &dyn crate::ComputeBackend, + index: &dyn crate::KvIndex, +) -> Option> { + use crate::forward::add_bias; + + let arch = &*weights.arch; + let head_dim = arch.head_dim_for_layer(layer); + let num_q = arch.num_q_heads_for_layer(layer); + let num_kv = arch.num_kv_heads_for_layer(layer); + let reps = num_q / num_kv; + let scale = if arch.attention_multiplier() != 1.0 { + arch.attention_multiplier() as f64 + } else { + arch.attention_scale_for_layer(layer) }; + let norm_offset = arch.norm_weight_offset(); + let hidden = weights.hidden_size; + let q_dim = num_q * head_dim; + + let (_wq, _wk, _wv, wo) = crate::pipeline_layer::resolve_attn_weights(index, layer)?; + let int8 = attn_int8_enabled(); let softcap = arch.attn_logit_softcapping(); - let attn_out = gqa_attention_decode_step( - &q_rope, &k_concat, &v_concat, num_q, head_dim, reps, scale, softcap, - ); + let attn_out = + gqa_attention_decode_step(q_rope, &k_all, &v_all, num_q, head_dim, reps, scale, softcap); - let mut attn_projected = q4k_direct_proj(backend, &wo, &attn_out, hidden, q_dim)?; + let mut attn_out_q8k: Option = None; + let mut attn_projected = + direct_proj(backend, &wo, &attn_out, &mut attn_out_q8k, int8, hidden, q_dim)?; if let Some(bias) = arch .attn_o_bias_key(layer) .and_then(|k| weights.vectors.get(&k)) @@ -481,6 +612,73 @@ pub fn run_attention_block_decode_step_q4k_direct( h_new + &attn_projected }; + Some(h_post_attn) +} + +/// Q4K-direct decode-step attention — reads the Q/K/V/O projection bytes +/// straight from the index (`resolve_attn_weights`) and runs them as +/// `quant_matvec` (Q4_K / Q6_K), skipping the up-front dequant-to-f32 of the +/// f32-BLAS path (`run_attention_block_decode_step_backend`). +/// +/// LEGACY OWNED-CONCAT FORM: kept for callers that own their cache as +/// `SharedKV` tuples (larql-kv engine walk loops). It pays an O(ctx) +/// concat copy per call — the dispatch path (`CpuBackend::attention_step`) +/// instead uses the split project/append/attend flow above, which appends in +/// place. Outputs are identical either way. +/// +/// Returns `None` (so the caller falls back to the f32 path) when the index has +/// no Q4K attention bytes for this layer, or the backend can't run a format. +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +pub fn run_attention_block_decode_step_q4k_direct( + weights: &larql_models::ModelWeights, + h_new: &Array2, + layer: usize, + kv_entry: Option<&SharedKV>, + abs_position: usize, + backend: &dyn crate::ComputeBackend, + index: &dyn crate::KvIndex, +) -> Option<(Array2, SharedKV)> { + let arch = &*weights.arch; + let head_dim = arch.head_dim_for_layer(layer); + let num_kv = arch.num_kv_heads_for_layer(layer); + let kv_dim = num_kv * head_dim; + + let proj = decode_step_project_q4k_direct(weights, h_new, layer, abs_position, backend, index)?; + + let (k_concat, v_concat) = match kv_entry { + Some((k_cached, v_cached)) => { + let total = k_cached.shape()[0] + 1; + let mut k_out = Array2::::zeros((total, kv_dim)); + let mut v_out = Array2::::zeros((total, kv_dim)); + k_out + .slice_mut(ndarray::s![..k_cached.shape()[0], ..]) + .assign(k_cached); + v_out + .slice_mut(ndarray::s![..v_cached.shape()[0], ..]) + .assign(v_cached); + k_out + .slice_mut(ndarray::s![k_cached.shape()[0].., ..]) + .assign(&proj.k_new_rope); + v_out + .slice_mut(ndarray::s![v_cached.shape()[0].., ..]) + .assign(&proj.v_new); + (k_out, v_out) + } + None => (proj.k_new_rope, proj.v_new), + }; + + let h_post_attn = decode_step_attend_q4k_direct( + weights, + h_new, + layer, + &proj.q_rope, + k_concat.view(), + v_concat.view(), + backend, + index, + )?; + Some((h_post_attn, (k_concat, v_concat))) } @@ -490,6 +688,96 @@ mod tests { use larql_models::test_fixtures::make_test_weights; use ndarray::Array2; + /// The rayon row-chunking in `q8k_direct_proj` must be bit-exact with one + /// whole-matrix kernel call (chunk boundaries change nothing), for both + /// Q4_K and Q6_K formats. + #[test] + fn q8k_direct_proj_chunking_is_bit_exact() { + use crate::cpu::ops::q4_common::{quantize_q4_k, quantize_q6_k}; + use crate::cpu::ops::q4k_q8k_dot::{ + q4k_q8k_matvec_into, q6k_q8k_matvec_into, quantize_x_to_q8k, + }; + + let in_dim = 512usize; + let num_rows = 70usize; // not a multiple of CHUNK_ROWS=32 — exercises the tail + let w_f32: Vec = (0..num_rows * in_dim) + .map(|i| ((i as f32) * 0.011).sin() * 0.3) + .collect(); + let x: Vec = (0..in_dim).map(|i| ((i as f32) * 0.017).cos()).collect(); + let q8 = quantize_x_to_q8k(&x); + + for fmt in [crate::QuantFormat::Q4_K, crate::QuantFormat::Q6_K] { + let bytes = match fmt { + crate::QuantFormat::Q4_K => quantize_q4_k(&w_f32), + _ => quantize_q6_k(&w_f32), + }; + let qw = crate::QuantWeight { + data: &bytes, + scales: None, + format: fmt, + }; + let chunked = + q8k_direct_proj(&qw, &q8, num_rows, in_dim).expect("q8k proj must run"); + + let mut whole = vec![0.0f32; num_rows]; + match fmt { + crate::QuantFormat::Q4_K => { + q4k_q8k_matvec_into(&mut whole, &q8, &bytes, num_rows, in_dim) + } + _ => q6k_q8k_matvec_into(&mut whole, &q8, &bytes, num_rows, in_dim), + } + for (r, (&c, &w)) in chunked.iter().zip(whole.iter()).enumerate() { + assert_eq!( + c.to_bits(), + w.to_bits(), + "{fmt:?} row {r}: chunked={c} whole={w}" + ); + } + } + } + + /// Int8 projections vs the f32-activation projection: same weights, same + /// input — outputs agree within activation-quantisation tolerance (the + /// int8 route adds ONLY the Q8_K activation quant the production dense + /// attention path already carries; weight quant is identical bytes). + #[test] + fn q8k_direct_proj_matches_f32_activation_within_quant_tolerance() { + use crate::cpu::ops::q4_common::quantize_q4_k; + use crate::cpu::ops::q4k_q8k_dot::quantize_x_to_q8k; + + let in_dim = 512usize; + let num_rows = 48usize; + let w_f32: Vec = (0..num_rows * in_dim) + .map(|i| ((i as f32) * 0.011).sin() * 0.3) + .collect(); + let bytes = quantize_q4_k(&w_f32); + let qw = crate::QuantWeight { + data: &bytes, + scales: None, + format: crate::QuantFormat::Q4_K, + }; + let x: Vec = (0..in_dim).map(|i| ((i as f32) * 0.017).cos()).collect(); + + let q8 = quantize_x_to_q8k(&x); + let int8_out = q8k_direct_proj(&qw, &q8, num_rows, in_dim).expect("int8 proj"); + + let backend = crate::CpuBackend; + let x_arr = Array2::from_shape_vec((1, in_dim), x).unwrap(); + let f32_out = + q4k_direct_proj(&backend, &qw, &x_arr, num_rows, in_dim).expect("f32-act proj"); + + // Scale-relative bound: Q8_K activation quant is ~1/255 per block + // value; accumulated over 512 terms the practical error is well + // under 1% of the output magnitude. + let denom = f32_out.iter().map(|v| v.abs()).fold(0.0f32, f32::max); + for (r, (&a, &b)) in int8_out.iter().zip(f32_out.iter()).enumerate() { + assert!( + (a - b).abs() <= 0.02 * denom.max(1e-3), + "row {r}: int8={a} f32act={b} denom={denom}" + ); + } + } + #[test] fn decode_step_output_shape() { let weights = make_test_weights(); diff --git a/crates/larql-compute/src/cpu/ops/q4_common.rs b/crates/larql-compute/src/cpu/ops/q4_common.rs index f8af7e0cc..1ba56896a 100644 --- a/crates/larql-compute/src/cpu/ops/q4_common.rs +++ b/crates/larql-compute/src/cpu/ops/q4_common.rs @@ -432,7 +432,12 @@ pub fn f16_to_f32(bits: u16) -> f32 { // mantissa region. let lz = (mant as u16).leading_zeros() - 6; // 0..=9 let new_mant = (mant << (lz + 14)) & 0x7F_FFFF; - let new_exp = (127u32 - 14 - lz) << 23; + // Leading one sits at mantissa bit (9 - lz), so the value is + // 1.f × 2^(9 - lz - 24) = 1.f × 2^(-15 - lz) → biased exponent + // 127 - 15 - lz. (Was `127 - 14 - lz`, which decoded every f16 + // subnormal 2× too large — and the exhaustive test never caught + // it because a test-local `f16_to_f32` shadowed this one.) + let new_exp = (127u32 - 15 - lz) << 23; return f32::from_bits(sign | new_exp | new_mant); } if exp == 31 { @@ -1168,6 +1173,189 @@ mod tests { assert_eq!(diffs, 0, "{diffs} f16 inputs decode to different f32 bits"); } + // ── f16 subnormal regression battery (2026-06-12). The subnormal + // branch decoded 2× too large while the exhaustive test silently + // verified a test-local `f16_to_f32` that shadowed the production fn. + // Assertions below call through `super::` so a future shadow cannot + // re-mask the production path. ── + + #[test] + fn f16_to_f32_subnormal_pinned_values() { + // IEEE 754 half subnormals: value = mant × 2^-24 exactly. + assert_eq!(super::f16_to_f32(0x0001), 2f32.powi(-24), "smallest subnormal"); + assert_eq!( + super::f16_to_f32(0x03fe), + 1022.0 * 2f32.powi(-24), + "the field case — the gemma3-4b L32 K-scale that exposed the 2× bug" + ); + assert_eq!( + super::f16_to_f32(0x03ff), + 1023.0 * 2f32.powi(-24), + "largest subnormal" + ); + assert_eq!(super::f16_to_f32(0x0400), 2f32.powi(-14), "smallest normal"); + assert_eq!(super::f16_to_f32(0x8001), -(2f32.powi(-24)), "negative subnormal"); + } + + #[test] + fn f16_to_f32_strictly_monotonic_across_subnormal_boundary() { + // The 2× bug made f16(0x03ff) ≈ 1.22e-4 > f16(0x0400) = 6.1e-5 — a + // monotonicity violation at the subnormal/normal seam. Walk the + // positive seam region and require strict increase. + let mut prev = super::f16_to_f32(0x0000); + for bits in 0x0001u16..=0x0410 { + let v = super::f16_to_f32(bits); + assert!( + v > prev, + "f16 decode must be strictly increasing: bits={bits:#06x} gives {v:e}, prev {prev:e}" + ); + prev = v; + } + } + + /// Deterministic pseudo-random data at a chosen magnitude. Magnitude + /// ~4e-4 drives the per-super-block `d`/`dmin` f16 scales into the + /// subnormal range (< 2^-14), the regime the 2× bug corrupted. + fn seeded_data(n: usize, magnitude: f32, mut seed: u64) -> Vec { + (0..n) + .map(|_| { + seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + (((seed >> 33) as f32 / (1u64 << 31) as f32) - 0.5) * magnitude + }) + .collect() + } + + /// True if any Q4_K super-block in `bytes` carries a subnormal f16 + /// `d` or `dmin` (exp bits zero, mantissa nonzero). + fn q4k_has_subnormal_scale(bytes: &[u8]) -> bool { + bytes.chunks_exact(144).any(|b| { + let d = u16::from_le_bytes([b[0], b[1]]); + let dmin = u16::from_le_bytes([b[2], b[3]]); + let sub = |v: u16| (v >> 10) & 0x1F == 0 && (v & 0x3FF) != 0; + sub(d) || sub(dmin) + }) + } + + /// Cross-crate seam test: same bytes, q4_common decoder vs the + /// larql-models decoder (which backs the vindex registry and the + /// staged/dequant path). These disagreed on every subnormal-scale + /// block until 2026-06-12 — same bytes, silently different weights. + #[test] + fn q4k_decode_matches_models_reference_incl_subnormal_scales() { + for (name, magnitude) in [("normal", 1.0f32), ("subnormal-scale", 4.0e-4)] { + let data = seeded_data(1024, magnitude, 0xA11C1); + let bytes = quantize_q4_k(&data); + if magnitude < 1e-3 { + assert!( + q4k_has_subnormal_scale(&bytes), + "fixture drift: {name} case no longer produces subnormal f16 scales" + ); + } + let ours = dequantize_q4_k(&bytes, 1024); + let reference = + larql_models::quant::ggml::dequantize_q4_k(&bytes, 1024).expect("models decode"); + for (i, (a, b)) in ours.iter().zip(reference.iter()).enumerate() { + let tol = 1e-5 * a.abs().max(b.abs()).max(1e-30); + assert!( + (a - b).abs() <= tol, + "{name}: decoders disagree at elem {i}: q4_common {a:e} vs models {b:e}" + ); + } + } + } + + /// Q6_K twin — its `d` is also an f16 scale, and the int8 Q6K matvec + /// reads it through the shared (previously buggy) `f16_to_f32`. + /// Reference decode comes from larql-models (independent f16 impl). + #[test] + fn q6k_int8_matvec_matches_models_reference_incl_tiny_scales() { + use crate::cpu::ops::q4k_q8k_dot::{ + q6k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, + }; + let (rows, cols) = (2usize, 256usize); + for (name, magnitude) in [("normal", 1.0f32), ("tiny-scale", 4.0e-4)] { + let data = seeded_data(rows * cols, magnitude, 0xA11C2); + let bytes = quantize_q6_k(&data); + let x = seeded_data(cols, 1.0, 0xA11C5); + let reference = larql_models::quant::ggml::dequantize_q6_k(&bytes, rows * cols) + .expect("models decode"); + let expected: Vec = (0..rows) + .map(|r| { + reference[r * cols..(r + 1) * cols] + .iter() + .zip(x.iter()) + .map(|(w, v)| w * v) + .sum() + }) + .collect(); + let denom: f32 = expected.iter().map(|v| v.abs()).fold(1e-12, f32::max); + let mut x_q8k = Q8KActivation::with_capacity(cols); + quantize_x_to_q8k_into(&mut x_q8k, &x); + let mut out = vec![0.0f32; rows]; + q6k_q8k_matvec_into(&mut out, &x_q8k, &bytes, rows, cols); + for (r, (got, want)) in out.iter().zip(expected.iter()).enumerate() { + assert!( + (got - want).abs() <= 2e-2 * denom, + "{name}: Q6K int8 matvec row {r}: {got:e} vs models reference {want:e}" + ); + } + } + } + + /// Both Q4_K matvec kernels against the dequant·dot reference on the + /// same bytes, including subnormal-scale blocks. Pre-fix, affected + /// blocks contributed 2× — far outside either tolerance. + #[test] + fn q4k_matvecs_match_dequant_dot_incl_subnormal_scales() { + use crate::cpu::ops::q4k_q8k_dot::{ + q4k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, + }; + let (rows, cols) = (4usize, 256usize); + for (name, magnitude) in [("normal", 1.0f32), ("subnormal-scale", 4.0e-4)] { + let data = seeded_data(rows * cols, magnitude, 0xA11C3); + let bytes = quantize_q4_k(&data); + if magnitude < 1e-3 { + assert!(q4k_has_subnormal_scale(&bytes), "fixture drift ({name})"); + } + let x = seeded_data(cols, 1.0, 0xA11C4); + let deq = dequantize_q4_k(&bytes, rows * cols); + let expected: Vec = (0..rows) + .map(|r| { + deq[r * cols..(r + 1) * cols] + .iter() + .zip(x.iter()) + .map(|(w, v)| w * v) + .sum() + }) + .collect(); + let denom: f32 = expected.iter().map(|v| v.abs()).fold(1e-12, f32::max); + + // f32-activation kernel: decode-identical, tight tolerance. + let mut out_f32 = vec![0.0f32; rows]; + q4k_matvec_into(&mut out_f32, &x, &bytes, rows, cols); + for (r, (got, want)) in out_f32.iter().zip(expected.iter()).enumerate() { + assert!( + (got - want).abs() <= 1e-4 * denom, + "{name}: f32-act matvec row {r}: {got:e} vs {want:e}" + ); + } + + // int8-activation kernel: Q8_K rounding allowed, 2× is not. + let mut x_q8k = Q8KActivation::with_capacity(cols); + quantize_x_to_q8k_into(&mut x_q8k, &x); + let mut out_i8 = vec![0.0f32; rows]; + q4k_q8k_matvec_into(&mut out_i8, &x_q8k, &bytes, rows, cols); + for (r, (got, want)) in out_i8.iter().zip(expected.iter()).enumerate() { + assert!( + (got - want).abs() <= 2e-2 * denom, + "{name}: int8 matvec row {r}: {got:e} vs {want:e}" + ); + } + } + } + #[test] fn q8_quantize_round_trip() { let x: Vec = (0..64).map(|i| (i as f32 - 32.0) * 0.1).collect(); @@ -1399,37 +1587,7 @@ mod tests { ); } - /// Decode f16 bits to f32 (for test verification). - fn f16_to_f32(bits: u16) -> f32 { - let sign = ((bits >> 15) & 1) as u32; - let exp = ((bits >> 10) & 0x1F) as i32; - let mant = (bits & 0x3FF) as u32; - if exp == 0 { - if mant == 0 { - return if sign == 1 { -0.0 } else { 0.0 }; - } - // Subnormal - let val = mant as f32 / 1024.0 * 2.0f32.powi(-14); - return if sign == 1 { -val } else { val }; - } - if exp == 31 { - return if mant == 0 { - if sign == 1 { - f32::NEG_INFINITY - } else { - f32::INFINITY - } - } else { - f32::NAN - }; - } - let val = (1.0 + mant as f32 / 1024.0) * 2.0f32.powi(exp - 15); - if sign == 1 { - -val - } else { - val - } - } + /// Test alias — dispatches to the canonical module-scope implementation. fn dequantize_q4_k_llama(data: &[u8], n_elements: usize) -> Vec { diff --git a/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs b/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs index 3f30d4f12..35d08c1d2 100644 --- a/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs +++ b/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs @@ -155,7 +155,8 @@ pub fn quantize_x_to_q8k(x: &[f32]) -> Q8KActivation { /// into 8 6-bit scales + 8 6-bit mins. Matches llama.cpp's /// `get_scale_min_k4` (and `q4_common::dequantize_q4_k` / `q4k_matvec.rs`). #[inline(always)] -fn unpack_scales_mins(p: &[u8]) -> ([u8; 8], [u8; 8]) { +#[doc(hidden)] // pub for the C12 decomposition microbench (benches/q4k_q8k_matvec.rs) +pub fn unpack_scales_mins(p: &[u8]) -> ([u8; 8], [u8; 8]) { let mut scales = [0u8; 8]; let mut mins = [0u8; 8]; for j in 0..4 { @@ -614,7 +615,8 @@ pub fn q4k_q8k_matvec_neon_2row( /// baseline on `aarch64-apple-darwin` — same assumption as `sdot_acc`. #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline] -unsafe fn q4k_sb_sum1_asm(quants: *const u8, act: *const i8, scales: *const i32) -> i32 { +#[doc(hidden)] // pub for the C12 decomposition microbench (benches/q4k_q8k_matvec.rs) +pub unsafe fn q4k_sb_sum1_asm(quants: *const u8, act: *const i8, scales: *const i32) -> i32 { let sum1: i32; // One group of the unrolled body, parameterised by the two scale lanes // (`$sv` = scale vector, `$l0`/`$l1` = lane indices for sub-blocks 2g/2g+1). @@ -754,6 +756,385 @@ pub fn q4k_q8k_matvec_asm( } } +/// TBL index tables for the v2 super-block kernel's vectorised scale/min +/// unpack. The 16-byte header vector holds `d`(0-1) `dmin`(2-3) and the 12 +/// packed scale bytes at lanes 4..16, so the `unpack_scales_mins` byte +/// positions shift by +4: A=p[0..4]→lanes 4..8, B=p[4..8]→lanes 8..12, +/// C=p[8..12]→lanes 12..16. 0xFF lanes produce zeros (TBL out-of-range). +/// Order: SCLO, SCHI_HI2, HI_LO4 (shared by scales and mins), MNLO, +/// MNHI_HI2 → loaded into v24..v28. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[rustfmt::skip] +static Q4K_UNPACK_IDX: [u8; 80] = [ + // SCLO: scales[0..4] = lo6 of A + 4, 5, 6, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // SCHI_HI2: scales[4..8] |= (A >> 6) << 4 + 0xff, 0xff, 0xff, 0xff, 4, 5, 6, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // HI_LO4: scales[4..8] = lo4 of C (and mins[4..8] = hi4 of C, same lanes) + 0xff, 0xff, 0xff, 0xff, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // MNLO: mins[0..4] = lo6 of B + 8, 9, 10, 11, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // MNHI_HI2: mins[4..8] |= (B >> 6) << 4 + 0xff, 0xff, 0xff, 0xff, 8, 9, 10, 11, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +]; + +/// v2 super-block kernel (C12): the WHOLE per-super-block computation in one +/// `asm!` block — vectorised 6-bit scale/min unpack (TBL, replaces the +/// scalar `unpack_scales_mins` + i32 array round-trip), the 4-group SDOT +/// `sum1` body (identical to [`q4k_sb_sum1_asm`]), `sum2` as +/// `smull`/`smlal2` over the Q8_K sub-block sums, hardware `fcvt` for +/// `d`/`dmin` (replaces two software `f16_to_f32` calls), and the exact-order +/// f32 epilogue. Returns the super-block's contribution +/// `d_w·d_y·sum1 − dmin_w·d_y·sum2`. +/// +/// Built from the 2026-06-11 decomposition measurement: the v1 asm block is +/// 16.3 cyc/SB but the Rust glue around it costs 19.2 cyc/SB with only ~3.6 +/// hidden by OoO overlap — the glue, not the asm schedule, is the fat. +/// +/// Bit-exactness: `fcvt` h→s is exact (every f16 is representable), `scvtf` +/// rounds i32→f32 identically to Rust's `as f32`, and the epilogue +/// multiplication tree matches the scalar reference's expression order. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[inline] +unsafe fn q4k_sb_contrib_asm( + header: *const u8, + quants: *const u8, + act: *const i8, + q8_sums: *const i16, + d_y: f32, +) -> f32 { + let contrib: f32; + macro_rules! grp { + ($sv:literal, $l0:literal, $l1:literal) => { + concat!( + "ld1 {{v0.16b, v1.16b}}, [{q}], #32\n", + "ld1 {{v20.16b, v21.16b, v22.16b, v23.16b}}, [{a}], #64\n", + "and v2.16b, v0.16b, v16.16b\n", + "and v3.16b, v1.16b, v16.16b\n", + "ushr v4.16b, v0.16b, #4\n", + "ushr v5.16b, v1.16b, #4\n", + "movi v6.4s, #0\n", + "movi v7.4s, #0\n", + "sdot v6.4s, v2.16b, v20.16b\n", + "sdot v6.4s, v3.16b, v21.16b\n", + "sdot v7.4s, v4.16b, v22.16b\n", + "sdot v7.4s, v5.16b, v23.16b\n", + "mul v6.4s, v6.4s, ", + $sv, + ".s[", + $l0, + "]\n", + "mul v7.4s, v7.4s, ", + $sv, + ".s[", + $l1, + "]\n", + "add v17.4s, v17.4s, v6.4s\n", + "add v17.4s, v17.4s, v7.4s\n", + ) + }; + } + unsafe { + core::arch::asm!( + "movi v16.16b, #0x0f", + "movi v30.16b, #0x3f", + "ld1 {{v24.16b, v25.16b, v26.16b, v27.16b}}, [{idx}]", + "ld1 {{v28.16b}}, [{idx2}]", + "ld1 {{v0.16b}}, [{hdr}]", // d | dmin | 12 packed scale bytes + // ── vectorised unpack_scales_mins ── + "and v1.16b, v0.16b, v30.16b", // lo6 of every byte + "ushr v2.16b, v0.16b, #6", + "shl v2.16b, v2.16b, #4", // (byte >> 6) << 4 + "and v3.16b, v0.16b, v16.16b", // lo4 + "ushr v4.16b, v0.16b, #4", // hi4 + "tbl v5.16b, {{v1.16b}}, v24.16b", // scales[0..4] + "tbl v6.16b, {{v3.16b}}, v26.16b", // scales[4..8] lo4 (from C) + "tbl v7.16b, {{v2.16b}}, v25.16b", // scales[4..8] hi2 (from A) + "orr v5.16b, v5.16b, v6.16b", + "orr v5.16b, v5.16b, v7.16b", // sc8 in lanes 0..8 + "tbl v6.16b, {{v1.16b}}, v27.16b", // mins[0..4] + "tbl v7.16b, {{v4.16b}}, v26.16b", // mins[4..8] lo4 (from C) + "tbl v1.16b, {{v2.16b}}, v28.16b", // mins[4..8] hi2 (from B) + "orr v6.16b, v6.16b, v7.16b", + "orr v31.16b, v6.16b, v1.16b", // mn8 stashed in v31 + "ushll v5.8h, v5.8b, #0", + "ushll v18.4s, v5.4h, #0", + "ushll2 v19.4s, v5.8h, #0", // scales as i32x4 ×2 + // ── sum1: 4 groups, identical to the v1 asm ── + "movi v17.4s, #0", + grp!("v18", "0", "1"), + grp!("v18", "2", "3"), + grp!("v19", "0", "1"), + grp!("v19", "2", "3"), + "addv s17, v17.4s", // sum1 (i32 in s17) + // ── sum2 = Σ mins[s] · q8_sums[s] (i16 sums, mins ≤ 63) ── + "ushll v1.8h, v31.8b, #0", + "ld1 {{v2.8h}}, [{sums}]", + "smull v3.4s, v2.4h, v1.4h", + "smlal2 v3.4s, v2.8h, v1.8h", + "addv s3, v3.4s", // sum2 (i32 in s3) + // ── epilogue: d_w·d_y·sum1f − dmin_w·d_y·sum2f, exact order ── + "ldr s0, [{hdr}]", // d (h[0]) | dmin (h[1]) + "mov h4, v0.h[0]", + "fcvt s4, h4", // d_w (exact) + "mov h5, v0.h[1]", + "fcvt s5, h5", // dmin_w (exact) + "scvtf s6, s17", // sum1 as f32 (same rounding as Rust) + "scvtf s7, s3", // sum2 as f32 + "fmul s4, s4, {dy:s}", + "fmul s4, s4, s6", + "fmul s5, s5, {dy:s}", + "fmul s5, s5, s7", + "fsub {contrib:s}, s4, s5", + hdr = in(reg) header, + q = inout(reg) quants => _, + a = inout(reg) act => _, + sums = in(reg) q8_sums, + idx = in(reg) Q4K_UNPACK_IDX.as_ptr(), + idx2 = in(reg) Q4K_UNPACK_IDX.as_ptr().wrapping_add(64), + dy = in(vreg) d_y, + contrib = out(vreg) contrib, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + out("v28") _, out("v30") _, out("v31") _, + options(nostack, readonly), + ); + } + contrib +} + +/// v2 hand-asm Q4_K × Q8_K matvec: per super-block one [`q4k_sb_contrib_asm`] +/// call — no per-block Rust glue at all (the v1 form's `unpack_scales_mins` + +/// scale-array + `sum2` + 2× software `f16_to_f32` measured 19.2 cyc/SB, +/// as much as the asm block itself). Bit-exact with the scalar reference +/// (`q8k_matvec_asm_v2_matches_scalar_bit_exact`). +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub fn q4k_q8k_matvec_asm_v2( + out: &mut [f32], + q8k_x: &Q8KActivation, + w: &[u8], + rows: usize, + cols: usize, +) { + debug_assert_eq!(out.len(), rows); + debug_assert_eq!(q8k_x.qs.len(), cols); + debug_assert_eq!(cols % ELEMS_PER_BLOCK, 0); + if rows == 0 || cols == 0 { + for v in out.iter_mut() { + *v = 0.0; + } + return; + } + let n_blocks = cols / ELEMS_PER_BLOCK; + let row_bytes = n_blocks * BLOCK_BYTES; + if w.len() < rows * row_bytes { + for v in out.iter_mut() { + *v = 0.0; + } + return; + } + + for (r, out_slot) in out.iter_mut().enumerate().take(rows) { + let row_base = r * row_bytes; + let mut acc = 0.0f32; + for sb in 0..n_blocks { + let block = &w[row_base + sb * BLOCK_BYTES..row_base + (sb + 1) * BLOCK_BYTES]; + let q8_base = sb * ELEMS_PER_BLOCK; + // SAFETY: 144-byte super-block (16 header + 128 quants); the act + // slice spans 256 i8; q8_sums has 8 i16 per super-block; the + // static index tables are 80 bytes. + acc += unsafe { + q4k_sb_contrib_asm( + block.as_ptr(), + block[16..].as_ptr(), + q8k_x.qs[q8_base..q8_base + ELEMS_PER_BLOCK].as_ptr(), + q8k_x.sums[sb * SUBBLOCKS_PER_BLOCK..].as_ptr(), + q8k_x.d[sb], + ) + }; + } + *out_slot = acc; + } +} + +/// v3 (C12): one `asm!` block per ROW — the super-block loop lives inside +/// the asm, so the TBL tables / masks load once per row instead of once per +/// super-block (v2 paid ~4-5 cyc/SB reloading loop-invariant constants). +/// The walking pointer exploits the Q4_K layout: each 144-byte block is +/// [16B header][128B quants] contiguous, so the header `ld1 ..., #16` +/// followed by the four group loads (4×32B) lands the pointer exactly on +/// the next block's header — no pointer arithmetic at all. +/// +/// Accumulation order matches the v1/v2/scalar forms exactly (sequential +/// `fadd` of per-block contributions), so it stays bit-exact. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[inline] +unsafe fn q4k_row_dot_asm( + row: *const u8, + act: *const i8, + q8_sums: *const i16, + d: *const f32, + n_blocks: usize, +) -> f32 { + let acc: f32; + macro_rules! grp { + ($sv:literal, $l0:literal, $l1:literal) => { + concat!( + "ld1 {{v0.16b, v1.16b}}, [{p}], #32\n", + "ld1 {{v20.16b, v21.16b, v22.16b, v23.16b}}, [{a}], #64\n", + "and v2.16b, v0.16b, v16.16b\n", + "and v3.16b, v1.16b, v16.16b\n", + "ushr v4.16b, v0.16b, #4\n", + "ushr v5.16b, v1.16b, #4\n", + "movi v6.4s, #0\n", + "movi v7.4s, #0\n", + "sdot v6.4s, v2.16b, v20.16b\n", + "sdot v6.4s, v3.16b, v21.16b\n", + "sdot v7.4s, v4.16b, v22.16b\n", + "sdot v7.4s, v5.16b, v23.16b\n", + "mul v6.4s, v6.4s, ", + $sv, + ".s[", + $l0, + "]\n", + "mul v7.4s, v7.4s, ", + $sv, + ".s[", + $l1, + "]\n", + "add v17.4s, v17.4s, v6.4s\n", + "add v17.4s, v17.4s, v7.4s\n", + ) + }; + } + unsafe { + core::arch::asm!( + // ── loop-invariant constants (per row, not per super-block) ── + "movi v16.16b, #0x0f", + "movi v30.16b, #0x3f", + "ld1 {{v24.16b, v25.16b, v26.16b, v27.16b}}, [{idx}]", + "ld1 {{v28.16b}}, [{idx2}]", + "fmov s29, wzr", // row accumulator + "2:", + "ld1 {{v0.16b}}, [{p}], #16", // header; pointer → quants + // ── vectorised unpack_scales_mins ── + "and v1.16b, v0.16b, v30.16b", + "ushr v2.16b, v0.16b, #6", + "shl v2.16b, v2.16b, #4", + "and v3.16b, v0.16b, v16.16b", + "ushr v4.16b, v0.16b, #4", + "tbl v5.16b, {{v1.16b}}, v24.16b", + "tbl v6.16b, {{v3.16b}}, v26.16b", + "tbl v7.16b, {{v2.16b}}, v25.16b", + "orr v5.16b, v5.16b, v6.16b", + "orr v5.16b, v5.16b, v7.16b", + "tbl v6.16b, {{v1.16b}}, v27.16b", + "tbl v7.16b, {{v4.16b}}, v26.16b", + "tbl v1.16b, {{v2.16b}}, v28.16b", + "orr v6.16b, v6.16b, v7.16b", + "orr v31.16b, v6.16b, v1.16b", // mn8 + "mov v15.16b, v0.16b", // keep header (d|dmin) for epilogue + "ushll v5.8h, v5.8b, #0", + "ushll v18.4s, v5.4h, #0", + "ushll2 v19.4s, v5.8h, #0", + // ── sum1 ── + "movi v17.4s, #0", + grp!("v18", "0", "1"), + grp!("v18", "2", "3"), + grp!("v19", "0", "1"), + grp!("v19", "2", "3"), + "addv s17, v17.4s", + // ── sum2 ── + "ushll v1.8h, v31.8b, #0", + "ld1 {{v2.8h}}, [{sums}], #16", + "smull v3.4s, v2.4h, v1.4h", + "smlal2 v3.4s, v2.8h, v1.8h", + "addv s3, v3.4s", + // ── epilogue ── + "ldr s8, [{d}], #4", // d_y for this super-block + "mov h4, v15.h[0]", + "fcvt s4, h4", + "mov h5, v15.h[1]", + "fcvt s5, h5", + "scvtf s6, s17", + "scvtf s7, s3", + "fmul s4, s4, s8", + "fmul s4, s4, s6", + "fmul s5, s5, s8", + "fmul s5, s5, s7", + "fsub s4, s4, s5", + "fadd s29, s29, s4", + "subs {n}, {n}, #1", + "b.ne 2b", + "fmov {acc:s}, s29", + p = inout(reg) row => _, + a = inout(reg) act => _, + sums = inout(reg) q8_sums => _, + d = inout(reg) d => _, + n = inout(reg) n_blocks => _, + idx = in(reg) Q4K_UNPACK_IDX.as_ptr(), + idx2 = in(reg) Q4K_UNPACK_IDX.as_ptr().wrapping_add(64), + acc = out(vreg) acc, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + out("v28") _, out("v29") _, out("v30") _, out("v31") _, + options(nostack, readonly), + ); + } + acc +} + +/// v3 hand-asm Q4_K × Q8_K matvec: one asm block per row (constants hoisted, +/// zero per-super-block Rust glue). Bit-exact with the scalar reference +/// (`q8k_matvec_asm_v3_matches_scalar_bit_exact`). +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub fn q4k_q8k_matvec_asm_v3( + out: &mut [f32], + q8k_x: &Q8KActivation, + w: &[u8], + rows: usize, + cols: usize, +) { + debug_assert_eq!(out.len(), rows); + debug_assert_eq!(q8k_x.qs.len(), cols); + debug_assert_eq!(cols % ELEMS_PER_BLOCK, 0); + if rows == 0 || cols == 0 { + for v in out.iter_mut() { + *v = 0.0; + } + return; + } + let n_blocks = cols / ELEMS_PER_BLOCK; + let row_bytes = n_blocks * BLOCK_BYTES; + if w.len() < rows * row_bytes { + for v in out.iter_mut() { + *v = 0.0; + } + return; + } + + for (r, out_slot) in out.iter_mut().enumerate().take(rows) { + // SAFETY: the row spans n_blocks × 144 bytes (checked above); the + // activation arrays carry n_blocks super-blocks of qs/sums/d. + *out_slot = unsafe { + q4k_row_dot_asm( + w[r * row_bytes..].as_ptr(), + q8k_x.qs.as_ptr(), + q8k_x.sums.as_ptr(), + q8k_x.d.as_ptr(), + n_blocks, + ) + }; + } +} + /// C12 opt-in: route Q4_K × Q8_K matvecs through the hand-asm kernel /// (`q4k_q8k_matvec_asm`) instead of the intrinsic path when `LARQL_Q4K_ASM` /// is `1`/`true`. Read once and cached — the env lookup must not land in the @@ -796,11 +1177,12 @@ pub fn q4k_q8k_matvec_into( // is compute/issue-bound, see `docs/q4k-decode-kernel.md`.) // // C12: opt-in hand-asm kernel (`LARQL_Q4K_ASM=1`). Bit-exact with - // the intrinsic path; ~+2.5% isolated. Default off until the - // two-super-block-interleaved version closes more of the gap and - // the gate_up fused path gets the same treatment. + // the intrinsic path. v2 (2026-06-11) folds ALL the per-super-block + // Rust glue into the asm block (vectorised scale/min unpack, sum2, + // hardware fcvt + epilogue) — the decomposition bench measured the + // glue at 19.2 cyc/SB vs the asm block's 16.3. if use_asm_kernel() { - q4k_q8k_matvec_asm(out, q8k_x, w, rows, cols); + q4k_q8k_matvec_asm_v3(out, q8k_x, w, rows, cols); } else { q4k_q8k_matvec_neon(out, q8k_x, w, rows, cols); } @@ -934,7 +1316,14 @@ pub fn q4k_q8k_gate_up_into( ) { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - q4k_q8k_gate_up_neon(gate_out, up_out, q8k_x, gate_w, up_w, rows, cols); + // C12: same opt-in as `q4k_q8k_matvec_into` — `LARQL_Q4K_ASM=1` + // routes the fused kernel through the hand-asm form. Bit-exact + // (`q8k_gate_up_asm_matches_scalar_bit_exact`); default off. + if use_asm_kernel() { + q4k_q8k_gate_up_asm(gate_out, up_out, q8k_x, gate_w, up_w, rows, cols); + } else { + q4k_q8k_gate_up_neon(gate_out, up_out, q8k_x, gate_w, up_w, rows, cols); + } return; } #[allow(unreachable_code)] @@ -1073,6 +1462,234 @@ pub fn q4k_q8k_gate_up_neon( } } +/// Fused gate+up twin of [`q4k_sb_sum1_asm`] (C12): one super-block's integer +/// `sum1` for BOTH the gate and up matrices in a single `asm!` block, sharing +/// the four activation vector loads per group between them — the point of the +/// fusion (the separate-matvec form streams the same 64 activation bytes per +/// group twice, and the doubled SDOT stream gives the OoO core independent +/// work to fill the ~20 stalled cycles the single-matrix form measures). +/// Scale handling matches the single-matrix form: 8 scales per matrix arrive +/// as two i32x4 vectors, applied with `mul (by element)` — no scalar `ldrb`. +/// +/// Register map: v16 nibble mask; v17/v26 gate/up accumulators; v18-v19 +/// gate scales, v24-v25 up scales; per group v0-v1/v8-v9 raw nibbles, +/// v2-v5/v10-v13 unpacked, v6-v7/v14-v15 dot temps, v20-v23 shared +/// activation. i32 lane sums are order-independent (wrapping add is +/// associative), so the tree-sum is bit-exact with the scalar reference. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[inline] +unsafe fn q4k_gate_up_sb_sum1_asm( + g_quants: *const u8, + u_quants: *const u8, + act: *const i8, + g_scales: *const i32, + u_scales: *const i32, +) -> (i32, i32) { + let sum1_g: i32; + let sum1_u: i32; + // One group of the unrolled body: `$gsv`/`$usv` = gate/up scale vectors, + // `$l0`/`$l1` = lane indices for sub-blocks 2g / 2g+1. + macro_rules! grp2 { + ($gsv:literal, $usv:literal, $l0:literal, $l1:literal) => { + concat!( + "ld1 {{v0.16b, v1.16b}}, [{g}], #32\n", + "ld1 {{v8.16b, v9.16b}}, [{u}], #32\n", + "ld1 {{v20.16b, v21.16b, v22.16b, v23.16b}}, [{a}], #64\n", + "and v2.16b, v0.16b, v16.16b\n", // gate lo (sub-block 2g) + "and v3.16b, v1.16b, v16.16b\n", + "ushr v4.16b, v0.16b, #4\n", // gate hi (sub-block 2g+1) + "ushr v5.16b, v1.16b, #4\n", + "and v10.16b, v8.16b, v16.16b\n", // up lo + "and v11.16b, v9.16b, v16.16b\n", + "ushr v12.16b, v8.16b, #4\n", // up hi + "ushr v13.16b, v9.16b, #4\n", + "movi v6.4s, #0\n", + "movi v7.4s, #0\n", + "movi v14.4s, #0\n", + "movi v15.4s, #0\n", + // Gate / up SDOTs interleaved — independent chains on the + // same shared activation registers. + "sdot v6.4s, v2.16b, v20.16b\n", + "sdot v14.4s, v10.16b, v20.16b\n", + "sdot v6.4s, v3.16b, v21.16b\n", + "sdot v14.4s, v11.16b, v21.16b\n", + "sdot v7.4s, v4.16b, v22.16b\n", + "sdot v15.4s, v12.16b, v22.16b\n", + "sdot v7.4s, v5.16b, v23.16b\n", + "sdot v15.4s, v13.16b, v23.16b\n", + "mul v6.4s, v6.4s, ", + $gsv, + ".s[", + $l0, + "]\n", + "mul v7.4s, v7.4s, ", + $gsv, + ".s[", + $l1, + "]\n", + "mul v14.4s, v14.4s, ", + $usv, + ".s[", + $l0, + "]\n", + "mul v15.4s, v15.4s, ", + $usv, + ".s[", + $l1, + "]\n", + "add v17.4s, v17.4s, v6.4s\n", + "add v17.4s, v17.4s, v7.4s\n", + "add v26.4s, v26.4s, v14.4s\n", + "add v26.4s, v26.4s, v15.4s\n", + ) + }; + } + unsafe { + core::arch::asm!( + "movi v16.16b, #0x0f", // nibble mask + "movi v17.4s, #0", // gate sum1 accumulator + "movi v26.4s, #0", // up sum1 accumulator + "ld1 {{v18.4s, v19.4s}}, [{gs}]", // gate scales[0..4], [4..8] + "ld1 {{v24.4s, v25.4s}}, [{us}]", // up scales[0..4], [4..8] + grp2!("v18", "v24", "0", "1"), // group 0 → sub-blocks 0,1 + grp2!("v18", "v24", "2", "3"), // group 1 → sub-blocks 2,3 + grp2!("v19", "v25", "0", "1"), // group 2 → sub-blocks 4,5 + grp2!("v19", "v25", "2", "3"), // group 3 → sub-blocks 6,7 + "addv s17, v17.4s", + "addv s26, v26.4s", + "fmov {sg:w}, s17", + "fmov {su:w}, s26", + g = inout(reg) g_quants => _, + u = inout(reg) u_quants => _, + a = inout(reg) act => _, + gs = in(reg) g_scales, + us = in(reg) u_scales, + sg = out(reg) sum1_g, + su = out(reg) sum1_u, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, out("v7") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, out("v17") _, out("v18") _, out("v19") _, + out("v20") _, out("v21") _, out("v22") _, out("v23") _, + out("v24") _, out("v25") _, out("v26") _, + options(nostack, readonly), + ); + } + (sum1_g, sum1_u) +} + +/// Hand-asm fused gate+up matvec (C12). Identical interface and output to +/// [`q4k_q8k_gate_up_neon`] — integer `sum1` pairs come from +/// [`q4k_gate_up_sb_sum1_asm`], the `sum2` terms and the f32 epilogue are +/// the same Rust code as the neon/scalar forms, so it is bit-exact with two +/// independent scalar matvecs (`q8k_gate_up_asm_matches_scalar_bit_exact`). +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[allow(clippy::too_many_arguments)] +pub fn q4k_q8k_gate_up_asm( + gate_out: &mut [f32], + up_out: &mut [f32], + q8k_x: &Q8KActivation, + gate_w: &[u8], + up_w: &[u8], + rows: usize, + cols: usize, +) { + debug_assert_eq!(gate_out.len(), rows); + debug_assert_eq!(up_out.len(), rows); + debug_assert_eq!(q8k_x.qs.len(), cols); + debug_assert_eq!(cols % ELEMS_PER_BLOCK, 0); + if rows == 0 || cols == 0 { + for v in gate_out.iter_mut() { + *v = 0.0; + } + for v in up_out.iter_mut() { + *v = 0.0; + } + return; + } + let n_blocks = cols / ELEMS_PER_BLOCK; + let row_bytes = n_blocks * BLOCK_BYTES; + if gate_w.len() < rows * row_bytes || up_w.len() < rows * row_bytes { + for v in gate_out.iter_mut() { + *v = 0.0; + } + for v in up_out.iter_mut() { + *v = 0.0; + } + return; + } + + for r in 0..rows { + let row_base = r * row_bytes; + let mut acc_g = 0.0f32; + let mut acc_u = 0.0f32; + for sb in 0..n_blocks { + let g_block = &gate_w[row_base + sb * BLOCK_BYTES..row_base + (sb + 1) * BLOCK_BYTES]; + let u_block = &up_w[row_base + sb * BLOCK_BYTES..row_base + (sb + 1) * BLOCK_BYTES]; + let d_g = f16_to_f32(u16::from_le_bytes([g_block[0], g_block[1]])); + let dmin_g = f16_to_f32(u16::from_le_bytes([g_block[2], g_block[3]])); + let d_u = f16_to_f32(u16::from_le_bytes([u_block[0], u_block[1]])); + let dmin_u = f16_to_f32(u16::from_le_bytes([u_block[2], u_block[3]])); + let (sc_g, mn_g) = unpack_scales_mins(&g_block[4..16]); + let (sc_u, mn_u) = unpack_scales_mins(&u_block[4..16]); + + let sc_g_i32 = [ + sc_g[0] as i32, + sc_g[1] as i32, + sc_g[2] as i32, + sc_g[3] as i32, + sc_g[4] as i32, + sc_g[5] as i32, + sc_g[6] as i32, + sc_g[7] as i32, + ]; + let sc_u_i32 = [ + sc_u[0] as i32, + sc_u[1] as i32, + sc_u[2] as i32, + sc_u[3] as i32, + sc_u[4] as i32, + sc_u[5] as i32, + sc_u[6] as i32, + sc_u[7] as i32, + ]; + + let q8_base = sb * ELEMS_PER_BLOCK; + let q8_qs_ptr = q8k_x.qs[q8_base..q8_base + ELEMS_PER_BLOCK].as_ptr(); + let q8_sums = &q8k_x.sums[sb * SUBBLOCKS_PER_BLOCK..(sb + 1) * SUBBLOCKS_PER_BLOCK]; + let d_y = q8k_x.d[sb]; + + // SAFETY: each Q4_K super-block is 144 bytes (16 header + 128 + // quants), `q8_qs_ptr` spans a full 256-i8 super-block, and both + // scale arrays are 8 i32. + let (s1_g, s1_u) = unsafe { + q4k_gate_up_sb_sum1_asm( + g_block[16..].as_ptr(), + u_block[16..].as_ptr(), + q8_qs_ptr, + sc_g_i32.as_ptr(), + sc_u_i32.as_ptr(), + ) + }; + + // sum2 stays scalar (precomputed Q8_K sums; no SDOT) — identical + // to the neon/scalar paths so the f32 epilogue is bit-for-bit the + // same. + let mut s2_g: i32 = 0; + let mut s2_u: i32 = 0; + for s in 0..SUBBLOCKS_PER_BLOCK { + s2_g += mn_g[s] as i32 * q8_sums[s] as i32; + s2_u += mn_u[s] as i32 * q8_sums[s] as i32; + } + acc_g += d_g * d_y * s1_g as f32 - dmin_g * d_y * s2_g as f32; + acc_u += d_u * d_y * s1_u as f32 - dmin_u * d_y * s2_u as f32; + } + gate_out[r] = acc_g; + up_out[r] = acc_u; + } +} + // ── Q6_K × Q8_K matvec ─────────────────────────────────────────────────────── // // Q6_K super-block: 210 bytes per 256 values. @@ -1268,6 +1885,170 @@ pub fn q6k_q8k_matvec_neon( } } +/// TBL index table for the Q6_K hi2 replicate: group `j` (of 4 within one +/// 16-byte `qh` vector) selects bytes `4j..4j+3`, each repeated 4×, so a +/// single `tbl` builds the per-element hi2 source that the neon form +/// assembles with four scalar multiplies per group. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[rustfmt::skip] +static Q6K_TBL_IDX: [u8; 64] = [ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, +]; + +/// Right-shift pattern for the replicated hi2 bytes (negative = shift right +/// under `sshl`): element 4j+k needs `qh_byte >> 2k`. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +static Q6K_SHIFT_RIGHT: [i8; 16] = [0, -2, -4, -6, 0, -2, -4, -6, 0, -2, -4, -6, 0, -2, -4, -6]; + +/// One Q6_K super-block's integer `sum1 = Σ_g scale[g] · dot16_g` in a single +/// `asm!` block (C12). Differences from [`q6k_q8k_matvec_neon`]'s inner loop: +/// the hi2 replicate is one `tbl` (vs 4 scalar multiplies + vector rebuild), +/// and the per-group scale lands as a vector-lane `mul` on the 4-lane SDOT +/// partials with a single `addv` at the end (vs 16 horizontal `addv` + scalar +/// multiply-adds). i32 lane sums are order-independent (wrapping add), so the +/// result is bit-exact with the neon/scalar forms. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[inline] +unsafe fn q6k_sb_sum1_asm( + ql: *const u8, + qh: *const u8, + act: *const i8, + scales: *const i32, +) -> i32 { + let sum1: i32; + // One 16-element group: `$qh` = the loaded qh vector for this group's + // quad (v8-v11), `$idx` = the TBL replicate index vector for the group's + // position within that quad (v24-v27), `$sv`/`$lane` = widened scale + // vector (v12-v15) and lane. + macro_rules! q6grp { + ($qh:literal, $idx:literal, $sv:literal, $lane:literal) => { + concat!( + "ld1 {{v0.8b}}, [{ql}], #8\n", + "ld1 {{v5.16b}}, [{a}], #16\n", + "and v1.16b, v0.16b, v29.16b\n", // lo4 of even elements + "ushr v2.16b, v0.16b, #4\n", // lo4 of odd elements + "zip1 v3.16b, v1.16b, v2.16b\n", // restore element order + "tbl v4.16b, {{", $qh, ".16b}}, ", $idx, ".16b\n", + "sshl v4.16b, v4.16b, v28.16b\n", + "and v4.16b, v4.16b, v30.16b\n", + "shl v4.16b, v4.16b, #4\n", + "orr v3.16b, v3.16b, v4.16b\n", // raw6 = lo4 | hi2<<4 + "sub v3.16b, v3.16b, v31.16b\n", // signed: raw6 - 32 + "movi v6.4s, #0\n", + "sdot v6.4s, v3.16b, v5.16b\n", + "mul v6.4s, v6.4s, ", $sv, ".s[", $lane, "]\n", + "add v16.4s, v16.4s, v6.4s\n", + ) + }; + } + unsafe { + core::arch::asm!( + "movi v16.4s, #0", // sum1 accumulator + "movi v29.16b, #0x0f", // lo4 mask + "movi v30.16b, #0x03", // hi2 mask + "movi v31.16b, #32", // raw6 bias + "ld1 {{v8.16b, v9.16b, v10.16b, v11.16b}}, [{qh}]", // 64B qh + "ld1 {{v12.4s, v13.4s, v14.4s, v15.4s}}, [{scales}]", // 16 i32 scales + "ld1 {{v24.16b, v25.16b, v26.16b, v27.16b}}, [{idx}]", // TBL tables + "ld1 {{v28.16b}}, [{shift}]", // shift pattern + q6grp!("v8", "v24", "v12", "0"), + q6grp!("v8", "v25", "v12", "1"), + q6grp!("v8", "v26", "v12", "2"), + q6grp!("v8", "v27", "v12", "3"), + q6grp!("v9", "v24", "v13", "0"), + q6grp!("v9", "v25", "v13", "1"), + q6grp!("v9", "v26", "v13", "2"), + q6grp!("v9", "v27", "v13", "3"), + q6grp!("v10", "v24", "v14", "0"), + q6grp!("v10", "v25", "v14", "1"), + q6grp!("v10", "v26", "v14", "2"), + q6grp!("v10", "v27", "v14", "3"), + q6grp!("v11", "v24", "v15", "0"), + q6grp!("v11", "v25", "v15", "1"), + q6grp!("v11", "v26", "v15", "2"), + q6grp!("v11", "v27", "v15", "3"), + "addv s16, v16.4s", + "fmov {sum1:w}, s16", + ql = inout(reg) ql => _, + a = inout(reg) act => _, + qh = in(reg) qh, + scales = in(reg) scales, + idx = in(reg) Q6K_TBL_IDX.as_ptr(), + shift = in(reg) Q6K_SHIFT_RIGHT.as_ptr(), + sum1 = out(reg) sum1, + out("v0") _, out("v1") _, out("v2") _, out("v3") _, + out("v4") _, out("v5") _, out("v6") _, + out("v8") _, out("v9") _, out("v10") _, out("v11") _, + out("v12") _, out("v13") _, out("v14") _, out("v15") _, + out("v16") _, + out("v24") _, out("v25") _, out("v26") _, out("v27") _, + out("v28") _, out("v29") _, out("v30") _, out("v31") _, + options(nostack, readonly), + ); + } + sum1 +} + +/// Hand-asm Q6_K × Q8_K matvec (C12). Identical interface and output to +/// [`q6k_q8k_matvec_neon`] — `sum1` comes from [`q6k_sb_sum1_asm`], the f32 +/// epilogue (`acc += d_w·d_y·sum1`, no mins term) is the same Rust code, so +/// it is bit-exact with the scalar reference +/// (`q6k_matvec_asm_matches_scalar_bit_exact`). +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub fn q6k_q8k_matvec_asm( + out: &mut [f32], + q8k_x: &Q8KActivation, + w: &[u8], + rows: usize, + cols: usize, +) { + debug_assert_eq!(cols % ELEMS_PER_BLOCK, 0); + let n_blocks = cols / ELEMS_PER_BLOCK; + let row_bytes = n_blocks * Q6K_BLOCK_BYTES; + for v in out.iter_mut() { + *v = 0.0; + } + if rows == 0 || cols == 0 || w.len() < rows * row_bytes { + return; + } + + for (r, out_r) in out.iter_mut().enumerate().take(rows) { + let row_base = r * row_bytes; + let mut acc = 0.0f32; + for sb in 0..n_blocks { + let block = &w[row_base + sb * Q6K_BLOCK_BYTES..]; + let d_w = f16_to_f32(u16::from_le_bytes([block[208], block[209]])); + let d_y = q8k_x.d[sb]; + + // 16 per-group i8 scales widened to i32 for the vector-lane muls. + let mut sc = [0i32; 16]; + for (g, s) in sc.iter_mut().enumerate() { + *s = block[192 + g] as i8 as i32; + } + + let q8_base = sb * ELEMS_PER_BLOCK; + let q8_ptr = q8k_x.qs[q8_base..q8_base + ELEMS_PER_BLOCK].as_ptr(); + + // SAFETY: a Q6_K super-block is 210 bytes (128 ql + 64 qh + 16 + // scales + 2 d); `q8_ptr` spans a full 256-i8 super-block; `sc` + // is 16 i32; the static TBL/shift tables are 64/16 bytes. + let sum1 = unsafe { + q6k_sb_sum1_asm( + block.as_ptr(), + block.as_ptr().add(128), + q8_ptr, + sc.as_ptr(), + ) + }; + acc += d_w * d_y * sum1 as f32; + } + *out_r = acc; + } +} + /// Public entry point: dispatches to NEON on aarch64, scalar elsewhere. /// `w` is a Q6_K weight matrix of `rows` rows × `cols` columns. /// `q8k_x` is the pre-quantised activation vector (`cols` elements). @@ -1280,7 +2061,13 @@ pub fn q6k_q8k_matvec_into( ) { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - q6k_q8k_matvec_neon(out, q8k_x, w, rows, cols); + // C12: same opt-in as the Q4_K kernels — `LARQL_Q4K_ASM=1` routes + // through the hand-asm form. Bit-exact; default off. + if use_asm_kernel() { + q6k_q8k_matvec_asm(out, q8k_x, w, rows, cols); + } else { + q6k_q8k_matvec_neon(out, q8k_x, w, rows, cols); + } return; } #[allow(unreachable_code)] @@ -1512,6 +2299,233 @@ mod tests { ); } + /// The fused gate+up hand-asm kernel must be bit-exact with two + /// independent scalar matvecs — same shapes discipline as the + /// single-matrix asm test, with DIFFERENT gate vs up weights so a + /// pointer/register swap between the two matrices can't cancel out. + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q8k_gate_up_asm_matches_scalar_bit_exact() { + for &(rows, cols) in &[(7usize, 1024usize), (8, 2560), (3, 2560), (16, 512)] { + let x: Vec = (0..cols) + .map(|i| { + let f = i as f32; + ((f * 0.0173).sin() * 1.7 + (f * 0.041).cos() * 0.9) * 1.3 + }) + .collect(); + let g_f32: Vec = (0..rows * cols) + .map(|i| { + let f = i as f32; + ((f * 0.013).cos() * 0.4 - (f * 0.027).sin() * 0.2) * 0.6 + }) + .collect(); + let u_f32: Vec = (0..rows * cols) + .map(|i| { + let f = i as f32; + ((f * 0.019).sin() * 0.5 + (f * 0.031).cos() * 0.3) * 0.7 + }) + .collect(); + let g_q4 = quantize_q4_k(&g_f32); + let u_q4 = quantize_q4_k(&u_f32); + let q8 = quantize_x_to_q8k(&x); + + let mut g_scalar = vec![0.0f32; rows]; + let mut u_scalar = vec![0.0f32; rows]; + q4k_q8k_matvec_scalar(&mut g_scalar, &q8, &g_q4, rows, cols); + q4k_q8k_matvec_scalar(&mut u_scalar, &q8, &u_q4, rows, cols); + + let mut g_asm = vec![0.0f32; rows]; + let mut u_asm = vec![0.0f32; rows]; + q4k_q8k_gate_up_asm(&mut g_asm, &mut u_asm, &q8, &g_q4, &u_q4, rows, cols); + + for r in 0..rows { + assert_eq!( + g_scalar[r].to_bits(), + g_asm[r].to_bits(), + "gate rows={rows} cols={cols} row {r}: scalar={} asm={}", + g_scalar[r], + g_asm[r], + ); + assert_eq!( + u_scalar[r].to_bits(), + u_asm[r].to_bits(), + "up rows={rows} cols={cols} row {r}: scalar={} asm={}", + u_scalar[r], + u_asm[r], + ); + } + } + } + + /// Fused gate+up asm early-return guards: zero dims and short weight + /// buffers must zero BOTH outputs (same contract as the neon form). + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q8k_gate_up_asm_zero_dims_and_short_weights_zero_output() { + let empty = Q8KActivation { + qs: vec![], + d: vec![], + sums: vec![], + }; + let mut g = vec![1.0f32; 4]; + let mut u = vec![1.0f32; 4]; + q4k_q8k_gate_up_asm(&mut g, &mut u, &empty, &[], &[], 4, 0); + assert!(g.iter().chain(u.iter()).all(|&v| v == 0.0)); + + let cols = 256; + let rows = 2; + let q = quantize_x_to_q8k(&vec![0.5f32; cols]); + let w_short = vec![0u8; BLOCK_BYTES]; // one row's worth, rows == 2 + let w_full = vec![0u8; 2 * BLOCK_BYTES]; + let mut g = vec![1.0f32; rows]; + let mut u = vec![1.0f32; rows]; + q4k_q8k_gate_up_asm(&mut g, &mut u, &q, &w_short, &w_full, rows, cols); + assert!(g.iter().chain(u.iter()).all(|&v| v == 0.0)); + } + + /// The v2 (all-glue-in-asm) kernel must be bit-exact with the scalar + /// reference: the vectorised scale/min unpack must reproduce + /// `unpack_scales_mins` exactly, `fcvt`/`scvtf` match the software + /// conversions bit-for-bit, and the epilogue preserves expression order. + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q8k_matvec_asm_v2_matches_scalar_bit_exact() { + for &(rows, cols) in &[(7usize, 1024usize), (8, 2560), (3, 2560), (16, 512)] { + let x: Vec = (0..cols) + .map(|i| { + let f = i as f32; + ((f * 0.0173).sin() * 1.7 + (f * 0.041).cos() * 0.9) * 1.3 + }) + .collect(); + let w_f32: Vec = (0..rows * cols) + .map(|i| { + let f = i as f32; + ((f * 0.013).cos() * 0.4 - (f * 0.027).sin() * 0.2) * 0.6 + }) + .collect(); + let w_q4 = quantize_q4_k(&w_f32); + let q8 = quantize_x_to_q8k(&x); + + let mut out_scalar = vec![0.0f32; rows]; + let mut out_v2 = vec![0.0f32; rows]; + q4k_q8k_matvec_scalar(&mut out_scalar, &q8, &w_q4, rows, cols); + q4k_q8k_matvec_asm_v2(&mut out_v2, &q8, &w_q4, rows, cols); + + for r in 0..rows { + assert_eq!( + out_scalar[r].to_bits(), + out_v2[r].to_bits(), + "rows={rows} cols={cols} row {r}: scalar={} v2={} diff={}", + out_scalar[r], + out_v2[r], + (out_scalar[r] - out_v2[r]).abs() + ); + } + } + } + + /// The v3 (whole-row-in-asm) kernel must be bit-exact with the scalar + /// reference — the in-asm loop changes only WHERE the iteration happens, + /// not any arithmetic or its order. + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q8k_matvec_asm_v3_matches_scalar_bit_exact() { + for &(rows, cols) in &[(7usize, 1024usize), (8, 2560), (3, 2560), (16, 512)] { + let x: Vec = (0..cols) + .map(|i| { + let f = i as f32; + ((f * 0.0173).sin() * 1.7 + (f * 0.041).cos() * 0.9) * 1.3 + }) + .collect(); + let w_f32: Vec = (0..rows * cols) + .map(|i| { + let f = i as f32; + ((f * 0.013).cos() * 0.4 - (f * 0.027).sin() * 0.2) * 0.6 + }) + .collect(); + let w_q4 = quantize_q4_k(&w_f32); + let q8 = quantize_x_to_q8k(&x); + + let mut out_scalar = vec![0.0f32; rows]; + let mut out_v3 = vec![0.0f32; rows]; + q4k_q8k_matvec_scalar(&mut out_scalar, &q8, &w_q4, rows, cols); + q4k_q8k_matvec_asm_v3(&mut out_v3, &q8, &w_q4, rows, cols); + + for r in 0..rows { + assert_eq!( + out_scalar[r].to_bits(), + out_v3[r].to_bits(), + "rows={rows} cols={cols} row {r}: scalar={} v3={} diff={}", + out_scalar[r], + out_v3[r], + (out_scalar[r] - out_v3[r]).abs() + ); + } + } + } + + /// The Q6_K hand-asm kernel must be bit-exact with the scalar reference + /// (and therefore the neon form) — the TBL-replicate + vector-lane scale + /// restructure changes only the i32 summation order, which is exact. + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q6k_matvec_asm_matches_scalar_bit_exact() { + for &(rows, cols) in &[(7usize, 1024usize), (8, 2560), (3, 2560), (16, 512)] { + let x: Vec = (0..cols) + .map(|i| { + let f = i as f32; + ((f * 0.0173).sin() * 1.7 + (f * 0.041).cos() * 0.9) * 1.3 + }) + .collect(); + let w_f32: Vec = (0..rows * cols) + .map(|i| { + let f = i as f32; + ((f * 0.013).cos() * 0.4 - (f * 0.027).sin() * 0.2) * 0.6 + }) + .collect(); + let w_q6 = quantize_q6_k(&w_f32); + let q8 = quantize_x_to_q8k(&x); + + let mut out_scalar = vec![0.0f32; rows]; + let mut out_asm = vec![0.0f32; rows]; + q6k_q8k_matvec_scalar(&mut out_scalar, &q8, &w_q6, rows, cols); + q6k_q8k_matvec_asm(&mut out_asm, &q8, &w_q6, rows, cols); + + for r in 0..rows { + assert_eq!( + out_scalar[r].to_bits(), + out_asm[r].to_bits(), + "rows={rows} cols={cols} row {r}: scalar={} asm={} diff={}", + out_scalar[r], + out_asm[r], + (out_scalar[r] - out_asm[r]).abs() + ); + } + } + } + + /// Q6_K asm early-return guards: zero dims / short weights zero output. + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[test] + fn q6k_matvec_asm_zero_dims_and_short_weights_zero_output() { + let empty = Q8KActivation { + qs: vec![], + d: vec![], + sums: vec![], + }; + let mut out = vec![1.0f32; 4]; + q6k_q8k_matvec_asm(&mut out, &empty, &[], 4, 0); + assert!(out.iter().all(|&v| v == 0.0)); + + let cols = 256; + let rows = 2; + let q = quantize_x_to_q8k(&vec![0.5f32; cols]); + let w = vec![0u8; Q6K_BLOCK_BYTES]; // one row's worth, rows == 2 + let mut out = vec![1.0f32; rows]; + q6k_q8k_matvec_asm(&mut out, &q, &w, rows, cols); + assert!(out.iter().all(|&v| v == 0.0)); + } + /// `LARQL_Q4K_ASM` opt-in truth table (the pure parse behind the /// `OnceLock`-cached `use_asm_kernel`). #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] diff --git a/crates/larql-compute/src/kquant_forward/cached.rs b/crates/larql-compute/src/kquant_forward/cached.rs index f5fb4108e..eba5db63a 100644 --- a/crates/larql-compute/src/kquant_forward/cached.rs +++ b/crates/larql-compute/src/kquant_forward/cached.rs @@ -34,7 +34,7 @@ use ndarray::Array2; use crate::attention::{ decode::{gqa_attention_decode_step, run_attention_block_decode_step_backend}, - rope::apply_rope_partial_at, + rope::apply_rope_partial_at_full, run_attention_with_kv_backend, }; use crate::ffn::WeightFfn; @@ -630,15 +630,26 @@ pub fn attention_decode_step_native( Some(norm_w) => rms_norm_heads(&q_full, norm_w, num_q, head_dim, qk_norm_off), None => q_full, }; - let layer_rope_base = arch.rope_base_for_layer(layer); + // RoPE must match the staged path / prefill exactly: override-aware + // base, the per-layer position divisor (Gemma 3 linear rope_scaling + // applies ÷factor on GLOBAL layers only), and llama3 frequency + // scaling. The unscaled `apply_rope_partial_at` here was the direct- + // path divergence on gemma3-4b (global-layer K/Q rope'd at 8× the + // position the prefill cache used). + let layer_rope_base = crate::forward_overrides::effective_rope_base_for_layer(arch, layer); let rotary_frac = arch.rotary_fraction_for_layer(layer); - let q_rope = apply_rope_partial_at( + let pos_divisor = + crate::forward_overrides::effective_rope_position_divisor_for_layer(arch, layer); + let llama3 = crate::forward_overrides::effective_llama3_rope_scaling(arch); + let q_rope = apply_rope_partial_at_full( &q_normed, num_q, head_dim, layer_rope_base, rotary_frac, abs_position, + pos_divisor, + llama3, ); let k_vec = matvec_q4k_or_q6k_q8k(k_bytes, k_fmt, &h_norm_q8k, kv_dim, hidden)?; @@ -667,13 +678,15 @@ pub fn attention_decode_step_native( Some(norm_w) => rms_norm_heads(&k_full_new, norm_w, num_kv, head_dim, qk_norm_off), None => k_full_new, }; - let k_new_rope = apply_rope_partial_at( + let k_new_rope = apply_rope_partial_at_full( &k_normed, num_kv, head_dim, layer_rope_base, rotary_frac, abs_position, + pos_divisor, + llama3, ); let (k_concat, v_concat) = match kv_entry { @@ -819,36 +832,72 @@ fn run_ffn_decode_step_q4k_direct( let gate_vec = matvec_q4k_or_q6k_q8k(gate_bytes, gate_fmt, &h_in_q8k, intermediate, hidden)?; let up_vec = matvec_q4k_or_q6k_q8k(up_bytes, up_fmt, &h_in_q8k, intermediate, hidden)?; - // Element-wise activation: activation(gate) * up. + // Element-wise activation: activation(gate) * up. Rayon-chunked — the + // per-element math (libm tanh/exp included) is unchanged, so the output + // is bit-identical to the serial loop; the decode sample showed this + // scalar pass serial on the main thread while the workers slept. let mut activated = vec![0.0f32; intermediate]; - match arch.activation() { - larql_models::Activation::GeluTanh => { - let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); - for i in 0..intermediate { - let x = gate_vec[i]; - let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); - let g = 0.5 * x * (1.0 + inner.tanh()); - activated[i] = g * up_vec[i]; - } - } - _ => { - // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. - for i in 0..intermediate { - let x = gate_vec[i]; - let sig = 1.0 / (1.0 + (-x).exp()); - let g = x * sig; - activated[i] = g * up_vec[i]; - } - } + { + use rayon::prelude::*; + let gelu = matches!(arch.activation(), larql_models::Activation::GeluTanh); + let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); + activated + .par_chunks_mut(256) + .zip(gate_vec.par_chunks(256).zip(up_vec.par_chunks(256))) + .for_each(|(a_c, (g_c, u_c))| { + if gelu { + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); + *a = 0.5 * x * (1.0 + inner.tanh()) * u; + } + } else { + // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let sig = 1.0 / (1.0 + (-x).exp()); + *a = x * sig * u; + } + } + }); } // down projection: out = activated @ W_down.T → [hidden]. // Re-quantise the post-activation vector (`intermediate`-wide) for // the down matvec — different input from gate/up. - let mut activated_q8k = Q8KActivation::with_capacity(intermediate); - quantize_x_to_q8k_into(&mut activated_q8k, &activated); + // + // The stored down row width may be PADDED up to a 256-multiple when + // `intermediate` isn't one (e.g. the 26B-A4B hybrid-MoE dense slab: + // intermediate 2112 stored as 2304-col Q6_K rows). Derive the stored + // width from the byte length and zero-pad the activation to match — + // pad columns multiply zero activations, so the result is exact. + // (Twin of the same handling in larql-inference's cached.rs — keep in + // lockstep, see the consolidation hazard in q4k-direct-attention.md.) + let down_sb_bytes = match down_fmt { + "Q4_K" => 144, + "Q6_K" => 210, + _ => return None, + }; + let down_bytes_per_row = down_bytes.len() / hidden; + if down_bytes_per_row == 0 || !down_bytes_per_row.is_multiple_of(down_sb_bytes) { + return None; + } + let stored_cols = + down_bytes_per_row / down_sb_bytes * larql_models::quant::ggml::Q4_K_BLOCK_ELEMS; + if stored_cols < intermediate { + return None; + } + let activated_padded: Vec; + let act_slice: &[f32] = if stored_cols != intermediate { + let mut p = vec![0.0f32; stored_cols]; + p[..intermediate].copy_from_slice(&activated); + activated_padded = p; + &activated_padded + } else { + &activated + }; + let mut activated_q8k = Q8KActivation::with_capacity(stored_cols); + quantize_x_to_q8k_into(&mut activated_q8k, act_slice); let down_vec = - matvec_q4k_or_q6k_q8k(down_bytes, down_fmt, &activated_q8k, hidden, intermediate)?; + matvec_q4k_or_q6k_q8k(down_bytes, down_fmt, &activated_q8k, hidden, stored_cols)?; let mut out = vec_to_2d_row(down_vec); if let Some(bias) = arch .ffn_down_bias_key(layer) diff --git a/crates/larql-compute/src/kv_dispatch/cpu.rs b/crates/larql-compute/src/kv_dispatch/cpu.rs index a79a915a3..6f5fdd6b1 100644 --- a/crates/larql-compute/src/kv_dispatch/cpu.rs +++ b/crates/larql-compute/src/kv_dispatch/cpu.rs @@ -30,7 +30,7 @@ use ndarray::Array2; use super::{KvDispatch, KvHandle, KvHandleInner, ResidualHandle, ResidualHandleInner}; use crate::attention::{ - run_attention_block_decode_step_backend, run_attention_block_decode_step_q4k_direct, + run_attention_block_decode_step_backend, run_attention_with_kv_backend, SharedKV, }; use larql_models::ModelWeights; @@ -54,9 +54,12 @@ fn q4k_direct_attn_enabled() -> bool { // ─── CpuKvHandle ──────────────────────────────────────────────────────────── -/// Single-layer K/V cache held in host memory. Wraps the existing -/// `SharedKV = (K, V)` shape — `K` and `V` are owned `Array2` -/// growing by one row per `append_kv` call. +/// Single-layer K/V cache held in host memory as growable row-major +/// buffers (`rows × kv_dim` valid prefix of each Vec). Appending one row +/// per decode step is amortised O(kv_dim) — the previous `SharedKV`-tuple +/// representation re-allocated and copied the WHOLE cache per layer per +/// step (clone + zeros + two assigns ≈ 3 full-cache copies), which the +/// decode sample showed as the dominant O(ctx) serial sink. pub struct CpuKvHandle { /// Layer index this handle was minted for. Carried for debugging /// / future trait surface; not consulted by the current append / @@ -64,8 +67,9 @@ pub struct CpuKvHandle { #[allow(dead_code)] layer: usize, kv_dim: usize, - /// `None` before the first `append_kv` / `attention_prefill`. - state: Option, + k_buf: Vec, + v_buf: Vec, + rows: usize, } impl CpuKvHandle { @@ -73,25 +77,77 @@ impl CpuKvHandle { Self { layer, kv_dim, - state: None, + k_buf: Vec::new(), + v_buf: Vec::new(), + rows: 0, } } - /// Replace the internal state — used by backend impls that - /// populate the handle from the prefill path (which returns a - /// fresh `SharedKV` rather than appending incrementally). + /// Append one K/V row in place (amortised O(kv_dim) — Vec doubling). + fn append_row(&mut self, k_row: &[f32], v_row: &[f32]) { + debug_assert_eq!(k_row.len(), self.kv_dim); + debug_assert_eq!(v_row.len(), self.kv_dim); + self.k_buf.extend_from_slice(k_row); + self.v_buf.extend_from_slice(v_row); + self.rows += 1; + } + + /// Views over the valid prefix — zero-copy access for the attend half. + fn views(&self) -> Option<(ndarray::ArrayView2<'_, f32>, ndarray::ArrayView2<'_, f32>)> { + if self.rows == 0 { + return None; + } + let n = self.rows * self.kv_dim; + let k = ndarray::ArrayView2::from_shape((self.rows, self.kv_dim), &self.k_buf[..n]) + .expect("k_buf prefix matches rows × kv_dim"); + let v = ndarray::ArrayView2::from_shape((self.rows, self.kv_dim), &self.v_buf[..n]) + .expect("v_buf prefix matches rows × kv_dim"); + Some((k, v)) + } + + /// Replace the buffers from an owned `SharedKV` — prefill and the f32 + /// fallback path still produce whole arrays. fn replace_state(&mut self, kv: SharedKV) { - self.state = Some(kv); + let (k, v) = kv; + debug_assert_eq!(k.shape()[1], self.kv_dim); + self.rows = k.shape()[0]; + self.k_buf.clear(); + self.v_buf.clear(); + match k.as_slice() { + Some(s) => self.k_buf.extend_from_slice(s), + None => self.k_buf.extend(k.iter().copied()), + } + match v.as_slice() { + Some(s) => self.v_buf.extend_from_slice(s), + None => self.v_buf.extend(v.iter().copied()), + } } - fn as_shared_kv(&self) -> Option<&SharedKV> { - self.state.as_ref() + /// Materialise an owned `SharedKV` copy (host reads, f32 fallback). + fn to_shared(&self) -> Option { + if self.rows == 0 { + return None; + } + let n = self.rows * self.kv_dim; + let k = Array2::from_shape_vec((self.rows, self.kv_dim), self.k_buf[..n].to_vec()).ok()?; + let v = Array2::from_shape_vec((self.rows, self.kv_dim), self.v_buf[..n].to_vec()).ok()?; + Some((k, v)) + } + + /// Move the state out as an owned `SharedKV`, leaving the handle empty + /// (the f32 fallback path re-populates via `replace_state`). + fn take_shared(&mut self) -> Option { + let out = self.to_shared(); + self.k_buf.clear(); + self.v_buf.clear(); + self.rows = 0; + out } } impl KvHandleInner for CpuKvHandle { fn cached_len(&self) -> usize { - self.state.as_ref().map_or(0, |(k, _)| k.shape()[0]) + self.rows } fn kv_dim(&self) -> usize { @@ -252,41 +308,24 @@ impl KvDispatch for CpuBackend { // ordered by insertion, and RoPE rotations are applied by the // caller (or by attention_step's underlying function). let h = cpu_handle_mut(handle); - debug_assert_eq!(k_row.len(), h.kv_dim); - debug_assert_eq!(v_row.len(), h.kv_dim); - - let new_k_row = Array2::from_shape_vec((1, k_row.len()), k_row.to_vec()) - .expect("k_row length doesn't match handle's kv_dim"); - let new_v_row = Array2::from_shape_vec((1, v_row.len()), v_row.to_vec()) - .expect("v_row length doesn't match handle's kv_dim"); - - h.state = Some(match h.state.take() { - Some((mut k, mut v)) => { - k.append(ndarray::Axis(0), new_k_row.view()).unwrap(); - v.append(ndarray::Axis(0), new_v_row.view()).unwrap(); - (k, v) - } - None => (new_k_row, new_v_row), - }); + h.append_row(k_row, v_row); } fn clip_kv(&self, handle: &mut KvHandle, window_size: usize) { let h = cpu_handle_mut(handle); - if let Some((k, v)) = h.state.as_mut() { - let rows = k.shape()[0]; - if rows > window_size { - let start = rows - window_size; - let k_slice = k.slice(ndarray::s![start..rows, ..]).to_owned(); - let v_slice = v.slice(ndarray::s![start..rows, ..]).to_owned(); - *k = k_slice; - *v = v_slice; - } + if h.rows > window_size { + let start = h.rows - window_size; + let kv_dim = h.kv_dim; + h.k_buf.copy_within(start * kv_dim..h.rows * kv_dim, 0); + h.v_buf.copy_within(start * kv_dim..h.rows * kv_dim, 0); + h.rows = window_size; + h.k_buf.truncate(h.rows * kv_dim); + h.v_buf.truncate(h.rows * kv_dim); } } fn read_kv_to_host(&self, handle: &KvHandle) -> Option<(Array2, Array2)> { - let h = cpu_handle(handle); - h.state.as_ref().map(|(k, v)| (k.clone(), v.clone())) + cpu_handle(handle).to_shared() } fn attention_step( @@ -298,40 +337,59 @@ impl KvDispatch for CpuBackend { abs_position: usize, index: Option<&dyn crate::KvIndex>, ) -> Option> { - // Default (f32) path: CpuBackend reads f32 attention tensors out of - // `weights.tensors`, which the caller pre-populates via - // `ensure_attn_tensors_dequantised` (the up-front dequant-to-f32 tax). - // - // Opt-in Q4K-direct path (`LARQL_Q4K_DIRECT_ATTN=1`, task #16): when the - // caller has a Q4K `index`, run the projections straight from its packed - // bytes via `quant_matvec` (no dequant), falling back per layer to the - // f32 path if the index lacks Q4K attn bytes / a format is unsupported. - let h = cpu_handle_mut(kv); - let prior_kv = h.as_shared_kv().cloned(); - let f32_path = |prior: Option<&SharedKV>| { - run_attention_block_decode_step_backend( - weights, - query, - layer, - prior, - abs_position, - Some(self), - ) - }; - let (h_post_attn, new_kv) = match index.filter(|_| q4k_direct_attn_enabled()) { - Some(idx) => run_attention_block_decode_step_q4k_direct( + // Opt-in Q4K-direct path (`LARQL_Q4K_DIRECT_ATTN=1`, task #16): project + // the new token (no cache access), APPEND the new K/V row to the + // handle's growable buffers in place (amortised O(kv_dim) — no O(ctx) + // copy), then attend over zero-copy views of the full cache. Falls + // back per layer to the f32 path when the index lacks Q4K attn bytes. + if let Some(idx) = index.filter(|_| q4k_direct_attn_enabled()) { + if let Some(proj) = crate::attention::decode::decode_step_project_q4k_direct( weights, query, layer, - prior_kv.as_ref(), abs_position, self, idx, - ) - .or_else(|| f32_path(prior_kv.as_ref()))?, - None => f32_path(prior_kv.as_ref())?, - }; - h.replace_state(new_kv); + ) { + let h = cpu_handle_mut(kv); + let k_row = proj + .k_new_rope + .as_slice() + .expect("[1, kv_dim] projection row is contiguous"); + let v_row = proj + .v_new + .as_slice() + .expect("[1, kv_dim] projection row is contiguous"); + h.append_row(k_row, v_row); + let (k_all, v_all) = h.views().expect("non-empty after append"); + return crate::attention::decode::decode_step_attend_q4k_direct( + weights, + query, + layer, + &proj.q_rope, + k_all, + v_all, + self, + idx, + ); + } + } + + // Default (f32) path: CpuBackend reads f32 attention tensors out of + // `weights.tensors`, which the caller pre-populates via + // `ensure_attn_tensors_dequantised` (the up-front dequant-to-f32 tax). + // The state moves OUT of the handle (no full-cache clone) and the + // concatenated result moves back in. + let prior_kv = cpu_handle_mut(kv).take_shared(); + let (h_post_attn, new_kv) = run_attention_block_decode_step_backend( + weights, + query, + layer, + prior_kv.as_ref(), + abs_position, + Some(self), + )?; + cpu_handle_mut(kv).replace_state(new_kv); Some(h_post_attn) } diff --git a/crates/larql-inference/examples/ave_demo.rs b/crates/larql-inference/examples/ave_demo.rs index d6b0d819e..90ae0f052 100644 --- a/crates/larql-inference/examples/ave_demo.rs +++ b/crates/larql-inference/examples/ave_demo.rs @@ -13,12 +13,14 @@ //! Per-item telemetry is written as JSON (the A10 lesson: per-item logs turn //! a rerun into a grep). //! -//! Usage: `cargo run --release --example ave_demo -- [VINDEX_DIR]` +//! Usage: `cargo run --release --example ave_demo -- [VINDEX_DIR] [--metal]` +//! (`--metal` needs `--features gpu`; reruns the AT-1 forced-decode leg on +//! the Metal pipeline — the spec §10.5 quantization/backend note.) //! Writes `bench/aim-validation/ave_demo_gemma3-4b.json`. use larql_inference::experts::{ave_generate_kquant, ArithmeticExpert, AveOptions}; use larql_inference::load_tokenizer; -use larql_inference::vindex::generate_kquant_cpu; +use larql_inference::vindex::generate_kquant_cpu_cached; /// (prompt, expected exact answer) — tier-0 explicit forms, incl. the /// 24-digit add (the A10 demo cell: dispatch 0.92 vs native 0.00). @@ -46,8 +48,11 @@ const DISTRACTORS: &[&str] = &[ fn main() { let args: Vec = std::env::args().collect(); + let use_metal = args.iter().any(|a| a == "--metal"); let vindex = args - .get(1) + .iter() + .skip(1) + .find(|a| !a.starts_with("--")) .cloned() .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); let dir = std::path::PathBuf::from(&vindex); @@ -99,7 +104,7 @@ fn main() { .to_vec(); let budget = out.telemetry.answer_tokens.max(expected.len()) + 8; let t1 = std::time::Instant::now(); - let native = generate_kquant_cpu(&mut weights, &tok, &prompt_ids, budget, &index); + let native = generate_kquant_cpu_cached(&mut weights, &tok, &prompt_ids, budget, &index); let native_ms = t1.elapsed().as_millis(); let native_text: String = native.iter().map(|(t, _)| t.as_str()).collect(); let native_tokens = native.len(); @@ -131,6 +136,66 @@ fn main() { )); } + // ── AT-1 on the Metal pipeline (optional): same gate/extract/compute, + // forced decode driven through the backend-routed constrained path. + // Forcing is sampler-level, so this is the spec §10.5 check that the + // drive is backend-independent in practice, at full decode speed. ── + let mut metal_rows = String::new(); + let mut metal_summary: Option<(usize, usize)> = None; + if use_metal { + match metal_backend_boxed() { + Some(backend) => { + use larql_inference::experts::arith::drive::force_decode_backend; + use larql_inference::experts::VirtualExpert; + println!("\n ── AT-1 on Metal (forced decode via backend path) ──"); + let mut ok = 0usize; + for (prompt, expected) in EXPLICIT { + let expr = ave.extract(prompt, None).expect("tier-0 fired above"); + let answer = ave.compute(&expr); + let schedule_ids = ave + .drive(&answer) + .forced_ids(&tok); + let prompt_ids = tok + .encode(*prompt, true) + .expect("encode") + .get_ids() + .to_vec(); + let t0 = std::time::Instant::now(); + let fd = force_decode_backend( + &mut weights, + &tok, + &index, + &*backend, + &prompt_ids, + &schedule_ids, + ); + let ms = t0.elapsed().as_millis(); + let d_ok = fd.emitted.trim() == *expected + && fd.cause == larql_inference::experts::arith::drive::TerminationCause::ScheduleEnd; + ok += usize::from(d_ok); + println!( + " {:<58} metal dispatch: {:<9} [{}tok {}ms {}]", + format!("{prompt:?}"), + if d_ok { "✓ exact" } else { "✗ WRONG" }, + fd.ids.len(), + ms, + fd.cause.label(), + ); + metal_rows.push_str(&format!( + ",{{\"leg\":\"metal\",\"prompt\":{},\"dispatch_ok\":{d_ok},\"emitted\":{},\"termination\":\"{}\",\"ms\":{ms}}}", + serde_json::to_string(prompt).expect("json"), + serde_json::to_string(fd.emitted.trim()).expect("json"), + fd.cause.label(), + )); + } + metal_summary = Some((ok, EXPLICIT.len())); + } + None => { + eprintln!("--metal requested but no Metal backend (build with --features gpu on macOS); skipping Metal leg."); + } + } + } + // ── AT-2: distractor specificity (gate only — no generation needed // to score a false fire) ──────────────────────────────────────────── println!("\n ── AT-2 distractors (false fires must be 0) ──"); @@ -159,6 +224,9 @@ fn main() { println!( " explicit dispatch: {dispatch_ok}/{n_e} exact schedule-end termination: {schedule_end_ok}/{n_e} native: {native_ok}/{n_e}" ); + if let Some((m_ok, m_n)) = metal_summary { + println!(" metal dispatch: {m_ok}/{m_n} exact (backend-routed forced decode)"); + } println!(" distractor false fires: {false_fires}/{n_d} (AT-2 bar: 0)"); // Fire rate on the explicit leg is 1.0 by construction (tier-0), so the // §7 decomposition reduces to fleet == dispatch accuracy there. @@ -171,8 +239,11 @@ fn main() { ); println!(" §7 decomposition residual (explicit leg): {residual:.4} (alarm if ≉ 0)"); + let metal_field = metal_summary + .map(|(ok, n)| format!(",\"metal\":[{ok},{n}]")) + .unwrap_or_default(); let json = format!( - "{{\"experiment\":\"ave_demo\",\"vindex\":{},\"explicit\":[{dispatch_ok},{n_e}],\"schedule_end\":[{schedule_end_ok},{n_e}],\"native\":[{native_ok},{n_e}],\"false_fires\":[{false_fires},{n_d}],\"items\":[{json_rows}]}}", + "{{\"experiment\":\"ave_demo\",\"vindex\":{}{metal_field},\"explicit\":[{dispatch_ok},{n_e}],\"schedule_end\":[{schedule_end_ok},{n_e}],\"native\":[{native_ok},{n_e}],\"false_fires\":[{false_fires},{n_d}],\"items\":[{json_rows}{metal_rows}]}}", serde_json::to_string(&vindex).expect("json"), ); let out_path = "bench/aim-validation/ave_demo_gemma3-4b.json"; @@ -182,3 +253,14 @@ fn main() { println!("\nwrote {out_path}"); } } + +#[cfg(all(feature = "gpu", target_os = "macos"))] +fn metal_backend_boxed() -> Option> { + larql_compute_metal::metal_backend() + .map(|m| Box::new(m) as Box) +} + +#[cfg(not(all(feature = "gpu", target_os = "macos")))] +fn metal_backend_boxed() -> Option> { + None +} diff --git a/crates/larql-inference/examples/ave_direct_layer_bisect.rs b/crates/larql-inference/examples/ave_direct_layer_bisect.rs new file mode 100644 index 000000000..b64e99edf --- /dev/null +++ b/crates/larql-inference/examples/ave_direct_layer_bisect.rs @@ -0,0 +1,166 @@ +//! Per-layer bisect of the direct-matvec decode divergence: gold chain from +//! a staged prefill over prompt+token (with per-layer state capture), direct +//! chain from a decode step over the same cache. The first layer whose input +//! residual diverges names the broken block; K/V row comparison at that +//! layer splits the QKV/RoPE side from the attention-mix/O/FFN side. +//! +//! Usage: `cargo run --release --example ave_direct_layer_bisect -- [VINDEX_DIR]` + +use larql_inference::load_tokenizer; +use larql_inference::vindex::{ + attention_decode_step_native, predict_kquant_decode_step_direct_with_state, + predict_kquant_prefill, predict_kquant_prefill_with_state, +}; +use larql_inference::PerLayerDecodeState; +use ndarray::Array2; + +fn cos_last_vs_first(gold: &Array2, direct: &Array2) -> f32 { + let g = gold.row(gold.nrows() - 1); + let d = direct.row(0); + let dot: f32 = g.iter().zip(d.iter()).map(|(a, b)| a * b).sum(); + let ng: f32 = g.iter().map(|a| a * a).sum::().sqrt(); + let nd: f32 = d.iter().map(|a| a * a).sum::().sqrt(); + if ng == 0.0 || nd == 0.0 { + return f32::NAN; + } + dot / (ng * nd) +} + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + + let prompt_ids = tok + .encode("12 + 7 =", true) + .expect("encode") + .get_ids() + .to_vec(); + + // First token off the prompt prefill (greedy), as in the parity probe. + let (h, _cache_unused, _) = predict_kquant_prefill(&mut weights, &prompt_ids, &index); + let last = h.nrows() - 1; + let h_last = h.slice(ndarray::s![last..last + 1, ..]).to_owned(); + let logits = larql_inference::forward::hidden_to_raw_logits(&weights, &h_last); + let first_id = logits + .iter() + .enumerate() + .filter(|(_, v)| v.is_finite()) + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .map(|(i, _)| i as u32) + .unwrap(); + + // Gold: staged prefill over prompt + first token, capturing per-layer + // h_in / k_new / v_new for every position. + let mut full_ids = prompt_ids.clone(); + full_ids.push(first_id); + let mut gold = PerLayerDecodeState::with_capacity(weights.num_layers); + let _ = predict_kquant_prefill_with_state(&mut weights, &full_ids, &index, Some(&mut gold)); + + // Direct: fresh prompt-only prefill cache, one direct step with capture. + let (_h2, mut cache, _) = predict_kquant_prefill(&mut weights, &prompt_ids, &index); + let mut direct = PerLayerDecodeState::with_capacity(weights.num_layers); + let backend = larql_compute::default_backend(); + let _ = predict_kquant_decode_step_direct_with_state( + &mut weights, + first_id, + &index, + &*backend, + &mut cache, + prompt_ids.len(), + Some(&mut direct), + ) + .expect("direct step"); + + println!( + "{:>5} {:>10} {:>10} {:>10} (h_in[L] = input residual to layer L; k/v = new rows at L)", + "layer", "cos(h_in)", "cos(k_new)", "cos(v_new)" + ); + for layer in 0..weights.num_layers { + let ch = cos_last_vs_first( + &gold.h_in_per_layer[layer].to_array(), + &direct.h_in_per_layer[layer].to_array(), + ); + let ck = cos_last_vs_first( + &gold.k_new_per_layer[layer].to_array(), + &direct.k_new_per_layer[layer].to_array(), + ); + let cv = cos_last_vs_first( + &gold.v_new_per_layer[layer].to_array(), + &direct.v_new_per_layer[layer].to_array(), + ); + let flag = if ch < 0.999 || ck < 0.999 || cv < 0.999 { + " <-- diverged" + } else { + "" + }; + println!("{layer:>5} {ch:>10.6} {ck:>10.6} {cv:>10.6}{flag}"); + } + + // ── Same-input discriminator: feed each layer's GOLD input residual to + // the direct attention block. Any K/V divergence here is the block + // itself (slice bytes / matvec / norm-rope plumbing), not chain + // compounding. ── + println!("\nSame-input per-layer attention block (gold h_in → direct block):"); + println!( + "{:>5} {:>10} {:>10} {:>6} {:>6} {:>6} {:>6}", + "layer", "cos(k_new)", "cos(v_new)", "q_fmt", "k_fmt", "v_fmt", "o_fmt" + ); + let (_h3, cache_fresh, _) = predict_kquant_prefill(&mut weights, &prompt_ids, &index); + #[allow(clippy::needless_range_loop)] + for layer in 0..weights.num_layers { + let gold_h = gold.h_in_per_layer[layer].to_array(); + let h_last = gold_h + .slice(ndarray::s![gold_h.nrows() - 1..gold_h.nrows(), ..]) + .to_owned(); + let kv_entry = cache_fresh[layer].as_ref(); + let Some((_h_post, (k_cat, v_cat))) = attention_decode_step_native( + &weights, + &index, + &*backend, + &h_last, + layer, + kv_entry, + prompt_ids.len(), + ) else { + println!("{layer:>5} block returned None"); + continue; + }; + let ck = cos_last_vs_first(&gold.k_new_per_layer[layer].to_array(), &{ + let n = k_cat.nrows(); + k_cat.slice(ndarray::s![n - 1..n, ..]).to_owned() + }); + let cv = cos_last_vs_first(&gold.v_new_per_layer[layer].to_array(), &{ + let n = v_cat.nrows(); + v_cat.slice(ndarray::s![n - 1..n, ..]).to_owned() + }); + let fmts = index + .attn_kquant_layer_data(layer) + .map(|a| [a[0].1, a[1].1, a[2].1, a[3].1]) + .unwrap_or(["?"; 4]); + let flag = if ck < 0.999 || cv < 0.999 { + " <-- block diverges on SAME input" + } else { + "" + }; + println!( + "{layer:>5} {ck:>10.6} {cv:>10.6} {:>6} {:>6} {:>6} {:>6}{flag}", + fmts[0], fmts[1], fmts[2], fmts[3] + ); + } +} diff --git a/crates/larql-inference/examples/ave_direct_step_parity.rs b/crates/larql-inference/examples/ave_direct_step_parity.rs new file mode 100644 index 000000000..5a52f3c55 --- /dev/null +++ b/crates/larql-inference/examples/ave_direct_step_parity.rs @@ -0,0 +1,113 @@ +//! One-step parity probe: staged (dequant) vs direct-matvec decode step on a +//! real vindex. Discriminates "my generation loop is wrong" from "the direct +//! kernel path diverges on this model" — compare the same single decode step +//! both ways from an identical prefill cache. +//! +//! Usage: `cargo run --release --example ave_direct_step_parity -- [VINDEX_DIR]` + +use larql_inference::load_tokenizer; +use larql_inference::vindex::{ + predict_kquant_decode_step, predict_kquant_decode_step_direct, predict_kquant_prefill, + supports_direct_matvec_decode, +}; + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + + println!( + "supports_direct_matvec_decode: {}", + supports_direct_matvec_decode(&weights, &index) + ); + + let prompt_ids = tok + .encode("12 + 7 =", true) + .expect("encode") + .get_ids() + .to_vec(); + + // Two independent prefills → two identical caches (prefill is staged in + // both worlds; only the decode step differs). + let (h, mut cache_staged, _) = predict_kquant_prefill(&mut weights, &prompt_ids, &index); + let (_h2, mut cache_direct, _) = predict_kquant_prefill(&mut weights, &prompt_ids, &index); + + // Greedy-pick the first token off the prefill logits (shared). + let last = h.nrows() - 1; + let h_last = h.slice(ndarray::s![last..last + 1, ..]).to_owned(); + let logits = larql_inference::forward::hidden_to_raw_logits(&weights, &h_last); + let first_id = logits + .iter() + .enumerate() + .filter(|(_, v)| v.is_finite()) + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .map(|(i, _)| i as u32) + .unwrap(); + println!( + "first greedy token: {} {:?}", + first_id, + tok.decode(&[first_id], true).unwrap_or_default() + ); + + let abs_position = prompt_ids.len(); + let (h_staged, _) = predict_kquant_decode_step( + &mut weights, + first_id, + &index, + &mut cache_staged, + abs_position, + ) + .expect("staged step"); + let backend = larql_compute::default_backend(); + let h_direct = predict_kquant_decode_step_direct( + &mut weights, + first_id, + &index, + &*backend, + &mut cache_direct, + abs_position, + ) + .expect("direct step"); + + // Compare hidden states. + let a = h_staged.row(0); + let b = h_direct.row(0); + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + let max_abs = a + .iter() + .zip(b.iter()) + .map(|(x, y)| (x - y).abs()) + .fold(0f32, f32::max); + println!("hidden cosine(staged, direct): {:.6}", dot / (na * nb)); + println!("hidden max |diff|: {max_abs:.6} norms: staged {na:.3} direct {nb:.3}"); + + // And the next-token view: top-3 from each. + let top3 = |h: &ndarray::Array2| -> Vec<(u32, String)> { + let logits = larql_inference::forward::hidden_to_raw_logits(&weights, h); + let mut idx: Vec = (0..logits.len()).collect(); + idx.sort_by(|&i, &j| logits[j].partial_cmp(&logits[i]).unwrap()); + idx.iter() + .take(3) + .map(|&i| (i as u32, tok.decode(&[i as u32], true).unwrap_or_default())) + .collect() + }; + println!("staged next top-3: {:?}", top3(&h_staged)); + println!("direct next top-3: {:?}", top3(&h_direct)); +} diff --git a/crates/larql-inference/examples/ave_q4k_row_audit.rs b/crates/larql-inference/examples/ave_q4k_row_audit.rs new file mode 100644 index 000000000..625a7aa66 --- /dev/null +++ b/crates/larql-inference/examples/ave_q4k_row_audit.rs @@ -0,0 +1,186 @@ +//! Row-level audit of the two Q4_K decoders on real attention bytes. +//! +//! Same bytes, same activation vector, three readings per row of the K +//! projection: +//! a) `q4k_matvec_into` (the direct-path f32-act kernel), +//! b) `dequantize_q4_k` row → f32 dot (reference decode), +//! c) the staged path's `insert_q4k_layer_tensors` tensor row → dot. +//! A row where (a) disagrees with (b)/(c) pinpoints a super-block decode +//! bug in the matvec kernel; (b) vs (c) checks the two dequantisers +//! against each other. +//! +//! Usage: `cargo run --release --example ave_q4k_row_audit -- [VINDEX_DIR] [LAYERS...]` + +use larql_compute::cpu::ops::q4_common::{dequantize_q4_k, q4k_matvec_into}; +use larql_inference::vindex::{insert_q4k_layer_tensors, remove_layer_tensors}; + +fn main() { + if std::env::var("LARQL_F16_PROBE").is_ok() { + f16_probe(); + } + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let layers: Vec = if args.len() > 2 { + args[2..].iter().filter_map(|a| a.parse().ok()).collect() + } else { + vec![20, 32] // clean control, worst offender + }; + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + + let hidden = weights.hidden_size; + let arch_kv = { + let arch = &*weights.arch; + arch.num_kv_heads_for_layer(0) * arch.head_dim_for_layer(0) + }; + // Deterministic pseudo-random activation (no Math.random in harness + // discipline; LCG is plenty for a kernel audit). + let mut seed = 0x2545F4914F6CDD1Du64; + let x: Vec = (0..hidden) + .map(|_| { + seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + ((seed >> 33) as f32 / (1u64 << 31) as f32) - 0.5 + }) + .collect(); + + const BLOCK_BYTES: usize = 144; + const ELEMS: usize = 256; + let bytes_per_row = (hidden / ELEMS) * BLOCK_BYTES; + + for &layer in &layers { + let attn = index.attn_kquant_layer_data(layer).expect("attn data"); + let (k_bytes, k_fmt) = attn[1]; + println!("\nlayer {layer}: k_fmt={k_fmt} kv_dim={arch_kv} bytes={}", k_bytes.len()); + if k_fmt != "Q4_K" { + println!(" (not Q4_K, skipping)"); + continue; + } + + // Staged tensor for (c). + let k_bytes_owned = k_bytes.to_vec(); + let inserted = insert_q4k_layer_tensors(&mut weights, &index, layer).expect("insert"); + let k_key = weights.arch.attn_k_key(layer); + let w_staged = weights.tensors.get(&k_key).expect("staged K").clone(); + println!(" staged tensor shape: {:?}", w_staged.shape()); + + let mut bad_ab = 0usize; + let mut bad_ac = 0usize; + let mut bad_bc = 0usize; + let mut worst: (usize, f32, f32, f32) = (0, 0.0, 0.0, 0.0); + for r in 0..arch_kv { + let row_bytes = &k_bytes_owned[r * bytes_per_row..(r + 1) * bytes_per_row]; + let mut a = [0.0f32]; + q4k_matvec_into(&mut a, &x, row_bytes, 1, hidden); + let deq = dequantize_q4_k(row_bytes, hidden); + let b: f32 = deq.iter().zip(x.iter()).map(|(w, v)| w * v).sum(); + // (c): staged row — orientation per dequantize_matrix(rows=kv_dim, cols=hidden). + let c: f32 = if w_staged.shape()[0] == arch_kv { + w_staged.row(r).iter().zip(x.iter()).map(|(w, v)| w * v).sum() + } else { + w_staged.column(r).iter().zip(x.iter()).map(|(w, v)| w * v).sum() + }; + let scale = b.abs().max(1e-3); + let dab = (a[0] - b).abs() / scale; + let dac = (a[0] - c).abs() / scale; + let dbc = (b - c).abs() / scale; + if dab > 1e-3 { + bad_ab += 1; + } + if dac > 1e-3 { + bad_ac += 1; + } + if dbc > 1e-3 { + bad_bc += 1; + } + if dab > worst.1 { + worst = (r, dab, a[0], b); + } + } + println!( + " rows with rel-diff > 1e-3 of {arch_kv}: matvec-vs-deq(a,b): {bad_ab} matvec-vs-staged(a,c): {bad_ac} deq-vs-staged(b,c): {bad_bc}" + ); + println!( + " worst row {}: rel {:.4} matvec {:.6} vs dequant-dot {:.6}", + worst.0, worst.1, worst.2, worst.3 + ); + + // Element-level: q4_common dequant vs the staged tensor row, no dot + // products involved. If the decode logic were identical these are + // bit-equal; print the worst element diff found anywhere. + let mut worst_elem: (usize, usize, f32, f32, f32) = (0, 0, 0.0, 0.0, 0.0); + let mut rows_with_elem_diff = 0usize; + for r in 0..arch_kv { + let row_bytes = &k_bytes_owned[r * bytes_per_row..(r + 1) * bytes_per_row]; + let deq = dequantize_q4_k(row_bytes, hidden); + let staged_row = w_staged.row(r); + let mut row_worst = 0f32; + for (i, (b, c)) in deq.iter().zip(staged_row.iter()).enumerate() { + let d = (b - c).abs(); + if d > row_worst { + row_worst = d; + } + if d > worst_elem.4 { + worst_elem = (r, i, *b, *c, d); + } + } + if row_worst > 1e-7 { + rows_with_elem_diff += 1; + } + } + println!( + " element-level: rows with any |Δ|>1e-7: {rows_with_elem_diff}/{arch_kv}; worst at row {} elem {}: q4_common {} vs staged {} (|Δ| {})", + worst_elem.0, worst_elem.1, worst_elem.2, worst_elem.3, worst_elem.4 + ); + // Forensic dump of the worst block: both decoders on the same 144 + // bytes, plus the raw header, so the layout disagreement is visible. + if worst_elem.4 > 0.0 { + let (r, i) = (worst_elem.0, worst_elem.1); + let blk = i / 256; + let row_bytes = &k_bytes_owned[r * bytes_per_row..(r + 1) * bytes_per_row]; + let block = &row_bytes[blk * 144..(blk + 1) * 144]; + println!(" forensic block row {r} block {blk} (elem {i} = in-block {}):", i % 256); + println!(" header[0..16]: {:02x?}", &block[0..16]); + let via_common = dequantize_q4_k(block, 256); + let info = larql_vindex::quant::registry::lookup("Q4_K").expect("registry"); + let via_registry = (info.dequantize)(block, 256).expect("registry decode"); + let e = i % 256; + let lo = e.saturating_sub(4); + let hi = (e + 4).min(255); + println!(" elems {lo}..={hi}:"); + println!(" q4_common: {:?}", &via_common[lo..=hi]); + println!(" registry : {:?}", &via_registry[lo..=hi]); + let n_diff = via_common + .iter() + .zip(via_registry.iter()) + .filter(|(a, b)| (**a - **b).abs() > 1e-7) + .count(); + println!(" elems differing in this block: {n_diff}/256"); + } + remove_layer_tensors(&mut weights, inserted); + } +} + +#[allow(dead_code)] +fn f16_probe() { + // Called from main when LARQL_F16_PROBE=1. + let bits = 0x03feu16; + println!( + "f16(0x03fe): q4_common={:e} models={:e} (true subnormal = 1022*2^-24 = {:e})", + larql_compute::cpu::ops::q4_common::f16_to_f32(bits), + larql_models::quant::half::f16_to_f32(bits), + 1022f32 * 2f32.powi(-24), + ); +} diff --git a/crates/larql-inference/examples/fr3_explicit_rewrite.rs b/crates/larql-inference/examples/fr3_explicit_rewrite.rs new file mode 100644 index 000000000..a2f64dc72 --- /dev/null +++ b/crates/larql-inference/examples/fr3_explicit_rewrite.rs @@ -0,0 +1,186 @@ +//! FR3 **explicit rewrite** — measure whether the model, asked directly, maps +//! an arbitrary relation phrasing to a canonical relation the vindex knows. +//! +//! The template ablation (`fr3_template_ablation`) showed the residual probe is +//! ~chance on UNSEEN phrasings at its probe layer — diversifying training +//! templates didn't fix it. This tests the alternative (chris's call): instead +//! of a phrasing-invariant probe, do an **explicit** model classification — +//! few-shot "word -> relation" — and read the next-token prediction. One forward +//! pass (no probe training), using the model's own language understanding. +//! +//! Three buckets: known synonyms (seat/money/tongue…), harder UNSEEN phrasings +//! (head city / legal tender / spoken language…) — where the probe failed — and +//! distractors (banana/weather) that should map to NONE of the relations. +//! +//! If explicit classification nails the synonyms AND the unseen phrasings while +//! abstaining on distractors, it's the right resolver fallback: probe-first +//! (cheap, rides the model's implicit normalisation when it works), +//! explicit-rewrite-fallback (robust) — the FR2 two-tier shape, for relations. +//! +//! Usage: `cargo run --release --example fr3_explicit_rewrite -- [VINDEX_DIR]` +//! Writes `bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json`. + +use larql_inference::load_tokenizer; +use larql_inference::vindex::predict_kquant; + +/// Canonical relations the vindex knows (the classification target set). +const RELATIONS: &[&str] = &["capital", "currency", "language"]; + +/// (phrasing, expected canonical relation, bucket). `""` = should abstain. +const CASES: &[(&str, &str, &str)] = &[ + // known single-word synonyms + ("seat", "capital", "synonym"), + ("metropolis", "capital", "synonym"), + ("money", "currency", "synonym"), + ("cash", "currency", "synonym"), + ("tongue", "language", "synonym"), + ("speech", "language", "synonym"), + // unseen multi-word phrasings (where the residual probe was ~chance) + ("head city", "capital", "phrasing"), + ("main city", "capital", "phrasing"), + ("legal tender", "currency", "phrasing"), + ("unit of money", "currency", "phrasing"), + ("spoken language", "language", "phrasing"), + ("mother tongue", "language", "phrasing"), + // distractors — no relation should be confidently chosen + ("banana", "", "distractor"), + ("weather", "", "distractor"), + ("altitude", "", "distractor"), +]; + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + eprintln!(" pass a Q4_K gemma3-4b vindex dir as the first arg"); + eprintln!(" (default: output/gemma3-4b-q4k-v2.vindex). Skipping cleanly."); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + + // Few-shot frame: examples are NOT in the test set (no leakage), and they + // pin the candidate space + the "word -> relation" task. + // Candidate set includes a `none` escape so out-of-domain words can abstain + // instead of being forced into the nearest relation (the forced-choice + // confident-wrong fix — the same abstain discipline as FR1's verify). + let rel_list = RELATIONS.join(", "); + let prompt_for = |w: &str| -> String { + format!( + "Map each word to one of: {rel_list}, none.\ncity -> capital\ndollar -> currency\ndialect -> language\nmusic -> none\n{w} ->" + ) + }; + // Does the canonical relation appear as a top-k next token (prefix-matched, + // since a relation may tokenise to a leading sub-word)? + let matches = |preds: &[(String, f64)], canonical: &str| -> Option { + preds.iter().position(|(t, _)| { + let t = t.trim().to_lowercase(); + !t.is_empty() && (canonical.starts_with(&t) || t.starts_with(canonical)) + }) + }; + // Any relation chosen as top-1 (for the distractor abstain check)? + let any_rel_top1 = |preds: &[(String, f64)]| -> Option { + let (t, _) = preds.first()?; + let t = t.trim().to_lowercase(); + RELATIONS + .iter() + .find(|r| !t.is_empty() && (r.starts_with(&t) || t.starts_with(**r))) + .map(|r| r.to_string()) + }; + + println!("\n=== FR3 explicit-rewrite classification on {vindex} ==="); + println!(" few-shot \"word -> relation\" over {{{rel_list}}}; one forward, top-5\n"); + println!(" bucket phrasing → top-1 canonical? top-1∈relations"); + + let (mut syn_ok, mut syn_n) = (0usize, 0usize); + let (mut phr_ok, mut phr_n) = (0usize, 0usize); + let (mut distractor_fires, mut distractor_n) = (0usize, 0usize); + let mut json_rows = String::new(); + + for (w, expected, bucket) in CASES { + let ids = tok + .encode(prompt_for(w).as_str(), true) + .expect("encode") + .get_ids() + .to_vec(); + let preds = predict_kquant(&mut weights, &tok, &ids, 5, &index).predictions; + let top1 = preds + .first() + .map(|(t, _)| t.trim().to_string()) + .unwrap_or_default(); + let rank = if expected.is_empty() { + None + } else { + matches(&preds, expected) + }; + let rel_top1 = any_rel_top1(&preds); + + match *bucket { + "synonym" => { + syn_n += 1; + if rank == Some(0) { + syn_ok += 1; + } + } + "phrasing" => { + phr_n += 1; + if rank == Some(0) { + phr_ok += 1; + } + } + "distractor" => { + distractor_n += 1; + if rel_top1.is_some() { + distractor_fires += 1; + } + } + _ => {} + } + + let hit = match (expected.is_empty(), rank) { + (true, _) => format!( + "(abstain; top-1∈rel: {})", + rel_top1.unwrap_or_else(|| "no".into()) + ), + (false, Some(0)) => "✓ top-1".to_string(), + (false, Some(r)) => format!("rank {}", r + 1), + (false, None) => "✗ absent".to_string(), + }; + println!(" {bucket:<11} {w:<19} → {top1:<12} {hit}"); + json_rows.push_str(&format!( + "{}{{\"w\":\"{w}\",\"bucket\":\"{bucket}\",\"expected\":\"{expected}\",\"top1\":\"{}\",\"rank\":{}}}", + if json_rows.is_empty() { "" } else { "," }, + top1.replace('"', "'"), + rank.map(|r| (r as i64 + 1).to_string()).unwrap_or_else(|| "-1".into()) + )); + } + + println!("\n ── verdict ──"); + println!( + " synonyms top-1: {syn_ok}/{syn_n} unseen phrasings top-1: {phr_ok}/{phr_n} distractor false-fires: {distractor_fires}/{distractor_n}" + ); + println!(" (residual probe was ~0.33 = chance on unseen phrasings at its layer — compare.)"); + println!(" If phrasings ≈ synonyms ≈ high and distractors abstain, wire explicit rewrite as"); + println!(" the resolver fallback (probe-first when confident, else explicit classify)."); + + let json = format!( + "{{\"experiment\":\"fr3_explicit_rewrite\",\"vindex\":\"{vindex}\",\"synonym_top1\":[{syn_ok},{syn_n}],\"phrasing_top1\":[{phr_ok},{phr_n}],\"distractor_fires\":[{distractor_fires},{distractor_n}],\"cases\":[{json_rows}]}}" + ); + let out = "bench/aim-validation/fr3_explicit_rewrite_gemma3-4b.json"; + if let Err(e) = std::fs::write(out, &json) { + eprintln!("warning: could not write {out}: {e}"); + } else { + println!("\nwrote {out}"); + } +} diff --git a/crates/larql-inference/examples/fr3_template_ablation.rs b/crates/larql-inference/examples/fr3_template_ablation.rs new file mode 100644 index 000000000..13bb7969d --- /dev/null +++ b/crates/larql-inference/examples/fr3_template_ablation.rs @@ -0,0 +1,300 @@ +//! FR3 **template ablation** — does training the relation probe over MORE +//! phrasing templates actually make synonym resolution robust to phrasings it +//! has never seen? Validates the multi-template change to the production FR3 +//! resolver (`larql-lql/src/executor/relation_resolver.rs`). +//! +//! Setup: train a relation probe on BASE relations {capital,currency,language} +//! rendered through the first `k` of the resolver's TRAIN templates; test it on +//! the unseen SYNONYMS {seat,money,tongue} rendered through a **held-out** +//! template that appears in NO training set. Sweep `k ∈ {1,2,4}` and read the +//! synonym-classification accuracy at the resolver's probe layer (depth ≈ 0.3). +//! +//! If accuracy rises with `k`, more templates buy genuine phrasing-invariance +//! (the change is justified). If flat, the single template was already enough +//! (the change is harmless but unnecessary). Either way it's a measured call. +//! +//! Usage: `cargo run --release --example fr3_template_ablation -- [VINDEX_DIR] [N_ENTITIES]` +//! Writes `bench/aim-validation/fr3_template_ablation_gemma3-4b.json`. + +use larql_inference::vindex::insert_q4k_layer_tensors; +use larql_inference::{capture_residuals, load_tokenizer}; +use ndarray::{Array1, Array2, Axis}; +use std::collections::HashMap; + +/// Per-layer last-token residuals for one rendered prompt (layer → residual). +type LayerRes = HashMap>; + +/// Layers swept; the resolver's probe layer for a 34-layer model is L10 (0.3·L). +const LAYERS: [usize; 4] = [6, 10, 14, 20]; +/// Relation classes the probe is trained on (label index per class). +const BASE: [(&str, usize); 3] = [("capital", 0), ("currency", 1), ("language", 2)]; +/// Unseen synonyms the probe is tested on (true class index). +const SYN: [(&str, usize); 3] = [("seat", 0), ("money", 1), ("tongue", 2)]; +/// The resolver's training templates (`{r}` relation, `{e}` entity). +const TRAIN_TEMPLATES: &[&str] = &[ + "The {r} of {e} is", + "{e}'s {r} is", + "The {r} of {e}:", + "What is the {r} of {e}? It is", +]; +/// A phrasing that appears in NO training set — the generalization test. +const HELD_OUT_TEMPLATE: &str = "The {r} for {e} would be"; + +const ENTITIES: &[&str] = &[ + "France", "Japan", "Brazil", "Egypt", "Canada", "India", "Germany", "Kenya", +]; + +fn render(t: &str, r: &str, e: &str) -> String { + t.replace("{r}", r).replace("{e}", e) +} + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let n: usize = args + .get(2) + .and_then(|s| s.parse().ok()) + .unwrap_or(6) + .min(ENTITIES.len()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + eprintln!(" pass a Q4_K gemma3-4b vindex dir as the first arg"); + eprintln!(" (default: output/gemma3-4b-q4k-v2.vindex). Skipping cleanly."); + return; + } + let entities = &ENTITIES[..n]; + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + eprintln!("Dequantising {} layers ...", weights.num_layers); + for l in 0..weights.num_layers { + insert_q4k_layer_tensors(&mut weights, &index, l).expect("dequant"); + } + + let cap = |prompt: &str| -> LayerRes { + let ids = tok.encode(prompt, true).expect("encode").get_ids().to_vec(); + capture_residuals(&weights, &ids, &LAYERS) + .into_iter() + .collect() + }; + + // Train captures: BASE × entities × TRAIN_TEMPLATES. Indexed [base][ent][tmpl]. + eprintln!( + "Capturing train set: {} base × {n} ent × {} templates ...", + BASE.len(), + TRAIN_TEMPLATES.len() + ); + let mut train: Vec>> = Vec::new(); + for (r, _) in BASE { + let mut per_ent = Vec::new(); + for e in entities { + let mut per_t = Vec::new(); + for t in TRAIN_TEMPLATES { + per_t.push(cap(&render(t, r, e))); + } + per_ent.push(per_t); + } + train.push(per_ent); + } + // Test captures: SYN × entities × HELD_OUT (unseen phrasing). + eprintln!( + "Capturing held-out test set: {} syn × {n} ent × 1 template ...", + SYN.len() + ); + let mut test: Vec> = Vec::new(); + for (r, _) in SYN { + let mut per_ent = Vec::new(); + for e in entities { + per_ent.push(cap(&render(HELD_OUT_TEMPLATE, r, e))); + } + test.push(per_ent); + } + + println!("\n=== FR3 template ablation on {vindex} (N={n} entities) ==="); + println!(" train BASE {{capital,currency,language}} over k templates; test SYN"); + println!(" {{seat,money,tongue}} on a HELD-OUT phrasing \"{HELD_OUT_TEMPLATE}\" (chance = 0.33)\n"); + println!(" layer k=1 k=2 k=4"); + + let h = train[0][0][0][&LAYERS[0]].len(); + let mut json_rows = String::new(); + for &layer in &LAYERS { + let mut accs = [0f64; 3]; + for (ki, &k) in [1usize, 2, 4].iter().enumerate() { + // Train set = first k templates. + let n_train = BASE.len() * entities.len() * k; + let mut x = Array2::::zeros((n_train, h)); + let mut y = Vec::with_capacity(n_train); + let mut row = 0; + for (bi, (_, lbl)) in BASE.iter().enumerate() { + for per_ent in &train[bi] { + for t_map in per_ent.iter().take(k) { + let v = &t_map[&layer]; + for j in 0..h { + x[[row, j]] = v[j]; + } + y.push(*lbl); + row += 1; + } + } + } + let (xz, mu, sd) = standardize(&x); + let (w, b) = train_probe(&xz, &y, BASE.len(), 400, 0.1, 1e-3); + + // Test on held-out-phrasing synonyms. + let n_test = SYN.len() * entities.len(); + let mut xt = Array2::::zeros((n_test, h)); + let mut yt = Vec::with_capacity(n_test); + let mut r2 = 0; + for (si, (_, lbl)) in SYN.iter().enumerate() { + for ent_map in &test[si] { + let v = &ent_map[&layer]; + for j in 0..h { + xt[[r2, j]] = v[j]; + } + yt.push(*lbl); + r2 += 1; + } + } + let xtz = apply_std(&xt, &mu, &sd); + let pred = predict(&xtz, &w, &b); + let correct = pred.iter().zip(&yt).filter(|(p, t)| p == t).count(); + accs[ki] = correct as f64 / n_test as f64; + } + println!( + " L{:<3} {:.2} {:.2} {:.2}", + layer, accs[0], accs[1], accs[2] + ); + json_rows.push_str(&format!( + "{}{{\"layer\":{},\"acc_k1\":{:.4},\"acc_k2\":{:.4},\"acc_k4\":{:.4}}}", + if json_rows.is_empty() { "" } else { "," }, + layer, + accs[0], + accs[1], + accs[2] + )); + } + + println!("\n ── verdict ──"); + println!(" Read the resolver's probe layer (L10, depth 0.3). If k=4 > k=1 there, more"); + println!(" templates buy real phrasing-invariance on UNSEEN phrasings — the change is"); + println!(" justified. If flat/equal, one template already generalised (change is harmless)."); + + let json = format!( + "{{\"experiment\":\"fr3_template_ablation\",\"vindex\":\"{vindex}\",\"n_entities\":{n},\"held_out_template\":\"{HELD_OUT_TEMPLATE}\",\"layers\":[{json_rows}]}}" + ); + let out = "bench/aim-validation/fr3_template_ablation_gemma3-4b.json"; + if let Err(e) = std::fs::write(out, &json) { + eprintln!("warning: could not write {out}: {e}"); + } else { + println!("\nwrote {out}"); + } +} + +// ── probe math (mirrors relation_resolver + fr3_relation_address) ── + +fn standardize(x: &Array2) -> (Array2, Array1, Array1) { + let (n, h) = x.dim(); + let mut mu = Array1::::zeros(h); + let mut sd = Array1::::zeros(h); + for j in 0..h { + let mut m = 0.0f32; + for i in 0..n { + m += x[[i, j]]; + } + m /= n as f32; + let mut v = 0.0f32; + for i in 0..n { + let d = x[[i, j]] - m; + v += d * d; + } + mu[j] = m; + sd[j] = (v / n as f32).sqrt() + 1e-6; + } + (apply_std(x, &mu, &sd), mu, sd) +} + +fn apply_std(x: &Array2, mu: &Array1, sd: &Array1) -> Array2 { + let (n, h) = x.dim(); + let mut z = x.clone(); + for i in 0..n { + for j in 0..h { + z[[i, j]] = (z[[i, j]] - mu[j]) / sd[j]; + } + } + z +} + +fn softmax_rows(logits: &Array2) -> Array2 { + let (n, c) = logits.dim(); + let mut p = logits.clone(); + for i in 0..n { + let mut mx = f32::NEG_INFINITY; + for j in 0..c { + mx = mx.max(p[[i, j]]); + } + let mut s = 0.0f32; + for j in 0..c { + let e = (p[[i, j]] - mx).exp(); + p[[i, j]] = e; + s += e; + } + for j in 0..c { + p[[i, j]] /= s; + } + } + p +} + +fn train_probe( + x: &Array2, + y: &[usize], + c: usize, + steps: usize, + lr: f32, + l2: f32, +) -> (Array2, Array1) { + let (n, h) = x.dim(); + let mut w = Array2::::zeros((h, c)); + let mut b = Array1::::zeros(c); + for _ in 0..steps { + let logits = x.dot(&w) + &b; + let probs = softmax_rows(&logits); + let mut d = probs; + for i in 0..n { + d[[i, y[i]]] -= 1.0; + } + d /= n as f32; + let gw = x.t().dot(&d) + &(&w * l2); + let gb = d.sum_axis(Axis(0)); + w = &w - &(&gw * lr); + b = &b - &(&gb * lr); + } + (w, b) +} + +fn predict(x: &Array2, w: &Array2, b: &Array1) -> Vec { + let logits = x.dot(w) + b; + let (n, c) = logits.dim(); + (0..n) + .map(|i| { + let mut best = 0usize; + let mut bv = f32::NEG_INFINITY; + for j in 0..c { + if logits[[i, j]] > bv { + bv = logits[[i, j]]; + best = j; + } + } + best + }) + .collect() +} diff --git a/crates/larql-inference/examples/fr_early_exit_decode_projection.rs b/crates/larql-inference/examples/fr_early_exit_decode_projection.rs new file mode 100644 index 000000000..db65a408e --- /dev/null +++ b/crates/larql-inference/examples/fr_early_exit_decode_projection.rs @@ -0,0 +1,241 @@ +//! Decode-loop **measure-first** projection — quantifies the *realizable* +//! gain of terminal-token early-exit in a streaming generation loop, BEFORE +//! committing to the `larql-kv` decode-engine wiring. +//! +//! The KV-cache invariant (incremental decode caches per-layer K/V per +//! position) means early-exit is parity-safe ONLY on the terminal token — skip +//! the tail for a non-terminal token and the next token's attention at those +//! layers loses this position. So for an answer of `T` tokens, at most the last +//! token early-exits; the other `T-1` run the full forward (their KV is needed): +//! +//! blended_speedup(T) = (T · full) / ((T-1) · full + early) +//! +//! and — harsher — the early-exit only fires if the *fact* token is the terminal +//! one. For a natural answer where the fact is mid-sentence ("… is Paris."), the +//! terminal token (".") is not a retrieval, so early-exit fires 0× → 1.0×. +//! +//! This measures `full` (Verified `infer_patched`) and `early` +//! (`infer_patched_early_exit`) per answer-token on the real model, then prints +//! the blended curve so the decode-loop build can be judged on realizable value. +//! +//! Usage: `cargo run --release --example fr_early_exit_decode_projection -- [VINDEX_DIR] [N] [INSTALL_LAYER]` +//! Writes `bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json`. + +use larql_inference::forward::{ + infer_patched, infer_patched_early_exit, KnnRouteMode, KNN_COSINE_THRESHOLD, KNN_VERIFY_TOPK, +}; +use larql_inference::load_tokenizer; +use larql_inference::vindex::insert_q4k_layer_tensors; +use larql_vindex::PatchedVindex; +use std::time::Instant; + +const ENTITIES: &[&str] = &[ + "France", + "Germany", + "Italy", + "Spain", + "Portugal", + "Greece", + "Austria", + "Belgium", + "Netherlands", + "Denmark", + "Norway", + "Sweden", + "Finland", + "Poland", + "Hungary", + "Romania", + "Japan", + "China", + "India", + "Pakistan", + "Thailand", + "Vietnam", + "Indonesia", + "Malaysia", +]; + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let n: usize = args + .get(2) + .and_then(|s| s.parse().ok()) + .unwrap_or(16) + .min(ENTITIES.len()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + eprintln!(" pass a Q4_K gemma3-4b vindex dir as the first arg"); + eprintln!(" (default: output/gemma3-4b-q4k-v2.vindex). Skipping cleanly."); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + let num_layers = weights.num_layers; + let last = num_layers - 1; + let install_layer = args + .get(3) + .and_then(|s| s.parse().ok()) + .unwrap_or(24) + .min(last); + eprintln!("Dequantising {num_layers} layers to f32 ..."); + for layer in 0..num_layers { + insert_q4k_layer_tensors(&mut weights, &index, layer).expect("dequant"); + } + let patched = PatchedVindex::new(index); + + let installed = (n * 3 / 4).max(1).min(n.saturating_sub(1).max(1)); + let entities: Vec = ENTITIES[..n].iter().map(|s| s.to_string()).collect(); + let enc = |p: &str| tok.encode(p, true).expect("encode").get_ids().to_vec(); + + eprintln!("Installing {installed} facts at L{install_layer} ..."); + let mut store = larql_vindex::KnnStore::default(); + for (i, e) in entities.iter().take(installed).enumerate() { + let ids = enc(&format!("The capital of {e} is")); + let key = infer_patched( + &weights, + &tok, + &patched, + None, + &ids, + 1, + &KnnRouteMode::Legacy, + ) + .residuals + .into_iter() + .find(|(l, _)| *l == install_layer) + .map(|(_, v)| v) + .expect("install residual"); + store.add( + install_layer, + key, + i as u32, + e.clone(), + e.clone(), + "capital".to_string(), + 1.0, + ); + } + + // Warm up. + { + let ids = enc("France's capital city is"); + let _ = infer_patched( + &weights, + &tok, + &patched, + Some(&store), + &ids, + 5, + &KnnRouteMode::Verified { + k: KNN_VERIFY_TOPK, + threshold: KNN_COSINE_THRESHOLD, + }, + ); + let _ = infer_patched_early_exit( + &weights, + &tok, + &patched, + Some(&store), + &ids, + 5, + KNN_VERIFY_TOPK, + KNN_COSINE_THRESHOLD, + ); + } + + eprintln!("Timing full vs early answer-token on {installed} installed facts ..."); + let mut full_ns: u128 = 0; + let mut early_ns: u128 = 0; + let mut fired = 0usize; + for e in entities.iter().take(installed) { + let ids = enc(&format!("{e}'s capital city is")); + let t0 = Instant::now(); + let _ = infer_patched( + &weights, + &tok, + &patched, + Some(&store), + &ids, + 5, + &KnnRouteMode::Verified { + k: KNN_VERIFY_TOPK, + threshold: KNN_COSINE_THRESHOLD, + }, + ); + let f = t0.elapsed().as_nanos(); + let t1 = Instant::now(); + let (_, exited) = infer_patched_early_exit( + &weights, + &tok, + &patched, + Some(&store), + &ids, + 5, + KNN_VERIFY_TOPK, + KNN_COSINE_THRESHOLD, + ); + let ee = t1.elapsed().as_nanos(); + if exited { + full_ns += f; + early_ns += ee; + fired += 1; + } + } + + if fired == 0 { + eprintln!("no early-exit fired — cannot project; aborting."); + return; + } + let full = full_ns as f64 / 1e6 / fired as f64; + let early = early_ns as f64 / 1e6 / fired as f64; + let per_tok = full / early; + + println!("\n=== FR early-exit decode-loop projection on {vindex} ==="); + println!( + " resolved L* = {install_layer}/{num_layers}; measured on {fired} fired retrievals\n" + ); + println!( + " per terminal answer-token: full {full:.1} ms vs early {early:.1} ms → {per_tok:.2}×" + ); + println!("\n blended speedup if the FACT is the terminal token (answer length T):"); + println!(" blended(T) = T·full / ((T-1)·full + early)"); + for t in [1usize, 2, 3, 4, 5, 8, 16] { + let blended = (t as f64 * full) / ((t as f64 - 1.0) * full + early); + let pct = 100.0 * (1.0 - 1.0 / blended); + println!(" T={t:<3} → {blended:.2}× ({pct:.0}% faster)"); + } + println!(" T→∞ → 1.00× (the one terminal token is amortised away)"); + println!("\n if the fact is NOT terminal (natural answer, e.g. \"… is Paris.\"): 1.00× (early-exit never fires)"); + + println!("\n ── verdict ──"); + println!( + " Realizable decode-loop value concentrates at T=1 / max_tokens=1 (answer-token-only" + ); + println!(" generation), which the single-forward `INFER … ROUTE VERIFY EXIT` already serves."); + println!( + " A streaming decode-loop build buys terminal-token early-exit only — worth it ONLY if" + ); + println!(" the target workload is dominated by short, answer-token-terminal generations."); + + let json = format!( + "{{\"experiment\":\"fr_early_exit_decode_projection\",\"vindex\":\"{vindex}\",\"install_layer\":{install_layer},\"num_layers\":{num_layers},\"fired\":{fired},\"full_ms\":{full:.4},\"early_ms\":{early:.4},\"per_token_speedup\":{per_tok:.4}}}" + ); + let out = "bench/aim-validation/fr_early_exit_decode_projection_gemma3-4b.json"; + if let Err(e) = std::fs::write(out, &json) { + eprintln!("warning: could not write {out}: {e}"); + } else { + println!("\nwrote {out}"); + } +} diff --git a/crates/larql-inference/src/experts/arith/alu.rs b/crates/larql-inference/src/experts/arith/alu.rs index d7c174ec8..26bd75f4b 100644 --- a/crates/larql-inference/src/experts/arith/alu.rs +++ b/crates/larql-inference/src/experts/arith/alu.rs @@ -489,6 +489,90 @@ mod tests { assert_eq!(e.max_operand_digits(), 6); } + // ── AT-C1 property tests: the expert's one absolute is that emitted + // digits are correct, so the bignum is cross-checked against i128 on + // bulk random ops and on the carry/borrow edge families, and the two + // eval tiers are cross-checked against each other above and below the + // i128 boundary. Seeded — reproducible, no wall-clock dependence. ── + + #[test] + fn property_add_sub_mul_match_i128_on_1000_random_pairs() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(0xA11C1); + for case in 0..1000 { + // Span small to ~18-digit magnitudes so products stay in i128. + let a: i64 = rng.gen(); + let b: i64 = rng.gen(); + let (a, b) = (i128::from(a), i128::from(b)); + let (ba, bb) = (big(&a.to_string()), big(&b.to_string())); + assert_eq!(ba.add(&bb).to_string(), (a + b).to_string(), "case {case}: {a}+{b}"); + assert_eq!(ba.sub(&bb).to_string(), (a - b).to_string(), "case {case}: {a}-{b}"); + assert_eq!(ba.mul(&bb).to_string(), (a * b).to_string(), "case {case}: {a}*{b}"); + } + } + + #[test] + fn property_carry_chain_family() { + // 9…9 + 1 = 10…0 and 10…0 − 1 = 9…9 at every width through 40 + // digits — the all-positions carry/borrow ripple, crossing the + // i128 boundary (39 digits) on the way. + for width in 1..=40 { + let nines = "9".repeat(width); + let one_zeros = format!("1{}", "0".repeat(width)); + assert_eq!(big(&nines).add(&big("1")).to_string(), one_zeros, "width {width}"); + assert_eq!(big(&one_zeros).sub(&big("1")).to_string(), nines, "width {width}"); + // Nines-complement pair sums to all nines (the demo's 24-digit + // construction, generalized): N + (nines − N) = nines. + let n = big(&"4".repeat(width)); + assert_eq!(n.add(&big(&nines).sub(&n)).to_string(), nines, "width {width}"); + } + } + + #[test] + fn property_eval_tiers_agree_on_random_exprs() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(0xA11C1 + 1); + let ops = [Op::Add, Op::Sub, Op::Mul]; + for case in 0..300 { + // 2–4 operands, mixed widths from 1 to 30 digits — exprs land + // on both sides of the i128 fast-path boundary. + let n_operands = rng.gen_range(2..=4); + let mut operands = Vec::new(); + for _ in 0..n_operands { + let width = rng.gen_range(1..=30); + let mut s = String::new(); + s.push(char::from(b'1' + rng.gen_range(0..9u8))); + for _ in 1..width { + s.push(char::from(b'0' + rng.gen_range(0..10u8))); + } + operands.push(big(&s)); + } + let e = Expr { + ops: (1..n_operands).map(|_| ops[rng.gen_range(0..3)]).collect(), + operands, + }; + let via_big = e.eval_big().to_string(); + assert_eq!(e.eval().to_string(), via_big, "case {case}: {e}"); + if let Some(fast) = e.eval_i128() { + assert_eq!(fast.to_string(), via_big, "case {case} fast/big: {e}"); + } + } + } + + #[test] + fn property_mul_widths_against_string_construction() { + // 10^a × 10^b = 10^(a+b): exercises mul_mag length/carry handling + // at controlled widths, including far past i128. + for a in [0usize, 1, 5, 19, 38, 60] { + for b in [0usize, 1, 7, 21, 40] { + let pa = big(&format!("1{}", "0".repeat(a))); + let pb = big(&format!("1{}", "0".repeat(b))); + let expect = format!("1{}", "0".repeat(a + b)); + assert_eq!(pa.mul(&pb).to_string(), expect, "10^{a} * 10^{b}"); + } + } + } + #[test] fn approx_magnitude_tracks_digit_count() { let n = big("999999999999999999999999"); // 24 nines ≈ 1e24 diff --git a/crates/larql-inference/src/experts/arith/drive.rs b/crates/larql-inference/src/experts/arith/drive.rs index 4dbbd2af4..823200cf0 100644 --- a/crates/larql-inference/src/experts/arith/drive.rs +++ b/crates/larql-inference/src/experts/arith/drive.rs @@ -15,7 +15,7 @@ use larql_models::ModelWeights; use larql_vindex::VectorIndex; use tokenizers::Tokenizer; -use crate::vindex::generate_kquant_cpu_constrained; +use crate::vindex::generate_kquant_cpu_constrained_cached; /// Why the forced decode stopped. #[derive(Debug, Clone, PartialEq, Eq)] @@ -66,7 +66,7 @@ pub fn force_decode_kquant( }; } let sched = schedule.to_vec(); - let out = generate_kquant_cpu_constrained( + let out = generate_kquant_cpu_constrained_cached( weights, tokenizer, prompt_ids, @@ -105,6 +105,76 @@ pub fn force_decode_kquant( } } +/// Backend-routed forced decode (the Metal path): same schedule contract as +/// [`force_decode_kquant`], but driven through +/// [`crate::layer_graph::generate_constrained_streaming_sampled`], which runs +/// the fused GPU pipeline when the backend supports Q4_K and falls back to +/// the CPU constrained loop otherwise. Forcing is sampler-level either way — +/// the mask is applied to CPU-resident logits before each pick, so the drive +/// is quantization- and backend-independent by construction (spec §10.5). +pub fn force_decode_backend( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + index: &VectorIndex, + backend: &dyn larql_compute::ComputeBackend, + prompt_ids: &[u32], + schedule: &[u32], +) -> ForcedDecode { + if schedule.is_empty() { + return ForcedDecode { + emitted: String::new(), + ids: Vec::new(), + cause: TerminationCause::ScheduleEnd, + }; + } + let sched = schedule.to_vec(); + let cached_layers = crate::layer_graph::CachedLayerGraph::from_residuals(vec![]); + let result = crate::layer_graph::generate_constrained_streaming_sampled( + weights, + tokenizer, + prompt_ids, + sched.len(), + index, + backend, + &cached_layers, + 0..weights.num_layers, + move |generated: &[u32], logits: &mut Vec| { + let step = generated.len(); + if let Some(&want) = sched.get(step) { + for (i, l) in logits.iter_mut().enumerate() { + if i as u32 != want { + *l = f32::NEG_INFINITY; + } + } + if let Some(l) = logits.get_mut(want as usize) { + if !l.is_finite() { + *l = 0.0; + } + } + } + }, + |_id, _tok, _p| {}, + crate::layer_graph::SamplingConfig::greedy(), + &crate::layer_graph::EosConfig::builtin(), + ); + // `GenerateResult` carries (text, prob) pairs, not ids; under the mask + // the picked id at step i can only be schedule[i], so the emitted count + // recovers the id prefix exactly. + let n = result.tokens.len(); + let ids: Vec = schedule[..n.min(schedule.len())].to_vec(); + let emitted: String = result.tokens.iter().map(|(t, _)| t.as_str()).collect(); + let cause = if n == schedule.len() && result.error.is_none() { + TerminationCause::ScheduleEnd + } else { + TerminationCause::EarlyStop { at: n } + }; + ForcedDecode { + emitted, + ids, + cause, + } +} + #[cfg(test)] mod tests { use super::*; @@ -134,6 +204,31 @@ mod tests { assert_eq!(fd.cause, TerminationCause::ScheduleEnd); } + #[test] + fn force_decode_backend_obeys_schedule_via_cpu_fallback() { + // On a CPU backend the constrained layer-graph path falls back to + // the CPU Q4K loop — the schedule contract must hold identically. + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let tokenizer = make_test_tokenizer(weights.vocab_size); + let backend = larql_compute::default_backend(); + let schedule = vec![2u32, 6, 4]; + let fd = force_decode_backend( + &mut weights, + &tokenizer, + &index, + &*backend, + &[0u32, 1], + &schedule, + ); + assert_eq!(fd.ids, schedule); + assert_eq!(fd.cause, TerminationCause::ScheduleEnd); + + let empty = force_decode_backend(&mut weights, &tokenizer, &index, &*backend, &[0u32], &[]); + assert!(empty.ids.is_empty()); + assert_eq!(empty.cause, TerminationCause::ScheduleEnd); + } + #[test] fn termination_cause_labels() { assert_eq!(TerminationCause::ScheduleEnd.label(), "schedule_end"); diff --git a/crates/larql-inference/src/experts/arith/mod.rs b/crates/larql-inference/src/experts/arith/mod.rs index 636377979..9ba38dcdc 100644 --- a/crates/larql-inference/src/experts/arith/mod.rs +++ b/crates/larql-inference/src/experts/arith/mod.rs @@ -22,7 +22,7 @@ use larql_vindex::VectorIndex; use serde::Serialize; use tokenizers::Tokenizer; -use crate::vindex::generate_kquant_cpu; +use crate::vindex::generate_kquant_cpu_cached; use super::virtual_expert::{DriveSchedule, ExtractMiss, Fire, ResidualTap, Verdict, VirtualExpert}; use alu::{BigInt, Expr}; @@ -207,7 +207,8 @@ pub fn ave_generate_kquant( .map_err(|e| format!("tokenize rewrite prompt: {e}"))? .get_ids() .to_vec(); - let rew = generate_kquant_cpu(weights, tokenizer, &rids, opts.rewrite_max_tokens, index); + let rew = + generate_kquant_cpu_cached(weights, tokenizer, &rids, opts.rewrite_max_tokens, index); rewrite_tokens = rew.len(); let rew_text: String = rew.iter().map(|(t, _)| t.as_str()).collect(); match expert.extract(prompt, Some(&rew_text)) { @@ -283,7 +284,8 @@ fn run_native( prompt_ids: &[u32], opts: &AveOptions, ) -> (String, usize) { - let out = generate_kquant_cpu(weights, tokenizer, prompt_ids, opts.max_native_tokens, index); + let out = + generate_kquant_cpu_cached(weights, tokenizer, prompt_ids, opts.max_native_tokens, index); let n = out.len(); (out.into_iter().map(|(t, _)| t).collect(), n) } diff --git a/crates/larql-inference/src/ffn/local_moe.rs b/crates/larql-inference/src/ffn/local_moe.rs index 5fa1cf1ea..617941063 100644 --- a/crates/larql-inference/src/ffn/local_moe.rs +++ b/crates/larql-inference/src/ffn/local_moe.rs @@ -33,7 +33,7 @@ use larql_models::ModelWeights; use ndarray::Array2; use crate::ffn::WeightFfn; -use crate::vindex::moe_ffn_block_cpu; +use crate::vindex::moe_ffn_block_cpu_with_index; /// In-process MoE [`FfnBackend`] for CPU decode through a `KvEngine`. /// @@ -41,8 +41,13 @@ use crate::vindex::moe_ffn_block_cpu; /// `weights.tensors` — the caller pre-dequantizes the client's Q4K attention + /// dense FFN), and the expert `h2` contribution is computed locally from the /// resident expert weights (no shards). +/// +/// When `index` is set, the dense slab can additionally run quantised-direct +/// under `LARQL_Q4K_DIRECT_FFN=1` (decode steps only); with the flag unset or +/// `index: None` the path is byte-identical to the f32 `WeightFfn` slab. pub struct LocalMoeFfn<'a> { pub weights: &'a ModelWeights, + pub index: Option<&'a larql_vindex::VectorIndex>, } impl<'a> FfnBackend for LocalMoeFfn<'a> { @@ -69,7 +74,7 @@ impl<'a> FfnBackend for LocalMoeFfn<'a> { layer: usize, h_post_attn: &Array2, ) -> Option> { - Some(moe_ffn_block_cpu( + Some(moe_ffn_block_cpu_with_index( self.weights, h_post_attn, layer, @@ -78,6 +83,7 @@ impl<'a> FfnBackend for LocalMoeFfn<'a> { }, None, None, + self.index, )) } } @@ -96,7 +102,7 @@ mod tests { #[test] fn forward_moe_full_layer_returns_finite_combined_output() { let weights = make_test_gemma4_moe_weights(); - let ffn = LocalMoeFfn { weights: &weights }; + let ffn = LocalMoeFfn { weights: &weights, index: None }; let h_post_attn = Array2::::from_elem((2, weights.hidden_size), 0.1); let out = ffn .forward_moe_full_layer(0, &h_post_attn) @@ -130,7 +136,7 @@ mod tests { // (The synthetic fixture's expert weights are tiny, so the delta is // small but strictly non-zero; an identically-zeroed h2 would give a // bit-exact match.) - let local = LocalMoeFfn { weights: &weights }; + let local = LocalMoeFfn { weights: &weights, index: None }; let disconnected = RemoteMoeBackend::new_disconnected(); let remote = RemoteMoeFfn { weights: &weights, @@ -156,7 +162,7 @@ mod tests { #[test] fn dense_fallbacks_and_name() { let weights = make_test_gemma4_moe_weights(); - let ffn = LocalMoeFfn { weights: &weights }; + let ffn = LocalMoeFfn { weights: &weights, index: None }; assert_eq!(ffn.name(), "local-moe"); let x = Array2::::from_elem((2, weights.hidden_size), 0.1); let dense = ffn.forward(0, &x); diff --git a/crates/larql-inference/src/forward/predict/dense.rs b/crates/larql-inference/src/forward/predict/dense.rs index b9679d98f..abe7897e7 100644 --- a/crates/larql-inference/src/forward/predict/dense.rs +++ b/crates/larql-inference/src/forward/predict/dense.rs @@ -208,6 +208,91 @@ pub fn logits_to_predictions_q4_lm_head( finalize_topk_predictions(logits, tokenizer, top_k) } +/// Decode-loop fast path: argmax over the Q4_K lm_head WITHOUT the +/// full-vocab softmax. Logit scaling (×1/scale), final softcapping +/// (`tanh`-based) and temperature are all strictly monotone, so the argmax +/// over RAW matvec outputs selects the same token as +/// [`logits_to_predictions_q4_lm_head`] — while skipping the serial 262K-wide +/// `exp` pass + the ~3 MB of probability/index temporaries it allocates +/// (sampled at ~4.6% of decode wall time on the 26B). The max itself runs +/// rayon-parallel with a deterministic lowest-index tie-break. +pub fn q4_lm_head_argmax( + weights: &ModelWeights, + h: &Array2, + q4_lm_head: &[u8], + vocab: usize, + tokenizer: &tokenizers::Tokenizer, +) -> Option<(u32, String)> { + use rayon::prelude::*; + + let seq_len = h.shape()[0]; + let norm_offset = weights.arch.norm_weight_offset(); + let h_final = apply_norm(weights, h, weights.arch.final_norm_key(), norm_offset); + let hidden = h_final.shape()[1]; + let last_row: &[f32] = h_final + .row(seq_len - 1) + .to_slice() + .or_else(|| h_final.as_slice())?; + + // Same raw-matvec block as `logits_to_predictions_q4_lm_head`. + let raw = { + use larql_compute::cpu::ops::q4k_q8k_dot::{ + q4k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, + }; + let mut h_q8k = Q8KActivation::with_capacity(hidden); + quantize_x_to_q8k_into(&mut h_q8k, last_row); + let bytes_per_row = (hidden / 256) * 144; + let mut out = vec![0.0f32; vocab]; + const CHUNK_ROWS: usize = 64; + out.par_chunks_mut(CHUNK_ROWS) + .enumerate() + .for_each(|(chunk_idx, chunk)| { + let row_start = chunk_idx * CHUNK_ROWS; + let chunk_len = chunk.len().min(vocab.saturating_sub(row_start)); + if chunk_len == 0 { + return; + } + let w_chunk = + &q4_lm_head[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; + q4k_q8k_matvec_into(&mut chunk[..chunk_len], &h_q8k, w_chunk, chunk_len, hidden); + }); + out + }; + + // Parallel argmax; ties resolve to the lowest index (matches the serial + // `select_nth`/sort behaviour for distinct maxima; deterministic always). + let (best_idx, _) = raw + .par_chunks(8192) + .enumerate() + .map(|(ci, chunk)| { + let mut bi = 0usize; + let mut bv = f32::NEG_INFINITY; + for (i, &v) in chunk.iter().enumerate() { + if v > bv { + bv = v; + bi = i; + } + } + (ci * 8192 + bi, bv) + }) + .reduce( + || (usize::MAX, f32::NEG_INFINITY), + |a, b| { + if b.1 > a.1 || (b.1 == a.1 && b.0 < a.0) { + b + } else { + a + } + }, + ); + if best_idx == usize::MAX { + return None; + } + let id = best_idx as u32; + let decoded = tokenizer.decode(&[id], true).ok()?; + Some((id, decoded)) +} + /// Shared softmax + top-k decode used by both the f32 and Q4 lm_head /// paths. Pulled out so the two flavours diverge only in how they /// compute the raw logits. diff --git a/crates/larql-inference/src/forward/predict/mod.rs b/crates/larql-inference/src/forward/predict/mod.rs index 9261be2db..6845281ac 100644 --- a/crates/larql-inference/src/forward/predict/mod.rs +++ b/crates/larql-inference/src/forward/predict/mod.rs @@ -24,9 +24,9 @@ pub use raw::{ }; pub use dense::{ - logit_lens_top1, logits_to_predictions_pub, logits_to_predictions_q4_lm_head, predict, - predict_from_hidden, predict_from_hidden_with_ffn, predict_with_ffn_trace, - predict_with_temperature, + logit_lens_top1, logits_to_predictions_pub, logits_to_predictions_q4_lm_head, + q4_lm_head_argmax, predict, predict_from_hidden, predict_from_hidden_with_ffn, + predict_with_ffn_trace, predict_with_temperature, }; pub use ffn::{ diff --git a/crates/larql-inference/src/vindex/kquant_forward/cached.rs b/crates/larql-inference/src/vindex/kquant_forward/cached.rs index 7a766f989..1bdd6e19a 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/cached.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/cached.rs @@ -35,7 +35,7 @@ use ndarray::Array2; use crate::attention::{ decode::{gqa_attention_decode_step, run_attention_block_decode_step_backend}, - rope::apply_rope_partial_at, + rope::apply_rope_partial_at_full, run_attention_with_kv_backend, }; use crate::ffn::WeightFfn; @@ -620,15 +620,26 @@ pub fn attention_decode_step_native( Some(norm_w) => rms_norm_heads(&q_full, norm_w, num_q, head_dim, qk_norm_off), None => q_full, }; - let layer_rope_base = arch.rope_base_for_layer(layer); + // RoPE must match the staged path / prefill exactly: override-aware + // base, the per-layer position divisor (Gemma 3 linear rope_scaling + // applies ÷factor on GLOBAL layers only), and llama3 frequency + // scaling. The unscaled `apply_rope_partial_at` here was the direct- + // path divergence on gemma3-4b (global-layer K/Q rope'd at 8× the + // position the prefill cache used — `ave_direct_step_parity`). + let layer_rope_base = crate::forward_overrides::effective_rope_base_for_layer(arch, layer); let rotary_frac = arch.rotary_fraction_for_layer(layer); - let q_rope = apply_rope_partial_at( + let pos_divisor = + crate::forward_overrides::effective_rope_position_divisor_for_layer(arch, layer); + let llama3 = crate::forward_overrides::effective_llama3_rope_scaling(arch); + let q_rope = apply_rope_partial_at_full( &q_normed, num_q, head_dim, layer_rope_base, rotary_frac, abs_position, + pos_divisor, + llama3, ); let k_vec = matvec_q4k_or_q6k_q8k(k_bytes, k_fmt, &h_norm_q8k, kv_dim, hidden)?; @@ -657,13 +668,15 @@ pub fn attention_decode_step_native( Some(norm_w) => rms_norm_heads(&k_full_new, norm_w, num_kv, head_dim, qk_norm_off), None => k_full_new, }; - let k_new_rope = apply_rope_partial_at( + let k_new_rope = apply_rope_partial_at_full( &k_normed, num_kv, head_dim, layer_rope_base, rotary_frac, abs_position, + pos_divisor, + llama3, ); let (k_concat, v_concat) = match kv_entry { @@ -809,36 +822,70 @@ fn run_ffn_decode_step_q4k_direct( let gate_vec = matvec_q4k_or_q6k_q8k(gate_bytes, gate_fmt, &h_in_q8k, intermediate, hidden)?; let up_vec = matvec_q4k_or_q6k_q8k(up_bytes, up_fmt, &h_in_q8k, intermediate, hidden)?; - // Element-wise activation: activation(gate) * up. + // Element-wise activation: activation(gate) * up. Rayon-chunked — the + // per-element math (libm tanh/exp included) is unchanged, so the output + // is bit-identical to the serial loop; the decode sample showed this + // scalar pass serial on the main thread while the workers slept. let mut activated = vec![0.0f32; intermediate]; - match arch.activation() { - larql_models::Activation::GeluTanh => { - let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); - for i in 0..intermediate { - let x = gate_vec[i]; - let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); - let g = 0.5 * x * (1.0 + inner.tanh()); - activated[i] = g * up_vec[i]; - } - } - _ => { - // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. - for i in 0..intermediate { - let x = gate_vec[i]; - let sig = 1.0 / (1.0 + (-x).exp()); - let g = x * sig; - activated[i] = g * up_vec[i]; - } - } + { + use rayon::prelude::*; + let gelu = matches!(arch.activation(), larql_models::Activation::GeluTanh); + let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); + activated + .par_chunks_mut(256) + .zip(gate_vec.par_chunks(256).zip(up_vec.par_chunks(256))) + .for_each(|(a_c, (g_c, u_c))| { + if gelu { + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); + *a = 0.5 * x * (1.0 + inner.tanh()) * u; + } + } else { + // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let sig = 1.0 / (1.0 + (-x).exp()); + *a = x * sig * u; + } + } + }); } // down projection: out = activated @ W_down.T → [hidden]. // Re-quantise the post-activation vector (`intermediate`-wide) for // the down matvec — different input from gate/up. - let mut activated_q8k = Q8KActivation::with_capacity(intermediate); - quantize_x_to_q8k_into(&mut activated_q8k, &activated); + // + // The stored down row width may be PADDED up to a 256-multiple when + // `intermediate` isn't one (e.g. the 26B-A4B hybrid-MoE dense slab: + // intermediate 2112 stored as 2304-col Q6_K rows). Derive the stored + // width from the byte length and zero-pad the activation to match — + // pad columns multiply zero activations, so the result is exact. + let down_sb_bytes = match down_fmt { + "Q4_K" => 144, + "Q6_K" => 210, + _ => return None, + }; + let down_bytes_per_row = down_bytes.len() / hidden; + if down_bytes_per_row == 0 || !down_bytes_per_row.is_multiple_of(down_sb_bytes) { + return None; + } + let stored_cols = + down_bytes_per_row / down_sb_bytes * larql_models::quant::ggml::Q4_K_BLOCK_ELEMS; + if stored_cols < intermediate { + return None; + } + let activated_padded: Vec; + let act_slice: &[f32] = if stored_cols != intermediate { + let mut p = vec![0.0f32; stored_cols]; + p[..intermediate].copy_from_slice(&activated); + activated_padded = p; + &activated_padded + } else { + &activated + }; + let mut activated_q8k = Q8KActivation::with_capacity(stored_cols); + quantize_x_to_q8k_into(&mut activated_q8k, act_slice); let down_vec = - matvec_q4k_or_q6k_q8k(down_bytes, down_fmt, &activated_q8k, hidden, intermediate)?; + matvec_q4k_or_q6k_q8k(down_bytes, down_fmt, &activated_q8k, hidden, stored_cols)?; let mut out = vec_to_2d_row(down_vec); if let Some(bias) = arch .ffn_down_bias_key(layer) @@ -1121,6 +1168,53 @@ mod tests { // ── predict_kquant_decode_step_direct (Q4K × Q8K sdot path) ──────────── + /// The direct step must TRACK the staged step, not merely stay finite: + /// same prefill cache, same token, same position → high-cosine hidden + /// agreement. (The q4_common f16 subnormal bug passed the finite-only + /// check below while garbling chained generation on real models — + /// see `examples/ave_direct_step_parity.rs`.) + #[test] + fn predict_kquant_decode_step_direct_tracks_staged_step() { + let token_ids = vec![1u32, 2, 3]; + + let mut fx_a = Q4KTestFixtures::build(); + let (_, mut cache_a, _) = + predict_kquant_prefill(&mut fx_a.weights, &token_ids, &fx_a.index); + let (h_staged, _) = + predict_kquant_decode_step(&mut fx_a.weights, 4, &fx_a.index, &mut cache_a, 3) + .expect("staged step"); + + let mut fx_b = Q4KTestFixtures::build(); + let (_, mut cache_b, _) = + predict_kquant_prefill(&mut fx_b.weights, &token_ids, &fx_b.index); + let backend = CpuBackend; + let h_direct = predict_kquant_decode_step_direct( + &mut fx_b.weights, + 4, + &fx_b.index, + &backend, + &mut cache_b, + 3, + ) + .expect("direct step"); + + let a = h_staged.row(0); + let b = h_direct.row(0); + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + let cos = dot / (na * nb); + assert!( + cos > 0.999, + "direct step diverged from staged step: cosine {cos} (norms {na} vs {nb})" + ); + let ratio = if na > nb { na / nb } else { nb / na }; + assert!( + ratio < 1.05, + "direct step norm drifted from staged: {na} vs {nb}" + ); + } + #[test] fn predict_kquant_decode_step_direct_returns_finite_hidden() { let mut fx = Q4KTestFixtures::build(); @@ -1462,4 +1556,129 @@ mod branch_tests { let t = CachedTimings::default(); assert_eq!(t.dequant_ms, 0.0); } + + /// Padded-down handling: when the stored down rows are wider than the + /// layer's `intermediate` (256-padded — the 26B-A4B hybrid-MoE dense + /// slab stores intermediate 2112 as 2304-col rows), the direct FFN + /// step derives the stored width from the byte length, zero-pads the + /// activation, and produces the same output as the unpadded layout: + /// the real 256-element quant blocks are bit-identical and the pad + /// blocks multiply zero activations. + #[test] + fn ffn_decode_step_native_padded_down_matches_unpadded() { + use crate::test_utils::arc_mmap_from_bytes; + use larql_compute::cpu::ops::q4_common::quantize_q4_k; + + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let hidden = weights.hidden_size; + let h_post_attn = ndarray::Array2::from_shape_vec( + (1, hidden), + (0..hidden) + .map(|i| ((i as f32) * 0.013).sin() * 0.05) + .collect(), + ) + .unwrap(); + let backend = CpuBackend; + let baseline = ffn_decode_step_native(&weights, &index, &backend, &h_post_attn, 0) + .expect("unpadded direct FFN step"); + + // Rebuild the interleaved storage with every down matrix stored + // 256-padded: [hidden, inter] → [hidden, inter + 256], zero cols. + let arch = &*weights.arch; + let mut payload: Vec = Vec::new(); + let mut manifest: Vec<(usize, usize, String)> = Vec::new(); + for layer in 0..weights.num_layers { + for (key, pad) in [ + (arch.ffn_gate_key(layer), false), + (arch.ffn_up_key(layer), false), + (arch.ffn_down_key(layer), true), + ] { + let tensor = weights + .tensors + .get(&key) + .unwrap_or_else(|| panic!("missing tensor {key}")); + let bytes = if pad { + let rows = tensor.shape()[0]; + let cols = tensor.shape()[1]; + let padded_cols = cols + 256; + let mut padded = vec![0.0f32; rows * padded_cols]; + for r in 0..rows { + let src = tensor.row(r).to_vec(); + padded[r * padded_cols..r * padded_cols + cols].copy_from_slice(&src); + } + quantize_q4_k(&padded) + } else { + quantize_q4_k(tensor.as_slice().expect("contiguous row-major")) + }; + let offset = payload.len(); + manifest.push((offset, bytes.len(), "Q4_K".to_string())); + payload.extend_from_slice(&bytes); + } + } + let mut index_padded = make_test_q4k_vindex(&weights); + { + let storage = std::sync::Arc::make_mut(&mut index_padded.storage); + storage.set_interleaved_kquant(arc_mmap_from_bytes(&payload), Some(manifest)); + } + assert_eq!( + index_padded.num_features(0), + index.num_features(0), + "down padding must not change the derived intermediate width \ + (num_features comes from the gate manifest)" + ); + + let padded_out = ffn_decode_step_native(&weights, &index_padded, &backend, &h_post_attn, 0) + .expect("padded direct FFN step"); + + let max_abs = baseline + .iter() + .zip(padded_out.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0_f32, f32::max); + assert!( + max_abs <= 1e-5, + "padded-down output must match unpadded layout (max_abs={max_abs})" + ); + } + + /// The padded-down derivation must reject byte lengths that aren't a + /// whole number of super-blocks per row (corrupt / mismatched store) + /// rather than computing with a truncated width. + #[test] + fn ffn_decode_step_native_rejects_ragged_down_bytes() { + use crate::test_utils::arc_mmap_from_bytes; + use larql_compute::cpu::ops::q4_common::quantize_q4_k; + + let weights = make_test_q4k_weights(); + let arch = &*weights.arch; + let mut payload: Vec = Vec::new(); + let mut manifest: Vec<(usize, usize, String)> = Vec::new(); + for layer in 0..weights.num_layers { + for key in [ + arch.ffn_gate_key(layer), + arch.ffn_up_key(layer), + arch.ffn_down_key(layer), + ] { + let tensor = weights.tensors.get(&key).expect("fixture tensor"); + let mut bytes = quantize_q4_k(tensor.as_slice().expect("contiguous")); + if key == arch.ffn_down_key(layer) { + bytes.truncate(bytes.len() - 7); // ragged: not a whole super-block + } + let offset = payload.len(); + manifest.push((offset, bytes.len(), "Q4_K".to_string())); + payload.extend_from_slice(&bytes); + } + } + let mut index = make_test_q4k_vindex(&weights); + { + let storage = std::sync::Arc::make_mut(&mut index.storage); + storage.set_interleaved_kquant(arc_mmap_from_bytes(&payload), Some(manifest)); + } + let h = ndarray::Array2::::from_elem((1, weights.hidden_size), 0.01); + assert!( + ffn_decode_step_native(&weights, &index, &CpuBackend, &h, 0).is_none(), + "ragged down byte length must fall back (return None), not mis-stride" + ); + } } diff --git a/crates/larql-inference/src/vindex/kquant_forward/generation.rs b/crates/larql-inference/src/vindex/kquant_forward/generation.rs index e6163860d..2867fc3c9 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/generation.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/generation.rs @@ -99,6 +99,126 @@ pub fn generate_kquant_cpu_remote( out } +/// KV-cached autoregressive generation: one prefill over the prompt, then +/// one [`super::cached::predict_kquant_decode_step`] per token — O(n) decode +/// instead of the full-recompute O(n²) of [`generate_kquant_cpu`]. Greedy. +/// +/// Falls back to the naive loop when the arch doesn't support cached decode +/// (hybrid MoE, KV-shared layers — see `supports_cached_decode`). +pub fn generate_kquant_cpu_cached( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + prompt_ids: &[u32], + max_tokens: usize, + index: &VectorIndex, +) -> Vec<(String, u32)> { + generate_kquant_cpu_constrained_cached( + weights, + tokenizer, + prompt_ids, + max_tokens, + index, + |_, _| {}, + ) +} + +/// KV-cached variant of [`generate_kquant_cpu_constrained`]: same mask +/// contract (called on raw logits before each greedy pick, `-inf` to +/// exclude), same EOS policy, prefill + per-token decode steps instead of +/// full recompute. Falls back to the naive loop when cached decode is +/// unsupported for the arch. +pub fn generate_kquant_cpu_constrained_cached( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + prompt_ids: &[u32], + max_tokens: usize, + index: &VectorIndex, + mut mask_fn: M, +) -> Vec<(String, u32)> +where + M: FnMut(&[u32], &mut Vec), +{ + if !super::cached::supports_cached_decode(weights) { + return generate_kquant_cpu_constrained( + weights, + tokenizer, + prompt_ids, + max_tokens, + index, + mask_fn, + ); + } + let mut out: Vec<(String, u32)> = Vec::with_capacity(max_tokens); + if max_tokens == 0 || prompt_ids.is_empty() { + return out; + } + + let eos = crate::layer_graph::EosConfig::builtin(); + let mut sampler = + crate::layer_graph::Sampler::new(crate::layer_graph::SamplingConfig::greedy()); + let mut generated: Vec = Vec::with_capacity(max_tokens); + + let (h, mut cache, _timings) = super::cached::predict_kquant_prefill(weights, prompt_ids, index); + let last = h.nrows().saturating_sub(1); + let mut h_last = h.slice(ndarray::s![last..last + 1, ..]).to_owned(); + + // Prefer the dequant-free direct-matvec step — the staged step's cost + // is dominated by re-dequantising every layer's tensors. Parity with + // the staged step was restored by the q4_common f16 subnormal fix + // (subnormal scales decoded 2× → garbled K on outlier layers; probe + // `examples/ave_direct_step_parity.rs`, post-fix hidden cosine + // 0.99995, identical top-k). `LARQL_DIRECT_DECODE_STEP=0` forces the + // staged step for A/B runs. + let direct = std::env::var("LARQL_DIRECT_DECODE_STEP").map(|v| v != "0").unwrap_or(true) + && super::cached::supports_direct_matvec_decode(weights, index); + let backend = larql_compute::default_backend(); + + for step in 0..max_tokens { + let mut logits = crate::forward::hidden_to_raw_logits(weights, &h_last); + mask_fn(&generated, &mut logits); + + let id = match sampler.sample_with_history(&logits, &generated) { + Some(id) => id, + None => break, + }; + // Same sanity bail as the naive loop: a non-finite pick means the + // mask wiped everything. + let score = *logits.get(id as usize).unwrap_or(&f32::NEG_INFINITY); + if !score.is_finite() { + break; + } + let tok = tokenizer.decode(&[id], true).unwrap_or_default(); + let stop = eos.is_eos_with_tokenizer(id, &tok, tokenizer); + out.push((tok, id)); + generated.push(id); + if stop || step + 1 == max_tokens { + break; + } + + // Feed the picked token through one cached step; its absolute RoPE + // position is prompt_len + step. + let abs_position = prompt_ids.len() + step; + let h_next = if direct { + super::cached::predict_kquant_decode_step_direct( + weights, + id, + index, + &*backend, + &mut cache, + abs_position, + ) + } else { + super::cached::predict_kquant_decode_step(weights, id, index, &mut cache, abs_position) + .map(|(h, _t)| h) + }; + match h_next { + Some(h) => h_last = h, + None => break, + } + } + out +} + /// Constrained variant of [`generate_kquant_cpu`]. Greedy under the mask. pub fn generate_kquant_cpu_constrained( weights: &mut ModelWeights, @@ -350,6 +470,78 @@ mod tests { assert!(out.len() <= 2); } + /// Parity gate for the KV-cached decode loop: on the same fixture, + /// prompt and mask, the cached path must emit exactly the ids the + /// naive full-recompute path emits (the timing win is only real on a + /// kernel that says the same thing — parity before tok/s). + #[test] + fn cached_constrained_matches_naive_loop_on_fixture() { + let index; + let tokenizer; + let naive = { + let mut weights = make_test_q4k_weights(); + index = make_test_q4k_vindex(&weights); + tokenizer = make_test_tokenizer(weights.vocab_size); + generate_kquant_cpu_constrained( + &mut weights, + &tokenizer, + &[0u32, 1, 2], + 4, + &index, + |_, _| {}, + ) + }; + // Fresh weights for the cached run — both paths mutate layer + // tensor scratch, a shared instance would hide state leakage. + let mut weights = make_test_q4k_weights(); + let cached = generate_kquant_cpu_constrained_cached( + &mut weights, + &tokenizer, + &[0u32, 1, 2], + 4, + &index, + |_, _| {}, + ); + let naive_ids: Vec = naive.iter().map(|(_, id)| *id).collect(); + let cached_ids: Vec = cached.iter().map(|(_, id)| *id).collect(); + assert_eq!(naive_ids, cached_ids); + } + + /// Cached path under a forcing mask emits exactly the forced ids, and + /// the unconstrained wrapper runs. + #[test] + fn cached_constrained_obeys_a_forcing_mask() { + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let tokenizer = make_test_tokenizer(weights.vocab_size); + let schedule = [7u32, 3, 9]; + let out = generate_kquant_cpu_constrained_cached( + &mut weights, + &tokenizer, + &[0u32, 1], + schedule.len(), + &index, + |generated, logits| { + let want = schedule[generated.len()]; + for (i, l) in logits.iter_mut().enumerate() { + if i as u32 != want { + *l = f32::NEG_INFINITY; + } + } + if let Some(l) = logits.get_mut(want as usize) { + if !l.is_finite() { + *l = 0.0; + } + } + }, + ); + let ids: Vec = out.iter().map(|(_, id)| *id).collect(); + assert_eq!(ids, schedule); + + let plain = generate_kquant_cpu_cached(&mut weights, &tokenizer, &[0u32, 1], 2, &index); + assert!(plain.len() <= 2); + } + /// `generate_kquant_cpu_constrained_streaming` wraps the sampled /// variant with `SamplingConfig::greedy()`. Drives lines 133, 146-156 /// — the body just forwards to the sampled variant. diff --git a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs index 63366dd60..91753f6f8 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs @@ -161,6 +161,35 @@ pub fn moe_ffn_block_cpu( ffn: &dyn crate::ffn::FfnBackend, ple_input: Option<&Array2>, moe_remote: Option<&crate::ffn::RemoteMoeBackend>, +) -> Array2 { + moe_ffn_block_cpu_with_index(weights, h_post_attn, layer, ffn, ple_input, moe_remote, None) +} + +/// `LARQL_Q4K_DIRECT_FFN=1` routes the hybrid-MoE *dense slab* through the +/// direct Q4_K/Q6_K matvec (`ffn_decode_step_native`) instead of the +/// f32-resident `run_ffn` — on the 26B-A4B this drops the slab's per-token +/// traffic ~7× (2.14 GB f32 → ~0.3 GB quantised). Decode-only (single-row): +/// prefill stays on the f32 BLAS gemm, where repeated quantised matvec +/// loses (the task-#16 prefill falsification). Default off = byte-identical. +fn q4k_direct_ffn_enabled() -> bool { + use std::sync::OnceLock; + static ON: OnceLock = OnceLock::new(); + *ON.get_or_init(|| std::env::var("LARQL_Q4K_DIRECT_FFN").as_deref() == Ok("1")) +} + +/// Index-aware variant of [`moe_ffn_block_cpu`]: when `index` is provided +/// (the resident engine path threads it) and `LARQL_Q4K_DIRECT_FFN=1`, the +/// dense `h1` contribution reads quantised gate/up/down bytes directly; +/// otherwise byte-identical to [`moe_ffn_block_cpu`]. +#[allow(clippy::too_many_arguments)] +pub fn moe_ffn_block_cpu_with_index( + weights: &ModelWeights, + h_post_attn: &Array2, + layer: usize, + ffn: &dyn crate::ffn::FfnBackend, + ple_input: Option<&Array2>, + moe_remote: Option<&crate::ffn::RemoteMoeBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Array2 { let arch = &*weights.arch; let norm_offset = arch.norm_weight_offset(); @@ -175,7 +204,23 @@ pub fn moe_ffn_block_cpu( } let _t_dense = std::time::Instant::now(); - let (h_post_ffn_dense, _) = crate::forward::run_ffn(weights, h_post_attn, layer, ffn, false); + // Dense slab: quantised-direct on the decode step when enabled, with a + // per-layer fallback to the f32 path (`ffn_decode_step_native` returns + // `None` on unsupported formats/shapes). + let h_post_ffn_dense = index + .filter(|_| q4k_direct_ffn_enabled() && h_post_attn.nrows() == 1) + .and_then(|idx| { + super::cached::ffn_decode_step_native( + weights, + idx, + &larql_compute::CpuBackend, + h_post_attn, + layer, + ) + }) + .unwrap_or_else(|| { + crate::forward::run_ffn(weights, h_post_attn, layer, ffn, false).0 + }); crate::decode_stages::record_dense(_t_dense.elapsed().as_nanos()); let h1 = &h_post_ffn_dense - h_post_attn; @@ -193,6 +238,10 @@ pub fn moe_ffn_block_cpu( } } } else { + // Local experts count toward the expert stage too (`LARQL_DECODE_STAGES`) + // — previously only the remote branch recorded, so in-process MoE + // decode showed 0 expert time and the split was unusable. + let _t_expert = std::time::Instant::now(); let moe_weights = crate::layer_graph::pipeline_layer::build_moe_weights(weights, arch, layer); if let Some(ref moe) = moe_weights { @@ -209,6 +258,7 @@ pub fn moe_ffn_block_cpu( *dst = *src; } } + crate::decode_stages::record_expert(_t_expert.elapsed().as_nanos()); } else { let out = h_post_ffn_dense; let mut h_ple = diff --git a/crates/larql-inference/src/vindex/kquant_forward/mod.rs b/crates/larql-inference/src/vindex/kquant_forward/mod.rs index f80074664..4d8e48873 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/mod.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/mod.rs @@ -28,11 +28,12 @@ pub use cached::{ pub(crate) use generation::generate_kquant_cpu_constrained_streaming_sampled_with_eos; pub use generation::{ - generate_kquant_cpu, generate_kquant_cpu_constrained, - generate_kquant_cpu_constrained_streaming, generate_kquant_cpu_constrained_streaming_sampled, - generate_kquant_cpu_remote, is_end_of_turn, predict_kquant, + generate_kquant_cpu, generate_kquant_cpu_cached, generate_kquant_cpu_constrained, + generate_kquant_cpu_constrained_cached, generate_kquant_cpu_constrained_streaming, + generate_kquant_cpu_constrained_streaming_sampled, generate_kquant_cpu_remote, + is_end_of_turn, predict_kquant, }; -pub use hidden::{moe_ffn_block_cpu, predict_kquant_hidden}; +pub use hidden::{moe_ffn_block_cpu, moe_ffn_block_cpu_with_index, predict_kquant_hidden}; pub use hooks::predict_kquant_hidden_hooked; pub use interventions::{ predict_kquant_hidden_with_mapped_head_residual_delta, diff --git a/crates/larql-inference/src/vindex/mod.rs b/crates/larql-inference/src/vindex/mod.rs index ba6b140eb..c012635d5 100644 --- a/crates/larql-inference/src/vindex/mod.rs +++ b/crates/larql-inference/src/vindex/mod.rs @@ -15,11 +15,13 @@ pub use dequant::ensure_attn_tensors_dequantised; pub(crate) use kquant_forward::generate_kquant_cpu_constrained_streaming_sampled_with_eos; pub use kquant_forward::{ attention_decode_step_native, ffn_decode_step_native, fused_decode_step, - fused_decode_step_with_state, fused_prefill, generate_kquant_cpu, - generate_kquant_cpu_constrained, generate_kquant_cpu_constrained_streaming, + fused_decode_step_with_state, fused_prefill, generate_kquant_cpu, generate_kquant_cpu_cached, + generate_kquant_cpu_constrained, generate_kquant_cpu_constrained_cached, + generate_kquant_cpu_constrained_streaming, generate_kquant_cpu_constrained_streaming_sampled, generate_kquant_cpu_remote, insert_q4k_layer_tensors, is_end_of_turn, kquant_ffn_forward_layer, - kquant_ffn_forward_layer_q8k, moe_ffn_block_cpu, predict_kquant, predict_kquant_decode_step, + kquant_ffn_forward_layer_q8k, moe_ffn_block_cpu, moe_ffn_block_cpu_with_index, predict_kquant, + predict_kquant_decode_step, predict_kquant_decode_step_direct, predict_kquant_decode_step_direct_with_state, predict_kquant_hidden, predict_kquant_hidden_hooked, predict_kquant_hidden_with_ffn, predict_kquant_hidden_with_mapped_head_residual_delta, diff --git a/crates/larql-inference/tests/test_q4k_cached_parity.rs b/crates/larql-inference/tests/test_q4k_cached_parity.rs index 4c465b134..ac0b61e60 100644 --- a/crates/larql-inference/tests/test_q4k_cached_parity.rs +++ b/crates/larql-inference/tests/test_q4k_cached_parity.rs @@ -226,4 +226,44 @@ fn direct_matvec_decode_matches_dequant_path() { "direct and dequant decode disagree on every position — looks like a structural bug, \ not Q8 rounding drift: {direct_ids:?} vs {dequant_ids:?}" ); + + // ── One-step hidden-state cosine gate ───────────────────────────── + // Token-level "any match" tolerates a lot; the hidden state does not. + // From identical caches and an identical input token, the direct step + // must track the dequant step at ≥0.999 cosine. The q4_common f16 + // subnormal bug sat at 0.929 here (post-QK-norm K corruption on + // subnormal-scale blocks) while this test's token assertions passed. + let (h_a, mut cache_a, _) = predict_kquant_prefill(&mut weights_a, &prompt_ids, &index); + let first = argmax_token(&weights_a, &tokenizer, &h_a); + let (_, mut cache_b, _) = predict_kquant_prefill(&mut weights_b, &prompt_ids, &index); + let h_direct = predict_kquant_decode_step_direct( + &mut weights_a, + first, + &index, + &backend, + &mut cache_a, + prompt_ids.len(), + ) + .expect("direct step"); + let (h_dequant, _) = predict_kquant_decode_step( + &mut weights_b, + first, + &index, + &mut cache_b, + prompt_ids.len(), + ) + .expect("dequant step"); + let a = h_direct.row(0); + let b = h_dequant.row(0); + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + let cos = dot / (na * nb); + eprintln!("one-step hidden cosine (direct vs dequant): {cos:.6}"); + assert!( + cos >= 0.999, + "direct decode step no longer tracks the dequant step: hidden cosine {cos:.6} \ + (norms {na:.1} vs {nb:.1}) — kernel-level decode divergence, run \ + examples/ave_q4k_row_audit.rs" + ); } diff --git a/crates/larql-kv/src/generation.rs b/crates/larql-kv/src/generation.rs index 1c744fba4..a06c0dbdd 100644 --- a/crates/larql-kv/src/generation.rs +++ b/crates/larql-kv/src/generation.rs @@ -316,7 +316,7 @@ where Err(_) => return Vec::new(), }; - let first = match argmax_next_token(weights, tokenizer, &last_hidden) { + let first = match argmax_next_token_resident(weights, tokenizer, index, &last_hidden) { Some(t) => t, None => return Vec::new(), }; @@ -334,7 +334,7 @@ where Ok(h) => h, Err(_) => break, }; - let (id, tok_str) = match argmax_next_token(weights, tokenizer, &h_step) { + let (id, tok_str) = match argmax_next_token_resident(weights, tokenizer, index, &h_step) { Some(t) => t, None => break, }; @@ -632,6 +632,50 @@ fn argmax_next_token( Some((id, decoded)) } +/// `LARQL_Q4K_LM_HEAD=1` routes the resident-path lm_head matvec through +/// the vindex's Q4_K lm_head view (synthesised from f16 embeddings at load +/// for tied-embedding models) instead of the f32 row-parallel sgemv. On a +/// 262K-vocab head this drops lm_head bandwidth ~4× (e.g. 2.95 GB → 0.42 GB +/// per step on Gemma 4 26B-A4B). Default off = byte-identical f32 path. +fn q4k_lm_head_enabled() -> bool { + use std::sync::OnceLock; + static ON: OnceLock = OnceLock::new(); + *ON.get_or_init(|| std::env::var("LARQL_Q4K_LM_HEAD").as_deref() == Ok("1")) +} + +/// Resident-path argmax: like [`argmax_next_token`] but with the vindex at +/// hand, so the lm_head matvec can run Q4_K-direct under +/// `LARQL_Q4K_LM_HEAD=1`. Falls back to the f32 path when the flag is off +/// or the vindex has no Q4_K lm_head view (untied model without +/// `lm_head_q4.bin`). +fn argmax_next_token_resident( + weights: &ModelWeights, + tokenizer: &larql_inference::tokenizers::Tokenizer, + index: &larql_inference::larql_vindex::VectorIndex, + h_single: &Array2, +) -> Option<(u32, String)> { + if q4k_lm_head_enabled() && index.vocab_size > 0 { + if let Some(q4_bytes) = index.storage.lm_head_kquant_view() { + let _t_lmhead = std::time::Instant::now(); + // Argmax-only fast path: skips the full-vocab softmax + top-k + // temporaries (scaling/softcap/temperature are monotone, so the + // selected token is identical to the softmax route). + let result = larql_inference::forward::predict::q4_lm_head_argmax( + weights, + h_single, + q4_bytes.as_ref(), + index.vocab_size, + tokenizer, + ); + larql_inference::decode_stages::record_lmhead(_t_lmhead.elapsed().as_nanos()); + if let Some((id, decoded)) = result { + return Some((id, decoded)); + } + } + } + argmax_next_token(weights, tokenizer, h_single) +} + fn is_stop_token_str(s: &str) -> bool { matches!( s, diff --git a/crates/larql-lql/src/executor/query/select/edges.rs b/crates/larql-lql/src/executor/query/select/edges.rs index 51997cf49..c01ed040e 100644 --- a/crates/larql-lql/src/executor/query/select/edges.rs +++ b/crates/larql-lql/src/executor/query/select/edges.rs @@ -127,7 +127,7 @@ impl Session { order: Option<&OrderBy>, limit: Option, ) -> Result, LqlError> { - let (path, _config, patched) = self.require_vindex()?; + let (path, config, patched) = self.require_vindex()?; if let Some(nc) = nearest { return self.exec_select_nearest(patched, path, nc, limit); @@ -176,11 +176,27 @@ impl Session { let relations = rc.relation_labels(); let already_exact = relations.iter().any(|r| r.eq_ignore_ascii_case(rel)); if relations.len() >= 2 && !already_exact { - if let Some((canonical, conf)) = - self.resolve_relation_synonym(path, relations, rel) - { + // FR3b two-tier resolve (the FR2 router shape, for relations): + // Tier 1 = the cheap residual probe (synonym-robust, cached); + // Tier 2 = explicit few-shot classification on probe abstain + // (phrasing-robust — the probe is ~chance on unseen phrasings + // at its layer — but a full forward, so opt-in). See + // docs/diagnoses/fr3-explicit-rewrite.md. + let resolved = self + .resolve_relation_synonym(path, relations.clone(), rel) + .map(|(c, conf)| (c, conf, "meaning")) + .or_else(|| { + // Tier 2 candidates = the frequency-ranked relations + // (the meaningful ones), bounded like the probe's set. + let cands = rc.relation_labels_ranked( + crate::executor::relation_resolver::MAX_RELATIONS, + ); + self.resolve_relation_explicit(path, config, &cands, rel) + .map(|(c, conf)| (c, conf, "explicit classification")) + }); + if let Some((canonical, conf, how)) = resolved { notes.push(format!( - " (relation '{rel}' resolved to '{canonical}' by meaning, confidence {conf:.2})" + " (relation '{rel}' resolved to '{canonical}' by {how}, confidence {conf:.2})" )); collect_edges( patched, @@ -234,6 +250,83 @@ impl Session { *self.relation_resolver.borrow_mut() = Some((path.to_path_buf(), built)); result } + + /// FR3b — explicit relation classification (phrasing-robust Tier 2). + /// + /// When the cheap residual probe (Tier 1, [`Self::resolve_relation_synonym`]) + /// abstains, ask the model directly: a few-shot `word -> relation` prompt + /// with a `none` escape, read top-1 from a **full forward** (lm_head). The + /// probe is synonym-robust but *phrasing*-brittle (≈chance at its layer on + /// unseen phrasings like "head city" / "legal tender"); the explicit pass + /// nails both, and the `none` escape stops out-of-domain words ("weather", + /// "altitude") snapping to the nearest relation — the project's recurring + /// confident-wrong trap (cf. FR1's verify gate, FR2's fallback). Measured + /// 12/12 synonyms+phrasings, 0/3 distractor false-fires + /// (`docs/diagnoses/fr3-explicit-rewrite.md`). + /// + /// The resolver only dequantises `0..=probe_layer`, so it cannot run + /// lm_head; Tier 2 goes through `InferenceWeights` (the same path INFER + /// uses). Opt-in via `LARQL_FR3_EXPLICIT` because it is a full forward (plus + /// a model load) per probe-abstain; default off keeps SELECT byte-identical. + fn resolve_relation_explicit( + &self, + path: &std::path::Path, + config: &larql_vindex::VindexConfig, + candidates: &[String], + word: &str, + ) -> Option<(String, f32)> { + // Opt-in: absent var → abstain (the `?` short-circuits to `None`). + std::env::var_os("LARQL_FR3_EXPLICIT")?; + if candidates.len() < 2 { + return None; + } + let mut cb = larql_vindex::SilentLoadCallbacks; + let tokenizer = larql_vindex::load_vindex_tokenizer(path).ok()?; + let mut iw = larql_inference::InferenceWeights::load(path, config, &mut cb).ok()?; + + // Few-shot frame lifted verbatim from examples/fr3_explicit_rewrite.rs: + // the examples pin the "word -> relation" task, and the trailing + // `music -> none` teaches the `none` escape so an out-of-domain word + // abstains instead of snapping to a relation. `candidates` is the + // frequency-ranked, bounded relation set (the meaningful relations, not + // an alphabetical slice — see `relation_labels_ranked`). The + // demonstration mappings are tuned for the country-facts relation set + // (the measured scope); a different relation set should re-verify + // 12/12 + 0/3 before this is load-bearing for it. + let rel_list = candidates.join(", "); + let prompt = format!( + "Map each word to one of: {rel_list}, none.\n\ + city -> capital\ndollar -> currency\ndialect -> language\nmusic -> none\n\ + {word} ->" + ); + let ids = tokenizer + .encode(prompt.as_str(), true) + .ok()? + .get_ids() + .to_vec(); + let result = iw.predict_dense(&tokenizer, &ids, 5); + let (top1, prob) = result.predictions.first()?; + match_relation_top1(candidates, top1).map(|r| (r, *prob as f32)) + } +} + +/// FR3b — `none`-gated prefix match: which canonical relation (if any) does the +/// explicit classifier's top-1 token indicate? `none` and any out-of-domain +/// token match nothing → abstain. A relation may tokenise to a leading +/// sub-word, so prefix-match in either direction (mirrors the harness's +/// `any_rel_top1`). +fn match_relation_top1(relations: &[String], top1: &str) -> Option { + let t = top1.trim().to_lowercase(); + if t.is_empty() { + return None; + } + relations + .iter() + .find(|r| { + let r = r.to_lowercase(); + r.starts_with(&t) || t.starts_with(&r) + }) + .cloned() } /// Dispatch edge collection: walk-anchored when both entity and relation are @@ -609,6 +702,28 @@ mod tests { assert!(out.iter().any(|l| l.contains("[French, Europe]"))); } + #[test] + fn match_relation_top1_accepts_exact_and_subword_relations() { + let rels = vec!["capital".to_string(), "currency".to_string(), "language".to_string()]; + // Full-word top-1 (the common case — these tokenise to one token). + assert_eq!(match_relation_top1(&rels, " capital").as_deref(), Some("capital")); + assert_eq!(match_relation_top1(&rels, "Currency").as_deref(), Some("currency")); + // Leading sub-word still resolves (prefix-match in either direction). + assert_eq!(match_relation_top1(&rels, "lang").as_deref(), Some("language")); + } + + #[test] + fn match_relation_top1_abstains_on_none_and_out_of_domain() { + let rels = vec!["capital".to_string(), "currency".to_string(), "language".to_string()]; + // The `none` escape: top-1 == none → no relation → abstain. + assert_eq!(match_relation_top1(&rels, "none"), None); + // Out-of-domain distractors abstain (the confident-wrong fix). + assert_eq!(match_relation_top1(&rels, "weather"), None); + assert_eq!(match_relation_top1(&rels, "banana"), None); + // Empty / whitespace top-1 abstains rather than panicking. + assert_eq!(match_relation_top1(&rels, " "), None); + } + #[test] fn format_rows_drops_relation_column_when_no_filter_and_no_label() { let row = EdgeRow { diff --git a/crates/larql-lql/src/executor/relation_resolver.rs b/crates/larql-lql/src/executor/relation_resolver.rs index 150035d19..80ffc3fa1 100644 --- a/crates/larql-lql/src/executor/relation_resolver.rs +++ b/crates/larql-lql/src/executor/relation_resolver.rs @@ -28,11 +28,25 @@ const PROBE_ENTITIES: &[&str] = &[ ]; /// Entities a query word is resolved over (majority by averaged probability). const RESOLVE_ENTITIES: &[&str] = &["France", "Japan", "Brazil"]; -/// Bound the one-time build cost. -const MAX_RELATIONS: usize = 64; +/// Cap on how many relations either tier reasons over. A real vindex carries +/// thousands of noisy probe labels; both tiers bound to this many (Tier 1's +/// probe trains on a sorted slice via [`canonical_candidates`]; Tier 2's +/// explicit classifier enumerates the *frequency-ranked* slice — see +/// `RelationClassifier::relation_labels_ranked`). Keeps the probe's one-time +/// cost — and the explicit prompt — sane. +pub(crate) const MAX_RELATIONS: usize = 64; /// Minimum averaged softmax probability to accept a semantic resolution. const MIN_CONFIDENCE: f32 = 0.5; +/// The bounded, deduped, sorted candidate set the Tier-1 probe trains on +/// (one residual key per relation, capped at [`MAX_RELATIONS`]). +fn canonical_candidates(mut relations: Vec) -> Vec { + relations.sort(); + relations.dedup(); + relations.truncate(MAX_RELATIONS); + relations +} + pub(crate) struct RelationResolver { /// Class index → canonical relation label. relations: Vec, @@ -80,11 +94,9 @@ impl RelationResolver { /// Returns `Ok(None)` when there are too few relations to discriminate. pub(crate) fn build( path: &std::path::Path, - mut relations: Vec, + relations: Vec, ) -> Result, LqlError> { - relations.sort(); - relations.dedup(); - relations.truncate(MAX_RELATIONS); + let relations = canonical_candidates(relations); if relations.len() < 2 { return Ok(None); } diff --git a/crates/larql-lql/src/relations.rs b/crates/larql-lql/src/relations.rs index 68bd1f796..22e5facca 100644 --- a/crates/larql-lql/src/relations.rs +++ b/crates/larql-lql/src/relations.rs @@ -142,6 +142,42 @@ impl RelationClassifier { .collect() } + /// Known relation labels ranked by how many features carry each (probe + /// labels per-feature, cluster labels by cluster size), most-common first, + /// keeping the top `top_n`. The frequency-ordered analogue of + /// [`Self::relation_labels`] (which is alphabetical). + /// + /// FR3b's explicit classifier needs the *meaningful* relations in its + /// candidate set — "language", "capital", "currency". An alphabetical + /// top-N silently drops them (e.g. keeps a rare early-alphabet label while + /// cutting "language"), which made "mother tongue" fail to resolve while a + /// niche relation survived. Ranking by frequency keeps the relations that + /// actually populate the index and lets the `none` escape work (rare labels + /// fall out, so an out-of-domain word finds no plausible match). + pub fn relation_labels_ranked(&self, top_n: usize) -> Vec { + let mut counts: std::collections::HashMap = + std::collections::HashMap::new(); + if let Some(clusters) = self.clusters.as_ref() { + for (i, l) in clusters.labels.iter().enumerate() { + *counts.entry(l.clone()).or_insert(0) += clusters.counts.get(i).copied().unwrap_or(0); + } + } + for l in self.probe_labels.values() { + *counts.entry(l.clone()).or_insert(0) += 1; + } + let mut ranked: Vec<(String, usize)> = counts + .into_iter() + .filter(|(l, _)| { + let t = l.trim(); + !t.is_empty() && !t.eq_ignore_ascii_case("unknown") + }) + .collect(); + // Most-common first; alphabetical tie-break for determinism. + ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); + ranked.truncate(top_n); + ranked.into_iter().map(|(l, _)| l).collect() + } + /// Check whether a feature's label is probe-confirmed (vs cluster-assigned). pub fn is_probe_label(&self, layer: usize, feature: usize) -> bool { self.probe_labels.contains_key(&(layer, feature)) @@ -392,6 +428,41 @@ mod tests { assert!(rc.relation_labels().is_empty()); } + // ── relation_labels_ranked (FR3b explicit candidate set) ── + + #[test] + fn relation_labels_ranked_orders_by_feature_count_and_caps() { + // Clusters give capital=100/language=80/continent=60; add a probe + // label "currency" on three features (count 3, the rarest). Ranking is + // most-common first, and top_n caps the tail (currency drops at n=3). + let mut rc = make_test_classifier(); + rc.probe_labels.insert((1, 1), "currency".into()); + rc.probe_labels.insert((2, 2), "currency".into()); + rc.probe_labels.insert((3, 3), "currency".into()); + rc.probe_count = rc.probe_labels.len(); + + assert_eq!( + rc.relation_labels_ranked(10), + vec!["capital", "language", "continent", "currency"], + ); + // The cap keeps the *meaningful* (frequent) head, drops the rare tail — + // the property alphabetical truncation lacked. + assert_eq!( + rc.relation_labels_ranked(3), + vec!["capital", "language", "continent"], + ); + } + + #[test] + fn relation_labels_ranked_filters_empty_and_unknown() { + let mut rc = make_test_classifier(); + if let Some(c) = rc.clusters.as_mut() { + c.labels = vec!["capital".into(), "".into(), "unknown".into()]; + c.counts = vec![100, 50, 50]; + } + assert_eq!(rc.relation_labels_ranked(10), vec!["capital"]); + } + #[test] fn from_nonexistent_vindex() { let rc = RelationClassifier::from_vindex(std::path::Path::new("/nonexistent")); diff --git a/crates/larql-models/src/test_fixtures.rs b/crates/larql-models/src/test_fixtures.rs index 166090d0e..9f7818f48 100644 --- a/crates/larql-models/src/test_fixtures.rs +++ b/crates/larql-models/src/test_fixtures.rs @@ -613,6 +613,44 @@ pub fn make_test_q4k_weights_layers(num_layers: usize) -> ModelWeights { "hidden_activation": "gelu_pytorch_tanh", "rope_theta": 10000.0, }); + q4k_test_weights_from_json(arch_json, num_layers) +} + +/// Rope-scaled sibling of [`make_test_q4k_weights`]: Gemma-3 arch at the +/// same Q4_K-compatible dims, with `sliding_window` + flat linear +/// `rope_scaling` (factor 8) and **6 layers**, so the 5:1 local:global +/// pattern puts a full-attention layer (position divisor 8) inside the +/// model. This is the fixture for direct-vs-staged decode parity on the +/// scaled-RoPE path — [`make_gemma3_rope_scaled_test_weights`] covers the +/// same arch shape but at hidden=16, below Q4_K's 256-element super-block +/// floor, so it cannot drive the direct-matvec kernels. +pub fn make_test_q4k_weights_rope_scaled() -> ModelWeights { + let num_q = 4usize; + let num_kv = 2usize; + let head_dim = Q4K_TEST_HIDDEN / num_q; + let num_layers = 6usize; + + let arch_json = serde_json::json!({ + "model_type": "gemma3_text", + "hidden_size": Q4K_TEST_HIDDEN, + "num_hidden_layers": num_layers, + "intermediate_size": Q4K_TEST_INTER, + "head_dim": head_dim, + "num_attention_heads": num_q, + "num_key_value_heads": num_kv, + "vocab_size": Q4K_TEST_VOCAB, + "hidden_activation": "gelu_pytorch_tanh", + "rope_theta": 10000.0, + "sliding_window": 512, + "rope_scaling": {"rope_type": "linear", "factor": 8.0}, + }); + q4k_test_weights_from_json(arch_json, num_layers) +} + +fn q4k_test_weights_from_json(arch_json: serde_json::Value, num_layers: usize) -> ModelWeights { + let num_q = 4usize; + let num_kv = 2usize; + let head_dim = Q4K_TEST_HIDDEN / num_q; let arch = detect_from_json(&arch_json); let mut tensors: HashMap = HashMap::new(); diff --git a/docs/audits/codebase-review-2026-06-12.md b/docs/audits/codebase-review-2026-06-12.md new file mode 100644 index 000000000..015dc48c4 --- /dev/null +++ b/docs/audits/codebase-review-2026-06-12.md @@ -0,0 +1,224 @@ +# Whole-codebase review — 2026-06-12 + +Follow-up to [`codebase-review-2026-05-28.md`](codebase-review-2026-05-28.md). +Two parts: (1) a working-tree diff review of the C10/FR3 changes in flight +(~1,400 lines across 18 files, pre-commit), then (2) a fresh whole-workspace +sweep (17 crates, ~324K LOC src) — one reader per crate/subsystem plus a +cross-cutting hygiene auditor, with adversarial verification of every +high-severity claim. Only verified findings are listed; refuted claims are +recorded at the bottom because several were the kind that recur. + +This document is the canonical record; prioritized actions are tracked in +[`ROADMAP.md`](../../ROADMAP.md) §"Codebase hardening" under +"Follow-up review (2026-06-12)". + +## Method + +- Diff review: 7 finder angles (line-by-line, removed-behavior, cross-file + tracer, reuse, simplification, efficiency, altitude) → dedup → one + recall-biased verifier per candidate. +- Workspace sweep: 10 subsystem readers (compute, compute-metal, inference, + vindex, kv, lql, server+router, models, cli+small crates, cross-cutting + hygiene) → 6 adversarial verifiers on the headline claims. +- Verdict policy: CONFIRMED/PLAUSIBLE kept, REFUTED dropped (but logged). + +## Verdict + +The numeric core held up well: the new NEON/asm kernels, int8 attention +projections, and Q4K-direct FFN paths in the working tree verified clean on +bounds, cfg-gating, and fallback semantics, and the release-mode-bounds and +GGUF-overflow claims against the kernel/loader core were all refuted on +inspection. Exposure is again concentrated at the edges: one network-facing +path-traversal hole, a Metal backend with zero GPU-error observability, two +dispatch sites that re-introduce the (thrice-bitten) dispatch-geometry bug +class despite `KernelHandle` existing to prevent it, corrupt-vindex panics +(partially overlapping the 2026-05-28 item 1, still unfixed), and a Python +binding that never releases the GIL. Architecturally the dominant debt is +five parallel forward-pass loop implementations and 145 `LARQL_*` env flags +with ~18 documented. + +--- + +## Part 1 — Working-tree diff review (C10 residency + FR3 explicit rewrite) + +Scope: uncommitted changes on `main` after `d9b761f6` (Q4K_ATTN_INT8 path, +Q4K asm kernels, Q4K lm_head, padded-down handling, FR3 two-tier relation +resolution). High-risk code (asm, int8 projections, padding derivation) +verified clean. Surviving findings, ranked: + +1. **`larql-kv/src/generation.rs:657`** — new Q4K lm_head path + (`argmax_next_token_resident`) never validates the kquant buffer length + against `vocab_size × bytes_per_row` before the chunked slice in + `logits_to_predictions_q4_lm_head` (larql-inference + `forward/predict/dense.rs:189`). A truncated `lm_head.weights` panics + mid-decode (safe panic, not OOB); a padded one silently decodes garbage + logits. `load_lm_head_kquant` infers vocab from file size only when + `vocab_size == 0` and never cross-checks otherwise. One + `bytes.len() >= vocab_size * bytes_per_row` check at the view or call + site turns this into a clean f32 fallback. +2. **`larql-compute/src/kquant_forward/cached.rs:861`** (twin: + `larql-inference/src/vindex/kquant_forward/cached.rs:1227`) — padded-down + derivation divides by `hidden` with no zero guard. `hidden` is validated + at GGUF load (`larql-models/loading/gguf/orient.rs:48`), so this is a + defensive gap, not a live crash; fold `hidden == 0` into the existing + `down_bytes_per_row == 0` guard. +3. **Padded-down block duplicated + hot-path alloc** — the ~35-line block + (derive `stored_cols`, allocate `activated_padded`, zero-pad, re-quantize) + exists verbatim in both files above with a comment admitting the lockstep + hazard. On 26B-A4B (intermediate 2112, not a 256-multiple) it allocates + ~30 layers × ~2.3 KB per generated token. Extract one shared helper into + `larql-compute` (larql-inference already imports its Q4K/Q8K API) and + reuse a scratch buffer. +4. **Env-flag value divergence** — the three new flags + (`LARQL_Q4K_ATTN_INT8` decode.rs:271, `LARQL_Q4K_DIRECT_FFN` + hidden.rs:1404, `LARQL_Q4K_LM_HEAD` generation.rs:1528) accept only + `"1"`, while pre-existing `LARQL_Q4K_ASM` accepts `"1"` and `"true"`. + `LARQL_Q4K_ATTN_INT8=true` silently measures the wrong configuration. + Feeds the flag-registry action in Part 2. +5. **`larql-lql/src/executor/query/select/edges.rs:186`** — + `relations.clone()` is unnecessary (not used afterward; the Tier-2 + closure builds its own candidates). Pass by move or change the callee to + `&[String]`. +6. **`larql-lql/src/relations.rs:162`** — `relation_labels_ranked` maps + label indices beyond `counts.len()` to frequency 0 via `unwrap_or(0)`; + build guarantees equal lengths (`clustering.rs:94`) but load + (`relations.rs:35`) never re-checks. A length check at load makes + corruption loud instead of silently dropping relations from Tier-2. +7. **`edges.rs:279`** — `LARQL_FR3_EXPLICIT` read uncached via + `env::var_os` per call, unlike every other flag in the same diff + (OnceLock). Immaterial cost (Tier-2 does a full model load anyway, + documented and intentional) — consistency fix only. +8. **`edges.rs:297`** — Tier-2 few-shot prompt (city→capital, + dollar→currency, dialect→language, music→none) hardcodes country-facts + demonstrations in the generic executor. Env-gated default-off and the + comment says re-verify on other domains; when this graduates from + experiment to default, source demonstrations from vindex metadata. + +## Part 2 — Workspace sweep (verified findings, ranked) + +### Security / serving + +- **Path traversal via unsanitized `model_id`** — + `larql-server/src/shard_loader.rs:30`: + `PathBuf::from(store_path).join(model_id)` where `model_id` comes straight + from the router's `AssignMsg` (`announce.rs:544`). The tar unpack itself + is safe (tar 0.4.45, `Archive::unpack` rejects escaping members), but a + malicious/compromised router can send `model_id = "../../../…"` and the + shard dir — and tar contents — land outside the store. Reject path + separators and `..` in `model_id`. Related (lower): grid non-join RPCs + (`drain_server`, `assign_range`) don't require the grid key + (`larql-router/src/grid/service.rs:114`). +- **Serving posture** (plausible, not hand-verified): streaming completions + hold the weights guard for the whole generation + (`routes/openai/completions.rs:302`) so concurrent requests serialize — + likely intentional for a single-model server but undocumented; no + per-request timeout on streaming (`completions.rs:366`); no graceful + drain on shutdown despite `RifGuard` existing (`bootstrap.rs:1255`); grid + join stream has no malformed-message rate limit + (`grid/service.rs:121`). + +### Metal backend + +- **GPU errors silently swallowed — 77 call sites.** Every + `wait_until_completed()` in `larql-compute-metal` is followed by buffer + reads with zero inspection of command-buffer `status()`/`error()` + (e.g. `ops/full_pipeline/dispatch.rs:456,783`). A failed/timed-out + command buffer yields stale or uninitialized data that flows into logits + with no trace — the observability gap that makes the next phantom-drift + hunt expensive. Add a `wait_and_check()` helper asserting + `status == Completed` and migrate call sites. +- **Dispatch-geometry duplication is back (historical 3× bug class).** + `decode_hybrid.rs:388-391` hardcodes `MTLSize::new(256,1,1)` while + `self.quant.q8_matvec_pipeline` is *already a `KernelHandle`* carrying + `threads_per_tg` — the dispatch ignores it. `stages/qkv_proj.rs:241` + takes a raw `ComputePipelineState` (`:199`) so it cannot consult a handle + at all. Both correct today (shader = 256), both silently break + fast-but-wrong if shader geometry ever changes. Use the handle's + geometry; change the qkv_proj signature. +- **Dead shaders (ADR-017 hygiene).** `graph_walk_knn`, `q4_sparse_matvec`, + `turboquant_decode`, `turboquant_encode` (`shaders/mod.rs:12`) compile + and ship with no dispatch site and no retention rationale doc-block. + +### Corrupt-file robustness + +- **`larql-vindex/src/format/load.rs:81,293`** — `gate_slices[info.layer]` + where `info.layer` is deserialized from `index.json` with no bounds check + against `num_layers`; a corrupt manifest panics "index out of bounds" at + load. Validate `info.layer < num_layers` and return `VindexError::Parse`. +- **`larql-inference` kquant panics** — `cached.rs:123,200`, `hidden.rs:38`: + `insert_q4k_layer_tensors(...).unwrap_or_else(|err| panic!("{err}"))` + aborts the session mid-inference on missing/corrupt Q4K slices. *Same + finding as 2026-05-28 review item 1 (`FfnBackend::forward` fallibility) — + re-confirmed, still open.* +- **`larql-vindex/src/format/load.rs:317`** — interleaved-kquant manifest + fields `offset`/`length` default to 0 via `unwrap_or(0)` when missing, + masking the real error behind a later cryptic "exceeds mmap length". +- **`larql-models/src/loading/safetensors.rs:236`** — + `Array2::from_shape_vec(...)` panics on shape/len mismatch instead of + returning `ModelError::Parse` (low; metadata-vs-data mismatch on corrupt + safetensors). + +### Python bindings + +- **GIL held for entire forward passes.** Zero `allow_threads` uses in + `larql-python/src`. `WalkModel.predict()` (walk.rs:351), + `trace()` (:544), `generate_with_hooks()` (:1046), `PyVindex.infer()` + (vindex.rs:1196), `infer_trace()` (:1342) all block every Python thread + for the duration of inference. Wrap compute in `py.allow_threads(|| …)`. +- **`vindex.rs:847`** — `partial_cmp().unwrap()` on gate scores aborts the + interpreter on NaN; line 1434 in the same file already uses + `unwrap_or(Ordering::Equal)`. *Subset of 2026-05-28 item 5 (shared + NaN-safe sort helper) — re-confirmed, still open.* + +### Cross-cutting + +- **Env-flag sprawl: 145 distinct `LARQL_*` flags, ~18 documented.** A + meaningful subset changes numerics (`LARQL_Q4K_DIRECT_FFN`, + `LARQL_Q4K_ATTN_INT8`, `LARQL_FUSED_ATTN`, …): two "identical" bench runs + can diverge 5-10% with nothing in the logs, and accepted values already + diverge (`"1"` vs `"true"`, Part 1 item 4). `larql-compute/src/options.rs` + already defines the taxonomy (`env_opt_in`/`env_opt_out`/`env_flag`) — + most flags bypass it. Action: route flags through one helper + generate + `docs/env-flags.md`. +- **Five parallel forward-pass loops** in + `larql-inference/src/vindex/kquant_forward/` (`predict_kquant_hidden`, + `_prefill`, `_decode_step`, `_decode_step_direct`, remote-FFN path) each + repeat sentinel logic (`insert_q4k_layer_tensors`, MoE detection, KV + attention dispatch). Every layer-stepping change lands five times or + outputs silently diverge — the padded-down lockstep twins (Part 1 item 3) + are the same disease one level down. Wants an ADR before refactoring; + cuts across the files the C10 work is hot in. +- **Dead weight**: `model-compute` (~50-line crate, no second consumer — + violates the no-speculative-extraction policy); `test_utils.rs` + (1,228 lines) ships as public API of `larql-inference`. + +## Refuted claims (logged so they don't recur) + +- **GGUF loader overflow** (`loader.rs:84` dims product, `ggml/mod.rs:180` + `n_elements * 4`, `parser.rs:245` n_tensors cast): refuted on 64-bit + (usize = u64, no wrap), and `check_block_input` re-validates with + `checked_mul` before any dequant slice. No OOB path. +- **`q4_matvec.rs` debug_assert-only bounds → release OOB**: refuted — + callers derive dims from load-validated metadata and the safety contract + is held caller-side. +- **`q4k_q8k_dot.rs` scalar-path OOB**: refuted — safe-slice indexing + (panic, not UB) plus a runtime `w.len() < rows * row_bytes` early-return + at the public entry points. +- **`attn_fused` threadgroup overflow at head_dim>256 / seq>1024**: refuted — + dispatch is gated by `MAX_HEAD_DIM_SINGLE_SG` (256) and + `SHORT_ATTENTION_SPAN` (1024) (`decode/encode_attn.rs:173-174`), and the + shader is opt-in (`LARQL_FUSED_ATTN`). +- **Tar member path traversal in `shard_loader.rs`**: refuted as stated + (tar 0.4.45 `unpack` is safe) — but redirected to the real hole one level + up (unsanitized `model_id` join, see Part 2). +- **`relation_labels_ranked` recompute cost**: refuted — once per SELECT on + Tier-1 abstain, microseconds next to the Tier-2 model load. + +## Non-finding hygiene + +- Diff-review good news worth keeping: asm kernels bit-exact-tested, int8 + attention falls back cleanly, `stored_cols` is provably a 256-multiple by + construction, FR3 Tier-2 error paths all `?`-propagate to `None`. +- Healthy signals re-confirmed: clippy/fmt enforced, llvm-cov gates, + shannon-verify CI gate, atomic tmp+rename vindex writes, 9 ignored tests, + 23 ADRs + 18 diagnosis docs indexed and load-bearing. diff --git a/docs/diagnoses/fr3-explicit-rewrite.md b/docs/diagnoses/fr3-explicit-rewrite.md new file mode 100644 index 000000000..f74de1f94 --- /dev/null +++ b/docs/diagnoses/fr3-explicit-rewrite.md @@ -0,0 +1,79 @@ +# FR3b — relation resolution: probe is phrasing-brittle, explicit rewrite wins + +**Date:** 2026-06-08. **Status:** ran (`examples/fr3_template_ablation.rs`, `examples/fr3_explicit_rewrite.rs` → `bench/aim-validation/fr3_{template_ablation,explicit_rewrite}_gemma3-4b.json`). Gemma-3-4B Q4K vindex. Follow-on to [`fr3-relation-address.md`](fr3-relation-address.md) — refines, doesn't overturn, the FR3 WIN. + +## Headline + +**The FR3 residual probe is synonym-robust but PHRASING-brittle, and an explicit few-shot classification (with a `none` escape) is the fix.** FR3's 1.00 was measured on synonym *words* substituted into one fixed template (`"The {w} of {e} is"`). That generalisation is real — but it does **not** extend to unseen *phrasings* (a different template structure). On a held-out phrasing the probe sits at **chance** at its own probe layer, and diversifying the training templates does **not** rescue it. The model's own answer — asked directly "this word → which relation?" — nails synonyms **and** unseen phrasings (12/12), and a `none` option stops it confident-wronging out-of-domain inputs (distractor false-fires 2/3 → 0/3). This is chris's call ("explicit rewrites unseen phrasings to relevant templates"), measured. + +## Results + +### Probe vs phrasing (ablation, `fr3_template_ablation.rs`, N=6 entities) +Train BASE `{capital,currency,language}` over the first `k` of the resolver's templates; test SYN `{seat,money,tongue}` on a **held-out** phrasing `"The {r} for {e} would be"` (chance 0.33): + +| layer | k=1 | k=2 | k=4 | +|---|---|---|---| +| L6 | 0.39 | 0.33 | **0.83** | +| **L10** (resolver probe layer) | 0.33 | 0.39 | **0.39** | +| L14 | 0.33 | 0.33 | 0.33 | +| L20 | 0.17 | 0.11 | 0.17 | + +- At the **resolver's L10**, more templates = **no-op** (0.33→0.39, chance). The multi-template change was **reverted** (4×'d build cost for nothing at the probe layer). +- Signal for an unseen phrasing is **early (L6) and decays with depth** — the opposite of "deeper = more normalised". The model resolves surface form early then consumes the relation representation computing the answer; it does not hold a stable canonical form at depth. (N=6 is coarse — L6's 0.83 is 15/18; the *shape* is the point.) + +### Explicit classification (`fr3_explicit_rewrite.rs`, one forward via `predict_kquant`) +Few-shot `word -> relation` over `{capital,currency,language[, none]}`, read top-1: + +| candidate set | synonyms | unseen phrasings | distractor false-fires | +|---|---|---|---| +| forced-choice (no escape) | 6/6 | 6/6 | **2/3** (weather→capital, altitude→capital) | +| **+ `none` escape** | **6/6** | **6/6** | **0/3** (banana/weather/altitude → none) | + +`head city`/`main city`→capital, `legal tender`/`unit of money`→currency, `spoken language`/`mother tongue`→language — all the phrasings the probe sat at chance on. + +## What it establishes + +- **Two different generalisation axes.** The probe generalises across synonym *words* (FR3's 1.00, real) but **not** across *phrasings* (chance at L10). "Synonym-robust" ≠ "phrasing-robust"; the original WIN holds for the former only. +- **You can't fix phrasing-brittleness by diversifying the probe's training templates** — measured no-op at the probe layer. The relation signal isn't a phrasing-invariant direction at depth; it's early and transient. +- **Explicit model classification is phrasing-robust** because it uses the LM head + full language understanding, not a thin residual probe — 12/12 including the exact cases the probe missed. +- **Forced-choice is the project's recurring confident-wrong trap** (cf. FR1's 0.75 gate, FR2's fallback). A closed "map X to one of {…}" prompt forces a relation even for `weather`/`altitude`. **The `none` escape is the verify/abstain** — 0/3 once present, 12/12 preserved. + +## Honest scope / caveats + +- 3 relations, ~12 phrasings, 1 model (Gemma-3-4B Q4K), country entities, N=6 in the ablation (coarse). Strong + consistent, not a law. +- Explicit classify costs a **full forward (lm_head)**; the residual probe is cheaper (partial forward to ~L10, no lm_head). So explicit is the *fallback*, not the default — it earns its cost only when the probe abstains. +- Few-shot prompt + `none` example chosen by hand; not prompt-robustness-swept. A second few-shot frame should reproduce 12/12 + 0/3 before this is load-bearing. + +## What the build is (NEXT SESSION — not yet wired) + +**Probe-first / explicit-classify-fallback in `resolve_relation_synonym`** (`larql-lql/src/executor/query/select/edges.rs`) — the FR2 two-tier shape, for relation resolution: + +1. **Tier 1 — residual probe (unchanged, cheap):** the existing `RelationResolver`. When it clears `MIN_CONFIDENCE`, use it (rides the model's implicit normalisation on canonical synonyms — "implicit happens sometimes"). +2. **Tier 2 — explicit classify (on abstain):** few-shot `word -> {relations, none}`, one forward, top-1; accept iff it's a real relation (not `none`). + +**Wiring wrinkle (the one real structural choice):** `RelationResolver` only dequantises layers `0..=probe_layer` (≈L10) → it **cannot run lm_head**, so Tier 2 must run via the **Session's already-loaded vindex** (`predict_kquant`/`InferenceWeights`, the same path INFER uses), not the resolver's partial setup. ~30 lines crossing the resolver→session boundary. Add an LQL knob if the explicit pass should be opt-in (it's a full forward per abstain). + +Harnesses to lift the prompt/matching from: `examples/fr3_explicit_rewrite.rs` (the few-shot frame + `none`-gated accept + prefix-matching over top-k). + +## BUILD LANDED (2026-06-09) + +**Wired the two-tier resolver into `SELECT … FROM EDGES WHERE relation = …`, opt-in, default off = byte-identical.** When the exact/substring relation match returns nothing, `resolve_relation_synonym` runs Tier 1 (the cached residual probe, unchanged); on probe abstain it falls through (`.or_else`) to **Tier 2 — `resolve_relation_explicit`** (`crates/larql-lql/src/executor/query/select/edges.rs`): + +- **Few-shot frame lifted verbatim** from `examples/fr3_explicit_rewrite.rs` (`word -> relation` + `music -> none`), one **full forward** via `InferenceWeights::predict_dense` (the INFER path — for a Q4_K vindex this is exactly `predict_kquant`, lm_head included). The resolver's partial `0..=L10` dequant can't run lm_head, so Tier 2 goes through `InferenceWeights`, not the resolver's setup — the one structural wrinkle the pre-registration called out. +- **`none`-gated accept** (`match_relation_top1`, unit-tested): prefix-match top-1 against the candidate relations; `none` / out-of-domain → no match → abstain. +- **Gated by `LARQL_FR3_EXPLICIT`** (full forward + model load per probe-abstain). Default off → SELECT is byte-identical to FR3 (probe-only). + +**Refinement forced by the real vindex — frequency-ranked candidates, not alphabetical.** The measurement used a clean 3-relation set; production `gemma3-4b-q4k-v2.vindex` carries **2890** noisy probe labels. Two consequences the clean measurement hid: +1. `relation_labels()` is a `BTreeSet` (**alphabetical**), and both tiers cap at `MAX_RELATIONS=64`. An alphabetical top-64 *keeps* a rare early-alphabet label (`food_animal`) while *dropping* `language` — so "mother tongue" couldn't resolve while "banana" could (exactly backwards). Fixed with `RelationClassifier::relation_labels_ranked(top_n)` — **by feature count, most-common first** — used for Tier 2's prompt enumeration + matching. Now the meaningful relations are always in the candidate set, and the `none` escape strengthens (rare labels fall out). +2. Enumerating all 2890 labels is a ~10K-token prompt; the ranked top-64 keeps it to "one short forward," matching the measured intent. + +**E2E on real Gemma-3-4B (`LARQL_FR3_EXPLICIT=1`):** +- `mother tongue` → **`language` by explicit classification (0.97)** — probe abstained, Tier 2 resolved. The headline win, on the exact phrasing the probe sits at chance on. +- `weather` → **abstain** (no resolution) — the `none` escape fires, no confident-wrong. +- Default off: `mother tongue`/`weather` → no resolution (probe-only) — **byte-identical**. + +**Honest scope correction.** On the production probe (64-class, its own `"The {r} of {e} is"` template) the Tier-1 probe is **stronger than the 3-class ablation implied** — it resolves `head city`→capital (0.97), `legal tender`→currency (0.80), `altitude`→elevation (0.96) *by meaning* without Tier 2. The ablation's "chance" was specific to a 3-class probe on a held-out *template structure*; many real phrasings still ride Tier 1. Tier 2 is the **safety net for genuine abstains** (e.g. `mother tongue`), and on a rich relation set the `none` escape is necessarily weaker (`banana`→`food_category` — a real, common relation here, a defensible resolution, not the clean-world `none`). Tests: `match_relation_top1` (2) + `relation_labels_ranked` (2), 726 lql lib green, clippy clean. + +## Bottom line + +FR3's relation address is a clean index **for synonym words within a template**, not a phrasing-invariant one — the probe is the right *cheap tier*, not the whole story. **Explicit model classification with a `none` escape is the robust tier** (12/12 phrasings + synonyms, 0/3 confident-wrong on the clean set). **Built:** the two-tier resolver — probe-first, explicit-`none`-gated fallback over frequency-ranked candidates, opt-in `LARQL_FR3_EXPLICIT`. From d68ee1aad0bdbd155cf2e9160777a838f95c6ff9 Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Sat, 13 Jun 2026 14:50:07 +0100 Subject: [PATCH 3/8] working on performance improvements --- ROADMAP_STATUS.md | 3 + bench/aim-validation/ave_demo_gemma3-4b.json | 2 +- .../ave_stream_trigger_gemma3-4b.json | 1 + crates/larql-compute/src/attention/decode.rs | 63 ++- crates/larql-compute/src/attention/mod.rs | 3 +- crates/larql-compute/src/kv_dispatch/cpu.rs | 64 +-- .../docs/specs/kv-engine-unification.md | 23 ++ .../arithmetic-virtual-expert.md | 36 +- crates/larql-inference/examples/ave_demo.rs | 146 ++++++- .../examples/ave_stream_trigger_probe.rs | 363 ++++++++++++++++++ .../examples/scanner_adversarial.rs | 51 +++ crates/larql-inference/src/attention/mod.rs | 2 +- .../src/experts/arith/drive.rs | 22 +- .../src/experts/arith/extract.rs | 270 +++++++++++-- .../src/experts/arith/verify.rs | 40 +- crates/larql-inference/src/test_utils.rs | 4 +- .../src/vindex/kquant_forward/cached.rs | 97 +++++ .../src/vindex/kquant_forward/generation.rs | 28 ++ .../src/vindex/kquant_forward/mod.rs | 6 +- crates/larql-inference/src/vindex/mod.rs | 2 +- crates/larql-kv/ROADMAP.md | 33 ++ .../src/engines/boundary_kv/engine.rs | 42 ++ .../src/engines/boundary_per_layer/engine.rs | 64 ++- .../src/engines/boundary_per_layer/walk.rs | 13 +- .../src/engines/markov_residual/compute.rs | 31 +- .../src/engines/markov_residual/engine.rs | 82 ++-- .../src/engines/markov_residual/walk.rs | 3 +- .../engines/markov_residual_codec/compute.rs | 14 +- .../engines/markov_residual_codec/engine.rs | 51 ++- .../src/engines/markov_residual_codec/walk.rs | 5 +- crates/larql-kv/src/engines/mod.rs | 77 ++++ .../src/engines/turbo_quant/engine.rs | 152 +++++--- .../src/engines/unlimited_context/engine.rs | 38 +- .../src/engines/unlimited_context/extend.rs | 5 +- 34 files changed, 1591 insertions(+), 245 deletions(-) create mode 100644 bench/aim-validation/ave_stream_trigger_gemma3-4b.json create mode 100644 crates/larql-inference/examples/ave_stream_trigger_probe.rs create mode 100644 crates/larql-inference/examples/scanner_adversarial.rs diff --git a/ROADMAP_STATUS.md b/ROADMAP_STATUS.md index 439676832..a915a462d 100644 --- a/ROADMAP_STATUS.md +++ b/ROADMAP_STATUS.md @@ -8,6 +8,9 @@ Last updated: 2026-06-10 ## Recently shipped (delta since last update) +- **Bottleneck pass #2 (code-level, machine contended) — GQA scratch fix landed; measurement queue parked (2026-06-13).** With the box owned by a sibling session (battery also low), did the allocation-churn audit instead of timing: the expert path is already TLS-pooled (`ExpertScratch`), but the GQA head-parallelization had introduced a per-head scores `vec!` (≈480 allocs+zeroings/token, growing with ctx) — replaced with `for_each_init` per-worker scratch (rayon workers are long-lived, so it amortises across calls too). 697+1220+756 green. **Measurement queue for the next quiet-machine session:** (1) fresh `/usr/bin/sample` + stage split on the current build (the last profile predates KV append-in-place + serial cuts — the sink distribution has shifted); (2) boundary-kv ratio re-measure (its 0.80× predates its resident-forwarding fix); (3) clean absolute engine matrix + llama-bench brackets; (4) remaining known sinks by size: `cpu_moe_forward` glue (router f32 matvec ~43 MB/tok + norms + route), allocator churn from per-projection out-Vecs and ndarray temporaries (arena-per-step is the structural fix if a fresh sample still shows `madvise`), long-ctx GQA growth (real work; flash-attention-class restructure is the eventual answer). +- **Engine structural gap CLOSED — every KV engine now plugs into the CPU fast path (2026-06-13).** The `KvEngine::decode_step_resident` trait default silently dropped the index (`let _ = index`), so the 06-11/12 fast-path arc (q4k/int8 attention + asm + append-in-place) reached only `StandardEngine`. **Built:** single-source dispatcher `run_attention_block_decode_step_auto` (same per-layer q4k-vs-f32 choice as `CpuBackend::attention_step`, for `SharedKV`-owning walk loops; flag moved to `attention::decode` as the one source); `markov-rs`/`markov-rs-codec`/`turbo-quant`/`unlimited-context`/`boundary_per_layer` override `decode_step_resident` and thread the vindex down their walks; **`boundary-kv` forwards both resident methods to its inner StandardEngine** (was silently f32). `no_cache`/`apollo` keep the default by design. **Regression pin:** `engines::resident_identity_tests` — 7 concrete specs, resident ≡ plain bit-identical flags-off, coverage count can't shrink. **Measured (within-run ratios vs standard; absolutes pending quiet machine — sibling session load):** turbo 0.64×→0.85×, unlimited 0.76×→**1.07×**; markov/codec/boundary-per-layer flat = their own recompute/codec machinery (the feature), not attention. Prefill stays f32 gemm everywhere (task-#16 prefill falsification). 697+1220+756 tests green, clippy clean. Crate detail: [`crates/larql-kv/ROADMAP.md`](crates/larql-kv/ROADMAP.md) §"CPU resident fast-path". +- **KV-engine review of the append-in-place handle — all engines green; two failure-path fixes landed (2026-06-13).** `EngineBackend: ComputeBackend + KvDispatch`, so every handle-holding engine rides the rewritten `CpuKvHandle`. Audit found two semantic edges the happy-path suites couldn't catch, both fixed: (1) q4k attend-failure after the in-place append now **pops the appended row and falls back to f32** (the old monolithic form's semantics — protects engine-level fallbacks like boundary_per_layer's dense-walk that reuse the handle); (2) the f32 path's prior is **copied not moved** so a backend failure leaves the handle intact. Empirical matrix, flags ON: **26B** all six MoE-capable engines run clean (standard 14.3 cold / boundary-kv 11.5 / unlimited 10.9 / turbo 9.2 / codec 7.9 / markov 7.2 — C1-class ordering); **4B** standard **28.9 tok/s** (dense also gains: gap to llama.cpp 43 now 1.49×), `standard:window=64` exercises `clip_kv` on the new buffers correctly, no-cache/markov/codec/turbo/unlimited all run. 697+755 suites green (incl. the cross-engine parity oracles). - **KV append-in-place — 23.5→27.9 tok/s short-ctx, 16.0→24.8 long-ctx; gap to llama.cpp ~1.15×/1.29× (2026-06-12).** The old `CpuKvHandle` attention step did a full-cache **clone** + `zeros` + four assigns per layer per step (~190 MB/token of churn at ctx 130, growing linearly). Rewritten: growable row-major Vec buffers (amortised O(kv_dim) `append_row`), q4k-direct step split into **project / append-in-place / attend-over-views** (no concat, no clone; legacy owned-concat wrapper kept for engine walk-loops), f32 fallback moves state instead of cloning, `append_kv`/`clip_kv` lose their O(ctx) rebuilds too, `gqa_attention_decode_step` generic over views. **E2E:** n=128 **27.9 tok/s** (35.8 ms; bracket 33.6/31.0 → ~1.15×); n=512 **24.8** vs llama.cpp tg512 32.0 (1.29× — remaining ctx-growth is GQA compute, real work). Text sanity: full flag stack on 4B → "The capital of France is **Paris**." **Cumulative arc: 7.6 → 27.9 tok/s (3.7×); gap 4.5× → ~1.15×.** 692+1213+755 tests green incl. kv-engine parity oracles. Side-finding: `ave_direct_step_parity` garbage CONFIRMED PRE-EXISTING at clean 6659fd6c (worktree repro) — legacy direct-path twin, not this work; AVE session owns it. Artifact §"update_2026_06_12_kv_append_in_place". - **DRAM-idle hunt: sinks NAMED by live-process sampling; first three serial cuts land 21.7→23.5 tok/s (2026-06-12).** MT shape sweep first **acquitted** the suspects (rayon-chunked matvecs sustain 72–113 GiB/s at every production shape; experts-granularity arm 104) → `/usr/bin/sample` of a live decode attributed the real gap: **~75% of thread-samples parked in wait states** — workers sleep while serial main-thread sections run. Named: attention non-projection ~20% of wall (KV-concat `zeros`+`bzero`+`memmove`, per-head `cblas_sgemv` GQA + scalar libm-`exp` softmax, norms/RoPE), `cpu_moe_forward` glue ~12%, dense-slab scalar gelu + serial requantise ~8%, **lm_head argmax epilogue 4.6%** (serial softmax+top-k over 262K logits), `madvise` churn ~3%. **Built (parity-safe):** `q4_lm_head_argmax` (argmax over raw logits — scaling/softcap/temp are monotone → identical selection; no softmax, no 3 MB temporaries), GQA rayon-parallel over heads (math unchanged), dense-slab activation rayon-chunked (same libm per element). **E2E n=128: 23.5 tok/s under a degraded bracket (27.1)** — same-state gap 1.15×, vs clean-bracket llama.cpp 33 ≈ 1.35×. **n=512: 16.0 tok/s — the O(ctx) KV-concat realloc+copy (~20 ms/token by step 500) is the dominant remaining structural item** (engine KvHandle append-in-place, contract change). Then: moe glue, buffer reuse vs madvise. 692+1212+755 tests green; clippy clean (3 pre-existing warnings in `experts/arith`, not this work). Artifact §"update_2026_06_12_serial_cuts". - **C12 v3 kernel + the roofline-crossover finding — kernel work on this path is DONE; the remaining 1.55× is effective-bandwidth utilisation, not kernel quality (2026-06-12).** Measurement-first paid twice. **(1) Decomposition bench** (`q4k_sb_decomposition`): the v1 asm block is 16.3 cyc/SB but the per-super-block Rust glue (scale unpack + i32 array + scalar sum2 + 2× software `f16_to_f32` + epilogue) costs **19.2 cyc/SB with only ~3.6 OoO-hidden** — the glue was the fat, not the asm schedule. **(2) Built v2→v3:** whole super-block in one asm block (TBL-vectorised 6-bit scale/min unpack, `smull/smlal2` sum2, hardware `fcvt`, exact-order epilogue), then the super-block loop moved inside the asm with constants hoisted (the 144B header+quants layout gives a zero-arithmetic walking pointer). **Single-thread: 18.3 → 23.8 GiB/s (+30%), bit-exact vs scalar first run** (all five asm kernels this arc were — the wrapping-i32 + exact-conversion discipline works). `LARQL_Q4K_ASM` now routes to v3. **(3) E2E 26B: FLAT (21.3 ≈ 21.7, bracket 33.4) — the crossover measurement:** at 8 threads the q4k streams are now **memory-bandwidth-bound** (~47 GB/s effective vs llama.cpp's ~70 on the same machine; v3 could demand ~190). **The spec's 28 GiB/s single-kernel target is no longer the path to parity**; the next C12 round must target effective-bandwidth sinks — rayon fork-join gaps (~180+ parallel sections/token), compute-only phases with idle DRAM (GQA/RoPE/quantize ~7 ms), expert task granularity — measurement-first. 692 compute tests green, clippy clean. Artifact §"update_2026_06_11_v3_kernel_and_roofline_crossover". diff --git a/bench/aim-validation/ave_demo_gemma3-4b.json b/bench/aim-validation/ave_demo_gemma3-4b.json index 7c3e1b30b..6f5a6e867 100644 --- a/bench/aim-validation/ave_demo_gemma3-4b.json +++ b/bench/aim-validation/ave_demo_gemma3-4b.json @@ -1 +1 @@ -{"experiment":"ave_demo","vindex":"output/gemma3-4b-q4k-v2.vindex","explicit":[6,6],"schedule_end":[6,6],"native":[2,6],"false_fires":[0,6],"items":[{"leg":"explicit","prompt":"12 + 7 =","expected":"19","dispatch_ok":true,"native_ok":true,"native_text":"19\n12 - 7 =","native_tokens":11,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12 + 7","alu_result":"19","emitted":" 19","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":3}},{"leg":"explicit","prompt":"123456 + 654321 =","expected":"777777","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n123456 + 65432","native_tokens":15,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"123456 + 654321","alu_result":"777777","emitted":" 777777","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":7}},{"leg":"explicit","prompt":"100000 - 1 =","expected":"99999","dispatch_ok":true,"native_ok":true,"native_text":"99999\n100000 -","native_tokens":14,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"100000 - 1","alu_result":"99999","emitted":" 99999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":6}},{"leg":"explicit","prompt":"12345 * 6789 =","expected":"83810205","dispatch_ok":true,"native_ok":false,"native_text":"8380000 - 1000 = 8","native_tokens":17,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12345 * 6789","alu_result":"83810205","emitted":" 83810205","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":9}},{"leg":"explicit","prompt":"999 + 111 - 222 =","expected":"888","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n999 + 111 =","native_tokens":12,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"999 + 111 - 222","alu_result":"888","emitted":" 888","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":4}},{"leg":"explicit","prompt":"858358354868358358358358 + 141641645131641641641641 =","expected":"999999999999999999999999","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n**Calculation:**\n\n858358354868358358358358 + 1","native_tokens":33,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"858358354868358358358358 + 141641645131641641641641","alu_result":"999999999999999999999999","emitted":" 999999999999999999999999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":25}},{"leg":"distractor","prompt":"My phone number is 4415550172.","fire":"no"},{"leg":"distractor","prompt":"The meeting is on 2026-06-11.","fire":"no"},{"leg":"distractor","prompt":"Train 9 departs at 18:45 from platform 3.","fire":"no"},{"leg":"distractor","prompt":"Order 66 was executed in 19 BBY.","fire":"no"},{"leg":"distractor","prompt":"Account 123456789012345678901234567890 is active.","fire":"no"},{"leg":"distractor","prompt":"What is the capital of France?","fire":"no"}]} \ No newline at end of file +{"experiment":"ave_demo","vindex":"output/gemma3-4b-q4k-v2.vindex","explicit":[6,6],"schedule_end":[6,6],"native":[2,6],"false_fires":[0,11],"items":[{"leg":"explicit","prompt":"12 + 7 =","expected":"19","dispatch_ok":true,"native_ok":true,"native_text":"19\n12 - 7 =","native_tokens":11,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12 + 7","alu_result":"19","emitted":" 19","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":3}},{"leg":"explicit","prompt":"123456 + 654321 =","expected":"777777","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n123456 + 65432","native_tokens":15,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"123456 + 654321","alu_result":"777777","emitted":" 777777","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":7}},{"leg":"explicit","prompt":"100000 - 1 =","expected":"99999","dispatch_ok":true,"native_ok":true,"native_text":"99999\n100000 -","native_tokens":14,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"100000 - 1","alu_result":"99999","emitted":" 99999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":6}},{"leg":"explicit","prompt":"12345 * 6789 =","expected":"83810205","dispatch_ok":true,"native_ok":false,"native_text":"8380000 - 1000 = 8","native_tokens":17,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"12345 * 6789","alu_result":"83810205","emitted":" 83810205","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":9}},{"leg":"explicit","prompt":"999 + 111 - 222 =","expected":"888","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n999 + 111 =","native_tokens":12,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"999 + 111 - 222","alu_result":"888","emitted":" 888","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":4}},{"leg":"explicit","prompt":"858358354868358358358358 + 141641645131641641641641 =","expected":"999999999999999999999999","dispatch_ok":true,"native_ok":false,"native_text":"?\n\n**Calculation:**\n\n858358354868358358358358 + 1","native_tokens":33,"telemetry":{"fire":"tier0","path":"forced_explicit","expression":"858358354868358358358358 + 141641645131641641641641","alu_result":"999999999999999999999999","emitted":" 999999999999999999999999","termination":"schedule_end","verify":"skipped","flags":[],"rewrite_tokens":0,"answer_tokens":25}},{"leg":"distractor","prompt":"My phone number is 4415550172.","fire":"no"},{"leg":"distractor","prompt":"The meeting is on 2026-06-11.","fire":"no"},{"leg":"distractor","prompt":"Train 9 departs at 18:45 from platform 3.","fire":"no"},{"leg":"distractor","prompt":"Order 66 was executed in 19 BBY.","fire":"no"},{"leg":"distractor","prompt":"Account 123456789012345678901234567890 is active.","fire":"no"},{"leg":"distractor","prompt":"What is the capital of France?","fire":"no"},{"leg":"distractor","prompt":"It takes 5 - 10 business days.","fire":"no"},{"leg":"distractor","prompt":"I work a 9 - 5 job.","fire":"no"},{"leg":"distractor","prompt":"a 4 x 4 truck parked outside","fire":"no"},{"leg":"distractor","prompt":"Are you available 9 - 5?","fire":"no"},{"leg":"distractor","prompt":"dated 2026 - 06 - 11 in the ledger","fire":"no"}]} \ No newline at end of file diff --git a/bench/aim-validation/ave_stream_trigger_gemma3-4b.json b/bench/aim-validation/ave_stream_trigger_gemma3-4b.json new file mode 100644 index 000000000..17456647f --- /dev/null +++ b/bench/aim-validation/ave_stream_trigger_gemma3-4b.json @@ -0,0 +1 @@ +{"experiment":"ave_stream_trigger","vindex":"output/gemma3-4b-q4k-v2.vindex","arms":[{"arm":"bare","fire":[10,24],"fidelity_of_fired":[7,10],"median_pos":15,"multi_trigger":6,"branch":"PARKED: restatement reflex insufficient in this arm"},{"arm":"cot","fire":[20,24],"fidelity_of_fired":[15,20],"median_pos":40,"multi_trigger":7,"branch":"ENGAGEMENT-ONLY: trigger fires but emitted exprs unfaithful — payload needs the instructed rewrite"}],"release_overruns":[0,9],"items":[{"arm":"bare","prompt":"If you have 38 apples and pick 17 more, how many apples do you have?","expected":"38 + 17","fire":true,"emitted_expr":"38 + 17","correct":true,"pos":8,"n_triggers":1,"emission":"38 + 17 = 55\n\nYou have 55 apples.\nFinal Answer: The final answer is $\\boxed{55}$"},{"arm":"bare","prompt":"What do you get when you add 123456 and 654321?","expected":"123456 + 654321","fire":true,"emitted_expr":"123456 + 654321","correct":true,"pos":16,"n_triggers":1,"emission":"123456 + 654321 = 777777\n\nSo the answer is 777777.\nFinal Answer: The final answer is $\\boxed{777777}$"},{"arm":"bare","prompt":"What is the sum of 999 and 111?","expected":"999 + 111","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"What is the product of 2 and 5?\n\nWhat is the result of 100 - 50?\n\nWhat is the result of 2 x 3 + 4?\n\nWhat is the result of 10 / 2?\n\nWhat is the result of 10"},{"arm":"bare","prompt":"A tank holds 4500 liters and 2750 more are pumped in. How much is in the tank?","expected":"4500 + 2750","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"**1. Understand the Problem:**\n\n* We start with a certain amount of water in a tank.\n* We add more water to the tank.\n* We need to find the total amount of water in the tank.\n\n**2. Set up the Equation:**\n\nLet 'x' be the amount"},{"arm":"bare","prompt":"Tom scored 1284 points and then earned another 716. What is his total?","expected":"1284 + 716","fire":true,"emitted_expr":"1284 + 716","correct":true,"pos":11,"n_triggers":5,"emission":"1284 + 716 = 1999 + 1 = 2000\n\n1284 + 716 = 1284 + 700 + 16 = 1984 + 16 = 19"},{"arm":"bare","prompt":"Add 87 to 246.","expected":"246 + 87","fire":true,"emitted_expr":"6 + 7","correct":false,"pos":54,"n_triggers":1,"emission":"To add 87 to 246, we can write the addition as:\n246 + 87\n\nWe can add the numbers column by column, starting from the rightmost column (the ones place):\n6 + 7 = 13. Write down 3 and carry"},{"arm":"bare","prompt":"A library has 58210 books and acquires 4790 new ones. How many books now?","expected":"58210 + 4790","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"A library has 58210 books and acquires 4790 new ones.\nThe number of books the library has now is the sum of the original number of books and the number of new books acquired.\nNumber of books now = Original number of books + Number of new books acquired"},{"arm":"bare","prompt":"What is 312487 increased by 96513?","expected":"312487 + 96513","fire":true,"emitted_expr":"312487 + 96513","correct":true,"pos":15,"n_triggers":5,"emission":"312487 + 96513 = 408999 + 1 = 409000 - 1 = 409000 - 1 = 409000 - 1 = 4090"},{"arm":"bare","prompt":"Sarah had 5000 dollars and spent 1234. How much does she have left?","expected":"5000 - 1234","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"Sarah had 5000 dollars and spent 1234 dollars.\nTo find out how much she has left, we need to subtract the amount she spent from the amount she had initially.\nAmount left = Initial amount - Amount spent\nAmount left = 5000 - 1"},{"arm":"bare","prompt":"Take 250 away from 1000.","expected":"1000 - 250","fire":true,"emitted_expr":"1000 - 250","correct":true,"pos":11,"n_triggers":2,"emission":"1000 - 250 = 750\n\nThe answer is 750.\n1000 - 250 = 750\nFinal Answer: The final answer is $\\boxed{750}$"},{"arm":"bare","prompt":"John is 47 and Mary is 23 years younger. How old is Mary?","expected":"47 - 23","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"Let Mary's age be $m$.\nJohn is 47 years old.\nMary is 23 years younger than John.\nSo, Mary's age is John's age minus 23.\n$m = 47 - 23$\n$m ="},{"arm":"bare","prompt":"A warehouse stored 90000 crates and shipped 12345. How many remain?","expected":"90000 - 12345","fire":true,"emitted_expr":"90000 - 12345","correct":true,"pos":14,"n_triggers":1,"emission":"90000 - 12345 = 77655\n\nThere are 77655 crates remaining.\nFinal Answer: The final answer is $\\boxed{77655}$"},{"arm":"bare","prompt":"What is 700 minus 458?","expected":"700 - 458","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"What is 12 x 8?\n\nWhat is 25 + 15?\n\nWhat is 100 - 50?\n\nWhat is 3 x 5 + 2?\n\nWhat is 15 - 7?\n\nWhat is 4 x 6 -"},{"arm":"bare","prompt":"From 86420 subtract 13579.","expected":"86420 - 13579","fire":true,"emitted_expr":"86420 - 13579","correct":true,"pos":14,"n_triggers":2,"emission":"86420 - 13579 = 72841\n\nFinal Answer: The final answer is $\\boxed{72841}$\n86420 - 13579 = 72841\n\n$86420"},{"arm":"bare","prompt":"A flight covers 5400 km and 1750 km are already behind. How far is left?","expected":"5400 - 1750","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"Let the total distance of the flight be $D = 5400$ km.\nThe distance already covered is $D_c = 1750$ km.\nThe distance left is $D_l = D - D_c = 5400 - 17"},{"arm":"bare","prompt":"A crate holds 240 bottles. How many bottles are in 12 crates?","expected":"240 * 12","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"Let the number of bottles in one crate be $b$.\nWe are given that $b = 240$.\nWe want to find the number of bottles in 12 crates.\nLet the number of bottles in 12 crates be $N$.\nThen $N = 12"},{"arm":"bare","prompt":"Multiply 73 by 19.","expected":"73 * 19","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"To multiply 73 by 19, we can use the distributive property:\n$73 \\times 19 = 73 \\times (10 + 9) = 73 \\times 10 + 73 \\times 9$\n$73 \\times"},{"arm":"bare","prompt":"A factory makes 1500 widgets a day. How many in 365 days?","expected":"1500 * 365","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"1500 widgets a day.\nNumber of days = 365\nTotal number of widgets = 1500 widgets/day * 365 days\nTotal number of widgets = 1500 * 365\n1500 * 365"},{"arm":"bare","prompt":"Each of the 48 rows has 96 seats. How many seats in total?","expected":"48 * 96","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"Each of the 48 rows has 96 seats.\nThe total number of seats is the number of rows multiplied by the number of seats in each row.\nTotal number of seats = 48 rows * 96 seats/row = 48 * 96\nWe can calculate"},{"arm":"bare","prompt":"What is the product of 407 and 311?","expected":"407 * 311","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"We want to find the product of 407 and 311.\nWe can use the standard multiplication method:\n```\n 407\nx 311\n------\n 407\n 407\n1221\n------\n1242"},{"arm":"bare","prompt":"Nine hundred boxes each weigh 75 kilos. What is the total weight?","expected":"900 * 75","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"A. 675 kilos\nB. 750 kilos\nC. 7500 kilos\nD. 6750 kilos\n\nThe correct answer is D. 6750 kilos.\n\nTo find the total weight, we multiply the number of boxes by the weight"},{"arm":"bare","prompt":"What is 47 plus 358 plus 1200?","expected":"47 + 358 + 1200","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"What is the sum of the following numbers: 47, 358, and 1200?\n\nTo find the sum of 47, 358, and 1200, we add the numbers together:\n47 + 358 +"},{"arm":"bare","prompt":"Start with 999, add 111, then take away 222. What do you get?","expected":"999 + 111 - 222","fire":true,"emitted_expr":"999 + 111","correct":false,"pos":26,"n_triggers":2,"emission":"This is a classic riddle that plays on how we read numbers.\n\n* 999 + 111 = 1110\n* 1110 - 222 = 888\n\nSo the answer is 888."},{"arm":"bare","prompt":"A bus starts with 50 passengers, then 23 get off and 12 get on. How many are aboard?","expected":"50 - 23 + 12","fire":true,"emitted_expr":"50 - 23","correct":false,"pos":38,"n_triggers":2,"emission":"**Solution:**\n\n1. **Start with the initial number of passengers:** 50\n2. **Subtract the passengers who get off:** 50 - 23 = 27\n3. **Add the passengers who get on:** 27 + 12 = 39"},{"arm":"cot","prompt":"If you have 38 apples and pick 17 more, how many apples do you have?","expected":"38 + 17","fire":true,"emitted_expr":"38 + 17","correct":true,"pos":42,"n_triggers":1,"emission":"* Start with the initial number of apples: 38\n* Add the number of apples picked: 17\n* Total number of apples: 38 + 17 = 55\n\nSo, you have 55 apples.\n\nFinal Answer: The final answer is $\\boxed{55}$"},{"arm":"cot","prompt":"What do you get when you add 123456 and 654321?","expected":"123456 + 654321","fire":true,"emitted_expr":"123456 + 654321","correct":true,"pos":43,"n_triggers":1,"emission":"123456\n+ 654321\n-------\n777777\n\nSo, 123456 + 654321 = 777777\n\nFinal Answer: The final answer is $\\boxed{777777}$"},{"arm":"cot","prompt":"What is the sum of 999 and 111?","expected":"999 + 111","fire":true,"emitted_expr":"999 + 111","correct":true,"pos":9,"n_triggers":2,"emission":"999 + 111 = (900 + 90 + 9) + (100 + 10 + 1)\n= 900 + 100 + 90 + 10 + 9 + 1\n= 1000 + 90 + 10 + 9 + 1"},{"arm":"cot","prompt":"A tank holds 4500 liters and 2750 more are pumped in. How much is in the tank?","expected":"4500 + 2750","fire":true,"emitted_expr":"4500 + 2750","correct":true,"pos":59,"n_triggers":1,"emission":"1. Start with the initial amount in the tank: 4500 liters\n2. Add the amount pumped in: 4500 + 2750\n3. Calculate the total: 4500 + 2750 = 7250\n\nSo, there are 7250 liters in the tank."},{"arm":"cot","prompt":"Tom scored 1284 points and then earned another 716. What is his total?","expected":"1284 + 716","fire":true,"emitted_expr":"1284 + 716","correct":true,"pos":28,"n_triggers":6,"emission":"1. First, we need to add the two numbers together.\n2. 1284 + 716 = 1999 + 100 - 1 = 2000 - 1 = 1999 + 1 = 2000 - 1 = 1999 + 1 = 2"},{"arm":"cot","prompt":"Add 87 to 246.","expected":"246 + 87","fire":true,"emitted_expr":"6 + 7","correct":false,"pos":38,"n_triggers":1,"emission":"1. Write the numbers vertically, aligning the ones place:\n 246\n+ 87\n-------\n\n2. Add the ones column: 6 + 7 = 13. Write down the 3 and carry over the 1 to the tens column.\n 246\n+ 87\n-------\n 3\n\n3. Add the tens column"},{"arm":"cot","prompt":"A library has 58210 books and acquires 4790 new ones. How many books now?","expected":"58210 + 4790","fire":true,"emitted_expr":"58210 + 4790","correct":true,"pos":61,"n_triggers":1,"emission":"1. Start with the number of books the library has: 58210\n2. Add the number of new books acquired: 58210 + 4790\n3. Calculate the sum: 58210 + 4790 = 63000\n\nThe library now has 63000 books."},{"arm":"cot","prompt":"What is 312487 increased by 96513?","expected":"312487 + 96513","fire":true,"emitted_expr":"312487 + 96513","correct":true,"pos":14,"n_triggers":1,"emission":"312487 + 96513 = 408999 + 100 - 100 + 100 - 100 + 100 - 100 + 100 - 100 + 100 - 100 + 100 - 10"},{"arm":"cot","prompt":"Sarah had 5000 dollars and spent 1234. How much does she have left?","expected":"5000 - 1234","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"1. Start with the initial amount: $5000\n2. Subtract the amount spent: $5000 - $1234\n3. Calculate the difference: $5000 - $1234 = $3766\n\nSo, Sarah has $3766 left.\n\nFinal Answer: The final answer is $\\boxed{3"},{"arm":"cot","prompt":"Take 250 away from 1000.","expected":"1000 - 250","fire":true,"emitted_expr":"1000 - 250","correct":true,"pos":39,"n_triggers":1,"emission":"1. Start with the number 1000.\n2. Subtract 250 from 1000.\n 1000 - 250 = 750\n\nSo, 250 away from 1000 is 750.\n\nFinal Answer: The final answer is $\\boxed{750}$"},{"arm":"cot","prompt":"John is 47 and Mary is 23 years younger. How old is Mary?","expected":"47 - 23","fire":true,"emitted_expr":"47 - 23","correct":true,"pos":52,"n_triggers":1,"emission":"* Mary is 23 years younger than John.\n* John's age is 47.\n* To find Mary's age, subtract 23 from John's age: 47 - 23 = 24\n\nSo, Mary is 24 years old.\n\nFinal Answer: The final answer is $\\boxed{24}$"},{"arm":"cot","prompt":"A warehouse stored 90000 crates and shipped 12345. How many remain?","expected":"90000 - 12345","fire":true,"emitted_expr":"90000 - 12345","correct":true,"pos":40,"n_triggers":1,"emission":"1. **Start with the initial number of crates:** 90000\n2. **Subtract the number shipped:** 90000 - 12345 = 77655\n\n**Answer:** There are 77655 crates remaining."},{"arm":"cot","prompt":"What is 700 minus 458?","expected":"700 - 458","fire":true,"emitted_expr":"10 - 8","correct":false,"pos":63,"n_triggers":1,"emission":"1. **Set up the problem:** 700 - 458\n2. **Subtract the ones place:** 0 - 8. We need to borrow from the tens place. The tens place becomes 9, and the ones place becomes 10 - 8 = 2.\n3. **Subtract the tens place:** 9 -"},{"arm":"cot","prompt":"From 86420 subtract 13579.","expected":"86420 - 13579","fire":true,"emitted_expr":"86420 - 13579","correct":true,"pos":13,"n_triggers":4,"emission":"86420 - 13579 = ?\n\nFirst, subtract 9 from 0, which gives 0.\n86420 - 13579 = 86420 - 10000 - 3579\n= 76420 - 3579\n= 7"},{"arm":"cot","prompt":"A flight covers 5400 km and 1750 km are already behind. How far is left?","expected":"5400 - 1750","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"1. **Total distance:** 5400 km\n2. **Distance already covered:** 1750 km\n3. **Distance left:** Total distance - Distance already covered\n 5400 km - 1750 km = 3650 km\n\n**Answer:** 3650 km is left."},{"arm":"cot","prompt":"A crate holds 240 bottles. How many bottles are in 12 crates?","expected":"240 * 12","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"1. We know that one crate holds 240 bottles.\n2. We want to find out how many bottles are in 12 crates.\n3. To find the number of bottles in 12 crates, we multiply the number of bottles in one crate by the number of crates.\n So, the number of bottles in 12 crates = 12 *"},{"arm":"cot","prompt":"Multiply 73 by 19.","expected":"73 * 19","fire":true,"emitted_expr":"73 * 19","correct":true,"pos":7,"n_triggers":4,"emission":"73 * 19 = 73 * (10 + 9) = 73 * 10 + 73 * 9\n= 730 + 73 * 9\n73 * 9 = 73 * (10 - 1) = 730 - 73 = 657\nSo"},{"arm":"cot","prompt":"A factory makes 1500 widgets a day. How many in 365 days?","expected":"1500 * 365","fire":true,"emitted_expr":"1500 * 365","correct":true,"pos":69,"n_triggers":1,"emission":"1. Find the number of widgets made in a day: 1500\n2. Find the number of days: 365\n3. Multiply the number of widgets made in a day by the number of days: 1500 * 365\n\n1500 * 365 = 1500 * (300 +"},{"arm":"cot","prompt":"Each of the 48 rows has 96 seats. How many seats in total?","expected":"48 * 96","fire":true,"emitted_expr":"48 * 96","correct":true,"pos":75,"n_triggers":1,"emission":"* Each row has 96 seats.\n* There are 48 rows.\n* Total number of seats = number of rows * number of seats per row\n* Total number of seats = 48 * 96\n\nNow, let's calculate 48 * 96:\n48 * 96 = 48 * ("},{"arm":"cot","prompt":"What is the product of 407 and 311?","expected":"407 * 311","fire":true,"emitted_expr":"407 * 311","correct":true,"pos":9,"n_triggers":3,"emission":"407 * 311 = 407 * (300 + 10 + 1)\n= 407 * 300 + 407 * 10 + 407 * 1\n= 122100 + 4070 + 407\n= 122"},{"arm":"cot","prompt":"Nine hundred boxes each weigh 75 kilos. What is the total weight?","expected":"900 * 75","fire":false,"emitted_expr":null,"correct":false,"pos":-1,"n_triggers":0,"emission":"* Each box weighs 75 kilos.\n* There are 900 boxes.\n* Total weight = 900 boxes * 75 kilos/box = 67500 kilos.\n\nFinal Answer: The final answer is $\\boxed{67500}$"},{"arm":"cot","prompt":"What is 47 plus 358 plus 1200?","expected":"47 + 358 + 1200","fire":true,"emitted_expr":"47 + 358","correct":false,"pos":22,"n_triggers":3,"emission":"1. Add 47 and 358:\n 47 + 358 = 405\n\n2. Add the result to 1200:\n 405 + 1200 = 1605\n\nTherefore, 47 + 358 + 1200 = 1605."},{"arm":"cot","prompt":"Start with 999, add 111, then take away 222. What do you get?","expected":"999 + 111 - 222","fire":true,"emitted_expr":"999 + 111","correct":false,"pos":28,"n_triggers":2,"emission":"1. Start with 999.\n2. Add 111: 999 + 111 = 1110\n3. Take away 222: 1110 - 222 = 888\n\nSo the answer is 888.\n\nFinal Answer: The final answer is $\\boxed{8"},{"arm":"cot","prompt":"A bus starts with 50 passengers, then 23 get off and 12 get on. How many are aboard?","expected":"50 - 23 + 12","fire":true,"emitted_expr":"50 - 23","correct":false,"pos":47,"n_triggers":1,"emission":"* **Start:** The bus begins with 50 passengers.\n* **Get off:** 23 passengers get off, so we subtract 23 from the initial number: 50 - 23 = 27\n* **Get on:** 12 passengers get on, so we add 12 to the current number: 27 +"}],"release_cell":[{"prompt":"If you have 38 apples and pick 17 more, how many apples do you have?","overrun":false,"tail":"So, you have 55 apples.\n\nFinal Answer: The final answer is $\\boxed{55}$"},{"prompt":"What do you get when you add 123456 and 654321?","overrun":false,"tail":"Final Answer: The final answer is $\\boxed{777777}$"},{"prompt":"What is the sum of 999 and 111?","overrun":false,"tail":"Alternatively, we can add the numbers as follows:\n999\n+ 111\n-------\n1110\n\nSo, the sum of 999 and 111 is 1110.\n\nFinal Answer: The final answer is $\\boxed{1110}$"},{"prompt":"A tank holds 4500 liters and 2750 more are pumped in. How much is in the tank?","overrun":false,"tail":"So, there are 7250 liters in the tank.\n\nFinal Answer: The final answer is $\\boxed{7250}$"},{"prompt":"Tom scored 1284 points and then earned another 716. What is his total?","overrun":false,"tail":"So, his total is 2000 points.\n\nFinal Answer: The final answer is $\\boxed{2000}$"},{"prompt":"Add 87 to 246.","overrun":false,"tail":". Write down the 3 and carry over the 1 to the tens column.\n 246\n+ 87\n-------\n 3\n\n3. Add the tens column: 4 + 8 + 1 (carry-over) ="},{"prompt":"A library has 58210 books and acquires 4790 new ones. How many books now?","overrun":false,"tail":"The library now has 63000 books.\n\nFinal Answer: The final answer is $\\boxed{63000"},{"prompt":"What is 312487 increased by 96513?","overrun":false,"tail":"We can add the numbers as follows:\n 312487\n+ 96513\n---------\n 408999 + 1 = 409000\n\nSo, 312487 increased by 96513 is 409000.\n\nFinal"},{"prompt":"Take 250 away from 1000.","overrun":false,"tail":"So, 250 away from 1000 is 750.\n\nFinal Answer: The final answer is $\\boxed{750}$"}]} \ No newline at end of file diff --git a/crates/larql-compute/src/attention/decode.rs b/crates/larql-compute/src/attention/decode.rs index d2bf1ad68..50160347c 100644 --- a/crates/larql-compute/src/attention/decode.rs +++ b/crates/larql-compute/src/attention/decode.rs @@ -56,7 +56,14 @@ where out_slice .par_chunks_mut(head_dim) .enumerate() - .for_each(|(h, out_h)| { + .for_each_init( + // Per-worker scores scratch, reused across all heads this + // worker processes (and across calls — rayon workers are + // long-lived). The per-head `vec![0.0; total_len]` it + // replaces was ~480 allocs+zeroings per token at 26B sizes + // and grew with context. + Vec::::new, + |scores, (h, out_h)| { let kv_h = h / reps; let q_off = h * head_dim; let kv_off = kv_h * head_dim; @@ -64,7 +71,7 @@ where let q_row = q_new.slice(ndarray::s![0, q_off..q_off + head_dim]); let k_block = k_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); let raw: ndarray::Array1 = k_block.dot(&q_row); - let mut scores = vec![0.0f32; total_len]; + scores.resize(total_len, 0.0); for i in 0..total_len { let mut s = raw[i] * scale_f32; if let Some(cap) = softcap { @@ -91,7 +98,8 @@ where out_h.copy_from_slice( weighted_v.as_slice().expect("1-D dot output is contiguous"), ); - }); + }, + ); } out } @@ -299,6 +307,55 @@ pub fn run_attention_block_decode_step_backend( Some((h_post_attn, (k_concat, v_concat))) } +/// `LARQL_Q4K_DIRECT_ATTN=1`: route decode-step attention projections through +/// the Q4K-direct kernels (packed bytes from the index) instead of the +/// f32-BLAS path over pre-dequantised `weights.tensors`. Single source of +/// truth for the flag — `CpuBackend::attention_step` and the engine walk +/// loops (via [`run_attention_block_decode_step_auto`]) must make the same +/// choice. Cached once; never in the hot loop. +pub fn q4k_direct_attn_enabled() -> bool { + use std::sync::OnceLock; + static ON: OnceLock = OnceLock::new(); + *ON.get_or_init(|| std::env::var("LARQL_Q4K_DIRECT_ATTN").as_deref() == Ok("1")) +} + +/// Best-available decode-step attention for callers that own their cache as +/// `SharedKV` tuples (engine walk loops, the cached-generation parity +/// oracle): Q4K-direct projections (int8 under `LARQL_Q4K_ATTN_INT8`, asm +/// under `LARQL_Q4K_ASM`) when the flag is on and an index with attention +/// bytes is supplied, else the f32 path — the SAME per-layer choice +/// `CpuBackend::attention_step` makes on the dispatch path, so engines and +/// the oracle stay numerically aligned. With the flag off (default) this is +/// byte-identical to calling `run_attention_block_decode_step_backend`. +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +pub fn run_attention_block_decode_step_auto( + weights: &larql_models::ModelWeights, + h_new: &Array2, + layer: usize, + kv_entry: Option<&SharedKV>, + abs_position: usize, + backend: Option<&dyn crate::ComputeBackend>, + index: Option<&dyn crate::KvIndex>, +) -> Option<(Array2, SharedKV)> { + if q4k_direct_attn_enabled() { + if let (Some(be), Some(idx)) = (backend, index) { + if let Some(r) = run_attention_block_decode_step_q4k_direct( + weights, + h_new, + layer, + kv_entry, + abs_position, + be, + idx, + ) { + return Some(r); + } + } + } + run_attention_block_decode_step_backend(weights, h_new, layer, kv_entry, abs_position, backend) +} + /// `LARQL_Q4K_ATTN_INT8=1`: upgrade the Q4K-direct attention projections from /// the f32-activation kernels (`q4k_matvec`/`q6k_matvec` via `quant_matvec`) /// to the int8 Q8_K SDOT kernels (`q4k_q8k_matvec_into`/`q6k_q8k_matvec_into`, diff --git a/crates/larql-compute/src/attention/mod.rs b/crates/larql-compute/src/attention/mod.rs index 40febc8a3..b569fd4fd 100644 --- a/crates/larql-compute/src/attention/mod.rs +++ b/crates/larql-compute/src/attention/mod.rs @@ -61,7 +61,8 @@ pub use block::{ }; pub use decode::{ gqa_attention_decode_step, run_attention_block_decode_step, - run_attention_block_decode_step_backend, run_attention_block_decode_step_q4k_direct, + run_attention_block_decode_step_auto, run_attention_block_decode_step_backend, + run_attention_block_decode_step_q4k_direct, }; pub use gpu::{ q4_attention_proj, run_attention_block_gpu, run_attention_with_kv, diff --git a/crates/larql-compute/src/kv_dispatch/cpu.rs b/crates/larql-compute/src/kv_dispatch/cpu.rs index 6f5fdd6b1..377a56c9d 100644 --- a/crates/larql-compute/src/kv_dispatch/cpu.rs +++ b/crates/larql-compute/src/kv_dispatch/cpu.rs @@ -35,22 +35,16 @@ use crate::attention::{ }; use larql_models::ModelWeights; -/// Opt-in: route the CPU decode-step attention projections through the -/// Q4K-direct kernels (`quant_matvec` straight from the index) instead of the -/// f32-BLAS path on pre-dequantised `weights.tensors`. Gated while parity + -/// end-to-end are validated on the 26B (task #16); falls back to f32 per layer -/// when the index lacks Q4K attn bytes or a format is unsupported. -/// -/// ⚠️ SIBLING PATH: `cached_decode_step_q4k` / `CpuQ4kCacheHandle` (below, in -/// `crate::kquant_forward`) is a SECOND independent CPU Q4K attention decode. -/// Any RoPE / softcap / QK-V-norm / GQA change here must mirror there (and vice -/// versa) or the two silently diverge. Consolidate before either is load-bearing -/// — see `docs/diagnoses/q4k-direct-attention.md` §"CONSOLIDATION HAZARD". -fn q4k_direct_attn_enabled() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| std::env::var("LARQL_Q4K_DIRECT_ATTN").as_deref() == Ok("1")) -} +// Opt-in flag (`LARQL_Q4K_DIRECT_ATTN`) lives in `attention::decode` — single +// source shared with `run_attention_block_decode_step_auto` so the dispatch +// path and the engine walk loops make the same per-layer choice. +// +// ⚠️ SIBLING PATH: `cached_decode_step_q4k` / `CpuQ4kCacheHandle` (in +// `crate::kquant_forward`) is a SECOND independent CPU Q4K attention decode. +// Any RoPE / softcap / QK-V-norm / GQA change here must mirror there (and vice +// versa) or the two silently diverge. Consolidate before either is load-bearing +// — see `docs/diagnoses/q4k-direct-attention.md` §"CONSOLIDATION HAZARD". +use crate::attention::decode::q4k_direct_attn_enabled; // ─── CpuKvHandle ──────────────────────────────────────────────────────────── @@ -134,14 +128,16 @@ impl CpuKvHandle { Some((k, v)) } - /// Move the state out as an owned `SharedKV`, leaving the handle empty - /// (the f32 fallback path re-populates via `replace_state`). - fn take_shared(&mut self) -> Option { - let out = self.to_shared(); - self.k_buf.clear(); - self.v_buf.clear(); - self.rows = 0; - out + /// Remove the most recently appended row (failure-path undo: the q4k + /// attend half can in principle fail AFTER the project half appended — + /// the handle must be left exactly as before the step so engine + /// fallbacks that reuse it see consistent state). + fn pop_row(&mut self) { + if self.rows > 0 { + self.rows -= 1; + self.k_buf.truncate(self.rows * self.kv_dim); + self.v_buf.truncate(self.rows * self.kv_dim); + } } } @@ -362,7 +358,7 @@ impl KvDispatch for CpuBackend { .expect("[1, kv_dim] projection row is contiguous"); h.append_row(k_row, v_row); let (k_all, v_all) = h.views().expect("non-empty after append"); - return crate::attention::decode::decode_step_attend_q4k_direct( + match crate::attention::decode::decode_step_attend_q4k_direct( weights, query, layer, @@ -371,16 +367,26 @@ impl KvDispatch for CpuBackend { v_all, self, idx, - ); + ) { + Some(h_post_attn) => return Some(h_post_attn), + None => { + // Attend failed after the append — undo it so the f32 + // fallback (and any engine-level fallback that reuses + // this handle) sees the pre-step state, matching the + // old monolithic form's failure semantics. + cpu_handle_mut(kv).pop_row(); + } + } } } // Default (f32) path: CpuBackend reads f32 attention tensors out of // `weights.tensors`, which the caller pre-populates via // `ensure_attn_tensors_dequantised` (the up-front dequant-to-f32 tax). - // The state moves OUT of the handle (no full-cache clone) and the - // concatenated result moves back in. - let prior_kv = cpu_handle_mut(kv).take_shared(); + // The prior state is COPIED out (not moved) so a backend failure + // leaves the handle intact — same semantics as the pre-refactor + // clone, and this path is cold whenever the q4k flags are on. + let prior_kv = cpu_handle(kv).to_shared(); let (h_post_attn, new_kv) = run_attention_block_decode_step_backend( weights, query, diff --git a/crates/larql-inference/docs/specs/kv-engine-unification.md b/crates/larql-inference/docs/specs/kv-engine-unification.md index 3612b59f3..01a956eb7 100644 --- a/crates/larql-inference/docs/specs/kv-engine-unification.md +++ b/crates/larql-inference/docs/specs/kv-engine-unification.md @@ -639,3 +639,26 @@ For reviewers, the concrete file pointers this spec is built on: The 200 `larql-kv` tests passing today is the safety net for §8.2's trait widening — any widening that breaks them is wrong. + +## Addendum 2026-06-13 — the resident-path contract + +The `prefill_resident` / `decode_step_resident` trait methods (added for +task #16's Q4K-direct attention) carry the vindex so the attention step can +read packed bytes instead of f32 `weights.tensors`. Their trait DEFAULTS +drop the index and forward to the plain methods — which silently kept every +own-walk-loop engine on the f32 path while `StandardEngine` got the +2026-06-11/12 CPU fast-path arc (q4k/int8 attention, asm kernels, +append-in-place KV). + +**Contract going forward:** every cached-state engine MUST either override +`decode_step_resident` (threading the index to +`larql_compute::attention::run_attention_block_decode_step_auto`, the +single-source q4k-vs-f32 dispatcher) or forward it to a wrapped engine that +does (`boundary-kv` → inner `StandardEngine`). Intentional exceptions: +`no_cache` and `apollo` (debug / bench-only full re-forward). + +Pinned by `larql-kv::engines::resident_identity_tests` — resident and plain +paths must be bit-identical with the flags off, across 7 concrete engine +specs, with a coverage-count floor so the matrix can't silently shrink. +Prefill deliberately stays on the f32 BLAS gemm for all engines (the +prefill-q4k falsification, `docs/diagnoses/q4k-direct-attention.md`). diff --git a/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md b/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md index 759dc1b7c..acac546da 100644 --- a/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md +++ b/crates/larql-inference/docs/specs/virtual-experts/arithmetic-virtual-expert.md @@ -19,6 +19,12 @@ algorithm (bounded-depth, A4e/A5). Therefore: > once surface form is uncontrolled (template-fragility 0.58–0.67 at sizes where one > template scored 0.93); A10 measured dispatch ≥ native in every cell at equal-or-known > token cost. The model's own arithmetic output is consumed only as a verification prior. +> +> Scale note (A14, 12B): the capability WALLS are per-model — at 12B the carry wall moved +> ~7–8 → ~13–16 digits and the magnitude wall ~24–28 → 28–32, while exact-long-random +> stayed ~0 at both scales. This does NOT touch this policy: "fired ⇒ dispatch, always" is +> justified by A8's template-fragility (surface-form), which was never length-based. Any +> future length-aware variant must re-derive its thresholds on the host model. This component is also **instance #1 of the VirtualExpert trait** (§8) — the gate / extract / compute / drive / verify decomposition is intended to be reused by future @@ -57,9 +63,16 @@ Two tiers, evaluated during the prompt forward pass (which runs anyway — the t free read, not a 0.24-forward surcharge; the 0.24 framing only applies if an early-exit dispatch skips the remaining layers, which is an optional optimization, OPEN). -**Tier 0 — symbolic (explicit math).** Regex/scanner over the token stream for operator -tokens adjacent to digit spans. Cost ~0. MEASURED: fire 1.0, extraction downstream 1.0 -(A10 bare cells). +**Tier 0 — symbolic (explicit math notation).** Scanner over the prompt surface for +digit chains joined by math notation. Cost ~0. MEASURED: fire 1.0, extraction +downstream 1.0 (A10 bare cells). **Scope rule (larql-rs v0.1, adversarial-prose +measured):** tier-0 fires on *notation*, never on inferred intent — strong glyphs +(`+`, `*`, `×`, `−`) fire anywhere; ambiguous prose operators (spaced `-`, standalone +`x` — ranges, scores, shifts, dimensions, spaced dates) fire only with an explicit +trailing `=`. Everything else is the designed fallthrough: deciding whether "9 - 5" +is arithmetic is an engagement question and belongs to the model (tier-1 exhaust, or +an FR3-style explicit classify), not to surface heuristics. Adversarial prose corpus: +0 false fires (`examples/scanner_adversarial.rs`). **Tier 1 — engagement probe (disguised math). DEMOTED (A11).** Ridge probe on the L8 residual at the last prompt token, reading arithmetic-engagement exhaust (math vs @@ -140,8 +153,9 @@ The native estimator is retained as a **magnitude prior**, nothing more: - Use: after extraction, compare ALU result's magnitude (digit count, leading digit) against the model's native answer *if one was produced*, or skip. Mismatch ⇒ flag `extract_suspect`, re-extract once, surface on second failure. -- HARD RULE: the prior is void past 24-digit operands (A5 magnitude wall). Never gate a - correctness decision on it; it is a tripwire for extraction bugs only. +- HARD RULE: the prior is void past 24-digit operands (A5 magnitude wall, **4B**; at 12B + the wall measured 28–32, A14 — the void threshold is PER-MODEL, re-derive on host). + Never gate a correctness decision on it; it is a tripwire for extraction bugs only. - v0.1 status: not exercised in any assembly run (A10 pre-registered it out) — wire it but treat its thresholds as ASSUMED until an assembly increment measures the false-flag rate. @@ -202,6 +216,7 @@ weights touched, no model routing used); compute is **never** the model's. | disguised-path token overhead | ~2× (rewrite floor) | MEASURED | A8/A10 | | estimator prior envelope | ±25–35% to 24 digits; void ≥28 | MEASURED | A4c/A5 | | end-to-end demo | 24-digit add 0.92 vs native 0.00, equal tokens | MEASURED | A10 | +| forced decode, Metal Q4_K | 6/6 exact, schedule-end 6/6, ~20 tok/s e2e | MEASURED | larql-rs 2026-06-12 | ## 10. Out of scope / risks @@ -220,9 +235,14 @@ weights touched, no model routing used); compute is **never** the model's. is an audit instrument and disguised coverage is parked behind the exhaust-generality instrument science. The explicit path — the measured-1.0 path — is the product surface; native is the designed fallthrough for everything else. -5. **Quantization.** All measurements bf16/MLX. Behavior under Q4_K/Q6_K on the Metal - pipeline: ASSUMED stable for forced decode (sampler-level, quantization-independent); - probe and injection paths need one re-calibration run. +5. **Quantization.** Arc measurements bf16/MLX. Forced decode under Q4_K on the Metal + pipeline: **MEASURED** (2026-06-12, larql-rs assembly run): AT-1 6/6 exact with + schedule-end termination through the backend-routed constrained path, 345 ms–1.26 s + per item incl. prefill (24-digit add = 25 forced tokens at ~20 tok/s end-to-end) — + `bench/aim-validation/ave_demo_gemma3-4b.json`, `ave_demo --metal`. The + sampler-level argument held: the mask applies to CPU-resident logits, so the drive + is backend- and quantization-independent by construction *and now by measurement*. + Probe and injection paths still need a re-calibration run if revived. ## 11. Acceptance tests (assembly increments) diff --git a/crates/larql-inference/examples/ave_demo.rs b/crates/larql-inference/examples/ave_demo.rs index 90ae0f052..f5cc5965b 100644 --- a/crates/larql-inference/examples/ave_demo.rs +++ b/crates/larql-inference/examples/ave_demo.rs @@ -17,6 +17,13 @@ //! (`--metal` needs `--features gpu`; reruns the AT-1 forced-decode leg on //! the Metal pipeline — the spec §10.5 quantization/backend note.) //! Writes `bench/aim-validation/ave_demo_gemma3-4b.json`. +//! +//! **Showcase mode:** `ave_demo -- --prompt "what is 123456 + 654321?"` +//! runs one free prompt twice — native first (the model alone), then +//! through the expert with the state-machine trace rendered live: +//! GATE → EXTRACT → COMPUTE → DRIVE (forced tokens streaming into the +//! model's own sentence) → TERMINATE → VERIFY. The model is never asked +//! for the answer; the trace makes the invisibility property visible. use larql_inference::experts::{ave_generate_kquant, ArithmeticExpert, AveOptions}; use larql_inference::load_tokenizer; @@ -37,6 +44,9 @@ const EXPLICIT: &[(&str, &str)] = &[ ]; /// Distractors: digits present, no computation asked — gate must stay cold. +/// The second block is the adversarial-prose family (spaced ranges, scores, +/// shift idioms, dimensions, question forms) that drove the +/// notation-only tier-0 rule. const DISTRACTORS: &[&str] = &[ "My phone number is 4415550172.", "The meeting is on 2026-06-11.", @@ -44,17 +54,37 @@ const DISTRACTORS: &[&str] = &[ "Order 66 was executed in 19 BBY.", "Account 123456789012345678901234567890 is active.", "What is the capital of France?", + "It takes 5 - 10 business days.", + "I work a 9 - 5 job.", + "a 4 x 4 truck parked outside", + "Are you available 9 - 5?", + "dated 2026 - 06 - 11 in the ledger", ]; fn main() { let args: Vec = std::env::args().collect(); let use_metal = args.iter().any(|a| a == "--metal"); - let vindex = args + let showcase_prompt = args .iter() - .skip(1) - .find(|a| !a.starts_with("--")) - .cloned() - .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + .position(|a| a == "--prompt") + .and_then(|i| args.get(i + 1)) + .cloned(); + let mut vindex = "output/gemma3-4b-q4k-v2.vindex".to_string(); + let mut skip_next = false; + for a in args.iter().skip(1) { + if skip_next { + skip_next = false; + continue; + } + if a == "--prompt" { + skip_next = true; + continue; + } + if !a.starts_with("--") { + vindex = a.clone(); + break; + } + } let dir = std::path::PathBuf::from(&vindex); if !dir.exists() { eprintln!("skipped: vindex not found at {vindex}"); @@ -76,6 +106,11 @@ fn main() { let ave = ArithmeticExpert::new(); let opts = AveOptions::default(); + if let Some(prompt) = showcase_prompt { + showcase(&ave, &mut weights, &tok, &index, &prompt); + return; + } + println!("\n=== AVE assembly increment on {vindex} ==="); println!(" gate: tier-0 symbolic (no probe artifact); drive: forced decode + schedule-end termination\n"); @@ -254,6 +289,107 @@ fn main() { } } +/// Showcase: one free prompt, native first, then the expert with the +/// state-machine trace rendered live. The trace is the star — the viewer +/// should see the model never being asked. +fn showcase( + ave: &larql_inference::experts::ArithmeticExpert, + weights: &mut larql_models::ModelWeights, + tok: &tokenizers::Tokenizer, + index: &larql_vindex::VectorIndex, + prompt: &str, +) { + use larql_inference::experts::arith::drive::{force_decode_kquant_streaming, TerminationCause}; + use larql_inference::experts::VirtualExpert; + use larql_inference::vindex::generate_kquant_cpu_constrained_cached_streaming; + use std::io::Write; + + let flush = || std::io::stdout().flush().ok(); + let prompt_ids = tok + .encode(prompt, true) + .expect("encode") + .get_ids() + .to_vec(); + + println!("\n════════════════ AVE showcase ════════════════"); + + // ── The model alone ────────────────────────────────────────────── + println!("\n── native (the model alone) ──"); + print!("{prompt}"); + flush(); + let t0 = std::time::Instant::now(); + let native = generate_kquant_cpu_constrained_cached_streaming( + weights, + tok, + &prompt_ids, + 24, + index, + |_, _| {}, + |_, text| { + print!("{text}"); + flush(); + }, + ); + let native_ms = t0.elapsed().as_millis(); + let native_text: String = native.iter().map(|(t, _)| t.as_str()).collect(); + println!("\n({} tokens, {} ms)", native.len(), native_ms); + + // ── The expert ──────────────────────────────────────────────────── + println!("\n── AVE (the model is never asked) ──"); + let t = std::time::Instant::now(); + let fire = ave.gate(None, prompt); + let gate_us = t.elapsed().as_micros(); + if !fire.fired() { + println!("GATE no fire — no math notation on the prompt surface [{gate_us} µs]"); + println!(" native path untouched (the designed fallthrough)"); + return; + } + println!("GATE tier-0 fire — math notation adjacent to digit spans [{gate_us} µs]"); + + let t = std::time::Instant::now(); + let expr = ave.extract(prompt, None).expect("tier-0 fire ⇒ symbolic extract"); + let extract_us = t.elapsed().as_micros(); + println!("EXTRACT {expr} [symbolic, 0 model tokens, {extract_us} µs]"); + + let t = std::time::Instant::now(); + let answer = ave.compute(&expr); + let compute_us = t.elapsed().as_micros(); + println!("COMPUTE = {} [exact, {compute_us} µs]", answer.value); + + let schedule = ave.drive(&answer); + let schedule_ids = schedule.forced_ids(tok); + println!( + "DRIVE {}-token forced schedule at the sampler", + schedule_ids.len() + ); + + print!("{prompt}"); + flush(); + let t0 = std::time::Instant::now(); + let fd = force_decode_kquant_streaming(weights, tok, index, &prompt_ids, &schedule_ids, |_, text| { + print!("{text}"); + flush(); + }); + let drive_ms = t0.elapsed().as_millis(); + println!(); + let n = fd.ids.len(); + match fd.cause { + TerminationCause::ScheduleEnd => println!( + "TERMINATE schedule end ({n}/{n} tokens — delivery 1.0 by construction) [{drive_ms} ms, {:.0} ms/tok]", + drive_ms as f64 / n.max(1) as f64 + ), + TerminationCause::EarlyStop { at } => { + println!("TERMINATE EARLY STOP at {at}/{} — investigate", schedule_ids.len()) + } + } + + // The side-by-side produced a native answer, so the verify prior has + // something to read — the one place the model's own arithmetic is + // consumed, as a tripwire, never as the emission. + let verdict = ave.verify(&answer, Some(&native_text)); + println!("VERIFY magnitude prior vs the native attempt: {}", verdict.label()); +} + #[cfg(all(feature = "gpu", target_os = "macos"))] fn metal_backend_boxed() -> Option> { larql_compute_metal::metal_backend() diff --git a/crates/larql-inference/examples/ave_stream_trigger_probe.rs b/crates/larql-inference/examples/ave_stream_trigger_probe.rs new file mode 100644 index 000000000..54507db43 --- /dev/null +++ b/crates/larql-inference/examples/ave_stream_trigger_probe.rs @@ -0,0 +1,363 @@ +//! Stream-trigger measurement (pre-registered) — does the model's +//! spontaneous restatement reflex support mid-stream dispatch? +//! +//! The observation under test: on arithmetic word problems the model +//! reliably rewrites prose into notation (`123456 + 654321 = `) before +//! face-planting on the digits. If that reflex is frequent AND faithful, +//! stream-gating on the model's own emitted `expr =` gives the disguised +//! path with no probe, no instructed rewrite, no intent heuristics — the +//! engagement signal expressed in tokens, auditable in the transcript. +//! +//! Two arms per item — `bare` (raw completion: the spontaneous reflex) +//! and `cot` (a one-line generic step-by-step nudge: no examples, no +//! format rigging — the deployment shape). AMENDMENT NOTE: the cot arm +//! was added after the bare arm's interim fire rate (~0.4) was visible, +//! on the observation that CoT rewrites into notation; thresholds are +//! inherited unchanged and the cot arm runs blind. +//! +//! Three numbers per item, plus one release-mode cell: +//! 1. FIRE — does a trigger (`expr =`) appear within budget? +//! 2. FIDELITY — is the emitted expression the RIGHT expression +//! (scored against ground-truth operands/op — the A13b +//! expression-echo discipline applied to the trigger itself)? +//! 3. POSITION — tokens until first trigger; trigger multiplicity. +//! +//! Plus the RELEASE cell — splice the ALU answer at the trigger, release +//! the mask, count post-schedule digit overruns (the A10 ~4% mode, in +//! its mid-sentence form). +//! +//! Pre-registered branches: +//! - fire ≥ 0.8 AND fidelity ≥ 0.95 of fired → build the stream-gate. +//! - fire ≥ 0.8 AND fidelity < 0.95 → trigger = engagement +//! signal only; the payload still needs the instructed rewrite. +//! - fire < 0.5 → the reflex was the +//! prompt family talking; disguised path stays parked. +//! +//! Usage: `cargo run --release --example ave_stream_trigger_probe -- [VINDEX_DIR]` +//! Writes `bench/aim-validation/ave_stream_trigger_gemma3-4b.json`. + +use larql_inference::experts::arith::extract::find_triggers; +use larql_inference::load_tokenizer; +use larql_inference::vindex::generate_kquant_cpu_constrained_cached_streaming; + +/// (word problem, canonical expression the model SHOULD restate). +/// No notation in the prompt — these are disguised asks; tier-0 stays +/// cold on all of them by construction. +const PROBLEMS: &[(&str, &str)] = &[ + // addition, varied phrasing + ("If you have 38 apples and pick 17 more, how many apples do you have?", "38 + 17"), + ("What do you get when you add 123456 and 654321?", "123456 + 654321"), + ("What is the sum of 999 and 111?", "999 + 111"), + ("A tank holds 4500 liters and 2750 more are pumped in. How much is in the tank?", "4500 + 2750"), + ("Tom scored 1284 points and then earned another 716. What is his total?", "1284 + 716"), + ("Add 87 to 246.", "246 + 87"), + ("A library has 58210 books and acquires 4790 new ones. How many books now?", "58210 + 4790"), + ("What is 312487 increased by 96513?", "312487 + 96513"), + // subtraction + ("Sarah had 5000 dollars and spent 1234. How much does she have left?", "5000 - 1234"), + ("Take 250 away from 1000.", "1000 - 250"), + ("John is 47 and Mary is 23 years younger. How old is Mary?", "47 - 23"), + ("A warehouse stored 90000 crates and shipped 12345. How many remain?", "90000 - 12345"), + ("What is 700 minus 458?", "700 - 458"), + ("From 86420 subtract 13579.", "86420 - 13579"), + ("A flight covers 5400 km and 1750 km are already behind. How far is left?", "5400 - 1750"), + // multiplication + ("A crate holds 240 bottles. How many bottles are in 12 crates?", "240 * 12"), + ("Multiply 73 by 19.", "73 * 19"), + ("A factory makes 1500 widgets a day. How many in 365 days?", "1500 * 365"), + ("Each of the 48 rows has 96 seats. How many seats in total?", "48 * 96"), + ("What is the product of 407 and 311?", "407 * 311"), + ("Nine hundred boxes each weigh 75 kilos. What is the total weight?", "900 * 75"), + // two-op chains (multiplicity watch) + ("What is 47 plus 358 plus 1200?", "47 + 358 + 1200"), + ("Start with 999, add 111, then take away 222. What do you get?", "999 + 111 - 222"), + ("A bus starts with 50 passengers, then 23 get off and 12 get on. How many are aboard?", "50 - 23 + 12"), +]; + +/// (arm name, prompt suffix, generation budget). +const ARMS: &[(&str, &str, usize)] = &[ + ("bare", "", 64), + ("cot", "\n\nLet's work this out step by step:\n", 80), +]; + +fn main() { + let args: Vec = std::env::args().collect(); + let vindex = args + .get(1) + .cloned() + .unwrap_or_else(|| "output/gemma3-4b-q4k-v2.vindex".to_string()); + let dir = std::path::PathBuf::from(&vindex); + if !dir.exists() { + eprintln!("skipped: vindex not found at {vindex}"); + return; + } + + let mut cb = larql_vindex::SilentLoadCallbacks; + eprintln!("Loading {vindex} ..."); + let mut weights = larql_vindex::load_model_weights_kquant(&dir, &mut cb).expect("weights"); + let mut index = larql_vindex::VectorIndex::load_vindex(&dir, &mut cb).expect("index"); + index.load_interleaved_kquant(&dir).expect("interleaved"); + index.load_attn_kquant(&dir).expect("attn kquant"); + let tok = load_tokenizer(&dir).expect("tokenizer"); + + println!("\n=== stream-trigger probe on {vindex} ==="); + + let mut json_rows = String::new(); + let mut arm_summaries: Vec<(String, usize, usize, usize, usize, usize)> = Vec::new(); + + for (arm, suffix, budget) in ARMS { + println!("\n ── arm: {arm} (budget {budget} tok) ──"); + println!( + "{:<4} {:>5} {:>9} {:>6} {:>5} emitted-expr (vs expected)", + "item", "fire", "fidelity", "pos", "n_trg" + ); + + let mut fired = 0usize; + let mut faithful = 0usize; + let mut positions: Vec = Vec::new(); + let mut multi = 0usize; + + for (idx, (prompt, expected)) in PROBLEMS.iter().enumerate() { + let full_prompt = format!("{prompt}{suffix}"); + let prompt_ids = tok + .encode(full_prompt.as_str(), true) + .expect("encode") + .get_ids() + .to_vec(); + + // Stream and record the token position at which the first trigger + // completes — the same incremental read the gate would perform. + let mut emitted = String::new(); + let mut first_trigger_pos: Option = None; + let mut n_tokens = 0usize; + let out = generate_kquant_cpu_constrained_cached_streaming( + &mut weights, + &tok, + &prompt_ids, + *budget, + &index, + |_, _| {}, + |_, text| { + emitted.push_str(text); + n_tokens += 1; + if first_trigger_pos.is_none() + && text.contains('=') + && !find_triggers(&emitted).is_empty() + { + first_trigger_pos = Some(n_tokens); + } + }, + ); + let _ = out; + let triggers = find_triggers(&emitted); + let fire = !triggers.is_empty(); + let n_trg = triggers.len(); + let first_expr = triggers.first().map(|(e, _)| e.to_string()); + // Fidelity: the FIRST emitted trigger must be the ground-truth + // expression (operands and ops, exact, order-insensitive only via + // the canonical string — the harness corpus is written in the + // model's natural restatement order). + let correct = first_expr.as_deref() == Some(*expected); + + fired += usize::from(fire); + faithful += usize::from(correct); + if let Some(p) = first_trigger_pos { + positions.push(p); + } + multi += usize::from(n_trg > 1); + + println!( + "{:<4} {:>5} {:>9} {:>6} {:>5} {} (exp {})", + idx, + if fire { "✓" } else { "—" }, + if !fire { + "n/a" + } else if correct { + "✓" + } else { + "✗ WRONG" + }, + first_trigger_pos.map(|p| p.to_string()).unwrap_or_else(|| "-".into()), + n_trg, + first_expr.as_deref().unwrap_or("-"), + expected, + ); + json_rows.push_str(&format!( + "{}{{\"arm\":\"{arm}\",\"prompt\":{},\"expected\":{},\"fire\":{fire},\"emitted_expr\":{},\"correct\":{correct},\"pos\":{},\"n_triggers\":{n_trg},\"emission\":{}}}", + if json_rows.is_empty() { "" } else { "," }, + serde_json::to_string(prompt).expect("json"), + serde_json::to_string(expected).expect("json"), + serde_json::to_string(&first_expr).expect("json"), + first_trigger_pos.map(|p| p as i64).unwrap_or(-1), + serde_json::to_string(emitted.trim()).expect("json"), + )); + } + + let median_pos = { + let mut p = positions.clone(); + p.sort_unstable(); + p.get(p.len() / 2).copied().unwrap_or(0) + }; + arm_summaries.push((arm.to_string(), fired, faithful, median_pos, multi, PROBLEMS.len())); + } + + // ── Release-mode cell: splice at the trigger, release the mask, + // count post-schedule digit overruns. The splice payload is the ALU + // result of the EMITTED expression (honest end-to-end: wrong emitted + // expr → wrong splice, which fidelity already scores). ── + println!("\n ── release-mode cell (splice at trigger, release mask, watch for digit overrun) ──"); + let mut release_runs = 0usize; + let mut overruns = 0usize; + let mut release_rows = String::new(); + let cot_suffix = ARMS[1].1; + for (prompt, _expected) in PROBLEMS.iter().take(10) { + let full_prompt = format!("{prompt}{cot_suffix}"); + let prompt_ids = tok + .encode(full_prompt.as_str(), true) + .expect("encode") + .get_ids() + .to_vec(); + + // Stateful stream-gate split across the mask closure (reads) and + // the token callback (writes) — shared via RefCell since both + // borrow the same state, sequentially per step. This is the + // future controller in miniature. + #[derive(Default)] + struct GateState { + emitted: String, + schedule: Option>, + forced: usize, + done_forcing: bool, + released_tail: String, + } + let state = std::cell::RefCell::new(GateState::default()); + let tok_ref = &tok; + let out = generate_kquant_cpu_constrained_cached_streaming( + &mut weights, + &tok, + &prompt_ids, + 96, + &index, + |_generated, logits| { + let s = state.borrow(); + if s.done_forcing { + return; // released — model continues unmasked + } + if let Some(sched) = &s.schedule { + if s.forced < sched.len() { + let want = sched[s.forced]; + for (i, l) in logits.iter_mut().enumerate() { + if i as u32 != want { + *l = f32::NEG_INFINITY; + } + } + if let Some(l) = logits.get_mut(want as usize) { + if !l.is_finite() { + *l = 0.0; + } + } + } + } + }, + |_, text| { + let mut s = state.borrow_mut(); + s.emitted.push_str(text); + if s.schedule.is_none() { + if let Some((expr, _)) = find_triggers(&s.emitted).into_iter().next() { + let answer = expr.eval(); + let ids = tok_ref + .encode(format!(" {answer}").as_str(), false) + .map(|e| e.get_ids().to_vec()) + .unwrap_or_default(); + if !ids.is_empty() { + s.schedule = Some(ids); + } + } + } else if !s.done_forcing { + s.forced += 1; + if s.forced >= s.schedule.as_ref().map(|v| v.len()).unwrap_or(0) { + s.done_forcing = true; + } + } else { + s.released_tail.push_str(text); + } + }, + ); + let _ = out; + let state = state.into_inner(); + let (released_tail, done_forcing, had_schedule) = + (state.released_tail, state.done_forcing, state.schedule.is_some()); + if had_schedule && done_forcing { + release_runs += 1; + // Overrun = the released model immediately continues the + // number (first non-space char of the tail is a digit). + let overrun = released_tail + .trim_start() + .chars() + .next() + .is_some_and(|c| c.is_ascii_digit()); + overruns += usize::from(overrun); + println!( + " {:<58} overrun: {} tail: {:?}", + format!("{prompt:?}"), + if overrun { "✗ YES" } else { "✓ no" }, + released_tail.chars().take(28).collect::(), + ); + release_rows.push_str(&format!( + "{}{{\"prompt\":{},\"overrun\":{overrun},\"tail\":{}}}", + if release_rows.is_empty() { "" } else { "," }, + serde_json::to_string(prompt).expect("json"), + serde_json::to_string(released_tail.trim()).expect("json"), + )); + } else { + println!( + " {:<58} (no trigger within budget — release cell skipped)", + format!("{prompt:?}") + ); + } + } + + // ── verdict ────────────────────────────────────────────────────── + println!("\n ── verdict ──"); + let mut arm_json = String::new(); + for (arm, fired, faithful, median_pos, multi, n) in &arm_summaries { + let fire_rate = *fired as f64 / *n as f64; + let fidelity = if *fired > 0 { + *faithful as f64 / *fired as f64 + } else { + 0.0 + }; + let branch = if fire_rate >= 0.8 && fidelity >= 0.95 { + "BUILD: stream-gate on the model's own `expr =` — disguised path without probe or rewrite" + } else if fire_rate >= 0.8 { + "ENGAGEMENT-ONLY: trigger fires but emitted exprs unfaithful — payload needs the instructed rewrite" + } else if fire_rate < 0.5 { + "PARKED: restatement reflex insufficient in this arm" + } else { + "GRAY ZONE: fire rate between branches — widen the corpus before deciding" + }; + println!( + " [{arm}] fire: {fired}/{n} ({fire_rate:.2}) fidelity-of-fired: {faithful}/{fired} ({fidelity:.2}) median pos: {median_pos} tok multi-trigger: {multi}" + ); + println!(" [{arm}] branch: {branch}"); + arm_json.push_str(&format!( + "{}{{\"arm\":\"{arm}\",\"fire\":[{fired},{n}],\"fidelity_of_fired\":[{faithful},{fired}],\"median_pos\":{median_pos},\"multi_trigger\":{multi},\"branch\":{}}}", + if arm_json.is_empty() { "" } else { "," }, + serde_json::to_string(branch).expect("json"), + )); + } + println!( + " release cell (cot arm): {overruns}/{release_runs} digit overruns (guard-token mitigation clamps this by construction)" + ); + + let json = format!( + "{{\"experiment\":\"ave_stream_trigger\",\"vindex\":{},\"arms\":[{arm_json}],\"release_overruns\":[{overruns},{release_runs}],\"items\":[{json_rows}],\"release_cell\":[{release_rows}]}}", + serde_json::to_string(&vindex).expect("json"), + ); + let out_path = "bench/aim-validation/ave_stream_trigger_gemma3-4b.json"; + if let Err(e) = std::fs::write(out_path, &json) { + eprintln!("warning: could not write {out_path}: {e}"); + } else { + println!("\nwrote {out_path}"); + } +} diff --git a/crates/larql-inference/examples/scanner_adversarial.rs b/crates/larql-inference/examples/scanner_adversarial.rs new file mode 100644 index 000000000..6240854d3 --- /dev/null +++ b/crates/larql-inference/examples/scanner_adversarial.rs @@ -0,0 +1,51 @@ +//! Adversarial session against the tier-0 scanner: prose that carries +//! digits and operator-shaped characters but asks for no computation. +use larql_inference::experts::arith::extract::find_expression; + +fn main() { + let cases: &[&str] = &[ + // spaced ranges / scores / idioms — the '-' with whitespace family + "It takes 5 - 10 business days.", + "They won 3 - 1 at home.", + "I work a 9 - 5 job.", + "Open Monday - Friday, 9 - 17.", + "pages 12 - 48 cover the appendix", + "the score was 2 - 2 after extra time", + "ages 18 - 25 only", + "dated 2026 - 06 - 11 in the ledger", + // 'x' family + "a 4 x 4 truck", + "2 x 4 lumber at the yard", + "a 3 x 5 index card", + "room is 12 x 14 feet", + // '+' in prose + "I have 2 + years of experience", + "rated 4 + stars on average", + "C++ 11 added move semantics", + "call +44 7911 123456", + "she scored 1600 + on the test", + // metaphor words (MEE territory — must be inert in AVE v0.1) + "exponential growth of 300 users", + "let me go off on a tangent about 7 things", + "check the log file at line 42", + "a sine of the times, all 9 of them", + // ambiguous bare/question weak forms — now the model's territory + "9 - 5", + "what is 100 - 7?", + "Are you available 9 - 5?", + // legit math notation that MUST keep firing + "12 + 7 =", + "what is 123456 + 654321?", + "100000 - 1 =", + "12345 * 6789", + "3 x 4 =", + "47−5", + "999 + 111 - 222 =", + ]; + for c in cases { + match find_expression(c) { + Some(e) => println!("FIRE {c:<46} -> {} = {}", e, e.eval()), + None => println!(" no {c}"), + } + } +} diff --git a/crates/larql-inference/src/attention/mod.rs b/crates/larql-inference/src/attention/mod.rs index 786605dae..551b2d14b 100644 --- a/crates/larql-inference/src/attention/mod.rs +++ b/crates/larql-inference/src/attention/mod.rs @@ -32,7 +32,7 @@ pub use block::{ }; pub use decode::{ gqa_attention_decode_step, run_attention_block_decode_step, - run_attention_block_decode_step_backend, + run_attention_block_decode_step_auto, run_attention_block_decode_step_backend, }; pub use gpu::{ q4_attention_proj, run_attention_block_gpu, run_attention_with_kv, diff --git a/crates/larql-inference/src/experts/arith/drive.rs b/crates/larql-inference/src/experts/arith/drive.rs index 823200cf0..a10ec33a5 100644 --- a/crates/larql-inference/src/experts/arith/drive.rs +++ b/crates/larql-inference/src/experts/arith/drive.rs @@ -15,7 +15,7 @@ use larql_models::ModelWeights; use larql_vindex::VectorIndex; use tokenizers::Tokenizer; -use crate::vindex::generate_kquant_cpu_constrained_cached; +use crate::vindex::generate_kquant_cpu_constrained_cached_streaming; /// Why the forced decode stopped. #[derive(Debug, Clone, PartialEq, Eq)] @@ -58,6 +58,23 @@ pub fn force_decode_kquant( prompt_ids: &[u32], schedule: &[u32], ) -> ForcedDecode { + force_decode_kquant_streaming(weights, tokenizer, index, prompt_ids, schedule, |_, _| {}) +} + +/// Streaming sibling of [`force_decode_kquant`]: `on_token(id, text)` fires +/// as each forced token decodes — the showcase path, where the splice is +/// rendered live into the model's own sentence. +pub fn force_decode_kquant_streaming( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + index: &VectorIndex, + prompt_ids: &[u32], + schedule: &[u32], + on_token: F, +) -> ForcedDecode +where + F: FnMut(u32, &str), +{ if schedule.is_empty() { return ForcedDecode { emitted: String::new(), @@ -66,7 +83,7 @@ pub fn force_decode_kquant( }; } let sched = schedule.to_vec(); - let out = generate_kquant_cpu_constrained_cached( + let out = generate_kquant_cpu_constrained_cached_streaming( weights, tokenizer, prompt_ids, @@ -89,6 +106,7 @@ pub fn force_decode_kquant( } } }, + on_token, ); let ids: Vec = out.iter().map(|(_, id)| *id).collect(); diff --git a/crates/larql-inference/src/experts/arith/extract.rs b/crates/larql-inference/src/experts/arith/extract.rs index cb0609f10..8581a6c11 100644 --- a/crates/larql-inference/src/experts/arith/extract.rs +++ b/crates/larql-inference/src/experts/arith/extract.rs @@ -13,40 +13,67 @@ use super::alu::{BigInt, Expr, Op}; /// One lexed token of the prompt surface. `Other` breaks operand/operator -/// adjacency so unrelated numbers never join into an expression. +/// adjacency so unrelated numbers never join into an expression. Spans are +/// char indices into the input, used for the weak-chain cue check. #[derive(Debug, Clone, PartialEq, Eq)] enum Tok { - Num(String), - Op(Op), + Num { digits: String, start: usize, end: usize }, + Op { op: Op, weak: bool }, Other, } -/// Scan `text` for the longest explicit integer chain `N op N (op N)*`. +/// Scan `text` for the longest explicit integer chain `N op N (op N)*` +/// written in *math notation* — tier-0 stays symbolic and dumb by design. /// -/// Operator rules (distractor protection — gate specificity is the contract): -/// - `+`, `*`, `×`, `−` (U+2212) count anywhere between digit spans; -/// - ASCII `-` counts only with whitespace on both sides, so dates -/// (`2026-06-11`), ranges (`5-10`) and phone formats never fire; -/// - `x`/`X` counts only as a standalone word between digit spans (`3 x 4`); -/// - `/` never counts — division is OPEN in v0.1 and `06/11` is a date. +/// Operator rules (gate specificity is the contract; intent inference is +/// NOT tier-0's job — ambiguous surface forms are the model's territory, +/// and no-fire ⇒ native is the designed fallthrough, spec §3): +/// - **Strong** operators — `+`, `*`, `×`, `−` (U+2212) — are math glyphs +/// wherever they appear; a chain containing any of them fires bare. +/// - **Weak** operators — ASCII `-` with whitespace on both sides, and +/// standalone `x`/`X` — are ordinary prose syntax (ranges `5 - 10`, +/// scores `3 - 1`, shifts `9 - 5`, spaced dates `2026 - 06 - 11`, +/// dimensions `4 x 4`). A chain whose operators are ALL weak fires only +/// when followed by an explicit `=` — the one cue that is itself math +/// notation. (`?` is sentence punctuation, not notation: "Are you +/// available 9 - 5?" must never fire.) Everything else falls through to +/// native untouched. Adversarial prose corpus: 0 false fires +/// (`scanner_adversarial` example). +/// - Unspaced `-` never counts (dates `2026-06-11`, ranges `5-10`, phones); +/// `/` never counts — division is OPEN in v0.1 and `06/11` is a date. /// /// Numbers absorb `1,234,567`-style thousands separators and `_` separators. pub fn find_expression(text: &str) -> Option { - let toks = lex(text); - let mut best: Option<(usize, usize)> = None; // (start, op_count) + find_expression_with_policy(text, true) +} + +/// `require_notation_cue = false` relaxes the weak-chain `=` requirement — +/// used by [`parse_rewrite`] only, where the line being parsed is a +/// model-emitted expression by instruction (the rewrite prompt IS the +/// cue), not user prose. +fn find_expression_with_policy(text: &str, require_notation_cue: bool) -> Option { + let chars: Vec = text.chars().collect(); + let toks = lex(&chars); + // Collect every maximal chain, then pick the longest QUALIFYING one — + // a weak unqualified range earlier in the text must not shadow a real + // expression later in it. + let mut best: Option<(usize, usize)> = None; // (start tok idx, op_count) let mut i = 0; while i < toks.len() { - if matches!(toks[i], Tok::Num(_)) { + if matches!(toks[i], Tok::Num { .. }) { let mut j = i; let mut ops = 0usize; - while matches!(toks.get(j + 1), Some(Tok::Op(_))) - && matches!(toks.get(j + 2), Some(Tok::Num(_))) + while matches!(toks.get(j + 1), Some(Tok::Op { .. })) + && matches!(toks.get(j + 2), Some(Tok::Num { .. })) { ops += 1; j += 2; } - if ops > 0 && best.map(|(_, b)| ops > b).unwrap_or(true) { + if ops > 0 + && (!require_notation_cue || chain_qualifies(&chars, &toks, i, ops)) + && best.map(|(_, b)| ops > b).unwrap_or(true) + { best = Some((i, ops)); } i = j + 1; @@ -56,15 +83,19 @@ pub fn find_expression(text: &str) -> Option { } let (start, op_count) = best?; + build_expr(&toks, start, op_count) +} + +fn build_expr(toks: &[Tok], start: usize, op_count: usize) -> Option { let mut operands = Vec::with_capacity(op_count + 1); let mut ops = Vec::with_capacity(op_count); for k in 0..=op_count { - let Tok::Num(s) = &toks[start + 2 * k] else { + let Tok::Num { digits, .. } = &toks[start + 2 * k] else { return None; }; - operands.push(BigInt::parse(s)?); + operands.push(BigInt::parse(digits)?); if k < op_count { - let Tok::Op(op) = &toks[start + 2 * k + 1] else { + let Tok::Op { op, .. } = &toks[start + 2 * k + 1] else { return None; }; ops.push(*op); @@ -73,18 +104,87 @@ pub fn find_expression(text: &str) -> Option { Some(Expr { operands, ops }) } -fn lex(text: &str) -> Vec { +/// Stream-trigger scan: every maximal chain immediately followed by `=` +/// (whitespace allowed) — the moment a generating model has restated a +/// problem in notation and positioned its cursor at the answer slot. +/// Returns `(expr, char index just past the '=')` per trigger, in text +/// order. The trailing `=` IS the cue, so weak-operator chains qualify +/// here (the model writing `9 - 5 =` is notation by its own hand). +/// +/// This is the read primitive for mid-stream dispatch; the +/// `ave_stream_trigger_probe` harness measures whether the model's +/// spontaneous restatement reflex is frequent and faithful enough to +/// gate on (fire rate × emitted-expression fidelity × position). +pub fn find_triggers(text: &str) -> Vec<(Expr, usize)> { let chars: Vec = text.chars().collect(); + let toks = lex(&chars); + let mut out = Vec::new(); + + let mut i = 0; + while i < toks.len() { + if matches!(toks[i], Tok::Num { .. }) { + let mut j = i; + let mut ops = 0usize; + while matches!(toks.get(j + 1), Some(Tok::Op { .. })) + && matches!(toks.get(j + 2), Some(Tok::Num { .. })) + { + ops += 1; + j += 2; + } + if ops > 0 { + let span_end = match &toks[j] { + Tok::Num { end, .. } => *end, + _ => unreachable!("chain ends on Num"), + }; + let eq_pos = chars[span_end..] + .iter() + .position(|c| !c.is_whitespace()) + .map(|off| span_end + off) + .filter(|p| chars[*p] == '='); + if let (Some(p), Some(expr)) = (eq_pos, build_expr(&toks, i, ops)) { + out.push((expr, p + 1)); + } + } + i = j + 1; + } else { + i += 1; + } + } + out +} + +/// Strong-op chains qualify bare; all-weak chains qualify only when the +/// next non-whitespace char after the chain is `=` (see +/// [`find_expression`] docs). +fn chain_qualifies(chars: &[char], toks: &[Tok], start: usize, op_count: usize) -> bool { + let has_strong = (0..op_count).any(|k| { + matches!(toks[start + 2 * k + 1], Tok::Op { weak: false, .. }) + }); + if has_strong { + return true; + } + let span_end = match &toks[start + 2 * op_count] { + Tok::Num { end, .. } => *end, + _ => return false, + }; + chars[span_end..] + .iter() + .find(|c| !c.is_whitespace()) + .is_some_and(|c| *c == '=') +} + +fn lex(chars: &[char]) -> Vec { let mut toks = Vec::new(); let mut i = 0; while i < chars.len() { let c = chars[i]; if c.is_ascii_digit() { - let mut num = String::new(); + let start = i; + let mut digits = String::new(); while i < chars.len() { let c = chars[i]; if c.is_ascii_digit() { - num.push(c); + digits.push(c); i += 1; } else if (c == ',' || c == '_') && chars.get(i + 1).is_some_and(|n| n.is_ascii_digit()) @@ -95,7 +195,11 @@ fn lex(text: &str) -> Vec { break; } } - toks.push(Tok::Num(num)); + toks.push(Tok::Num { + digits, + start, + end: i, + }); continue; } if c.is_whitespace() { @@ -105,15 +209,15 @@ fn lex(text: &str) -> Vec { let ws_before = i == 0 || chars[i - 1].is_whitespace(); let ws_after = i + 1 >= chars.len() || chars[i + 1].is_whitespace(); let op = match c { - '+' => Some(Op::Add), - '*' | '×' => Some(Op::Mul), - '−' => Some(Op::Sub), - '-' if ws_before && ws_after => Some(Op::Sub), - 'x' | 'X' if ws_before && ws_after => Some(Op::Mul), + '+' => Some((Op::Add, false)), + '*' | '×' => Some((Op::Mul, false)), + '−' => Some((Op::Sub, false)), + '-' if ws_before && ws_after => Some((Op::Sub, true)), + 'x' | 'X' if ws_before && ws_after => Some((Op::Mul, true)), _ => None, }; match op { - Some(op) => toks.push(Tok::Op(op)), + Some((op, weak)) => toks.push(Tok::Op { op, weak }), None => toks.push(Tok::Other), } i += 1; @@ -137,10 +241,13 @@ pub fn rewrite_prompt(question: &str) -> String { /// Parse the model-emitted rewrite. First emitted line only, truncated at /// `=` so the model's own sum — if it volunteers one — is never consumed. +/// Weak-operator chains parse bare here: the rewrite instruction is the +/// notation cue, so `10 - 4` on the rewrite line is an expression, not a +/// range. pub fn parse_rewrite(generated: &str) -> Option { let line = generated.trim_start().lines().next()?; let line = line.split(['=', '\u{ff1d}']).next().unwrap_or(line); - find_expression(line) + find_expression_with_policy(line, false) } #[cfg(test)] @@ -160,7 +267,7 @@ mod tests { ); assert_eq!(parse("12345 * 6789"), Some("12345 * 6789 -> 83810205".into())); assert_eq!(parse("12×34"), Some("12 * 34 -> 408".into())); - assert_eq!(parse("3 x 4"), Some("3 * 4 -> 12".into())); + assert_eq!(parse("3 x 4 ="), Some("3 * 4 -> 12".into())); assert_eq!(parse("100000 - 1 ="), Some("100000 - 1 -> 99999".into())); assert_eq!(parse("47−5"), Some("47 - 5 -> 42".into())); } @@ -211,17 +318,74 @@ mod tests { } #[test] - fn hyphen_needs_whitespace_both_sides() { - assert!(find_expression("100-1").is_none()); - assert!(find_expression("100- 1").is_none()); - assert!(find_expression("100 -1").is_none()); - assert!(find_expression("100 - 1").is_some()); + fn hyphen_needs_whitespace_both_sides_and_equals_notation() { + assert!(find_expression("100-1 =").is_none(), "unspaced never counts"); + assert!(find_expression("100- 1 =").is_none()); + assert!(find_expression("100 -1 =").is_none()); + assert!(find_expression("100 - 1 =").is_some()); } #[test] - fn x_must_be_standalone() { + fn x_must_be_standalone_and_notation_cued() { assert!(find_expression("3x4").is_none(), "3x4 could be a label"); - assert!(find_expression("matrix 3 x 4").is_some()); + // Standalone x is a WEAK operator: only explicit `=` notation + // fires it — bare forms and prose dimensions are the model's + // territory (designed fallthrough). + assert!(find_expression("3 x 4 =").is_some(), "= notation"); + assert!(find_expression("3 x 4").is_none(), "bare → native"); + assert!(find_expression("matrix 3 x 4").is_none(), "prose dimension"); + assert!(find_expression("what is 3 x 4?").is_none(), "? is not notation"); + } + + #[test] + fn weak_chains_fire_only_on_explicit_equals_notation() { + // The adversarial-prose corpus: ranges, scores, idioms, spaced + // dates, dimensions, question forms — all carried digits around a + // spaced hyphen or standalone x and all used to fire (e.g. + // "9 - 5 job" → 4, "dated 2026 - 06 - 11" → 2009). Tier-0 does + // not infer intent: no `=`, no fire. + for text in [ + "It takes 5 - 10 business days.", + "They won 3 - 1 at home.", + "I work a 9 - 5 job.", + "Open Monday - Friday, 9 - 17.", + "pages 12 - 48 cover the appendix", + "the score was 2 - 2 after extra time", + "ages 18 - 25 only", + "dated 2026 - 06 - 11 in the ledger", + "a 4 x 4 truck", + "2 x 4 lumber at the yard", + "a 3 x 5 index card", + "room is 12 x 14 feet", + "Are you available 9 - 5?", + "9 - 5", + "100 - 1", + "what is 100 - 7?", + ] { + assert!( + find_expression(text).is_none(), + "weak chain fired without `=` notation: {text:?}" + ); + } + // Explicit `=` notation fires. + assert!(find_expression("100000 - 1 =").is_some(), "= notation"); + assert!(find_expression("9 - 5 =").is_some(), "= notation"); + // Any strong glyph in the chain is notation wherever it appears. + assert!( + find_expression("she computed 999 + 111 - 222 quickly").is_some(), + "strong + in chain" + ); + assert!( + find_expression("what is 123456 + 654321?").is_some(), + "question form rides the strong op, not the ?" + ); + } + + #[test] + fn qualifying_chain_wins_over_earlier_unqualified_range() { + // The prose range must not shadow the real expression behind it. + let e = find_expression("ages 18 - 25 only, so 12 + 7 =").expect("expr"); + assert_eq!(e.to_string(), "12 + 7"); } #[test] @@ -257,4 +421,34 @@ mod tests { assert!(parse_rewrite("I cannot rewrite that.").is_none()); assert!(parse_rewrite("").is_none()); } + + #[test] + fn find_triggers_reads_chains_at_their_equals_marker() { + // The canonical restatement shape: prose, then `expr = `. + let t = find_triggers("Sure!\n\n123456 + 654321 = "); + assert_eq!(t.len(), 1); + assert_eq!(t[0].0.to_string(), "123456 + 654321"); + // Weak ops qualify here — the model's own `=` is the cue. + let t = find_triggers("so 9 - 5 = "); + assert_eq!(t.len(), 1); + assert_eq!(t[0].0.to_string(), "9 - 5"); + // No `=` yet → no trigger (mid-restatement). + assert!(find_triggers("123456 + 654321").is_empty()); + // Multiple triggers arrive in text order (the chained case). + let t = find_triggers("12 + 7 = 19\nthen 19 * 2 = "); + assert_eq!(t.len(), 2); + assert_eq!(t[0].0.to_string(), "12 + 7"); + assert_eq!(t[1].0.to_string(), "19 * 2"); + // Plain prose with `=` but no chain stays silent. + assert!(find_triggers("x = y in the limit").is_empty()); + } + + #[test] + fn find_triggers_position_points_past_the_equals() { + let text = "ok 12 + 7 = "; + let t = find_triggers(text); + assert_eq!(t.len(), 1); + let chars: Vec = text.chars().collect(); + assert_eq!(chars[t[0].1 - 1], '=', "index is just past the '='"); + } } diff --git a/crates/larql-inference/src/experts/arith/verify.rs b/crates/larql-inference/src/experts/arith/verify.rs index de721d3f7..6b4344746 100644 --- a/crates/larql-inference/src/experts/arith/verify.rs +++ b/crates/larql-inference/src/experts/arith/verify.rs @@ -32,7 +32,7 @@ pub fn magnitude_prior( let Some(text) = native_text else { return Verdict::Skipped; }; - let Some(native) = first_number(text) else { + let Some(native) = native_answer_number(text) else { return Verdict::Skipped; }; @@ -57,6 +57,21 @@ pub fn magnitude_prior( } } +/// The number the native text offers as its ANSWER. Models typically +/// restate the operands before answering ("123456 + 654321 = 777777"), so +/// the first number in the text is usually an echo, not the answer — that +/// misread false-flagged the showcase. The model's own `=` is its answer +/// marker: take the first number after the LAST `=`; fall back to the +/// first number in the text when no marked answer exists. +fn native_answer_number(text: &str) -> Option { + if let Some(idx) = text.rfind('=') { + if let Some(n) = first_number(&text[idx + 1..]) { + return Some(n); + } + } + first_number(text) +} + /// First decimal number in free text (optional leading `-`, separators /// stripped). The native answer may arrive embedded in prose. fn first_number(text: &str) -> Option { @@ -173,4 +188,27 @@ mod tests { assert_eq!(first_number("12-5"), Some(big("12"))); assert_eq!(first_number(""), None); } + + #[test] + fn native_answer_reads_after_the_models_own_equals_marker() { + // The showcase false-flag: the model restates operands before + // answering — the answer is the number after its `=`, not the + // first number in the text. + assert_eq!( + magnitude_prior(&big("777777"), Some("123456 + 654321 = 777777"), 6), + Verdict::Consistent + ); + // Trailing `=` with nothing after it (the model starting a new + // problem) falls back to the first number — which IS the answer + // in the "19\n12 - 7 =" continuation shape. + assert_eq!( + magnitude_prior(&big("19"), Some("19\n12 - 7 ="), 2), + Verdict::Consistent + ); + // No `=` at all: first number, as before. + assert_eq!( + magnitude_prior(&big("42"), Some("The answer is 42."), 2), + Verdict::Consistent + ); + } } diff --git a/crates/larql-inference/src/test_utils.rs b/crates/larql-inference/src/test_utils.rs index 36ef58969..f82c7d397 100644 --- a/crates/larql-inference/src/test_utils.rs +++ b/crates/larql-inference/src/test_utils.rs @@ -559,8 +559,8 @@ pub use larql_models::test_fixtures::{make_gemma3_test_weights, make_starcoder2_ // `crate::test_utils::*` callers. pub use larql_models::test_fixtures::{ arc_mmap_from_bytes, make_test_q4k_weights, make_test_q4k_weights_layers, - make_test_q4k_weights_silu, Q4K_TEST_HIDDEN, Q4K_TEST_INTER, Q4K_TEST_NUM_LAYERS, - Q4K_TEST_VOCAB, + make_test_q4k_weights_rope_scaled, make_test_q4k_weights_silu, Q4K_TEST_HIDDEN, + Q4K_TEST_INTER, Q4K_TEST_NUM_LAYERS, Q4K_TEST_VOCAB, }; /// Build a fully-populated synthetic `VectorIndex` that satisfies the /// cached + direct-matvec decode contract on the Q4_K weights from diff --git a/crates/larql-inference/src/vindex/kquant_forward/cached.rs b/crates/larql-inference/src/vindex/kquant_forward/cached.rs index 1bdd6e19a..1310e8664 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/cached.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/cached.rs @@ -1215,6 +1215,103 @@ mod tests { ); } + /// Scaled-RoPE regression: on a Gemma-3 arch with linear + /// `rope_scaling` (position divisor 8 on the global layer), the + /// direct step must still track the staged step. Pre-2026-06-12 the + /// direct path roped Q/K with the UNSCALED `apply_rope_partial_at` — + /// no position divisor, no llama3 scaling — so on any rope-scaled + /// config the global layer's K landed at 8× the position the prefill + /// cache used. The non-scaled fixtures can't see that gap; this one + /// exists to. + #[test] + fn predict_kquant_decode_step_direct_tracks_staged_on_rope_scaled_arch() { + use crate::test_utils::{make_test_q4k_vindex, make_test_q4k_weights_rope_scaled}; + + let mut weights_a = make_test_q4k_weights_rope_scaled(); + // Guard: the fixture must actually parse into a divisor-8 global + // layer — otherwise this test silently stops testing anything. + let scaled_layers: Vec = (0..weights_a.num_layers) + .filter(|&l| weights_a.arch.rope_position_divisor_for_layer(l) == 8.0) + .collect(); + assert!( + !scaled_layers.is_empty(), + "fixture drift: no layer carries rope position divisor 8 — \ + the rope_scaling config no longer parses as global-only linear" + ); + let index = make_test_q4k_vindex(&weights_a); + assert!( + supports_direct_matvec_decode(&weights_a, &index), + "rope-scaled fixture must support the direct-matvec path" + ); + + // Prompt long enough that the scaled position (pos/8) and the + // unscaled position differ by a large rotary angle. + let token_ids = vec![1u32, 2, 3, 4, 5]; + let next = 6u32; + + let (_, mut cache_a, _) = + predict_kquant_prefill(&mut weights_a, &token_ids, &index); + let (h_staged, _) = predict_kquant_decode_step( + &mut weights_a, + next, + &index, + &mut cache_a, + token_ids.len(), + ) + .expect("staged step"); + + let mut weights_b = make_test_q4k_weights_rope_scaled(); + let (_, mut cache_b, _) = + predict_kquant_prefill(&mut weights_b, &token_ids, &index); + let backend = CpuBackend; + let h_direct = predict_kquant_decode_step_direct( + &mut weights_b, + next, + &index, + &backend, + &mut cache_b, + token_ids.len(), + ) + .expect("direct step"); + + // Primary assertion: the K row each path APPENDED to the cache. + // RoPE is relative — if the direct step ropes both new-Q and + // new-K at the wrong scale, their geometry to each other is + // preserved and the hidden state barely moves on a bland random + // fixture. The appended K row is the object the divisor rotates, + // and it must match the staged row at every layer (most of all + // the divisor-8 global layers). + for layer in 0..weights_a.num_layers { + let (k_a, _) = cache_a[layer].as_ref().expect("staged cache"); + let (k_b, _) = cache_b[layer].as_ref().expect("direct cache"); + let ra = k_a.row(k_a.nrows() - 1); + let rb = k_b.row(k_b.nrows() - 1); + let dot: f32 = ra.iter().zip(rb.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = ra.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = rb.iter().map(|x| x * x).sum::().sqrt(); + let cos = dot / (na * nb); + assert!( + cos > 0.999, + "appended K row diverged at layer {layer} (divisor {}): cosine {cos} — \ + check the rope divisor / llama3 scaling in attention_decode_step_native", + weights_a.arch.rope_position_divisor_for_layer(layer) + ); + } + + // Secondary: the hidden state still tracks. + let a = h_staged.row(0); + let b = h_direct.row(0); + let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let na: f32 = a.iter().map(|x| x * x).sum::().sqrt(); + let nb: f32 = b.iter().map(|x| x * x).sum::().sqrt(); + let cos = dot / (na * nb); + assert!( + cos > 0.999, + "direct step hidden diverged from staged on the rope-scaled arch \ + (global layers {scaled_layers:?}): cosine {cos}" + ); + } + #[test] fn predict_kquant_decode_step_direct_returns_finite_hidden() { let mut fx = Q4KTestFixtures::build(); diff --git a/crates/larql-inference/src/vindex/kquant_forward/generation.rs b/crates/larql-inference/src/vindex/kquant_forward/generation.rs index 2867fc3c9..643e5b207 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/generation.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/generation.rs @@ -128,15 +128,42 @@ pub fn generate_kquant_cpu_cached( /// full recompute. Falls back to the naive loop when cached decode is /// unsupported for the arch. pub fn generate_kquant_cpu_constrained_cached( + weights: &mut ModelWeights, + tokenizer: &Tokenizer, + prompt_ids: &[u32], + max_tokens: usize, + index: &VectorIndex, + mask_fn: M, +) -> Vec<(String, u32)> +where + M: FnMut(&[u32], &mut Vec), +{ + generate_kquant_cpu_constrained_cached_streaming( + weights, + tokenizer, + prompt_ids, + max_tokens, + index, + mask_fn, + |_, _| {}, + ) +} + +/// Streaming-callback sibling of [`generate_kquant_cpu_constrained_cached`]: +/// fires `on_token(id, text)` after each pick so callers can render tokens +/// as they decode (the showcase/demo path). +pub fn generate_kquant_cpu_constrained_cached_streaming( weights: &mut ModelWeights, tokenizer: &Tokenizer, prompt_ids: &[u32], max_tokens: usize, index: &VectorIndex, mut mask_fn: M, + mut on_token: F, ) -> Vec<(String, u32)> where M: FnMut(&[u32], &mut Vec), + F: FnMut(u32, &str), { if !super::cached::supports_cached_decode(weights) { return generate_kquant_cpu_constrained( @@ -189,6 +216,7 @@ where } let tok = tokenizer.decode(&[id], true).unwrap_or_default(); let stop = eos.is_eos_with_tokenizer(id, &tok, tokenizer); + on_token(id, &tok); out.push((tok, id)); generated.push(id); if stop || step + 1 == max_tokens { diff --git a/crates/larql-inference/src/vindex/kquant_forward/mod.rs b/crates/larql-inference/src/vindex/kquant_forward/mod.rs index 4d8e48873..8b786271f 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/mod.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/mod.rs @@ -29,9 +29,9 @@ pub use cached::{ pub(crate) use generation::generate_kquant_cpu_constrained_streaming_sampled_with_eos; pub use generation::{ generate_kquant_cpu, generate_kquant_cpu_cached, generate_kquant_cpu_constrained, - generate_kquant_cpu_constrained_cached, generate_kquant_cpu_constrained_streaming, - generate_kquant_cpu_constrained_streaming_sampled, generate_kquant_cpu_remote, - is_end_of_turn, predict_kquant, + generate_kquant_cpu_constrained_cached, generate_kquant_cpu_constrained_cached_streaming, + generate_kquant_cpu_constrained_streaming, generate_kquant_cpu_constrained_streaming_sampled, + generate_kquant_cpu_remote, is_end_of_turn, predict_kquant, }; pub use hidden::{moe_ffn_block_cpu, moe_ffn_block_cpu_with_index, predict_kquant_hidden}; pub use hooks::predict_kquant_hidden_hooked; diff --git a/crates/larql-inference/src/vindex/mod.rs b/crates/larql-inference/src/vindex/mod.rs index c012635d5..a506d568b 100644 --- a/crates/larql-inference/src/vindex/mod.rs +++ b/crates/larql-inference/src/vindex/mod.rs @@ -17,7 +17,7 @@ pub use kquant_forward::{ attention_decode_step_native, ffn_decode_step_native, fused_decode_step, fused_decode_step_with_state, fused_prefill, generate_kquant_cpu, generate_kquant_cpu_cached, generate_kquant_cpu_constrained, generate_kquant_cpu_constrained_cached, - generate_kquant_cpu_constrained_streaming, + generate_kquant_cpu_constrained_cached_streaming, generate_kquant_cpu_constrained_streaming, generate_kquant_cpu_constrained_streaming_sampled, generate_kquant_cpu_remote, insert_q4k_layer_tensors, is_end_of_turn, kquant_ffn_forward_layer, kquant_ffn_forward_layer_q8k, moe_ffn_block_cpu, moe_ffn_block_cpu_with_index, predict_kquant, diff --git a/crates/larql-kv/ROADMAP.md b/crates/larql-kv/ROADMAP.md index 71b36baa8..1c649c968 100644 --- a/crates/larql-kv/ROADMAP.md +++ b/crates/larql-kv/ROADMAP.md @@ -1,5 +1,38 @@ # Roadmap — larql-kv +## CPU resident fast-path — all engines pluggable into it (2026-06-13) + +The 2026-06-11/12 CPU fast-path arc (Q4K-direct + int8 attention, q4k +lm_head/dense residency, hand-asm kernels, KV append-in-place — see +`bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json`) initially landed +only on `StandardEngine`: the `KvEngine::decode_step_resident` trait default +DROPPED the index (`let _ = index`), so every own-walk-loop engine stayed on +f32 attention. **Fixed:** + +- New single-source dispatcher + `larql_compute::attention::run_attention_block_decode_step_auto` — makes + the same q4k-direct-vs-f32 per-layer choice as + `CpuBackend::attention_step`, for callers that own `SharedKV` caches. +- `markov-rs`, `markov-rs-codec`, `turbo-quant`, `unlimited-context`, + `boundary_per_layer` now override `decode_step_resident` and thread the + vindex down their walk loops to `_auto`. `boundary-kv` forwards both + resident methods to its inner `StandardEngine` (was silently dropping to + the f32 path). `no_cache`/`apollo` keep the default by design (debug / + bench-only full re-forward). +- Regression pin: `engines::resident_identity_tests` — for 7 concrete + engine specs, `prefill/decode_step_resident` must be BIT-IDENTICAL to + `prefill/decode_step` with the flags off, and the covered-engine count + must not shrink. +- Measured effect (26B, flags on, within-run ratios vs standard — absolute + numbers pending a quiet machine): turbo 0.64×→0.85×, unlimited + 0.76×→1.07×; markov/codec/boundary_per_layer flat — their cost is their + own recompute/codec machinery (the feature), not the attention path; + markov's walk already tries the kquant-native attention helper first. + +Prefill stays on the f32 BLAS gemm for all engines deliberately (the task +#16 prefill falsification: q4k repeated-matvec loses ~20× to AMX at +prefill shapes). + ## Hardening — codebase review 2026-05-28 From the whole-codebase review ([`docs/audits/codebase-review-2026-05-28.md`](../../../docs/audits/codebase-review-2026-05-28.md)): diff --git a/crates/larql-kv/src/engines/boundary_kv/engine.rs b/crates/larql-kv/src/engines/boundary_kv/engine.rs index ef10a180e..f8cc2e3f6 100644 --- a/crates/larql-kv/src/engines/boundary_kv/engine.rs +++ b/crates/larql-kv/src/engines/boundary_kv/engine.rs @@ -217,6 +217,48 @@ impl KvEngine for BoundaryKvEngine { Ok(hidden) } + /// Resident-path prefill: forwards to the inner `StandardEngine`'s + /// resident form (threads `index` → Q4K-direct attention family) and + /// keeps the boundary frame emission identical to [`Self::prefill`]. + fn prefill_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_inference::larql_vindex::VectorIndex, + token_ids: &[u32], + ) -> Result, EngineError> { + if token_ids.is_empty() { + return Err(EngineError::EmptyPrompt); + } + let hidden = self.inner.prefill_resident(weights, ffn, index, token_ids)?; + self.abs_position = token_ids.len(); + if self.maybe_emit_frame(weights, &hidden).is_err() { + return Err(EngineError::BackendFailure { + details: "boundary frame emit failed".into(), + }); + } + Ok(hidden) + } + + /// Resident-path decode: forwards to the inner `StandardEngine`'s + /// resident form; frame emission identical to [`Self::decode_step`]. + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_inference::larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + let hidden = self.inner.decode_step_resident(weights, ffn, index, token_id)?; + self.abs_position += 1; + if self.maybe_emit_frame(weights, &hidden).is_err() { + return Err(EngineError::BackendFailure { + details: "boundary frame emit failed".into(), + }); + } + Ok(hidden) + } + fn memory_bytes(&self) -> usize { self.inner.memory_bytes() } diff --git a/crates/larql-kv/src/engines/boundary_per_layer/engine.rs b/crates/larql-kv/src/engines/boundary_per_layer/engine.rs index 68abcd12f..4ae411f77 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/engine.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/engine.rs @@ -135,6 +135,38 @@ impl BoundaryPerLayerEngine { } } +impl BoundaryPerLayerEngine { + /// Shared body for `decode_step` / `decode_step_resident`. + fn decode_step_impl( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + token_id: u32, + index: Option<&larql_vindex::VectorIndex>, + ) -> Result, EngineError> { + let rs = self + .store + .take() + .ok_or_else(|| EngineError::InvariantViolation { + what: "decode_step called before prefill (store missing)".into(), + })?; + let (hidden, new_rs) = walk::run_decode( + weights, + ffn, + self.backend.as_ref(), + &self.policy, + rs, + token_id, + index, + ) + .ok_or_else(|| EngineError::BackendFailure { + details: "walk::run_decode returned None".into(), + })?; + self.store = Some(new_rs); + Ok(hidden) + } +} + impl KvEngine for BoundaryPerLayerEngine { fn name(&self) -> &str { "boundary-per-layer" @@ -188,25 +220,19 @@ impl KvEngine for BoundaryPerLayerEngine { ffn: &dyn FfnBackend, token_id: u32, ) -> Result, EngineError> { - let rs = self - .store - .take() - .ok_or_else(|| EngineError::InvariantViolation { - what: "decode_step called before prefill (store missing)".into(), - })?; - let (hidden, new_rs) = walk::run_decode( - weights, - ffn, - self.backend.as_ref(), - &self.policy, - rs, - token_id, - ) - .ok_or_else(|| EngineError::BackendFailure { - details: "walk::run_decode returned None".into(), - })?; - self.store = Some(new_rs); - Ok(hidden) + self.decode_step_impl(weights, ffn, token_id, None) + } + + /// Resident-path decode: threads `index` to the attention step's + /// Q4K-direct route (the non-standard-engine structural-gap fix). + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + self.decode_step_impl(weights, ffn, token_id, Some(index)) } fn memory_bytes(&self) -> usize { diff --git a/crates/larql-kv/src/engines/boundary_per_layer/walk.rs b/crates/larql-kv/src/engines/boundary_per_layer/walk.rs index bd949ddc3..66445e066 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/walk.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/walk.rs @@ -11,7 +11,7 @@ use larql_compute::ComputeBackend; use larql_inference::attention::{ - run_attention_block_decode_step_backend, run_attention_with_kv_backend, SharedKV, + run_attention_with_kv_backend, SharedKV, }; use larql_inference::ffn::FfnBackend; use larql_inference::forward::embed_tokens_pub; @@ -86,6 +86,7 @@ pub(super) fn run_prefill( /// Run one decode step through the dense walk. Consumes `rs`, returns /// the new store alongside the hidden output. +#[allow(clippy::too_many_arguments)] pub(super) fn run_decode( weights: &ModelWeights, ffn: &dyn FfnBackend, @@ -93,6 +94,7 @@ pub(super) fn run_decode( policy: &BoundaryLayerPolicy, mut rs: RsStorePerLayer, token_id: u32, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<(Array2, RsStorePerLayer)> { let num_layers = weights.num_layers; let abs_position = rs.next_position; @@ -141,13 +143,14 @@ pub(super) fn run_decode( new_stored.push(h_new.clone()); - let (h_post_attn, _new_kv) = run_attention_block_decode_step_backend( + let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, Some(&(k_full, v_full)), abs_position, Some(backend), + index.map(|v| v as &dyn larql_compute::KvIndex), )?; let h_out = crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, ffn, Some(ffn)); @@ -251,7 +254,7 @@ mod tests { let (_, rs) = run_prefill(&weights, &ffn, &backend, &policy, Some(4), &[0]).unwrap(); assert!(rs.cold_encoded.is_none()); - let (hidden, rs_after) = run_decode(&weights, &ffn, &backend, &policy, rs, 1).unwrap(); + let (hidden, rs_after) = run_decode(&weights, &ffn, &backend, &policy, rs, 1, None).unwrap(); assert_eq!(hidden.shape(), &[1, weights.hidden_size]); assert_eq!(rs_after.next_position, 2); for slab in &rs_after.stored { @@ -285,7 +288,7 @@ mod tests { assert!(rs.cold_encoded.as_ref().unwrap()[0].n_positions > 0); let _ = ColdResidualCodec::Bf16; // keep import live - let (hidden, _) = run_decode(&weights, &ffn, &backend, &policy, rs, 3) + let (hidden, _) = run_decode(&weights, &ffn, &backend, &policy, rs, 3, None) .expect("decode should succeed without cold_kv"); assert_eq!(hidden.shape(), &[1, weights.hidden_size]); } @@ -306,7 +309,7 @@ mod tests { .unwrap_or(0); assert_eq!(initial, 1); - let (_, rs_after) = run_decode(&weights, &ffn, &backend, &policy, rs, 3).unwrap(); + let (_, rs_after) = run_decode(&weights, &ffn, &backend, &policy, rs, 3, None).unwrap(); let after = rs_after .cold_encoded .as_ref() diff --git a/crates/larql-kv/src/engines/markov_residual/compute.rs b/crates/larql-kv/src/engines/markov_residual/compute.rs index 5229e5dee..8489ae252 100644 --- a/crates/larql-kv/src/engines/markov_residual/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual/compute.rs @@ -10,7 +10,7 @@ use super::store::RsStore; use crate::profiler::EngineProfiler; use larql_inference::attention::SharedKV; use larql_inference::attention::{ - apply_rope_partial_at, run_attention_block_decode_step_backend, run_attention_with_kv_backend, + apply_rope_partial_at, run_attention_with_kv_backend, }; use larql_inference::ffn::BackendFfn; use larql_inference::forward::{add_bias, apply_norm, embed_tokens_pub}; @@ -152,10 +152,12 @@ pub fn rs_decode_step( rs: RsStore, backend: &dyn ComputeBackend, moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<(Array2, RsStore)> { - rs_decode_step_inner(weights, new_token_id, rs, backend, None, moe_ffn) + rs_decode_step_inner(weights, new_token_id, rs, backend, None, moe_ffn, index) } +#[allow(clippy::too_many_arguments)] pub(crate) fn rs_decode_step_profiled( weights: &ModelWeights, new_token_id: u32, @@ -163,8 +165,9 @@ pub(crate) fn rs_decode_step_profiled( backend: &dyn ComputeBackend, profiler: &mut EngineProfiler, moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<(Array2, RsStore)> { - rs_decode_step_inner(weights, new_token_id, rs, backend, Some(profiler), moe_ffn) + rs_decode_step_inner(weights, new_token_id, rs, backend, Some(profiler), moe_ffn, index) } #[allow(clippy::too_many_arguments)] @@ -175,6 +178,7 @@ fn rs_decode_step_inner( backend: &dyn ComputeBackend, mut profiler: Option<&mut EngineProfiler>, moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<(Array2, RsStore)> { use std::time::Instant; @@ -257,13 +261,14 @@ fn rs_decode_step_inner( } else { None }; - let (h_post_attn, _new_kv) = run_attention_block_decode_step_backend( + let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, Some(&(k_full, v_full)), abs_position, Some(backend), + index.map(|v| v as &dyn larql_compute::KvIndex), )?; if let Some(t) = t_attn { attention_us += t.elapsed().as_secs_f64() * 1e6; @@ -995,7 +1000,7 @@ mod tests { let weights = make_test_weights(); let prefill = rs_prefill(&weights, &[0u32], None, &CpuBackend, None); let (h, _) = - rs_decode_step(&weights, 1, prefill.store, &CpuBackend, None).expect("decode step"); + rs_decode_step(&weights, 1, prefill.store, &CpuBackend, None, None).expect("decode step"); assert_eq!(h.shape(), &[1, weights.hidden_size]); assert!(h.iter().all(|v| v.is_finite())); } @@ -1005,9 +1010,9 @@ mod tests { let weights = make_test_weights(); let prefill = rs_prefill(&weights, &[0u32, 1], None, &CpuBackend, None); assert_eq!(prefill.store.next_position, 2); - let (_, rs2) = rs_decode_step(&weights, 2, prefill.store, &CpuBackend, None).unwrap(); + let (_, rs2) = rs_decode_step(&weights, 2, prefill.store, &CpuBackend, None, None).unwrap(); assert_eq!(rs2.next_position, 3); - let (_, rs3) = rs_decode_step(&weights, 3, rs2, &CpuBackend, None).unwrap(); + let (_, rs3) = rs_decode_step(&weights, 3, rs2, &CpuBackend, None, None).unwrap(); assert_eq!(rs3.next_position, 4); } @@ -1023,14 +1028,14 @@ mod tests { prefill.store.cold_kv.is_some(), "expected cold_kv to be set" ); - let (h, rs2) = rs_decode_step(&weights, 4, prefill.store, &CpuBackend, None) + let (h, rs2) = rs_decode_step(&weights, 4, prefill.store, &CpuBackend, None, None) .expect("decode_step over cold_kv"); assert_eq!(h.shape(), &[1, weights.hidden_size]); assert!(h.iter().all(|v| v.is_finite())); // After overflow merges into cold_residuals, cold_kv is cleared // (compute.rs line 260) so a second decode exercises the // cold_residuals-only branch (lines 149-160). - let (h2, _) = rs_decode_step(&weights, 5, rs2, &CpuBackend, None) + let (h2, _) = rs_decode_step(&weights, 5, rs2, &CpuBackend, None, None) .expect("decode_step over cold_residuals"); assert_eq!(h2.shape(), &[1, weights.hidden_size]); assert!(h2.iter().all(|v| v.is_finite())); @@ -1089,7 +1094,7 @@ mod tests { assert!(prefill.store.cold_kv.is_some()); let mut profiler = EngineProfiler::default(); let result = - rs_decode_step_profiled(&weights, 4, prefill.store, &CpuBackend, &mut profiler, None); + rs_decode_step_profiled(&weights, 4, prefill.store, &CpuBackend, &mut profiler, None, None); assert!(result.is_some()); // Profiler must record positive durations across all stages. assert!(profiler.recompute_hot.count > 0); @@ -1106,13 +1111,13 @@ mod tests { // cold_kv (compute.rs line 260); second hits the cold_residuals // branch (lines 149-160) under profiling. let prefill = rs_prefill(&weights, &[0u32, 1, 2, 3], Some(2), &CpuBackend, None); - let (_, rs2) = rs_decode_step(&weights, 4, prefill.store, &CpuBackend, None).unwrap(); + let (_, rs2) = rs_decode_step(&weights, 4, prefill.store, &CpuBackend, None, None).unwrap(); assert!( rs2.cold_kv.is_none(), "cold_kv should be cleared after overflow" ); let mut profiler = EngineProfiler::default(); - let result = rs_decode_step_profiled(&weights, 5, rs2, &CpuBackend, &mut profiler, None); + let result = rs_decode_step_profiled(&weights, 5, rs2, &CpuBackend, &mut profiler, None, None); assert!(result.is_some()); // cold_residuals branch exercises recompute_cold counter (line 171). assert!(profiler.recompute_cold.count > 0); @@ -1385,7 +1390,7 @@ mod tests { max_window: None, cold_len: 0, }; - let result = rs_decode_step(&weights, 0, store, &CpuBackend, None); + let result = rs_decode_step(&weights, 0, store, &CpuBackend, None, None); assert!(result.is_some()); let (h, _) = result.unwrap(); assert_eq!(h.shape(), &[1, weights.hidden_size]); diff --git a/crates/larql-kv/src/engines/markov_residual/engine.rs b/crates/larql-kv/src/engines/markov_residual/engine.rs index d8c865975..8fb9302bf 100644 --- a/crates/larql-kv/src/engines/markov_residual/engine.rs +++ b/crates/larql-kv/src/engines/markov_residual/engine.rs @@ -61,6 +61,46 @@ impl MarkovResidualEngine { // additional `impl MarkovResidualEngine` block. They mutate the // `pub(super)` fields above. +impl MarkovResidualEngine { + /// Shared body for `decode_step` / `decode_step_resident` — `index` + /// reaches the attention step's Q4K-direct route when present. + fn decode_step_impl( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + token_id: u32, + index: Option<&larql_vindex::VectorIndex>, + ) -> Result, EngineError> { + let rs = self + .store + .take() + .ok_or_else(|| EngineError::InvariantViolation { + what: "decode_step called before prefill (store missing)".into(), + })?; + let (hidden, new_rs) = if self.profiling { + rs_decode_step_profiled( + weights, + token_id, + rs, + self.backend.as_ref(), + &mut self.profile, + Some(ffn), + index, + ) + .ok_or_else(|| EngineError::BackendFailure { + details: "rs_decode_step_profiled returned None".into(), + })? + } else { + rs_decode_step(weights, token_id, rs, self.backend.as_ref(), Some(ffn), index) + .ok_or_else(|| EngineError::BackendFailure { + details: "rs_decode_step returned None".into(), + })? + }; + self.store = Some(new_rs); + Ok(hidden) + } +} + impl KvEngine for MarkovResidualEngine { fn name(&self) -> &str { "markov-rs" @@ -110,33 +150,21 @@ impl KvEngine for MarkovResidualEngine { ffn: &dyn FfnBackend, token_id: u32, ) -> Result, EngineError> { - let rs = self - .store - .take() - .ok_or_else(|| EngineError::InvariantViolation { - what: "decode_step called before prefill (store missing)".into(), - })?; - let (hidden, new_rs) = if self.profiling { - rs_decode_step_profiled( - weights, - token_id, - rs, - self.backend.as_ref(), - &mut self.profile, - Some(ffn), - ) - .ok_or_else(|| EngineError::BackendFailure { - details: "rs_decode_step_profiled returned None".into(), - })? - } else { - rs_decode_step(weights, token_id, rs, self.backend.as_ref(), Some(ffn)).ok_or_else( - || EngineError::BackendFailure { - details: "rs_decode_step returned None".into(), - }, - )? - }; - self.store = Some(new_rs); - Ok(hidden) + self.decode_step_impl(weights, ffn, token_id, None) + } + + /// Resident-path decode: threads `index` down to the walk's attention + /// step so the Q4K-direct projections (`LARQL_Q4K_DIRECT_ATTN` family) + /// can fire — the structural gap that left non-standard engines on the + /// f32 path after the standard engine got the fast attention. + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + self.decode_step_impl(weights, ffn, token_id, Some(index)) } fn memory_bytes(&self) -> usize { diff --git a/crates/larql-kv/src/engines/markov_residual/walk.rs b/crates/larql-kv/src/engines/markov_residual/walk.rs index 6eb287af4..612519a2d 100644 --- a/crates/larql-kv/src/engines/markov_residual/walk.rs +++ b/crates/larql-kv/src/engines/markov_residual/walk.rs @@ -296,13 +296,14 @@ pub(super) fn rs_decode_step_walk( } } let (h_post_attn, new_kv_full) = native_result.or_else(|| { - larql_inference::attention::run_attention_block_decode_step_backend( + larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, Some(&kv_pair), abs_position, Some(backend), + Some(index as &dyn larql_compute::KvIndex), ) })?; if let Some(t) = t_attn { diff --git a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs index a7f03af93..c76094a40 100644 --- a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs @@ -7,7 +7,7 @@ use larql_compute::ComputeBackend; use larql_inference::attention::{ - run_attention_block_decode_step_backend, run_attention_with_kv_backend, SharedKV, + run_attention_with_kv_backend, SharedKV, }; use larql_inference::ffn::BackendFfn; use larql_inference::forward::embed_tokens_pub; @@ -99,6 +99,7 @@ pub fn rs_decode_step_codec( rs: RsStoreCodec, backend: &dyn ComputeBackend, moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<(Array2, RsStoreCodec)> { let num_layers = weights.num_layers; let abs_position = rs.next_position; @@ -148,13 +149,14 @@ pub fn rs_decode_step_codec( new_stored.push(h_new.clone()); - let (h_post_attn, _new_kv) = run_attention_block_decode_step_backend( + let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, Some(&(k_full, v_full)), abs_position, Some(backend), + index.map(|v| v as &dyn larql_compute::KvIndex), )?; let bffn = BackendFfn { weights, backend }; @@ -302,7 +304,7 @@ mod tests { None, ); assert_eq!(prefill.store.next_position, 2); - let (_, rs2) = rs_decode_step_codec(&weights, 2, prefill.store, &CpuBackend, None).unwrap(); + let (_, rs2) = rs_decode_step_codec(&weights, 2, prefill.store, &CpuBackend, None, None).unwrap(); assert_eq!(rs2.next_position, 3); } @@ -318,7 +320,7 @@ mod tests { None, ); assert!(prefill.store.cold_kv.is_some()); - let (h, _) = rs_decode_step_codec(&weights, 4, prefill.store, &CpuBackend, None).unwrap(); + let (h, _) = rs_decode_step_codec(&weights, 4, prefill.store, &CpuBackend, None, None).unwrap(); assert_eq!(h.shape(), &[1, weights.hidden_size]); assert!(h.iter().all(|v| v.is_finite())); } @@ -336,10 +338,10 @@ mod tests { &CpuBackend, None, ); - let (_, rs2) = rs_decode_step_codec(&weights, 4, prefill.store, &CpuBackend, None).unwrap(); + let (_, rs2) = rs_decode_step_codec(&weights, 4, prefill.store, &CpuBackend, None, None).unwrap(); // Second decode: cold_kv was cleared by overflow at the first decode, // so this step exercises the cold_encoded recompute branch. - let (h, _) = rs_decode_step_codec(&weights, 5, rs2, &CpuBackend, None).unwrap(); + let (h, _) = rs_decode_step_codec(&weights, 5, rs2, &CpuBackend, None, None).unwrap(); assert_eq!(h.shape(), &[1, weights.hidden_size]); assert!(h.iter().all(|v| v.is_finite())); } diff --git a/crates/larql-kv/src/engines/markov_residual_codec/engine.rs b/crates/larql-kv/src/engines/markov_residual_codec/engine.rs index 714a25104..1e7391862 100644 --- a/crates/larql-kv/src/engines/markov_residual_codec/engine.rs +++ b/crates/larql-kv/src/engines/markov_residual_codec/engine.rs @@ -89,6 +89,31 @@ impl MarkovResidualCodecEngine { // mutate `store` / `kv_handle` / `abs_position` / `profile` (all // `pub(super)`). +impl MarkovResidualCodecEngine { + /// Shared body for `decode_step` / `decode_step_resident`. + fn decode_step_impl( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + token_id: u32, + index: Option<&larql_vindex::VectorIndex>, + ) -> Result, EngineError> { + let rs = self + .store + .take() + .ok_or_else(|| EngineError::InvariantViolation { + what: "decode_step called before prefill (store missing)".into(), + })?; + let (hidden, new_rs) = + rs_decode_step_codec(weights, token_id, rs, self.backend.as_ref(), Some(ffn), index) + .ok_or_else(|| EngineError::BackendFailure { + details: "rs_decode_step_codec returned None".into(), + })?; + self.store = Some(new_rs); + Ok(hidden) + } +} + impl KvEngine for MarkovResidualCodecEngine { fn name(&self) -> &str { "markov-rs-codec" @@ -140,19 +165,19 @@ impl KvEngine for MarkovResidualCodecEngine { ffn: &dyn FfnBackend, token_id: u32, ) -> Result, EngineError> { - let rs = self - .store - .take() - .ok_or_else(|| EngineError::InvariantViolation { - what: "decode_step called before prefill (store missing)".into(), - })?; - let (hidden, new_rs) = - rs_decode_step_codec(weights, token_id, rs, self.backend.as_ref(), Some(ffn)) - .ok_or_else(|| EngineError::BackendFailure { - details: "rs_decode_step_codec returned None".into(), - })?; - self.store = Some(new_rs); - Ok(hidden) + self.decode_step_impl(weights, ffn, token_id, None) + } + + /// Resident-path decode: threads `index` to the attention step's + /// Q4K-direct route (the non-standard-engine structural-gap fix). + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + self.decode_step_impl(weights, ffn, token_id, Some(index)) } fn memory_bytes(&self) -> usize { diff --git a/crates/larql-kv/src/engines/markov_residual_codec/walk.rs b/crates/larql-kv/src/engines/markov_residual_codec/walk.rs index 18925234a..0d28ed94e 100644 --- a/crates/larql-kv/src/engines/markov_residual_codec/walk.rs +++ b/crates/larql-kv/src/engines/markov_residual_codec/walk.rs @@ -10,7 +10,7 @@ use larql_compute::ComputeBackend; use larql_inference::attention::{ - run_attention_block_decode_step_backend, run_attention_with_kv_backend, SharedKV, + run_attention_with_kv_backend, SharedKV, }; use larql_inference::forward::{embed_tokens_pub, run_ffn}; use larql_inference::model::ModelWeights; @@ -224,13 +224,14 @@ pub fn rs_decode_step_codec_walk( abs_position, ); let (h_post_attn, new_kv_full) = native_result.or_else(|| { - run_attention_block_decode_step_backend( + larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, Some(&kv_pair), abs_position, Some(backend), + Some(index as &dyn larql_compute::KvIndex), ) })?; if let Some(t) = t_attn { diff --git a/crates/larql-kv/src/engines/mod.rs b/crates/larql-kv/src/engines/mod.rs index 3c3bb5e85..2cbd728de 100644 --- a/crates/larql-kv/src/engines/mod.rs +++ b/crates/larql-kv/src/engines/mod.rs @@ -201,3 +201,80 @@ mod layer_ffn_or_moe_tests { assert_eq!(a.shape(), &[2, 1]); } } + +#[cfg(test)] +mod resident_identity_tests { + //! Structural-gap pin (2026-06-13): every pluggable engine overrides + //! `decode_step_resident` (or forwards it, for wrappers) so the vindex + //! reaches the attention step's Q4K-direct route. With the flags OFF — + //! the default test environment — the resident path must be + //! BIT-IDENTICAL to the plain path for every engine; a fork here means + //! an engine's resident override drifted from its decode_step. + + use crate::EngineKind; + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + + #[test] + fn every_engine_decode_step_resident_matches_decode_step_flag_off() { + // Concrete specs (parameterised kinds need real params). Excluded: + // apollo (bench-only, full re-forward by design; resident default = + // forward to decode_step is the documented intent) and boundary-kv + // (frame emission needs an archive sink; its resident forwarding is + // a thin wrapper over StandardEngine — pinned here — plus its own + // frame tests). + let specs = [ + "standard", + "standard:window=4", + "no-cache", + "markov-rs", + "markov-rs-codec", + "turbo-quant", + "unlimited-context:window=4", + ]; + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let ffn = NullFfn; + let prompt = [0u32, 1, 2]; + + let mut tested = 0usize; + for spec in specs { + let Some(kind) = EngineKind::from_name(spec) else { + panic!("spec {spec:?} no longer parses — update this pin"); + }; + let mut plain = kind.clone().build(larql_inference::cpu_engine_backend()); + let mut resident = kind.build(larql_inference::cpu_engine_backend()); + + let h_plain = plain + .prefill(&weights, &ffn, &prompt) + .unwrap_or_else(|e| panic!("{spec}: prefill failed: {e:?}")); + let h_res = resident + .prefill_resident(&weights, &ffn, &index, &prompt) + .unwrap_or_else(|e| panic!("{spec}: prefill_resident failed: {e:?}")); + assert_eq!( + h_plain.iter().map(|v| v.to_bits()).collect::>(), + h_res.iter().map(|v| v.to_bits()).collect::>(), + "{spec}: prefill outputs diverged with flags off" + ); + + for (step, tok) in [3u32, 4, 5].into_iter().enumerate() { + let d_plain = plain + .decode_step(&weights, &ffn, tok) + .unwrap_or_else(|e| panic!("{spec}: decode_step failed: {e:?}")); + let d_res = resident + .decode_step_resident(&weights, &ffn, &index, tok) + .unwrap_or_else(|e| panic!("{spec}: decode_step_resident failed: {e:?}")); + assert_eq!( + d_plain.iter().map(|v| v.to_bits()).collect::>(), + d_res.iter().map(|v| v.to_bits()).collect::>(), + "{spec}: decode outputs diverged at step {step} with flags off" + ); + } + tested += 1; + } + assert!( + tested >= 7, + "engine coverage shrank ({tested} < 7) — no silent caps" + ); + } +} diff --git a/crates/larql-kv/src/engines/turbo_quant/engine.rs b/crates/larql-kv/src/engines/turbo_quant/engine.rs index 64f9a6774..cb94b2383 100644 --- a/crates/larql-kv/src/engines/turbo_quant/engine.rs +++ b/crates/larql-kv/src/engines/turbo_quant/engine.rs @@ -308,6 +308,83 @@ impl TurboQuantEngine { // additional `impl TurboQuantEngine` block. They mutate the // `pub(super)` fields above. +impl TurboQuantEngine { + /// Shared body for `decode_step` / `decode_step_resident`. + fn decode_step_impl( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + token_id: u32, + index: Option<&larql_vindex::VectorIndex>, + ) -> Result, EngineError> { + let num_layers = weights.num_layers; + let abs_position = self.abs_position; + let mut h = embed_tokens_pub(weights, &[token_id]); + + for layer in 0..num_layers { + // Decompress full prior K/V for attention. + let prior_kv = self.layers[layer].decompress(&self.tq); + + // Decode step returns updated K/V (prior + new token). + let (h_post_attn, updated_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h, + layer, + Some(&prior_kv), + abs_position, + Some(self.backend.as_ref()), + index.map(|v| v as &dyn larql_compute::KvIndex), + ) + .ok_or_else(|| EngineError::BackendFailure { + details: "run_attention_block_decode_step_backend returned None".into(), + })?; + + // Append-only codec path: encode just the new row head-by- + // head and push onto the existing compressed buffer. + let arch = &*weights.arch; + let kv_dim = arch.num_kv_heads_for_layer(layer) * arch.head_dim_for_layer(layer); + let head_dim = detect_head_dim(kv_dim); + let layer_slot = &mut self.layers[layer]; + let new_rows = updated_kv.0.shape()[0]; + let k_last = updated_kv.0.row(new_rows - 1).to_owned(); + let v_last = updated_kv.1.row(new_rows - 1).to_owned(); + let mut scratch_f32: Vec = Vec::new(); + let mut scratch_u8: Vec = Vec::new(); + for chunk in k_last.as_slice().expect("k row contig").chunks(head_dim) { + self.tq.encode_vector_into( + chunk, + &mut layer_slot.compressed_k, + &mut scratch_f32, + &mut scratch_u8, + ); + } + for chunk in v_last.as_slice().expect("v row contig").chunks(head_dim) { + self.tq.encode_vector_into( + chunk, + &mut layer_slot.compressed_v, + &mut scratch_f32, + &mut scratch_u8, + ); + } + layer_slot.num_vecs = new_rows; + layer_slot.kv_dim = kv_dim; + layer_slot.head_dim = head_dim; + + let bffn = BackendFfn { + weights, + backend: self.backend.as_ref(), + }; + let h_out = + crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, &bffn, Some(ffn)); + h = h_out; + } + + self.abs_position += 1; + Ok(last_row(&h)) + } +} + impl KvEngine for TurboQuantEngine { fn name(&self) -> &str { "turbo-quant" @@ -368,71 +445,22 @@ impl KvEngine for TurboQuantEngine { ffn: &dyn FfnBackend, token_id: u32, ) -> Result, EngineError> { - let num_layers = weights.num_layers; - let abs_position = self.abs_position; - let mut h = embed_tokens_pub(weights, &[token_id]); - - for layer in 0..num_layers { - // Decompress full prior K/V for attention. - let prior_kv = self.layers[layer].decompress(&self.tq); - - // Decode step returns updated K/V (prior + new token). - let (h_post_attn, updated_kv) = run_attention_block_decode_step_backend( - weights, - &h, - layer, - Some(&prior_kv), - abs_position, - Some(self.backend.as_ref()), - ) - .ok_or_else(|| EngineError::BackendFailure { - details: "run_attention_block_decode_step_backend returned None".into(), - })?; - - // Append-only codec path: encode just the new row head-by- - // head and push onto the existing compressed buffer. - let arch = &*weights.arch; - let kv_dim = arch.num_kv_heads_for_layer(layer) * arch.head_dim_for_layer(layer); - let head_dim = detect_head_dim(kv_dim); - let layer_slot = &mut self.layers[layer]; - let new_rows = updated_kv.0.shape()[0]; - let k_last = updated_kv.0.row(new_rows - 1).to_owned(); - let v_last = updated_kv.1.row(new_rows - 1).to_owned(); - let mut scratch_f32: Vec = Vec::new(); - let mut scratch_u8: Vec = Vec::new(); - for chunk in k_last.as_slice().expect("k row contig").chunks(head_dim) { - self.tq.encode_vector_into( - chunk, - &mut layer_slot.compressed_k, - &mut scratch_f32, - &mut scratch_u8, - ); - } - for chunk in v_last.as_slice().expect("v row contig").chunks(head_dim) { - self.tq.encode_vector_into( - chunk, - &mut layer_slot.compressed_v, - &mut scratch_f32, - &mut scratch_u8, - ); - } - layer_slot.num_vecs = new_rows; - layer_slot.kv_dim = kv_dim; - layer_slot.head_dim = head_dim; - - let bffn = BackendFfn { - weights, - backend: self.backend.as_ref(), - }; - let h_out = - crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, &bffn, Some(ffn)); - h = h_out; - } + self.decode_step_impl(weights, ffn, token_id, None) + } - self.abs_position += 1; - Ok(last_row(&h)) + /// Resident-path decode: threads `index` to the attention step's + /// Q4K-direct route (the non-standard-engine structural-gap fix). + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + self.decode_step_impl(weights, ffn, token_id, Some(index)) } + fn memory_bytes(&self) -> usize { self.layers.iter().map(|l| l.memory_bytes()).sum() } diff --git a/crates/larql-kv/src/engines/unlimited_context/engine.rs b/crates/larql-kv/src/engines/unlimited_context/engine.rs index 60daed223..ed0f69a44 100644 --- a/crates/larql-kv/src/engines/unlimited_context/engine.rs +++ b/crates/larql-kv/src/engines/unlimited_context/engine.rs @@ -130,13 +130,26 @@ impl UnlimitedContextEngine { weights: &ModelWeights, tokens: &[u32], moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + ) -> Option<()> { + self.process_with_index(weights, tokens, moe_ffn, None) + } + + /// `process` with an optional vindex threaded to the per-token attention + /// steps (Q4K-direct route under `LARQL_Q4K_DIRECT_ATTN` — the + /// non-standard-engine structural-gap fix). + pub fn process_with_index( + &mut self, + weights: &ModelWeights, + tokens: &[u32], + moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<()> { let mut remaining = tokens; while !remaining.is_empty() { let free = self.window_size - self.current_window_tokens.len(); let take = remaining.len().min(free); let (chunk, rest) = remaining.split_at(take); - self.extend_current(weights, chunk, moe_ffn)?; + self.extend_current(weights, chunk, moe_ffn, index)?; remaining = rest; if self.current_window_tokens.len() >= self.window_size { self.close_window(); @@ -178,6 +191,7 @@ impl UnlimitedContextEngine { abs_offset, self.backend.as_ref(), None, + None, )?; let abs_end = abs_offset + tokens.len() - 1; Some((out.kv_cache, abs_end)) @@ -306,6 +320,7 @@ impl UnlimitedContextEngine { weights: &ModelWeights, chunk: &[u32], moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option<()> { if chunk.is_empty() { return Some(()); @@ -332,6 +347,7 @@ impl UnlimitedContextEngine { abs_start, self.backend.as_ref(), moe_ffn, + index, )?; self.last_hidden = Some(out.last_hidden); @@ -481,6 +497,26 @@ impl KvEngine for UnlimitedContextEngine { }) } + /// Resident-path decode: threads `index` to the per-token attention + /// steps' Q4K-direct route (the non-standard-engine structural-gap fix). + fn decode_step_resident( + &mut self, + weights: &ModelWeights, + ffn: &dyn FfnBackend, + index: &larql_vindex::VectorIndex, + token_id: u32, + ) -> Result, EngineError> { + self.process_with_index(weights, &[token_id], Some(ffn), Some(index)) + .ok_or_else(|| EngineError::BackendFailure { + details: "process returned None during decode_step".into(), + })?; + self.last_hidden + .clone() + .ok_or_else(|| EngineError::BackendFailure { + details: "last_hidden missing after decode_step".into(), + }) + } + fn memory_bytes(&self) -> usize { self.checkpoints.total_bytes() + self.archive.total_bytes() + self.current_kv_bytes() } diff --git a/crates/larql-kv/src/engines/unlimited_context/extend.rs b/crates/larql-kv/src/engines/unlimited_context/extend.rs index ff2322f0a..4059db8e6 100644 --- a/crates/larql-kv/src/engines/unlimited_context/extend.rs +++ b/crates/larql-kv/src/engines/unlimited_context/extend.rs @@ -39,6 +39,7 @@ pub fn rs_extend_from_checkpoint( abs_start, &larql_compute::CpuBackend, None, + None, ) } @@ -55,6 +56,7 @@ pub fn rs_extend_from_checkpoint_backend( abs_start: usize, backend: &dyn ComputeBackend, moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, ) -> Option { let num_layers = weights.num_layers; @@ -79,13 +81,14 @@ pub fn rs_extend_from_checkpoint_backend( None }; - let (h_post_attn, new_kv) = run_attention_block_decode_step_backend( + let (h_post_attn, new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h, layer, kv_entry, abs_position, Some(backend), + index.map(|v| v as &dyn larql_compute::KvIndex), )?; let bffn = BackendFfn { weights, backend }; From 201d545797a46aaec88d5d4ff2487a5353e6f6ed Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Sat, 13 Jun 2026 21:24:44 +0100 Subject: [PATCH 4/8] spin performance --- ROADMAP_STATUS.md | 5 +- crates/larql-compute/src/attention/decode.rs | 168 +++--- crates/larql-compute/src/cpu/mod.rs | 1 + .../larql-compute/src/cpu/ops/moe/forward.rs | 213 ++++--- crates/larql-compute/src/cpu/ops/q4_common.rs | 56 +- .../larql-compute/src/cpu/ops/q4k_q8k_dot.rs | 82 ++- crates/larql-compute/src/cpu/spin_pool.rs | 537 ++++++++++++++++++ .../src/kquant_forward/cached.rs | 65 +-- crates/larql-compute/src/options.rs | 150 +++-- .../src/forward/predict/dense.rs | 50 +- crates/larql-inference/src/vindex/dequant.rs | 23 + .../src/vindex/kquant_forward/cached.rs | 65 +-- .../src/vindex/kquant_forward/hidden.rs | 7 +- crates/larql-kv/ROADMAP.md | 61 +- crates/larql-kv/src/engines/mod.rs | 30 + .../src/engines/unlimited_context/engine.rs | 21 +- crates/larql-kv/src/generation.rs | 7 +- 17 files changed, 1124 insertions(+), 417 deletions(-) create mode 100644 crates/larql-compute/src/cpu/spin_pool.rs diff --git a/ROADMAP_STATUS.md b/ROADMAP_STATUS.md index a915a462d..ea6259ff3 100644 --- a/ROADMAP_STATUS.md +++ b/ROADMAP_STATUS.md @@ -4,10 +4,13 @@ Canonical rollup for the next execution slice. Keep the detailed design in `ROADMAP.md` and crate-local roadmaps; use this file to answer "what is active now?" without rereading every crate document. -Last updated: 2026-06-10 +Last updated: 2026-06-13 ## Recently shipped (delta since last update) +- **Decode fast path shipped DEFAULT-ON + env-flag cleanup — `larql` is now fast out of the box, no flags (2026-06-13).** The whole CPU decode fast-path stack had been gated behind opt-in env vars, so the *default* product ran the f32 slow path (C10: 7.1 tok/s default vs ~35 with flags) — the on-switch for all the perf work was hidden, and forgetting one flag silently reverted to slow. **Flipped the five Q4K stages + the spin pool to default-ON (opt-*out* via `=0`/`false`/`off`/`no`)**: `LARQL_Q4K_DIRECT_ATTN`, `LARQL_Q4K_ATTN_INT8`, `LARQL_Q4K_LM_HEAD`, `LARQL_Q4K_DIRECT_FFN`, `LARQL_Q4K_ASM`, `LARQL_SPIN_POOL`. **Centralized** them as resolvers in [`larql-compute/src/options.rs`](crates/larql-compute/src/options.rs) (5 scattered per-site `OnceLock` reads now delegate; per-layer/token ones uncached=testable, the per-matvec `asm` + per-section `spin` cached=hot), with per-layer/format fallbacks intact so a model/layer that can't take a stage silently uses f32. **Verified fast-by-default:** no-env 26B decode = ~31 tok/s (the fast path) vs the old 7.1 default. Fixed 6 strict-`<1e-3`-parity tests that had *relied* on the old default-off (they pin int8/flags off via Drop guards — the int8 route carries a looser ~2% scale-relative bound by design). Also **env cleanup (Tier 3)**: dropped 3 redundant aliases (`SKIP_MOE`/`MOE_DEBUG` non-namespaced legacies, `LARQL_DECODE_STAGE_TIMING`), helpers now read the single `LARQL_`-namespaced flag. **This supersedes the "opt-in `LARQL_X`, default off = byte-identical" framing in the older entries below** — the fast-path stack is on by default now; those entries describe each stage as it landed. (Retained comparison knobs — `asm_v2` bench, fused-shader flags per ADR-017 — deliberately kept.) Full workspace lib tests green, clippy clean. +- **Spin-barrier pool made safe for default-on + panic-fixed (2026-06-13).** Added a spin→yield→park backoff (spin `SPIN_HOT`≈the proven pure-spin window so active decode is unchanged → `yield_now` cooperative bridge once a wait outlives a token = idle/starved → `park` deep-idle, ~0 CPU; dispatcher unparks on dispatch) so the pool doesn't peg cores when the decode loop is idle — what makes on-by-default safe on a shared box. **Found + fixed a real panic-safety bug:** a panicking chunk body killed the worker thread → `completed` never hit `num_chunks` → the dispatcher spin-waited the barrier FOREVER (hung two test procs at 200% for 25 min; also the nondeterministic "flake" — dispatcher-chunk panic propagated/passed, worker-chunk panic hung). Fix: `catch_unwind` per chunk, always count, re-raise the first payload on the dispatcher (rayon semantics) + a regression test. 705 compute + 1220 inference + 756 kv green; spin_pool 8 tests incl. panic + concurrency hammered 25×. +- **CPU MoE decode CAUGHT llama.cpp — spin-barrier thread pool, 26B 27→35 tok/s (+28%), now ~9% AHEAD of llama.cpp's 32.1 (2026-06-13).** A `/usr/bin/sample` profile of live 26B decode pinned the post-residency frontier to **rayon fork-join overhead**, not the kernels: the decode driver runs *outside* the global rayon pool, so each of ~211 parallel sections/token took the cold path (`in_worker_cold → LockLatch::wait_and_reset → __psynch_cvwait`) and workers slept between sections — ~40% of thread-time in wait states. **Built `larql_compute::cpu::spin_pool`** (llama.cpp-style persistent spin-barrier pool: workers spin on an epoch counter, park only after a long idle; static strided partition makes `completed==num_chunks` a sound barrier; dispatch mutex + reentrancy guard for `--concurrent`/tests; 7 unit tests incl. a concurrent-dispatcher one that caught a real cursor-reset bug). **Centralized** four byte-identical `par_chunks_mut` matvec twins (larql-compute `cached.rs`, larql-inference `cached.rs`, lm_head ×2 in `dense.rs` — the long-standing "consolidation hazard") into one `q4k_q8k_matvec_parallel`, and routed every hot decode section (attn int8 Q/K/V/O, GQA, dense FFN gate/up/down, geglu, expert fold, lm_head q4+f32) through the pool. Parity-validated both ways (704 compute + 1220 inference + 756 kv green flags-off AND flags-on, incl. the `predict_kquant` parity oracles; clippy clean) — **now default-on (see the env-cleanup entry above); `LARQL_SPIN_POOL=0` opts out**. **Profile after: rayon eliminated** (`in_worker_cold` 2682→0, `join_context` 10300→0, `wait_until_cold` 4463→9). **Measured (M3 Max, t=8, warm, tight A/B bracket, flags inline):** 26B short-ctx OFF ~26.9 → ON **33–35**; n=256 OFF ~27.4 → ON **~34.9 (+28%, ON 35.0/34.8 vs OFF 27.3/27.4 dead-consistent)** — vs llama.cpp recorded **32.1** ⇒ larql ~9% ahead (machine validated: 4B llama.cpp 44 vs recorded 43). **Caveat:** the pool *spins* during active decode (that is the win on a dedicated box); the spin→yield→park backoff (shipped, see entry above) cedes cores only when the decode loop goes idle, which is what makes default-on safe on a shared machine. **Methodology bug (burned a chunk of the session):** `env $FLAGS …` does **not** word-split in this shell (zsh semantics) → only the first flag was set; spell decode flags **inline** before the binary, never via a `$VAR`. Crate detail: [`crates/larql-kv/ROADMAP.md`](crates/larql-kv/ROADMAP.md) §"Spin-barrier pool". - **Bottleneck pass #2 (code-level, machine contended) — GQA scratch fix landed; measurement queue parked (2026-06-13).** With the box owned by a sibling session (battery also low), did the allocation-churn audit instead of timing: the expert path is already TLS-pooled (`ExpertScratch`), but the GQA head-parallelization had introduced a per-head scores `vec!` (≈480 allocs+zeroings/token, growing with ctx) — replaced with `for_each_init` per-worker scratch (rayon workers are long-lived, so it amortises across calls too). 697+1220+756 green. **Measurement queue for the next quiet-machine session:** (1) fresh `/usr/bin/sample` + stage split on the current build (the last profile predates KV append-in-place + serial cuts — the sink distribution has shifted); (2) boundary-kv ratio re-measure (its 0.80× predates its resident-forwarding fix); (3) clean absolute engine matrix + llama-bench brackets; (4) remaining known sinks by size: `cpu_moe_forward` glue (router f32 matvec ~43 MB/tok + norms + route), allocator churn from per-projection out-Vecs and ndarray temporaries (arena-per-step is the structural fix if a fresh sample still shows `madvise`), long-ctx GQA growth (real work; flash-attention-class restructure is the eventual answer). - **Engine structural gap CLOSED — every KV engine now plugs into the CPU fast path (2026-06-13).** The `KvEngine::decode_step_resident` trait default silently dropped the index (`let _ = index`), so the 06-11/12 fast-path arc (q4k/int8 attention + asm + append-in-place) reached only `StandardEngine`. **Built:** single-source dispatcher `run_attention_block_decode_step_auto` (same per-layer q4k-vs-f32 choice as `CpuBackend::attention_step`, for `SharedKV`-owning walk loops; flag moved to `attention::decode` as the one source); `markov-rs`/`markov-rs-codec`/`turbo-quant`/`unlimited-context`/`boundary_per_layer` override `decode_step_resident` and thread the vindex down their walks; **`boundary-kv` forwards both resident methods to its inner StandardEngine** (was silently f32). `no_cache`/`apollo` keep the default by design. **Regression pin:** `engines::resident_identity_tests` — 7 concrete specs, resident ≡ plain bit-identical flags-off, coverage count can't shrink. **Measured (within-run ratios vs standard; absolutes pending quiet machine — sibling session load):** turbo 0.64×→0.85×, unlimited 0.76×→**1.07×**; markov/codec/boundary-per-layer flat = their own recompute/codec machinery (the feature), not attention. Prefill stays f32 gemm everywhere (task-#16 prefill falsification). 697+1220+756 tests green, clippy clean. Crate detail: [`crates/larql-kv/ROADMAP.md`](crates/larql-kv/ROADMAP.md) §"CPU resident fast-path". - **KV-engine review of the append-in-place handle — all engines green; two failure-path fixes landed (2026-06-13).** `EngineBackend: ComputeBackend + KvDispatch`, so every handle-holding engine rides the rewritten `CpuKvHandle`. Audit found two semantic edges the happy-path suites couldn't catch, both fixed: (1) q4k attend-failure after the in-place append now **pops the appended row and falls back to f32** (the old monolithic form's semantics — protects engine-level fallbacks like boundary_per_layer's dense-walk that reuse the handle); (2) the f32 path's prior is **copied not moved** so a backend failure leaves the handle intact. Empirical matrix, flags ON: **26B** all six MoE-capable engines run clean (standard 14.3 cold / boundary-kv 11.5 / unlimited 10.9 / turbo 9.2 / codec 7.9 / markov 7.2 — C1-class ordering); **4B** standard **28.9 tok/s** (dense also gains: gap to llama.cpp 43 now 1.49×), `standard:window=64` exercises `clip_kv` on the new buffers correctly, no-cache/markov/codec/turbo/unlimited all run. 697+755 suites green (incl. the cross-engine parity oracles). diff --git a/crates/larql-compute/src/attention/decode.rs b/crates/larql-compute/src/attention/decode.rs index 50160347c..7d8dcc609 100644 --- a/crates/larql-compute/src/attention/decode.rs +++ b/crates/larql-compute/src/attention/decode.rs @@ -49,57 +49,75 @@ where // the previous serial loop). The decode sample showed this loop serial // on the main thread at ~5% of wall while 8 workers slept. { - use rayon::prelude::*; let out_slice = out .as_slice_mut() .expect("freshly allocated [1, q_dim] is contiguous"); - out_slice - .par_chunks_mut(head_dim) - .enumerate() - .for_each_init( - // Per-worker scores scratch, reused across all heads this - // worker processes (and across calls — rayon workers are - // long-lived). The per-head `vec![0.0; total_len]` it - // replaces was ~480 allocs+zeroings per token at 26B sizes - // and grew with context. - Vec::::new, - |scores, (h, out_h)| { - let kv_h = h / reps; - let q_off = h * head_dim; - let kv_off = kv_h * head_dim; - - let q_row = q_new.slice(ndarray::s![0, q_off..q_off + head_dim]); - let k_block = k_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); - let raw: ndarray::Array1 = k_block.dot(&q_row); - scores.resize(total_len, 0.0); - for i in 0..total_len { - let mut s = raw[i] * scale_f32; - if let Some(cap) = softcap { - s = (s / cap).tanh() * cap; - } - scores[i] = s; + // Per-head attention math, factored so the rayon and spin-pool paths + // share one body (and stay numerically identical). `scores` is a + // reused scratch buffer (per rayon worker / per spin thread): the + // per-head `vec![0.0; total_len]` it replaces was ~480 allocs+zeroings + // per token at 26B sizes and grew with context. + let head_body = |h: usize, out_h: &mut [f32], scores: &mut Vec| { + let kv_h = h / reps; + let q_off = h * head_dim; + let kv_off = kv_h * head_dim; + + let q_row = q_new.slice(ndarray::s![0, q_off..q_off + head_dim]); + let k_block = k_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); + let raw: ndarray::Array1 = k_block.dot(&q_row); + scores.resize(total_len, 0.0); + for i in 0..total_len { + let mut s = raw[i] * scale_f32; + if let Some(cap) = softcap { + s = (s / cap).tanh() * cap; } - // Softmax - let max_val = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max); - let mut sum = 0.0f64; - for s in scores.iter_mut() { - let e = ((*s - max_val) as f64).exp(); - *s = e as f32; - sum += e; - } - let inv_sum = (1.0 / sum) as f32; - for s in scores.iter_mut() { - *s *= inv_sum; + scores[i] = s; + } + // Softmax + let max_val = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max); + let mut sum = 0.0f64; + for s in scores.iter_mut() { + let e = ((*s - max_val) as f64).exp(); + *s = e as f32; + sum += e; + } + let inv_sum = (1.0 / sum) as f32; + for s in scores.iter_mut() { + *s *= inv_sum; + } + // Weighted sum of V + let v_block = v_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); + let scores_view = ndarray::ArrayView1::from(&scores[..]); + let weighted_v = v_block.t().dot(&scores_view); + out_h.copy_from_slice(weighted_v.as_slice().expect("1-D dot output is contiguous")); + }; + + if crate::cpu::spin_pool::enabled() { + // Each head owns a disjoint `head_dim`-wide output slice; spin + // workers keep a thread-local scratch (same reuse as for_each_init). + let base = out_slice.as_mut_ptr() as usize; + let total = out_slice.len(); + crate::cpu::spin_pool::global().for_each_chunk(num_q, |h| { + thread_local! { + static SCORES: std::cell::RefCell> = + const { std::cell::RefCell::new(Vec::new()) }; } - // Weighted sum of V - let v_block = v_full.slice(ndarray::s![.., kv_off..kv_off + head_dim]); - let scores_view = ndarray::ArrayView1::from(&scores[..]); - let weighted_v = v_block.t().dot(&scores_view); - out_h.copy_from_slice( - weighted_v.as_slice().expect("1-D dot output is contiguous"), - ); - }, - ); + let start = h * head_dim; + let len = head_dim.min(total.saturating_sub(start)); + // SAFETY: head `h` owns the disjoint range [start, start+len). + let out_h = + unsafe { std::slice::from_raw_parts_mut((base as *mut f32).add(start), len) }; + SCORES.with(|cell| head_body(h, out_h, &mut cell.borrow_mut())); + }); + } else { + use rayon::prelude::*; + out_slice + .par_chunks_mut(head_dim) + .enumerate() + .for_each_init(Vec::::new, |scores, (h, out_h)| { + head_body(h, out_h, scores); + }); + } } out } @@ -314,9 +332,7 @@ pub fn run_attention_block_decode_step_backend( /// loops (via [`run_attention_block_decode_step_auto`]) must make the same /// choice. Cached once; never in the hot loop. pub fn q4k_direct_attn_enabled() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| std::env::var("LARQL_Q4K_DIRECT_ATTN").as_deref() == Ok("1")) + crate::options::q4k_direct_attn_enabled() } /// Best-available decode-step attention for callers that own their cache as @@ -366,9 +382,7 @@ pub fn run_attention_block_decode_step_auto( /// the expert path's int8 kernels. Default off = the existing f32-activation /// behaviour, byte-identical. fn attn_int8_enabled() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| std::env::var("LARQL_Q4K_ATTN_INT8").as_deref() == Ok("1")) + crate::options::q4k_attn_int8_enabled() } /// Int8 decode-step projection: `[1, num_rows] = qw × x_q8k`. The activation @@ -384,7 +398,6 @@ fn q8k_direct_proj( in_dim: usize, ) -> Option> { use crate::cpu::ops::q4k_q8k_dot::{q4k_q8k_matvec_into, q6k_q8k_matvec_into}; - use rayon::prelude::*; if !in_dim.is_multiple_of(256) { return None; @@ -400,26 +413,23 @@ fn q8k_direct_proj( let mut out = vec![0.0f32; num_rows]; const CHUNK_ROWS: usize = 32; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk)| { - let row_start = chunk_idx * CHUNK_ROWS; - let chunk_len = chunk.len().min(num_rows.saturating_sub(row_start)); - if chunk_len == 0 { - return; + crate::cpu::spin_pool::par_chunks_mut(&mut out, CHUNK_ROWS, |chunk_idx, chunk| { + let row_start = chunk_idx * CHUNK_ROWS; + let chunk_len = chunk.len().min(num_rows.saturating_sub(row_start)); + if chunk_len == 0 { + return; + } + let w_chunk = &qw.data[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; + match qw.format { + crate::QuantFormat::Q4_K => { + q4k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) } - let w_chunk = - &qw.data[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; - match qw.format { - crate::QuantFormat::Q4_K => { - q4k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) - } - crate::QuantFormat::Q6_K => { - q6k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) - } - _ => {} + crate::QuantFormat::Q6_K => { + q6k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, in_dim) } - }); + _ => {} + } + }); Array2::from_shape_vec((1, num_rows), out).ok() } @@ -1001,6 +1011,22 @@ mod tests { #[test] fn q4k_direct_decode_step_matches_dequant_path_within_tolerance() { + // This pins the strict <1e-3 WEIGHT parity of the *f32-activation* + // Q4K-direct path. The int8 activation route is now on by default and + // carries a looser (~2% scale-relative) bound by design, so disable it + // here. The guard restores the env even if an assertion panics. + struct EnvGuard(Option); + impl Drop for EnvGuard { + fn drop(&mut self) { + match self.0.take() { + Some(v) => std::env::set_var("LARQL_Q4K_ATTN_INT8", v), + None => std::env::remove_var("LARQL_Q4K_ATTN_INT8"), + } + } + } + let _guard = EnvGuard(std::env::var_os("LARQL_Q4K_ATTN_INT8")); + std::env::set_var("LARQL_Q4K_ATTN_INT8", "0"); + // Parity contract (roadmap #16, "<1e-3"): the Q4K-direct decode // step should track the f32-BLAS path that runs on the SAME bytes // dequantised into `weights.tensors`. We dequantise the fixture's diff --git a/crates/larql-compute/src/cpu/mod.rs b/crates/larql-compute/src/cpu/mod.rs index f2d39e9f3..9315d5941 100644 --- a/crates/larql-compute/src/cpu/mod.rs +++ b/crates/larql-compute/src/cpu/mod.rs @@ -19,6 +19,7 @@ //! - `ops/linalg`: Cholesky factor/solve, `ridge_decomposition_solve` pub mod ops; +pub mod spin_pool; // Re-export for backward compatibility (used by benchmarks/examples) pub mod q4 { diff --git a/crates/larql-compute/src/cpu/ops/moe/forward.rs b/crates/larql-compute/src/cpu/ops/moe/forward.rs index dd9322095..240dd58e3 100644 --- a/crates/larql-compute/src/cpu/ops/moe/forward.rs +++ b/crates/larql-compute/src/cpu/ops/moe/forward.rs @@ -140,102 +140,135 @@ pub fn cpu_moe_forward( } use rayon::prelude::*; - let expert_out = expert_indices - .par_iter() - .zip(expert_weights.par_iter()) - .filter(|(_, &w)| w != 0.0) - .fold( - || vec![0.0f32; hidden], - |mut acc, (&ei, &w)| { - let Some(&gate_up_bytes) = moe.experts_gate_up.get(ei) else { - return acc; - }; - let Some(&down_bytes) = moe.experts_down.get(ei) else { - return acc; - }; + // Add expert `ei`'s weighted contribution (`w * down(act(gate·x)·up·x)`) + // into `dst`. Shared by the rayon fold and the spin-pool path so both + // compute the identical arithmetic; only the parallel *schedule* differs. + let add_expert = |ei: usize, w: f32, dst: &mut [f32]| { + let Some(&gate_up_bytes) = moe.experts_gate_up.get(ei) else { + return; + }; + let Some(&down_bytes) = moe.experts_down.get(ei) else { + return; + }; + SCRATCH.with(|cell| { + let mut borrow = cell.borrow_mut(); + let scratch = + borrow.get_or_insert_with(|| ExpertScratch::new(hidden, inter, inter_padded)); + if scratch.gate_out.len() != inter + || scratch.act.len() != inter_padded + || scratch.out.len() != hidden + { + *scratch = ExpertScratch::new(hidden, inter, inter_padded); + } - SCRATCH.with(|cell| { - let mut borrow = cell.borrow_mut(); - let scratch = borrow - .get_or_insert_with(|| ExpertScratch::new(hidden, inter, inter_padded)); - if scratch.gate_out.len() != inter - || scratch.act.len() != inter_padded - || scratch.out.len() != hidden - { - *scratch = ExpertScratch::new(hidden, inter, inter_padded); - } + if let Some(q8k) = expert_input_q8k.as_ref() { + // Q4_K direct path — single source of truth in + // `expert::run_single_expert_q4k_q8k_into`. Reuses the + // scratch's act_q8k buffer too. + let h2 = run_single_expert_q4k_q8k_into( + scratch, + q8k, + gate_up_bytes, + down_bytes, + inter, + activation, + ); + for (a, &v) in dst.iter_mut().zip(h2.iter()) { + *a += w * v; + } + return; + } - if let Some(q8k) = expert_input_q8k.as_ref() { - // Q4_K direct path — single source of truth in - // `expert::run_single_expert_q4k_q8k_into`. Reuses - // the scratch's act_q8k buffer too. - let h2 = run_single_expert_q4k_q8k_into( - scratch, - q8k, - gate_up_bytes, - down_bytes, - inter, - activation, - ); - for (a, &v) in acc.iter_mut().zip(h2.iter()) { - *a += w * v; - } - return; - } + // Fallback: BF16 / F32 / Q4_K-with-disable — original f32 cache + // path. Inlined here to avoid pulling the per-call rms_norm / + // format dispatch from the legacy `run_single_expert_into` that + // doesn't share scratch. + let gate_up_w = try_cached_dequant(gate_up_bytes, format, 2 * inter * hidden) + .unwrap_or_else(|err| panic!("{err}")); + if gate_up_w.is_empty() { + return; + } + let gate_w = &gate_up_w[..inter * hidden]; + let up_w = &gate_up_w[inter * hidden..2 * inter * hidden]; + + let gate_out = matmul_vec(&expert_input, gate_w, inter, hidden); + let up_out = matmul_vec(&expert_input, up_w, inter, hidden); + + for j in 0..inter { + let g = gate_out[j]; + let u = up_out[j]; + scratch.act[j] = match activation { + crate::Activation::GeluTanh => gelu_tanh(g) * u, + _ => silu(g) * u, + }; + } - // Fallback: BF16 / F32 / Q4_K-with-disable — original - // f32 cache path. Inlined here to avoid pulling the - // per-call rms_norm / format dispatch from the legacy - // `run_single_expert_into` that doesn't share scratch. - let gate_up_w = try_cached_dequant(gate_up_bytes, format, 2 * inter * hidden) - .unwrap_or_else(|err| panic!("{err}")); - if gate_up_w.is_empty() { - return; - } - let gate_w = &gate_up_w[..inter * hidden]; - let up_w = &gate_up_w[inter * hidden..2 * inter * hidden]; - - let gate_out = matmul_vec(&expert_input, gate_w, inter, hidden); - let up_out = matmul_vec(&expert_input, up_w, inter, hidden); - - for j in 0..inter { - let g = gate_out[j]; - let u = up_out[j]; - scratch.act[j] = match activation { - crate::Activation::GeluTanh => gelu_tanh(g) * u, - _ => silu(g) * u, - }; - } + // Within-expert feature routing (aim-validation probe); no-op + // unless a schedule is installed. Mirrors the Q4_K-direct path so + // the f32-fallback (LARQL_DISABLE_Q4K_DIRECT) measures the same + // object. + super::within_expert::prune_act(&mut scratch.act, inter); - // Within-expert feature routing (aim-validation probe); - // no-op unless a schedule is installed. Mirrors the - // Q4_K-direct path so the f32-fallback (LARQL_DISABLE_Q4K_DIRECT) - // measures the same object. - super::within_expert::prune_act(&mut scratch.act, inter); + let down_w = try_cached_dequant(down_bytes, format, hidden * inter_padded) + .unwrap_or_else(|err| panic!("{err}")); + if down_w.is_empty() { + return; + } + let expert_contribution = matmul_vec(&scratch.act, &down_w, hidden, inter_padded); + for (a, &v) in dst.iter_mut().zip(expert_contribution.iter()) { + *a += w * v; + } + }); + }; - let down_w = try_cached_dequant(down_bytes, format, hidden * inter_padded) - .unwrap_or_else(|err| panic!("{err}")); - if down_w.is_empty() { - return; - } - let expert_contribution = - matmul_vec(&scratch.act, &down_w, hidden, inter_padded); - for (a, &v) in acc.iter_mut().zip(expert_contribution.iter()) { - *a += w * v; + let expert_out = if crate::cpu::spin_pool::enabled() { + // Spin-pool path: one chunk per active (non-zero-weight) expert, each + // accumulating into its own disjoint `hidden`-wide slot; summed after. + // Keeps all decode sections on one hot pool (no rayon/spin two-pool + // contention). Sum order differs from the rayon tree-reduce by fp + // reordering only — within the experts' tolerance parity. + let active: Vec<(usize, f32)> = expert_indices + .iter() + .copied() + .zip(expert_weights.iter().copied()) + .filter(|(_, w)| *w != 0.0) + .collect(); + let mut contribs = vec![0.0f32; active.len() * hidden]; + let active_ref = &active[..]; + let add_expert_ref = &add_expert; + crate::cpu::spin_pool::par_chunks_mut(&mut contribs, hidden, |ci, slot| { + let (ei, w) = active_ref[ci]; + add_expert_ref(ei, w, slot); + }); + let mut acc = vec![0.0f32; hidden]; + for ci in 0..active.len() { + for (a, &v) in acc.iter_mut().zip(contribs[ci * hidden..(ci + 1) * hidden].iter()) { + *a += v; + } + } + acc + } else { + expert_indices + .par_iter() + .zip(expert_weights.par_iter()) + .filter(|(_, &w)| w != 0.0) + .fold( + || vec![0.0f32; hidden], + |mut acc, (&ei, &w)| { + add_expert(ei, w, &mut acc); + acc + }, + ) + .reduce( + || vec![0.0f32; hidden], + |mut a, b| { + for (x, &y) in a.iter_mut().zip(b.iter()) { + *x += y; } - }); - acc - }, - ) - .reduce( - || vec![0.0f32; hidden], - |mut a, b| { - for (x, &y) in a.iter_mut().zip(b.iter()) { - *x += y; - } - a - }, - ); + a + }, + ) + }; let t_par = t_par_start.elapsed(); let t_sum = std::time::Duration::ZERO; diff --git a/crates/larql-compute/src/cpu/ops/q4_common.rs b/crates/larql-compute/src/cpu/ops/q4_common.rs index 1ba56896a..7dd6ec4ed 100644 --- a/crates/larql-compute/src/cpu/ops/q4_common.rs +++ b/crates/larql-compute/src/cpu/ops/q4_common.rs @@ -583,34 +583,30 @@ pub fn q4k_matvec_into(out: &mut [f32], x: &[f32], w: &[u8], rows: usize, cols: // for Q4_K) is large enough to amortise rayon's join overhead by // 100×+. Empirically on M3 Max this drops a 2560-row decode from // ~70ms → ~10ms (≈ 7× across 11 perf cores). - use rayon::prelude::*; let sum_x_ref = &sum_x[..]; let w_ref = w; let x_ref = x; // par_chunks_mut(CHUNK_ROWS) instead of per-row par_iter_mut: each - // rayon task processes a contiguous block of rows sequentially. - // Cuts the number of work-stealing units from `rows` (10K+) down - // to ~rows/CHUNK_ROWS, reducing scheduler overhead while keeping - // enough granularity for the 11 perf cores on M3 Max to load- - // balance. + // task processes a contiguous block of rows sequentially. Cuts the + // number of work-stealing units from `rows` (10K+) down to + // ~rows/CHUNK_ROWS, reducing scheduler overhead while keeping enough + // granularity for the 11 perf cores on M3 Max to load-balance. const CHUNK_ROWS: usize = 32; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk_slots)| { - let row_base_chunk = chunk_idx * CHUNK_ROWS; - for (local_r, out_slot) in chunk_slots.iter_mut().enumerate() { - let r = row_base_chunk + local_r; - if r >= rows { - break; - } - let row_base = r * row_bytes; - let mut acc = 0.0f32; - for sb in 0..n_blocks { - acc += process_q4k_superblock(w_ref, x_ref, sum_x_ref, row_base, sb); - } - *out_slot = acc; + crate::cpu::spin_pool::par_chunks_mut(out, CHUNK_ROWS, |chunk_idx, chunk_slots| { + let row_base_chunk = chunk_idx * CHUNK_ROWS; + for (local_r, out_slot) in chunk_slots.iter_mut().enumerate() { + let r = row_base_chunk + local_r; + if r >= rows { + break; } - }); + let row_base = r * row_bytes; + let mut acc = 0.0f32; + for sb in 0..n_blocks { + acc += process_q4k_superblock(w_ref, x_ref, sum_x_ref, row_base, sb); + } + *out_slot = acc; + } + }); } /// Per-super-block dot contribution for a Q4_K row. Returned scalar @@ -740,20 +736,18 @@ pub fn q4k_dual_matvec_into( // each worker computes both outputs for its assigned row index. // Zip `out_a` and `out_b` so rayon stays simple and the two // writes hit different cache lines per row. - use rayon::prelude::*; let sum_x_ref = &sum_x[..]; let w_a_ref = w_a; let w_b_ref = w_b; let x_ref = x; - // par_chunks_mut(CHUNK_ROWS) — same rationale as - // `q4k_matvec_into`. Fewer-but-larger work units reduce rayon - // work-stealing overhead. + // Fewer-but-larger work units (CHUNK_ROWS rows each) reduce + // work-stealing overhead; same rationale as `q4k_matvec_into`. const CHUNK_ROWS: usize = 32; - out_a - .par_chunks_mut(CHUNK_ROWS) - .zip(out_b.par_chunks_mut(CHUNK_ROWS)) - .enumerate() - .for_each(|(chunk_idx, (chunk_a, chunk_b))| { + crate::cpu::spin_pool::par_chunks_mut2( + out_a, + out_b, + CHUNK_ROWS, + |chunk_idx, chunk_a, chunk_b| { let row_base_chunk = chunk_idx * CHUNK_ROWS; for (local_r, (out_a_slot, out_b_slot)) in chunk_a.iter_mut().zip(chunk_b.iter_mut()).enumerate() diff --git a/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs b/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs index 35d08c1d2..b62f6748d 100644 --- a/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs +++ b/crates/larql-compute/src/cpu/ops/q4k_q8k_dot.rs @@ -1135,23 +1135,13 @@ pub fn q4k_q8k_matvec_asm_v3( } } -/// C12 opt-in: route Q4_K × Q8_K matvecs through the hand-asm kernel -/// (`q4k_q8k_matvec_asm`) instead of the intrinsic path when `LARQL_Q4K_ASM` -/// is `1`/`true`. Read once and cached — the env lookup must not land in the -/// per-token hot loop. Default off; both paths are bit-exact. -/// Pure parse of the `LARQL_Q4K_ASM` opt-in value (`1`/`true` → on). -/// Split out so the truth table is unit-testable without touching the -/// process environment or the `OnceLock` cache below. -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn q4k_asm_flag_enabled(val: Option<&str>) -> bool { - matches!(val, Some(v) if v == "1" || v.eq_ignore_ascii_case("true")) -} - +/// C12: route Q4_K × Q8_K matvecs through the hand-asm kernel +/// (`q4k_q8k_matvec_asm`) instead of the intrinsic path. **Default on** +/// (`LARQL_Q4K_ASM=0` opts out); both paths are bit-exact. The truth table + +/// caching live in [`crate::options::q4k_asm_enabled`]. #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] fn use_asm_kernel() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| q4k_asm_flag_enabled(std::env::var("LARQL_Q4K_ASM").ok().as_deref())) + crate::options::q4k_asm_enabled() } /// Public entry point: dispatches to NEON on aarch64, scalar elsewhere. @@ -1198,6 +1188,53 @@ pub fn q4k_q8k_matvec_into( q4k_q8k_matvec_scalar(out, q8k_x, w, rows, cols); } +/// Row-chunked **parallel** Q4_K / Q6_K × Q8_K matvec — the single source for +/// every quantized projection on the decode path (attention Q/K/V/O, the dense +/// FFN gate/up/down slab, and the lm_head vocab projection). Fills `out[0..rows]` +/// with each weight row's dot against `q8k_x`. +/// +/// `format` is `"Q4_K"` or `"Q6_K"` (selects the per-row kernel and the byte +/// stride). The per-row kernel ([`q4k_q8k_matvec_into`] / [`q6k_q8k_matvec_into`]) +/// is single-threaded; this wraps it across output-row chunks and routes the +/// chunks through the spin pool when enabled, else rayon — so the whole decode +/// path shares one parallelism strategy. +/// +/// This centralizes what were four byte-identical `par_chunks_mut` copies +/// (larql-compute `cached.rs`, larql-inference `cached.rs`, and the two lm_head +/// blocks in larql-inference `dense.rs`) — the "consolidation hazard" twins. +/// `out.len()` must be `>= rows`; rows beyond `rows` are left untouched. +pub fn q4k_q8k_matvec_parallel( + out: &mut [f32], + q8k_x: &Q8KActivation, + bytes: &[u8], + rows: usize, + cols: usize, + format: &str, +) { + let bytes_per_row = match format { + "Q4_K" => (cols / ELEMS_PER_BLOCK) * 144, + "Q6_K" => (cols / ELEMS_PER_BLOCK) * 210, + _ => return, + }; + if rows == 0 || cols == 0 || bytes.len() < rows * bytes_per_row { + return; + } + const CHUNK_ROWS: usize = 32; + crate::cpu::spin_pool::par_chunks_mut(&mut out[..rows], CHUNK_ROWS, |chunk_idx, chunk| { + let row_start = chunk_idx * CHUNK_ROWS; + let chunk_len = chunk.len().min(rows.saturating_sub(row_start)); + if chunk_len == 0 { + return; + } + let w_chunk = &bytes[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; + match format { + "Q4_K" => q4k_q8k_matvec_into(&mut chunk[..chunk_len], q8k_x, w_chunk, chunk_len, cols), + "Q6_K" => q6k_q8k_matvec_into(&mut chunk[..chunk_len], q8k_x, w_chunk, chunk_len, cols), + _ => {} + } + }); +} + /// AVX2 Q4_K × Q8_K matvec for x86_64. /// /// `vpmaddubsw` (unsigned×signed 8-bit → adjacent-pair-summed 16-bit) replaces @@ -2526,21 +2563,6 @@ mod tests { assert!(out.iter().all(|&v| v == 0.0)); } - /// `LARQL_Q4K_ASM` opt-in truth table (the pure parse behind the - /// `OnceLock`-cached `use_asm_kernel`). - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - #[test] - fn q4k_asm_flag_truth_table() { - assert!(q4k_asm_flag_enabled(Some("1"))); - assert!(q4k_asm_flag_enabled(Some("true"))); - assert!(q4k_asm_flag_enabled(Some("TRUE"))); - assert!(q4k_asm_flag_enabled(Some("True"))); - assert!(!q4k_asm_flag_enabled(Some("0"))); - assert!(!q4k_asm_flag_enabled(Some("false"))); - assert!(!q4k_asm_flag_enabled(Some("yes"))); - assert!(!q4k_asm_flag_enabled(Some(""))); - assert!(!q4k_asm_flag_enabled(None)); - } /// `quantize_x_to_q8k_into` must produce the same `qs`, `d`, `sums` as /// the allocating `quantize_x_to_q8k` for any well-sized input — both diff --git a/crates/larql-compute/src/cpu/spin_pool.rs b/crates/larql-compute/src/cpu/spin_pool.rs new file mode 100644 index 000000000..33669875b --- /dev/null +++ b/crates/larql-compute/src/cpu/spin_pool.rs @@ -0,0 +1,537 @@ +//! Spin-barrier thread pool for the decode hot path. +//! +//! Rayon puts idle workers to sleep between parallel sections (the right call +//! for batch throughput, the wrong one for a tight decode loop). A 26B-A4B +//! token runs ~200 small fork-join sections — attention Q/K/V/O, dense +//! gate_up/down, the expert fold, lm_head, per layer — and a `/usr/bin/sample` +//! profile attributed ~30% of decode thread-time to the resulting churn: +//! workers asleep in `wait_until_cold`, the driver blocked in +//! `in_worker_cold -> LockLatch::wait_and_reset -> __psynch_cvwait`, plus the +//! condvar wake latency paid on *every* section. +//! +//! This pool keeps workers HOT. They spin on an epoch counter and only +//! [`park`](std::thread::park_timeout) after a long idle gap, so a +//! `for_each_chunk` dispatched microseconds after the previous one finds them +//! already spinning — ready in ~ns, no condvar round-trip. The dispatcher +//! participates as the n-th worker; chunks are owned by static strided +//! assignment (participant `p` runs `p, p+n, p+2n, …`), which keeps the +//! `completed == num_chunks` barrier sound across back-to-back dispatches. +//! When a worker has to wait it backs off spin → yield → park, so it stays +//! cooperative under contention. Modeled on llama.cpp's persistent thread +//! pool + `ggml_barrier`. +//! +//! [`enabled`] gates whether callers route through here or stay on rayon. It is +//! **on by default** (the yield backoff makes it safe on shared machines); +//! `LARQL_SPIN_POOL=0` forces the rayon path. Either way the arithmetic is +//! identical — only *which threads run which chunks* differs. + +use std::cell::Cell; +use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, OnceLock}; +use std::thread; +use std::time::Duration; + +thread_local! { + /// True while this thread is executing a dispatched chunk body. Guards + /// against reentrant `for_each_chunk` (a body that itself dispatches): the + /// nested call runs serially inline rather than deadlocking on the pool. + static IN_BODY: Cell = const { Cell::new(false) }; +} + +/// Adaptive-backoff thresholds (iterations of the wait loop) for a worker +/// waiting on the next dispatch. It escalates spin → yield → park: +/// +/// - **spin** (`< SPIN_HOT`): `spin_loop()` for ~hundreds of µs. This is the +/// same pure-spin window that produced the measured decode win, so *active +/// decode behaviour is unchanged* — every inter-section gap within a token +/// stays in the spin phase, giving a ~ns wake. +/// - **yield** (`< YIELD_UNTIL`): `yield_now()` — cooperative bridge once a wait +/// outlives a whole token's worth of spinning (i.e. the loop went genuinely +/// idle, or another process is starving this core). Hands the core to other +/// runnable threads instead of burning it. +/// - **park** (otherwise): deep idle between requests / runs, ~0 CPU. The +/// dispatcher unparks all workers on every dispatch, so a parked worker wakes +/// immediately on the next section — parking is cheap to enter; the timeout +/// is only a shutdown-check backstop. +/// +/// Net: spin = the win during active decode; yield+park = don't peg cores when +/// the decode loop is idle — which is what makes on-by-default safe. +const SPIN_HOT: u32 = 256_000; +const YIELD_UNTIL: u32 = 384_000; + +/// Cross-thread dispatch state. Published to workers by the `epoch` release +/// store; workers read the task fields only after the matching acquire load, +/// so the plain `Relaxed` value stores are safe (epoch is the synchroniser). +struct Shared { + /// Bumped once per `for_each_chunk`; workers wake when it changes. + epoch: AtomicU64, + /// Chunks finished this dispatch; the barrier waits for `== num_chunks`. + /// With static strided ownership each chunk is run exactly once, so this + /// reaching `num_chunks` proves every trampoline call has returned — no + /// worker can still touch the (about-to-drop) closure. + completed: AtomicUsize, + /// Chunk count for the current dispatch. + num_chunks: AtomicUsize, + /// Type-erased `&F` for the current dispatch (valid until the barrier). + data: AtomicPtr<()>, + /// `fn(*const (), usize)` trampoline that recovers `&F` and calls it. + tramp: AtomicUsize, + /// Set on drop; workers observe it and exit. + shutdown: AtomicBool, + /// Set when any chunk this dispatch panicked — a cheap flag the dispatcher + /// checks after the barrier without locking on the happy path. + panicked: AtomicBool, + /// The first chunk panic's payload. A panicking body still increments + /// `completed` (so the barrier finishes instead of hanging on a dead + /// worker), and the dispatcher `resume_unwind`s this afterward — so a panic + /// propagates to the caller exactly like rayon, rather than killing a + /// worker thread and live-locking every future dispatch. + panic_payload: Mutex>>, +} + +/// A persistent spin-barrier pool. Owns `n-1` worker threads; the thread that +/// calls [`for_each_chunk`] is the n-th participant. +pub struct SpinPool { + shared: Arc, + workers: Vec>, + n_threads: usize, + /// Serializes dispatchers. Uncontended (≈one atomic CAS) for the normal + /// single-driver decode loop; serializes the rare concurrent dispatch + /// (`bench --concurrent N`, multi-threaded test harness) so the shared + /// epoch/cursor state stays consistent. + dispatch_lock: Mutex<()>, +} + +/// Recover `&F` from the type-erased data pointer and invoke it for `chunk`. +/// +/// # Safety +/// `data` must point to the live `F` published for the current epoch (the +/// dispatcher keeps it on its stack until the completion barrier passes), and +/// `F: Sync` (multiple threads call it concurrently). +fn trampoline(data: *const (), chunk: usize) { + // SAFETY: see fn docs — `data` is `&F` published under the epoch fence and + // outlives every call within the dispatch. + let f = unsafe { &*(data as *const F) }; + f(chunk); +} + +fn worker_loop(shared: Arc, worker_id: usize, n_participants: usize) { + let mut seen_epoch = 0u64; + loop { + // Wait for a new dispatch (spin first, park if idle persists). + let mut spins = 0u32; + let epoch = loop { + let e = shared.epoch.load(Ordering::Acquire); + if e != seen_epoch { + break e; + } + if shared.shutdown.load(Ordering::Relaxed) { + return; + } + spins += 1; + if spins < SPIN_HOT { + std::hint::spin_loop(); + } else if spins < YIELD_UNTIL { + std::thread::yield_now(); + } else { + thread::park_timeout(Duration::from_micros(50)); + } + }; + seen_epoch = epoch; + if shared.shutdown.load(Ordering::Relaxed) { + return; + } + run_chunks(&shared, worker_id, n_participants); + } +} + +/// Run this participant's statically-assigned chunks (strided: +/// `participant_id, participant_id + n, …`). Static ownership — rather than a +/// shared resettable cursor — is what makes `completed == num_chunks` a sound +/// barrier across back-to-back dispatches: no participant can re-claim a chunk +/// the next dispatch reset, so once the count is reached every trampoline call +/// has returned and the closure is safe to drop. +fn run_chunks(shared: &Shared, participant_id: usize, n_participants: usize) { + let num = shared.num_chunks.load(Ordering::Relaxed); + let tramp_addr = shared.tramp.load(Ordering::Relaxed); + if tramp_addr == 0 || num == 0 || participant_id >= n_participants { + return; + } + let data = shared.data.load(Ordering::Relaxed) as *const (); + // SAFETY: `tramp_addr` is a `fn(*const (), usize)` stored by the dispatcher + // before the epoch release; recovered here after the epoch acquire. + let tramp: fn(*const (), usize) = unsafe { std::mem::transmute(tramp_addr) }; + let mut c = participant_id; + while c < num { + // `IN_BODY` makes a reentrant `for_each_chunk` (a body that dispatches) + // fall back to serial instead of deadlocking. run_chunks is only + // entered at top level, so the prior value is always false. + IN_BODY.with(|b| b.set(true)); + // Catch a panicking body so we still `completed.fetch_add` below: a + // worker that unwound out of the loop would never count its chunk and + // the dispatcher would spin the barrier forever. The first payload is + // kept and re-raised on the dispatcher. + let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| tramp(data, c))); + IN_BODY.with(|b| b.set(false)); + if let Err(payload) = r { + if !shared.panicked.swap(true, Ordering::AcqRel) { + *shared.panic_payload.lock().unwrap_or_else(|e| e.into_inner()) = Some(payload); + } + } + shared.completed.fetch_add(1, Ordering::Release); + c += n_participants; + } +} + +impl SpinPool { + /// Build a pool with `n_threads` total participants (spawns `n_threads-1` + /// persistent workers; the dispatcher is the n-th). `n_threads <= 1` makes + /// [`for_each_chunk`] run inline with no workers. + pub fn new(n_threads: usize) -> Self { + let n_threads = n_threads.max(1); + let shared = Arc::new(Shared { + epoch: AtomicU64::new(0), + completed: AtomicUsize::new(0), + num_chunks: AtomicUsize::new(0), + data: AtomicPtr::new(std::ptr::null_mut()), + tramp: AtomicUsize::new(0), + shutdown: AtomicBool::new(false), + panicked: AtomicBool::new(false), + panic_payload: Mutex::new(None), + }); + let workers = (1..n_threads) + .map(|i| { + let shared = Arc::clone(&shared); + thread::Builder::new() + .name(format!("larql-spin-{i}")) + // Participant `i` of `n_threads`; the dispatcher is 0. + .spawn(move || worker_loop(shared, i, n_threads)) + .expect("spawn spin-pool worker") + }) + .collect(); + Self { + shared, + workers, + n_threads, + dispatch_lock: Mutex::new(()), + } + } + + /// Number of participating threads (workers + dispatcher). + pub fn num_threads(&self) -> usize { + self.n_threads + } + + /// Run `body(chunk_idx)` for every `chunk_idx in 0..num_chunks`, across the + /// pool, blocking until all chunks complete. + /// + /// `body` must only touch data disjoint per `chunk_idx` — exactly the + /// contract of `slice::par_chunks_mut().enumerate().for_each()`, which this + /// replaces. The calling thread participates, so this is *not* reentrant: + /// `body` must not itself call `for_each_chunk` on the same pool. + pub fn for_each_chunk(&self, num_chunks: usize, body: F) { + if num_chunks == 0 { + return; + } + // No workers, or already inside a dispatched body (reentrant): run the + // chunks serially on this thread. The reentrancy fallback also avoids + // deadlocking against `dispatch_lock` if a body ever dispatches. + if self.workers.is_empty() || IN_BODY.with(|b| b.get()) { + for c in 0..num_chunks { + body(c); + } + return; + } + // Serialize dispatchers so the shared epoch/cursor state is consistent; + // uncontended in the single-driver decode loop. + let _dispatch = self.dispatch_lock.lock().unwrap_or_else(|e| e.into_inner()); + let shared = &self.shared; + // Publish the task, then release it to workers via the epoch bump. + shared + .data + .store(&body as *const F as *mut (), Ordering::Relaxed); + shared + .tramp + .store(trampoline:: as *const () as usize, Ordering::Relaxed); + shared.num_chunks.store(num_chunks, Ordering::Relaxed); + shared.completed.store(0, Ordering::Relaxed); + shared.panicked.store(false, Ordering::Relaxed); + shared.epoch.fetch_add(1, Ordering::Release); + + // Wake any worker that parked during an idle gap so the barrier never + // stalls ~park_timeout waiting on its strided share. Unparking a + // still-spinning worker just sets its token (harmless). During tight + // back-to-back decode dispatches workers stay spinning and this is a + // no-op fast path. + for w in &self.workers { + w.thread().unpark(); + } + + // The dispatcher participates as participant 0. + run_chunks(shared, 0, self.n_threads); + + // Completion barrier: spin until every chunk has finished. With static + // strided ownership, `completed == num_chunks` means every trampoline + // call has returned (panics still count, see run_chunks), so it is safe + // to let `body` drop as this returns. + while shared.completed.load(Ordering::Acquire) < num_chunks { + std::hint::spin_loop(); + } + + // Re-raise the first chunk panic on this (the dispatching) thread, so a + // panicking body propagates to the caller like a serial loop or rayon — + // instead of being swallowed on a worker. Drop the dispatch guard first + // so the pool stays usable after the unwind. + if shared.panicked.load(Ordering::Acquire) { + let payload = shared + .panic_payload + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); + drop(_dispatch); + if let Some(payload) = payload { + std::panic::resume_unwind(payload); + } + } + } +} + +impl Drop for SpinPool { + fn drop(&mut self) { + self.shared.shutdown.store(true, Ordering::Relaxed); + // Bump epoch so any spinning worker breaks out and re-checks shutdown. + self.shared.epoch.fetch_add(1, Ordering::Release); + for w in self.workers.drain(..) { + let _ = w.join(); + } + } +} + +/// Process-wide pool, lazily sized to the active rayon thread count (which the +/// bench/CLI configures from `--threads`). Built on first use. +pub fn global() -> &'static SpinPool { + static POOL: OnceLock = OnceLock::new(); + POOL.get_or_init(|| SpinPool::new(rayon::current_num_threads().max(1))) +} + +/// Whether the decode hot path routes parallel sections through the spin pool +/// instead of rayon. **On by default** — the spin-then-yield backoff makes it +/// safe on shared/contended machines — set `LARQL_SPIN_POOL=0` to force the +/// rayon path (e.g. for an A/B or a heavily oversubscribed host). Either path +/// is numerically identical; only *which threads run which chunks* differs. +pub fn enabled() -> bool { + crate::options::spin_pool_enabled() +} + +/// Drop-in for `out.par_chunks_mut(chunk).enumerate().for_each(|(ci, c)| body(ci, c))` +/// that routes through the spin pool when [`enabled`], else stays on rayon. +/// +/// `body(chunk_idx, chunk)` receives each disjoint `chunk`-sized (last shorter) +/// slice of `out` and its index — identical semantics either way, so the +/// arithmetic is unchanged; only *which thread runs which chunk* differs. +pub fn par_chunks_mut(out: &mut [T], chunk: usize, body: F) +where + T: Send, + F: Fn(usize, &mut [T]) + Sync + Send, +{ + if chunk == 0 || out.is_empty() { + return; + } + if enabled() { + let total = out.len(); + let n = total.div_ceil(chunk); + let base = out.as_mut_ptr() as usize; + global().for_each_chunk(n, |ci| { + let start = ci * chunk; + let len = chunk.min(total - start); + // SAFETY: chunk index `ci` owns the disjoint range + // `[start, start+len)` of `out`; no two chunks overlap, and the + // dispatch barrier keeps `out` borrowed for the whole call. + let s = unsafe { std::slice::from_raw_parts_mut((base as *mut T).add(start), len) }; + body(ci, s); + }); + } else { + use rayon::prelude::*; + out.par_chunks_mut(chunk) + .enumerate() + .for_each(|(ci, c)| body(ci, c)); + } +} + +/// Two-output sibling of [`par_chunks_mut`] for kernels that write `a` and `b` +/// at the same row index (e.g. the fused gate/up dual matvec). `a` and `b` must +/// have the same length; `body(chunk_idx, a_chunk, b_chunk)` gets the matching +/// disjoint slices. +pub fn par_chunks_mut2(a: &mut [T], b: &mut [T], chunk: usize, body: F) +where + T: Send, + F: Fn(usize, &mut [T], &mut [T]) + Sync + Send, +{ + debug_assert_eq!(a.len(), b.len(), "par_chunks_mut2 needs equal-length a/b"); + if chunk == 0 || a.is_empty() { + return; + } + if enabled() { + let total = a.len(); + let n = total.div_ceil(chunk); + let base_a = a.as_mut_ptr() as usize; + let base_b = b.as_mut_ptr() as usize; + global().for_each_chunk(n, |ci| { + let start = ci * chunk; + let len = chunk.min(total - start); + // SAFETY: disjoint per-chunk ranges of `a` and `b` (separate + // buffers); barrier keeps both borrowed for the call. + let sa = unsafe { std::slice::from_raw_parts_mut((base_a as *mut T).add(start), len) }; + let sb = unsafe { std::slice::from_raw_parts_mut((base_b as *mut T).add(start), len) }; + body(ci, sa, sb); + }); + } else { + use rayon::prelude::*; + a.par_chunks_mut(chunk) + .zip(b.par_chunks_mut(chunk)) + .enumerate() + .for_each(|(ci, (ca, cb))| body(ci, ca, cb)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::AtomicU32; + + #[test] + fn runs_every_chunk_exactly_once() { + let pool = SpinPool::new(4); + let hits: Vec = (0..1000).map(|_| AtomicU32::new(0)).collect(); + pool.for_each_chunk(hits.len(), |c| { + hits[c].fetch_add(1, Ordering::Relaxed); + }); + for (i, h) in hits.iter().enumerate() { + assert_eq!(h.load(Ordering::Relaxed), 1, "chunk {i} ran != once"); + } + } + + #[test] + fn disjoint_mut_writes_match_serial() { + // The production pattern: each chunk writes its disjoint row range of a + // shared output buffer via a raw pointer (caller guarantees disjoint). + let pool = SpinPool::new(4); + let rows = 517usize; + let chunk = 32usize; + let n_chunks = rows.div_ceil(chunk); + let mut out = vec![0u64; rows]; + let ptr = out.as_mut_ptr() as usize; + pool.for_each_chunk(n_chunks, |ci| { + let start = ci * chunk; + let end = (start + chunk).min(rows); + for r in start..end { + // SAFETY: chunks are disjoint row ranges of `out`. + unsafe { *(ptr as *mut u64).add(r) = (r as u64) * 3 + 1 }; + } + }); + for (r, v) in out.iter().enumerate() { + assert_eq!(*v, (r as u64) * 3 + 1); + } + } + + #[test] + fn parallel_sum_matches_serial() { + let pool = SpinPool::new(8); + let n = 100_000usize; + let partials: Vec = (0..64).map(|_| AtomicU64::new(0)).collect(); + let chunk = n.div_ceil(64); + pool.for_each_chunk(64, |ci| { + let start = ci * chunk; + let end = (start + chunk).min(n); + let s: u64 = (start as u64..end as u64).sum(); + partials[ci].store(s, Ordering::Relaxed); + }); + let got: u64 = partials.iter().map(|a| a.load(Ordering::Relaxed)).sum(); + let want: u64 = (0..n as u64).sum(); + assert_eq!(got, want); + } + + #[test] + fn zero_chunks_is_noop() { + let pool = SpinPool::new(4); + pool.for_each_chunk(0, |_| panic!("must not run")); + } + + #[test] + fn single_thread_runs_inline() { + let pool = SpinPool::new(1); + let hits: Vec = (0..50).map(|_| AtomicU32::new(0)).collect(); + pool.for_each_chunk(hits.len(), |c| { + hits[c].fetch_add(1, Ordering::Relaxed); + }); + assert!(hits.iter().all(|h| h.load(Ordering::Relaxed) == 1)); + } + + #[test] + fn chunk_panic_propagates_and_pool_stays_usable() { + // A panicking body must (a) NOT hang the barrier (a dead worker would + // never count its chunk → dispatcher spins forever) and (b) propagate + // the panic to the dispatcher. Chunk 37 lands on a worker (37 % 4 != 0), + // exercising the worker-side catch, not just the dispatcher's own. + let pool = SpinPool::new(4); + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + pool.for_each_chunk(50, |c| { + if c == 37 { + panic!("boom at chunk {c}"); + } + }); + })); + assert!( + result.is_err(), + "a panicking chunk body must propagate to the dispatcher" + ); + // The pool must still work after a panic (not poisoned / not hung). + let hits: Vec = (0..20).map(|_| AtomicU32::new(0)).collect(); + pool.for_each_chunk(hits.len(), |c| { + hits[c].fetch_add(1, Ordering::Relaxed); + }); + assert!( + hits.iter().all(|h| h.load(Ordering::Relaxed) == 1), + "pool must stay usable after a chunk panic" + ); + } + + #[test] + fn concurrent_dispatchers_stay_consistent() { + // Multiple driver threads dispatching on one shared pool (the + // `--concurrent N` / multi-threaded-test shape). The dispatch lock + // serializes them; each dispatch must still complete correctly. + let pool = SpinPool::new(4); + std::thread::scope(|s| { + for _ in 0..3 { + s.spawn(|| { + for round in 1..=50u64 { + let acc: Vec = (0..20).map(|_| AtomicU64::new(0)).collect(); + pool.for_each_chunk(20, |c| { + acc[c].store(round * (c as u64 + 1), Ordering::Relaxed); + }); + for (c, a) in acc.iter().enumerate() { + assert_eq!(a.load(Ordering::Relaxed), round * (c as u64 + 1)); + } + } + }); + } + }); + } + + #[test] + fn back_to_back_dispatches_reuse_workers() { + // Exercises the epoch path: many tiny dispatches in a row (the decode + // loop shape) must each complete fully. + let pool = SpinPool::new(4); + for round in 1..=200u64 { + let acc: Vec = (0..16).map(|_| AtomicU64::new(0)).collect(); + pool.for_each_chunk(16, |c| { + acc[c].store(round * (c as u64 + 1), Ordering::Relaxed); + }); + for (c, a) in acc.iter().enumerate() { + assert_eq!(a.load(Ordering::Relaxed), round * (c as u64 + 1)); + } + } + } +} diff --git a/crates/larql-compute/src/kquant_forward/cached.rs b/crates/larql-compute/src/kquant_forward/cached.rs index eba5db63a..24d63738a 100644 --- a/crates/larql-compute/src/kquant_forward/cached.rs +++ b/crates/larql-compute/src/kquant_forward/cached.rs @@ -25,9 +25,7 @@ // across two files. #![allow(clippy::needless_range_loop, clippy::type_complexity)] -use crate::cpu::ops::q4k_q8k_dot::{ - q4k_q8k_matvec_into, q6k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, -}; +use crate::cpu::ops::q4k_q8k_dot::{quantize_x_to_q8k_into, Q8KActivation}; use crate::ComputeBackend; use larql_models::ModelWeights; use ndarray::Array2; @@ -276,30 +274,10 @@ fn matvec_q4k_or_q6k_q8k( // `q4k_matvec_into` in `q4_common.rs`. Without this, decode runs // single-threaded and the sdot path actually regresses vs the // (rayon-parallel) f32 path despite each row being faster. - use rayon::prelude::*; - const CHUNK_ROWS: usize = 32; let mut out = vec![0.0f32; rows]; - let w_ref = bytes; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk)| { - let row_start = chunk_idx * CHUNK_ROWS; - let chunk_len = chunk.len().min(rows.saturating_sub(row_start)); - if chunk_len == 0 { - return; - } - let w_chunk = - &w_ref[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; - match format { - "Q4_K" => { - q4k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, cols) - } - "Q6_K" => { - q6k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, cols) - } - _ => {} - } - }); + crate::cpu::ops::q4k_q8k_dot::q4k_q8k_matvec_parallel( + &mut out, x_q8k, bytes, rows, cols, format, + ); Some(out) } @@ -838,26 +816,27 @@ fn run_ffn_decode_step_q4k_direct( // scalar pass serial on the main thread while the workers slept. let mut activated = vec![0.0f32; intermediate]; { - use rayon::prelude::*; let gelu = matches!(arch.activation(), larql_models::Activation::GeluTanh); let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); - activated - .par_chunks_mut(256) - .zip(gate_vec.par_chunks(256).zip(up_vec.par_chunks(256))) - .for_each(|(a_c, (g_c, u_c))| { - if gelu { - for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { - let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); - *a = 0.5 * x * (1.0 + inner.tanh()) * u; - } - } else { - // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. - for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { - let sig = 1.0 / (1.0 + (-x).exp()); - *a = x * sig * u; - } + let gate_ref = &gate_vec[..]; + let up_ref = &up_vec[..]; + crate::cpu::spin_pool::par_chunks_mut(&mut activated, 256, |ci, a_c| { + let start = ci * 256; + let g_c = &gate_ref[start..start + a_c.len()]; + let u_c = &up_ref[start..start + a_c.len()]; + if gelu { + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); + *a = 0.5 * x * (1.0 + inner.tanh()) * u; + } + } else { + // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let sig = 1.0 / (1.0 + (-x).exp()); + *a = x * sig * u; } - }); + } + }); } // down projection: out = activated @ W_down.T → [hidden]. diff --git a/crates/larql-compute/src/options.rs b/crates/larql-compute/src/options.rs index 3d667abe8..ebf5c5716 100644 --- a/crates/larql-compute/src/options.rs +++ b/crates/larql-compute/src/options.rs @@ -5,6 +5,23 @@ //! explicit options struct; this module is the compatibility bridge while those //! APIs are split out. //! +//! ## Environment-variable surface (the categories) +//! +//! - **Decode fast path — default ON, opt out with `=0`.** The shipped CPU +//! decode default; you do *not* set anything to go fast. Resolvers: +//! [`q4k_direct_attn_enabled`], [`q4k_attn_int8_enabled`], +//! [`q4k_lm_head_enabled`], [`q4k_direct_ffn_enabled`], [`q4k_asm_enabled`], +//! [`spin_pool_enabled`] (env `LARQL_Q4K_*`, `LARQL_SPIN_POOL`). Set any to +//! `0`/`false`/`off`/`no` to force the f32/rayon path (A/B, kernel debug). +//! - **Diagnostics / dumps — presence = on** (`env_flag`): the `LARQL_*_DUMP_*`, +//! `*_TIMING`, `LARQL_PROFILE_SPLIT`, `LARQL_DECODE_STAGES`, +//! `LARQL_VINDEX_DESCRIBE`, `LARQL_MOE_DEBUG` toggles. Off unless set. +//! - **Retained comparison knobs** (ADR-017 shader/kernel retention): the +//! fused-shader flags (`LARQL_QKV_FUSED`, `LARQL_FUSED_*`) and the `asm_v2` +//! bench arm — deliberately kept for A/B, *not* dead code. +//! - **Config / paths / experiment / test** live with their feature, not here +//! (`HF_*`, `LARQL_HOME`, `LARQL_MODEL`, `LARQL_MEMIT_*`, `LARQL_TEST_*`, …). +//! //! ## Helper taxonomy — pick the matching one for the flag's intended default //! //! Mixing these on the same env var is the bug class flagged in the @@ -49,14 +66,10 @@ pub const ENV_DISABLE_Q4K_DIRECT: &str = "LARQL_DISABLE_Q4K_DIRECT"; pub const ENV_Q4K_DIRECT: &str = "LARQL_Q4K_DIRECT"; /// Max entries in the dequantised MoE expert cache. pub const ENV_MOE_CACHE_ENTRIES: &str = "LARQL_MOE_CACHE_ENTRIES"; -/// Namespaced MoE bypass toggle. +/// MoE bypass toggle (diagnostic). pub const ENV_SKIP_MOE: &str = "LARQL_SKIP_MOE"; -/// Legacy MoE bypass toggle. Prefer [`ENV_SKIP_MOE`] in new scripts. -pub const ENV_SKIP_MOE_LEGACY: &str = "SKIP_MOE"; -/// Namespaced MoE route/debug output toggle. +/// MoE route/debug output toggle. pub const ENV_MOE_DEBUG: &str = "LARQL_MOE_DEBUG"; -/// Legacy MoE route/debug output toggle. Prefer [`ENV_MOE_DEBUG`]. -pub const ENV_MOE_DEBUG_LEGACY: &str = "MOE_DEBUG"; /// Enable Metal MoE dispatch timing. pub const ENV_METAL_MOE_TIMING: &str = "LARQL_MOE_TIMING"; /// Select the 8-simdgroup Q4_K matvec kernel; set to a false value to opt out. @@ -119,11 +132,73 @@ pub const ENV_STAGE_DUMP_LAYER: &str = "LARQL_STAGE_DUMP_LAYER"; pub const ENV_GPU_TIMING: &str = "LARQL_GPU_TIMING"; /// Request paired commit/wait decode stage profiling. pub const ENV_PROFILE_SPLIT: &str = "LARQL_PROFILE_SPLIT"; -/// Legacy alias for [`ENV_PROFILE_SPLIT`]. -pub const ENV_DECODE_STAGE_TIMING: &str = "LARQL_DECODE_STAGE_TIMING"; /// Debug-only outer norm bypass in Metal MoE combine. pub const ENV_SKIP_OUTER_NORM: &str = "SKIP_OUTER_NORM"; +// ── CPU decode fast path — default ON, opt out with `=0` ───────────────────── +// +// These graduated from opt-in experiments (2026-06) to the shipped default: +// together they take CPU MoE decode from ~7 tok/s (f32 fallback) to ~35 on the +// 26B-A4B, parity-safe, with per-layer/format fallbacks (a layer/model that +// can't take the fast route silently uses the f32 one). Disable any single +// stage with `LARQL_=0` (also accepts `false`/`off`/`no`) — e.g. for an +// A/B against the f32 path or to debug a kernel. +// +/// Q4_K-direct attention projections (read Q4_K weights straight from the index +/// instead of dequantising to f32 first). +pub const ENV_Q4K_DIRECT_ATTN: &str = "LARQL_Q4K_DIRECT_ATTN"; +/// Int8 (Q8_K) activation route for the Q4_K-direct attention projections. +pub const ENV_Q4K_ATTN_INT8: &str = "LARQL_Q4K_ATTN_INT8"; +/// Q4_K lm_head (vocab projection straight from the Q4_K view; ~4× the +/// bandwidth of the f32 head). Falls back to f32 when no Q4_K head view exists. +pub const ENV_Q4K_LM_HEAD: &str = "LARQL_Q4K_LM_HEAD"; +/// Q4_K-direct dense-FFN slab on the decode path (prefill stays f32 gemm). +pub const ENV_Q4K_DIRECT_FFN: &str = "LARQL_Q4K_DIRECT_FFN"; +/// Hand-asm aarch64 Q4_K/Q6_K × Q8_K kernels (bit-exact with the intrinsic path). +pub const ENV_Q4K_ASM: &str = "LARQL_Q4K_ASM"; +/// Spin-barrier thread pool for the decode hot path (vs rayon's sleeping pool). +pub const ENV_SPIN_POOL: &str = "LARQL_SPIN_POOL"; + +/// A decode fast-path stage is ON unless explicitly disabled +/// (`=0`/`false`/`off`/`no`). +fn fast_path_on(name: &str) -> bool { + !env_opt_out(name) +} + +// The per-layer / per-token stages read the env each call (an uncontended +// single-thread `getenv` ~ns, negligible at layer granularity) so they stay +// togglable in tests. The two genuinely hot stages — `asm` (per matvec) and +// `spin_pool` (per parallel section) — cache at first read; no test toggles +// them via env (their unit tests drive the kernels / `SpinPool` directly). + +/// Q4_K-direct attention projections enabled (default on). +pub fn q4k_direct_attn_enabled() -> bool { + fast_path_on(ENV_Q4K_DIRECT_ATTN) +} +/// Int8 attention projection route enabled (default on). +pub fn q4k_attn_int8_enabled() -> bool { + fast_path_on(ENV_Q4K_ATTN_INT8) +} +/// Q4_K lm_head enabled (default on; falls back to f32 without a head view). +pub fn q4k_lm_head_enabled() -> bool { + fast_path_on(ENV_Q4K_LM_HEAD) +} +/// Q4_K-direct dense-FFN decode slab enabled (default on). +pub fn q4k_direct_ffn_enabled() -> bool { + fast_path_on(ENV_Q4K_DIRECT_FFN) +} +/// Hand-asm Q4_K/Q6_K kernels enabled (default on; aarch64 only). Cached — read +/// per matvec. +pub fn q4k_asm_enabled() -> bool { + static ON: std::sync::OnceLock = std::sync::OnceLock::new(); + *ON.get_or_init(|| fast_path_on(ENV_Q4K_ASM)) +} +/// Spin-barrier decode pool enabled (default on). Cached — read per section. +pub fn spin_pool_enabled() -> bool { + static ON: std::sync::OnceLock = std::sync::OnceLock::new(); + *ON.get_or_init(|| fast_path_on(ENV_SPIN_POOL)) +} + // Helpers below are `pub` (not `pub(crate)`) because sibling backend // crates (`larql-compute-metal`, future `larql-compute-vulkan`, …) // share the same env-toggle vocabulary defined above. Keeping the @@ -172,15 +247,15 @@ pub fn env_not_zero_or_default(name: &str, default: bool) -> bool { } pub(crate) fn moe_debug_enabled() -> bool { - env_flag_any(&[ENV_MOE_DEBUG, ENV_MOE_DEBUG_LEGACY]) + env_flag(ENV_MOE_DEBUG) } pub(crate) fn skip_moe_enabled() -> bool { - env_flag_any(&[ENV_SKIP_MOE, ENV_SKIP_MOE_LEGACY]) + env_flag(ENV_SKIP_MOE) } pub fn split_profile_requested() -> bool { - env_flag_any(&[ENV_PROFILE_SPLIT, ENV_DECODE_STAGE_TIMING]) + env_flag(ENV_PROFILE_SPLIT) } #[cfg(test)] @@ -271,54 +346,27 @@ mod tests { } #[test] - fn legacy_alias_helpers_still_work() { - with_env_vars( - &[(ENV_SKIP_MOE, None), (ENV_SKIP_MOE_LEGACY, Some("1"))], - || { - assert!(skip_moe_enabled()); - }, - ); - with_env_vars( - &[(ENV_MOE_DEBUG, Some("1")), (ENV_MOE_DEBUG_LEGACY, None)], - || { - assert!(moe_debug_enabled()); - }, - ); - with_env_vars( - &[ - (ENV_PROFILE_SPLIT, None), - (ENV_DECODE_STAGE_TIMING, Some("1")), - ], - || { - assert!(split_profile_requested()); - }, - ); + fn namespaced_toggle_helpers_read_their_flag() { + with_env(ENV_SKIP_MOE, Some("1"), || assert!(skip_moe_enabled())); + with_env(ENV_MOE_DEBUG, Some("1"), || assert!(moe_debug_enabled())); + with_env(ENV_PROFILE_SPLIT, Some("1"), || { + assert!(split_profile_requested()) + }); } #[test] fn env_flag_any_and_debug_helpers_cover_absent_and_present_cases() { - with_env_vars( - &[ - (ENV_SKIP_OUTER_NORM, None), - (ENV_MOE_DEBUG, None), - (ENV_MOE_DEBUG_LEGACY, None), - ], - || { - assert!(!env_flag(ENV_SKIP_OUTER_NORM)); - assert!(!env_flag_any(&[ENV_SKIP_OUTER_NORM, ENV_MOE_DEBUG])); - assert!(!moe_debug_enabled()); - }, - ); + with_env_vars(&[(ENV_SKIP_OUTER_NORM, None), (ENV_MOE_DEBUG, None)], || { + assert!(!env_flag(ENV_SKIP_OUTER_NORM)); + assert!(!env_flag_any(&[ENV_SKIP_OUTER_NORM, ENV_MOE_DEBUG])); + assert!(!moe_debug_enabled()); + }); with_env_vars( - &[ - (ENV_SKIP_OUTER_NORM, Some("1")), - (ENV_MOE_DEBUG, Some("1")), - (ENV_MOE_DEBUG_LEGACY, None), - ], + &[(ENV_SKIP_OUTER_NORM, Some("1")), (ENV_MOE_DEBUG, Some("1"))], || { assert!(env_flag(ENV_SKIP_OUTER_NORM)); - assert!(env_flag_any(&[ENV_SKIP_OUTER_NORM, ENV_MOE_DEBUG_LEGACY])); + assert!(env_flag_any(&[ENV_SKIP_OUTER_NORM, ENV_MOE_DEBUG])); assert!(moe_debug_enabled()); }, ); diff --git a/crates/larql-inference/src/forward/predict/dense.rs b/crates/larql-inference/src/forward/predict/dense.rs index abe7897e7..f98a4b1f2 100644 --- a/crates/larql-inference/src/forward/predict/dense.rs +++ b/crates/larql-inference/src/forward/predict/dense.rs @@ -9,7 +9,6 @@ use crate::attention::SharedKV; use crate::ffn::WeightFfn; use crate::model::ModelWeights; use ndarray::Array2; -use rayon::prelude::*; /// Row-parallel matvec: `out[v] = sum_h x[0, h] * lm_head[v, h]`. /// `lm_head` is `[vocab, hidden]` row-major; `x` is `[1, hidden]`. @@ -28,9 +27,17 @@ fn parallel_lm_head_logits( .as_slice() .expect("lm_head expected contiguous row-major"); let mut out = vec![0.0f32; vocab]; - out.par_iter_mut().enumerate().for_each(|(v, slot)| { - let row = &lm_slice[v * hidden..(v + 1) * hidden]; - *slot = f32_dot(row, x_row); + // Route through the spin pool when enabled (else rayon) so the f32 lm_head + // shares the decode loop's one parallelism strategy instead of waking a + // second (rayon) pool that thrashes against it. Chunk by rows of vocab. + const CHUNK_ROWS: usize = 64; + larql_compute::cpu::spin_pool::par_chunks_mut(&mut out, CHUNK_ROWS, |ci, chunk| { + let base = ci * CHUNK_ROWS; + for (j, slot) in chunk.iter_mut().enumerate() { + let v = base + j; + let row = &lm_slice[v * hidden..(v + 1) * hidden]; + *slot = f32_dot(row, x_row); + } }); out } @@ -169,26 +176,12 @@ pub fn logits_to_predictions_q4_lm_head( // linearly across M3 Max's 11 perf cores. let raw = { use larql_compute::cpu::ops::q4k_q8k_dot::{ - q4k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, + q4k_q8k_matvec_parallel, quantize_x_to_q8k_into, Q8KActivation, }; - use rayon::prelude::*; let mut h_q8k = Q8KActivation::with_capacity(hidden); quantize_x_to_q8k_into(&mut h_q8k, last_row); - let bytes_per_row = (hidden / 256) * 144; // Q4_K row size let mut out = vec![0.0f32; vocab]; - const CHUNK_ROWS: usize = 64; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk)| { - let row_start = chunk_idx * CHUNK_ROWS; - let chunk_len = chunk.len().min(vocab.saturating_sub(row_start)); - if chunk_len == 0 { - return; - } - let w_chunk = - &q4_lm_head[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; - q4k_q8k_matvec_into(&mut chunk[..chunk_len], &h_q8k, w_chunk, chunk_len, hidden); - }); + q4k_q8k_matvec_parallel(&mut out, &h_q8k, q4_lm_head, vocab, hidden, "Q4_K"); out }; let _ = backend; @@ -237,25 +230,12 @@ pub fn q4_lm_head_argmax( // Same raw-matvec block as `logits_to_predictions_q4_lm_head`. let raw = { use larql_compute::cpu::ops::q4k_q8k_dot::{ - q4k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, + q4k_q8k_matvec_parallel, quantize_x_to_q8k_into, Q8KActivation, }; let mut h_q8k = Q8KActivation::with_capacity(hidden); quantize_x_to_q8k_into(&mut h_q8k, last_row); - let bytes_per_row = (hidden / 256) * 144; let mut out = vec![0.0f32; vocab]; - const CHUNK_ROWS: usize = 64; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk)| { - let row_start = chunk_idx * CHUNK_ROWS; - let chunk_len = chunk.len().min(vocab.saturating_sub(row_start)); - if chunk_len == 0 { - return; - } - let w_chunk = - &q4_lm_head[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; - q4k_q8k_matvec_into(&mut chunk[..chunk_len], &h_q8k, w_chunk, chunk_len, hidden); - }); + q4k_q8k_matvec_parallel(&mut out, &h_q8k, q4_lm_head, vocab, hidden, "Q4_K"); out }; diff --git a/crates/larql-inference/src/vindex/dequant.rs b/crates/larql-inference/src/vindex/dequant.rs index d402ea409..62d214da6 100644 --- a/crates/larql-inference/src/vindex/dequant.rs +++ b/crates/larql-inference/src/vindex/dequant.rs @@ -83,6 +83,26 @@ mod tests { use super::*; use crate::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + /// Pin `LARQL_Q4K_ATTN_INT8=0` for the f32-activation Q4K-direct parity + /// tests below: they assert the strict `<1e-3` weight bound, which only the + /// f32-activation route satisfies. The int8 route is on by default and + /// carries a looser (~2% scale-relative) bound by design. Restores the + /// prior env value on drop (even if an assertion panics). + struct Int8OffGuard(Option); + impl Drop for Int8OffGuard { + fn drop(&mut self) { + match self.0.take() { + Some(v) => std::env::set_var("LARQL_Q4K_ATTN_INT8", v), + None => std::env::remove_var("LARQL_Q4K_ATTN_INT8"), + } + } + } + fn pin_int8_off() -> Int8OffGuard { + let guard = Int8OffGuard(std::env::var_os("LARQL_Q4K_ATTN_INT8")); + std::env::set_var("LARQL_Q4K_ATTN_INT8", "0"); + guard + } + /// `ensure_attn_tensors_dequantised` populates every layer's /// Q/K/V/O tensors when the vindex carries Q4K attention bytes. #[test] @@ -310,6 +330,7 @@ mod tests { /// PARITY GATE (task #16, step 3) — all-Q4_K attn (Q/K/V/O). #[test] fn q4k_direct_decode_step_matches_q4k_dequant() { + let _int8_off = pin_int8_off(); let weights = make_test_q4k_weights(); let index = make_test_q4k_vindex(&weights); assert_q4k_direct_matches_dequant(&index); @@ -320,6 +341,7 @@ mod tests { /// dispatch + Q6_K reference dequant that the all-Q4_K fixture never hits. #[test] fn q4k_direct_decode_step_matches_dequant_with_q6k_v() { + let _int8_off = pin_int8_off(); let weights = make_test_q4k_weights(); let index = make_attn_vindex_v_as_q6k(&weights); assert_q4k_direct_matches_dequant(&index); @@ -337,6 +359,7 @@ mod tests { /// Mixed V=Q6_K index so Q6_K compounds through both sides too. #[test] fn q4k_direct_decode_multistep_parity_compounds_within_noise() { + let _int8_off = pin_int8_off(); use larql_compute::attention::{ run_attention_block_decode_step_backend, run_attention_block_decode_step_q4k_direct, SharedKV, diff --git a/crates/larql-inference/src/vindex/kquant_forward/cached.rs b/crates/larql-inference/src/vindex/kquant_forward/cached.rs index 1310e8664..67d4c6943 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/cached.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/cached.rs @@ -25,9 +25,7 @@ // across two files. #![allow(clippy::needless_range_loop, clippy::type_complexity)] -use larql_compute::cpu::ops::q4k_q8k_dot::{ - q4k_q8k_matvec_into, q6k_q8k_matvec_into, quantize_x_to_q8k_into, Q8KActivation, -}; +use larql_compute::cpu::ops::q4k_q8k_dot::{quantize_x_to_q8k_into, Q8KActivation}; use larql_compute::ComputeBackend; use larql_models::ModelWeights; use larql_vindex::VectorIndex; @@ -283,30 +281,10 @@ fn matvec_q4k_or_q6k_q8k( // `q4k_matvec_into` in `q4_common.rs`. Without this, decode runs // single-threaded and the sdot path actually regresses vs the // (rayon-parallel) f32 path despite each row being faster. - use rayon::prelude::*; - const CHUNK_ROWS: usize = 32; let mut out = vec![0.0f32; rows]; - let w_ref = bytes; - out.par_chunks_mut(CHUNK_ROWS) - .enumerate() - .for_each(|(chunk_idx, chunk)| { - let row_start = chunk_idx * CHUNK_ROWS; - let chunk_len = chunk.len().min(rows.saturating_sub(row_start)); - if chunk_len == 0 { - return; - } - let w_chunk = - &w_ref[row_start * bytes_per_row..(row_start + chunk_len) * bytes_per_row]; - match format { - "Q4_K" => { - q4k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, cols) - } - "Q6_K" => { - q6k_q8k_matvec_into(&mut chunk[..chunk_len], x_q8k, w_chunk, chunk_len, cols) - } - _ => {} - } - }); + larql_compute::cpu::ops::q4k_q8k_dot::q4k_q8k_matvec_parallel( + &mut out, x_q8k, bytes, rows, cols, format, + ); Some(out) } @@ -828,26 +806,27 @@ fn run_ffn_decode_step_q4k_direct( // scalar pass serial on the main thread while the workers slept. let mut activated = vec![0.0f32; intermediate]; { - use rayon::prelude::*; let gelu = matches!(arch.activation(), larql_models::Activation::GeluTanh); let sqrt_2_over_pi = (2.0f32 / std::f32::consts::PI).sqrt(); - activated - .par_chunks_mut(256) - .zip(gate_vec.par_chunks(256).zip(up_vec.par_chunks(256))) - .for_each(|(a_c, (g_c, u_c))| { - if gelu { - for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { - let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); - *a = 0.5 * x * (1.0 + inner.tanh()) * u; - } - } else { - // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. - for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { - let sig = 1.0 / (1.0 + (-x).exp()); - *a = x * sig * u; - } + let gate_ref = &gate_vec[..]; + let up_ref = &up_vec[..]; + larql_compute::cpu::spin_pool::par_chunks_mut(&mut activated, 256, |ci, a_c| { + let start = ci * 256; + let g_c = &gate_ref[start..start + a_c.len()]; + let u_c = &up_ref[start..start + a_c.len()]; + if gelu { + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x); + *a = 0.5 * x * (1.0 + inner.tanh()) * u; } - }); + } else { + // SiLU = x * sigmoid(x). Same shape as dense_ffn_forward_backend. + for ((a, &x), &u) in a_c.iter_mut().zip(g_c.iter()).zip(u_c.iter()) { + let sig = 1.0 / (1.0 + (-x).exp()); + *a = x * sig * u; + } + } + }); } // down projection: out = activated @ W_down.T → [hidden]. diff --git a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs index 91753f6f8..aeb4d5534 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs @@ -170,11 +170,10 @@ pub fn moe_ffn_block_cpu( /// f32-resident `run_ffn` — on the 26B-A4B this drops the slab's per-token /// traffic ~7× (2.14 GB f32 → ~0.3 GB quantised). Decode-only (single-row): /// prefill stays on the f32 BLAS gemm, where repeated quantised matvec -/// loses (the task-#16 prefill falsification). Default off = byte-identical. +/// loses (the task-#16 prefill falsification). **Default on** +/// (`LARQL_Q4K_DIRECT_FFN=0` opts out); see [`larql_compute::options`]. fn q4k_direct_ffn_enabled() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| std::env::var("LARQL_Q4K_DIRECT_FFN").as_deref() == Ok("1")) + larql_compute::options::q4k_direct_ffn_enabled() } /// Index-aware variant of [`moe_ffn_block_cpu`]: when `index` is provided diff --git a/crates/larql-kv/ROADMAP.md b/crates/larql-kv/ROADMAP.md index 1c649c968..41cf38ccc 100644 --- a/crates/larql-kv/ROADMAP.md +++ b/crates/larql-kv/ROADMAP.md @@ -1,5 +1,51 @@ # Roadmap — larql-kv +## Spin-barrier pool — CPU MoE decode caught llama.cpp (2026-06-13) + +After residency closed the byte-traffic gap (06-11/12), a `/usr/bin/sample` of +live 26B decode showed the remaining ~1.15× was **rayon fork-join overhead**, +not kernels. The decode driver runs *outside* the global rayon pool, so each of +the ~211 parallel sections/token took the cold path (`in_worker_cold → +LockLatch::wait_and_reset → __psynch_cvwait`) and workers slept between sections +— ~40% of thread-time in wait states. + +**Built** [`larql_compute::cpu::spin_pool`](../../larql-compute/src/cpu/spin_pool.rs): +a llama.cpp-style persistent spin-barrier pool. Workers spin on an epoch counter +and only `park` after a long idle gap; the dispatcher participates as the n-th +worker; **static strided chunk ownership** makes `completed == num_chunks` a +sound barrier (no shared resettable cursor → no stale re-claim across +back-to-back dispatches — a concurrent-dispatcher test caught that bug); a +dispatch `Mutex` + thread-local reentrancy guard make it safe for +`--concurrent`/multi-threaded tests. `par_chunks_mut` / `par_chunks_mut2` +helpers route a row-chunked parallel-for through the pool, or rayon when +`LARQL_SPIN_POOL=0`. **Default-on** (see "Decode fast path default-on" — the +whole Q4K stack ships on, opt out per stage with `=0`); both paths are +numerically identical, only the threading differs. + +**Centralized** the four byte-identical `par_chunks_mut` Q4_K/Q6_K×Q8_K matvec +copies (larql-compute `cached.rs`, larql-inference `cached.rs`, lm_head ×2 in +`dense.rs` — the prior "consolidation hazard") into one +`q4k_q8k_matvec_parallel`, and routed every hot decode section (attention int8 +Q/K/V/O, GQA, dense FFN gate/up/down, geglu, expert fold, lm_head q4 + f32) +through it — so when enabled the whole token runs on one hot pool. + +- **Parity:** 704 compute + 1220 inference + 756 kv green, flags-off AND + flags-on (incl. the `predict_kquant` oracles). clippy clean. +- **Profile after:** rayon eliminated from the hot path — `in_worker_cold` + 2682→0, `join_context` 10300→0, `wait_until_cold` 4463→9. +- **Measured** (M3 Max, t=8, warm, tight A/B bracket, flags **inline**): + 26B short-ctx OFF ~26.9 → ON **33–35**; n=256 OFF ~27.4 → ON **~34.9 + (+28%)** — vs llama.cpp recorded **32.1** ⇒ ~9% ahead. +- **Default-on + safe (2026-06-13):** shipped a spin→yield→park backoff (spin + the proven window during active decode → yield once a wait outlives a token → + park when idle, ~0 CPU; dispatcher unparks on dispatch) so the pool doesn't + peg cores between requests — what makes on-by-default safe on a shared box. + Also fixed a panic-safety bug (a panicking chunk killed a worker → the + barrier spun forever): `catch_unwind` per chunk + re-raise on the dispatcher. +- **Caveat:** the pool spins during active decode (the win on a dedicated box); + under a transient mid-decode load spike a run can still regress (an n=512 ON + run hit 10.7 once) — `LARQL_SPIN_POOL=0` falls back to rayon if needed. + ## CPU resident fast-path — all engines pluggable into it (2026-06-13) The 2026-06-11/12 CPU fast-path arc (Q4K-direct + int8 attention, q4k @@ -350,12 +396,15 @@ for compressed KV memory at near-standard speed; `unlimited_context` for long-context windowed KV (slowest, bounded memory). `no_cache` / `apollo` are not a fit (re-forward multiplies round-trips). -**Known limitation:** `unlimited_context`'s archived-window *replay* -(`replay_window`, fires only on window eviction / long context) passes `None` for -`moe_ffn` → dense FFN. Correct within a single window (the verified case); a -long-context MoE run that evicts windows would need replay threaded too. This is the -only remaining MoE-correctness gap. CLI guard allows the seven verified engines and -rejects `no-cache` / `apollo` with a clear message. +**Resolved (2026-06-13):** `unlimited_context::replay_window` now takes +`moe_ffn` + `index` and threads them to `rs_extend_from_checkpoint_backend` +(matching the live-window `extend_current` path), so an evicted MoE window +replays with experts instead of silently falling back to dense FFN. It is a +standalone utility (no decode-loop caller — the decode path attends to the +current window + boundary checkpoints, never a full replay), so this was a +*latent* correctness gap; it is now correct for any caller. Dense callers pass +`None`/`None`. CLI guard allows the seven verified engines and rejects +`no-cache` / `apollo` with a clear message. ### ⏭ NEXT — Q4K-direct client decode path (remove the f32 tax) — top engineering lever diff --git a/crates/larql-kv/src/engines/mod.rs b/crates/larql-kv/src/engines/mod.rs index 2cbd728de..e27d6af00 100644 --- a/crates/larql-kv/src/engines/mod.rs +++ b/crates/larql-kv/src/engines/mod.rs @@ -217,6 +217,36 @@ mod resident_identity_tests { #[test] fn every_engine_decode_step_resident_matches_decode_step_flag_off() { + // The Q4K decode fast path is on by default now; this pin asserts the + // flags-OFF f32 identity (resident must equal plain when the resident + // route is *not* taking the Q4K-direct branch), so disable the stages + // that change the resident hidden state. They read the env uncached, so + // this takes effect; restored on drop even if an assertion panics. + struct FlagsOff(Vec<(&'static str, Option)>); + impl Drop for FlagsOff { + fn drop(&mut self) { + for (k, v) in self.0.drain(..) { + match v { + Some(v) => std::env::set_var(k, v), + None => std::env::remove_var(k), + } + } + } + } + let _flags_off = { + let names = [ + "LARQL_Q4K_DIRECT_ATTN", + "LARQL_Q4K_ATTN_INT8", + "LARQL_Q4K_DIRECT_FFN", + "LARQL_Q4K_LM_HEAD", + ]; + let saved: Vec<_> = names.iter().map(|n| (*n, std::env::var_os(n))).collect(); + for n in names { + std::env::set_var(n, "0"); + } + FlagsOff(saved) + }; + // Concrete specs (parameterised kinds need real params). Excluded: // apollo (bench-only, full re-forward by design; resident default = // forward to decode_step is the documented intent) and boundary-kv diff --git a/crates/larql-kv/src/engines/unlimited_context/engine.rs b/crates/larql-kv/src/engines/unlimited_context/engine.rs index ed0f69a44..dd1a00d52 100644 --- a/crates/larql-kv/src/engines/unlimited_context/engine.rs +++ b/crates/larql-kv/src/engines/unlimited_context/engine.rs @@ -167,9 +167,17 @@ impl UnlimitedContextEngine { /// Reconstruct a window's full K,V by replaying its archived tokens from /// the prior window's boundary checkpoint. + /// + /// For hybrid-MoE models, pass the FFN hook + vindex so the replay + /// dispatches experts exactly like the live-window path + /// ([`extend_current`](Self::extend_current)); pass `None`/`None` for dense + /// models. (Previously this always passed `None` → dense FFN, which would + /// have produced wrong K/V for an evicted MoE window — the C1 follow-up.) pub fn replay_window( &self, weights: &ModelWeights, + moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, window_id: usize, ) -> Option<(Vec, usize)> { let (tokens, abs_offset) = self.archive.retrieve(window_id)?; @@ -181,17 +189,14 @@ impl UnlimitedContextEngine { empty_prior(weights) }; - // Archived-window replay does not yet re-dispatch remote MoE experts - // (it fires only on window eviction / long context). `None` → dense - // FFN; see the larql-kv "MoE-aware KV engines (C1)" roadmap follow-up. let out = rs_extend_from_checkpoint_backend( weights, tokens, prior, abs_offset, self.backend.as_ref(), - None, - None, + moe_ffn, + index, )?; let abs_end = abs_offset + tokens.len() - 1; Some((out.kv_cache, abs_end)) @@ -1030,8 +1035,8 @@ mod tests { let engine = UnlimitedContextEngine::new(512); // No windows archived → any window_id returns None at the // `self.archive.retrieve(window_id)?` line. - assert!(engine.replay_window(&weights, 0).is_none()); - assert!(engine.replay_window(&weights, 99).is_none()); + assert!(engine.replay_window(&weights, None, None, 0).is_none()); + assert!(engine.replay_window(&weights, None, None, 99).is_none()); } #[test] @@ -1053,7 +1058,7 @@ mod tests { ); // Replay the first archived window — exercises the // `rs_extend_from_checkpoint_backend` path (lines 132-138). - let replay = engine.replay_window(&weights, 0); + let replay = engine.replay_window(&weights, None, None, 0); assert!(replay.is_some(), "replay_window(0) should succeed"); let (kv, abs_end) = replay.unwrap(); assert!(!kv.is_empty(), "replayed K/V cache should be non-empty"); diff --git a/crates/larql-kv/src/generation.rs b/crates/larql-kv/src/generation.rs index a06c0dbdd..31f1c59b3 100644 --- a/crates/larql-kv/src/generation.rs +++ b/crates/larql-kv/src/generation.rs @@ -636,11 +636,10 @@ fn argmax_next_token( /// the vindex's Q4_K lm_head view (synthesised from f16 embeddings at load /// for tied-embedding models) instead of the f32 row-parallel sgemv. On a /// 262K-vocab head this drops lm_head bandwidth ~4× (e.g. 2.95 GB → 0.42 GB -/// per step on Gemma 4 26B-A4B). Default off = byte-identical f32 path. +/// per step on Gemma 4 26B-A4B). **Default on** (`LARQL_Q4K_LM_HEAD=0` opts +/// out); falls back to the f32 path when no Q4_K head view exists. fn q4k_lm_head_enabled() -> bool { - use std::sync::OnceLock; - static ON: OnceLock = OnceLock::new(); - *ON.get_or_init(|| std::env::var("LARQL_Q4K_LM_HEAD").as_deref() == Ok("1")) + larql_compute::options::q4k_lm_head_enabled() } /// Resident-path argmax: like [`argmax_next_token`] but with the vindex at From 41aab8833cd2e8c293abe1cc5a82d5c28017b6bb Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Sun, 14 Jun 2026 00:03:14 +0100 Subject: [PATCH 5/8] more improvements --- ROADMAP.md | 6 +- ROADMAP_STATUS.md | 2 +- crates/larql-kv/ROADMAP.md | 40 ++++++++++--- .../src/engines/markov_residual/compute.rs | 60 ++++++++++++++++++- .../engines/markov_residual_codec/compute.rs | 52 +++++++++++++++- crates/larql-kv/src/engines/mod.rs | 6 +- .../src/engines/turbo_quant/engine.rs | 45 ++++++++------ 7 files changed, 175 insertions(+), 36 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index ac3b9f52a..fc1920ea0 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -133,7 +133,7 @@ the bottleneck.** | Acceptance tier (from "P0 — CPU path to blazing") | Confidence | Driver | |---|---|---| | Short-term: Gemma 3 4B CPU within 10% of `llama.cpp -ngl 0` | **~95%** | Pure engineering | -| Medium-term: Gemma 4 26B-A4B at ≥10 tok/s on 64 GB consumer, no GPU | **~75%** (was ~80% → 70% → 62% → 70% → 75%, revised 2026-06-11: target MEASURED-PASSED on M3 Max) | MoE active-param math works; 26B fits 64 GB (16 GB vindex). **C10 gate resolved favorably (2026-06-10):** llama.cpp-on-26B-CPU = 32 tok/s, the ≥10 target is 3× below a mature engine's proof. The gap was **byte traffic, not kernel quality** (in-process streamed ~10 GB/token f32-resident vs llama.cpp's ~2.1 GB all-quantized, every leg bandwidth-saturated ~62–71 GB/s). **Quantized residency BUILT 2026-06-11** (`LARQL_Q4K_DIRECT_ATTN` + `LARQL_Q4K_LM_HEAD` + `LARQL_Q4K_DIRECT_FFN`, parity-gated, default-off): **7.6 → 13.9 tok/s, +`LARQL_Q4K_ASM` → 15.9** — **the ≥10 target is passed with 60% margin on M3 Max**. Remaining ~1.9× to llama.cpp is C12 kernel work (Q6_K asm, attention's f32-activation `q4k_matvec` → q4k_q8k+asm, instruction-count toward 28 GiB/s; fused gate_up asm landed 2026-06-11 at 2.00× isolated — 9.89→19.82 GiB/s). Held at 75 (not higher) because the 64 GB-consumer class (M-Pro ~150–200 GB/s, x86 DDR5 ~80–100 GB/s) has less bandwidth than the 128 GB M3 Max measured on — though at the measured ~65 GB/s saturation even a 100 GB/s-class box should clear 10. Artifact `bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json` §update_2026_06_11. | +| Medium-term: Gemma 4 26B-A4B at ≥10 tok/s on 64 GB consumer, no GPU | **~85%** (was ~80% → 70% → 62% → 70% → 75% → 85%, revised 2026-06-13: CAUGHT llama.cpp on 26B CPU MoE) | MoE active-param math works; 26B fits 64 GB (16 GB vindex). **C10 gate resolved favorably (2026-06-10):** llama.cpp-on-26B-CPU = 32 tok/s, the ≥10 target is 3× below a mature engine's proof. The gap was **byte traffic, not kernel quality** (in-process streamed ~10 GB/token f32-resident vs llama.cpp's ~2.1 GB all-quantized). **Quantized residency (2026-06-11): 7.6 → 13.9 → 15.9; int8 attn → 21.7; KV append-in-place → 27.9.** **Spin-barrier pool (2026-06-13): → ~35 tok/s — CAUGHT/EXCEEDED llama.cpp (32.1, ~9% ahead), shipped DEFAULT-ON.** The final ~1.15× was **rayon fork-join overhead** (decode driver ran outside the pool → ~211 cold-path sections/token, ~40% of thread-time parked), *not* kernel quality — exactly what the C12 roofline-crossover entry called ("target effective-bandwidth sinks — rayon fork-join gaps"); the pool closed it via scheduling. Since larql now **matches the mature reference on the same box**, any 64 GB-consumer class where llama.cpp clears 10 (all of them) clears it too. Held at 85 (not higher) only for the unmeasured M-Pro/x86 bandwidth classes + the 26B llama.cpp anchor being recorded-not-same-session. Artifact `bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json`. | | Long-term: 100B-class MoE at ≥5 tok/s, no GPU | **~52%** (was ~60% → 55% → 52%, revised 2026-05-31) | Four-way push: 100B@FP4 (~25–50 GB) **fits RAM** so the disk bet is moot here — *removes* a risk the original 60% priced (+); FP4 confirmed (+); lost hash multiplier makes ≥5 tok/s harder (−); and the exploitable-structure prior took a **two-probe hit** — V1 (FFN-feature sparsity doesn't compound) *and* routing locality (expert selection doesn't concentrate, ~124/128 over a sequence) both say there's less cacheable structure than the "weights-as-database" thesis assumed (−, soft but broad). The disk-risk *removal* is what keeps it off 50; **50 is the honest alternative if you weight the two-probe pattern over it.** Caveat: the uniformity is partly Gemma's load-balancing aux loss (trained-in) → may be router-specific; the cross-MoE-router check would settle 50-vs-55. | | Ultimate: 671B-class via multi-machine grid | **~30%** (was ~40%, revised 2026-05-31) | Hit hardest. 671B even at FP4 (~335 GB) **exceeds single-machine RAM**, and the MoE-routing-locality finding (working set ≈ whole expert population, no cacheable hot subset) **closes the single-machine disk-resident escape hatch** — it would thrash. That leaves only the harder multi-machine grid (C9, demoted to P2 per ADR-019), where integration risk dominates. | | Dense frontier (if the field stays dense at 1T+) | **~10%** (was ~15%, revised 2026-05-31) | The hash-routing 5× its arithmetic leaned on is FALSIFIED (1 TB Q4 → ~10 s/token now, not 2). Needs attention-sparsification breakthroughs outside engineering control. | @@ -185,8 +185,8 @@ Current state (2026-05-15): | **GPU (Metal)** | Gemma 3 4B decode | 88 tok/s | ollama ~103 | 17% behind | over (defensible-with-caveat) | | **GPU (Metal)** | Gemma 3 4B prefill (340 tok) | per-pos matvec | gemm | 14× behind | far over | | **GPU (Metal)** | Gemma 4 + MTP (when adopted) | 88 tok/s no-MTP | ~225 with MTP | ~2.6× behind | far over | -| **CPU** | Gemma 3 4B Q4K decode | 24.5 tok/s | llama.cpp Q4_K_M CPU 42.53 tok/s | ~1.69× behind | over (KV-cache + direct Q4_K matvec + NEON Q4_K/Q6_K/f32_dot + Q4 lm_head + 4-way acc + par_chunks_mut(32) + Q4_K×Q8_K sdot + auto-t=8 on Apple silicon landed 2026-05-15/16; ~68× over original 0.36 tok/s baseline. Per-core ratio 1.73× (kernel inner-loop vs llama.cpp's hand-asm); remaining gap needs prefetch + scheduling. See `bench/baselines/cpu/DIAGNOSIS.md`) | -| **CPU** | Gemma 4 26B-A4B decode | in-proc KV-cached MoE **~1.8 tok/s** (n=8 smoke 2026-06-06, UNVERIFIED; historical grid 18.3 / shard-KV 4.4 unreconciled) | pending — Q4_K_M GGUF built from *cached* safetensors (no download), `/tmp/gemma4-26b-Q4_K_M.gguf` | TBD | **STAGED, blocked on idle machine** (under load a known-good 4B clocked 0.5 vs ~43 warm = contention). In-process row wired via `LocalMoeFfn`; runbook `bench/baselines/c10_gemma4-26b-a4b_cpu_RUNBOOK.md`. gemma4-in-llama.cpp CPU speed also unverified (1-core run seen under load). | +| **CPU** | Gemma 3 4B Q4K decode | **30.9 tok/s** (residency default-on, same-session 2026-06-13; was 24.5) | llama.cpp Q4_K_M CPU ~43 | **~1.42× behind** (was 1.69×) | over — the Q4_K residency + int8 + asm + spin-pool stack is now **default-on** (2026-06-13); earlier kernels (KV-cache, direct Q4_K matvec, NEON Q4_K/Q6_K/f32_dot, Q4 lm_head, par_chunks_mut(32), Q4_K×Q8_K sdot, auto-t=8) landed 2026-05-15/16, ~86× over the original 0.36 baseline. See `bench/baselines/cpu/DIAGNOSIS.md` | +| **CPU** | Gemma 4 26B-A4B decode | in-proc KV-cached MoE **~35 tok/s** (spin pool, default-on; M3 Max t=8 warm n=256, 2026-06-13) | llama.cpp Q4_K_M CPU **32.1** (recorded, drift-bracketed) | **larql ~9% AHEAD** | ✅ **CAUGHT** — arc 7.6 → 13.9 (residency) → 21.7 (int8 attn) → 27.9 (KV append-in-place) → **~35 (spin-barrier pool)**. The final ~1.15× was **rayon fork-join overhead** (decode driver ran outside the pool → ~211 cold-path sections/token, ~40% of thread-time in waits), *not* kernel quality — closed by the spin pool (effective-bandwidth/scheduling, exactly as the C12 roofline-crossover entry predicted), shipped **default-on**. Caveat: the 26B llama.cpp anchor is the recorded 32.1 (ollama wouldn't run the HF GGUF on CPU this session); machine validated via 4B llama.cpp 44 ≈ recorded 43. `bench/baselines/c10_gemma4-26b-a4b_cpu_reconciled.json`. | Items the threshold makes load-bearing (not optional) on the **GPU track**: - **D-ATTN-MTG** — flash attention; without it, attention-mechanism deltas are muddied by missing baseline. diff --git a/ROADMAP_STATUS.md b/ROADMAP_STATUS.md index ea6259ff3..c24def74c 100644 --- a/ROADMAP_STATUS.md +++ b/ROADMAP_STATUS.md @@ -11,7 +11,7 @@ Last updated: 2026-06-13 - **Decode fast path shipped DEFAULT-ON + env-flag cleanup — `larql` is now fast out of the box, no flags (2026-06-13).** The whole CPU decode fast-path stack had been gated behind opt-in env vars, so the *default* product ran the f32 slow path (C10: 7.1 tok/s default vs ~35 with flags) — the on-switch for all the perf work was hidden, and forgetting one flag silently reverted to slow. **Flipped the five Q4K stages + the spin pool to default-ON (opt-*out* via `=0`/`false`/`off`/`no`)**: `LARQL_Q4K_DIRECT_ATTN`, `LARQL_Q4K_ATTN_INT8`, `LARQL_Q4K_LM_HEAD`, `LARQL_Q4K_DIRECT_FFN`, `LARQL_Q4K_ASM`, `LARQL_SPIN_POOL`. **Centralized** them as resolvers in [`larql-compute/src/options.rs`](crates/larql-compute/src/options.rs) (5 scattered per-site `OnceLock` reads now delegate; per-layer/token ones uncached=testable, the per-matvec `asm` + per-section `spin` cached=hot), with per-layer/format fallbacks intact so a model/layer that can't take a stage silently uses f32. **Verified fast-by-default:** no-env 26B decode = ~31 tok/s (the fast path) vs the old 7.1 default. Fixed 6 strict-`<1e-3`-parity tests that had *relied* on the old default-off (they pin int8/flags off via Drop guards — the int8 route carries a looser ~2% scale-relative bound by design). Also **env cleanup (Tier 3)**: dropped 3 redundant aliases (`SKIP_MOE`/`MOE_DEBUG` non-namespaced legacies, `LARQL_DECODE_STAGE_TIMING`), helpers now read the single `LARQL_`-namespaced flag. **This supersedes the "opt-in `LARQL_X`, default off = byte-identical" framing in the older entries below** — the fast-path stack is on by default now; those entries describe each stage as it landed. (Retained comparison knobs — `asm_v2` bench, fused-shader flags per ADR-017 — deliberately kept.) Full workspace lib tests green, clippy clean. - **Spin-barrier pool made safe for default-on + panic-fixed (2026-06-13).** Added a spin→yield→park backoff (spin `SPIN_HOT`≈the proven pure-spin window so active decode is unchanged → `yield_now` cooperative bridge once a wait outlives a token = idle/starved → `park` deep-idle, ~0 CPU; dispatcher unparks on dispatch) so the pool doesn't peg cores when the decode loop is idle — what makes on-by-default safe on a shared box. **Found + fixed a real panic-safety bug:** a panicking chunk body killed the worker thread → `completed` never hit `num_chunks` → the dispatcher spin-waited the barrier FOREVER (hung two test procs at 200% for 25 min; also the nondeterministic "flake" — dispatcher-chunk panic propagated/passed, worker-chunk panic hung). Fix: `catch_unwind` per chunk, always count, re-raise the first payload on the dispatcher (rayon semantics) + a regression test. 705 compute + 1220 inference + 756 kv green; spin_pool 8 tests incl. panic + concurrency hammered 25×. - **CPU MoE decode CAUGHT llama.cpp — spin-barrier thread pool, 26B 27→35 tok/s (+28%), now ~9% AHEAD of llama.cpp's 32.1 (2026-06-13).** A `/usr/bin/sample` profile of live 26B decode pinned the post-residency frontier to **rayon fork-join overhead**, not the kernels: the decode driver runs *outside* the global rayon pool, so each of ~211 parallel sections/token took the cold path (`in_worker_cold → LockLatch::wait_and_reset → __psynch_cvwait`) and workers slept between sections — ~40% of thread-time in wait states. **Built `larql_compute::cpu::spin_pool`** (llama.cpp-style persistent spin-barrier pool: workers spin on an epoch counter, park only after a long idle; static strided partition makes `completed==num_chunks` a sound barrier; dispatch mutex + reentrancy guard for `--concurrent`/tests; 7 unit tests incl. a concurrent-dispatcher one that caught a real cursor-reset bug). **Centralized** four byte-identical `par_chunks_mut` matvec twins (larql-compute `cached.rs`, larql-inference `cached.rs`, lm_head ×2 in `dense.rs` — the long-standing "consolidation hazard") into one `q4k_q8k_matvec_parallel`, and routed every hot decode section (attn int8 Q/K/V/O, GQA, dense FFN gate/up/down, geglu, expert fold, lm_head q4+f32) through the pool. Parity-validated both ways (704 compute + 1220 inference + 756 kv green flags-off AND flags-on, incl. the `predict_kquant` parity oracles; clippy clean) — **now default-on (see the env-cleanup entry above); `LARQL_SPIN_POOL=0` opts out**. **Profile after: rayon eliminated** (`in_worker_cold` 2682→0, `join_context` 10300→0, `wait_until_cold` 4463→9). **Measured (M3 Max, t=8, warm, tight A/B bracket, flags inline):** 26B short-ctx OFF ~26.9 → ON **33–35**; n=256 OFF ~27.4 → ON **~34.9 (+28%, ON 35.0/34.8 vs OFF 27.3/27.4 dead-consistent)** — vs llama.cpp recorded **32.1** ⇒ larql ~9% ahead (machine validated: 4B llama.cpp 44 vs recorded 43). **Caveat:** the pool *spins* during active decode (that is the win on a dedicated box); the spin→yield→park backoff (shipped, see entry above) cedes cores only when the decode loop goes idle, which is what makes default-on safe on a shared machine. **Methodology bug (burned a chunk of the session):** `env $FLAGS …` does **not** word-split in this shell (zsh semantics) → only the first flag was set; spell decode flags **inline** before the binary, never via a `$VAR`. Crate detail: [`crates/larql-kv/ROADMAP.md`](crates/larql-kv/ROADMAP.md) §"Spin-barrier pool". -- **Bottleneck pass #2 (code-level, machine contended) — GQA scratch fix landed; measurement queue parked (2026-06-13).** With the box owned by a sibling session (battery also low), did the allocation-churn audit instead of timing: the expert path is already TLS-pooled (`ExpertScratch`), but the GQA head-parallelization had introduced a per-head scores `vec!` (≈480 allocs+zeroings/token, growing with ctx) — replaced with `for_each_init` per-worker scratch (rayon workers are long-lived, so it amortises across calls too). 697+1220+756 green. **Measurement queue for the next quiet-machine session:** (1) fresh `/usr/bin/sample` + stage split on the current build (the last profile predates KV append-in-place + serial cuts — the sink distribution has shifted); (2) boundary-kv ratio re-measure (its 0.80× predates its resident-forwarding fix); (3) clean absolute engine matrix + llama-bench brackets; (4) remaining known sinks by size: `cpu_moe_forward` glue (router f32 matvec ~43 MB/tok + norms + route), allocator churn from per-projection out-Vecs and ndarray temporaries (arena-per-step is the structural fix if a fresh sample still shows `madvise`), long-ctx GQA growth (real work; flash-attention-class restructure is the eventual answer). +- **Bottleneck pass #2 (code-level, machine contended) — GQA scratch fix landed; measurement queue parked (2026-06-13).** With the box owned by a sibling session (battery also low), did the allocation-churn audit instead of timing: the expert path is already TLS-pooled (`ExpertScratch`), but the GQA head-parallelization had introduced a per-head scores `vec!` (≈480 allocs+zeroings/token, growing with ctx) — replaced with `for_each_init` per-worker scratch (rayon workers are long-lived, so it amortises across calls too). 697+1220+756 green. **Measurement queue for the next quiet-machine session:** (1) fresh `/usr/bin/sample` + stage split on the current build (the last profile predates KV append-in-place + serial cuts — the sink distribution has shifted); (2) boundary-kv ratio re-measure (its 0.80× predates its resident-forwarding fix); (3) clean absolute engine matrix + llama-bench brackets; (4) remaining known sinks by size: `cpu_moe_forward` glue (router f32 matvec ~43 MB/tok + norms + route), allocator churn from per-projection out-Vecs and ndarray temporaries (arena-per-step is the structural fix if a fresh sample still shows `madvise`), long-ctx GQA growth (real work; flash-attention-class restructure is the eventual answer). **✅ RESOLVED 2026-06-13:** (1) the fresh `/usr/bin/sample` ran → the dominant sink was **rayon fork-join overhead** (driver outside the pool), not the listed candidates → fixed by the spin-barrier pool (see top entries). (2)+(3) clean absolute engine matrix measured (standard 30.5 / unlimited 31.8 / boundary-kv **0.89×** post-fix / turbo 9.4 / markov 7.8 / codec 7.3); spin A/B 35.7 on / 28.1 off = +27%. (4) the named glue/allocator/GQA sinks are now the *next* tier below the (closed) fork-join one. 26B llama.cpp same-session anchor still owed (ollama wouldn't run the HF GGUF on CPU); recorded 32.1 + 4B-anchor stand. - **Engine structural gap CLOSED — every KV engine now plugs into the CPU fast path (2026-06-13).** The `KvEngine::decode_step_resident` trait default silently dropped the index (`let _ = index`), so the 06-11/12 fast-path arc (q4k/int8 attention + asm + append-in-place) reached only `StandardEngine`. **Built:** single-source dispatcher `run_attention_block_decode_step_auto` (same per-layer q4k-vs-f32 choice as `CpuBackend::attention_step`, for `SharedKV`-owning walk loops; flag moved to `attention::decode` as the one source); `markov-rs`/`markov-rs-codec`/`turbo-quant`/`unlimited-context`/`boundary_per_layer` override `decode_step_resident` and thread the vindex down their walks; **`boundary-kv` forwards both resident methods to its inner StandardEngine** (was silently f32). `no_cache`/`apollo` keep the default by design. **Regression pin:** `engines::resident_identity_tests` — 7 concrete specs, resident ≡ plain bit-identical flags-off, coverage count can't shrink. **Measured (within-run ratios vs standard; absolutes pending quiet machine — sibling session load):** turbo 0.64×→0.85×, unlimited 0.76×→**1.07×**; markov/codec/boundary-per-layer flat = their own recompute/codec machinery (the feature), not attention. Prefill stays f32 gemm everywhere (task-#16 prefill falsification). 697+1220+756 tests green, clippy clean. Crate detail: [`crates/larql-kv/ROADMAP.md`](crates/larql-kv/ROADMAP.md) §"CPU resident fast-path". - **KV-engine review of the append-in-place handle — all engines green; two failure-path fixes landed (2026-06-13).** `EngineBackend: ComputeBackend + KvDispatch`, so every handle-holding engine rides the rewritten `CpuKvHandle`. Audit found two semantic edges the happy-path suites couldn't catch, both fixed: (1) q4k attend-failure after the in-place append now **pops the appended row and falls back to f32** (the old monolithic form's semantics — protects engine-level fallbacks like boundary_per_layer's dense-walk that reuse the handle); (2) the f32 path's prior is **copied not moved** so a backend failure leaves the handle intact. Empirical matrix, flags ON: **26B** all six MoE-capable engines run clean (standard 14.3 cold / boundary-kv 11.5 / unlimited 10.9 / turbo 9.2 / codec 7.9 / markov 7.2 — C1-class ordering); **4B** standard **28.9 tok/s** (dense also gains: gap to llama.cpp 43 now 1.49×), `standard:window=64` exercises `clip_kv` on the new buffers correctly, no-cache/markov/codec/turbo/unlimited all run. 697+755 suites green (incl. the cross-engine parity oracles). - **KV append-in-place — 23.5→27.9 tok/s short-ctx, 16.0→24.8 long-ctx; gap to llama.cpp ~1.15×/1.29× (2026-06-12).** The old `CpuKvHandle` attention step did a full-cache **clone** + `zeros` + four assigns per layer per step (~190 MB/token of churn at ctx 130, growing linearly). Rewritten: growable row-major Vec buffers (amortised O(kv_dim) `append_row`), q4k-direct step split into **project / append-in-place / attend-over-views** (no concat, no clone; legacy owned-concat wrapper kept for engine walk-loops), f32 fallback moves state instead of cloning, `append_kv`/`clip_kv` lose their O(ctx) rebuilds too, `gqa_attention_decode_step` generic over views. **E2E:** n=128 **27.9 tok/s** (35.8 ms; bracket 33.6/31.0 → ~1.15×); n=512 **24.8** vs llama.cpp tg512 32.0 (1.29× — remaining ctx-growth is GQA compute, real work). Text sanity: full flag stack on 4B → "The capital of France is **Paris**." **Cumulative arc: 7.6 → 27.9 tok/s (3.7×); gap 4.5× → ~1.15×.** 692+1213+755 tests green incl. kv-engine parity oracles. Side-finding: `ave_direct_step_parity` garbage CONFIRMED PRE-EXISTING at clean 6659fd6c (worktree repro) — legacy direct-path twin, not this work; AVE session owns it. Artifact §"update_2026_06_12_kv_append_in_place". diff --git a/crates/larql-kv/ROADMAP.md b/crates/larql-kv/ROADMAP.md index 41cf38ccc..ffa66a942 100644 --- a/crates/larql-kv/ROADMAP.md +++ b/crates/larql-kv/ROADMAP.md @@ -69,11 +69,27 @@ f32 attention. **Fixed:** engine specs, `prefill/decode_step_resident` must be BIT-IDENTICAL to `prefill/decode_step` with the flags off, and the covered-engine count must not shrink. -- Measured effect (26B, flags on, within-run ratios vs standard — absolute - numbers pending a quiet machine): turbo 0.64×→0.85×, unlimited - 0.76×→1.07×; markov/codec/boundary_per_layer flat — their cost is their - own recompute/codec machinery (the feature), not the attention path; - markov's walk already tries the kquant-native attention helper first. +- **Absolute matrix + slow-engine fixes 2026-06-13** (26B, default-on incl. + spin pool, M3 Max t=8 warm n=128). First measured: unlimited 31.8 / standard + 30.5 / boundary-kv 27.1 (**0.80×→0.89×**, its resident-forwarding fix) / + turbo 9.4 / markov 7.8 / codec 7.3 — the recompute/codec engines sat at + **~0.24–0.31×** because the spin pool sped up the shared attention/FFN/matvec + but not their per-step machinery. **Then fixed all three, feature intact:** + - **turbo-quant 9.4 → ~24** — `decompress_matrix`'s per-vector WHT decode was + *serial on the driver* (~35% of it); fanned across the spin pool. Still + 3-4-bit compressed (decoded every step, now parallel) — no memory tradeoff. + - **markov-rs 7.8 → 27.9, markov-rs-codec 7.3 → 27.7** — ported the W2 hot-K/V + cache to the **resident walk** (`rs_decode_step_inner`/`_codec`): read the + cached `hot_kv` and append the free `new_kv` from the attention step instead + of `recompute_kv`-ing every position each step. Gated `cache_eligible = + max_window.is_none() && no-cold` so it never tracks a window-clip + transition; the residual `stored` stays the canonical, re-derivable state + (the engine's point), the K/V is a droppable derivative. Parity gate: + `#[cfg(debug_assertions)]` assert cached K/V ≡ `recompute_kv` (≤1e-2), + exercised by `resident_identity_tests` (extended to a 10-step decode). + Final matrix: standard 34.5 / unlimited 32.1 / markov 27.9 / codec 27.7 / + boundary-kv 27.4 / turbo 21.1 — all **0.6–1.0× of standard** (was 0.24–0.31× + for the slow three). 756 kv tests green debug+release, clippy clean. Prefill stays on the f32 BLAS gemm for all engines deliberately (the task #16 prefill falsification: q4k repeated-matvec loses ~20× to AMX at @@ -406,9 +422,19 @@ current window + boundary checkpoints, never a full replay), so this was a `None`/`None`. CLI guard allows the seven verified engines and rejects `no-cache` / `apollo` with a clear message. -### ⏭ NEXT — Q4K-direct client decode path (remove the f32 tax) — top engineering lever +### ✅ DONE / EXCEEDED — Q4K-direct decode path (remove the f32 tax) -**Why now:** the bottleneck diagnosis +**Status (2026-06-13):** done and the target was blown past. This section's exit +was "~20–25 tok/s, within ~10% of the ~22 tok/s bandwidth ceiling." Reality: +the residency stack (Q4K-direct attn/lm_head/ffn + int8 + asm) + KV +append-in-place + the **spin-barrier pool** took the 26B in-process decode to +**~35 tok/s — past llama.cpp (32.1)** — and the whole stack now ships +**default-on** (see ROADMAP.md baseline table + "Spin-barrier pool" above). The +last lever was *not* the f32→Q4K tax (that was the residency work); it was +**rayon fork-join overhead** (driver outside the pool), closed by the spin pool. +Original framing kept below for history. + +**Why now (historical):** the bottleneck diagnosis ([`docs/diagnoses/remote-moe-bottlenecks.md`](../../docs/diagnoses/remote-moe-bottlenecks.md), 2026-05-29) measured the remote-MoE decode split on the 26B: **~60% is client-side f32 compute** (attention + lm_head + dense FFN, on the dequant-to-f32 BLAS path), diff --git a/crates/larql-kv/src/engines/markov_residual/compute.rs b/crates/larql-kv/src/engines/markov_residual/compute.rs index 8489ae252..8f9ac524e 100644 --- a/crates/larql-kv/src/engines/markov_residual/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual/compute.rs @@ -196,12 +196,54 @@ fn rs_decode_step_inner( let mut attention_us = 0.0f64; let mut ffn_us = 0.0f64; + // W2 hot-K/V cache on the resident walk (2026-06-13). When there is no cold + // tier (the common unbounded-window case), `hot_kv` holds the FULL K/V and + // we read it instead of re-deriving every position via `recompute_kv` (a + // per-step O(N) matmul — the engine's bottleneck). The residual `stored` is + // still the canonical, re-derivable state (the engine's point); `hot_kv` is + // a droppable derivative. With a cold tier (windowed/evicted) we fall back + // to the recompute path. `step_new_kv` collects each layer's updated full + // K/V returned by the attention step (it concatenates prior cache + the new + // RoPE'd row), which IS next step's cache — no recompute, no concat here. + // Only for unbounded windows (the default): then `clip_layer` is a no-op, + // so the cache never has to track a window-eviction transition. Windowed + // configs keep the existing recompute path unchanged. + let cache_eligible = + rs.max_window.is_none() && rs.cold_residuals.is_none() && rs.cold_kv.is_none(); + let mut step_new_kv: Vec = + Vec::with_capacity(num_layers); + for layer in 0..num_layers { let h_hot = &rs.stored[layer]; let s_hot = h_hot.shape()[0]; let hot_abs_start = abs_position.saturating_sub(s_hot); - let (k_full, v_full) = if let Some(cold_kv) = &rs.cold_kv { + let (k_full, v_full) = if let Some(hot_kv) = rs.hot_kv.as_ref().filter(|_| cache_eligible) { + // W2 cached path: no cold tier, so `hot_kv` IS the full K/V — + // read it instead of the per-step `recompute_kv` matmul. Debug + // builds assert it matches a fresh recompute (the parity gate). + let (k_buf, v_buf) = &hot_kv[layer]; + let k = k_buf.slice(s![..s_hot, ..]).to_owned(); + let v = v_buf.slice(s![..s_hot, ..]).to_owned(); + #[cfg(debug_assertions)] + if let Some((rk, rv)) = + recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None) + { + let kd = k + .iter() + .zip(rk.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + let vd = v + .iter() + .zip(rv.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + debug_assert!(kd < 1e-2, "markov hot_kv K cache diverged: {kd}"); + debug_assert!(vd < 1e-2, "markov hot_kv V cache diverged: {vd}"); + } + (k, v) + } else if let Some(cold_kv) = &rs.cold_kv { let (k_cold_buf, v_cold_buf) = &cold_kv[layer]; // 2026-05-19 audit fix: slice to cold_len, not shape()[0]. // cold_kv now uses doubling-capacity (see RsStore::append_cold_overflow). @@ -261,7 +303,7 @@ fn rs_decode_step_inner( } else { None }; - let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( + let (h_post_attn, new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, @@ -270,6 +312,11 @@ fn rs_decode_step_inner( Some(backend), index.map(|v| v as &dyn larql_compute::KvIndex), )?; + // The attention step already projected the new token's K/V (RoPE'd) — + // free; append it to the cache for next step instead of re-deriving. + if cache_eligible { + step_new_kv.push(new_kv); + } if let Some(t) = t_attn { attention_us += t.elapsed().as_secs_f64() * 1e6; } @@ -315,7 +362,14 @@ fn rs_decode_step_inner( cold_residuals: rs.cold_residuals, cold_kv: rs.cold_kv, cold_len: rs.cold_len, - hot_kv: rs.hot_kv, + // Cache the full K/V (returned by attention) for next step when there's + // no cold tier; else None (the cold/windowed path recomputes). The clip + // loop below clips `hot_kv` in lockstep with `stored` when a window is set. + hot_kv: if cache_eligible { + Some(step_new_kv) + } else { + None + }, cold_abs_start: rs.cold_abs_start, next_position: abs_position + 1, max_window: rs.max_window, diff --git a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs index c76094a40..74af7b6c4 100644 --- a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs @@ -106,12 +106,49 @@ pub fn rs_decode_step_codec( let mut h_new = embed_tokens_pub(weights, &[new_token_id]); let mut new_stored: Vec> = Vec::with_capacity(num_layers); + // W2 hot-K/V cache on the resident walk (2026-06-13), twin of + // markov_residual: with no cold tier, `hot_kv` holds the FULL K/V and is + // read instead of re-deriving via `recompute_kv` each step. `stored` + // remains the canonical re-derivable state. `step_new_kv` collects the + // attention step's updated full K/V (= next step's cache). + // Only for unbounded windows (the default): `clip_layer_overflow` is then a + // no-op, so the cache never tracks a window-eviction transition. Windowed + // configs keep the existing recompute path unchanged. + let cache_eligible = + rs.max_window.is_none() && rs.cold_encoded.is_none() && rs.cold_kv.is_none(); + let mut step_new_kv: Vec = + Vec::with_capacity(num_layers); + for layer in 0..num_layers { let h_hot = &rs.stored[layer]; let s_hot = h_hot.shape()[0]; let hot_abs_start = abs_position.saturating_sub(s_hot); - let (k_full, v_full) = if let Some(cold_kv) = &rs.cold_kv { + let (k_full, v_full) = if let Some(hot_kv) = rs.hot_kv.as_ref().filter(|_| cache_eligible) { + // W2 cached path (no cold tier): hot_kv IS the full K/V — read it, + // skip recompute. Debug builds assert it matches a fresh recompute. + let (k_buf, v_buf) = &hot_kv[layer]; + let k = k_buf.slice(s![..s_hot, ..]).to_owned(); + let v = v_buf.slice(s![..s_hot, ..]).to_owned(); + #[cfg(debug_assertions)] + if let Some((rk, rv)) = + recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None) + { + let kd = k + .iter() + .zip(rk.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + let vd = v + .iter() + .zip(rv.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + debug_assert!(kd < 1e-2, "codec hot_kv K cache diverged: {kd}"); + debug_assert!(vd < 1e-2, "codec hot_kv V cache diverged: {vd}"); + } + (k, v) + } else if let Some(cold_kv) = &rs.cold_kv { let (k_cold, v_cold) = &cold_kv[layer]; let (k_hot, v_hot) = recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; let c = k_cold.shape()[0]; @@ -149,7 +186,7 @@ pub fn rs_decode_step_codec( new_stored.push(h_new.clone()); - let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( + let (h_post_attn, new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( weights, &h_new, layer, @@ -158,6 +195,9 @@ pub fn rs_decode_step_codec( Some(backend), index.map(|v| v as &dyn larql_compute::KvIndex), )?; + if cache_eligible { + step_new_kv.push(new_kv); + } let bffn = BackendFfn { weights, backend }; let h_out = crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, &bffn, moe_ffn); @@ -180,7 +220,13 @@ pub fn rs_decode_step_codec( stored: updated_stored, cold_encoded: rs.cold_encoded, cold_kv: rs.cold_kv, - hot_kv: rs.hot_kv, + // Cache the full K/V for next step when there's no cold tier; else None + // (cold/windowed recomputes). clip_layer_overflow clips hot_kv in step. + hot_kv: if cache_eligible { + Some(step_new_kv) + } else { + None + }, cold_abs_start: rs.cold_abs_start, next_position: abs_position + 1, max_window: rs.max_window, diff --git a/crates/larql-kv/src/engines/mod.rs b/crates/larql-kv/src/engines/mod.rs index e27d6af00..ca412def0 100644 --- a/crates/larql-kv/src/engines/mod.rs +++ b/crates/larql-kv/src/engines/mod.rs @@ -287,7 +287,11 @@ mod resident_identity_tests { "{spec}: prefill outputs diverged with flags off" ); - for (step, tok) in [3u32, 4, 5].into_iter().enumerate() { + // Several decode steps so the growing context exercises engines' + // per-step caches deeply — in particular markov-rs/-codec's new + // hot-K/V cache (its debug parity assert fires every cached step, + // which is most of these). + for (step, tok) in (3u32..=12).enumerate() { let d_plain = plain .decode_step(&weights, &ffn, tok) .unwrap_or_else(|e| panic!("{spec}: decode_step failed: {e:?}")); diff --git a/crates/larql-kv/src/engines/turbo_quant/engine.rs b/crates/larql-kv/src/engines/turbo_quant/engine.rs index cb94b2383..99e0dee69 100644 --- a/crates/larql-kv/src/engines/turbo_quant/engine.rs +++ b/crates/larql-kv/src/engines/turbo_quant/engine.rs @@ -238,25 +238,34 @@ pub(super) fn decompress_matrix( let heads_per_vec = kv_dim / head_dim; let bytes_per_head = tq.bytes_per_vector(head_dim); let mut data = vec![0.0f32; num_vecs * kv_dim]; - // Scratch buffers reused across every chunk (mirrors - // compress_matrix). `decoded` is small (head_dim wide) and - // written-then-copied per chunk; without reuse this Vec was - // reallocated once per `(vec, head)` pair. - let mut decoded = Vec::with_capacity(head_dim); - let mut scratch_u8 = Vec::with_capacity(head_dim); - for i in 0..num_vecs { - for h in 0..heads_per_vec { - let offset = (i * heads_per_vec + h) * bytes_per_head; - tq.decode_vector_into( - &bytes[offset..offset + bytes_per_head], - head_dim, - &mut decoded, - &mut scratch_u8, - ); - let row_start = i * kv_dim + h * head_dim; - data[row_start..row_start + head_dim].copy_from_slice(&decoded); + // The per-vector WHT/codebook decode (`decode_vector_into`) is the per-step + // bottleneck (a `/usr/bin/sample` profile put ~35% of the decode driver in + // here, serial). Each vector writes a disjoint `kv_dim`-wide row, so fan it + // across the spin pool — this keeps the cache COMPRESSED (the engine's + // point: still decoded every step) but makes the decode parallel instead of + // single-threaded. Per-chunk scratch (decode needs mutable scratch), + // amortised over `CHUNK_VECS` vectors so it isn't reallocated per (vec,head). + const CHUNK_VECS: usize = 8; + larql_compute::cpu::spin_pool::par_chunks_mut(&mut data, kv_dim * CHUNK_VECS, |ci, chunk| { + let mut decoded = Vec::with_capacity(head_dim); + let mut scratch_u8 = Vec::with_capacity(head_dim); + let base_vec = ci * CHUNK_VECS; + let vecs_in_chunk = chunk.len() / kv_dim; + for v in 0..vecs_in_chunk { + let i = base_vec + v; + for h in 0..heads_per_vec { + let offset = (i * heads_per_vec + h) * bytes_per_head; + tq.decode_vector_into( + &bytes[offset..offset + bytes_per_head], + head_dim, + &mut decoded, + &mut scratch_u8, + ); + let row_start = v * kv_dim + h * head_dim; + chunk[row_start..row_start + head_dim].copy_from_slice(&decoded); + } } - } + }); Array2::from_shape_vec((num_vecs, kv_dim), data).expect("shape mismatch") } From 7fd742815a89f97f26df1b10b3dbb7e8a14301df Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Tue, 16 Jun 2026 23:57:22 +0100 Subject: [PATCH 6/8] stabilizing code --- .../commands/extraction/extract_index_cmd.rs | 12 +- crates/larql-compute/src/attention/decode.rs | 180 ++++++++- crates/larql-compute/src/attention/mod.rs | 5 +- .../larql-compute/src/cpu/ops/moe/expert.rs | 37 +- .../larql-compute/src/cpu/ops/moe/forward.rs | 35 +- crates/larql-compute/src/cpu/spin_pool.rs | 48 +++ .../larql-compute/src/forward/dump_config.rs | 23 +- crates/larql-compute/src/kv_dispatch/cpu.rs | 15 +- crates/larql-compute/src/options.rs | 239 +++++++++--- .../docs/specs/apollo-engine.md | 2 +- .../docs/specs/boundary-kv-engine.md | 14 + crates/larql-inference/src/attention/mod.rs | 3 +- crates/larql-inference/src/chat/mod.rs | 79 ++-- .../src/layer_graph/generate/lm_head.rs | 38 +- .../src/layer_graph/grid/config.rs | 55 ++- crates/larql-inference/src/vindex/dequant.rs | 20 +- .../src/vindex/kquant_forward/hidden.rs | 52 ++- .../src/vindex/walk_ffn/mod.rs | 30 +- crates/larql-kv/ROADMAP.md | 244 ++++++++++++ crates/larql-kv/docs/state-policy.md | 21 +- crates/larql-kv/src/engines/apollo/engine.rs | 20 + .../src/engines/boundary_kv/engine.rs | 76 +++- .../engines/boundary_per_layer/cold_tier.rs | 1 + .../engines/boundary_per_layer/dispatch.rs | 1 + .../src/engines/boundary_per_layer/engine.rs | 56 +++ .../engines/boundary_per_layer/executor.rs | 1 + .../src/engines/boundary_per_layer/store.rs | 8 + .../src/engines/boundary_per_layer/walk.rs | 236 +++++++++-- .../src/engines/markov_residual/compute.rs | 369 +++++++++++++----- .../src/engines/markov_residual/store.rs | 15 +- .../engines/markov_residual_codec/compute.rs | 287 ++++++++++---- crates/larql-kv/src/engines/mod.rs | 64 +-- .../src/engines/unlimited_context/engine.rs | 118 +++++- .../src/engines/unlimited_context/extend.rs | 81 ++++ .../executor/lifecycle/compile/into_model.rs | 8 +- .../executor/lifecycle/compile/into_vindex.rs | 6 +- crates/larql-lql/src/executor/tests.rs | 36 +- .../tests/common/synthetic_q4k_vindex.rs | 1 + .../benches/extract_throughput.rs | 3 + crates/larql-vindex/benches/q4k_vs_f32.rs | 2 + crates/larql-vindex/examples/q4k_demo.rs | 11 +- .../src/extract/streaming/context.rs | 68 ++++ .../larql-vindex/src/extract/streaming/mod.rs | 5 + .../src/extract/streaming/stages/down_meta.rs | 12 +- .../extract/streaming/stages/gate_vectors.rs | 11 +- .../src/index/storage/lm_head/knn.rs | 42 +- crates/larql-vindex/tests/golden_resume.rs | 1 + .../tests/test_streaming_stages_moe.rs | 241 ++++++++++-- crates/larql-vindex/tests/test_vindex.rs | 6 + .../larql-vindex/tests/test_vindex_to_q4k.rs | 3 + 50 files changed, 2296 insertions(+), 645 deletions(-) diff --git a/crates/larql-cli/src/commands/extraction/extract_index_cmd.rs b/crates/larql-cli/src/commands/extraction/extract_index_cmd.rs index c8db6ed68..4a1a89ce6 100644 --- a/crates/larql-cli/src/commands/extraction/extract_index_cmd.rs +++ b/crates/larql-cli/src/commands/extraction/extract_index_cmd.rs @@ -366,14 +366,9 @@ pub fn run(args: ExtractIndexArgs) -> Result<(), Box> { feature_major_down: args.feature_major_down, }; - // Per-expert SVD-summary tier — opt-in via flag. Threaded as env var - // so the streaming gate path can read it without an API break. - if args.summary_features_per_expert > 0 { - std::env::set_var( - "LARQL_SUMMARY_FEATURES_PER_EXPERT", - args.summary_features_per_expert.to_string(), - ); - } + // Per-expert SVD-summary tier (opt-in via `--summary-features-per-expert`) + // is threaded as a parameter to `build_vindex_streaming` below — see the + // `summary_features_per_expert` arg. (Was an env side-channel.) // Dispatch: // @@ -454,6 +449,7 @@ pub fn run(args: ExtractIndexArgs) -> Result<(), Box> { model_name, output, args.down_top_k, + args.summary_features_per_expert, level, dtype, args.quant, diff --git a/crates/larql-compute/src/attention/decode.rs b/crates/larql-compute/src/attention/decode.rs index 7d8dcc609..1e48fa702 100644 --- a/crates/larql-compute/src/attention/decode.rs +++ b/crates/larql-compute/src/attention/decode.rs @@ -749,6 +749,95 @@ pub fn run_attention_block_decode_step_q4k_direct( Some((h_post_attn, (k_concat, v_concat))) } +/// Append one `[1, cols]` row to a doubling-capacity `[cap, cols]` buffer at +/// logical row `len`, growing (doubling) the buffer first if it is full. Mirror +/// of `larql-kv`'s `helpers::append_row` — kept here because larql-compute can't +/// depend on larql-kv. The caller increments its logical length after. +fn append_kv_row(buf: &mut Array2, row: &Array2, len: usize) { + let cap = buf.shape()[0]; + if len == cap { + let cols = buf.shape()[1]; + let new_cap = (cap * 2).max(8); + let mut grown = Array2::::zeros((new_cap, cols)); + grown + .slice_mut(ndarray::s![..len, ..]) + .assign(&buf.slice(ndarray::s![..len, ..])); + *buf = grown; + } + buf.slice_mut(ndarray::s![len..len + 1, ..]).assign(row); +} + +/// In-place Q4K-direct decode-step attention for walk engines that hold their +/// hot K/V as **doubling-capacity** buffers (markov_residual / _codec). It +/// projects the new token's K/V, appends the RoPE'd row into `k_cache`/`v_cache` +/// at logical row `cache_len` (growing the buffer if full), then attends over +/// the `[..cache_len + 1]` views — eliminating the per-step O(ctx) owned concat +/// that [`run_attention_block_decode_step_q4k_direct`] pays. Over an L-token +/// generation that turns the cache copy from O(L²) total into O(L). +/// +/// On return the caller's buffers hold `cache_len + 1` logical rows and the +/// function yields `h_post_attn`. Returns `None` — leaving the buffers +/// **untouched** (the projection runs before any mutation) — when the index has +/// no Q4K attention bytes for this layer, so the caller can fall back to the +/// owned-concat path. Bit-identical to the concat form: same data attended, same +/// kernels; only the cache representation (in-place views vs fresh owned concat) +/// differs. +#[allow(clippy::too_many_arguments)] +pub fn run_attention_block_decode_step_q4k_direct_inplace( + weights: &larql_models::ModelWeights, + h_new: &Array2, + layer: usize, + k_cache: &mut Array2, + v_cache: &mut Array2, + cache_len: usize, + abs_position: usize, + backend: &dyn crate::ComputeBackend, + index: &dyn crate::KvIndex, +) -> Option> { + let proj = decode_step_project_q4k_direct(weights, h_new, layer, abs_position, backend, index)?; + append_kv_row(k_cache, &proj.k_new_rope, cache_len); + append_kv_row(v_cache, &proj.v_new, cache_len); + let total = cache_len + 1; + decode_step_attend_q4k_direct( + weights, + h_new, + layer, + &proj.q_rope, + k_cache.slice(ndarray::s![..total, ..]), + v_cache.slice(ndarray::s![..total, ..]), + backend, + index, + ) +} + +/// Best-available in-place decode-step attention for walk engines that own a +/// doubling-capacity K/V buffer: the Q4K-direct in-place path when the flag is +/// on and an index with attention bytes is supplied, else `None` so the caller +/// uses the owned-concat [`run_attention_block_decode_step_auto`]. The SAME +/// per-layer Q4K-vs-f32 choice the dispatch path makes — see +/// [`run_attention_block_decode_step_auto`]. +#[allow(clippy::too_many_arguments)] +pub fn run_attention_block_decode_step_auto_inplace( + weights: &larql_models::ModelWeights, + h_new: &Array2, + layer: usize, + k_cache: &mut Array2, + v_cache: &mut Array2, + cache_len: usize, + abs_position: usize, + backend: Option<&dyn crate::ComputeBackend>, + index: Option<&dyn crate::KvIndex>, +) -> Option> { + if q4k_direct_attn_enabled() { + if let (Some(be), Some(idx)) = (backend, index) { + return run_attention_block_decode_step_q4k_direct_inplace( + weights, h_new, layer, k_cache, v_cache, cache_len, abs_position, be, idx, + ); + } + } + None +} + #[cfg(test)] mod tests { use super::*; @@ -958,6 +1047,82 @@ mod tests { assert!(h2.iter().all(|x| x.is_finite())); } + /// The in-place form must be **bit-identical** to the owned-concat form + /// across a multi-step decode: same h_post_attn every step, and the + /// doubling-capacity buffer's `[..len]` view must equal the concat's owned + /// K/V. This is the parity gate that lets the walk engines drop the O(ctx) + /// concat. Runs a real multi-layer Q4K fixture for several steps so the + /// buffer crosses a capacity doubling. + #[test] + fn q4k_direct_inplace_is_bit_identical_to_owned_concat() { + let weights = make_test_q4k_weights(); + let idx = make_q4k_fixture_index(&weights); + let backend = crate::CpuBackend; + let num_layers = weights.num_layers; + let kv_dim = { + let arch = &*weights.arch; + arch.num_kv_heads_for_layer(0) * arch.head_dim_for_layer(0) + }; + + // Concat-path cache: one owned SharedKV per layer (grows by concat). + let mut concat_kv: Vec> = vec![None; num_layers]; + // In-place cache: doubling-capacity buffers per layer + a logical length. + let mut inplace_k: Vec> = + (0..num_layers).map(|_| Array2::zeros((0, kv_dim))).collect(); + let mut inplace_v: Vec> = + (0..num_layers).map(|_| Array2::zeros((0, kv_dim))).collect(); + + for step in 0..6 { + // The buffer's logical length at the start of this step == `step`. + let len = step; + let h = Array2::from_elem((1, weights.hidden_size), 0.05 * (step as f32 + 1.0)); + for layer in 0..num_layers { + let (h_concat, new_kv) = run_attention_block_decode_step_q4k_direct( + &weights, + &h, + layer, + concat_kv[layer].as_ref(), + step, + &backend, + &idx, + ) + .expect("concat step"); + + let h_inplace = run_attention_block_decode_step_q4k_direct_inplace( + &weights, + &h, + layer, + &mut inplace_k[layer], + &mut inplace_v[layer], + len, + step, + &backend, + &idx, + ) + .expect("inplace step"); + + // h_post_attn must match bit-for-bit. + for (a, b) in h_concat.iter().zip(h_inplace.iter()) { + assert_eq!(a.to_bits(), b.to_bits(), "h_post_attn diverged step {step} layer {layer}"); + } + // The in-place buffer's logical view must equal the concat K/V. + let total = len + 1; + let k_view = inplace_k[layer].slice(ndarray::s![..total, ..]); + let v_view = inplace_v[layer].slice(ndarray::s![..total, ..]); + assert_eq!(new_kv.0.shape(), k_view.shape(), "K shape step {step} layer {layer}"); + for (a, b) in new_kv.0.iter().zip(k_view.iter()) { + assert_eq!(a.to_bits(), b.to_bits(), "K diverged step {step} layer {layer}"); + } + for (a, b) in new_kv.1.iter().zip(v_view.iter()) { + assert_eq!(a.to_bits(), b.to_bits(), "V diverged step {step} layer {layer}"); + } + concat_kv[layer] = Some(new_kv); + } + } + // Buffer must have grown past its first allocation (crossed a doubling). + assert!(inplace_k[0].shape()[0] >= 6, "buffer should have grown to hold 6 rows"); + } + #[test] fn q4k_direct_decode_step_all_layers_succeed() { let weights = make_test_q4k_weights(); @@ -1014,18 +1179,9 @@ mod tests { // This pins the strict <1e-3 WEIGHT parity of the *f32-activation* // Q4K-direct path. The int8 activation route is now on by default and // carries a looser (~2% scale-relative) bound by design, so disable it - // here. The guard restores the env even if an assertion panics. - struct EnvGuard(Option); - impl Drop for EnvGuard { - fn drop(&mut self) { - match self.0.take() { - Some(v) => std::env::set_var("LARQL_Q4K_ATTN_INT8", v), - None => std::env::remove_var("LARQL_Q4K_ATTN_INT8"), - } - } - } - let _guard = EnvGuard(std::env::var_os("LARQL_Q4K_ATTN_INT8")); - std::env::set_var("LARQL_Q4K_ATTN_INT8", "0"); + // here. Thread-local override (NOT `set_var`, which races concurrent + // `getenv` on the decode path → SIGSEGV); cleared on drop. + let _guard = crate::options::FastPathGuard::set(&[(crate::options::ENV_Q4K_ATTN_INT8, false)]); // Parity contract (roadmap #16, "<1e-3"): the Q4K-direct decode // step should track the f32-BLAS path that runs on the SAME bytes diff --git a/crates/larql-compute/src/attention/mod.rs b/crates/larql-compute/src/attention/mod.rs index b569fd4fd..ce238f9c2 100644 --- a/crates/larql-compute/src/attention/mod.rs +++ b/crates/larql-compute/src/attention/mod.rs @@ -61,8 +61,9 @@ pub use block::{ }; pub use decode::{ gqa_attention_decode_step, run_attention_block_decode_step, - run_attention_block_decode_step_auto, run_attention_block_decode_step_backend, - run_attention_block_decode_step_q4k_direct, + run_attention_block_decode_step_auto, run_attention_block_decode_step_auto_inplace, + run_attention_block_decode_step_backend, run_attention_block_decode_step_q4k_direct, + run_attention_block_decode_step_q4k_direct_inplace, }; pub use gpu::{ q4_attention_proj, run_attention_block_gpu, run_attention_with_kv, diff --git a/crates/larql-compute/src/cpu/ops/moe/expert.rs b/crates/larql-compute/src/cpu/ops/moe/expert.rs index b806d2889..b36ceedd3 100644 --- a/crates/larql-compute/src/cpu/ops/moe/expert.rs +++ b/crates/larql-compute/src/cpu/ops/moe/expert.rs @@ -831,35 +831,24 @@ mod tests { ); } - static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); - fn with_env_in_thread( vars: &'static [(&'static str, Option<&'static str>)], f: impl FnOnce() -> T + Send + 'static, ) -> T { - let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); - let previous: Vec<_> = vars - .iter() - .map(|(n, _)| (*n, std::env::var_os(n))) - .collect(); - for (n, v) in vars { - match v { - Some(s) => std::env::set_var(n, s), - None => std::env::remove_var(n), + // Run on a fresh thread so the TLS-cached env reads (`Q4K_DIRECT`, + // `EXPERT_TIMING`) initialise from this dispatch's values rather than + // inheriting an earlier `false` from another thread. The override is + // thread-local, so it must be set *inside* the spawned thread (where the + // TLS statics initialise); it dies with the thread → no cleanup, no + // `std::env::set_var` (which races `getenv` → SIGSEGV), no lock. + std::thread::spawn(move || { + for (n, v) in vars { + crate::options::set_env_override(n, *v); } - } - // Cross thread boundary so TLS-cached env reads (`Q4K_DIRECT`, - // `EXPERT_TIMING`) initialise to the freshly-set value rather - // than inheriting an earlier `false` from this process's main - // thread. - let result = std::thread::spawn(f).join().expect("thread did not panic"); - for (n, v) in previous { - match v { - Some(s) => std::env::set_var(n, s), - None => std::env::remove_var(n), - } - } - result + f() + }) + .join() + .expect("thread did not panic") } /// `LARQL_Q4K_DIRECT=1` opts in to the q4k-direct matvec path inside diff --git a/crates/larql-compute/src/cpu/ops/moe/forward.rs b/crates/larql-compute/src/cpu/ops/moe/forward.rs index 240dd58e3..cb324d8fc 100644 --- a/crates/larql-compute/src/cpu/ops/moe/forward.rs +++ b/crates/larql-compute/src/cpu/ops/moe/forward.rs @@ -494,33 +494,22 @@ mod tests { ); } - // ENV_LOCK protects tests that set process-wide env vars. The cpu_moe_forward - // hot path reads `LARQL_SKIP_MOE`, `LARQL_MOE_DEBUG`, and (once per thread, - // through a TLS cache) `LARQL_MOE_FWD_TIMING`. - static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); - + // Override env flags on the current thread (`LARQL_SKIP_MOE`, + // `LARQL_MOE_DEBUG`, `LARQL_MOE_FWD_TIMING` — all read via the override-aware + // `options::env_flag`) WITHOUT `std::env::set_var`, which races concurrent + // `getenv` → SIGSEGV. Per-thread, cleared on drop → no lock, no leakage. fn with_env(vars: &[(&'static str, Option<&'static str>)], f: impl FnOnce() -> T) -> T { - // Recover from prior-test panics so an unrelated failure in one - // env-sensitive test doesn't cascade-fail the others. - let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); - let previous: Vec<_> = vars - .iter() - .map(|(name, _)| (*name, std::env::var_os(name))) - .collect(); - for (name, value) in vars { - match value { - Some(v) => std::env::set_var(name, v), - None => std::env::remove_var(name), + struct Clear; + impl Drop for Clear { + fn drop(&mut self) { + crate::options::clear_fast_path_overrides(); } } - let result = f(); - for (name, value) in previous { - match value { - Some(v) => std::env::set_var(name, v), - None => std::env::remove_var(name), - } + let _clear = Clear; + for (name, value) in vars { + crate::options::set_env_override(name, *value); } - result + f() } fn trivial_moe_inputs() -> (usize, usize, Vec, Vec, Vec, Vec) { diff --git a/crates/larql-compute/src/cpu/spin_pool.rs b/crates/larql-compute/src/cpu/spin_pool.rs index 33669875b..cf62591ba 100644 --- a/crates/larql-compute/src/cpu/spin_pool.rs +++ b/crates/larql-compute/src/cpu/spin_pool.rs @@ -519,6 +519,54 @@ mod tests { }); } + /// Cross-dispatch read-after-write — the real decode pipeline shape + /// (dispatch A writes a buffer; the *next* dispatch B reads it and writes a + /// derived buffer). Exercises the visibility the disjoint-write tests don't: + /// workers running dispatch B must observe ALL of dispatch A's writes (the + /// `barrier_A.Acquire → epoch_B.Release → worker_B.Acquire` chain). The pool + /// is oversubscribed (more workers than cores) so the barrier routinely waits + /// on a descheduled worker. Kept fast (a few hundred rounds) — under EXTREME + /// oversubscription (2× burners, 4000 rounds) this and the disjoint-write + /// path stayed correct, so this is a regression guard, not a repro. + #[test] + fn stress_cross_dispatch_read_after_write() { + let cores = std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(8); + // Oversubscribe the pool itself (more workers than cores) so the barrier + // routinely waits on a descheduled worker. + let pool = SpinPool::new((cores + 2).max(4)); + let n = 61usize; // chunks; not a multiple of the thread count + let mut a = vec![0u64; n]; + let mut b = vec![0u64; n]; + for round in 1..=400u64 { + // Dispatch A: fill `a` with a round-derived pattern. + let pa = a.as_mut_ptr() as usize; + pool.for_each_chunk(n, |c| { + // SAFETY: chunk c owns element c. + unsafe { *(pa as *mut u64).add(c) = round.wrapping_mul(c as u64 + 1) | 1 }; + }); + // Dispatch B: read `a`, write `b = f(a)`. If B's workers don't see + // all of A's writes, `b[c]` is wrong (or derived from a stale 0). + let pa_r = a.as_ptr() as usize; + let pb = b.as_mut_ptr() as usize; + pool.for_each_chunk(n, |c| { + // SAFETY: read element c (written by A's chunk c), write b[c]. + let av = unsafe { *(pa_r as *const u64).add(c) }; + unsafe { *(pb as *mut u64).add(c) = av.wrapping_mul(31).wrapping_add(7) }; + }); + for c in 0..n { + let want_a = round.wrapping_mul(c as u64 + 1) | 1; + assert_eq!(a[c], want_a, "round {round} chunk {c}: A wrong"); + assert_eq!( + b[c], + want_a.wrapping_mul(31).wrapping_add(7), + "round {round} chunk {c}: B read a stale/partial A" + ); + } + } + } + #[test] fn back_to_back_dispatches_reuse_workers() { // Exercises the epoch path: many tiny dispatches in a row (the decode diff --git a/crates/larql-compute/src/forward/dump_config.rs b/crates/larql-compute/src/forward/dump_config.rs index d86071302..4c425b287 100644 --- a/crates/larql-compute/src/forward/dump_config.rs +++ b/crates/larql-compute/src/forward/dump_config.rs @@ -139,13 +139,12 @@ impl DumpConfig { /// Read the three env vars and assemble a `DumpConfig`. Public so test /// fixtures can build one without touching the process env. pub fn from_env() -> Self { + // Read through the override-aware `options` helpers so tests can toggle + // these via the thread-local override (no `set_var`/`getenv` race). Self { - layer_dump_dir: std::env::var(ENV_CPU_DUMP_LAYERS).ok(), - stage_dump_dir: std::env::var(ENV_CPU_STAGE_DUMP).ok(), - stage_dump_layer: std::env::var(ENV_STAGE_DUMP_LAYER) - .ok() - .and_then(|s| s.parse::().ok()) - .unwrap_or(0), + layer_dump_dir: crate::options::env_value(ENV_CPU_DUMP_LAYERS), + stage_dump_dir: crate::options::env_value(ENV_CPU_STAGE_DUMP), + stage_dump_layer: crate::options::env_usize(ENV_STAGE_DUMP_LAYER).unwrap_or(0), } } @@ -227,16 +226,12 @@ mod tests { // out a stale path. `get()` now returns a freshly-built // `DumpConfig` per call so flipping the env var actually // takes effect. - // Save + restore to keep the process env clean for other tests. - let prev = std::env::var(ENV_CPU_DUMP_LAYERS).ok(); - std::env::set_var(ENV_CPU_DUMP_LAYERS, "/tmp/dump-a"); + // Thread-local override (no `set_var`/`getenv` race); cleared at the end. + crate::options::set_env_override(ENV_CPU_DUMP_LAYERS, Some("/tmp/dump-a")); let a_dir = DumpConfig::get().layer_dump_dir; - std::env::set_var(ENV_CPU_DUMP_LAYERS, "/tmp/dump-b"); + crate::options::set_env_override(ENV_CPU_DUMP_LAYERS, Some("/tmp/dump-b")); let b_dir = DumpConfig::get().layer_dump_dir; - match prev { - Some(v) => std::env::set_var(ENV_CPU_DUMP_LAYERS, v), - None => std::env::remove_var(ENV_CPU_DUMP_LAYERS), - } + crate::options::clear_fast_path_overrides(); assert_eq!(a_dir, Some("/tmp/dump-a".to_string())); assert_eq!(b_dir, Some("/tmp/dump-b".to_string())); } diff --git a/crates/larql-compute/src/kv_dispatch/cpu.rs b/crates/larql-compute/src/kv_dispatch/cpu.rs index 377a56c9d..47b0aeb2a 100644 --- a/crates/larql-compute/src/kv_dispatch/cpu.rs +++ b/crates/larql-compute/src/kv_dispatch/cpu.rs @@ -942,11 +942,10 @@ mod tests { use crate::test_fixtures::make_q4k_fixture_index; use larql_models::test_fixtures::make_test_q4k_weights; - std::env::set_var("LARQL_Q4K_DIRECT_ATTN", "1"); - // Sanity: the gate now reads as enabled (first read seeds the - // OnceLock to `true` for the remainder of the test binary; nothing - // else here passes an index to `attention_step`, so leaving it set - // is harmless). + // Thread-local override (NOT `set_var`, which races concurrent `getenv` + // → SIGSEGV); cleared on drop, so it can't leak to a sibling test. + let _guard = + crate::options::FastPathGuard::set(&[(crate::options::ENV_Q4K_DIRECT_ATTN, true)]); assert!(q4k_direct_attn_enabled()); let b = backend(); @@ -981,9 +980,9 @@ mod tests { fn attention_step_q4k_direct_falls_back_to_f32_on_empty_index() { use larql_models::test_fixtures::make_test_q4k_weights; - // Flag is already (or will be) enabled process-wide by the sibling - // test; set it again defensively so this test is order-independent. - std::env::set_var("LARQL_Q4K_DIRECT_ATTN", "1"); + // Enable the q4k-direct gate on this thread (override, not `set_var`). + let _guard = + crate::options::FastPathGuard::set(&[(crate::options::ENV_Q4K_DIRECT_ATTN, true)]); struct EmptyIdx; impl crate::KvIndex for EmptyIdx {} diff --git a/crates/larql-compute/src/options.rs b/crates/larql-compute/src/options.rs index ebf5c5716..6aa80f515 100644 --- a/crates/larql-compute/src/options.rs +++ b/crates/larql-compute/src/options.rs @@ -159,44 +159,169 @@ pub const ENV_Q4K_ASM: &str = "LARQL_Q4K_ASM"; /// Spin-barrier thread pool for the decode hot path (vs rayon's sleeping pool). pub const ENV_SPIN_POOL: &str = "LARQL_SPIN_POOL"; -/// A decode fast-path stage is ON unless explicitly disabled -/// (`=0`/`false`/`off`/`no`). -fn fast_path_on(name: &str) -> bool { - !env_opt_out(name) +thread_local! { + /// Per-thread override for env-var reads ([`env_override`]). Tests inject + /// values here to toggle a flag WITHOUT `std::env::set_var`, which is + /// thread-unsafe against the concurrent `getenv` every other parallel test + /// does on the decode path — that race SIGSEGVs libc. Each entry is the raw + /// value the env helper should see: `Some("v")` = "set to v", `None` = "act + /// as if unset". Production never touches this; the map is empty so every + /// helper falls through to the process env unchanged. + static ENV_OVERRIDES: std::cell::RefCell< + std::collections::HashMap<&'static str, Option>, + > = std::cell::RefCell::new(std::collections::HashMap::new()); } -// The per-layer / per-token stages read the env each call (an uncontended -// single-thread `getenv` ~ns, negligible at layer granularity) so they stay -// togglable in tests. The two genuinely hot stages — `asm` (per matvec) and -// `spin_pool` (per parallel section) — cache at first read; no test toggles -// them via env (their unit tests drive the kernels / `SpinPool` directly). +/// The current thread's test override for `name`, if any. The outer `Option` +/// tells overridden-vs-not; the inner is the (possibly-unset) raw value. +fn env_override(name: &str) -> Option> { + ENV_OVERRIDES.with(|o| o.borrow().get(name).cloned()) +} + +/// Effective raw value for `name`: the thread-local override if present, else +/// the process env. The single choke point every env helper reads through. +fn env_effective(name: &str) -> Option { + match env_override(name) { + Some(v) => v, + None => std::env::var(name).ok(), + } +} + +// ── Pure value parsers (no env) — directly unit-tested; the env helpers below +// just feed them the effective raw value. Keeps the "0"/"true"/… vocabulary +// in one place and testable without touching process env. +fn is_opt_out_value(v: Option<&str>) -> bool { + matches!(v, Some("0") | Some("false") | Some("off") | Some("no")) +} +fn is_opt_in_value(v: Option<&str>) -> bool { + matches!(v, Some("1") | Some("true") | Some("on") | Some("yes")) +} + +/// The current thread's override for a fast-path stage flag as a bool, if set. +/// `None` in production → the accessor uses the cached [`decode_options`] value. +fn fast_path_override(name: &'static str) -> Option { + env_override(name).map(|v| !is_opt_out_value(v.as_deref())) +} + +/// Override an env flag on the current thread to a raw string value (`Some`) or +/// unset (`None`) — test-only escape hatch ([`ENV_OVERRIDES`]). Lets tests +/// toggle any flag without process-global env mutation (which segfaults under +/// parallel `getenv`). Clear with [`clear_fast_path_overrides`] on teardown. +#[doc(hidden)] +pub fn set_env_override(name: &'static str, value: Option<&str>) { + ENV_OVERRIDES.with(|o| { + o.borrow_mut().insert(name, value.map(str::to_string)); + }); +} + +/// Override a decode fast-path stage flag on the current thread (test-only). +/// Bool convenience over [`set_env_override`] (`true` → `"1"`, `false` → `"0"`). +#[doc(hidden)] +pub fn set_fast_path_override(name: &'static str, on: bool) { + set_env_override(name, Some(if on { "1" } else { "0" })); +} + +/// Clear all thread-local env overrides (test-only). +#[doc(hidden)] +pub fn clear_fast_path_overrides() { + ENV_OVERRIDES.with(|o| o.borrow_mut().clear()); +} + +/// RAII guard that sets fast-path stage overrides on the current thread and +/// clears them on drop (test-only). Replaces the `std::env::set_var` pattern, +/// which races concurrent `getenv` on the decode path and SIGSEGVs libc. +#[cfg(test)] +pub(crate) struct FastPathGuard; + +#[cfg(test)] +impl FastPathGuard { + pub(crate) fn set(flags: &[(&'static str, bool)]) -> Self { + for &(name, on) in flags { + set_fast_path_override(name, on); + } + FastPathGuard + } +} + +#[cfg(test)] +impl Drop for FastPathGuard { + fn drop(&mut self) { + clear_fast_path_overrides(); + } +} + +/// The decode fast-path stage flags — the single source of truth for "which +/// decode stages are on". Read ONCE from the process env at first use and +/// cached (see [`decode_options`]); each stage is default-ON, opt out with +/// `LARQL_=0`. This folds what were four per-token `getenv`s and two ad-hoc +/// per-stage `OnceLock`s (`asm`, `spin_pool`) into one typed registry. Tests +/// toggle stages per-thread via [`set_fast_path_override`] (no `set_var`, which +/// races the per-token `getenv` and SIGSEGVs libc), and the override wins over +/// this cache. +#[derive(Debug, Clone, Copy)] +pub struct DecodeOptions { + /// Q4_K-direct attention projections (read Q4_K bytes from the index). + pub q4k_direct_attn: bool, + /// Int8 (Q8_K) activation route for the Q4_K-direct attention projections. + pub q4k_attn_int8: bool, + /// Q4_K lm_head (vocab projection straight from the Q4_K view). + pub q4k_lm_head: bool, + /// Q4_K-direct dense-FFN decode slab (prefill stays f32 gemm). + pub q4k_direct_ffn: bool, + /// Hand-asm aarch64 Q4_K/Q6_K kernels (bit-exact with the intrinsic path). + pub q4k_asm: bool, + /// Spin-barrier thread pool for the decode hot path (vs rayon's pool). + pub spin_pool: bool, +} + +impl DecodeOptions { + fn from_env() -> Self { + // RAW process env (bypass the per-thread override): this is the + // process-wide cached production value, and a test's thread-local + // override must not be baked into it (the accessors apply the override + // per-call instead, via `fast_path_override`). + let on = |name: &str| !is_opt_out_value(std::env::var(name).ok().as_deref()); + Self { + q4k_direct_attn: on(ENV_Q4K_DIRECT_ATTN), + q4k_attn_int8: on(ENV_Q4K_ATTN_INT8), + q4k_lm_head: on(ENV_Q4K_LM_HEAD), + q4k_direct_ffn: on(ENV_Q4K_DIRECT_FFN), + q4k_asm: on(ENV_Q4K_ASM), + spin_pool: on(ENV_SPIN_POOL), + } + } +} + +/// Process-wide decode fast-path flags, built from env on first use and cached. +/// The single registry the per-stage `*_enabled()` accessors read. +pub fn decode_options() -> &'static DecodeOptions { + static OPTS: std::sync::OnceLock = std::sync::OnceLock::new(); + OPTS.get_or_init(DecodeOptions::from_env) +} /// Q4_K-direct attention projections enabled (default on). pub fn q4k_direct_attn_enabled() -> bool { - fast_path_on(ENV_Q4K_DIRECT_ATTN) + fast_path_override(ENV_Q4K_DIRECT_ATTN).unwrap_or(decode_options().q4k_direct_attn) } /// Int8 attention projection route enabled (default on). pub fn q4k_attn_int8_enabled() -> bool { - fast_path_on(ENV_Q4K_ATTN_INT8) + fast_path_override(ENV_Q4K_ATTN_INT8).unwrap_or(decode_options().q4k_attn_int8) } /// Q4_K lm_head enabled (default on; falls back to f32 without a head view). pub fn q4k_lm_head_enabled() -> bool { - fast_path_on(ENV_Q4K_LM_HEAD) + fast_path_override(ENV_Q4K_LM_HEAD).unwrap_or(decode_options().q4k_lm_head) } /// Q4_K-direct dense-FFN decode slab enabled (default on). pub fn q4k_direct_ffn_enabled() -> bool { - fast_path_on(ENV_Q4K_DIRECT_FFN) + fast_path_override(ENV_Q4K_DIRECT_FFN).unwrap_or(decode_options().q4k_direct_ffn) } -/// Hand-asm Q4_K/Q6_K kernels enabled (default on; aarch64 only). Cached — read -/// per matvec. +/// Hand-asm Q4_K/Q6_K kernels enabled (default on; aarch64 only). pub fn q4k_asm_enabled() -> bool { - static ON: std::sync::OnceLock = std::sync::OnceLock::new(); - *ON.get_or_init(|| fast_path_on(ENV_Q4K_ASM)) + fast_path_override(ENV_Q4K_ASM).unwrap_or(decode_options().q4k_asm) } -/// Spin-barrier decode pool enabled (default on). Cached — read per section. +/// Spin-barrier decode pool enabled (default on). pub fn spin_pool_enabled() -> bool { - static ON: std::sync::OnceLock = std::sync::OnceLock::new(); - *ON.get_or_init(|| fast_path_on(ENV_SPIN_POOL)) + fast_path_override(ENV_SPIN_POOL).unwrap_or(decode_options().spin_pool) } // Helpers below are `pub` (not `pub(crate)`) because sibling backend @@ -206,8 +331,15 @@ pub fn spin_pool_enabled() -> bool { // `env::var_os`/`parse::` boilerplate and risk drift in how // "set" / "true" / "1" are interpreted across backends. +// All of these read through `env_effective` so the thread-local test override +// applies uniformly (no `std::env::set_var` in tests → no `setenv`/`getenv` +// SIGSEGV race). In production the override map is empty, so each is exactly +// the prior `std::env::var*` read. pub fn env_flag(name: &str) -> bool { - std::env::var_os(name).is_some() + match env_override(name) { + Some(v) => v.is_some(), + None => std::env::var_os(name).is_some(), + } } pub fn env_flag_any(names: &[&str]) -> bool { @@ -215,11 +347,11 @@ pub fn env_flag_any(names: &[&str]) -> bool { } pub fn env_usize(name: &str) -> Option { - std::env::var(name).ok()?.parse().ok() + env_effective(name)?.parse().ok() } pub fn env_value(name: &str) -> Option { - std::env::var(name).ok() + env_effective(name) } pub fn env_nonempty_value(name: &str) -> Option { @@ -227,21 +359,15 @@ pub fn env_nonempty_value(name: &str) -> Option { } pub fn env_opt_in(name: &str) -> bool { - matches!( - std::env::var(name).as_deref(), - Ok("1") | Ok("true") | Ok("on") | Ok("yes") - ) + is_opt_in_value(env_effective(name).as_deref()) } pub fn env_opt_out(name: &str) -> bool { - matches!( - std::env::var(name).as_deref(), - Ok("0") | Ok("false") | Ok("off") | Ok("no") - ) + is_opt_out_value(env_effective(name).as_deref()) } pub fn env_not_zero_or_default(name: &str, default: bool) -> bool { - std::env::var(name) + env_effective(name) .map(|value| value != "0") .unwrap_or(default) } @@ -262,34 +388,43 @@ pub fn split_profile_requested() -> bool { mod tests { use super::*; - static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); - - fn with_env_vars(vars: &[(&str, Option<&str>)], f: impl FnOnce() -> T) -> T { - let _guard = ENV_LOCK.lock().expect("env test mutex poisoned"); - let previous: Vec<_> = vars - .iter() - .map(|(name, _)| (*name, std::env::var_os(name))) - .collect(); - for (name, value) in vars { - match value { - Some(value) => std::env::set_var(name, value), - None => std::env::remove_var(name), + /// Run `f` with the given env flags overridden on the current thread via the + /// thread-local override (NOT `std::env::set_var`, which races concurrent + /// `getenv` → SIGSEGV). Cleared on drop, so no cross-test leakage and no + /// serialization needed. + fn with_env_vars(vars: &[(&'static str, Option<&str>)], f: impl FnOnce() -> T) -> T { + struct Clear; + impl Drop for Clear { + fn drop(&mut self) { + clear_fast_path_overrides(); } } - let result = f(); - for (name, value) in previous { - match value { - Some(value) => std::env::set_var(name, value), - None => std::env::remove_var(name), - } + let _clear = Clear; + for (name, value) in vars { + set_env_override(name, *value); } - result + f() } - fn with_env(name: &str, value: Option<&str>, f: impl FnOnce() -> T) -> T { + fn with_env(name: &'static str, value: Option<&str>, f: impl FnOnce() -> T) -> T { with_env_vars(&[(name, value)], f) } + #[test] + fn opt_value_parsers_recognise_the_vocabulary() { + for v in ["0", "false", "off", "no"] { + assert!(is_opt_out_value(Some(v))); + assert!(!is_opt_in_value(Some(v))); + } + for v in ["1", "true", "on", "yes"] { + assert!(is_opt_in_value(Some(v))); + assert!(!is_opt_out_value(Some(v))); + } + assert!(!is_opt_out_value(None)); + assert!(!is_opt_in_value(None)); + assert!(!is_opt_out_value(Some("maybe"))); + } + #[test] fn env_flag_and_value_helpers_read_presence_and_content() { with_env(ENV_GPU_TIMING, Some("1"), || { diff --git a/crates/larql-inference/docs/specs/apollo-engine.md b/crates/larql-inference/docs/specs/apollo-engine.md index 031248455..fd701e55c 100644 --- a/crates/larql-inference/docs/specs/apollo-engine.md +++ b/crates/larql-inference/docs/specs/apollo-engine.md @@ -114,7 +114,7 @@ that consumes them. | Concern | Location | |---|---| -| Engine struct + `KvEngine` impl | `crates/larql-kv/src/engines/apollo/engine.rs` | +| Engine struct + `RetrievalEngine` impl (NOT `KvEngine` — see the 2026-05-24 note above) | `crates/larql-kv/src/engines/apollo/engine.rs` | | Store schema | `crates/larql-apollo/` | | `forward_from_layer` (run-tail) | `crates/larql-inference/src/forward/from_layer.rs` | | Residual capture (offline) | `crates/larql-apollo::capture` | diff --git a/crates/larql-inference/docs/specs/boundary-kv-engine.md b/crates/larql-inference/docs/specs/boundary-kv-engine.md index 3d2d4a186..cfbddc742 100644 --- a/crates/larql-inference/docs/specs/boundary-kv-engine.md +++ b/crates/larql-inference/docs/specs/boundary-kv-engine.md @@ -1,6 +1,16 @@ # BoundaryKvEngine — Specification **Status:** 📝 Draft v0.1 (2026-05-17). + +> ⚠️ **Implementation status (2026-06-14): the emit half is shipped; the +> RESUME half is NOT.** Boundary-frame *emission* at chunk boundaries +> (§6.1/§6.2, Phase 1–2) is live and tested. Everything describing +> *restore* — §2.2 (cross-session restore contract), §2.3, §6.3 +> (`resume`), §8.3, §8.5, and Phase 3 — specifies *intended* behaviour +> that is **not yet built**. Treat those sections as design, not +> as-implemented. (`BoundaryKvEngine::resume` does not exist in the +> code; the frame chain is currently a write-only transport artifact.) + **Audience:** LARQL contributors. **Scope:** Contract for a KV-cache engine in `larql-kv` that emits and consumes `larql-boundary` frames at chunk boundaries, enabling compact @@ -300,6 +310,10 @@ the step), emit a frame. ### 6.3 `resume(boundary_chain, optional_hot_tokens) -> State` +> ⚠️ **NOT IMPLEMENTED** (2026-06-14). This describes intended behaviour; +> `BoundaryKvEngine::resume` does not exist in the code yet. See the +> banner at the top of this spec. + Reconstructs a decode state from a previously emitted boundary chain. Required behaviour: diff --git a/crates/larql-inference/src/attention/mod.rs b/crates/larql-inference/src/attention/mod.rs index 551b2d14b..9ff9408a8 100644 --- a/crates/larql-inference/src/attention/mod.rs +++ b/crates/larql-inference/src/attention/mod.rs @@ -32,7 +32,8 @@ pub use block::{ }; pub use decode::{ gqa_attention_decode_step, run_attention_block_decode_step, - run_attention_block_decode_step_auto, run_attention_block_decode_step_backend, + run_attention_block_decode_step_auto, run_attention_block_decode_step_auto_inplace, + run_attention_block_decode_step_backend, }; pub use gpu::{ q4_attention_proj, run_attention_block_gpu, run_attention_with_kv, diff --git a/crates/larql-inference/src/chat/mod.rs b/crates/larql-inference/src/chat/mod.rs index 41b12fcd7..a8c02b22a 100644 --- a/crates/larql-inference/src/chat/mod.rs +++ b/crates/larql-inference/src/chat/mod.rs @@ -170,10 +170,16 @@ pub fn render_user_prompt( family: &str, user_prompt: &str, ) -> Result { - let raw_prompt = std::env::var(ENV_RAW_PROMPT).is_ok(); - let enable_thinking = std::env::var(ENV_THINKING).is_ok(); - let user_system = std::env::var(ENV_SYSTEM).ok(); - let suppress_default = std::env::var(ENV_NO_DEFAULT_SYSTEM).is_ok(); + // Read through the override-aware helpers (NOT raw `std::env::var`) so + // tests can toggle these per-thread without `std::env::set_var`, which + // races concurrent `getenv` on the decode path and SIGSEGVs libc. + // `env_flag` matches the old `var(_).is_ok()` (set, even empty) and + // `env_value` matches `var(_).ok()`. + use larql_compute::options::{env_flag, env_value}; + let raw_prompt = env_flag(ENV_RAW_PROMPT); + let enable_thinking = env_flag(ENV_THINKING); + let user_system = env_value(ENV_SYSTEM); + let suppress_default = env_flag(ENV_NO_DEFAULT_SYSTEM); if raw_prompt { return Ok(user_prompt.to_string()); @@ -400,47 +406,55 @@ mod integration_tests { assert!(read_chat_template(tmp.path()).is_none()); } - // ── render_user_prompt — serialised via mutex because the function - // reads process-global env vars that other tests would race on. ──── - use std::sync::{Mutex, OnceLock}; - - fn env_lock() -> &'static Mutex<()> { - static LOCK: OnceLock> = OnceLock::new(); - LOCK.get_or_init(|| Mutex::new(())) + // ── render_user_prompt — `render_user_prompt` reads its prompt-affecting + // env vars through the override-aware `larql_compute::options` helpers, + // so tests toggle them per-thread (NOT `std::env::set_var`, which races + // concurrent `getenv` on the decode path → SIGSEGV). No global lock is + // needed: each test's overrides are thread-local and cleared on drop. ── + + /// RAII guard that pins the prompt-affecting env vars on the current + /// thread (all unset by default, then any caller-supplied values) and + /// clears every override on drop. Panic-safe; needs no lock. + struct EnvGuard; + impl Drop for EnvGuard { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + } } - - /// Helper: clear all prompt-affecting env vars while holding the - /// global lock. Returns the lock guard so the caller's mutations - /// stay isolated for the rest of the test. - fn lock_and_clear_env() -> std::sync::MutexGuard<'static, ()> { - // Recover from a poisoned lock (a panicking earlier test) — we - // just want serialisation, not panic propagation. - let guard = env_lock().lock().unwrap_or_else(|p| p.into_inner()); + /// Clear all prompt-affecting env vars on this thread (overrides them to + /// "unset"), then set any caller-supplied values. Returns the guard so the + /// overrides stay live for the rest of the test and are cleared on drop. + fn lock_and_clear_env_with(vars: &[(&'static str, &str)]) -> EnvGuard { for k in [ ENV_RAW_PROMPT, ENV_THINKING, ENV_SYSTEM, ENV_NO_DEFAULT_SYSTEM, ] { - unsafe { std::env::remove_var(k) }; + larql_compute::options::set_env_override(k, None); } - guard + for &(name, value) in vars { + larql_compute::options::set_env_override(name, Some(value)); + } + EnvGuard + } + + /// Convenience for the common "clear everything, set nothing" case. + fn lock_and_clear_env() -> EnvGuard { + lock_and_clear_env_with(&[]) } #[test] fn render_user_prompt_passes_through_when_raw_env_set() { - let _g = lock_and_clear_env(); - unsafe { std::env::set_var(ENV_RAW_PROMPT, "1") }; + let _g = lock_and_clear_env_with(&[(ENV_RAW_PROMPT, "1")]); let tmp = tempfile::tempdir().unwrap(); let result = render_user_prompt(tmp.path(), "tinymodel", "hello").unwrap(); - unsafe { std::env::remove_var(ENV_RAW_PROMPT) }; assert_eq!(result, "hello"); } #[test] fn render_user_prompt_uses_vindex_template_when_no_system() { - let _g = lock_and_clear_env(); - unsafe { std::env::set_var(ENV_NO_DEFAULT_SYSTEM, "1") }; + let _g = lock_and_clear_env_with(&[(ENV_NO_DEFAULT_SYSTEM, "1")]); let tmp = tempfile::tempdir().unwrap(); std::fs::write( tmp.path().join("chat_template.jinja"), @@ -448,17 +462,14 @@ mod integration_tests { ) .unwrap(); let result = render_user_prompt(tmp.path(), "tinymodel", "hi"); - unsafe { std::env::remove_var(ENV_NO_DEFAULT_SYSTEM) }; assert_eq!(result.unwrap(), "WRAP:hi"); } #[test] fn render_user_prompt_errors_when_no_template_and_system_requested() { - let _g = lock_and_clear_env(); - unsafe { std::env::set_var(ENV_SYSTEM, "you are helpful") }; + let _g = lock_and_clear_env_with(&[(ENV_SYSTEM, "you are helpful")]); let tmp = tempfile::tempdir().unwrap(); let err = render_user_prompt(tmp.path(), "unknown-family", "hi"); - unsafe { std::env::remove_var(ENV_SYSTEM) }; let msg = err.unwrap_err(); assert!( msg.contains("no chat template"), @@ -468,8 +479,7 @@ mod integration_tests { #[test] fn render_user_prompt_renders_with_explicit_system_message() { - let _g = lock_and_clear_env(); - unsafe { std::env::set_var(ENV_SYSTEM, "you are helpful") }; + let _g = lock_and_clear_env_with(&[(ENV_SYSTEM, "you are helpful")]); let tmp = tempfile::tempdir().unwrap(); std::fs::write( tmp.path().join("chat_template.jinja"), @@ -477,7 +487,6 @@ mod integration_tests { ) .unwrap(); let result = render_user_prompt(tmp.path(), "tinymodel", "hi"); - unsafe { std::env::remove_var(ENV_SYSTEM) }; let s = result.unwrap(); assert!(s.contains("you are helpful")); assert!(s.contains("hi")); @@ -485,8 +494,7 @@ mod integration_tests { #[test] fn render_user_prompt_uses_family_default_when_thinking_set() { - let _g = lock_and_clear_env(); - unsafe { std::env::set_var(ENV_THINKING, "1") }; + let _g = lock_and_clear_env_with(&[(ENV_THINKING, "1")]); let tmp = tempfile::tempdir().unwrap(); std::fs::write( tmp.path().join("chat_template.jinja"), @@ -494,7 +502,6 @@ mod integration_tests { ) .unwrap(); let result = render_user_prompt(tmp.path(), "gemma4", "hi"); - unsafe { std::env::remove_var(ENV_THINKING) }; let s = result.unwrap(); assert!(s.contains("")); assert!(s.contains("hi")); diff --git a/crates/larql-inference/src/layer_graph/generate/lm_head.rs b/crates/larql-inference/src/layer_graph/generate/lm_head.rs index c599513f6..a40c3e5d8 100644 --- a/crates/larql-inference/src/layer_graph/generate/lm_head.rs +++ b/crates/larql-inference/src/layer_graph/generate/lm_head.rs @@ -18,11 +18,13 @@ impl LmHeadPolicy { } } +/// `1`/`true`/`on`/`yes` → true, everything else (incl. unset) → false. +/// Delegates to the override-aware `options::env_opt_in` (identical vocabulary) +/// so tests can toggle the flag via the thread-local override instead of +/// `std::env::set_var`, which races concurrent `getenv` on the decode path → +/// SIGSEGV. fn env_bool(name: &str) -> bool { - matches!( - std::env::var(name).as_deref(), - Ok("1") | Ok("true") | Ok("on") | Ok("yes") - ) + larql_compute::options::env_opt_in(name) } /// Top-K logits lookup that transparently handles models with tied @@ -322,19 +324,27 @@ mod tests { #[test] fn env_bool_recognises_truthy_values() { - std::env::remove_var("LARQL_TEST_LMHEAD_ENV_BOOL"); - assert!(!env_bool("LARQL_TEST_LMHEAD_ENV_BOOL")); + // Toggle via the thread-local override (NOT `std::env::set_var`, which + // races concurrent `getenv` on the decode path → SIGSEGV). Cleared on + // drop so a panicking assert can't leak into a later test. + const VAR: &str = "LARQL_LM_HEAD_SKIP_Q4K"; + struct Clear; + impl Drop for Clear { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + } + } + let _clear = Clear; + + larql_compute::options::set_env_override(VAR, None); + assert!(!env_bool(VAR)); for &v in &["1", "true", "on", "yes"] { - std::env::set_var("LARQL_TEST_LMHEAD_ENV_BOOL", v); - assert!( - env_bool("LARQL_TEST_LMHEAD_ENV_BOOL"), - "value {v:?} should be truthy" - ); + larql_compute::options::set_env_override(VAR, Some(v)); + assert!(env_bool(VAR), "value {v:?} should be truthy"); } // Falsy: anything else. - std::env::set_var("LARQL_TEST_LMHEAD_ENV_BOOL", "no"); - assert!(!env_bool("LARQL_TEST_LMHEAD_ENV_BOOL")); - std::env::remove_var("LARQL_TEST_LMHEAD_ENV_BOOL"); + larql_compute::options::set_env_override(VAR, Some("no")); + assert!(!env_bool(VAR)); } #[test] diff --git a/crates/larql-inference/src/layer_graph/grid/config.rs b/crates/larql-inference/src/layer_graph/grid/config.rs index e096111c8..023c37192 100644 --- a/crates/larql-inference/src/layer_graph/grid/config.rs +++ b/crates/larql-inference/src/layer_graph/grid/config.rs @@ -14,13 +14,17 @@ pub(super) struct GridRuntimeConfig { impl GridRuntimeConfig { pub fn from_env() -> Self { + // Read through the override-aware `options` helpers so tests can toggle + // these via the thread-local override (NOT `std::env::set_var`, which + // races concurrent `getenv` on the decode path → SIGSEGV). Behaviour is + // identical: `env_usize` = `var().ok().and_then(parse)`, `env_flag` = + // `var().is_ok()` (presence-as-truth, any value incl. empty). + use larql_compute::options::{env_flag, env_usize}; Self { - moe_top_k_override: std::env::var(ENV_MOE_TOP_K) - .ok() - .and_then(|s| s.parse::().ok()), - skip_moe: std::env::var(ENV_SKIP_MOE).is_ok(), - timing_enabled: std::env::var(ENV_MOE_TIMING).is_ok(), - split_disabled: std::env::var(ENV_MOE_NO_SPLIT).is_ok(), + moe_top_k_override: env_usize(ENV_MOE_TOP_K), + skip_moe: env_flag(ENV_SKIP_MOE), + timing_enabled: env_flag(ENV_MOE_TIMING), + split_disabled: env_flag(ENV_MOE_NO_SPLIT), token_policy: TokenSelectionPolicy::from_env(), } } @@ -38,34 +42,25 @@ mod tests { #[test] fn from_env_returns_default_when_no_vars_set() { - // Ensure none of the env vars are set so we exercise the default - // arms of every `.is_ok()` / `.parse().ok()` chain. - let prev_topk = std::env::var(ENV_MOE_TOP_K).ok(); - let prev_skip = std::env::var(ENV_SKIP_MOE).ok(); - let prev_timing = std::env::var(ENV_MOE_TIMING).ok(); - let prev_split = std::env::var(ENV_MOE_NO_SPLIT).ok(); - std::env::remove_var(ENV_MOE_TOP_K); - std::env::remove_var(ENV_SKIP_MOE); - std::env::remove_var(ENV_MOE_TIMING); - std::env::remove_var(ENV_MOE_NO_SPLIT); + // Force every MoE var to act as unset via the thread-local override + // (NOT `std::env::set_var`, which races concurrent `getenv` on the + // decode path → SIGSEGV) so we exercise the default arms of every + // `env_usize` / `env_flag` read regardless of the ambient process env. + // Cleared on drop; per-thread, so no cross-test leakage. + struct Clear; + impl Drop for Clear { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + } + } + let _clear = Clear; + for var in [ENV_MOE_TOP_K, ENV_SKIP_MOE, ENV_MOE_TIMING, ENV_MOE_NO_SPLIT] { + larql_compute::options::set_env_override(var, None); + } let cfg = GridRuntimeConfig::from_env(); assert!(cfg.moe_top_k_override.is_none()); assert!(!cfg.skip_moe); - - // Restore. - if let Some(v) = prev_topk { - std::env::set_var(ENV_MOE_TOP_K, v); - } - if let Some(v) = prev_skip { - std::env::set_var(ENV_SKIP_MOE, v); - } - if let Some(v) = prev_timing { - std::env::set_var(ENV_MOE_TIMING, v); - } - if let Some(v) = prev_split { - std::env::set_var(ENV_MOE_NO_SPLIT, v); - } } #[test] diff --git a/crates/larql-inference/src/vindex/dequant.rs b/crates/larql-inference/src/vindex/dequant.rs index 62d214da6..746db7315 100644 --- a/crates/larql-inference/src/vindex/dequant.rs +++ b/crates/larql-inference/src/vindex/dequant.rs @@ -86,21 +86,21 @@ mod tests { /// Pin `LARQL_Q4K_ATTN_INT8=0` for the f32-activation Q4K-direct parity /// tests below: they assert the strict `<1e-3` weight bound, which only the /// f32-activation route satisfies. The int8 route is on by default and - /// carries a looser (~2% scale-relative) bound by design. Restores the - /// prior env value on drop (even if an assertion panics). - struct Int8OffGuard(Option); + /// carries a looser (~2% scale-relative) bound by design. Uses the + /// thread-local override (NOT `std::env::set_var`, which races concurrent + /// `getenv` on the decode path → SIGSEGV); cleared on drop. + struct Int8OffGuard; impl Drop for Int8OffGuard { fn drop(&mut self) { - match self.0.take() { - Some(v) => std::env::set_var("LARQL_Q4K_ATTN_INT8", v), - None => std::env::remove_var("LARQL_Q4K_ATTN_INT8"), - } + larql_compute::options::clear_fast_path_overrides(); } } fn pin_int8_off() -> Int8OffGuard { - let guard = Int8OffGuard(std::env::var_os("LARQL_Q4K_ATTN_INT8")); - std::env::set_var("LARQL_Q4K_ATTN_INT8", "0"); - guard + larql_compute::options::set_fast_path_override( + larql_compute::options::ENV_Q4K_ATTN_INT8, + false, + ); + Int8OffGuard } /// `ensure_attn_tensors_dequantised` populates every layer's diff --git a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs index aeb4d5534..2c9cd23a7 100644 --- a/crates/larql-inference/src/vindex/kquant_forward/hidden.rs +++ b/crates/larql-inference/src/vindex/kquant_forward/hidden.rs @@ -491,26 +491,36 @@ mod tests { /// `predict_kquant_hidden` with both `LARQL_CPU_DUMP_LAYERS` and /// `LARQL_CPU_STAGE_DUMP` set drives the dump branches inside the /// main loop (lines 30-33, 78-84) and inside `run_moe_layer_cpu` - /// (lines 143-147, 190-194). Serialized via a local mutex because - /// `DumpConfig::get()` reads process-global env vars on every call. + /// (lines 143-147, 190-194). The flags are toggled via the thread-local + /// override (NOT `std::env::set_var`, which races concurrent `getenv` on + /// the decode path → SIGSEGV); `DumpConfig::from_env` reads them through + /// the override-aware `options::env_value` helper, so the override reaches + /// the producer in this same thread. No serialising mutex needed — the + /// override is per-thread, so it can't leak into a parallel test. #[test] fn predict_kquant_hidden_writes_dumps_when_env_vars_set() { - use std::sync::{Mutex, OnceLock}; - static LOCK: OnceLock> = OnceLock::new(); - let _g = LOCK - .get_or_init(|| Mutex::new(())) - .lock() - .unwrap_or_else(|p| p.into_inner()); + use larql_compute::forward::dump_config::{ENV_CPU_DUMP_LAYERS, ENV_CPU_STAGE_DUMP}; + + /// Clears the thread-local overrides on drop so a panicking assert + /// can't leak them into a later test on the same worker thread. + struct DumpEnvGuard; + impl Drop for DumpEnvGuard { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + } + } let layer_dir = tempfile::tempdir().expect("layer dump tempdir"); let stage_dir = tempfile::tempdir().expect("stage dump tempdir"); - let prev_l = std::env::var("LARQL_CPU_DUMP_LAYERS").ok(); - let prev_s = std::env::var("LARQL_CPU_STAGE_DUMP").ok(); - // SAFETY: held lock serialises env reads/writes for this test. - unsafe { - std::env::set_var("LARQL_CPU_DUMP_LAYERS", layer_dir.path()); - std::env::set_var("LARQL_CPU_STAGE_DUMP", stage_dir.path()); - } + let _guard = DumpEnvGuard; + larql_compute::options::set_env_override( + ENV_CPU_DUMP_LAYERS, + Some(layer_dir.path().to_str().expect("utf-8 tempdir path")), + ); + larql_compute::options::set_env_override( + ENV_CPU_STAGE_DUMP, + Some(stage_dir.path().to_str().expect("utf-8 tempdir path")), + ); use crate::test_utils::{make_test_gemma4_moe_weights, make_test_q4k_vindex}; let mut weights = make_test_gemma4_moe_weights(); @@ -518,18 +528,6 @@ mod tests { let h = predict_kquant_hidden(&mut weights, &[0u32, 1], &index, None); assert_eq!(h.shape(), &[2, weights.hidden_size]); - // Restore env (or remove if not previously set). - unsafe { - match prev_l { - Some(v) => std::env::set_var("LARQL_CPU_DUMP_LAYERS", v), - None => std::env::remove_var("LARQL_CPU_DUMP_LAYERS"), - } - match prev_s { - Some(v) => std::env::set_var("LARQL_CPU_STAGE_DUMP", v), - None => std::env::remove_var("LARQL_CPU_STAGE_DUMP"), - } - } - // Embed dump must exist (written at line 33 unconditionally when // layer_dir is Some). Per-layer dumps land under cpu_layer_NN.f32. assert!( diff --git a/crates/larql-inference/src/vindex/walk_ffn/mod.rs b/crates/larql-inference/src/vindex/walk_ffn/mod.rs index a33221aa4..347b5e803 100644 --- a/crates/larql-inference/src/vindex/walk_ffn/mod.rs +++ b/crates/larql-inference/src/vindex/walk_ffn/mod.rs @@ -199,12 +199,18 @@ thread_local! { static WALK_TRACE_ENABLED: std::cell::Cell> = const { std::cell::Cell::new(None) }; } +/// `LARQL_WALK_TRACE=1` — emit the per-feature walk trace to stderr. +const ENV_WALK_TRACE: &str = "LARQL_WALK_TRACE"; + fn walk_trace_env_enabled() -> bool { WALK_TRACE_ENABLED.with(|c| { if let Some(v) = c.get() { return v; } - let enabled = std::env::var("LARQL_WALK_TRACE").ok().as_deref() == Some("1"); + // Route through the override-aware `options` helper so tests can toggle + // this via the thread-local override (NOT `std::env::set_var`, which + // races concurrent `getenv` on the decode path → SIGSEGV). + let enabled = larql_compute::options::env_value(ENV_WALK_TRACE).as_deref() == Some("1"); c.set(Some(enabled)); enabled }) @@ -861,19 +867,17 @@ mod dispatch_tests { /// `walk_trace_env_enabled` caches the env-var lookup in a thread-local. /// Spawn a fresh thread so the cell starts empty, set `LARQL_WALK_TRACE=1` - /// in that thread, then drive `forward` — `trace_path` reads the cache - /// (first call populates it as `Some(true)`) and emits to stderr - /// (line 145-147). + /// in that thread via the thread-local override (NOT `std::env::set_var`, + /// which races concurrent `getenv` on the decode path → SIGSEGV), then + /// drive `forward` — `trace_path` reads the cache (first call populates it + /// as `Some(true)`) and emits to stderr (line 145-147). The override is + /// set INSIDE the spawned thread because both the override map and the + /// `WALK_TRACE_ENABLED` cache are thread-local; it is cleared before the + /// thread returns. #[test] fn walk_ffn_trace_path_honours_env_var_in_fresh_thread() { let handle = std::thread::spawn(|| { - // SAFETY: thread-isolated env var. The whole point of running - // in a dedicated thread is that no other test sees this var - // mid-flight — and we wipe it before returning. Set + remove - // are bracketed within a single thread's lifetime. - unsafe { - std::env::set_var("LARQL_WALK_TRACE", "1"); - } + larql_compute::options::set_env_override(super::ENV_WALK_TRACE, Some("1")); let weights = make_test_weights(); let idx = MockGateIndex { n_features: weights.intermediate_size, @@ -888,9 +892,7 @@ mod dispatch_tests { .unwrap(); // Drives trace_path, which checks walk_trace_env_enabled. ffn.forward(0, &x); - unsafe { - std::env::remove_var("LARQL_WALK_TRACE"); - } + larql_compute::options::clear_fast_path_overrides(); }); handle.join().expect("env-var thread panicked"); } diff --git a/crates/larql-kv/ROADMAP.md b/crates/larql-kv/ROADMAP.md index ffa66a942..2fc6a8c1c 100644 --- a/crates/larql-kv/ROADMAP.md +++ b/crates/larql-kv/ROADMAP.md @@ -90,6 +90,95 @@ f32 attention. **Fixed:** Final matrix: standard 34.5 / unlimited 32.1 / markov 27.9 / codec 27.7 / boundary-kv 27.4 / turbo 21.1 — all **0.6–1.0× of standard** (was 0.24–0.31× for the slow three). 756 kv tests green debug+release, clippy clean. +- **Comparative bottleneck review + walk allocation fix 2026-06-14.** Profiled + each engine's driver vs standard: the **shared** wall is the Q6_K expert + matvec (all engines inherit it); each engine's *delta* is its feature + machinery. markov/codec's −19/−20% was NOT the residual-store memcpy (~0.8% of + the driver) — it was **per-step allocation churn**: the resident walk's + `Array2::zeros((s_old+1, h))` rebuild + the cached-K/V `to_owned` + (`__bzero`+`szone_malloc` ≈ 2450 driver samples, idling the worker pool at 48% + vs standard's 80%). **Fixed:** the cache_eligible walk now `append_row`s + `stored` in place into the W8.2 doubling-capacity buffer (mirrors dispatch.rs) + and borrows `hot_kv` into attention via `Cow` instead of copying. Churn + collapsed 2450→150 samples (~16×); **markov/standard ratio 0.81×→0.975×, codec + 0.80×→~1.0×** (same battery state, back-to-back). Parity: resident_identity + (markov+codec, 10-step, buffer doubles) bit-exact + debug K/V assert. turbo's + −39% is **inherent** (must decode compressed K/V to attend; already + parallelized); boundary-kv/unlimited deltas are small (frame-emit/windowing). + Remaining markov/codec ~2.5% = walk-attention serial work (shared walk + frontier — full K/V concat + generic GQA vs standard's in-place handle). +- **In-place hot-K/V on the resident walk 2026-06-14 (closes the concat half).** + The named ~2.5% above was the walk-attention **owned concat**: the resident + walk drove `run_attention_block_decode_step_q4k_direct`, which allocates a + fresh `[ctx+1, kv_dim]` K *and* V every layer every step and copies the whole + prior cache into it before attending — **O(L²)** cache copy over an L-token + generation, vs `standard`'s in-place append handle (O(L)). The split + project→append→attend halves already existed for the dispatch path; the walk + just didn't use them. **Built** `run_attention_block_decode_step_{q4k_direct, + auto}_inplace` (larql-compute `attention/decode.rs`): projects the new row, + appends it into the caller's **doubling-capacity** K/V buffer (grows like + `stored`), and attends over the `[..len+1]` views — no concat. **Wired** + markov_residual + markov_residual_codec resident walks: step-1 still + recompute-seeds `hot_kv`; steps 2+ append in place (the steady state). The + windowed/cold tiers and the flags-off f32 path keep the owned concat + unchanged. Gated `LARQL_MARKOV_INPLACE_KV` (default on; `=0` → owned concat, + the A/B reference + escape hatch). **Parity (bit-exact, 4 gates):** compute- + level `inplace ≡ q4k_direct` concat across a capacity doubling; engine-level + in-place-vs-owned-concat A/B with Q4K-direct **on** for markov *and* codec + (hidden states bit-identical every step); `resident_identity` flags-off still + green (in-place branch's None-fallback = owned concat); 758 kv + 705 compute + + 1220 inference green debug & opt, clippy clean. (The debug `hot_kv ≡ + recompute_kv` assert is gated to the f32 path — the Q4K route's projections + differ from `recompute_kv` by >1e-2 even in f32-act; its oracle is the A/B.) + The two q4k-flag-mutating tests serialise on `Q4K_FLAG_ENV_LOCK` (those flags + read process env on the driver thread — no thread-local). **Perf is + structural** (eliminates the O(L²) per-step copy; the win grows with context — + it's the long-ctx tax behind the C10 1.29× vs short 1.15×). **Measured (26B, + CPU MoE in-process, M3 Max t=8, n=128 warm, `LARQL_MARKOV_INPLACE_KV` A/B, + same engine ordering):** markov 32.5→34.5, codec 32.5→34.6 with in-place on — + and the three untouched controls (standard/unlimited/turbo) drifted *down* + −3/−8/−6% across the A/B (machine warming), so drift-corrected the change is + **~+11–12%**. Final warm matrix (in-place on = production default): codec + **36.5** / standard 36.0 / markov **36.0** / unlimited 33.3 / boundary-kv 36.5 + p50 (mean skewed by frame-emit spikes) / turbo 21.2 (inherent) — **markov/codec + now AT parity with standard** (was 0.81× at the arc's start), the whole cached + cluster **~12% ahead of llama.cpp's 32.1**. Caveat: bench box was at ~58% + charging (not cool-dedicated); ordering + A/B *direction* are robust, absolutes + drifted ~5–8% run-to-run — a cool-box rerun would firm them. (NB: the first + engine in a fresh process eats the 30GB page-in — standard read 21.8 cold, + 34–36 warm; warm runs are the fair matrix.) +- **Propagated the in-place lever to the two remaining walk engines + faithfulness + audit 2026-06-14.** A full cross-engine spec/contract audit (all 9 engines vs + `state-policy.md`'s `(canonical, derivative, contract)` triple) found every + engine faithful, and flagged the two siblings still paying the O(L²) owned + concat the markov/codec in-place change eliminated: + - **boundary_per_layer (was the one NEEDS-FIX)** — carried NO `hot_kv` at all: + it `recompute_kv`'d the whole hot tier *and* rebuilt an owned `[ctx+1]` concat + every layer every step (worse than markov *pre*-W2). Added a `hot_kv` + derivative + the W2-cache + `run_attention_block_decode_step_auto_inplace` + steady state, mirroring its twin codec — only active in the `cache_eligible` + (unbounded, no cold) path, like codec; the windowed/cold path (its primary + purpose) is untouched. `hot_kv` is excluded from `memory_bytes` (droppable + derivative, matches markov). Engine-level in-place-vs-owned-concat A/B (q4k on) + bit-identical; f32-gated debug `hot_kv ≈ recompute_kv` assert. + - **unlimited_context** — its CPU window walk (`extend.rs`) passed the whole + window K/V by value → backend re-concats `[n+1]` per layer per step (its own + doc admitted "O(window²) total"). Added `rs_extend_inplace` (appends into the + window's doubling-capacity buffer, attends over views), wired into + `extend_current` only when eligible (index + toggle + q4k); `replay_window` / + quant / executor / tests keep the owned concat. The engine's existing + `current_window_kv_len` counter already treated the buffers as over-allocated + (the dispatch path did), so `close_window`/`current_kv_bytes` needed no change. + A/B (q4k on) bit-identical; `resident_identity` flags-off still green. + Both reuse the shared `LARQL_MARKOV_INPLACE_KV` toggle + `Q4K_FLAG_ENV_LOCK`. + Also: **apollo footgun guard** — `injection_layer < crystal_layer` silently + no-ops the retrieval-injection (the compressed forward starts at `crystal`); + added a one-time runtime warning in `prepare_injection` (experimental engine, + warn-don't-fail). Doc-drift swept: boundary-kv spec now flags `resume` as + NOT-IMPLEMENTED (emit half only), apollo spec `KvEngine`→`RetrievalEngine`, + `state-policy.md` `fallback_mode` marked retired (per its own §8 resolution). + 760 kv tests green debug + opt, clippy clean. (Same caveat as above: turbo's + −39% is inherent; boundary-kv inherits standard's opts via resident forwarding.) Prefill stays on the f32 BLAS gemm for all engines deliberately (the task #16 prefill falsification: q4k repeated-matvec loses ~20× to AMX at @@ -290,6 +379,161 @@ config-injection refactor. ## Open work +### P0 — codebase-health frontier (audit 2026-06-14) + +A whole-codebase review (engine faithfulness audit + clippy/coverage sweep) +surfaced four "finish-the-started-refactor" items. None is greenfield — the +ROADMAP already points at #7 and the `LayerExecutor` migration. Ordered by +risk/leverage; the first is a live correctness bug. + +1. **Spin pool under heavy oversubscription — INVESTIGATED, pool is SOUND + (2026-06-14).** On a heavily-loaded host (the spin-barrier pool spinning while + the user's work pinned every core), the parallel test suite showed *rare* + intermittent failures across diverse tests — clean with `LARQL_SPIN_POOL=0` + (faster too) and single-threaded, which read as a contention correctness bug. + **It is not.** The pool's synchronization was falsified-as-buggy two ways: + (a) code analysis — the completion barrier's `completed.fetch_add(Release)` / + `load(Acquire)`-on-the-final-count and the `epoch.fetch_add(Release)` / + `load(Acquire)` task publication are a correct release/acquire pair, and the + static strided ownership + the barrier make the dispatcher wait for every + worker before advancing (so `data`/`tramp` can't go stale and cross-dispatch + read-after-write is visible); (b) two new stress guards in `spin_pool.rs` — + disjoint-write under EXTREME oversubscription (2× burner threads + N + concurrent dispatchers + 4000 rounds) and **cross-dispatch read-after-write** + under oversubscription — both stayed correct. Several of the "failures" were + also misreads: `--nocapture` surfaces `#[should_panic]` and + internally-`catch_unwind`'d expected panics (e.g. the empty-haystack + `embed` test) that are NOT failures. **ROOT CAUSE FOUND — it was the env + race, not the pool.** The decode path reads the q4k flags via `getenv` + (`larql_compute::options::fast_path_on`) on every token; several TESTS toggled + those flags with `std::env::set_var`, and concurrent `setenv`/`getenv` + SIGSEGVs libc (and, short of a crash, returns an *inconsistent* flag mid-test + → e.g. the in-place form reads int8-on while the owned-concat form reads + int8-off → a bit-identity test "diverges"). Reproduced deterministically: + `larql-compute`'s `q4k_direct_decode_step_matches_dequant_path` `set_var`s + `LARQL_Q4K_ATTN_INT8` and flaked the sibling `q4k_direct_inplace_is_bit_identical` + test. **Fixed:** all q4k `set_var` test sites in BOTH crates (5 in larql-kv, + 3 in larql-compute) moved to a **thread-local override** + (`set_fast_path_override` / `FastPathGuard` / `Q4kFlagGuard`); no test mutates + process env for these flags anymore. Both suites now pass clean 3× in parallel + (706 compute + 765 kv) under load. The spin pool just amplified the window by + slowing runs. **Remaining:** the generic `with_env*` helpers (moe/options + tests) still `set_var` *other* vars — same class, folded into the env-sprawl + item below. Two spin-pool stress guards (disjoint-write + cross-dispatch + read-after-write under oversubscription) stay as regression pins. + +2. **Env-var sprawl.** ~141 `LARQL_*` literals across 9 crates, **5 partial + registries** with 3 different patterns, no single source. The + `set_var`-in-tests pattern is also a **segfault class** — concurrent + `setenv`/`getenv` SIGSEGVs libc. + + **Phase 1 — decode fast-path flags registry: DONE (2026-06-14).** Folded the + six decode fast-path flags (`LARQL_Q4K_DIRECT_ATTN`/`_ATTN_INT8`/`_LM_HEAD`/ + `_DIRECT_FFN`/`_ASM`, `LARQL_SPIN_POOL`) — four former per-token `getenv`s + + two ad-hoc per-stage `OnceLock`s — into ONE typed `larql_compute::options:: + DecodeOptions`, `from_env()` once and cached (`decode_options()`); the + `*_enabled()` accessors read it (no per-token `getenv`). Tests toggle stages + via a **thread-local override** (`set_fast_path_override` / `FastPathGuard` / + larql-kv `Q4kFlagGuard`), which wins over the cache — so no test mutates + process env for these flags. **All `set_var` sites of these flags migrated** + workspace-wide (5 larql-kv + 3 larql-compute + 1 larql-inference) → the + segfault/flake class is gone for the decode path; compute 706 + kv 765 + + inference 1220 green, stable 3× in parallel, clippy clean. + + **Phase 2a — general override + larql-compute fully migrated: DONE + (2026-06-14).** Generalised the thread-local override to ALL of + `larql_compute::options`' env helpers (`env_flag`/`env_opt_out`/`env_opt_in`/ + `env_usize`/`env_value`/`env_nonempty_value`/`env_not_zero_or_default`) via a + single `ENV_OVERRIDES` map + an `env_effective(name)` choke point; extracted + the `"0"/"true"/…` vocabulary into pure `is_opt_{out,in}_value` parsers + (directly unit-tested). Added `set_env_override(name, Option<&str>)` (value + override; `set_fast_path_override` is now a bool wrapper). Migrated **every + remaining `set_var` test helper in larql-compute** to it — `options`' + `with_env_vars`, `moe/forward`'s `with_env`, `moe/expert`'s + `with_env_in_thread` (sets the override *inside* the spawned thread so the + TLS-cached `Q4K_DIRECT`/`EXPERT_TIMING` reads see it), `dump_config` (now reads + via `env_value`/`env_usize`). **larql-compute src now has ZERO `env::set_var`**; + 707 tests stable 3× in parallel, clippy clean. The crate where the SIGSEGV was + demonstrated is now race-free for env. + + **Phase 2b — our-flag migration extended: largely DONE (2026-06-15).** + Migrated the our-flag `set_var` test sites in larql-inference (chat, + layer_graph/{generate/lm_head,grid/config}, vindex/{walk_ffn,kquant_forward/ + hidden}, plus the already-done dequant) and larql-lql (executor + compile + into_model/into_vindex) to the override (routing raw `std::env::var` reads + through `options::*` where needed). compute 707 + kv 765 + inference 1220 + + lql 726 + server 306 green, workspace builds + clippy clean. + + **Phase 2b — the remaining `set_var` is NOT override-addressable** (the key + finding). ~59 of the ~74 remaining sites are **external/process-global env**: + larql-vindex HF (`HF_HOME`/`HF_TOKEN`/`HF_HUB_CACHE`/`HOME`, read by the HF + client) and larql-models loading. The thread-local override **cannot** reach + them — an external reader uses real `getenv` — so they MUST use `set_var`; + they're already **serialised via a per-module `ENV_LOCK` Mutex**, which is the + correct (and only) mechanism for process-global env. Leave them (the residual + `HOME`-vs-unrelated-`getenv` race is inherent to testing process-global env, + not fixable by us). The small genuinely-remaining our-flag tail is all **cold + diagnostic/config**, low-risk: `residual_diff/{stages,capture}` (dump-dir + + env-save/restore-semantics tests — migrating changes what they test, do with + care), cli `diagnostics/parity` (cross-backend: CPU dump vars are now + override-aware via `DumpConfig`, the Metal dump var is read by larql-metal so + it'd need metal-side routing), server `env_flags` (its own OnceLock-cached + accessors — route through `options::*` or accept read-once), and metal + `options` `DecodeFlags` tests (separate platform-gated binary). The one PRODUCTION + smell — `larql-cli extract_index_cmd.rs` set `LARQL_SUMMARY_FEATURES_PER_EXPERT` + as an env **side-channel** into the streaming gate path — is **FIXED**: threaded + as a `summary_features_per_expert: usize` parameter from CLI → + `build_vindex_streaming` → `StreamingContext` → `down_meta`/`gate_vectors` + stages (the ~26 call-site API ripple the env hack was avoiding). The + `SummaryEnvGuard` test scaffold and its `#[serial]` are gone; the summary-tier + test passes K directly. No `LARQL_SUMMARY_FEATURES_PER_EXPERT` remains anywhere. + + **Phase 2c+ (open, lower-value).** markov cluster: own thread-local override + (`read_markov_env`), per-layer uncached but cheap-when-unset — fold into a + cached struct + unify with `ENV_OVERRIDES`. `LARQL_MOE_TIMING` read in 4 + places; collapse the ~7 timing flags → `LARQL_TIMING=…`, dump flags → + `LARQL_DUMP*` (user-facing → aliases). `SKIP_MOE` vs `LARQL_SKIP_MOE` are + **two different names** (compute `LARQL_SKIP_MOE`, inference `runtime.rs` + unprefixed `SKIP_MOE`) — back-compat alias, not a rename. (NB: `LARQL_W10_HONLY` + is **NOT** dead — live in the W10 mask cascade; an earlier audit mis-flagged + it.) Optional purity: thread `DecodeOptions` through engine signatures to drop + the global. + +3. **Quantization meshing — finish deferred ROADMAP #7 (`FormatRoute`).** + `QuantFormat` exists with helpers (`packed_matrix_bytes`, `packed_block_layout`, + `is_kquant_family`) and a clean dispatch point (`backend.quant_matvec`), but + hand-rolled fast paths bypass them and re-mesh magic numbers. Verified worst + offenders: `larql-compute/src/attention/decode.rs:405` (`(in_dim/256)*144` / + `*210` — duplicates `packed_matrix_bytes(1, in_dim)`); `cpu/ops/moe/expert.rs` + (silently **Q4_K-only**: `matches!(format, Q4_K)` + hardcoded + `Q4_K_BLOCK_BYTES`); `pipeline_layer.rs`'s twin `attn_str_to_format`/ + `ffn_str_to_format` panicking string tables. **Proposal:** *Step 1 (≈1 hr, + zero-risk):* swap the magic numbers for `packed_matrix_bytes()`. *Step 2:* a + `QuantFormat::q8k_matvec_into_fn()` kernel table + `from_registry_tag()` so a + new format is ~3 edits, not ~49 files. + +4. **Engine pluggability — finish the `LayerExecutor` migration.** A new engine + needs 4 required methods but **~8 boilerplate overrides** (the + `*_quant`/`*_resident`/`*_via_executor` cross-product, all of which every + shipped engine overrides) + **6 hand-synced registration sites** in + `lib.rs` (`EngineKind` variant / `from_name` / `display_name` / + `supported_names` / `build_with_profiling` / CLI) — one of them a **duplicate + `KvCacheKind` parser** in `larql-cli/run_cmd.rs`. Shared scaffolding exists + (`engines::layer_ffn_or_moe`, `run_attention_block_decode_step_auto`, + `LocalWalkExecutor`) but each engine still hand-wires its per-layer loop. + **Proposal:** one `decode_step_walk` + a `KvEngineState` policy trait (append/ + read K/V + state-policy hooks) collapses the 8-method cross-product to thin + adapters; a `register_engine!` macro (or `inventory`) removes the 6 sites and + makes `engine_kind_supported_names_covers_every_variant` unnecessary; delete + the duplicate `KvCacheKind` (route `--kv-cache` through `EngineKind::from_name`, + which already accepts `standard`/`none`/`markov-bounded`). `AnyEngine`'s + hand-written sum-type forwarders should be macro/`enum_dispatch`-generated too. + + **Quick wins** (low-risk, do-now candidates): quant Step 1 + (magic-numbers→helper), retire `LARQL_W10_HONLY` + fold `SKIP_MOE`, delete the + `KvCacheKind` duplicate. The larger refactors (DecodeOptions threading, the + engine-walk collapse, the kernel-fn table) are scoped follow-ups. + ### P1 — MoE-aware KV engines (C1) — new 2026-05-28 The KvEngine layer is **dense-only today**: `do_prefill` / `do_decode_step` diff --git a/crates/larql-kv/docs/state-policy.md b/crates/larql-kv/docs/state-policy.md index 1f0c89cd1..862f50fc9 100644 --- a/crates/larql-kv/docs/state-policy.md +++ b/crates/larql-kv/docs/state-policy.md @@ -165,13 +165,21 @@ The trait is a sketch, not a v1 commitment. Names and signatures will move; the *shape* — what an engine has to be able to answer — is the load-bearing claim. +> **Update (resolved in §8 Q1, 2026-05-24):** `fallback_mode` was +> retired. There is no implicit per-engine fallback. An engine that +> can't serve returns a typed `EngineError` (e.g. `RetrievalMiss`); +> composition is explicit via `AnyEngine::{Kv, Retrieval}`, not a +> hidden fall-through. The `fallback_mode` accessor below and the +> "`Apollo` falls through to `StandardEngine`" example are kept for +> the historical record only — neither is implemented. + ```rust pub trait StatePolicy { fn canonical_state(&self) -> CanonicalStateKind; fn derivative_state(&self) -> &[DerivativeKind]; fn correctness_contract(&self) -> CorrectnessContract; fn calibration_requirements(&self) -> CalibrationRequirements; - fn fallback_mode(&self) -> FallbackMode; + fn fallback_mode(&self) -> FallbackMode; // retired — see note above fn memory_accounting(&self) -> MemoryAccounting; fn execution_requirements(&self) -> ExecutionRequirements; } @@ -188,11 +196,12 @@ Each accessor's purpose: - **`calibration_requirements`** — does the engine need a calibration corpus before serving (`BoundaryPerLayerEngine` yes; `StandardEngine` no)? What does it calibrate over? -- **`fallback_mode`** — what does the engine do when its contract - can't hold? (`Apollo` falls through to `StandardEngine` on a - store miss; `MarkovResidualEngine` cannot fall back — its - contract is conditional on architecture, and the architecture - is a static fact.) +- **`fallback_mode`** *(retired — see the note above §4)* — the + original idea was "what does the engine do when its contract can't + hold?" The resolved design has no implicit fallback: `Apollo` + surfaces a store miss as `EngineError::RetrievalMiss` (the caller + decides), and `MarkovResidualEngine` cannot fall back anyway — its + contract is conditional on architecture, a static fact. - **`memory_accounting`** — `hot_bytes()` + `cold_bytes()` split, attributed to canonical vs derivative. Required to surface things like the `UnlimitedContextEngine` window-shadow diff --git a/crates/larql-kv/src/engines/apollo/engine.rs b/crates/larql-kv/src/engines/apollo/engine.rs index 552db8f08..5f62659bd 100644 --- a/crates/larql-kv/src/engines/apollo/engine.rs +++ b/crates/larql-kv/src/engines/apollo/engine.rs @@ -257,6 +257,26 @@ impl ApolloEngine { let boundary = store.boundaries.get(top_window as usize).cloned(); let crystal = store.manifest.crystal_layer; + // Guard the silent-no-op footgun: the compressed forward runs only + // `crystal..num_layers`, so an `injection_layer < crystal` never reaches + // the perturbation layer and the retrieval-injection is silently dropped + // (the engine then degrades to plain boundary replay). Warn once rather + // than failing — Apollo is experimental and the contract is only + // task-level — but make the misconfiguration loud. (Selector example + // `apollo:layer=25` against a default `crystal=30` store trips this.) + if boundary.is_some() && self.config.injection_layer < crystal { + static WARNED: std::sync::Once = std::sync::Once::new(); + WARNED.call_once(|| { + eprintln!( + "[apollo] WARNING: injection_layer ({}) < crystal_layer ({crystal}): the \ + compressed forward starts at crystal_layer, so the injection at \ + injection_layer is SKIPPED (no retrieval-injection occurs). Set \ + injection_layer >= crystal_layer.", + self.config.injection_layer, + ); + }); + } + Some((context, Array1::from(delta), boundary, crystal)) } diff --git a/crates/larql-kv/src/engines/boundary_kv/engine.rs b/crates/larql-kv/src/engines/boundary_kv/engine.rs index f8cc2e3f6..9e4c7bbc9 100644 --- a/crates/larql-kv/src/engines/boundary_kv/engine.rs +++ b/crates/larql-kv/src/engines/boundary_kv/engine.rs @@ -788,12 +788,74 @@ mod tests { assert!(chain.is_empty()); } - // ── Q4K paths ── + // ── Resident + Q4K-quant forwarding paths ── // - // `prefill_quant` / `decode_step_quant` delegate to `StandardEngine`, whose - // CPU fallback uses `ensure_attn_tensors_dequantised`. The synthetic - // `make_test_vindex` fixture doesn't carry Q4K attn slices, so the CPU - // fallback panics — these paths are Metal-only end-to-end. The - // delegation surface itself is a one-line passthrough; the underlying - // behaviour is covered by `standard.rs`'s Q4K tests. + // `prefill_resident`/`decode_step_resident`/`prefill_quant`/`decode_step_quant` + // forward to the inner `StandardEngine` (threading `index` → Q4K-direct) and + // emit a boundary frame identically to the plain path. The `make_test_q4k_*` + // fixtures carry Q4K attn slices, so the CPU dequant fallback runs (the same + // fixtures `resident_identity_tests` and `unlimited_context`'s quant tests + // use). chunk_tokens=2 + a 2-token prefill lands on a boundary, exercising + // the frame-emit branch on each. + + #[test] + fn prefill_and_decode_resident_forward_and_emit() { + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let ffn = NullFfn; + let mut eng = BoundaryKvEngine::new(config("seq", 2)); + let h = eng + .prefill_resident(&weights, &ffn, &index, &[0u32, 1]) + .expect("prefill_resident"); + assert!(h.iter().all(|v| v.is_finite())); + assert_eq!(eng.abs_position(), 2); + assert_eq!(eng.archive().total_frames(), Some(1), "landed on boundary → frame"); + let h2 = eng + .decode_step_resident(&weights, &ffn, &index, 2) + .expect("decode_step_resident"); + assert!(h2.iter().all(|v| v.is_finite())); + assert_eq!(eng.abs_position(), 3); + } + + #[test] + fn prefill_and_decode_quant_forward_and_emit() { + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let backend = larql_compute::cpu_backend(); + let ffn = NullFfn; + let mut eng = BoundaryKvEngine::new(config("seq", 2)); + let h = eng + .prefill_quant(&mut weights, &ffn, &index, &[0u32, 1], &*backend) + .expect("prefill_quant"); + assert!(h.iter().all(|v| v.is_finite())); + assert_eq!(eng.archive().total_frames(), Some(1)); + let h2 = eng + .decode_step_quant(&mut weights, &ffn, &index, 2, &*backend) + .expect("decode_step_quant"); + assert!(h2.iter().all(|v| v.is_finite())); + assert_eq!(eng.abs_position(), 3); + } + + #[test] + fn resident_and_quant_reject_empty_prompt() { + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + let mut weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let backend = larql_compute::cpu_backend(); + let ffn = NullFfn; + let mut eng = BoundaryKvEngine::new(config("s", 4)); + assert!(matches!( + eng.prefill_resident(&weights, &ffn, &index, &[]), + Err(EngineError::EmptyPrompt) + )); + assert!(matches!( + eng.prefill_quant(&mut weights, &ffn, &index, &[], &*backend), + Err(EngineError::EmptyPrompt) + )); + } } diff --git a/crates/larql-kv/src/engines/boundary_per_layer/cold_tier.rs b/crates/larql-kv/src/engines/boundary_per_layer/cold_tier.rs index fc9aa30e9..65f7e984b 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/cold_tier.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/cold_tier.rs @@ -156,6 +156,7 @@ mod tests { .collect(), cold_encoded: None, cold_kv: None, + hot_kv: None, cold_abs_start: 0, next_position: 0, max_window: None, diff --git a/crates/larql-kv/src/engines/boundary_per_layer/dispatch.rs b/crates/larql-kv/src/engines/boundary_per_layer/dispatch.rs index 1c1550713..81c2ee822 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/dispatch.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/dispatch.rs @@ -81,6 +81,7 @@ pub(super) fn try_prefill_via_dispatch( stored, cold_encoded: None, cold_kv: None, + hot_kv: None, cold_abs_start: 0, next_position: prompt_len, max_window: window_size, diff --git a/crates/larql-kv/src/engines/boundary_per_layer/engine.rs b/crates/larql-kv/src/engines/boundary_per_layer/engine.rs index 4ae411f77..76aeeec3b 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/engine.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/engine.rs @@ -860,6 +860,62 @@ mod tests { assert_eq!(h2.shape(), &[1, weights.hidden_size]); } + #[test] + fn decode_step_resident_threads_index_through_walk() { + // decode_step_resident forwards to decode_step_impl with Some(index), + // threading the vindex into walk::run_decode (the Q4K-direct route / + // in-place hot-K/V path). Covers the resident decode method body. + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let policy = BoundaryLayerPolicy::bf16_uniform("test", weights.num_layers); + let store = store_with_record(&policy); + let mut engine = + BoundaryPerLayerEngine::new(None, policy, weights.num_layers, &store).unwrap(); + let ffn = NullFfn; + engine.prefill(&weights, &ffn, &[0u32, 1]).expect("prefill"); + let h = engine + .decode_step_resident(&weights, &ffn, &index, 2) + .expect("decode_step_resident"); + assert_eq!(h.shape(), &[1, weights.hidden_size]); + assert!(h.iter().all(|v| v.is_finite())); + // A second resident step exercises the in-place steady state. + let h2 = engine + .decode_step_resident(&weights, &ffn, &index, 3) + .expect("decode_step_resident #2"); + assert!(h2.iter().all(|v| v.is_finite())); + } + + #[test] + fn test_trait_helper_methods_are_exercised() { + // The CountingFfn / FusedStubExecutor scaffolding implements trait + // methods that the behavioural tests don't all call; invoke them + // directly so the coverage reflects them (same pattern as boundary_kv's + // `failing_archive_load_chain_returns_empty_ok`). + use larql_inference::ffn::FfnBackend; + use larql_inference::layer_executor::LayerExecutor; + let ffn = CountingFfn { + calls: std::sync::atomic::AtomicUsize::new(0), + hidden: 4, + }; + let x = ndarray::Array2::::zeros((1, 4)); + let (out, act) = ffn.forward_with_activation(0, &x); + assert_eq!(out.shape(), &[1, 4]); + assert_eq!(act.shape(), &[1, 4]); + assert_eq!(ffn.name(), "counting"); + let exec = FusedStubExecutor { + backend: larql_compute::CpuBackend, + }; + assert_eq!(exec.name(), "fused-stub"); + // `backend()` returns the dyn backend — calling it covers the method. + let _b = exec.backend(); + assert!(matches!( + exec.dispatch_kind(), + larql_inference::layer_executor::ExecutorDispatchKind::Fused + )); + } + // ─── EngineError surface coverage ──────────────────────────────────────── // // The Option → Result migration added typed-error guards at every diff --git a/crates/larql-kv/src/engines/boundary_per_layer/executor.rs b/crates/larql-kv/src/engines/boundary_per_layer/executor.rs index 12f6d9c8f..9cda5e1a3 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/executor.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/executor.rs @@ -52,6 +52,7 @@ pub(super) fn run_prefill( stored, cold_encoded: None, cold_kv: None, + hot_kv: None, cold_abs_start: 0, next_position: seq_len, max_window: window_size, diff --git a/crates/larql-kv/src/engines/boundary_per_layer/store.rs b/crates/larql-kv/src/engines/boundary_per_layer/store.rs index b1d281e9c..2c7b25b9e 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/store.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/store.rs @@ -61,6 +61,13 @@ pub struct RsStorePerLayer { pub stored: Vec>, pub cold_encoded: Option>, pub cold_kv: Option>, + /// W2 hot-K/V cache (twin of `markov_residual`). When there is no cold + /// tier (unbounded window), this holds the FULL K/V and the decode walk + /// appends to it IN PLACE instead of `recompute_kv`-ing the whole hot tier + /// every step. It is a **droppable derivative** of the canonical hot + /// residuals (`stored`): set it to `None` and the next step rebuilds it. + /// Excluded from `memory_bytes` for that reason (matches markov). + pub hot_kv: Option>, pub cold_abs_start: usize, pub next_position: usize, pub max_window: Option, @@ -179,6 +186,7 @@ mod tests { stored, cold_encoded: None, cold_kv: None, + hot_kv: None, cold_abs_start: 0, next_position: seq_len, max_window: None, diff --git a/crates/larql-kv/src/engines/boundary_per_layer/walk.rs b/crates/larql-kv/src/engines/boundary_per_layer/walk.rs index 66445e066..9ec29bedd 100644 --- a/crates/larql-kv/src/engines/boundary_per_layer/walk.rs +++ b/crates/larql-kv/src/engines/boundary_per_layer/walk.rs @@ -53,6 +53,7 @@ pub(super) fn run_prefill( stored, cold_encoded: None, cold_kv: None, + hot_kv: None, cold_abs_start: 0, next_position: seq_len, max_window: window_size, @@ -101,57 +102,151 @@ pub(super) fn run_decode( let mut h_new = embed_tokens_pub(weights, &[token_id]); let mut new_stored: Vec> = Vec::with_capacity(num_layers); + // W2 hot-K/V cache (twin of markov_residual_codec). When unbounded with no + // cold tier, `hot_kv` holds the full K/V and the steady state (step 2+) + // appends the new row IN PLACE + attends over views — instead of + // `recompute_kv`-ing the whole hot tier AND rebuilding an owned `[ctx+1]` + // concat every layer every step (the pre-W2 cost this engine carried). The + // canonical state is still `stored` (the per-layer residuals); `hot_kv` is a + // droppable derivative. The in-place / owned-concat choice is gated by the + // shared `LARQL_MARKOV_INPLACE_KV` toggle (default on); both are + // bit-identical (engine-level A/B test). Windowed/cold configs (the engine's + // primary purpose) are NOT cache_eligible and keep the recompute path. + let cache_eligible = + rs.max_window.is_none() && rs.cold_encoded.is_none() && rs.cold_kv.is_none(); + let mut step_new_kv: Vec = Vec::with_capacity(num_layers); + let mut hot_kv_store = rs.hot_kv.take(); + let had_hot_kv = hot_kv_store.is_some(); + let idx_kv: Option<&dyn larql_compute::KvIndex> = + index.map(|v| v as &dyn larql_compute::KvIndex); + let inplace_enabled = crate::engines::markov_residual::compute::markov_inplace_kv_enabled(); + for layer in 0..num_layers { - let h_hot = &rs.stored[layer]; - let s_hot = h_hot.shape()[0]; + // `stored` is push_row-grown, so `shape()[0]` IS the logical hot length. + let s_hot = rs.stored[layer].shape()[0]; let hot_abs_start = abs_position.saturating_sub(s_hot); - let (k_full, v_full) = if let Some(cold_kv) = &rs.cold_kv { - let (k_cold, v_cold) = &cold_kv[layer]; - let (k_hot, v_hot) = recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; - let c = k_cold.shape()[0]; - let kv_dim = k_cold.shape()[1]; - let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); - k_combined.slice_mut(s![..c, ..]).assign(k_cold); - k_combined.slice_mut(s![c.., ..]).assign(&k_hot); - let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); - v_combined.slice_mut(s![..c, ..]).assign(v_cold); - v_combined.slice_mut(s![c.., ..]).assign(&v_hot); - (k_combined, v_combined) + new_stored.push(h_new.clone()); + + let h_post_attn = if cache_eligible && had_hot_kv { + // STEADY STATE (step 2+): append in place into the doubling-capacity + // `hot_kv` buffer and attend over the `[..s_hot+1]` views. + let bufs = hot_kv_store.as_mut().expect("had_hot_kv"); + #[cfg(debug_assertions)] + { + // f32-path parity gate (the Q4K route's projections differ from + // f32 `recompute_kv` by >1e-2; it has its own A/B oracle). + if !larql_compute::options::q4k_direct_attn_enabled() { + let (k_buf, v_buf) = &bufs[layer]; + if let Some((rk, rv)) = + recompute_kv(weights, &rs.stored[layer], layer, hot_abs_start, backend, None) + { + let kd = k_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rk.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + let vd = v_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rv.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + debug_assert!(kd < 1e-2, "boundary-per-layer hot_kv K diverged: {kd}"); + debug_assert!(vd < 1e-2, "boundary-per-layer hot_kv V diverged: {vd}"); + } + } + } + let (k_buf, v_buf) = &mut bufs[layer]; + let inplace = if inplace_enabled { + larql_inference::attention::run_attention_block_decode_step_auto_inplace( + weights, &h_new, layer, k_buf, v_buf, s_hot, abs_position, Some(backend), idx_kv, + ) + } else { + None + }; + match inplace { + Some(h) => h, + None => { + // Q4K-direct off (flags-off parity) or no attn bytes: owned + // concat over the buffer view, then replace. + let prior: SharedKV = ( + k_buf.slice(s![..s_hot, ..]).to_owned(), + v_buf.slice(s![..s_hot, ..]).to_owned(), + ); + let (h, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&prior), + abs_position, + Some(backend), + idx_kv, + )?; + *k_buf = new_kv.0; + *v_buf = new_kv.1; + h + } + } } else { - let (h_full, full_abs_start) = if let Some(cold_layers) = &rs.cold_encoded { - let enc = &cold_layers[layer]; - if enc.n_positions > 0 { - let decoded = enc.decode(); - let hidden = h_hot.shape()[1]; - let mut combined = Array2::::zeros((decoded.shape()[0] + s_hot, hidden)); - combined - .slice_mut(s![..decoded.shape()[0], ..]) - .assign(&decoded); - combined - .slice_mut(s![decoded.shape()[0].., ..]) - .assign(h_hot); - (combined, rs.cold_abs_start) + // FIRST STEP (cache None → seed) or windowed/cold tier: recompute the + // prior K/V, let attention concat the new row, collect it (the + // cache_eligible first step seeds `hot_kv`). + let h_hot = &rs.stored[layer]; + let (k_full, v_full) = if let Some(cold_kv) = &rs.cold_kv { + let (k_cold, v_cold) = &cold_kv[layer]; + let (k_hot, v_hot) = + recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; + let c = k_cold.shape()[0]; + let kv_dim = k_cold.shape()[1]; + let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); + k_combined.slice_mut(s![..c, ..]).assign(k_cold); + k_combined.slice_mut(s![c.., ..]).assign(&k_hot); + let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); + v_combined.slice_mut(s![..c, ..]).assign(v_cold); + v_combined.slice_mut(s![c.., ..]).assign(&v_hot); + (k_combined, v_combined) + } else { + let (h_full, full_abs_start) = if let Some(cold_layers) = &rs.cold_encoded { + let enc = &cold_layers[layer]; + if enc.n_positions > 0 { + let decoded = enc.decode(); + let hidden = h_hot.shape()[1]; + let mut combined = + Array2::::zeros((decoded.shape()[0] + s_hot, hidden)); + combined + .slice_mut(s![..decoded.shape()[0], ..]) + .assign(&decoded); + combined + .slice_mut(s![decoded.shape()[0].., ..]) + .assign(h_hot); + (combined, rs.cold_abs_start) + } else { + (h_hot.clone(), hot_abs_start) + } } else { (h_hot.clone(), hot_abs_start) - } - } else { - (h_hot.clone(), hot_abs_start) + }; + recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)? }; - recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)? - }; - new_stored.push(h_new.clone()); - - let (h_post_attn, _new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( - weights, - &h_new, - layer, - Some(&(k_full, v_full)), - abs_position, - Some(backend), - index.map(|v| v as &dyn larql_compute::KvIndex), - )?; + let (h_post_attn, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&(k_full, v_full)), + abs_position, + Some(backend), + idx_kv, + )?; + if cache_eligible { + step_new_kv.push(new_kv); + } + h_post_attn + }; let h_out = crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, ffn, Some(ffn)); h_new = h_out; @@ -165,6 +260,17 @@ pub(super) fn run_decode( .expect("push_row shape mismatch"); } rs.next_position = abs_position + 1; + // Step 2+ mutated `hot_kv_store` in place; the first step seeds it. Cleared + // for windowed/cold configs (the recompute path stays canonical there). + rs.hot_kv = if cache_eligible { + if had_hot_kv { + hot_kv_store + } else { + Some(step_new_kv) + } + } else { + None + }; let mut overflow_per_layer: Vec> = Vec::with_capacity(num_layers); for layer in 0..num_layers { @@ -293,6 +399,50 @@ mod tests { assert_eq!(hidden.shape(), &[1, weights.hidden_size]); } + /// Flags-ON parity gate for the W2 in-place hot-K/V fast path: an A/B of the + /// in-place steady state vs the owned-concat reference, both with Q4K-direct + /// attention live. Twin of the markov/codec tests — the two paths must + /// produce bit-identical hidden states every step. Serialised on + /// `Q4K_FLAG_ENV_LOCK`; the path is selected via the shared + /// `LARQL_MARKOV_INPLACE_KV` thread-local override. + #[test] + fn run_decode_inplace_matches_owned_concat_flags_on() { + use crate::engines::markov_residual::compute::set_markov_env_override; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + + let _q4k = crate::engines::Q4kFlagGuard::set(&[ + (larql_compute::options::ENV_Q4K_DIRECT_ATTN, true), + (larql_compute::options::ENV_Q4K_ATTN_INT8, false), + ]); + + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let backend = CpuBackend; + let ffn = NullFfn; + let policy = BoundaryLayerPolicy::bf16_uniform("test", weights.num_layers); + + let run = |inplace: bool| -> (Vec>, usize) { + set_markov_env_override("LARQL_MARKOV_INPLACE_KV", Some(if inplace { "1" } else { "0" })); + let (_, mut rs) = + run_prefill(&weights, &ffn, &backend, &policy, None, &[0u32, 1, 2]).unwrap(); + let mut hiddens = Vec::new(); + for tok in 3u32..=12 { + let (h, rs2) = + run_decode(&weights, &ffn, &backend, &policy, rs, tok, Some(&index)).unwrap(); + assert!(h.iter().all(|v| v.is_finite())); + hiddens.push(h.iter().map(|v| v.to_bits()).collect()); + rs = rs2; + } + (hiddens, rs.next_position) + }; + + let (a, a_pos) = run(true); + let (b, b_pos) = run(false); + assert_eq!(a_pos, 13, "3 prompt + 10 decode"); + assert_eq!(a_pos, b_pos); + assert_eq!(a, b, "boundary-per-layer in-place vs owned-concat hidden states diverged"); + } + #[test] fn run_decode_promotes_to_cold_tier_on_overflow() { // Prefill 3 with window=2 → 1 in cold. Decode 1 → 2 in cold. diff --git a/crates/larql-kv/src/engines/markov_residual/compute.rs b/crates/larql-kv/src/engines/markov_residual/compute.rs index 8f9ac524e..cf1c5658c 100644 --- a/crates/larql-kv/src/engines/markov_residual/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual/compute.rs @@ -6,6 +6,7 @@ use ndarray::{s, Array2, ArrayBase, ArrayView1, Data, Ix2}; use std::cell::RefCell; use std::cmp::Ordering; +use super::helpers::append_row; use super::store::RsStore; use crate::profiler::EngineProfiler; use larql_inference::attention::SharedKV; @@ -65,14 +66,14 @@ fn read_markov_env(key: &'static str) -> Option { } #[cfg(test)] -fn set_markov_env_override(key: &'static str, value: Option<&str>) { +pub(crate) fn set_markov_env_override(key: &'static str, value: Option<&str>) { MARKOV_ENV_OVERRIDE.with(|o| { o.borrow_mut().insert(key, value.map(|s| s.to_string())); }); } #[cfg(test)] -fn clear_markov_env_overrides() { +pub(crate) fn clear_markov_env_overrides() { MARKOV_ENV_OVERRIDE.with(|o| o.borrow_mut().clear()); } @@ -212,114 +213,196 @@ fn rs_decode_step_inner( rs.max_window.is_none() && rs.cold_residuals.is_none() && rs.cold_kv.is_none(); let mut step_new_kv: Vec = Vec::with_capacity(num_layers); + // Move the hot K/V cache out so the cache_eligible steady state (step 2+) + // can append into it IN PLACE — borrowing `hot_kv_store` mutably while + // reading `rs.stored` (a disjoint field) immutably. `had_hot_kv` marks the + // seeded-cache case (step 2+); the first decode step has `hot_kv = None` + // and seeds it from `step_new_kv` below. + let mut hot_kv_store = rs.hot_kv; + let had_hot_kv = hot_kv_store.is_some(); + let idx_kv: Option<&dyn larql_compute::KvIndex> = + index.map(|v| v as &dyn larql_compute::KvIndex); for layer in 0..num_layers { - let h_hot = &rs.stored[layer]; - let s_hot = h_hot.shape()[0]; + // `stored` is a doubling-capacity buffer (W8.2): the logical row count + // is `hot_len`, not `shape()[0]` (see RsStore docs). + let s_hot = rs.hot_len; let hot_abs_start = abs_position.saturating_sub(s_hot); - let (k_full, v_full) = if let Some(hot_kv) = rs.hot_kv.as_ref().filter(|_| cache_eligible) { - // W2 cached path: no cold tier, so `hot_kv` IS the full K/V — - // read it instead of the per-step `recompute_kv` matmul. Debug - // builds assert it matches a fresh recompute (the parity gate). - let (k_buf, v_buf) = &hot_kv[layer]; - let k = k_buf.slice(s![..s_hot, ..]).to_owned(); - let v = v_buf.slice(s![..s_hot, ..]).to_owned(); + new_stored.push(h_new.clone()); + + let h_post_attn = if cache_eligible && had_hot_kv { + // STEADY STATE (step 2+): `hot_kv` holds the full prior K/V in a + // doubling-capacity buffer. Append this token's projected+RoPE'd row + // IN PLACE and attend over the `[..s_hot+1]` views — no per-step + // O(ctx) owned concat (the previous `_auto` path rebuilt the whole + // K/V every layer every step, i.e. O(L²) copy over a generation; this + // is O(L), matching `standard`'s in-place handle). The residual + // `stored` stays the canonical re-derivable state; the K/V is a + // droppable derivative. Debug builds assert the cached prior matches + // a fresh recompute (the parity gate) before appending. + let bufs = hot_kv_store.as_mut().expect("had_hot_kv"); #[cfg(debug_assertions)] - if let Some((rk, rv)) = - recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None) { - let kd = k - .iter() - .zip(rk.iter()) - .map(|(a, b)| (a - b).abs()) - .fold(0.0f32, f32::max); - let vd = v - .iter() - .zip(rv.iter()) - .map(|(a, b)| (a - b).abs()) - .fold(0.0f32, f32::max); - debug_assert!(kd < 1e-2, "markov hot_kv K cache diverged: {kd}"); - debug_assert!(vd < 1e-2, "markov hot_kv V cache diverged: {vd}"); + // Parity gate for the f32 path: the cached prior K/V must match a + // fresh f32 `recompute_kv`. Only meaningful when attention is NOT + // on the Q4K-direct route — that route's projections differ from + // `recompute_kv` by more than the 1e-2 bound even in f32-activation + // (different kernels/byte sources), so it has its own oracles: the + // compute-level bit-identity test (`run_..._inplace` ≡ the concat + // form) and the engine-level in-place-vs-owned-concat A/B test. + let q4k_on = larql_compute::options::q4k_direct_attn_enabled(); + if !q4k_on { + let (k_buf, v_buf) = &bufs[layer]; + let h_logical = rs.stored[layer].slice(s![..s_hot, ..]).to_owned(); + if let Some((rk, rv)) = + recompute_kv(weights, &h_logical, layer, hot_abs_start, backend, None) + { + let kd = k_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rk.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + let vd = v_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rv.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + debug_assert!(kd < 1e-2, "markov hot_kv K cache diverged: {kd}"); + debug_assert!(vd < 1e-2, "markov hot_kv V cache diverged: {vd}"); + } + } } - (k, v) - } else if let Some(cold_kv) = &rs.cold_kv { - let (k_cold_buf, v_cold_buf) = &cold_kv[layer]; - // 2026-05-19 audit fix: slice to cold_len, not shape()[0]. - // cold_kv now uses doubling-capacity (see RsStore::append_cold_overflow). - let c = rs.cold_len; - let k_cold = k_cold_buf.slice(s![..c, ..]); - let v_cold = v_cold_buf.slice(s![..c, ..]); - let t_hot = if profiler.is_some() { + let (k_buf, v_buf) = &mut bufs[layer]; + let t_attn = if profiler.is_some() { Some(Instant::now()) } else { None }; - let (k_hot, v_hot) = recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; - if let Some(t) = t_hot { - recompute_hot_us += t.elapsed().as_secs_f64() * 1e6; + let inplace = if markov_inplace_kv_enabled() { + larql_inference::attention::run_attention_block_decode_step_auto_inplace( + weights, &h_new, layer, k_buf, v_buf, s_hot, abs_position, Some(backend), idx_kv, + ) + } else { + None + }; + let h = match inplace { + Some(h) => h, + None => { + // Q4K-direct disabled (the flags-off parity baseline) or no + // attn bytes for this layer: fall back to the owned concat + // over the buffer's logical view, then replace the buffer with + // the exact-length result. Bit-identical to the legacy borrow + // path; only the non-default flags-off case pays this copy. + let prior: SharedKV = ( + k_buf.slice(s![..s_hot, ..]).to_owned(), + v_buf.slice(s![..s_hot, ..]).to_owned(), + ); + let (h, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&prior), + abs_position, + Some(backend), + idx_kv, + )?; + *k_buf = new_kv.0; + *v_buf = new_kv.1; + h + } + }; + if let Some(t) = t_attn { + attention_us += t.elapsed().as_secs_f64() * 1e6; } - let kv_dim = k_cold_buf.shape()[1]; - let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); - k_combined.slice_mut(s![..c, ..]).assign(&k_cold); - k_combined.slice_mut(s![c.., ..]).assign(&k_hot); - let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); - v_combined.slice_mut(s![..c, ..]).assign(&v_cold); - v_combined.slice_mut(s![c.., ..]).assign(&v_hot); - (k_combined, v_combined) + h } else { - let (h_full, full_abs_start) = if let Some(cold) = &rs.cold_residuals { + // FIRST STEP (cache None → seed) or windowed/cold tier: recompute the + // prior K/V, let attention concat the new row, and (when + // cache_eligible) collect the result to seed `hot_kv`. + let h_hot = &rs.stored[layer]; + let kv_arg: SharedKV = if let Some(cold_kv) = &rs.cold_kv { + let (k_cold_buf, v_cold_buf) = &cold_kv[layer]; // 2026-05-19 audit fix: slice to cold_len, not shape()[0]. - let s_cold = rs.cold_len; - if s_cold > 0 { - let h_cold = cold[layer].slice(s![..s_cold, ..]); - let hidden = h_hot.shape()[1]; - let mut combined = Array2::::zeros((s_cold + s_hot, hidden)); - combined.slice_mut(s![..s_cold, ..]).assign(&h_cold); - combined.slice_mut(s![s_cold.., ..]).assign(h_hot); - (combined, rs.cold_abs_start) + // cold_kv now uses doubling-capacity (see append_cold_overflow). + let c = rs.cold_len; + let k_cold = k_cold_buf.slice(s![..c, ..]); + let v_cold = v_cold_buf.slice(s![..c, ..]); + let t_hot = if profiler.is_some() { + Some(Instant::now()) } else { - (h_hot.clone(), hot_abs_start) + None + }; + let (k_hot, v_hot) = + recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; + if let Some(t) = t_hot { + recompute_hot_us += t.elapsed().as_secs_f64() * 1e6; } + let kv_dim = k_cold_buf.shape()[1]; + let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); + k_combined.slice_mut(s![..c, ..]).assign(&k_cold); + k_combined.slice_mut(s![c.., ..]).assign(&k_hot); + let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); + v_combined.slice_mut(s![..c, ..]).assign(&v_cold); + v_combined.slice_mut(s![c.., ..]).assign(&v_hot); + (k_combined, v_combined) } else { - (h_hot.clone(), hot_abs_start) + let (h_full, full_abs_start) = if let Some(cold) = &rs.cold_residuals { + // 2026-05-19 audit fix: slice to cold_len, not shape()[0]. + let s_cold = rs.cold_len; + if s_cold > 0 { + let h_cold = cold[layer].slice(s![..s_cold, ..]); + let hidden = h_hot.shape()[1]; + let mut combined = Array2::::zeros((s_cold + s_hot, hidden)); + combined.slice_mut(s![..s_cold, ..]).assign(&h_cold); + combined.slice_mut(s![s_cold.., ..]).assign(h_hot); + (combined, rs.cold_abs_start) + } else { + (h_hot.clone(), hot_abs_start) + } + } else { + (h_hot.clone(), hot_abs_start) + }; + let t_cold = if profiler.is_some() { + Some(Instant::now()) + } else { + None + }; + let (k, v) = recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)?; + if let Some(t) = t_cold { + recompute_cold_us += t.elapsed().as_secs_f64() * 1e6; + } + (k, v) }; - let t_cold = if profiler.is_some() { + + let t_attn = if profiler.is_some() { Some(Instant::now()) } else { None }; - let (k, v) = recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)?; - if let Some(t) = t_cold { - recompute_cold_us += t.elapsed().as_secs_f64() * 1e6; + let (h_post_attn, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&kv_arg), + abs_position, + Some(backend), + idx_kv, + )?; + if let Some(t) = t_attn { + attention_us += t.elapsed().as_secs_f64() * 1e6; } - (k, v) - }; - - new_stored.push(h_new.clone()); - - let t_attn = if profiler.is_some() { - Some(Instant::now()) - } else { - None + // The attention step already projected the new token's K/V (RoPE'd) — + // free; collect it to seed `hot_kv` for the in-place steady state. + if cache_eligible { + step_new_kv.push(new_kv); + } + h_post_attn }; - let (h_post_attn, new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( - weights, - &h_new, - layer, - Some(&(k_full, v_full)), - abs_position, - Some(backend), - index.map(|v| v as &dyn larql_compute::KvIndex), - )?; - // The attention step already projected the new token's K/V (RoPE'd) — - // free; append it to the cache for next step instead of re-deriving. - if cache_eligible { - step_new_kv.push(new_kv); - } - if let Some(t) = t_attn { - attention_us += t.elapsed().as_secs_f64() * 1e6; - } let t_ffn = if profiler.is_some() { Some(Instant::now()) @@ -346,18 +429,35 @@ fn rs_decode_step_inner( prof.decode_total.record(t_step); } - let mut updated_stored: Vec> = Vec::with_capacity(num_layers); - for (stored, new_row) in rs.stored.iter().zip(new_stored.iter()) { - let s_old = stored.shape()[0]; - let hidden_dim = stored.shape()[1]; - let mut combined = Array2::::zeros((s_old + 1, hidden_dim)); - combined.slice_mut(s![..s_old, ..]).assign(stored); - combined.slice_mut(s![s_old.., ..]).assign(new_row); - updated_stored.push(combined); - } + // W8.2: in the cache_eligible path `stored` is a doubling-capacity buffer + // (no window → never clips), so append the new row in place rather than + // allocating + bzeroing a fresh `[s_old+1, hidden]` array every step. That + // rebuild was the resident walk's dominant per-step malloc — `__bzero` + + // `szone_malloc` were ~32% of the driver's serial work, idling the worker + // pool (see helpers::append_row, mirrors the dispatch path). The + // windowed/cold path keeps the rebuild: it clips and is not cache_eligible. + let (updated_stored, new_hot_len) = if cache_eligible { + let mut buf = rs.stored; + for (layer, new_row) in new_stored.iter().enumerate() { + append_row(&mut buf[layer], new_row, rs.hot_len); + } + (buf, rs.hot_len + 1) + } else { + let mut rebuilt: Vec> = Vec::with_capacity(num_layers); + for (stored, new_row) in rs.stored.iter().zip(new_stored.iter()) { + let s_old = stored.shape()[0]; + let hidden_dim = stored.shape()[1]; + let mut combined = Array2::::zeros((s_old + 1, hidden_dim)); + combined.slice_mut(s![..s_old, ..]).assign(stored); + combined.slice_mut(s![s_old.., ..]).assign(new_row); + rebuilt.push(combined); + } + let len = rebuilt.first().map_or(0, |s| s.shape()[0]); + (rebuilt, len) + }; let mut updated_rs = RsStore { - hot_len: updated_stored.first().map_or(0, |s| s.shape()[0]), + hot_len: new_hot_len, stored: updated_stored, cold_residuals: rs.cold_residuals, cold_kv: rs.cold_kv, @@ -365,8 +465,14 @@ fn rs_decode_step_inner( // Cache the full K/V (returned by attention) for next step when there's // no cold tier; else None (the cold/windowed path recomputes). The clip // loop below clips `hot_kv` in lockstep with `stored` when a window is set. + // Step 2+ mutated `hot_kv_store` in place (the in-place fast path); the + // first step seeds it from the freshly-collected `step_new_kv`. hot_kv: if cache_eligible { - Some(step_new_kv) + if had_hot_kv { + hot_kv_store + } else { + Some(step_new_kv) + } } else { None }, @@ -641,6 +747,22 @@ fn markov_kv_force_f32_projection() -> bool { .is_some_and(|v| matches!(v.trim(), "1" | "true" | "TRUE" | "yes" | "on")) } +/// In-place hot-K/V append on the resident walk's steady state (default ON). +/// When enabled, step 2+ appends the new K/V row into the doubling-capacity +/// `hot_kv` buffer and attends over views — O(L) total cache copy vs the +/// owned-concat path's O(L²). Set `LARQL_MARKOV_INPLACE_KV=0` to fall back to +/// the owned concat: the reference the parity test A/Bs against, and a +/// production escape hatch. Both paths are bit-identical (proven by +/// `run_..._inplace ≡ run_..._q4k_direct` at the compute level and the +/// engine-level A/B test). Shared with the codec twin (same mechanism, one +/// toggle for both residual engines). +pub(crate) fn markov_inplace_kv_enabled() -> bool { + !matches!( + read_markov_env("LARQL_MARKOV_INPLACE_KV").as_deref(), + Some("0") | Some("false") | Some("off") | Some("no") + ) +} + fn markov_walk_kv_diag_layer(layer: usize) -> bool { // `is_none_or` is MSRV 1.82; project pins MSRV 1.80. Equivalent // semantics: env-var absent → true (diag applies to all layers), @@ -1095,6 +1217,59 @@ mod tests { assert!(h2.iter().all(|v| v.is_finite())); } + /// Flags-ON parity gate for the in-place hot-K/V fast path: an A/B of the + /// in-place steady state against the owned-concat reference, both with the + /// Q4K-direct attention path live (int8 OFF so the per-step debug cache + /// assert's 1e-2 bound holds against the q4k `recompute_kv` oracle). The two + /// paths must produce **bit-identical** hidden states at every step — the + /// in-place append only changes the cache *representation* (doubling buffer + + /// views vs fresh owned concat), never the data attended. Runs past a + /// capacity doubling so the grow path is exercised. The `LARQL_MARKOV_INPLACE_KV` + /// override (thread-local; no process-env race) selects the path. + #[test] + fn rs_decode_step_inplace_matches_owned_concat_flags_on() { + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + + // Drive the Q4K flags via the thread-local override (no process-env + // mutation → no segfault race with parallel decode tests). Q4K-direct on, + // int8 off (so the debug cache assert's f32 oracle stays valid). + let _q4k = crate::engines::Q4kFlagGuard::set(&[ + (larql_compute::options::ENV_Q4K_DIRECT_ATTN, true), + (larql_compute::options::ENV_Q4K_ATTN_INT8, false), + ]); + + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + + // Run a 10-step decode and collect per-step hidden states. + let run = |inplace: bool| -> (Vec>, usize, usize) { + set_markov_env_override("LARQL_MARKOV_INPLACE_KV", Some(if inplace { "1" } else { "0" })); + let prefill = rs_prefill(&weights, &[0u32, 1, 2], None, &CpuBackend, None); + let mut rs = prefill.store; + let mut hiddens = Vec::new(); + for tok in 3u32..=12 { + let (h, rs2) = rs_decode_step(&weights, tok, rs, &CpuBackend, None, Some(&index)) + .expect("decode"); + assert!(h.iter().all(|v| v.is_finite())); + hiddens.push(h.iter().map(|v| v.to_bits()).collect()); + rs = rs2; + } + let cap = rs.hot_kv.as_ref().expect("hot_kv populated")[0].0.shape()[0]; + (hiddens, rs.hot_len, cap) + }; + + let (a_hiddens, a_len, a_cap) = run(true); + let (b_hiddens, b_len, _b_cap) = run(false); + + assert_eq!(a_len, 13, "3 prompt + 10 decode rows"); + assert_eq!(a_len, b_len, "hot_len must agree across paths"); + assert!(a_cap >= a_len, "in-place buffer cap {a_cap} < len {a_len} (no doubling?)"); + assert_eq!( + a_hiddens, b_hiddens, + "in-place and owned-concat hidden states diverged (q4k-direct on)" + ); + } + #[test] fn kv_memory_bytes_for_seq_scales_linearly() { let weights = make_test_weights(); diff --git a/crates/larql-kv/src/engines/markov_residual/store.rs b/crates/larql-kv/src/engines/markov_residual/store.rs index 2b5902099..f42edff35 100644 --- a/crates/larql-kv/src/engines/markov_residual/store.rs +++ b/crates/larql-kv/src/engines/markov_residual/store.rs @@ -14,18 +14,21 @@ use ndarray::{s, Array2}; /// next step recomputes from `stored`. Bit-equivalent to the /// non-cached path under fixed RoPE positions. /// -/// Invariants when `hot_kv = Some(kv)`: +/// Invariants when `hot_kv = Some(kv)` (logical length is `hot_len`, not +/// `shape()[0]` — both `stored` and `hot_kv` may be over-allocated): /// - `kv.len() == stored.len()` (one entry per layer) -/// - `kv[l].0.shape()[0] == stored[l].shape()[0]` for every `l` -/// - row `i` of `kv[l]` corresponds to row `i` of `stored[l]` at -/// RoPE position `next_position - stored[l].shape()[0] + i` +/// - `kv[l].0` and `stored[l]` agree on their first `hot_len` rows +/// - row `i` (`i < hot_len`) of `kv[l]` corresponds to row `i` of +/// `stored[l]` at RoPE position `next_position - hot_len + i` pub struct RsStore { /// Per-layer residual stream. **Possibly over-allocated**: with W8.2, /// the dispatch hot path pre-allocates `stored[l]` to a doubling /// capacity and only the first `hot_len` rows are logically valid. /// Readers that want the row count **must** use [`Self::hot_len`], - /// not `stored[l].shape()[0]`. Non-dispatch paths (CPU walk, - /// rs_extend_from_checkpoint_*) still write narrow arrays where + /// not `stored[l].shape()[0]`. The resident walk's cache_eligible + /// path (`rs_decode_step_inner`) now appends in place into the same + /// doubling buffer; only the windowed/cold rebuild and + /// `rs_extend_from_checkpoint_*` still write narrow arrays where /// `hot_len == shape()[0]`. pub stored: Vec>, /// Per-layer cold residuals. **Doubling-capacity** as of 2026-05-19 diff --git a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs index 74af7b6c4..1d939f0e7 100644 --- a/crates/larql-kv/src/engines/markov_residual_codec/compute.rs +++ b/crates/larql-kv/src/engines/markov_residual_codec/compute.rs @@ -16,6 +16,7 @@ use ndarray::{s, Array2}; use crate::engines::markov_residual::recompute_kv; use crate::engines::markov_residual_codec::codec::ColdResidualCodec; +use crate::engines::markov_residual_codec::helpers::append_row; use crate::engines::markov_residual_codec::store::{EncodedColdLayer, RsStoreCodec}; pub struct RsPrefillResultCodec { @@ -118,112 +119,191 @@ pub fn rs_decode_step_codec( rs.max_window.is_none() && rs.cold_encoded.is_none() && rs.cold_kv.is_none(); let mut step_new_kv: Vec = Vec::with_capacity(num_layers); + // Move the hot K/V cache out so the steady state (step 2+) can append in + // place — twin of `markov_residual::compute::rs_decode_step_inner`. + let mut hot_kv_store = rs.hot_kv; + let had_hot_kv = hot_kv_store.is_some(); + let idx_kv: Option<&dyn larql_compute::KvIndex> = + index.map(|v| v as &dyn larql_compute::KvIndex); + let inplace_enabled = + crate::engines::markov_residual::compute::markov_inplace_kv_enabled(); for layer in 0..num_layers { - let h_hot = &rs.stored[layer]; - let s_hot = h_hot.shape()[0]; + // `stored` is a doubling-capacity buffer (W8.2): logical row count is + // `hot_len`, not `shape()[0]`. + let s_hot = rs.hot_len; let hot_abs_start = abs_position.saturating_sub(s_hot); - let (k_full, v_full) = if let Some(hot_kv) = rs.hot_kv.as_ref().filter(|_| cache_eligible) { - // W2 cached path (no cold tier): hot_kv IS the full K/V — read it, - // skip recompute. Debug builds assert it matches a fresh recompute. - let (k_buf, v_buf) = &hot_kv[layer]; - let k = k_buf.slice(s![..s_hot, ..]).to_owned(); - let v = v_buf.slice(s![..s_hot, ..]).to_owned(); + new_stored.push(h_new.clone()); + + let h_post_attn = if cache_eligible && had_hot_kv { + // STEADY STATE (step 2+): append this token's projected+RoPE'd K/V row + // IN PLACE into the doubling-capacity `hot_kv` buffer and attend over + // the `[..s_hot+1]` views — no per-step O(ctx) owned concat (O(L) + // total cache copy vs O(L²)). See the markov twin for the rationale. + let bufs = hot_kv_store.as_mut().expect("had_hot_kv"); #[cfg(debug_assertions)] - if let Some((rk, rv)) = - recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None) { - let kd = k - .iter() - .zip(rk.iter()) - .map(|(a, b)| (a - b).abs()) - .fold(0.0f32, f32::max); - let vd = v - .iter() - .zip(rv.iter()) - .map(|(a, b)| (a - b).abs()) - .fold(0.0f32, f32::max); - debug_assert!(kd < 1e-2, "codec hot_kv K cache diverged: {kd}"); - debug_assert!(vd < 1e-2, "codec hot_kv V cache diverged: {vd}"); + // f32-path parity gate only (the Q4K-direct route has its own + // oracles: the compute-level bit-identity test + the engine A/B). + let q4k_on = larql_compute::options::q4k_direct_attn_enabled(); + if !q4k_on { + let (k_buf, v_buf) = &bufs[layer]; + let h_logical = rs.stored[layer].slice(s![..s_hot, ..]).to_owned(); + if let Some((rk, rv)) = + recompute_kv(weights, &h_logical, layer, hot_abs_start, backend, None) + { + let kd = k_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rk.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + let vd = v_buf + .slice(s![..s_hot, ..]) + .iter() + .zip(rv.iter()) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + debug_assert!(kd < 1e-2, "codec hot_kv K cache diverged: {kd}"); + debug_assert!(vd < 1e-2, "codec hot_kv V cache diverged: {vd}"); + } + } + } + let (k_buf, v_buf) = &mut bufs[layer]; + let inplace = if inplace_enabled { + larql_inference::attention::run_attention_block_decode_step_auto_inplace( + weights, &h_new, layer, k_buf, v_buf, s_hot, abs_position, Some(backend), idx_kv, + ) + } else { + None + }; + match inplace { + Some(h) => h, + None => { + // Q4K-direct off (flags-off parity) or no attn bytes: owned + // concat over the buffer view, then replace. Bit-identical to + // the legacy borrow path. + let prior: SharedKV = ( + k_buf.slice(s![..s_hot, ..]).to_owned(), + v_buf.slice(s![..s_hot, ..]).to_owned(), + ); + let (h, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&prior), + abs_position, + Some(backend), + idx_kv, + )?; + *k_buf = new_kv.0; + *v_buf = new_kv.1; + h + } } - (k, v) - } else if let Some(cold_kv) = &rs.cold_kv { - let (k_cold, v_cold) = &cold_kv[layer]; - let (k_hot, v_hot) = recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; - let c = k_cold.shape()[0]; - let kv_dim = k_cold.shape()[1]; - let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); - k_combined.slice_mut(s![..c, ..]).assign(k_cold); - k_combined.slice_mut(s![c.., ..]).assign(&k_hot); - let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); - v_combined.slice_mut(s![..c, ..]).assign(v_cold); - v_combined.slice_mut(s![c.., ..]).assign(&v_hot); - (k_combined, v_combined) } else { - let (h_full, full_abs_start) = if let Some(cold_layers) = &rs.cold_encoded { - let enc = &cold_layers[layer]; - if enc.n_positions > 0 { - let decoded = enc.decode(rs.codec); - let hidden = h_hot.shape()[1]; - let mut combined = Array2::::zeros((decoded.shape()[0] + s_hot, hidden)); - combined - .slice_mut(s![..decoded.shape()[0], ..]) - .assign(&decoded); - combined - .slice_mut(s![decoded.shape()[0].., ..]) - .assign(h_hot); - (combined, rs.cold_abs_start) + // FIRST STEP (cache None → seed) or windowed/cold tier. + let h_hot = &rs.stored[layer]; + let kv_arg: SharedKV = if let Some(cold_kv) = &rs.cold_kv { + let (k_cold, v_cold) = &cold_kv[layer]; + let (k_hot, v_hot) = + recompute_kv(weights, h_hot, layer, hot_abs_start, backend, None)?; + let c = k_cold.shape()[0]; + let kv_dim = k_cold.shape()[1]; + let mut k_combined = Array2::::zeros((c + s_hot, kv_dim)); + k_combined.slice_mut(s![..c, ..]).assign(k_cold); + k_combined.slice_mut(s![c.., ..]).assign(&k_hot); + let mut v_combined = Array2::::zeros((c + s_hot, kv_dim)); + v_combined.slice_mut(s![..c, ..]).assign(v_cold); + v_combined.slice_mut(s![c.., ..]).assign(&v_hot); + (k_combined, v_combined) + } else { + let (h_full, full_abs_start) = if let Some(cold_layers) = &rs.cold_encoded { + let enc = &cold_layers[layer]; + if enc.n_positions > 0 { + let decoded = enc.decode(rs.codec); + let hidden = h_hot.shape()[1]; + let mut combined = + Array2::::zeros((decoded.shape()[0] + s_hot, hidden)); + combined + .slice_mut(s![..decoded.shape()[0], ..]) + .assign(&decoded); + combined + .slice_mut(s![decoded.shape()[0].., ..]) + .assign(h_hot); + (combined, rs.cold_abs_start) + } else { + (h_hot.clone(), hot_abs_start) + } } else { (h_hot.clone(), hot_abs_start) - } - } else { - (h_hot.clone(), hot_abs_start) + }; + let (k, v) = recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)?; + (k, v) }; - let (k, v) = recompute_kv(weights, &h_full, layer, full_abs_start, backend, None)?; - (k, v) - }; - - new_stored.push(h_new.clone()); - let (h_post_attn, new_kv) = larql_inference::attention::run_attention_block_decode_step_auto( - weights, - &h_new, - layer, - Some(&(k_full, v_full)), - abs_position, - Some(backend), - index.map(|v| v as &dyn larql_compute::KvIndex), - )?; - if cache_eligible { - step_new_kv.push(new_kv); - } + let (h_post_attn, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h_new, + layer, + Some(&kv_arg), + abs_position, + Some(backend), + idx_kv, + )?; + if cache_eligible { + step_new_kv.push(new_kv); + } + h_post_attn + }; let bffn = BackendFfn { weights, backend }; let h_out = crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, &bffn, moe_ffn); h_new = h_out; } - // Append the new row to each layer's hot tier. - let mut updated_stored: Vec> = Vec::with_capacity(num_layers); - for (stored, new_row) in rs.stored.iter().zip(new_stored.iter()) { - let s_old = stored.shape()[0]; - let hidden_dim = stored.shape()[1]; - let mut combined = Array2::::zeros((s_old + 1, hidden_dim)); - combined.slice_mut(s![..s_old, ..]).assign(stored); - combined.slice_mut(s![s_old.., ..]).assign(new_row); - updated_stored.push(combined); - } + // Append the new row to each layer's hot tier. W8.2: in the cache_eligible + // path `stored` is a doubling-capacity buffer (no window → never clips), so + // append in place rather than allocating + bzeroing a fresh `[s_old+1, + // hidden]` array every step (the resident walk's dominant per-step malloc; + // see helpers::append_row). The windowed/cold path keeps the rebuild. + let (updated_stored, new_hot_len) = if cache_eligible { + let mut buf = rs.stored; + for (layer, new_row) in new_stored.iter().enumerate() { + append_row(&mut buf[layer], new_row, rs.hot_len); + } + (buf, rs.hot_len + 1) + } else { + let mut rebuilt: Vec> = Vec::with_capacity(num_layers); + for (stored, new_row) in rs.stored.iter().zip(new_stored.iter()) { + let s_old = stored.shape()[0]; + let hidden_dim = stored.shape()[1]; + let mut combined = Array2::::zeros((s_old + 1, hidden_dim)); + combined.slice_mut(s![..s_old, ..]).assign(stored); + combined.slice_mut(s![s_old.., ..]).assign(new_row); + rebuilt.push(combined); + } + let len = rebuilt.first().map_or(0, |s| s.shape()[0]); + (rebuilt, len) + }; let mut updated_rs = RsStoreCodec { - hot_len: updated_stored.first().map_or(0, |s| s.shape()[0]), + hot_len: new_hot_len, stored: updated_stored, cold_encoded: rs.cold_encoded, cold_kv: rs.cold_kv, // Cache the full K/V for next step when there's no cold tier; else None // (cold/windowed recomputes). clip_layer_overflow clips hot_kv in step. + // Step 2+ mutated `hot_kv_store` in place; the first step seeds it. hot_kv: if cache_eligible { - Some(step_new_kv) + if had_hot_kv { + hot_kv_store + } else { + Some(step_new_kv) + } } else { None }, @@ -408,4 +488,51 @@ mod tests { assert!((orig - got).abs() < 0.1); } } + + /// Flags-ON parity gate for the codec engine's in-place hot-K/V fast path: + /// an A/B of the in-place steady state against the owned-concat reference, + /// both with Q4K-direct attention live. Twin of the markov test — the two + /// paths must produce bit-identical hidden states at every step. Twin of the + /// markov test; q4k flags driven via the thread-local override (no env race), + /// in-place path selected through the shared `LARQL_MARKOV_INPLACE_KV` + /// thread-local override. + #[test] + fn rs_decode_step_codec_inplace_matches_owned_concat_flags_on() { + use crate::engines::markov_residual::compute::set_markov_env_override; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + + let _q4k = crate::engines::Q4kFlagGuard::set(&[ + (larql_compute::options::ENV_Q4K_DIRECT_ATTN, true), + (larql_compute::options::ENV_Q4K_ATTN_INT8, false), + ]); + + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + + let run = |inplace: bool| -> (Vec>, usize) { + set_markov_env_override("LARQL_MARKOV_INPLACE_KV", Some(if inplace { "1" } else { "0" })); + let prefill = + rs_prefill_codec(&weights, &[0u32, 1, 2], None, ColdResidualCodec::Bf16, &CpuBackend, None); + let mut rs = prefill.store; + let mut hiddens = Vec::new(); + for tok in 3u32..=12 { + let (h, rs2) = + rs_decode_step_codec(&weights, tok, rs, &CpuBackend, None, Some(&index)) + .expect("decode"); + assert!(h.iter().all(|v| v.is_finite())); + hiddens.push(h.iter().map(|v| v.to_bits()).collect()); + rs = rs2; + } + (hiddens, rs.hot_len) + }; + + let (a_hiddens, a_len) = run(true); + let (b_hiddens, b_len) = run(false); + assert_eq!(a_len, 13, "3 prompt + 10 decode rows"); + assert_eq!(a_len, b_len); + assert_eq!( + a_hiddens, b_hiddens, + "codec in-place and owned-concat hidden states diverged (q4k-direct on)" + ); + } } diff --git a/crates/larql-kv/src/engines/mod.rs b/crates/larql-kv/src/engines/mod.rs index ca412def0..a455d033b 100644 --- a/crates/larql-kv/src/engines/mod.rs +++ b/crates/larql-kv/src/engines/mod.rs @@ -202,6 +202,36 @@ mod layer_ffn_or_moe_tests { } } +/// Test-only RAII helper to drive the Q4K decode fast-path flags via +/// `larql_compute::options`' **thread-local** override (NOT `std::env::set_var`, +/// which is thread-unsafe vs the concurrent `getenv` every parallel decode test +/// does — that race SIGSEGVs libc). Each entry sets one `ENV_*` flag on the +/// current thread; everything is cleared on drop. Because the override is +/// per-thread, these tests need no cross-test serialization. +#[cfg(test)] +pub(crate) struct Q4kFlagGuard; + +#[cfg(test)] +impl Q4kFlagGuard { + /// Override the given `(ENV_* name, on)` flags on this thread for the + /// guard's lifetime. Also clears the `LARQL_MARKOV_*` thread-local overrides + /// on drop (the A/B tests set `LARQL_MARKOV_INPLACE_KV` there). + pub(crate) fn set(flags: &[(&'static str, bool)]) -> Self { + for &(name, on) in flags { + larql_compute::options::set_fast_path_override(name, on); + } + Q4kFlagGuard + } +} + +#[cfg(test)] +impl Drop for Q4kFlagGuard { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + crate::engines::markov_residual::compute::clear_markov_env_overrides(); + } +} + #[cfg(test)] mod resident_identity_tests { //! Structural-gap pin (2026-06-13): every pluggable engine overrides @@ -220,32 +250,14 @@ mod resident_identity_tests { // The Q4K decode fast path is on by default now; this pin asserts the // flags-OFF f32 identity (resident must equal plain when the resident // route is *not* taking the Q4K-direct branch), so disable the stages - // that change the resident hidden state. They read the env uncached, so - // this takes effect; restored on drop even if an assertion panics. - struct FlagsOff(Vec<(&'static str, Option)>); - impl Drop for FlagsOff { - fn drop(&mut self) { - for (k, v) in self.0.drain(..) { - match v { - Some(v) => std::env::set_var(k, v), - None => std::env::remove_var(k), - } - } - } - } - let _flags_off = { - let names = [ - "LARQL_Q4K_DIRECT_ATTN", - "LARQL_Q4K_ATTN_INT8", - "LARQL_Q4K_DIRECT_FFN", - "LARQL_Q4K_LM_HEAD", - ]; - let saved: Vec<_> = names.iter().map(|n| (*n, std::env::var_os(n))).collect(); - for n in names { - std::env::set_var(n, "0"); - } - FlagsOff(saved) - }; + // that change the resident hidden state — via the thread-local override + // (no `set_var`, so no segfault race with parallel decode tests). + let _flags_off = super::Q4kFlagGuard::set(&[ + (larql_compute::options::ENV_Q4K_DIRECT_ATTN, false), + (larql_compute::options::ENV_Q4K_ATTN_INT8, false), + (larql_compute::options::ENV_Q4K_DIRECT_FFN, false), + (larql_compute::options::ENV_Q4K_LM_HEAD, false), + ]); // Concrete specs (parameterised kinds need real params). Excluded: // apollo (bench-only, full re-forward by design; resident default = diff --git a/crates/larql-kv/src/engines/unlimited_context/engine.rs b/crates/larql-kv/src/engines/unlimited_context/engine.rs index dd1a00d52..3b1e8ce01 100644 --- a/crates/larql-kv/src/engines/unlimited_context/engine.rs +++ b/crates/larql-kv/src/engines/unlimited_context/engine.rs @@ -23,6 +23,7 @@ use serde::Serialize; use super::checkpoint_store::CheckpointStore; use super::extend::{ empty_prior, rs_extend_from_checkpoint_backend, rs_extend_from_checkpoint_quant, + rs_extend_inplace, }; use super::token_archive::TokenArchive; use crate::engines::markov_residual::ensure_attn_tensors_dequantised; @@ -331,35 +332,69 @@ impl UnlimitedContextEngine { return Some(()); } - let prior = if self.current_window_tokens.is_empty() { + // `prior_len` is the prior's LOGICAL row count — the window-KV counter + // mid-window, the checkpoint's row count at a window start, or 0. + let (mut prior, prior_len) = if self.current_window_tokens.is_empty() { if self.current_window_id > 0 && self.checkpoints.contains(self.current_window_id - 1) { let (ckpt, _) = self.checkpoints.load(self.current_window_id - 1)?; - ckpt + let len = ckpt.first().map_or(0, |(k, _)| k.shape()[0]); + (ckpt, len) } else { - empty_prior(weights) + (empty_prior(weights), 0) } } else { - self.current_window_kv - .take() - .unwrap_or_else(|| empty_prior(weights)) + ( + self.current_window_kv + .take() + .unwrap_or_else(|| empty_prior(weights)), + self.current_window_kv_len, + ) }; let abs_start = self.abs_offset + self.current_window_tokens.len(); - let out = rs_extend_from_checkpoint_backend( - weights, - chunk, - prior, - abs_start, - self.backend.as_ref(), - moe_ffn, - index, - )?; - self.last_hidden = Some(out.last_hidden); - // CPU walk path: see comment on extend_current_quant — narrow - // arrays, counter == shape[0]. - self.current_window_kv_len = out.kv_cache.first().map_or(0, |(k, _)| k.shape()[0]); - self.current_window_kv = Some(out.kv_cache); + // In-place fast path: append the chunk's K/V rows into the window's + // doubling-capacity buffers instead of rebuilding an owned `[len+1]` + // concat every layer every step (O(window) → O(1) per step). Gated to + // the Q4K-direct route (with the shared `LARQL_MARKOV_INPLACE_KV` + // toggle); flags-off keeps the unchanged owned-concat path bit-for-bit, + // which is what `resident_identity_tests` pins. The window's existing + // `current_window_kv_len` counter already treats the buffers as + // over-allocated (the dispatch path does too), so close_window / + // current_kv_bytes need no change. + let use_inplace = index.is_some() + && crate::engines::markov_residual::compute::markov_inplace_kv_enabled() + && larql_compute::options::q4k_direct_attn_enabled(); + + if use_inplace { + let last = rs_extend_inplace( + weights, + chunk, + &mut prior, + prior_len, + abs_start, + self.backend.as_ref(), + moe_ffn, + index, + )?; + self.last_hidden = Some(last); + self.current_window_kv_len = prior_len + chunk.len(); + self.current_window_kv = Some(prior); + } else { + let out = rs_extend_from_checkpoint_backend( + weights, + chunk, + prior, + abs_start, + self.backend.as_ref(), + moe_ffn, + index, + )?; + self.last_hidden = Some(out.last_hidden); + // CPU walk path: narrow arrays, counter == shape[0]. + self.current_window_kv_len = out.kv_cache.first().map_or(0, |(k, _)| k.shape()[0]); + self.current_window_kv = Some(out.kv_cache); + } self.current_window_tokens.extend_from_slice(chunk); Some(()) } @@ -977,6 +1012,49 @@ mod tests { assert_eq!(h.shape(), &[1, weights.hidden_size]); } + /// Flags-ON parity gate for the in-place window K/V fast path: an A/B of the + /// in-place steady state vs the owned-concat reference, both driving the + /// resident decode path (`extend_current`) with Q4K-direct attention live. + /// The two must produce bit-identical hidden states every step — the + /// in-place append only changes the window-buffer representation (doubling + + /// views vs fresh owned concat). 13 tokens < window(512), so it stays in one + /// window (no close). Serialised on `Q4K_FLAG_ENV_LOCK`; path selected via + /// the shared `LARQL_MARKOV_INPLACE_KV` thread-local override. + #[test] + fn decode_inplace_matches_owned_concat_flags_on() { + use crate::engines::markov_residual::compute::set_markov_env_override; + use larql_inference::ffn::NullFfn; + use larql_inference::test_utils::{make_test_q4k_vindex, make_test_q4k_weights}; + + let _q4k = crate::engines::Q4kFlagGuard::set(&[ + (larql_compute::options::ENV_Q4K_DIRECT_ATTN, true), + (larql_compute::options::ENV_Q4K_ATTN_INT8, false), + ]); + + let weights = make_test_q4k_weights(); + let index = make_test_q4k_vindex(&weights); + let ffn = NullFfn; + + let run = |inplace: bool| -> Vec> { + set_markov_env_override("LARQL_MARKOV_INPLACE_KV", Some(if inplace { "1" } else { "0" })); + let mut engine = UnlimitedContextEngine::new(512); + engine.prefill(&weights, &ffn, &[0u32, 1, 2]).expect("prefill"); + let mut hiddens = Vec::new(); + for tok in 3u32..=12 { + let h = engine + .decode_step_resident(&weights, &ffn, &index, tok) + .expect("decode_step_resident"); + assert!(h.iter().all(|v| v.is_finite())); + hiddens.push(h.iter().map(|v| v.to_bits()).collect()); + } + hiddens + }; + + let a = run(true); + let b = run(false); + assert_eq!(a, b, "unlimited in-place vs owned-concat hidden states diverged (q4k on)"); + } + #[test] fn decode_step_quant_without_prefill_returns_none() { use larql_inference::ffn::NullFfn; diff --git a/crates/larql-kv/src/engines/unlimited_context/extend.rs b/crates/larql-kv/src/engines/unlimited_context/extend.rs index 4059db8e6..3b819763d 100644 --- a/crates/larql-kv/src/engines/unlimited_context/extend.rs +++ b/crates/larql-kv/src/engines/unlimited_context/extend.rs @@ -118,6 +118,87 @@ pub fn rs_extend_from_checkpoint_backend( }) } +/// In-place multi-token extend for the decode hot path (the W8.2/in-place twin +/// of [`rs_extend_from_checkpoint_backend`]). Appends each token's K/V row into +/// the caller's **doubling-capacity** `kv_cache` buffers — starting at logical +/// row `prior_len` — and attends over the growing `[..len]` views, instead of +/// rebuilding an owned `[len+1, kv_dim]` concat every layer every step (the +/// O(window²) cost the owned-concat form pays). On return each buffer holds +/// `prior_len + token_ids.len()` logical rows (track that count in the caller — +/// the buffers are over-allocated, so `shape()[0]` is capacity, not length). +/// +/// Per-layer fallback: if a layer has no Q4K attn bytes the in-place projection +/// returns `None`, so this writes the owned-concat result back into the buffer +/// for that layer — buffers stay consistent. Same numerics as the owned-concat +/// form (engine-level A/B test), only the cache representation differs. +#[allow(clippy::too_many_arguments)] +pub fn rs_extend_inplace( + weights: &ModelWeights, + token_ids: &[u32], + kv_cache: &mut [SharedKV], + prior_len: usize, + abs_start: usize, + backend: &dyn ComputeBackend, + moe_ffn: Option<&dyn larql_inference::ffn::FfnBackend>, + index: Option<&larql_vindex::VectorIndex>, +) -> Option> { + let num_layers = weights.num_layers; + if token_ids.is_empty() || kv_cache.len() != num_layers { + return None; + } + let idx_kv: Option<&dyn larql_compute::KvIndex> = + index.map(|v| v as &dyn larql_compute::KvIndex); + let mut last_hidden: Option> = None; + + for (i, &token_id) in token_ids.iter().enumerate() { + let abs_position = abs_start + i; + // Logical row count of every buffer at the start of this token: the + // prior window length plus the tokens already appended this call. + let len = prior_len + i; + let mut h = embed_tokens_pub(weights, &[token_id]); + + for (layer, (k_buf, v_buf)) in kv_cache.iter_mut().enumerate() { + let h_post_attn = + match larql_inference::attention::run_attention_block_decode_step_auto_inplace( + weights, &h, layer, k_buf, v_buf, len, abs_position, Some(backend), idx_kv, + ) { + Some(hp) => hp, + None => { + // Q4K-direct off / no attn bytes: owned concat over the + // `[..len]` view, then replace the buffer. + let prior: SharedKV = ( + k_buf.slice(ndarray::s![..len, ..]).to_owned(), + v_buf.slice(ndarray::s![..len, ..]).to_owned(), + ); + let prior_ref = if len > 0 { Some(&prior) } else { None }; + let (hp, new_kv) = + larql_inference::attention::run_attention_block_decode_step_auto( + weights, + &h, + layer, + prior_ref, + abs_position, + Some(backend), + idx_kv, + )?; + *k_buf = new_kv.0; + *v_buf = new_kv.1; + hp + } + }; + + let bffn = BackendFfn { weights, backend }; + let h_out = + crate::engines::layer_ffn_or_moe(weights, &h_post_attn, layer, &bffn, moe_ffn); + h = h_out; + } + + last_hidden = Some(h); + } + + last_hidden +} + /// CPU Q4K variant of [`rs_extend_from_checkpoint_backend`]. /// /// Uses `WalkFfn` (reads Q4K bytes directly from `index`) for FFN instead of diff --git a/crates/larql-lql/src/executor/lifecycle/compile/into_model.rs b/crates/larql-lql/src/executor/lifecycle/compile/into_model.rs index 42b732a2e..e1330807e 100644 --- a/crates/larql-lql/src/executor/lifecycle/compile/into_model.rs +++ b/crates/larql-lql/src/executor/lifecycle/compile/into_model.rs @@ -66,9 +66,11 @@ impl Session { let mut out = Vec::new(); out.extend(collected.warnings); // MEMIT is opt-in via `LARQL_MEMIT_ENABLE=1`; see the matching - // block in the COMPILE INTO VINDEX path for the rationale. - let memit_enabled = std::env::var("LARQL_MEMIT_ENABLE") - .ok() + // block in the COMPILE INTO VINDEX path for the rationale. Read + // through the override-aware `larql_compute::options` helper so tests + // can toggle it without `std::env::set_var` (which races concurrent + // `getenv` on the decode path and SIGSEGVs libc). + let memit_enabled = larql_compute::options::env_value("LARQL_MEMIT_ENABLE") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); if !memit_facts.is_empty() && memit_enabled { diff --git a/crates/larql-lql/src/executor/lifecycle/compile/into_vindex.rs b/crates/larql-lql/src/executor/lifecycle/compile/into_vindex.rs index d20c0dca7..95ba3f1c0 100644 --- a/crates/larql-lql/src/executor/lifecycle/compile/into_vindex.rs +++ b/crates/larql-lql/src/executor/lifecycle/compile/into_vindex.rs @@ -144,8 +144,10 @@ impl Session { // form solve cannot separate installs from natives. Pure // compose column-replace is the default COMPILE path and is // what produces the working Gemma installs. - let memit_enabled = std::env::var("LARQL_MEMIT_ENABLE") - .ok() + // Read through the override-aware `larql_compute::options` helper so + // tests can toggle this without `std::env::set_var` (which races + // concurrent `getenv` on the decode path and SIGSEGVs libc). + let memit_enabled = larql_compute::options::env_value("LARQL_MEMIT_ENABLE") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); let memit_results = if !memit_facts.is_empty() && config.has_model_weights && memit_enabled diff --git a/crates/larql-lql/src/executor/tests.rs b/crates/larql-lql/src/executor/tests.rs index a7758824e..ef882c260 100644 --- a/crates/larql-lql/src/executor/tests.rs +++ b/crates/larql-lql/src/executor/tests.rs @@ -11,6 +11,27 @@ fn lql_path(path: impl AsRef) -> String { path.as_ref().display().to_string().replace('\\', "\\\\") } +/// RAII guard that forces `LARQL_MEMIT_ENABLE=1` on the current thread via the +/// `larql_compute::options` thread-local override and clears it on drop. +/// Replaces `std::env::set_var`/`remove_var`, which mutate process-global env +/// and race the concurrent `getenv` other parallel tests drive on the decode +/// path → SIGSEGV in libc. The COMPILE paths read this flag through +/// `larql_compute::options::env_value`, so the override is honoured. +struct MemitEnableGuard; + +impl MemitEnableGuard { + fn on() -> Self { + larql_compute::options::set_env_override("LARQL_MEMIT_ENABLE", Some("1")); + MemitEnableGuard + } +} + +impl Drop for MemitEnableGuard { + fn drop(&mut self) { + larql_compute::options::clear_fast_path_overrides(); + } +} + // ── Session state: no backend ── #[test] @@ -4665,9 +4686,13 @@ fn compile_into_vindex_with_memit_enabled_runs_solver_path() { )) .unwrap(); - std::env::set_var("LARQL_MEMIT_ENABLE", "1"); + // Toggle MEMIT on for the duration of this call via the thread-local + // override (NOT `std::env::set_var`, which races concurrent `getenv` on + // the decode path that other parallel tests drive → SIGSEGV). Production + // reads this through `larql_compute::options::env_value`, so the override + // wins. Cleared on guard drop. + let _memit = MemitEnableGuard::on(); let result = session.execute(&stmt); - std::env::remove_var("LARQL_MEMIT_ENABLE"); // Random-init weights mean the MEMIT solve might not produce // a useful delta but the code path runs end-to-end. @@ -4747,10 +4772,11 @@ fn compile_into_model_with_memit_enabled_runs() { )) .unwrap(); - // Toggle MEMIT on for the duration of this call. - std::env::set_var("LARQL_MEMIT_ENABLE", "1"); + // Toggle MEMIT on for the duration of this call via the thread-local + // override (NOT `std::env::set_var`, which races concurrent `getenv` on + // the decode path → SIGSEGV). See `compile_into_vindex_with_memit_enabled_runs_solver_path`. + let _memit = MemitEnableGuard::on(); let result = session.execute(&stmt); - std::env::remove_var("LARQL_MEMIT_ENABLE"); // The MEMIT solve may or may not converge cleanly with random-init // weights — accept either outcome but exercise the path. diff --git a/crates/larql-server/tests/common/synthetic_q4k_vindex.rs b/crates/larql-server/tests/common/synthetic_q4k_vindex.rs index 05a7d3b06..91e2333c3 100644 --- a/crates/larql-server/tests/common/synthetic_q4k_vindex.rs +++ b/crates/larql-server/tests/common/synthetic_q4k_vindex.rs @@ -174,6 +174,7 @@ pub fn build() -> SyntheticQ4kVindex { "test/synthetic-q4k", &vindex_dir, 4, // down_top_k + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Inference, larql_vindex::StorageDtype::F32, larql_vindex::QuantFormat::Q4K, diff --git a/crates/larql-vindex/benches/extract_throughput.rs b/crates/larql-vindex/benches/extract_throughput.rs index 387c349a0..a0dadf233 100644 --- a/crates/larql-vindex/benches/extract_throughput.rs +++ b/crates/larql-vindex/benches/extract_throughput.rs @@ -152,6 +152,7 @@ fn bench_extract_throughput(c: &mut Criterion) { "bench/extract", &out_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::All, StorageDtype::F32, q, @@ -179,6 +180,7 @@ fn bench_extract_throughput(c: &mut Criterion) { "bench/extract", &ref_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::All, StorageDtype::F32, QuantFormat::Q4K, @@ -225,6 +227,7 @@ fn bench_extract_throughput(c: &mut Criterion) { "bench/extract", &resume_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::All, StorageDtype::F32, QuantFormat::Q4K, diff --git a/crates/larql-vindex/benches/q4k_vs_f32.rs b/crates/larql-vindex/benches/q4k_vs_f32.rs index 7b7d236fd..94229f1f1 100644 --- a/crates/larql-vindex/benches/q4k_vs_f32.rs +++ b/crates/larql-vindex/benches/q4k_vs_f32.rs @@ -169,6 +169,7 @@ fn bench_q4k_vs_f32(c: &mut Criterion) { "bench/q4k_vs_f32", &f32_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::All, larql_vindex::StorageDtype::F32, larql_vindex::QuantFormat::None, @@ -186,6 +187,7 @@ fn bench_q4k_vs_f32(c: &mut Criterion) { "bench/q4k_vs_f32", &q4k_dir, 5, + 0, larql_vindex::ExtractLevel::All, larql_vindex::StorageDtype::F32, larql_vindex::QuantFormat::Q4K, diff --git a/crates/larql-vindex/examples/q4k_demo.rs b/crates/larql-vindex/examples/q4k_demo.rs index ed21f5567..1b5e6f6f5 100644 --- a/crates/larql-vindex/examples/q4k_demo.rs +++ b/crates/larql-vindex/examples/q4k_demo.rs @@ -61,6 +61,7 @@ fn main() { "demo/q4k", &out_f32, 5, + 0, // summary_features_per_expert (off) ExtractLevel::All, StorageDtype::F32, QuantFormat::None, @@ -84,6 +85,7 @@ fn main() { "demo/q4k", &out_q4k, 5, + 0, // summary_features_per_expert (off) ExtractLevel::All, StorageDtype::F32, QuantFormat::Q4K, @@ -131,9 +133,12 @@ fn main() { // ── Manifest preview ── - println!("\n── attn_weights_q4k_manifest.json (first 2 entries) ──"); - let attn_manifest = - std::fs::read_to_string(out_q4k.join("attn_weights_q4k_manifest.json")).unwrap(); + // K-quant writer emits `attn_weights_kquant_manifest.json` (the old + // `attn_weights_q4k_manifest.json` name is a legacy fallback). Use the + // canonical filename constant so this demo tracks future renames. + let attn_manifest_name = larql_vindex::format::filenames::ATTN_WEIGHTS_KQUANT_MANIFEST_JSON; + println!("\n── {attn_manifest_name} (first 2 entries) ──"); + let attn_manifest = std::fs::read_to_string(out_q4k.join(attn_manifest_name)).unwrap(); let attn_entries: Vec = serde_json::from_str(&attn_manifest).unwrap(); for entry in attn_entries.iter().take(2) { println!(" {{ key: {},", entry["key"].as_str().unwrap()); diff --git a/crates/larql-vindex/src/extract/streaming/context.rs b/crates/larql-vindex/src/extract/streaming/context.rs index 15e0a34ff..fe9a3db22 100644 --- a/crates/larql-vindex/src/extract/streaming/context.rs +++ b/crates/larql-vindex/src/extract/streaming/context.rs @@ -39,6 +39,11 @@ pub(super) struct StreamingContext<'a> { pub(super) drop_gate_vectors: bool, pub(super) extract_level: crate::ExtractLevel, pub(super) down_top_k: usize, + /// Per-expert summary tier: when `> 0`, cap each expert's gate/down + /// feature columns to a top-K (SVD for gate) so many-experts MoE doesn't + /// explode. `0` = full per-expert features. Threaded from + /// `--summary-features-per-expert` (was an env side-channel). + pub(super) summary_features_per_expert: usize, // Architecture (owned, set in `new`) pub(super) arch: Box, @@ -79,6 +84,7 @@ impl<'a> StreamingContext<'a> { model_name: &'a str, output_dir: &'a Path, down_top_k: usize, + summary_features_per_expert: usize, extract_level: crate::ExtractLevel, dtype: StorageDtype, quant: QuantFormat, @@ -202,6 +208,7 @@ impl<'a> StreamingContext<'a> { drop_gate_vectors, extract_level, down_top_k, + summary_features_per_expert, arch, prefixes, num_layers, @@ -368,4 +375,65 @@ mod tests { other => panic!("expected NoSafetensors, got {other:?}"), } } + + /// Write `len` filler bytes so size-comparison branches have something + /// to rank (the GGUF detector reads `metadata().len()` for the + /// largest-file fallback, never the contents). + fn write_filler(path: &Path, len: usize) { + std::fs::write(path, vec![0u8; len]).unwrap(); + } + + #[test] + fn detect_gguf_entry_returns_single_file_as_is() { + let tmp = tempfile::tempdir().unwrap(); + let gguf = tmp.path().join("model.gguf"); + write_filler(&gguf, 16); + + let got = detect_gguf_entry(&gguf).unwrap(); + assert_eq!(got.as_deref(), Some(gguf.as_path())); + } + + #[test] + fn detect_gguf_entry_returns_none_for_missing_path() { + let tmp = tempfile::tempdir().unwrap(); + // Path neither a file nor a directory. + let got = detect_gguf_entry(&tmp.path().join("does-not-exist")).unwrap(); + assert!(got.is_none()); + } + + #[test] + fn detect_gguf_entry_returns_none_when_dir_has_no_gguf() { + let tmp = tempfile::tempdir().unwrap(); + touch(&tmp.path().join("config.json")); + touch(&tmp.path().join("model.safetensors")); + + let got = detect_gguf_entry(tmp.path()).unwrap(); + assert!(got.is_none()); + } + + #[test] + fn detect_gguf_entry_prefers_shard1_when_multi_shard_named() { + let tmp = tempfile::tempdir().unwrap(); + // Shard 1 is deliberately the *smallest* so we prove the + // `-00001-of-` name wins over the largest-file fallback. + let shard1 = tmp.path().join("model-00001-of-00002.gguf"); + let shard2 = tmp.path().join("model-00002-of-00002.gguf"); + write_filler(&shard1, 8); + write_filler(&shard2, 512); + + let got = detect_gguf_entry(tmp.path()).unwrap(); + assert_eq!(got.as_deref(), Some(shard1.as_path())); + } + + #[test] + fn detect_gguf_entry_falls_back_to_largest_without_shard_naming() { + let tmp = tempfile::tempdir().unwrap(); + let small = tmp.path().join("a.gguf"); + let large = tmp.path().join("b.gguf"); + write_filler(&small, 32); + write_filler(&large, 4096); + + let got = detect_gguf_entry(tmp.path()).unwrap(); + assert_eq!(got.as_deref(), Some(large.as_path())); + } } diff --git a/crates/larql-vindex/src/extract/streaming/mod.rs b/crates/larql-vindex/src/extract/streaming/mod.rs index b89932259..ef71edc3d 100644 --- a/crates/larql-vindex/src/extract/streaming/mod.rs +++ b/crates/larql-vindex/src/extract/streaming/mod.rs @@ -42,6 +42,10 @@ pub fn build_vindex_streaming( model_name: &str, output_dir: &Path, down_top_k: usize, + // Per-expert summary tier (`--summary-features-per-expert`): `> 0` caps + // each expert's gate/down features to a top-K so many-experts MoE doesn't + // explode; `0` = full per-expert features. + summary_features_per_expert: usize, extract_level: crate::ExtractLevel, dtype: StorageDtype, quant: QuantFormat, @@ -87,6 +91,7 @@ pub fn build_vindex_streaming( model_name, output_dir, down_top_k, + summary_features_per_expert, extract_level, dtype, quant, diff --git a/crates/larql-vindex/src/extract/streaming/stages/down_meta.rs b/crates/larql-vindex/src/extract/streaming/stages/down_meta.rs index bd7967791..4f20c55f8 100644 --- a/crates/larql-vindex/src/extract/streaming/stages/down_meta.rs +++ b/crates/larql-vindex/src/extract/streaming/stages/down_meta.rs @@ -146,8 +146,8 @@ impl<'a> StreamingContext<'a> { continue; } - // Same `LARQL_SUMMARY_FEATURES_PER_EXPERT` env that gates the - // gate-vectors SVD path also caps how many down_proj feature + // The `--summary-features-per-expert` cap (which gates the + // gate-vectors SVD path) also caps how many down_proj feature // columns we compute meta for. Without this cap, many-experts // MoE explodes: // 43 layers × 256 experts × 2048 features × (vocab × hidden) @@ -155,12 +155,8 @@ impl<'a> StreamingContext<'a> { // With K=64: ~32× speedup → ~2 hrs total. Limitation: records // meta for the first K columns rather than the SVD-selected // most-important features. Acceptable for the summary tier — - // full meta is still available via the default - // `--summary-features-per-expert 0`. - let summary_k = std::env::var("LARQL_SUMMARY_FEATURES_PER_EXPERT") - .ok() - .and_then(|s| s.parse::().ok()) - .unwrap_or(0); + // full meta is still available with the default `0`. + let summary_k = self.summary_features_per_expert; let mut feature_offset = 0usize; for w_down in &down_matrices { diff --git a/crates/larql-vindex/src/extract/streaming/stages/gate_vectors.rs b/crates/larql-vindex/src/extract/streaming/stages/gate_vectors.rs index aaa0279a8..556740611 100644 --- a/crates/larql-vindex/src/extract/streaming/stages/gate_vectors.rs +++ b/crates/larql-vindex/src/extract/streaming/stages/gate_vectors.rs @@ -171,16 +171,13 @@ impl<'a> StreamingContext<'a> { // (shape [num_features, hidden]) — fine for low-expert-count // MoE (Mixtral's 8 per layer = ~1.8 GB/layer at hidden=4096). // - // • Summary: when LARQL_SUMMARY_FEATURES_PER_EXPERT is set - // to a positive integer K, do a top-K randomized SVD of - // each expert's gate_proj and write only the top-K right + // • Summary: when `--summary-features-per-expert` is a + // positive integer K, do a top-K randomized SVD of each + // expert's gate_proj and write only the top-K right // singular vectors (K × hidden floats per expert). Required // for many-experts MoE (DeepSeek-V4 family at 256-384 // experts/layer would otherwise produce 100s of GB). - let summary_k = std::env::var("LARQL_SUMMARY_FEATURES_PER_EXPERT") - .ok() - .and_then(|s| s.parse::().ok()) - .unwrap_or(0); + let summary_k = self.summary_features_per_expert; let mut total_features = 0usize; let mut layer_bytes = 0u64; diff --git a/crates/larql-vindex/src/index/storage/lm_head/knn.rs b/crates/larql-vindex/src/index/storage/lm_head/knn.rs index aaf8843f6..5d21bedbb 100644 --- a/crates/larql-vindex/src/index/storage/lm_head/knn.rs +++ b/crates/larql-vindex/src/index/storage/lm_head/knn.rs @@ -16,10 +16,15 @@ enum Stride32Mode { First, } +/// Our own `LARQL_*` flag — read through `larql_compute::options` so the +/// thread-local override applies (no `std::env::set_var` in tests, which +/// races concurrent `getenv` on the decode path → SIGSEGV). +const ENV_LM_HEAD_STRIDE32: &str = "LARQL_LM_HEAD_STRIDE32"; + fn lm_head_stride32_mode() -> Stride32Mode { - match std::env::var("LARQL_LM_HEAD_STRIDE32") { - Ok(v) if matches!(v.as_str(), "1" | "true" | "on" | "yes") => Stride32Mode::First, - Ok(v) if matches!(v.as_str(), "0" | "false" | "off" | "no") => Stride32Mode::Disabled, + match larql_compute::options::env_value(ENV_LM_HEAD_STRIDE32).as_deref() { + Some("1") | Some("true") | Some("on") | Some("yes") => Stride32Mode::First, + Some("0") | Some("false") | Some("off") | Some("no") => Stride32Mode::Disabled, _ => Stride32Mode::Fallback, } } @@ -446,39 +451,30 @@ mod tests { // ── lm_head_stride32_mode ── - /// RAII env-var override for LARQL_LM_HEAD_STRIDE32. Restores prior - /// value on drop. - struct EnvSet { - prev: Option, - } + /// RAII override for `LARQL_LM_HEAD_STRIDE32` via the larql-compute + /// thread-local override (NOT `std::env::set_var`, which races the + /// concurrent `getenv` every parallel decode test does → SIGSEGVs + /// libc). Thread-local, so no `#[serial]` needed; cleared on drop. + struct EnvSet; impl EnvSet { fn set(value: Option<&str>) -> Self { - let prev = std::env::var("LARQL_LM_HEAD_STRIDE32").ok(); - match value { - Some(v) => std::env::set_var("LARQL_LM_HEAD_STRIDE32", v), - None => std::env::remove_var("LARQL_LM_HEAD_STRIDE32"), - } - Self { prev } + larql_compute::options::set_env_override(ENV_LM_HEAD_STRIDE32, value); + Self } } impl Drop for EnvSet { fn drop(&mut self) { - match self.prev.take() { - Some(v) => std::env::set_var("LARQL_LM_HEAD_STRIDE32", v), - None => std::env::remove_var("LARQL_LM_HEAD_STRIDE32"), - } + larql_compute::options::clear_fast_path_overrides(); } } #[test] - #[serial_test::serial] fn stride32_mode_unset_falls_back() { let _g = EnvSet::set(None); assert_eq!(lm_head_stride32_mode(), Stride32Mode::Fallback); } #[test] - #[serial_test::serial] fn stride32_mode_truthy_values_select_first() { for val in ["1", "true", "on", "yes"] { let _g = EnvSet::set(Some(val)); @@ -487,7 +483,6 @@ mod tests { } #[test] - #[serial_test::serial] fn stride32_mode_falsy_values_select_disabled() { for val in ["0", "false", "off", "no"] { let _g = EnvSet::set(Some(val)); @@ -500,7 +495,6 @@ mod tests { } #[test] - #[serial_test::serial] fn stride32_mode_unknown_value_falls_back() { let _g = EnvSet::set(Some("maybe")); assert_eq!(lm_head_stride32_mode(), Stride32Mode::Fallback); @@ -515,7 +509,6 @@ mod tests { // f32 fallback fires when neither Q4 nor f16 is loaded. #[test] - #[serial_test::serial] fn lm_head_knn_backend_falls_back_to_f32_when_no_q4_or_f16() { // Stride-32 disabled so the backend tries f16 (none) → f32. let _g = EnvSet::set(Some("0")); @@ -532,7 +525,6 @@ mod tests { } #[test] - #[serial_test::serial] fn lm_head_knn_backend_skip_q4k_falls_back_to_f32() { let _g = EnvSet::set(Some("0")); let lm_head: Vec = vec![ @@ -551,7 +543,6 @@ mod tests { /// is loaded on the storage façade. Exercised through the public /// `lm_head_knn_backend` with stride32 enabled. #[test] - #[serial_test::serial] fn lm_head_stride32_path_returns_none_without_q4_mmap() { let _g = EnvSet::set(Some("1")); let lm_head: Vec = vec![1.0, 0.0, 0.0, 1.0]; @@ -569,7 +560,6 @@ mod tests { /// We force the path by populating the f16 mmap then setting /// vocab_size = 0 — the early-return on `vocab > 0` fires. #[test] - #[serial_test::serial] fn lm_head_f16_path_returns_none_when_vocab_zero() { let _g = EnvSet::set(Some("0")); // skip stride32 diff --git a/crates/larql-vindex/tests/golden_resume.rs b/crates/larql-vindex/tests/golden_resume.rs index c95f47a15..a03f6cb24 100644 --- a/crates/larql-vindex/tests/golden_resume.rs +++ b/crates/larql-vindex/tests/golden_resume.rs @@ -129,6 +129,7 @@ fn run_extract(model_dir: &Path, output_dir: &Path) { "test/resume", output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, diff --git a/crates/larql-vindex/tests/test_streaming_stages_moe.rs b/crates/larql-vindex/tests/test_streaming_stages_moe.rs index 8d13286cb..965cd1a0e 100644 --- a/crates/larql-vindex/tests/test_streaming_stages_moe.rs +++ b/crates/larql-vindex/tests/test_streaming_stages_moe.rs @@ -175,6 +175,7 @@ fn streaming_extract_mixtral_exercises_moe_arms() { "test/mixtral-synthetic", &output_dir, 5, // down_top_k + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -414,6 +415,7 @@ fn streaming_extract_gemma4_hybrid_moe_exercises_packed_bf16_arms() { "test/gemma4-hybrid-moe-synthetic", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -660,6 +662,7 @@ fn streaming_extract_gpt_oss_exercises_packed_mxfp4_arms() { "test/gpt-oss-synthetic", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -690,32 +693,10 @@ fn streaming_extract_gpt_oss_exercises_packed_mxfp4_arms() { } } -// ─── LARQL_SUMMARY_FEATURES_PER_EXPERT path (gate_vectors SVD + down_meta cap) ── - -/// RAII guard for the `LARQL_SUMMARY_FEATURES_PER_EXPERT` env var. The -/// summary tier is gated on a process-global env var, so tests reading -/// or writing it must serialise via `#[serial]` (and the var must be -/// cleared on drop so neighbouring tests aren't affected). -struct SummaryEnvGuard { - prev: Option, -} - -impl SummaryEnvGuard { - fn set(value: &str) -> Self { - let prev = std::env::var("LARQL_SUMMARY_FEATURES_PER_EXPERT").ok(); - std::env::set_var("LARQL_SUMMARY_FEATURES_PER_EXPERT", value); - Self { prev } - } -} - -impl Drop for SummaryEnvGuard { - fn drop(&mut self) { - match self.prev.take() { - Some(v) => std::env::set_var("LARQL_SUMMARY_FEATURES_PER_EXPERT", v), - None => std::env::remove_var("LARQL_SUMMARY_FEATURES_PER_EXPERT"), - } - } -} +// ─── summary-features-per-expert path (gate_vectors SVD + down_meta cap) ── +// The summary tier is now a `build_vindex_streaming` parameter +// (`summary_features_per_expert`), passed directly per call — no +// process-global env, so these tests need no serialisation. /// Build a Mixtral fixture with a tokenizer that has a non-empty vocab, /// so `down_meta` actually decodes some `token_id → string` and exercises @@ -799,6 +780,7 @@ fn streaming_extract_mixtral_resumes_when_run_twice_on_same_output_dir() { "test/mixtral-resume", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -830,6 +812,7 @@ fn streaming_extract_mixtral_resumes_when_run_twice_on_same_output_dir() { "test/mixtral-resume", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -882,6 +865,7 @@ fn streaming_extract_mixtral_with_real_tokenizer_records_top_k_entries() { "test/mixtral-real-tok", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -941,6 +925,7 @@ fn streaming_extract_mixtral_with_drop_gate_vectors_removes_zero_byte_file() { "test/mixtral-drop-gate", &output_dir, 5, + 0, // summary_features_per_expert (off) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::Q4K, // required by drop_gate_vectors @@ -967,10 +952,9 @@ fn streaming_extract_mixtral_with_drop_gate_vectors_removes_zero_byte_file() { } #[test] -#[serial_test::serial] fn streaming_extract_mixtral_with_summary_k_runs_svd_and_caps_down_meta() { // Same Mixtral fixture as the baseline test, but with - // `LARQL_SUMMARY_FEATURES_PER_EXPERT=2`. Triggers: + // `summary_features_per_expert = 2`. Triggers: // - the SVD-summary path in `gate_vectors.rs` Standard-MoE branch // (writes K rows per expert instead of full intermediate=4) // - the down_meta `summary_k`-cap branch (truncates `num_features` @@ -987,8 +971,6 @@ fn streaming_extract_mixtral_with_summary_k_runs_svd_and_caps_down_meta() { let vocab = 16usize; let summary_k = 2usize; - let _env = SummaryEnvGuard::set(&summary_k.to_string()); - let tmp = tempfile::tempdir().unwrap(); let model_dir = tmp.path().join("model"); let output_dir = tmp.path().join("vindex"); @@ -1010,6 +992,7 @@ fn streaming_extract_mixtral_with_summary_k_runs_svd_and_caps_down_meta() { "test/mixtral-summary-k", &output_dir, 5, + summary_k, // summary_features_per_expert (SVD-summary tier) ExtractLevel::Browse, StorageDtype::F32, QuantFormat::None, @@ -1040,3 +1023,201 @@ fn streaming_extract_mixtral_with_summary_k_runs_svd_and_caps_down_meta() { "gate_vectors.bin sized for K rows × hidden, not intermediate" ); } + +// ─── GGUF-backed streaming extract ─────────────────────────────────────── +// Every fixture above is safetensors-backed. This section drives +// `build_vindex_streaming` against a hand-built GGUF model so the GGUF +// arms exercise end-to-end: arch detection in `streaming::mod` +// (`GgufFile::open` → `to_config_json` → `detect_from_json`) and the +// `GgufTensorSource` setup branch in `streaming::context::new`. + +use larql_models::loading::gguf::{GgufTensor, GgufValue, GgufWriter}; + +/// Deterministic f32 ramp → little-endian bytes (non-degenerate so the +/// down-projection argmax isn't a tie across the whole vocab). +fn gguf_f32_ramp(n: usize) -> Vec { + (0..n) + .flat_map(|i| ((i as f32) * 0.01).to_le_bytes()) + .collect() +} + +/// Write a tiny but complete llama-architecture GGUF model. +/// +/// FFN is square (`hidden == intermediate == 4`) on purpose: the +/// canonical FFN orientation in `GgufTensorSource::get_tensor_f32` +/// becomes a no-op (`orient` short-circuits when rows == cols), so the +/// synthetic data can't trip a transpose mismatch. Tensors use GGUF +/// naming (`blk.L.ffn_gate.weight`); the source adapter maps them back +/// to HF keys via `normalize_gguf_key`. +fn write_synthetic_llama_gguf(path: &Path, num_layers: usize, vocab: usize) { + const DIM: u64 = 4; // hidden == intermediate + let v = vocab as u64; + + let mut w = GgufWriter::new(); + w.meta("general.architecture", GgufValue::String("llama".into())) + .meta("llama.embedding_length", GgufValue::U32(DIM as u32)) + .meta("llama.block_count", GgufValue::U32(num_layers as u32)) + .meta("llama.feed_forward_length", GgufValue::U32(DIM as u32)) + .meta("llama.attention.head_count", GgufValue::U32(2)) + .meta("llama.attention.head_count_kv", GgufValue::U32(2)) + .meta("llama.attention.key_length", GgufValue::U32(2)) + .meta("llama.rope.freq_base", GgufValue::F32(10000.0)); + + // GGUF dims are innermost-first: [hidden, vocab] reshapes to the + // Array2 (vocab, hidden) the embeddings stage expects. + w.tensor(GgufTensor { + name: "token_embd.weight".into(), + dims: vec![DIM, v], + ggml_type: 0, // GGML_TYPE_F32 + data: gguf_f32_ramp((DIM * v) as usize), + }); + w.tensor(GgufTensor { + name: "output.weight".into(), + dims: vec![DIM, v], + ggml_type: 0, + data: gguf_f32_ramp((DIM * v) as usize), + }); + w.tensor(GgufTensor { + name: "output_norm.weight".into(), + dims: vec![DIM], + ggml_type: 0, + data: gguf_f32_ramp(DIM as usize), + }); + for layer in 0..num_layers { + w.tensor(GgufTensor { + name: format!("blk.{layer}.ffn_gate.weight"), + dims: vec![DIM, DIM], + ggml_type: 0, + data: gguf_f32_ramp((DIM * DIM) as usize), + }); + w.tensor(GgufTensor { + name: format!("blk.{layer}.ffn_down.weight"), + dims: vec![DIM, DIM], + ggml_type: 0, + data: gguf_f32_ramp((DIM * DIM) as usize), + }); + } + w.write_to_file(path).unwrap(); +} + +#[test] +fn streaming_extract_gguf_llama_browse_runs_end_to_end() { + let num_layers = 2usize; + let vocab = 24usize; + + let tmp = tempfile::tempdir().unwrap(); + let model_dir = tmp.path().join("gguf_model"); + std::fs::create_dir_all(&model_dir).unwrap(); + // A directory containing exactly one `.gguf` — exercises the + // dir-scan branch of `detect_gguf_entry` as well as the GGUF arms. + write_synthetic_llama_gguf(&model_dir.join("model.gguf"), num_layers, vocab); + + let tok_json = + r#"{"version":"1.0","model":{"type":"BPE","vocab":{},"merges":[]},"added_tokens":[]}"#; + let tokenizer = larql_vindex::tokenizers::Tokenizer::from_bytes(tok_json.as_bytes()).unwrap(); + + let output_dir = tmp.path().join("vindex"); + let mut cb = SilentBuildCallbacks; + build_vindex_streaming( + &model_dir, + &tokenizer, + "test/llama-gguf", + &output_dir, + 5, // down_top_k + 0, // summary_features_per_expert (off) + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + false, // drop_gate_vectors + &mut cb, + ) + .expect("streaming extract on GGUF llama fixture"); + + let config = larql_vindex::load_vindex_config(&output_dir).unwrap(); + assert_eq!( + config.layers.len(), + num_layers, + "GGUF extract should record one layer_info per block" + ); + assert!( + output_dir.join("gate_vectors.bin").exists(), + "GGUF extract should write gate_vectors.bin" + ); + assert!( + output_dir.join("embeddings.bin").exists(), + "GGUF extract should write embeddings.bin" + ); +} + +#[test] +fn streaming_extract_gguf_single_file_path_is_accepted() { + // Point `build_vindex_streaming` directly at the `.gguf` file rather + // than its parent dir — exercises the single-file arm of + // `detect_gguf_entry` through the full pipeline. + let tmp = tempfile::tempdir().unwrap(); + let gguf_path = tmp.path().join("solo.gguf"); + write_synthetic_llama_gguf(&gguf_path, 1, 16); + + let tok_json = + r#"{"version":"1.0","model":{"type":"BPE","vocab":{},"merges":[]},"added_tokens":[]}"#; + let tokenizer = larql_vindex::tokenizers::Tokenizer::from_bytes(tok_json.as_bytes()).unwrap(); + + let output_dir = tmp.path().join("vindex"); + let mut cb = SilentBuildCallbacks; + build_vindex_streaming( + &gguf_path, + &tokenizer, + "test/llama-gguf-solo", + &output_dir, + 3, + 0, + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + false, + &mut cb, + ) + .expect("streaming extract on single-file GGUF"); + + let config = larql_vindex::load_vindex_config(&output_dir).unwrap(); + assert_eq!(config.layers.len(), 1); +} + +#[test] +fn streaming_extract_drop_gate_without_q4k_is_rejected() { + // `--drop-gate-vectors` is only recoverable when interleaved Q4K is + // also written; with `QuantFormat::None` the orchestrator must refuse + // before touching the output dir. + let tmp = tempfile::tempdir().unwrap(); + let model_dir = tmp.path().join("model"); + let output_dir = tmp.path().join("vindex"); + let tokenizer = write_synthetic_mixtral_model(&model_dir, 8, 4, 1, 2, 1, 16); + + let mut cb = SilentBuildCallbacks; + let err = build_vindex_streaming( + &model_dir, + &tokenizer, + "test/drop-gate-bad", + &output_dir, + 5, + 0, + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, // not Q4K — drop_gate is invalid here + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + true, // drop_gate_vectors + &mut cb, + ) + .expect_err("drop_gate_vectors without Q4K must be rejected"); + + let msg = format!("{err}").to_lowercase(); + assert!( + msg.contains("drop-gate-vectors") || msg.contains("q4k"), + "unexpected error message: {msg}" + ); +} diff --git a/crates/larql-vindex/tests/test_vindex.rs b/crates/larql-vindex/tests/test_vindex.rs index d88d7bca4..aff2f8510 100644 --- a/crates/larql-vindex/tests/test_vindex.rs +++ b/crates/larql-vindex/tests/test_vindex.rs @@ -2601,6 +2601,7 @@ fn streaming_extract_from_safetensors() { "test/streaming", &output_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Browse, larql_vindex::StorageDtype::F32, larql_vindex::QuantFormat::None, @@ -2805,6 +2806,7 @@ fn streaming_extract_q4k_from_safetensors() { "test/streaming-q4k", &output_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Browse, larql_vindex::StorageDtype::F32, QuantFormat::Q4K, @@ -3819,6 +3821,7 @@ fn streaming_extract_q4k_carries_ple_tensors() { "test/streaming-q4k-ple", &output_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Browse, larql_vindex::StorageDtype::F32, QuantFormat::Q4K, @@ -3997,6 +4000,7 @@ fn streaming_extract_noquant_carries_ple_tensors() { "test/streaming-noquant-ple", &output_dir, 5, + 0, // summary_features_per_expert (off) // Inference (not Browse): non-Q4 only writes model weights when // the level includes attn. larql_vindex::ExtractLevel::Inference, @@ -4169,6 +4173,7 @@ fn load_model_weights_rejects_ple_arch_with_missing_sidecars() { "test/ple-missing-sidecar", &output_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Inference, larql_vindex::StorageDtype::F32, QuantFormat::None, @@ -4381,6 +4386,7 @@ fn streaming_extract_preserves_per_layer_intermediate_for_variable_ffn() { "test/variable-ffn", &output_dir, 5, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Browse, larql_vindex::StorageDtype::F32, QuantFormat::Q4K, diff --git a/crates/larql-vindex/tests/test_vindex_to_q4k.rs b/crates/larql-vindex/tests/test_vindex_to_q4k.rs index c418afb1d..71dc8336e 100644 --- a/crates/larql-vindex/tests/test_vindex_to_q4k.rs +++ b/crates/larql-vindex/tests/test_vindex_to_q4k.rs @@ -280,6 +280,7 @@ fn q4k_end_to_end_from_synthetic_safetensors() { "test/q4k-e2e-source", &src_dir, 4, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Inference, larql_vindex::StorageDtype::F32, larql_vindex::QuantFormat::None, @@ -433,6 +434,7 @@ fn q4k_feature_major_down_round_trip() { "test/fm-down", &src_dir, 4, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Inference, larql_vindex::StorageDtype::F32, QuantFormat::None, @@ -536,6 +538,7 @@ fn legacy_q4k_filenames_load_via_dual_read() { "test/legacy-round-trip", &src_dir, 4, + 0, // summary_features_per_expert (off) larql_vindex::ExtractLevel::Inference, larql_vindex::StorageDtype::F32, QuantFormat::None, From 692c63fa8f8eacb3245c1a17c038b30961df34a0 Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Wed, 17 Jun 2026 00:09:33 +0100 Subject: [PATCH 7/8] increaded coverage --- crates/larql-vindex/coverage-policy.json | 3 - .../tests/test_streaming_stages_moe.rs | 203 ++++++++++++++++-- 2 files changed, 191 insertions(+), 15 deletions(-) diff --git a/crates/larql-vindex/coverage-policy.json b/crates/larql-vindex/coverage-policy.json index 67c03eb99..67167142d 100644 --- a/crates/larql-vindex/coverage-policy.json +++ b/crates/larql-vindex/coverage-policy.json @@ -17,10 +17,7 @@ "crates/larql-vindex/src/extract/build/down_meta.rs": 66.0, "crates/larql-vindex/src/extract/build/index_json.rs": 84.0, "crates/larql-vindex/src/extract/build/mod.rs": 89.0, - "crates/larql-vindex/src/extract/streaming/context.rs": 80.0, - "crates/larql-vindex/src/extract/streaming/mod.rs": 80.0, "crates/larql-vindex/src/extract/streaming/stages/model_weights.rs": 79.0, - "crates/larql-vindex/src/extract/streaming/stages/down_meta.rs": 84.0, "crates/larql-vindex/src/extract/streaming/stages/router_weights.rs": 87.0, "crates/larql-vindex/src/extract/streaming/tensor_io.rs": 70.0, "crates/larql-vindex/src/format/fp4_codec.rs": 85.9, diff --git a/crates/larql-vindex/tests/test_streaming_stages_moe.rs b/crates/larql-vindex/tests/test_streaming_stages_moe.rs index 965cd1a0e..7e3b9cbbe 100644 --- a/crates/larql-vindex/tests/test_streaming_stages_moe.rs +++ b/crates/larql-vindex/tests/test_streaming_stages_moe.rs @@ -40,6 +40,33 @@ fn write_synthetic_mixtral_model( num_experts: usize, num_experts_per_tok: usize, vocab: usize, +) -> larql_vindex::tokenizers::Tokenizer { + write_synthetic_mixtral_model_opts( + model_dir, + hidden, + intermediate, + num_layers, + num_experts, + num_experts_per_tok, + vocab, + true, + ) +} + +/// As [`write_synthetic_mixtral_model`], but `include_expert_down = false` +/// omits every expert's `w2` (down) tensor. With no per-expert down +/// matrices to gather, `down_meta`'s MoE arm produces an empty +/// `down_matrices` and takes its `is_empty()` skip branch for each layer. +#[allow(clippy::too_many_arguments)] +fn write_synthetic_mixtral_model_opts( + model_dir: &Path, + hidden: usize, + intermediate: usize, + num_layers: usize, + num_experts: usize, + num_experts_per_tok: usize, + vocab: usize, + include_expert_down: bool, ) -> larql_vindex::tokenizers::Tokenizer { std::fs::create_dir_all(model_dir).unwrap(); @@ -107,7 +134,9 @@ fn write_synthetic_mixtral_model( for e in 0..num_experts { let ep = format!("{lp}.block_sparse_moe.experts.{e}"); push(&format!("{ep}.w1.weight"), vec![intermediate, hidden]); - push(&format!("{ep}.w2.weight"), vec![hidden, intermediate]); + if include_expert_down { + push(&format!("{ep}.w2.weight"), vec![hidden, intermediate]); + } push(&format!("{ep}.w3.weight"), vec![intermediate, hidden]); } } @@ -724,11 +753,14 @@ fn write_synthetic_mixtral_model_with_real_tokenizer( vocab, ); - // Populate the BPE `vocab` map directly so `decode(&[id], true)` - // (which skips special tokens) still returns the printable token - // string for every ID in 0..min(vocab, 8). IDs beyond that decode - // to empty and exercise the `.filter(|s| !s.is_empty())` skip path. - let vocab_entries: Vec = (0..(vocab.min(8))) + // Populate the BPE `vocab` map for *every* ID so `decode(&[id], true)` + // returns a printable, non-empty token for whichever feature the + // down-projection argmax selects. This exercises the `TopKEntry` + // keep arm in `down_meta` (the `.map(|token| TopKEntry { .. })` that + // earlier capped at ID 8 never reached, because the argmax always + // landed on a higher ID). The complementary empty-string skip path + // stays covered by the empty-tokenizer fixtures above. + let vocab_entries: Vec = (0..vocab) .map(|i| format!("\"tok{i}\":{i}")) .collect(); let tok_json = format!( @@ -1050,6 +1082,19 @@ fn gguf_f32_ramp(n: usize) -> Vec { /// naming (`blk.L.ffn_gate.weight`); the source adapter maps them back /// to HF keys via `normalize_gguf_key`. fn write_synthetic_llama_gguf(path: &Path, num_layers: usize, vocab: usize) { + write_synthetic_llama_gguf_opts(path, num_layers, vocab, true); +} + +/// As [`write_synthetic_llama_gguf`], but `include_ffn_down = false` +/// omits the `blk.L.ffn_down.weight` tensors — so `down_meta`'s dense +/// arm hits its missing-tensor `continue` (the down projection is +/// skipped for every layer). +fn write_synthetic_llama_gguf_opts( + path: &Path, + num_layers: usize, + vocab: usize, + include_ffn_down: bool, +) { const DIM: u64 = 4; // hidden == intermediate let v = vocab as u64; @@ -1090,12 +1135,14 @@ fn write_synthetic_llama_gguf(path: &Path, num_layers: usize, vocab: usize) { ggml_type: 0, data: gguf_f32_ramp((DIM * DIM) as usize), }); - w.tensor(GgufTensor { - name: format!("blk.{layer}.ffn_down.weight"), - dims: vec![DIM, DIM], - ggml_type: 0, - data: gguf_f32_ramp((DIM * DIM) as usize), - }); + if include_ffn_down { + w.tensor(GgufTensor { + name: format!("blk.{layer}.ffn_down.weight"), + dims: vec![DIM, DIM], + ggml_type: 0, + data: gguf_f32_ramp((DIM * DIM) as usize), + }); + } } w.write_to_file(path).unwrap(); } @@ -1221,3 +1268,135 @@ fn streaming_extract_drop_gate_without_q4k_is_rejected() { "unexpected error message: {msg}" ); } + +// ─── down_meta edge arms (missing-tensor + resume skips) ───────────────── + +#[test] +fn streaming_extract_dense_with_missing_ffn_down_skips_down_projection() { + // Dense llama GGUF with no `blk.L.ffn_down.weight` — `down_meta`'s + // dense arm must hit its missing-tensor `continue` for every layer + // (no down projection), yet the extract still succeeds and writes a + // config with the right layer count. + let tmp = tempfile::tempdir().unwrap(); + let model_dir = tmp.path().join("gguf_model"); + std::fs::create_dir_all(&model_dir).unwrap(); + write_synthetic_llama_gguf_opts(&model_dir.join("model.gguf"), 2, 16, false); + + let tok_json = + r#"{"version":"1.0","model":{"type":"BPE","vocab":{},"merges":[]},"added_tokens":[]}"#; + let tokenizer = larql_vindex::tokenizers::Tokenizer::from_bytes(tok_json.as_bytes()).unwrap(); + + let output_dir = tmp.path().join("vindex"); + let mut cb = SilentBuildCallbacks; + build_vindex_streaming( + &model_dir, + &tokenizer, + "test/llama-gguf-nodown", + &output_dir, + 5, + 0, + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + false, + &mut cb, + ) + .expect("extract should succeed even when ffn_down is absent"); + + let config = larql_vindex::load_vindex_config(&output_dir).unwrap(); + assert_eq!(config.layers.len(), 2); +} + +#[test] +#[serial_test::serial] +fn streaming_extract_moe_with_missing_expert_down_skips_layer() { + // Mixtral fixture with no expert `w2` (down) tensors — `down_meta`'s + // MoE arm gathers an empty `down_matrices` and takes the + // `is_empty()` skip branch for every layer. The other stages + // (gate / router / embeddings) still have what they need. + let tmp = tempfile::tempdir().unwrap(); + let model_dir = tmp.path().join("model"); + let output_dir = tmp.path().join("vindex"); + let tokenizer = write_synthetic_mixtral_model_opts( + &model_dir, 8, 4, 2, 2, 1, 16, /* include_expert_down = */ false, + ); + + let mut cb = SilentBuildCallbacks; + build_vindex_streaming( + &model_dir, + &tokenizer, + "test/mixtral-nodown", + &output_dir, + 5, + 0, + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + false, + &mut cb, + ) + .expect("extract should succeed when expert down tensors are absent"); + + // Gate still written; the run completes despite the empty down arm. + assert!(output_dir.join("gate_vectors.bin").exists()); + assert!(larql_vindex::load_vindex_config(&output_dir).is_ok()); +} + +#[test] +#[serial_test::serial] +fn streaming_extract_resumes_and_skips_down_meta_when_checkpoint_marks_it() { + use larql_vindex::extract::{Checkpoint, ExtractPhase}; + + // Full extract once, then plant a checkpoint that marks the down_meta + // phase complete and re-run. The second run must take `write_down_meta`'s + // resume-skip branch — `down_meta.bin` is reused byte-for-byte, never + // recomputed. + let tmp = tempfile::tempdir().unwrap(); + let model_dir = tmp.path().join("model"); + let output_dir = tmp.path().join("vindex"); + let num_layers = 2usize; + let tokenizer = write_synthetic_mixtral_model(&model_dir, 8, 4, num_layers, 2, 1, 16); + let model_name = "test/resume-down-meta"; + + let run = |out: &Path, tok: &larql_vindex::tokenizers::Tokenizer| { + let mut cb = SilentBuildCallbacks; + build_vindex_streaming( + &model_dir, + tok, + model_name, + out, + 5, + 0, + ExtractLevel::Browse, + StorageDtype::F32, + QuantFormat::None, + WriteWeightsOptions::default(), + KquantWriteOptions::default(), + false, + &mut cb, + ) + .expect("streaming extract"); + }; + + run(&output_dir, &tokenizer); + let down_meta_before = std::fs::read(output_dir.join("down_meta.bin")).unwrap(); + + // Plant a checkpoint marking *only* down_meta complete (the gate stage + // re-runs fresh, so `layer_infos` is rebuilt for index.json — we just + // want down_meta to be skipped). `mark` persists to disk. + let mut cp = Checkpoint::fresh(&model_dir, model_name, num_layers); + cp.mark(ExtractPhase::DownMeta, &output_dir).unwrap(); + assert!(cp.is_complete(ExtractPhase::DownMeta)); + + run(&output_dir, &tokenizer); + let down_meta_after = std::fs::read(output_dir.join("down_meta.bin")).unwrap(); + + assert_eq!( + down_meta_before, down_meta_after, + "resumed down_meta.bin must be reused unchanged, not recomputed" + ); +} From 396fb3f02821d7a9d4dba0d2c3d0f37c8678d5ab Mon Sep 17 00:00:00 2001 From: chrishayuk Date: Wed, 17 Jun 2026 21:12:14 +0100 Subject: [PATCH 8/8] environment variable cleanup --- .../tests/test_streaming_stages_moe.rs | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/crates/larql-vindex/tests/test_streaming_stages_moe.rs b/crates/larql-vindex/tests/test_streaming_stages_moe.rs index 7e3b9cbbe..4123814d9 100644 --- a/crates/larql-vindex/tests/test_streaming_stages_moe.rs +++ b/crates/larql-vindex/tests/test_streaming_stages_moe.rs @@ -1,22 +1,28 @@ -//! Synthetic-safetensors fixtures for the streaming-extract MoE arms. +//! Synthetic fixtures for the streaming-extract pipeline. //! //! Hand-built, deterministic, in-process — no HuggingFace, no large -//! model downloads. Each fixture writes a tempdir tree (`config.json` + -//! `tokenizer.json` + `model.safetensors`) shaped like a real -//! architecture and drives [`larql_vindex::build_vindex_streaming`] -//! against it. The point is to exercise the per-format arms inside -//! `extract::streaming::stages::*` that the dense Llama fixture in -//! `test_vindex.rs` doesn't reach: +//! model downloads. Fixtures write a tempdir tree (safetensors: +//! `config.json` + `tokenizer.json` + `model.safetensors`; GGUF: a +//! single `model.gguf`) shaped like a real architecture and drive +//! [`larql_vindex::build_vindex_streaming`] against it. //! -//! - `gate_vectors::write_gate_vectors` — standard MoE arm -//! - `down_meta::write_down_meta` — standard MoE arm +//! Coverage targets across `extract::streaming::{mod, context, stages}` +//! that the dense Llama fixture in `test_vindex.rs` doesn't reach: +//! +//! - `gate_vectors::write_gate_vectors` / `down_meta::write_down_meta` — +//! standard MoE arms (Mixtral happy path) //! - `router_weights::write_router_weights` — whole body (early-returns //! on dense; only fires when `is_moe`) //! - `index_json::write_index_json` — MoE config branch + has-experts //! per-layer tracking -//! -//! Single Mixtral-shaped happy path is enough to flip all four files -//! into "MoE arm exercised" territory. +//! - the **GGUF arms**: arch detection in `streaming::mod`, the +//! `GgufTensorSource` branch of `context::new`, and `detect_gguf_entry` +//! (single-file, multi-shard, largest-fallback) — driven by a hand- +//! built `GgufWriter` llama model +//! - `down_meta` **edge arms**: missing-tensor `continue`s (dense and +//! per-expert MoE down absent), the resume-skip path (checkpoint marks +//! down_meta complete), and the `TopKEntry` keep arm (full-vocab +//! tokenizer so the down argmax decodes to a non-empty token) use std::collections::HashMap; use std::path::Path; @@ -760,9 +766,7 @@ fn write_synthetic_mixtral_model_with_real_tokenizer( // earlier capped at ID 8 never reached, because the argmax always // landed on a higher ID). The complementary empty-string skip path // stays covered by the empty-tokenizer fixtures above. - let vocab_entries: Vec = (0..vocab) - .map(|i| format!("\"tok{i}\":{i}")) - .collect(); + let vocab_entries: Vec = (0..vocab).map(|i| format!("\"tok{i}\":{i}")).collect(); let tok_json = format!( r#"{{"version":"1.0","model":{{"type":"BPE","vocab":{{{}}},"merges":[]}},"added_tokens":[]}}"#, vocab_entries.join(",")