chrishayuk · chrishayuk · Jun 21, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/README.md b/README.md
@@ -625,6 +625,8 @@ vs ollama gemma3:4b on the same machine: ~103 tok/s steady → **gap 1.17×**, w
 
 **CPU vs llama.cpp** (reconciled 2026-06-02, M3 Max, 8 threads, warm): larql **26.4** (StandardEngine) / 23.5 (legacy `bench --cpu`) vs **llama.cpp `-ngl 0` 43.0** tok/s → **gap ~1.6–1.8×**. The gap is per-core kernel quality — both attention and FFN already run the int8 Q8_K SDOT kernel; closing it is C12 (hand-asm; an opt-in `LARQL_Q4K_ASM=1` v1 lands +~4% isolated). `larql bench --cpu` now reports both the legacy and production-StandardEngine rows; `--ollama-cpu` forces a true CPU ollama baseline (default `--ollama` runs on Metal GPU). The earlier 1.5×/1.9× spread was two measurement confounds (path mismatch + an unwarmed-ollama artifact), not a regression — see `bench/baselines/c10_gemma3-4b_cpu_reconciled.json`.
 
+**CPU prefill** (2026-06-22): the per-layer f32 dequant — long the dominant prefill cost (~2.7 s / ~2 tok/s on the 5-token prompt) — is gone. Q/K/V/O **and** gate/up/down now project straight from the Q4_K/Q6_K vindex bytes via amortised `q4k_matmul` / `q6k_matmul` (the Q6_K twin handles the default Q6_K `v_proj` / `down_proj`) with a hand-written aarch64 NEON inner dot. Gemma 3 4B Q4_K CPU prefill: **2746 ms → 233 ms (11.8×)**, narrowing the gap to llama.cpp `pp5` from ~55× to **~3×**; the NEON `q4k_matmul` at seq=5 beats f32 AMX sgemm while still skipping the dequant. See `bench/baselines/cpu/COMPARISON.md`.
+
 **Cross-arch coverage (2026-05-09)**: Gemma 3, Gemma 4 31B dense, Llama 2 7B, Mistral 7B all dispatch correctly through Metal. Gemma 4 E2B currently falls back to CPU (Per-Layer Embeddings not yet in Metal — ROADMAP D-METAL-PLE). See [crates/larql-compute/docs/architecture-shader-map.md](crates/larql-compute/docs/architecture-shader-map.md) for the per-architecture shader dispatch table.
 
 CPU walk breakdown:

diff --git a/ROADMAP.md b/ROADMAP.md
diff --git a/bench/baselines/cpu/COMPARISON.md b/bench/baselines/cpu/COMPARISON.md
@@ -1,5 +1,27 @@
 # larql vs llama.cpp — CPU decode on Gemma 3 4B Q4_K
 
+> **Update 2026-06-22 — prefill gap largely closed.** The q4k-direct prefill
+> work changed the picture: Q4_K/Q6_K attention (Q/K/V/O) and FFN (gate/up/down)
+> projections now run straight from the vindex bytes with no per-layer f32
+> dequant — `q4k_matmul`/`q6k_matmul` (the Q6_K twin, used by the default Q6_K
+> `down_proj` and `v_proj`), with a hand-written aarch64 NEON inner dot.
+> Apple M3 Max, CPU only (`-t 8`), same model + prompt as below.
+>
+> | Metric | larql (standard) | llama.cpp | Ratio |
+> |---|---:|---:|---:|
+> | Decode (tg, tok/s)                   | ~42              | ~38   | **~1.1× ahead** |
+> | Prefill (5-tok prompt, ms)           | 233              | ~70   | **~3.3× behind** (was 55×) |
+> | Prefill vs the May full-dequant path | 2746 → 233 ms    |       | **11.8× faster** |
+>
+> Decode is now level with or ahead of llama.cpp; prefill went from 55× behind to ~3× behind. The
+> NEON `q4k_matmul` at seq=5 actually *beats* f32 AMX sgemm (1.0–1.3×) while
+> skipping the dequant. The remaining prefill gap is constant-factor kernel work
+> (our matmul vs llama.cpp's hand-tuned asm) plus batched attention, not dequant.
+> Numbers are from one session (machine warm from builds), so the ratios hold; absolute
+> timings on a cold machine run a touch faster. The 2026-05-15 baseline below is kept for history.
+
+---
+
 Recorded 2026-05-15 on Apple M3 Max, 12 threads, BLAS / Accelerate enabled,
 no GPU. Both engines load the same model weights — `output/larql-gemma-3-4b-it.gguf`
 quantized to Q4_K_M for llama.cpp, the matching `output/gemma3-4b-q4k-v2.vindex`

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/basis.rs b/crates/larql-cli/src/commands/dev/ov_rd/basis.rs
@@ -259,8 +259,12 @@ pub(super) fn fit_z_pca_bases(
         for layer in 0..weights.num_layers {
             let inserted = insert_q4k_layer_tensors(weights, index, layer)?;
             if let Some(layer_heads) = heads_by_layer.get(&layer) {
-                let (_, pre_o) = run_attention_block_with_pre_o(weights, &h, layer)
-                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                let (_, pre_o) = run_attention_block_with_pre_o(
+                    larql_models::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                )
+                .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                 let head_dim = weights.arch.head_dim_for_layer(layer);
                 for head in layer_heads {
                     let basis = bases.get(head).expect("basis pre-created for PCA fit");
@@ -287,9 +291,15 @@ pub(super) fn fit_z_pca_bases(
 
             {
                 let ffn = WeightFfn { weights };
-                if let Some((h_new, _, _)) =
-                    run_layer_with_ffn(weights, &h, layer, &ffn, false, ple_inputs.get(layer), None)
-                {
+                if let Some((h_new, _, _)) = run_layer_with_ffn(
+                    larql_inference::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                    &ffn,
+                    false,
+                    ple_inputs.get(layer),
+                    None,
+                ) {
                     h = h_new;
                 }
             }

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/capture.rs b/crates/larql-cli/src/commands/dev/ov_rd/capture.rs
@@ -132,8 +132,12 @@ pub(super) fn run_capture(args: CaptureArgs) -> Result<(), Box<dyn std::error::E
             let inserted = insert_q4k_layer_tensors(&mut weights, &index, layer)?;
 
             if capture_layer(layer) {
-                let (_, pre_o) = run_attention_block_with_pre_o(&weights, &h, layer)
-                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                let (_, pre_o) = run_attention_block_with_pre_o(
+                    larql_models::WeightsView::dense(&weights),
+                    &h,
+                    layer,
+                )
+                .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                 add_pre_o_stats(
                     &mut stats[layer],
                     &pre_o,
@@ -160,7 +164,7 @@ pub(super) fn run_capture(args: CaptureArgs) -> Result<(), Box<dyn std::error::E
             {
                 let ffn = WeightFfn { weights: &weights };
                 if let Some((h_new, _, _)) = run_layer_with_ffn(
-                    &weights,
+                    larql_inference::WeightsView::dense(&weights),
                     &h,
                     layer,
                     &ffn,

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/edit_catalog.rs b/crates/larql-cli/src/commands/dev/ov_rd/edit_catalog.rs
@@ -278,7 +278,7 @@ pub(super) fn run_oracle_edit_catalog(
         }
         let stratum = record.stratum.as_deref().unwrap_or("unknown");
         let baseline_hidden =
-            larql_inference::vindex::predict_kquant_hidden(&mut weights, &token_ids, &index, None);
+            larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None);
         let baseline_logits = final_logits(&weights, &baseline_hidden);
         let baseline_logp = log_softmax(&baseline_logits);
         let baseline_top1 = argmax(&baseline_logits);
@@ -483,8 +483,12 @@ fn fit_edit_catalogs(
         for layer in 0..weights.num_layers {
             let inserted = insert_q4k_layer_tensors(weights, index, layer)?;
             if let Some(layer_heads) = heads_by_layer.get(&layer) {
-                let (_, pre_o) = run_attention_block_with_pre_o(weights, &h, layer)
-                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                let (_, pre_o) = run_attention_block_with_pre_o(
+                    larql_models::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                )
+                .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                 let head_dim = weights.arch.head_dim_for_layer(layer);
                 for head in layer_heads {
                     let basis = bases.get(head).expect("basis pre-created for edit catalog");
@@ -537,9 +541,15 @@ fn fit_edit_catalogs(
 
             {
                 let ffn = WeightFfn { weights };
-                if let Some((h_new, _, _)) =
-                    run_layer_with_ffn(weights, &h, layer, &ffn, false, ple_inputs.get(layer), None)
-                {
+                if let Some((h_new, _, _)) = run_layer_with_ffn(
+                    larql_inference::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                    &ffn,
+                    false,
+                    ple_inputs.get(layer),
+                    None,
+                ) {
                     h = h_new;
                 }
             }

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/eval_program/mod.rs b/crates/larql-cli/src/commands/dev/ov_rd/eval_program/mod.rs
@@ -254,15 +254,10 @@ pub(super) fn run_eval_program(args: EvalProgramArgs) -> Result<(), Box<dyn std:
             {
                 h
             } else {
-                larql_inference::vindex::predict_kquant_hidden(
-                    &mut weights,
-                    &token_ids,
-                    &index,
-                    None,
-                )
+                larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None)
             }
         } else {
-            larql_inference::vindex::predict_kquant_hidden(&mut weights, &token_ids, &index, None)
+            larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None)
         };
         let baseline_logits = final_logits(&weights, &baseline_h);
         let baseline_logp = log_softmax(&baseline_logits);

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/gamma_address.rs b/crates/larql-cli/src/commands/dev/ov_rd/gamma_address.rs
@@ -733,8 +733,12 @@ fn collect_gamma_code_samples(
             let inserted = insert_q4k_layer_tensors(weights, index, layer)?;
             if let Some(layer_heads) = heads_by_layer.get(&layer) {
                 let layer_input = h.clone();
-                let (_, pre_o) = run_attention_block_with_pre_o(weights, &h, layer)
-                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                let (_, pre_o) = run_attention_block_with_pre_o(
+                    larql_models::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                )
+                .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                 let head_dim = weights.arch.head_dim_for_layer(layer);
                 for head in layer_heads {
                     let basis = bases.get(head).ok_or_else(|| {
@@ -787,9 +791,15 @@ fn collect_gamma_code_samples(
 
             {
                 let ffn = WeightFfn { weights };
-                if let Some((h_new, _, _)) =
-                    run_layer_with_ffn(weights, &h, layer, &ffn, false, ple_inputs.get(layer), None)
-                {
+                if let Some((h_new, _, _)) = run_layer_with_ffn(
+                    larql_inference::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                    &ffn,
+                    false,
+                    ple_inputs.get(layer),
+                    None,
+                ) {
                     h = h_new;
                 } else {
                     remove_layer_tensors(weights, inserted);

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/oracle.rs b/crates/larql-cli/src/commands/dev/ov_rd/oracle.rs
@@ -270,7 +270,7 @@ pub(super) fn run_oracle_roundtrip(
         let stratum = record.stratum.as_deref().unwrap_or("unknown");
 
         let baseline_hidden =
-            larql_inference::vindex::predict_kquant_hidden(&mut weights, &token_ids, &index, None);
+            larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None);
         let baseline_logits = final_logits(&weights, &baseline_hidden);
         let baseline_logp = log_softmax(&baseline_logits);
 
@@ -414,7 +414,7 @@ pub(super) fn run_oracle_lowrank(
         let stratum = record.stratum.as_deref().unwrap_or("unknown");
 
         let baseline_hidden =
-            larql_inference::vindex::predict_kquant_hidden(&mut weights, &token_ids, &index, None);
+            larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None);
         let baseline_logits = final_logits(&weights, &baseline_hidden);
         let baseline_logp = log_softmax(&baseline_logits);
         let baseline_top1 = argmax(&baseline_logits);

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq.rs b/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq.rs
@@ -1715,7 +1715,7 @@ pub(super) fn run_oracle_pq(args: OraclePqArgs) -> Result<(), Box<dyn std::error
         let stratum = record.stratum.as_deref().unwrap_or("unknown");
 
         let baseline_hidden =
-            larql_inference::vindex::predict_kquant_hidden(&mut weights, &token_ids, &index, None);
+            larql_inference::vindex::predict_kquant_hidden(&weights, &token_ids, &index, None);
         let baseline_logits = final_logits(&weights, &baseline_hidden);
         let baseline_logp = log_softmax(&baseline_logits);
         let baseline_top1 = argmax(&baseline_logits);

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_address.rs b/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_address.rs
@@ -1314,7 +1314,11 @@ where
                     if let Some(qk_rank) = reduced_qk_rank {
                         let (_, pre_o, all_weights) =
                             run_attention_block_with_pre_o_and_reduced_qk_attention_weights(
-                                weights, &h, layer, None, qk_rank,
+                                larql_models::WeightsView::dense(weights),
+                                &h,
+                                layer,
+                                None,
+                                qk_rank,
                             )
                             .ok_or_else(|| {
                                 format!(
@@ -1325,16 +1329,23 @@ where
                     } else {
                         let (_, pre_o, all_weights) =
                             run_attention_block_with_pre_o_and_all_attention_weights(
-                                weights, &h, layer, None,
+                                larql_models::WeightsView::dense(weights),
+                                &h,
+                                layer,
+                                None,
                             )
                             .ok_or_else(|| {
                                 format!("pre-W_O/all-attention capture failed at layer {layer}")
                             })?;
                         (pre_o, Some(all_weights))
                     }
                 } else {
-                    let (_, pre_o) = run_attention_block_with_pre_o(weights, &h, layer)
-                        .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                    let (_, pre_o) = run_attention_block_with_pre_o(
+                        larql_models::WeightsView::dense(weights),
+                        &h,
+                        layer,
+                    )
+                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                     (pre_o, None)
                 };
                 let (pre_o, all_weights) = capture;
@@ -1405,7 +1416,7 @@ where
             {
                 let ffn = WeightFfn { weights };
                 if let Some((h_new, activation, _)) = run_layer_with_ffn(
-                    weights,
+                    larql_inference::WeightsView::dense(weights),
                     &h,
                     layer,
                     &ffn,

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_fit.rs b/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_fit.rs
@@ -56,8 +56,12 @@ pub(super) fn fit_pq_codebooks(
         for layer in 0..weights.num_layers {
             let inserted = insert_q4k_layer_tensors(weights, index, layer)?;
             if let Some(layer_heads) = heads_by_layer.get(&layer) {
-                let (_, pre_o) = run_attention_block_with_pre_o(weights, &h, layer)
-                    .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
+                let (_, pre_o) = run_attention_block_with_pre_o(
+                    larql_models::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                )
+                .ok_or_else(|| format!("pre-W_O capture failed at layer {layer}"))?;
                 let head_dim = weights.arch.head_dim_for_layer(layer);
                 for head in layer_heads {
                     let basis = bases.get(head).expect("basis pre-created for PQ fit");
@@ -102,9 +106,15 @@ pub(super) fn fit_pq_codebooks(
 
             {
                 let ffn = WeightFfn { weights };
-                if let Some((h_new, _, _)) =
-                    run_layer_with_ffn(weights, &h, layer, &ffn, false, ple_inputs.get(layer), None)
-                {
+                if let Some((h_new, _, _)) = run_layer_with_ffn(
+                    larql_inference::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                    &ffn,
+                    false,
+                    ple_inputs.get(layer),
+                    None,
+                ) {
                     h = h_new;
                 }
             }

diff --git a/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_forward.rs b/crates/larql-cli/src/commands/dev/ov_rd/oracle_pq_forward.rs
@@ -189,7 +189,7 @@ pub(super) fn capture_layer_input_hidden(
                 .and_then(|src| kv_cache.get(&src));
             let ffn = WeightFfn { weights };
             run_layer_with_ffn(
-                weights,
+                larql_inference::WeightsView::dense(weights),
                 &h,
                 layer,
                 &ffn,
@@ -239,7 +239,7 @@ pub(super) fn capture_prev_ffn_feature_keys(
                 .and_then(|src| kv_cache.get(&src));
             let ffn = WeightFfn { weights };
             run_layer_with_ffn(
-                weights,
+                larql_inference::WeightsView::dense(weights),
                 &h,
                 layer,
                 &ffn,
@@ -305,7 +305,7 @@ pub(super) fn capture_ffn_first_feature_keys(
                 .and_then(|src| kv_cache.get(&src));
             let ffn = WeightFfn { weights };
             run_layer_with_ffn(
-                weights,
+                larql_inference::WeightsView::dense(weights),
                 &h,
                 layer,
                 &ffn,
@@ -348,7 +348,10 @@ pub(super) fn capture_attention_relation_rows(
                 .kv_shared_source_layer(layer)
                 .and_then(|src| kv_cache.get(&src));
             let (_, _, all_weights) = run_attention_block_with_pre_o_and_all_attention_weights(
-                weights, &h, layer, shared_kv,
+                larql_models::WeightsView::dense(weights),
+                &h,
+                layer,
+                shared_kv,
             )
             .ok_or_else(|| {
                 format!(
@@ -369,7 +372,7 @@ pub(super) fn capture_attention_relation_rows(
                 .and_then(|src| kv_cache.get(&src));
             let ffn = WeightFfn { weights };
             run_layer_with_ffn(
-                weights,
+                larql_inference::WeightsView::dense(weights),
                 &h,
                 layer,
                 &ffn,
@@ -414,7 +417,11 @@ pub(super) fn capture_reduced_qk_attention_rows(
                 .and_then(|src| kv_cache.get(&src));
             let (_, _, all_weights) =
                 run_attention_block_with_pre_o_and_reduced_qk_attention_weights(
-                    weights, &h, layer, shared_kv, qk_rank,
+                    larql_models::WeightsView::dense(weights),
+                    &h,
+                    layer,
+                    shared_kv,
+                    qk_rank,
                 )
                 .ok_or_else(|| {
                     format!(
@@ -439,7 +446,7 @@ pub(super) fn capture_reduced_qk_attention_rows(
                 .and_then(|src| kv_cache.get(&src));
             let ffn = WeightFfn { weights };
             run_layer_with_ffn(
-                weights,
+                larql_inference::WeightsView::dense(weights),
                 &h,
                 layer,
                 &ffn,