From ba510ab85c6024ac7e9507bff482076f8e1df9ba Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:10:51 +0100
Subject: [PATCH 01/12] avoid heap allocations in `poseidon_sponge`

In the 2^32 benchmark this avoids about 500k temporary heap allocations
during key generation over a run of about 30s.

The allocations likely only have a small performance cost, but we can
avoid them without making the code much more complicated.
---
 src/symmetric/tweak_hash/poseidon.rs | 45 ++++++++++++++++++----------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 7ab2d7b..775e1e8 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -201,34 +201,47 @@ where
     );
     let rate = WIDTH - capacity_value.len();
 
-    let extra_elements = (rate - (input.len() % rate)) % rate;
-    let mut input_vector = input.to_vec();
-    // We pad the input with zeros to make its length a multiple of the rate.
-    //
-    // This is safe because the input's original length is effectively encoded
-    // in the `capacity_value`, which serves as a domain separator.
-    input_vector.resize(input.len() + extra_elements, A::ZERO);
-
     // initialize
     let mut state = [A::ZERO; WIDTH];
     state[rate..].copy_from_slice(capacity_value);
 
-    // absorb
-    for chunk in input_vector.chunks(rate) {
+    let extra_elements = (rate - (input.len() % rate)) % rate;
+    // Instead of converting the input to a vector, resizing and feeding the data into the
+    // sponge, we instead fill in the vector from all chunks until we are left with a non
+    // full chunk. We only add to the state, so padded data does not mutate `state` at all.
+
+    // 1. fill in all full chunks and permute
+    let mut it = input.chunks_exact(rate);
+    for chunk in &mut it {
+        //input.chunks_exact(rate) {
+        // iterate the chunks
         for i in 0..chunk.len() {
             state[i] += chunk[i];
         }
         perm.permute_mut(&mut state);
     }
+    // 2. absorb the trailing partial chunk, if any
+    // `extra_elements > 0` holds exactly when `input.len()` is not a multiple of `rate`.
+    if extra_elements > 0 {
+        for (i, x) in it.remainder().iter().enumerate() {
+            state[i] += *x;
+        }
+        // Implicit zero padding: absorbing only *adds* to `state`, so padding zeros are no-ops.
+        perm.permute_mut(&mut state);
+    }
 
     // squeeze
-    let mut out = vec![];
-    while out.len() < OUT_LEN {
-        out.extend_from_slice(&state[..rate]);
-        perm.permute_mut(&mut state);
+    let mut out = [A::ZERO; OUT_LEN];
+    let mut out_idx = 0;
+    while out_idx < OUT_LEN {
+        let chunk_size = (OUT_LEN - out_idx).min(rate);
+        out[out_idx..out_idx + chunk_size].copy_from_slice(&state[..chunk_size]);
+        out_idx += chunk_size;
+        if out_idx < OUT_LEN {
+            perm.permute_mut(&mut state);
+        }
     }
-    let slice = &out[0..OUT_LEN];
-    slice.try_into().expect("Length mismatch")
+    out
 }
 
 /// A tweakable hash function implemented using Poseidon2

From e3e5ceb234da7063803e380ba32d3b521efa54d3 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:12:02 +0100
Subject: [PATCH 02/12] avoid heap allocations in `compute_tree_leaves` by
 using `for_each_init`

Each rayon worker job had to allocate the full `packed_leaf_input`. We
now use `for_each_init` to preallocate a vector for every rayon worker
instead. We overwrite the entire vector in every job, so there is not
even a need to `fill(0)` the vector in each job.

This drops another ~100k allocations when running the 2^32 bench over
30s.

Brings us down to only 3k temporary allocations total in that time frame.
---
 src/symmetric/tweak_hash/poseidon.rs | 235 ++++++++++++++-------------
 1 file changed, 123 insertions(+), 112 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 775e1e8..50005a1 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -505,6 +505,11 @@ impl<
         let capacity_val: [PackedF; CAPACITY] =
             poseidon_safe_domain_separator::<CAPACITY>(&sponge_perm, &lengths).map(PackedF::from);
 
+        // Compute sponge input length. Required to init packed input vector for each rayon worker
+        let sponge_tweak_offset = PARAMETER_LEN;
+        let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
+        let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+
         // PARALLEL SIMD PROCESSING
         //
         // Process epochs in batches of size `width`.
@@ -513,126 +518,132 @@ impl<
         epochs
             .par_chunks_exact(width)
             .zip(leaves.par_chunks_exact_mut(width))
-            .for_each(|(epoch_chunk, leaves_chunk)| {
-                // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
-                //
-                // For each chain, generate starting points for all epochs in the chunk.
-                // Use vertical packing: transpose from [lane][element] to [element][lane].
-                //
-                // This layout enables efficient SIMD operations across epochs.
-
-                let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
-                    array::from_fn(|c_idx| {
-                        // Generate starting points for this chain across all epochs.
-                        let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into()
+            .for_each_init(
+                || vec![PackedF::ZERO; sponge_input_len],
+                |packed_leaf_input, (epoch_chunk, leaves_chunk)| {
+                    // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
+                    //
+                    // For each chain, generate starting points for all epochs in the chunk.
+                    // Use vertical packing: transpose from [lane][element] to [element][lane].
+                    //
+                    // This layout enables efficient SIMD operations across epochs.
+
+                    let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
+                        array::from_fn(|c_idx| {
+                            // Generate starting points for this chain across all epochs.
+                            let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
+                                PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
+                                    .into()
+                            });
+
+                            // Transpose to vertical packing for SIMD efficiency.
+                            pack_array(&starts)
                         });
 
-                        // Transpose to vertical packing for SIMD efficiency.
-                        pack_array(&starts)
-                    });
-
-                // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
-                //
-                // For each chain, walk all epochs simultaneously using SIMD.
-                // The chains start at their initial values and are walked step-by-step
-                // until they reach their endpoints.
-                //
-                // Cache strategy: process one chain at a time to maximize locality.
-                // All epochs for that chain stay in registers across iterations.
-
-                // Offsets for chain compression: [parameter | tweak | current_value]
-                let chain_tweak_offset = PARAMETER_LEN;
-                let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
-
-                for (chain_index, packed_chain) in
-                    packed_chains.iter_mut().enumerate().take(num_chains)
-                {
-                    // Walk this chain for `chain_length - 1` steps.
-                    // The starting point is step 0, so we need `chain_length - 1` iterations.
-                    for step in 0..chain_length - 1 {
-                        // Current position in the chain.
-                        let pos = (step + 1) as u8;
-
-                        // Assemble the packed input for the hash function.
-                        // Layout: [parameter | tweak | current_value]
-                        let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-
-                        // Copy pre-packed parameter
-                        packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                        // Pack tweaks directly into destination
-                        pack_fn_into::<TWEAK_LEN>(
-                            &mut packed_input,
-                            chain_tweak_offset,
-                            |t_idx, lane| {
-                                Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                                    .to_field_elements::<TWEAK_LEN>()[t_idx]
-                            },
-                        );
-
-                        // Copy current chain value (already packed)
-                        packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
-                            .copy_from_slice(packed_chain);
-
-                        // Apply the hash function to advance the chain.
-                        // This single call processes all epochs in parallel.
-                        *packed_chain =
-                            poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                                &chain_perm,
-                                &packed_input,
+                    // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
+                    //
+                    // For each chain, walk all epochs simultaneously using SIMD.
+                    // The chains start at their initial values and are walked step-by-step
+                    // until they reach their endpoints.
+                    //
+                    // Cache strategy: process one chain at a time to maximize locality.
+                    // All epochs for that chain stay in registers across iterations.
+
+                    // Offsets for chain compression: [parameter | tweak | current_value]
+                    let chain_tweak_offset = PARAMETER_LEN;
+                    let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
+                    for (chain_index, packed_chain) in
+                        packed_chains.iter_mut().enumerate().take(num_chains)
+                    {
+                        // Walk this chain for `chain_length - 1` steps.
+                        // The starting point is step 0, so we need `chain_length - 1` iterations.
+                        for step in 0..chain_length - 1 {
+                            // Current position in the chain.
+                            let pos = (step + 1) as u8;
+
+                            // Assemble the packed input for the hash function.
+                            // Layout: [parameter | tweak | current_value]
+                            let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
+
+                            // Copy pre-packed parameter
+                            packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                            // Pack tweaks directly into destination
+                            pack_fn_into::<TWEAK_LEN>(
+                                &mut packed_input,
+                                chain_tweak_offset,
+                                |t_idx, lane| {
+                                    Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                                        .to_field_elements::<TWEAK_LEN>()[t_idx]
+                                },
                             );
-                    }
-                }
-
-                // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
-                //
-                // All chains have been walked to their endpoints.
-                // Now hash all chain ends together to form the tree leaf.
-                //
-                // This uses the sponge construction for variable-length input.
 
-                // Assemble the sponge input.
-                // Layout: [parameter | tree_tweak | all_chain_ends]
-                let sponge_tweak_offset = PARAMETER_LEN;
-                let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
-                let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+                            // Copy current chain value (already packed)
+                            packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
+                                .copy_from_slice(packed_chain);
+
+                            // Apply the hash function to advance the chain.
+                            // This single call processes all epochs in parallel.
+                            *packed_chain =
+                                poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                                    &chain_perm,
+                                    &packed_input,
+                                );
+                        }
+                    }
 
-                let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+                    // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
+                    //
+                    // All chains have been walked to their endpoints.
+                    // Now hash all chain ends together to form the tree leaf.
+                    //
+                    // This uses the sponge construction for variable-length input.
+
+                    // Assemble the sponge input.
+                    // Layout: [parameter | tree_tweak | all_chain_ends]
+                    // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                    // vector in each iteration, so no need to `fill(0)`!
+                    //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+                    // Copy pre-packed parameter
+                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                    pack_fn_into::<TWEAK_LEN>(
+                        packed_leaf_input,
+                        sponge_tweak_offset,
+                        |t_idx, lane| {
+                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                                [t_idx]
+                        },
+                    );
 
-                // Copy pre-packed parameter
-                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                pack_fn_into::<TWEAK_LEN>(
-                    &mut packed_leaf_input,
-                    sponge_tweak_offset,
-                    |t_idx, lane| {
-                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
-                            [t_idx]
-                    },
-                );
+                    // Copy all chain ends (already packed)
+                    let dst = &mut packed_leaf_input[sponge_chains_offset
+                        ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
+                    for (dst_chunk, src_chain) in
+                        dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
+                    {
+                        dst_chunk.copy_from_slice(src_chain);
+                    }
 
-                // Copy all chain ends (already packed)
-                let dst = &mut packed_leaf_input[sponge_chains_offset .. sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) {
-                    dst_chunk.copy_from_slice(src_chain);
-                }
-
-                // Apply the sponge hash to produce the leaf.
-                // This absorbs all chain ends and squeezes out the final hash.
-                let packed_leaves = poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                    &sponge_perm,
-                    &capacity_val,
-                    &packed_leaf_input,
-                );
+                    // Apply the sponge hash to produce the leaf.
+                    // This absorbs all chain ends and squeezes out the final hash.
+                    let packed_leaves =
+                        poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                            &sponge_perm,
+                            &capacity_val,
+                            &packed_leaf_input,
+                        );
 
-                // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
-                //
-                // Convert from vertical packing back to scalar layout.
-                // Each lane becomes one leaf in the output slice.
-                unpack_array(&packed_leaves, leaves_chunk);
-            });
+                    // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
+                    //
+                    // Convert from vertical packing back to scalar layout.
+                    // Each lane becomes one leaf in the output slice.
+                    unpack_array(&packed_leaves, leaves_chunk);
+                },
+            );
 
         // HANDLE REMAINDER EPOCHS
         //

From ed2f132e26d73fd280a20693d83dd86e549b3239 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:52:47 +0100
Subject: [PATCH 03/12] alternative implementation using thread local storage

This way we avoid essentially all of the remaining allocations: the
input buffer is allocated exactly once per thread.

`for_each_init` is known to allocate multiple times, because rayon calls
the init closure once per work split (work stealing), not once per thread. See:

https://github.com/rayon-rs/rayon/issues/742
---
 Cargo.toml                           |   2 +
 src/symmetric/tweak_hash/poseidon.rs | 164 ++++++++++++++-------------
 2 files changed, 88 insertions(+), 78 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5962f18..ae304e8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,8 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312"
 p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 
+thread_local = "1.1.9"
+
 [dev-dependencies]
 criterion = "0.7"
 proptest = "1.7"
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 50005a1..4d36f31 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -17,6 +17,8 @@ use crate::{F, PackedF};
 use super::TweakableHash;
 
 use p3_koala_bear::Poseidon2KoalaBear;
+use std::cell::RefCell;
+use thread_local::ThreadLocal;
 
 const DOMAIN_PARAMETERS_LENGTH: usize = 4;
 /// The state width for compressing a single hash in a chain.
@@ -510,6 +512,8 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
+        let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
+
         // PARALLEL SIMD PROCESSING
         //
         // Process epochs in batches of size `width`.
@@ -518,42 +522,46 @@ impl<
         epochs
             .par_chunks_exact(width)
             .zip(leaves.par_chunks_exact_mut(width))
-            .for_each_init(
-                || vec![PackedF::ZERO; sponge_input_len],
-                |packed_leaf_input, (epoch_chunk, leaves_chunk)| {
-                    // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
-                    //
-                    // For each chain, generate starting points for all epochs in the chunk.
-                    // Use vertical packing: transpose from [lane][element] to [element][lane].
-                    //
-                    // This layout enables efficient SIMD operations across epochs.
-
-                    let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
-                        array::from_fn(|c_idx| {
-                            // Generate starting points for this chain across all epochs.
-                            let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                                PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
-                                    .into()
-                            });
-
-                            // Transpose to vertical packing for SIMD efficiency.
-                            pack_array(&starts)
+            .for_each(|(epoch_chunk, leaves_chunk)| {
+                // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
+                //
+                // For each chain, generate starting points for all epochs in the chunk.
+                // Use vertical packing: transpose from [lane][element] to [element][lane].
+                //
+                // This layout enables efficient SIMD operations across epochs.
+
+                let cell = tls.get_or(|| {
+                    RefCell::new(vec![PackedF::ZERO; sponge_input_len])
+                });
+                let mut packed_leaf_input = cell.borrow_mut();
+                // reset not needed
+
+                let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
+                    array::from_fn(|c_idx| {
+                        // Generate starting points for this chain across all epochs.
+                        let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
+                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
+                                .into()
                         });
 
-                    // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
-                    //
-                    // For each chain, walk all epochs simultaneously using SIMD.
-                    // The chains start at their initial values and are walked step-by-step
-                    // until they reach their endpoints.
-                    //
-                    // Cache strategy: process one chain at a time to maximize locality.
-                    // All epochs for that chain stay in registers across iterations.
+                        // Transpose to vertical packing for SIMD efficiency.
+                        pack_array(&starts)
+                    });
 
-                    // Offsets for chain compression: [parameter | tweak | current_value]
-                    let chain_tweak_offset = PARAMETER_LEN;
-                    let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+                // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
+                //
+                // For each chain, walk all epochs simultaneously using SIMD.
+                // The chains start at their initial values and are walked step-by-step
+                // until they reach their endpoints.
+                //
+                // Cache strategy: process one chain at a time to maximize locality.
+                // All epochs for that chain stay in registers across iterations.
 
-                    for (chain_index, packed_chain) in
+                // Offsets for chain compression: [parameter | tweak | current_value]
+                let chain_tweak_offset = PARAMETER_LEN;
+                let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
+                for (chain_index, packed_chain) in
                         packed_chains.iter_mut().enumerate().take(num_chains)
                     {
                         // Walk this chain for `chain_length - 1` steps.
@@ -593,56 +601,56 @@ impl<
                         }
                     }
 
-                    // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
-                    //
-                    // All chains have been walked to their endpoints.
-                    // Now hash all chain ends together to form the tree leaf.
-                    //
-                    // This uses the sponge construction for variable-length input.
-
-                    // Assemble the sponge input.
-                    // Layout: [parameter | tree_tweak | all_chain_ends]
-                    // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
-                    // vector in each iteration, so no need to `fill(0)`!
-                    //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
-
-                    // Copy pre-packed parameter
-                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                    pack_fn_into::<TWEAK_LEN>(
-                        packed_leaf_input,
-                        sponge_tweak_offset,
-                        |t_idx, lane| {
-                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
+                //
+                // All chains have been walked to their endpoints.
+                // Now hash all chain ends together to form the tree leaf.
+                //
+                // This uses the sponge construction for variable-length input.
+
+                // Assemble the sponge input.
+                // Layout: [parameter | tree_tweak | all_chain_ends]
+                // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                // vector in each iteration, so no need to `fill(0)`!
+                //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+                // Copy pre-packed parameter
+                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                pack_fn_into::<TWEAK_LEN>(
+                    &mut packed_leaf_input,
+                    sponge_tweak_offset,
+                    |t_idx, lane| {
+                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                                 [t_idx]
-                        },
-                    );
+                    },
+                );
 
-                    // Copy all chain ends (already packed)
-                    let dst = &mut packed_leaf_input[sponge_chains_offset
+                // Copy all chain ends (already packed)
+                let dst = &mut packed_leaf_input[sponge_chains_offset
                         ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                    for (dst_chunk, src_chain) in
+                for (dst_chunk, src_chain) in
                         dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
                     {
                         dst_chunk.copy_from_slice(src_chain);
                     }
 
-                    // Apply the sponge hash to produce the leaf.
-                    // This absorbs all chain ends and squeezes out the final hash.
-                    let packed_leaves =
-                        poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                            &sponge_perm,
-                            &capacity_val,
-                            &packed_leaf_input,
-                        );
-
-                    // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
-                    //
-                    // Convert from vertical packing back to scalar layout.
-                    // Each lane becomes one leaf in the output slice.
-                    unpack_array(&packed_leaves, leaves_chunk);
-                },
+                // Apply the sponge hash to produce the leaf.
+                // This absorbs all chain ends and squeezes out the final hash.
+                let packed_leaves =
+                    poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &sponge_perm,
+                        &capacity_val,
+                        &packed_leaf_input,
+                    );
+
+                // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
+                //
+                // Convert from vertical packing back to scalar layout.
+                // Each lane becomes one leaf in the output slice.
+                unpack_array(&packed_leaves, leaves_chunk);
+            },
             );
 
         // HANDLE REMAINDER EPOCHS
@@ -1679,13 +1687,13 @@ mod tests {
 
             let parameter = PoseidonTweak44::rand_parameter(&mut rng);
             let children: Vec<_> = (0..num_pairs * 2)
-                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
-                .collect();
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
 
             let simd_result =
-                PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+            PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
             let scalar_result =
-                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
 
             prop_assert_eq!(simd_result.len(), num_pairs);
             prop_assert_eq!(simd_result, scalar_result);

From 595dbe088ca12e7f00bbc6c430b19d6e2b67f4d9 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 16:38:29 +0100
Subject: [PATCH 04/12] avoid heap allocations in 2/3 branches of `apply`

No need for a `Vec` in these two branches as we know at compile time
how much data is required for each input.

This is only relevant if `apply` is part of a hot code path, which is
normally unlikely to be the case. Still, the code does not get
significantly longer, only uglier :(

It gets rid of a large number of allocations when running the 2^8
benchmark case.
---
 src/symmetric/tweak_hash/poseidon.rs | 54 +++++++++++++++-------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 4d36f31..19f4dfe 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -312,36 +312,40 @@ impl<
             [single] => {
                 // we compress parameter, tweak, message
                 let perm = poseidon2_16();
-                let combined_input: Vec<F> = parameter
-                    .iter()
-                    .chain(tweak_fe.iter())
-                    .chain(single.iter())
-                    .copied()
-                    .collect();
-                FieldArray(
-                    poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                        &perm,
-                        &combined_input,
-                    ),
-                )
+
+                // Build input on stack: [parameter | tweak | message]
+                let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH];
+                combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
+                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
+                    .copy_from_slice(&single.0);
+
+                FieldArray(poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                    &perm,
+                    &combined_input,
+                ))
             }
 
             [left, right] => {
                 // we compress parameter, tweak, message (now containing two parts)
                 let perm = poseidon2_24();
-                let combined_input: Vec<F> = parameter
-                    .iter()
-                    .chain(tweak_fe.iter())
-                    .chain(left.iter())
-                    .chain(right.iter())
-                    .copied()
-                    .collect();
-                FieldArray(
-                    poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                        &perm,
-                        &combined_input,
-                    ),
-                )
+
+                // Build input on stack: [parameter | tweak | left | right]
+                let mut combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH];
+                combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
+                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
+                    .copy_from_slice(&left.0);
+                combined_input
+                    [PARAMETER_LEN + TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
+                    .copy_from_slice(&right.0);
+
+                FieldArray(poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                    &perm,
+                    &combined_input,
+                ))
             }
 
             _ if message.len() > 2 => {

From 41d240eeb7aa6875c864341c71a7522c7ec280a0 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 16:50:21 +0100
Subject: [PATCH 05/12] add profiling Cargo profile

Can't hurt to have this in here.
---
 Cargo.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index ae304e8..60002f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,3 +62,7 @@ with-gen-benches-poseidon-top-level = []
 [[bench]]
 name = "benchmark"
 harness = false
+
+[profile.profiling]
+inherits = "release"
+debug = true
\ No newline at end of file

From 816fbbef94711cf81cbac52de2f43df246cc20e2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:27:21 +0100
Subject: [PATCH 06/12] cargo fmt fixes

---
 src/symmetric/tweak_hash/poseidon.rs | 30 +++++++++++++++-------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 19f4dfe..9aaff45 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -316,15 +316,16 @@ impl<
                 // Build input on stack: [parameter | tweak | message]
                 let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH];
                 combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
-                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
-                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe);
                 combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
                     .copy_from_slice(&single.0);
 
-                FieldArray(poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                    &perm,
-                    &combined_input,
-                ))
+                FieldArray(
+                    poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &combined_input,
+                    ),
+                )
             }
 
             [left, right] => {
@@ -334,18 +335,19 @@ impl<
                 // Build input on stack: [parameter | tweak | left | right]
                 let mut combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH];
                 combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
-                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
-                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe);
                 combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
                     .copy_from_slice(&left.0);
-                combined_input
-                    [PARAMETER_LEN + TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
+                combined_input[PARAMETER_LEN + TWEAK_LEN + HASH_LEN
+                    ..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
                     .copy_from_slice(&right.0);
 
-                FieldArray(poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                    &perm,
-                    &combined_input,
-                ))
+                FieldArray(
+                    poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &combined_input,
+                    ),
+                )
             }
 
             _ if message.len() > 2 => {

From 945320812aa862a7e4f87272e57afc91a21df5a5 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:28:49 +0100
Subject: [PATCH 07/12] remove dead line & update comment

---
 src/symmetric/tweak_hash/poseidon.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 9aaff45..8d079f2 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -518,6 +518,8 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
+        // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated
+        // once per thread
         let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
 
         // PARALLEL SIMD PROCESSING
@@ -616,9 +618,8 @@ impl<
 
                 // Assemble the sponge input.
                 // Layout: [parameter | tree_tweak | all_chain_ends]
-                // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire
                 // vector in each iteration, so no need to `fill(0)`!
-                //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
 
                 // Copy pre-packed parameter
                 packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

From a1abd1e49774c91f0395bd2a65e12132c95134ec Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:30:01 +0100
Subject: [PATCH 08/12] fix indentation of inner for loop

Somehow this is a case that cargo fmt has no opinion about.
Earlier, when using `for_each_init`, the indentation was changed,
but this part didn't want to "come back" to what it was before...
---
 src/symmetric/tweak_hash/poseidon.rs | 66 ++++++++++++++--------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 8d079f2..c702d2e 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -570,44 +570,44 @@ impl<
                 let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
 
                 for (chain_index, packed_chain) in
-                        packed_chains.iter_mut().enumerate().take(num_chains)
-                    {
-                        // Walk this chain for `chain_length - 1` steps.
-                        // The starting point is step 0, so we need `chain_length - 1` iterations.
-                        for step in 0..chain_length - 1 {
-                            // Current position in the chain.
-                            let pos = (step + 1) as u8;
-
-                            // Assemble the packed input for the hash function.
-                            // Layout: [parameter | tweak | current_value]
-                            let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-
-                            // Copy pre-packed parameter
-                            packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                            // Pack tweaks directly into destination
-                            pack_fn_into::<TWEAK_LEN>(
-                                &mut packed_input,
-                                chain_tweak_offset,
-                                |t_idx, lane| {
-                                    Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                                        .to_field_elements::<TWEAK_LEN>()[t_idx]
-                                },
-                            );
-
-                            // Copy current chain value (already packed)
-                            packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
-                                .copy_from_slice(packed_chain);
-
-                            // Apply the hash function to advance the chain.
-                            // This single call processes all epochs in parallel.
-                            *packed_chain =
+                    packed_chains.iter_mut().enumerate().take(num_chains)
+                {
+                    // Walk this chain for `chain_length - 1` steps.
+                    // The starting point is step 0, so we need `chain_length - 1` iterations.
+                    for step in 0..chain_length - 1 {
+                        // Current position in the chain.
+                        let pos = (step + 1) as u8;
+
+                        // Assemble the packed input for the hash function.
+                        // Layout: [parameter | tweak | current_value]
+                        let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
+
+                        // Copy pre-packed parameter
+                        packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                        // Pack tweaks directly into destination
+                        pack_fn_into::<TWEAK_LEN>(
+                            &mut packed_input,
+                            chain_tweak_offset,
+                            |t_idx, lane| {
+                                Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                                    .to_field_elements::<TWEAK_LEN>()[t_idx]
+                            },
+                        );
+
+                        // Copy current chain value (already packed)
+                        packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
+                            .copy_from_slice(packed_chain);
+
+                        // Apply the hash function to advance the chain.
+                        // This single call processes all epochs in parallel.
+                        *packed_chain =
                                 poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
                                     &chain_perm,
                                     &packed_input,
                                 );
-                        }
                     }
+                }
 
                 // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
                 //

From 7d7d0aad55a3e9aa4e15672792729370d11c4be2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:35:44 +0100
Subject: [PATCH 09/12] [examples] add two examples for key gen for 2^8 and
 2^32 elements

following the benchmarks for the smallest and largest case
---
 examples/single_keygen.rs      | 24 ++++++++++++++++++++++++
 examples/single_keygen_2_32.rs | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 examples/single_keygen.rs
 create mode 100644 examples/single_keygen_2_32.rs

diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs
new file mode 100644
index 0000000..449c5c2
--- /dev/null
+++ b/examples/single_keygen.rs
@@ -0,0 +1,24 @@
+use std::hint::black_box;
+
+use leansig::signature::{
+    SignatureScheme,
+    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8,
+};
+
+fn main() {
+    let mut rng = rand::rng();
+
+    // 2^8 lifetime, full activation
+    let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize;
+
+    eprintln!("Running single key_gen for 2^8 lifetime...");
+    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen(
+        &mut rng,
+        0,
+        activation_duration,
+    ));
+    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
+
+    // Prevent optimization from removing the key_gen call
+    black_box((pk, sk));
+}
diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs
new file mode 100644
index 0000000..4bc0b39
--- /dev/null
+++ b/examples/single_keygen_2_32.rs
@@ -0,0 +1,33 @@
+use std::hint::black_box;
+
+use leansig::signature::{
+    SignatureScheme,
+    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26,
+};
+
+/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark)
+const MAX_LOG_ACTIVATION_DURATION: usize = 18;
+
+fn main() {
+    let mut rng = rand::rng();
+
+    // 2^32 lifetime, activation capped at 2^18
+    let activation_duration = std::cmp::min(
+        1 << MAX_LOG_ACTIVATION_DURATION,
+        SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize,
+    );
+
+    eprintln!(
+        "Running single key_gen for 2^32 lifetime (activation 2^{})...",
+        MAX_LOG_ACTIVATION_DURATION
+    );
+    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen(
+        &mut rng,
+        0,
+        activation_duration,
+    ));
+    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
+
+    // Prevent optimization from removing the key_gen call
+    black_box((pk, sk));
+}

From d01fa2c708abb89b363fc522fa497e9741449ed2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 16:23:03 +0100
Subject: [PATCH 10/12] use iterator approach when adding chunks to state

---
 src/symmetric/tweak_hash/poseidon.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index c702d2e..5035c10 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -215,10 +215,9 @@ where
     // 1. fill in all full chunks and permute
     let mut it = input.chunks_exact(rate);
     for chunk in &mut it {
-        //input.chunks_exact(rate) {
         // iterate the chunks
-        for i in 0..chunk.len() {
-            state[i] += chunk[i];
+        for (s, &x) in state.iter_mut().take(rate).zip(chunk) {
+            *s += x;
         }
         perm.permute_mut(&mut state);
     }

From ceab87d5e3cb397538db610cc3c63ae28713e76a Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 16:24:41 +0100
Subject: [PATCH 11/12] delete keygen examples / profiling helpers

---
 examples/single_keygen.rs      | 24 ------------------------
 examples/single_keygen_2_32.rs | 33 ---------------------------------
 2 files changed, 57 deletions(-)
 delete mode 100644 examples/single_keygen.rs
 delete mode 100644 examples/single_keygen_2_32.rs

diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs
deleted file mode 100644
index 449c5c2..0000000
--- a/examples/single_keygen.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use std::hint::black_box;
-
-use leansig::signature::{
-    SignatureScheme,
-    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8,
-};
-
-fn main() {
-    let mut rng = rand::rng();
-
-    // 2^8 lifetime, full activation
-    let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize;
-
-    eprintln!("Running single key_gen for 2^8 lifetime...");
-    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen(
-        &mut rng,
-        0,
-        activation_duration,
-    ));
-    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
-
-    // Prevent optimization from removing the key_gen call
-    black_box((pk, sk));
-}
diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs
deleted file mode 100644
index 4bc0b39..0000000
--- a/examples/single_keygen_2_32.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-use std::hint::black_box;
-
-use leansig::signature::{
-    SignatureScheme,
-    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26,
-};
-
-/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark)
-const MAX_LOG_ACTIVATION_DURATION: usize = 18;
-
-fn main() {
-    let mut rng = rand::rng();
-
-    // 2^32 lifetime, activation capped at 2^18
-    let activation_duration = std::cmp::min(
-        1 << MAX_LOG_ACTIVATION_DURATION,
-        SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize,
-    );
-
-    eprintln!(
-        "Running single key_gen for 2^32 lifetime (activation 2^{})...",
-        MAX_LOG_ACTIVATION_DURATION
-    );
-    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen(
-        &mut rng,
-        0,
-        activation_duration,
-    ));
-    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
-
-    // Prevent optimization from removing the key_gen call
-    black_box((pk, sk));
-}

From 1305e33560fb375f2beced732527a934837bc613 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 18:19:26 +0100
Subject: [PATCH 12/12] use stdlib `thread_local!` macro instead of
 thread_local crate

---
 Cargo.toml                           |  2 -
 src/symmetric/tweak_hash/poseidon.rs | 70 ++++++++++++++--------------
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 60002f8..41637e8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,8 +46,6 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312"
 p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 
-thread_local = "1.1.9"
-
 [dev-dependencies]
 criterion = "0.7"
 proptest = "1.7"
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 5035c10..e3e5395 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -18,7 +18,6 @@ use super::TweakableHash;
 
 use p3_koala_bear::Poseidon2KoalaBear;
 use std::cell::RefCell;
-use thread_local::ThreadLocal;
 
 const DOMAIN_PARAMETERS_LENGTH: usize = 4;
 /// The state width for compressing a single hash in a chain.
@@ -517,9 +516,11 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
-        // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated
+        // We use thread-local storage to guarantee the `packed_leaf_input` vector is only allocated
         // once per thread
-        let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
+        thread_local! {
+            static PACKED_LEAF_INPUT: RefCell<Vec<PackedF>> = const { RefCell::new(Vec::new()) };
+        }
 
         // PARALLEL SIMD PROCESSING
         //
@@ -537,18 +538,11 @@ impl<
                 //
                 // This layout enables efficient SIMD operations across epochs.
 
-                let cell = tls.get_or(|| {
-                    RefCell::new(vec![PackedF::ZERO; sponge_input_len])
-                });
-                let mut packed_leaf_input = cell.borrow_mut();
-                // reset not needed
-
                 let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
                     array::from_fn(|c_idx| {
                         // Generate starting points for this chain across all epochs.
                         let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
-                                .into()
+                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into()
                         });
 
                         // Transpose to vertical packing for SIMD efficiency.
@@ -601,10 +595,10 @@ impl<
                         // Apply the hash function to advance the chain.
                         // This single call processes all epochs in parallel.
                         *packed_chain =
-                                poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                                    &chain_perm,
-                                    &packed_input,
-                                );
+                            poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                                &chain_perm,
+                                &packed_input,
+                            );
                     }
                 }
 
@@ -619,45 +613,49 @@ impl<
                 // Layout: [parameter | tree_tweak | all_chain_ends]
                 // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire
                 // vector in each iteration, so no need to `fill(0)`!
+                let packed_leaves = PACKED_LEAF_INPUT.with_borrow_mut(|packed_leaf_input| {
+                    // Resize on first use for this thread
+                    if packed_leaf_input.len() != sponge_input_len {
+                        packed_leaf_input.resize(sponge_input_len, PackedF::ZERO);
+                    }
 
-                // Copy pre-packed parameter
-                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                pack_fn_into::<TWEAK_LEN>(
-                    &mut packed_leaf_input,
-                    sponge_tweak_offset,
-                    |t_idx, lane| {
-                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                    // Copy pre-packed parameter
+                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                    pack_fn_into::<TWEAK_LEN>(
+                        packed_leaf_input,
+                        sponge_tweak_offset,
+                        |t_idx, lane| {
+                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                                 [t_idx]
-                    },
-                );
+                        },
+                    );
 
-                // Copy all chain ends (already packed)
-                let dst = &mut packed_leaf_input[sponge_chains_offset
+                    // Copy all chain ends (already packed)
+                    let dst = &mut packed_leaf_input[sponge_chains_offset
                         ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                for (dst_chunk, src_chain) in
+                    for (dst_chunk, src_chain) in
                         dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
                     {
                         dst_chunk.copy_from_slice(src_chain);
                     }
 
-                // Apply the sponge hash to produce the leaf.
-                // This absorbs all chain ends and squeezes out the final hash.
-                let packed_leaves =
+                    // Apply the sponge hash to produce the leaf.
+                    // This absorbs all chain ends and squeezes out the final hash.
                     poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
                         &sponge_perm,
                         &capacity_val,
-                        &packed_leaf_input,
-                    );
+                        packed_leaf_input,
+                    )
+                });
 
                 // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
                 //
                 // Convert from vertical packing back to scalar layout.
                 // Each lane becomes one leaf in the output slice.
                 unpack_array(&packed_leaves, leaves_chunk);
-            },
-            );
+            });
 
         // HANDLE REMAINDER EPOCHS
         //