From ba510ab85c6024ac7e9507bff482076f8e1df9ba Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 12:10:51 +0100 Subject: [PATCH 01/12] avoid heap allocations in `poseidon_sponge` In the 2^32 benchmark during key generation this avoids about 500k temporary heap allocations when running for about 30s. Likely only a small performance cost, but we can avoid them without making the code much more complicated. --- src/symmetric/tweak_hash/poseidon.rs | 45 ++++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 7ab2d7b..775e1e8 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -201,34 +201,47 @@ where ); let rate = WIDTH - capacity_value.len(); - let extra_elements = (rate - (input.len() % rate)) % rate; - let mut input_vector = input.to_vec(); - // We pad the input with zeros to make its length a multiple of the rate. - // - // This is safe because the input's original length is effectively encoded - // in the `capacity_value`, which serves as a domain separator. - input_vector.resize(input.len() + extra_elements, A::ZERO); - // initialize let mut state = [A::ZERO; WIDTH]; state[rate..].copy_from_slice(capacity_value); - // absorb - for chunk in input_vector.chunks(rate) { + let extra_elements = (rate - (input.len() % rate)) % rate; + // Instead of converting the input to a vector, resizing and feeding the data into the + // sponge, we instead fill in the vector from all chunks until we are left with a non + // full chunk. We only add to the state, so padded data does not mutate `state` at all. + + // 1. fill in all full chunks and permute + let mut it = input.chunks_exact(rate); + for chunk in &mut it { + //input.chunks_exact(rate) { + // iterate the chunks for i in 0..chunk.len() { state[i] += chunk[i]; } perm.permute_mut(&mut state); } + // 2. 
fill the remainder and extend with zeros + let remainder = rate - extra_elements; + if remainder > 0 { + for (i, x) in it.remainder().iter().enumerate() { + state[i] += *x; + } + // was a remainder, so permute. No need to mutate `state` as we *add* only anyway + perm.permute_mut(&mut state); + } // squeeze - let mut out = vec![]; - while out.len() < OUT_LEN { - out.extend_from_slice(&state[..rate]); - perm.permute_mut(&mut state); + let mut out = [A::ZERO; OUT_LEN]; + let mut out_idx = 0; + while out_idx < OUT_LEN { + let chunk_size = (OUT_LEN - out_idx).min(rate); + out[out_idx..out_idx + chunk_size].copy_from_slice(&state[..chunk_size]); + out_idx += chunk_size; + if out_idx < OUT_LEN { + perm.permute_mut(&mut state); + } } - let slice = &out[0..OUT_LEN]; - slice.try_into().expect("Length mismatch") + out } /// A tweakable hash function implemented using Poseidon2 From e3e5ceb234da7063803e380ba32d3b521efa54d3 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 12:12:02 +0100 Subject: [PATCH 02/12] avoid heap allocations in `compute_tree_leaves` by using `for_each_init` Each rayon worker job had to allocate the full `packed_leaf_input`. We now use `for_each_init` to preallocate a vector for every Rayon worker instead. We overwrite the entire vector in every job, so not even a need to `fill(0)` the vector in each job. This drops another ~100k allocations when running the 2^32 bench over 30s. Brings us down to only 3k temporary allocations total in that time frame. 
--- src/symmetric/tweak_hash/poseidon.rs | 235 ++++++++++++++------------- 1 file changed, 123 insertions(+), 112 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 775e1e8..50005a1 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -505,6 +505,11 @@ impl< let capacity_val: [PackedF; CAPACITY] = poseidon_safe_domain_separator::(&sponge_perm, &lengths).map(PackedF::from); + // Compute sponge input length. Required to init packed input vector for each rayon worker + let sponge_tweak_offset = PARAMETER_LEN; + let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN; + let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN; + // PARALLEL SIMD PROCESSING // // Process epochs in batches of size `width`. @@ -513,126 +518,132 @@ impl< epochs .par_chunks_exact(width) .zip(leaves.par_chunks_exact_mut(width)) - .for_each(|(epoch_chunk, leaves_chunk)| { - // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS - // - // For each chain, generate starting points for all epochs in the chunk. - // Use vertical packing: transpose from [lane][element] to [element][lane]. - // - // This layout enables efficient SIMD operations across epochs. - - let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] = - array::from_fn(|c_idx| { - // Generate starting points for this chain across all epochs. - let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| { - PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into() + .for_each_init( + || vec![PackedF::ZERO; sponge_input_len], + |packed_leaf_input, (epoch_chunk, leaves_chunk)| { + // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS + // + // For each chain, generate starting points for all epochs in the chunk. + // Use vertical packing: transpose from [lane][element] to [element][lane]. + // + // This layout enables efficient SIMD operations across epochs. 
+ + let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] = + array::from_fn(|c_idx| { + // Generate starting points for this chain across all epochs. + let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| { + PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64) + .into() + }); + + // Transpose to vertical packing for SIMD efficiency. + pack_array(&starts) }); - // Transpose to vertical packing for SIMD efficiency. - pack_array(&starts) - }); - - // STEP 2: WALK CHAINS IN PARALLEL USING SIMD - // - // For each chain, walk all epochs simultaneously using SIMD. - // The chains start at their initial values and are walked step-by-step - // until they reach their endpoints. - // - // Cache strategy: process one chain at a time to maximize locality. - // All epochs for that chain stay in registers across iterations. - - // Offsets for chain compression: [parameter | tweak | current_value] - let chain_tweak_offset = PARAMETER_LEN; - let chain_value_offset = PARAMETER_LEN + TWEAK_LEN; - - for (chain_index, packed_chain) in - packed_chains.iter_mut().enumerate().take(num_chains) - { - // Walk this chain for `chain_length - 1` steps. - // The starting point is step 0, so we need `chain_length - 1` iterations. - for step in 0..chain_length - 1 { - // Current position in the chain. - let pos = (step + 1) as u8; - - // Assemble the packed input for the hash function. 
- // Layout: [parameter | tweak | current_value] - let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH]; - - // Copy pre-packed parameter - packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); - - // Pack tweaks directly into destination - pack_fn_into::( - &mut packed_input, - chain_tweak_offset, - |t_idx, lane| { - Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos) - .to_field_elements::()[t_idx] - }, - ); - - // Copy current chain value (already packed) - packed_input[chain_value_offset..chain_value_offset + HASH_LEN] - .copy_from_slice(packed_chain); - - // Apply the hash function to advance the chain. - // This single call processes all epochs in parallel. - *packed_chain = - poseidon_compress::( - &chain_perm, - &packed_input, + // STEP 2: WALK CHAINS IN PARALLEL USING SIMD + // + // For each chain, walk all epochs simultaneously using SIMD. + // The chains start at their initial values and are walked step-by-step + // until they reach their endpoints. + // + // Cache strategy: process one chain at a time to maximize locality. + // All epochs for that chain stay in registers across iterations. + + // Offsets for chain compression: [parameter | tweak | current_value] + let chain_tweak_offset = PARAMETER_LEN; + let chain_value_offset = PARAMETER_LEN + TWEAK_LEN; + + for (chain_index, packed_chain) in + packed_chains.iter_mut().enumerate().take(num_chains) + { + // Walk this chain for `chain_length - 1` steps. + // The starting point is step 0, so we need `chain_length - 1` iterations. + for step in 0..chain_length - 1 { + // Current position in the chain. + let pos = (step + 1) as u8; + + // Assemble the packed input for the hash function. 
+ // Layout: [parameter | tweak | current_value] + let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH]; + + // Copy pre-packed parameter + packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); + + // Pack tweaks directly into destination + pack_fn_into::( + &mut packed_input, + chain_tweak_offset, + |t_idx, lane| { + Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos) + .to_field_elements::()[t_idx] + }, ); - } - } - - // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES - // - // All chains have been walked to their endpoints. - // Now hash all chain ends together to form the tree leaf. - // - // This uses the sponge construction for variable-length input. - // Assemble the sponge input. - // Layout: [parameter | tree_tweak | all_chain_ends] - let sponge_tweak_offset = PARAMETER_LEN; - let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN; - let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN; + // Copy current chain value (already packed) + packed_input[chain_value_offset..chain_value_offset + HASH_LEN] + .copy_from_slice(packed_chain); + + // Apply the hash function to advance the chain. + // This single call processes all epochs in parallel. + *packed_chain = + poseidon_compress::( + &chain_perm, + &packed_input, + ); + } + } - let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len]; + // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES + // + // All chains have been walked to their endpoints. + // Now hash all chain ends together to form the tree leaf. + // + // This uses the sponge construction for variable-length input. + + // Assemble the sponge input. + // Layout: [parameter | tree_tweak | all_chain_ends] + // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire + // vector in each iteration, so no need to `fill(0)`! 
+ //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len]; + + // Copy pre-packed parameter + packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); + + // Pack tree tweaks directly (level 0 for bottom-layer leaves) + pack_fn_into::( + packed_leaf_input, + sponge_tweak_offset, + |t_idx, lane| { + Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() + [t_idx] + }, + ); - // Copy pre-packed parameter - packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); - - // Pack tree tweaks directly (level 0 for bottom-layer leaves) - pack_fn_into::( - &mut packed_leaf_input, - sponge_tweak_offset, - |t_idx, lane| { - Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() - [t_idx] - }, - ); + // Copy all chain ends (already packed) + let dst = &mut packed_leaf_input[sponge_chains_offset + ..sponge_chains_offset + packed_chains.len() * HASH_LEN]; + for (dst_chunk, src_chain) in + dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) + { + dst_chunk.copy_from_slice(src_chain); + } - // Copy all chain ends (already packed) - let dst = &mut packed_leaf_input[sponge_chains_offset .. sponge_chains_offset + packed_chains.len() * HASH_LEN]; - for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) { - dst_chunk.copy_from_slice(src_chain); - } - - // Apply the sponge hash to produce the leaf. - // This absorbs all chain ends and squeezes out the final hash. - let packed_leaves = poseidon_sponge::( - &sponge_perm, - &capacity_val, - &packed_leaf_input, - ); + // Apply the sponge hash to produce the leaf. + // This absorbs all chain ends and squeezes out the final hash. + let packed_leaves = + poseidon_sponge::( + &sponge_perm, + &capacity_val, + &packed_leaf_input, + ); - // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION - // - // Convert from vertical packing back to scalar layout. - // Each lane becomes one leaf in the output slice. 
- unpack_array(&packed_leaves, leaves_chunk); - }); + // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION + // + // Convert from vertical packing back to scalar layout. + // Each lane becomes one leaf in the output slice. + unpack_array(&packed_leaves, leaves_chunk); + }, + ); // HANDLE REMAINDER EPOCHS // From ed2f132e26d73fd280a20693d83dd86e549b3239 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 12:52:47 +0100 Subject: [PATCH 03/12] alternative implementation using thread local storage This way we essentially avoid all allocations, i.e. we get a single allocation per thread. `for_each_init` is known to allocate multiple times due to the rayon work stealing / splitting approach. See: https://github.com/rayon-rs/rayon/issues/742 --- Cargo.toml | 2 + src/symmetric/tweak_hash/poseidon.rs | 164 ++++++++++++++------------- 2 files changed, 88 insertions(+), 78 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5962f18..ae304e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,8 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" } p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" } +thread_local = "1.1.9" + [dev-dependencies] criterion = "0.7" proptest = "1.7" diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 50005a1..4d36f31 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -17,6 +17,8 @@ use crate::{F, PackedF}; use super::TweakableHash; use p3_koala_bear::Poseidon2KoalaBear; +use std::cell::RefCell; +use thread_local::ThreadLocal; const DOMAIN_PARAMETERS_LENGTH: usize = 4; /// The state width for compressing a single hash in a chain. 
@@ -510,6 +512,8 @@ impl< let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN; let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN; + let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new(); + // PARALLEL SIMD PROCESSING // // Process epochs in batches of size `width`. @@ -518,42 +522,46 @@ impl< epochs .par_chunks_exact(width) .zip(leaves.par_chunks_exact_mut(width)) - .for_each_init( - || vec![PackedF::ZERO; sponge_input_len], - |packed_leaf_input, (epoch_chunk, leaves_chunk)| { - // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS - // - // For each chain, generate starting points for all epochs in the chunk. - // Use vertical packing: transpose from [lane][element] to [element][lane]. - // - // This layout enables efficient SIMD operations across epochs. - - let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] = - array::from_fn(|c_idx| { - // Generate starting points for this chain across all epochs. - let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| { - PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64) - .into() - }); - - // Transpose to vertical packing for SIMD efficiency. - pack_array(&starts) + .for_each(|(epoch_chunk, leaves_chunk)| { + // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS + // + // For each chain, generate starting points for all epochs in the chunk. + // Use vertical packing: transpose from [lane][element] to [element][lane]. + // + // This layout enables efficient SIMD operations across epochs. + + let cell = tls.get_or(|| { + RefCell::new(vec![PackedF::ZERO; sponge_input_len]) + }); + let mut packed_leaf_input = cell.borrow_mut(); + // reset not needed + + let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] = + array::from_fn(|c_idx| { + // Generate starting points for this chain across all epochs.
+ let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| { + PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64) + .into() }); - // STEP 2: WALK CHAINS IN PARALLEL USING SIMD - // - // For each chain, walk all epochs simultaneously using SIMD. - // The chains start at their initial values and are walked step-by-step - // until they reach their endpoints. - // - // Cache strategy: process one chain at a time to maximize locality. - // All epochs for that chain stay in registers across iterations. + // Transpose to vertical packing for SIMD efficiency. + pack_array(&starts) + }); - // Offsets for chain compression: [parameter | tweak | current_value] - let chain_tweak_offset = PARAMETER_LEN; - let chain_value_offset = PARAMETER_LEN + TWEAK_LEN; + // STEP 2: WALK CHAINS IN PARALLEL USING SIMD + // + // For each chain, walk all epochs simultaneously using SIMD. + // The chains start at their initial values and are walked step-by-step + // until they reach their endpoints. + // + // Cache strategy: process one chain at a time to maximize locality. + // All epochs for that chain stay in registers across iterations. - for (chain_index, packed_chain) in + // Offsets for chain compression: [parameter | tweak | current_value] + let chain_tweak_offset = PARAMETER_LEN; + let chain_value_offset = PARAMETER_LEN + TWEAK_LEN; + + for (chain_index, packed_chain) in packed_chains.iter_mut().enumerate().take(num_chains) { // Walk this chain for `chain_length - 1` steps. @@ -593,56 +601,56 @@ impl< } } - // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES - // - // All chains have been walked to their endpoints. - // Now hash all chain ends together to form the tree leaf. - // - // This uses the sponge construction for variable-length input. - - // Assemble the sponge input. - // Layout: [parameter | tree_tweak | all_chain_ends] - // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire - // vector in each iteration, so no need to `fill(0)`! 
- //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len]; - - // Copy pre-packed parameter - packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); - - // Pack tree tweaks directly (level 0 for bottom-layer leaves) - pack_fn_into::( - packed_leaf_input, - sponge_tweak_offset, - |t_idx, lane| { - Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() + // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES + // + // All chains have been walked to their endpoints. + // Now hash all chain ends together to form the tree leaf. + // + // This uses the sponge construction for variable-length input. + + // Assemble the sponge input. + // Layout: [parameter | tree_tweak | all_chain_ends] + // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire + // vector in each iteration, so no need to `fill(0)`! + //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len]; + + // Copy pre-packed parameter + packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); + + // Pack tree tweaks directly (level 0 for bottom-layer leaves) + pack_fn_into::( + &mut packed_leaf_input, + sponge_tweak_offset, + |t_idx, lane| { + Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() [t_idx] - }, - ); + }, + ); - // Copy all chain ends (already packed) - let dst = &mut packed_leaf_input[sponge_chains_offset + // Copy all chain ends (already packed) + let dst = &mut packed_leaf_input[sponge_chains_offset ..sponge_chains_offset + packed_chains.len() * HASH_LEN]; - for (dst_chunk, src_chain) in + for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) { dst_chunk.copy_from_slice(src_chain); } - // Apply the sponge hash to produce the leaf. - // This absorbs all chain ends and squeezes out the final hash. 
- let packed_leaves = - poseidon_sponge::( - &sponge_perm, - &capacity_val, - &packed_leaf_input, - ); - - // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION - // - // Convert from vertical packing back to scalar layout. - // Each lane becomes one leaf in the output slice. - unpack_array(&packed_leaves, leaves_chunk); - }, + // Apply the sponge hash to produce the leaf. + // This absorbs all chain ends and squeezes out the final hash. + let packed_leaves = + poseidon_sponge::( + &sponge_perm, + &capacity_val, + &packed_leaf_input, + ); + + // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION + // + // Convert from vertical packing back to scalar layout. + // Each lane becomes one leaf in the output slice. + unpack_array(&packed_leaves, leaves_chunk); + }, ); // HANDLE REMAINDER EPOCHS @@ -1679,13 +1687,13 @@ mod tests { let parameter = PoseidonTweak44::rand_parameter(&mut rng); let children: Vec<_> = (0..num_pairs * 2) - .map(|_| PoseidonTweak44::rand_domain(&mut rng)) - .collect(); + .map(|_| PoseidonTweak44::rand_domain(&mut rng)) + .collect(); let simd_result = - PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children); + PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children); let scalar_result = - compute_tree_layer_scalar::(&parameter, level, parent_start, &children); + compute_tree_layer_scalar::(&parameter, level, parent_start, &children); prop_assert_eq!(simd_result.len(), num_pairs); prop_assert_eq!(simd_result, scalar_result); From 595dbe088ca12e7f00bbc6c430b19d6e2b67f4d9 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 16:38:29 +0100 Subject: [PATCH 04/12] avoid heap allocations in 2/3 branches of `apply` No need for a `Vec` in these two branches as we know at compile time how much data is required for each input. Only relevant if `apply` is part of a hot code path, which normally is unlikely to be the case.
Still, the code is not significantly longer, only uglier :( It gets rid of a large number of allocations when running the 2^8 benchmark case. --- src/symmetric/tweak_hash/poseidon.rs | 54 +++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 4d36f31..19f4dfe 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -312,36 +312,40 @@ impl< [single] => { // we compress parameter, tweak, message let perm = poseidon2_16(); - let combined_input: Vec<F> = parameter - .iter() - .chain(tweak_fe.iter()) - .chain(single.iter()) - .copied() - .collect(); - FieldArray( - poseidon_compress::( - &perm, - &combined_input, - ), - ) + + // Build input on stack: [parameter | tweak | message] + let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH]; + combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0); + combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN] + .copy_from_slice(&tweak_fe); + combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN] + .copy_from_slice(&single.0); + + FieldArray(poseidon_compress::( + &perm, + &combined_input, + )) } [left, right] => { // we compress parameter, tweak, message (now containing two parts) let perm = poseidon2_24(); - let combined_input: Vec<F> = parameter - .iter() - .chain(tweak_fe.iter()) - .chain(left.iter()) - .chain(right.iter()) - .copied() - .collect(); - FieldArray( - poseidon_compress::( - &perm, - &combined_input, - ), - ) + + // Build input on stack: [parameter | tweak | left | right] + let mut combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH]; + combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0); + combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN] + .copy_from_slice(&tweak_fe); + combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN] + .copy_from_slice(&left.0); + combined_input + [PARAMETER_LEN +
TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN] + .copy_from_slice(&right.0); + + FieldArray(poseidon_compress::( + &perm, + &combined_input, + )) } _ if message.len() > 2 => { From 41d240eeb7aa6875c864341c71a7522c7ec280a0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 16:50:21 +0100 Subject: [PATCH 05/12] add profiling Cargo profile Can't hurt to have this in here. --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index ae304e8..60002f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,3 +62,7 @@ with-gen-benches-poseidon-top-level = [] [[bench]] name = "benchmark" harness = false + +[profile.profiling] +inherits = "release" +debug = true \ No newline at end of file From 816fbbef94711cf81cbac52de2f43df246cc20e2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 17:27:21 +0100 Subject: [PATCH 06/12] cargo fmt fixes --- src/symmetric/tweak_hash/poseidon.rs | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 19f4dfe..9aaff45 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -316,15 +316,16 @@ impl< // Build input on stack: [parameter | tweak | message] let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH]; combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0); - combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN] - .copy_from_slice(&tweak_fe); + combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe); combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN] .copy_from_slice(&single.0); - FieldArray(poseidon_compress::( - &perm, - &combined_input, - )) + FieldArray( + poseidon_compress::( + &perm, + &combined_input, + ), + ) } [left, right] => { @@ -334,18 +335,19 @@ impl< // Build input on stack: [parameter | tweak | left | right] let mut
combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH]; combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0); - combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN] - .copy_from_slice(&tweak_fe); + combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe); combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN] .copy_from_slice(&left.0); - combined_input - [PARAMETER_LEN + TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN] + combined_input[PARAMETER_LEN + TWEAK_LEN + HASH_LEN + ..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN] .copy_from_slice(&right.0); - FieldArray(poseidon_compress::( - &perm, - &combined_input, - )) + FieldArray( + poseidon_compress::( + &perm, + &combined_input, + ), + ) } _ if message.len() > 2 => { From 945320812aa862a7e4f87272e57afc91a21df5a5 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 17:28:49 +0100 Subject: [PATCH 07/12] remove dead line & update comment --- src/symmetric/tweak_hash/poseidon.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 9aaff45..8d079f2 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -518,6 +518,8 @@ impl< let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN; let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN; + // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated + // once per thread let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new(); // PARALLEL SIMD PROCESSING @@ -616,9 +618,8 @@ impl< // Assemble the sponge input. // Layout: [parameter | tree_tweak | all_chain_ends] - // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire + // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire // vector in each iteration, so no need to `fill(0)`!
- //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len]; // Copy pre-packed parameter packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); From a1abd1e49774c91f0395bd2a65e12132c95134ec Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 17:30:01 +0100 Subject: [PATCH 08/12] fix indentation of inner for loop Somehow this is a case where cargo fmt has no opinion about it. Earlier when using `for_each_init` the indentation was changed, but this part didn't want to "come back" to what it was before... --- src/symmetric/tweak_hash/poseidon.rs | 66 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 8d079f2..c702d2e 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -570,44 +570,44 @@ impl< let chain_value_offset = PARAMETER_LEN + TWEAK_LEN; for (chain_index, packed_chain) in - packed_chains.iter_mut().enumerate().take(num_chains) - { - // Walk this chain for `chain_length - 1` steps. - // The starting point is step 0, so we need `chain_length - 1` iterations. - for step in 0..chain_length - 1 { - // Current position in the chain. - let pos = (step + 1) as u8; - - // Assemble the packed input for the hash function. - // Layout: [parameter | tweak | current_value] - let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH]; - - // Copy pre-packed parameter - packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); - - // Pack tweaks directly into destination - pack_fn_into::( - &mut packed_input, - chain_tweak_offset, - |t_idx, lane| { - Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos) - .to_field_elements::()[t_idx] - }, - ); - - // Copy current chain value (already packed) - packed_input[chain_value_offset..chain_value_offset + HASH_LEN] - .copy_from_slice(packed_chain); - - // Apply the hash function to advance the chain. 
- // This single call processes all epochs in parallel. - *packed_chain = + packed_chains.iter_mut().enumerate().take(num_chains) + { + // Walk this chain for `chain_length - 1` steps. + // The starting point is step 0, so we need `chain_length - 1` iterations. + for step in 0..chain_length - 1 { + // Current position in the chain. + let pos = (step + 1) as u8; + + // Assemble the packed input for the hash function. + // Layout: [parameter | tweak | current_value] + let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH]; + + // Copy pre-packed parameter + packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); + + // Pack tweaks directly into destination + pack_fn_into::( + &mut packed_input, + chain_tweak_offset, + |t_idx, lane| { + Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos) + .to_field_elements::()[t_idx] + }, + ); + + // Copy current chain value (already packed) + packed_input[chain_value_offset..chain_value_offset + HASH_LEN] + .copy_from_slice(packed_chain); + + // Apply the hash function to advance the chain. + // This single call processes all epochs in parallel. 
+ *packed_chain = poseidon_compress::( &chain_perm, &packed_input, ); - } } + } // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES // From 7d7d0aad55a3e9aa4e15672792729370d11c4be2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Thu, 18 Dec 2025 17:35:44 +0100 Subject: [PATCH 09/12] [examples] add two examples for key gen for 2^8 and 2^32 elements following the benchmarks for the smallest and largest case --- examples/single_keygen.rs | 24 ++++++++++++++++++++++++ examples/single_keygen_2_32.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 examples/single_keygen.rs create mode 100644 examples/single_keygen_2_32.rs diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs new file mode 100644 index 0000000..449c5c2 --- /dev/null +++ b/examples/single_keygen.rs @@ -0,0 +1,24 @@ +use std::hint::black_box; + +use leansig::signature::{ + SignatureScheme, + generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8, +}; + +fn main() { + let mut rng = rand::rng(); + + // 2^8 lifetime, full activation + let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize; + + eprintln!("Running single key_gen for 2^8 lifetime..."); + let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen( + &mut rng, + 0, + activation_duration, + )); + eprintln!("Done. 
pk size: {} bytes", std::mem::size_of_val(&pk)); + + // Prevent optimization from removing the key_gen call + black_box((pk, sk)); +} diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs new file mode 100644 index 0000000..4bc0b39 --- /dev/null +++ b/examples/single_keygen_2_32.rs @@ -0,0 +1,33 @@ +use std::hint::black_box; + +use leansig::signature::{ + SignatureScheme, + generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26, +}; + +/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark) +const MAX_LOG_ACTIVATION_DURATION: usize = 18; + +fn main() { + let mut rng = rand::rng(); + + // 2^32 lifetime, activation capped at 2^18 + let activation_duration = std::cmp::min( + 1 << MAX_LOG_ACTIVATION_DURATION, + SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize, + ); + + eprintln!( + "Running single key_gen for 2^32 lifetime (activation 2^{})...", + MAX_LOG_ACTIVATION_DURATION + ); + let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen( + &mut rng, + 0, + activation_duration, + )); + eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk)); + + // Prevent optimization from removing the key_gen call + black_box((pk, sk)); +} From d01fa2c708abb89b363fc522fa497e9741449ed2 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 22 Dec 2025 16:23:03 +0100 Subject: [PATCH 10/12] use iterator approach when adding chunks to state --- src/symmetric/tweak_hash/poseidon.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index c702d2e..5035c10 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -215,10 +215,9 @@ where // 1. 
fill in all full chunks and permute let mut it = input.chunks_exact(rate); for chunk in &mut it { - //input.chunks_exact(rate) { // iterate the chunks - for i in 0..chunk.len() { - state[i] += chunk[i]; + for (s, &x) in state.iter_mut().take(rate).zip(chunk) { + *s += x; } perm.permute_mut(&mut state); } From ceab87d5e3cb397538db610cc3c63ae28713e76a Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 22 Dec 2025 16:24:41 +0100 Subject: [PATCH 11/12] delete keygen examples / profiling helpers --- examples/single_keygen.rs | 24 ------------------------ examples/single_keygen_2_32.rs | 33 --------------------------------- 2 files changed, 57 deletions(-) delete mode 100644 examples/single_keygen.rs delete mode 100644 examples/single_keygen_2_32.rs diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs deleted file mode 100644 index 449c5c2..0000000 --- a/examples/single_keygen.rs +++ /dev/null @@ -1,24 +0,0 @@ -use std::hint::black_box; - -use leansig::signature::{ - SignatureScheme, - generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8, -}; - -fn main() { - let mut rng = rand::rng(); - - // 2^8 lifetime, full activation - let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize; - - eprintln!("Running single key_gen for 2^8 lifetime..."); - let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen( - &mut rng, - 0, - activation_duration, - )); - eprintln!("Done. 
pk size: {} bytes", std::mem::size_of_val(&pk)); - - // Prevent optimization from removing the key_gen call - black_box((pk, sk)); -} diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs deleted file mode 100644 index 4bc0b39..0000000 --- a/examples/single_keygen_2_32.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::hint::black_box; - -use leansig::signature::{ - SignatureScheme, - generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26, -}; - -/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark) -const MAX_LOG_ACTIVATION_DURATION: usize = 18; - -fn main() { - let mut rng = rand::rng(); - - // 2^32 lifetime, activation capped at 2^18 - let activation_duration = std::cmp::min( - 1 << MAX_LOG_ACTIVATION_DURATION, - SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize, - ); - - eprintln!( - "Running single key_gen for 2^32 lifetime (activation 2^{})...", - MAX_LOG_ACTIVATION_DURATION - ); - let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen( - &mut rng, - 0, - activation_duration, - )); - eprintln!("Done. 
pk size: {} bytes", std::mem::size_of_val(&pk)); - - // Prevent optimization from removing the key_gen call - black_box((pk, sk)); -} From 1305e33560fb375f2beced732527a934837bc613 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 22 Dec 2025 18:19:26 +0100 Subject: [PATCH 12/12] use stdlib `thread_local!` macro instead of thread_local crate --- Cargo.toml | 2 - src/symmetric/tweak_hash/poseidon.rs | 70 ++++++++++++++-------------- 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 60002f8..41637e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,8 +46,6 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" } p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" } -thread_local = "1.1.9" - [dev-dependencies] criterion = "0.7" proptest = "1.7" diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index 5035c10..e3e5395 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -18,7 +18,6 @@ use super::TweakableHash; use p3_koala_bear::Poseidon2KoalaBear; use std::cell::RefCell; -use thread_local::ThreadLocal; const DOMAIN_PARAMETERS_LENGTH: usize = 4; /// The state width for compressing a single hash in a chain. @@ -517,9 +516,11 @@ impl< let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN; let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN; - // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated + // We use thread-local storage to guarantee the `packed_leaf_input` vector is only allocated // once per thread - let tls: ThreadLocal>> = ThreadLocal::new(); + thread_local! 
{ + static PACKED_LEAF_INPUT: RefCell> = const { RefCell::new(Vec::new()) }; + } // PARALLEL SIMD PROCESSING // @@ -537,18 +538,11 @@ impl< // // This layout enables efficient SIMD operations across epochs. - let cell = tls.get_or(|| { - RefCell::new(vec![PackedF::ZERO; sponge_input_len]) - }); - let mut packed_leaf_input = cell.borrow_mut(); - // reset not needed - let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] = array::from_fn(|c_idx| { // Generate starting points for this chain across all epochs. let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| { - PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64) - .into() + PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into() }); // Transpose to vertical packing for SIMD efficiency. @@ -601,10 +595,10 @@ impl< // Apply the hash function to advance the chain. // This single call processes all epochs in parallel. *packed_chain = - poseidon_compress::( - &chain_perm, - &packed_input, - ); + poseidon_compress::( + &chain_perm, + &packed_input, + ); } } @@ -619,45 +613,49 @@ impl< // Layout: [parameter | tree_tweak | all_chain_ends] // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire // vector in each iteration, so no need to `fill(0)`! 
+ let packed_leaves = PACKED_LEAF_INPUT.with_borrow_mut(|packed_leaf_input| { + // Resize on first use for this thread + if packed_leaf_input.len() != sponge_input_len { + packed_leaf_input.resize(sponge_input_len, PackedF::ZERO); + } - // Copy pre-packed parameter - packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); - - // Pack tree tweaks directly (level 0 for bottom-layer leaves) - pack_fn_into::( - &mut packed_leaf_input, - sponge_tweak_offset, - |t_idx, lane| { - Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() + // Copy pre-packed parameter + packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter); + + // Pack tree tweaks directly (level 0 for bottom-layer leaves) + pack_fn_into::( + packed_leaf_input, + sponge_tweak_offset, + |t_idx, lane| { + Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::() [t_idx] - }, - ); + }, + ); - // Copy all chain ends (already packed) - let dst = &mut packed_leaf_input[sponge_chains_offset + // Copy all chain ends (already packed) + let dst = &mut packed_leaf_input[sponge_chains_offset ..sponge_chains_offset + packed_chains.len() * HASH_LEN]; - for (dst_chunk, src_chain) in + for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) { dst_chunk.copy_from_slice(src_chain); } - // Apply the sponge hash to produce the leaf. - // This absorbs all chain ends and squeezes out the final hash. - let packed_leaves = + // Apply the sponge hash to produce the leaf. + // This absorbs all chain ends and squeezes out the final hash. poseidon_sponge::( &sponge_perm, &capacity_val, - &packed_leaf_input, - ); + packed_leaf_input, + ) + }); // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION // // Convert from vertical packing back to scalar layout. // Each lane becomes one leaf in the output slice. unpack_array(&packed_leaves, leaves_chunk); - }, - ); + }); // HANDLE REMAINDER EPOCHS //