integritychain
diff --git a/‎.github/workflows/test.yml
-1 b/‎.github/workflows/test.yml
-1
diff --git a/‎CHANGELOG.md
+2-2 b/‎CHANGELOG.md
+2-2
diff --git a/‎benches/README.md
+3-1 b/‎benches/README.md
+3-1
diff --git a/‎dudect/src/main.rs
+1-1 b/‎dudect/src/main.rs
+1-1
diff --git a/‎src/encodings.rs
+12-12 b/‎src/encodings.rs
+12-12
diff --git a/‎src/hashing.rs
+12-13 b/‎src/hashing.rs
+12-13
diff --git a/‎src/helpers.rs
+7-8 b/‎src/helpers.rs
+7-8
diff --git a/‎src/high_low.rs
+1 b/‎src/high_low.rs
+1
@@ -52,7 +52,6 @@ jobs:
       - uses: EmbarkStudios/cargo-deny-action@v1
 
 
-# TODO: Temp 'fix' for Rust 1.80/1.81 problem involving 'time'; to be unwound...
   cargo_outdated:
     runs-on: ubuntu-latest
     steps:
 
@@ -5,10 +5,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## 0.4.4 (2024-10-XX)
+## 0.4.4 (2024-10-29)
 
 - Significant shrink of required stack size
-- Internal-only refactoring and polishing
+- Internal-only refactoring, clean-up and polishing
 
 ## 0.4.3 (2024-10-16)
 
 
@@ -3,7 +3,9 @@ Note that constant-time restrictions on the implementation do impact performance
 
 Additional performance optimizations are on the roadmap. Near-obvious uplift can be 
 had with more careful modular multiplication & addition using fewer reductions. Also, 
-'u16' arithmetic has an x86 performance penalty.
+'u16' arithmetic has an x86 performance penalty. The `cap_a_hat` pre-compute can be
+put into both PublicKey and PrivateKey structs, but currently causes stack overflows on
+Windows with unoptimized dev builds...this will be investigated further.
 
 ~~~
 October 15, 2024
 
@@ -26,7 +26,7 @@ impl CryptoRng for TestRng {}
 #[repr(align(8))]
 pub struct AlignedBytes<const BYTE_LEN: usize>(pub(crate) [u8; BYTE_LEN]);
 
-
+#[allow(deprecated)] // calling dudect fn below in inner loop
 fn keygen_and_sign(runner: &mut CtRunner, mut _rng: &mut BenchRng) {
     const ITERATIONS_INNER: usize = 5;
     const ITERATIONS_OUTER: usize = 2_usize.pow(20); // 2**20 = 1_048_576
 
@@ -13,14 +13,14 @@ use crate::{D, Q};
 ///
 /// This is only used in `ml_dsa::key_gen()` and does not involve untrusted input.
 ///
-/// **Input**:  `ρ ∈ {0,1}^256`, `t1 ∈ R^k` with coefficients in `[0, 2^{bitlen(q−1)−d}-1]`. <br>
+/// **Input**:  `ρ ∈ B^{32}`, `t1 ∈ R^k` with coefficients in `[0, 2^{bitlen(q−1)−d}-1]`. <br>
 /// **Output**: Public key `pk ∈ B^{32+32·k·(bitlen(q−1)−d)}`.
 pub(crate) fn pk_encode<const K: usize, const PK_LEN: usize>(
     rho: &[u8; 32], t1: &[R; K],
 ) -> [u8; PK_LEN] {
-    let blqd = bit_length(Q - 1) - D as usize;
-    debug_assert!(t1.iter().all(|t| is_in_range(t, 0, (1 << blqd) - 1)), "Alg 22: t1 out of range");
-    debug_assert_eq!(PK_LEN, 32 + 32 * K * blqd, "Alg 22: bad pk/config size");
+    const BLQD: usize = bit_length(Q - 1) - D as usize;
+    debug_assert!(t1.iter().all(|t| is_in_range(t, 0, (1 << BLQD) - 1)), "Alg 22: t1 out of range");
+    debug_assert_eq!(PK_LEN, 32 + 32 * K * BLQD, "Alg 22: bad pk/config size");
     let mut pk = [0u8; PK_LEN];
 
     // 1: pk ← rho
@@ -30,10 +30,10 @@ pub(crate) fn pk_encode<const K: usize, const PK_LEN: usize>(
     // 3: pk ← pk || SimpleBitPack(t1[i], 2^{bitlen(q−1)−d}-1)
     // 4: end for
     pk[32..]
-        .chunks_mut(32 * blqd)
+        .chunks_mut(32 * BLQD)
         .enumerate()
         .take(K) // not strictly needed
-        .for_each(|(i, chunk)| simple_bit_pack(&t1[i], (1 << blqd) - 1, chunk));
+        .for_each(|(i, chunk)| simple_bit_pack(&t1[i], (1 << BLQD) - 1, chunk));
 
     // 5: return pk
     pk
@@ -47,7 +47,7 @@ pub(crate) fn pk_encode<const K: usize, const PK_LEN: usize>(
 /// `simple_bit_unpack()` will detect malformed input -- an overly conservative (?) route for now.
 ///
 /// **Input**:  Public key `pk ∈ B^{32+32·k·(bitlen(q−1)−d)}`. <br>
-/// **Output**: `ρ ∈ {0,1}^256`, `t1 ∈ R^k` with coefficients in `[0, 2^{bitlen(q−1)−d}−1]`).
+/// **Output**: `ρ ∈ B^{32}`, `t1 ∈ R^k` with coefficients in `[0, 2^{bitlen(q−1)−d}−1]`.
 ///
 /// # Errors
 /// Returns an error when the internal `simple_bit_unpack()` invocation finds an element of
@@ -85,7 +85,7 @@ pub(crate) fn pk_decode<const K: usize, const PK_LEN: usize>(
 ///
 /// This is only used in `ml_dsa::key_gen()` and does not involve untrusted input.
 ///
-/// **Input**: `ρ ∈ {0,1}^256`, `K ∈ {0,1}^256`, `tr ∈ {0,1}^512`,
+/// **Input**: `ρ ∈ B^{32}`, `K ∈ B^{32}`, `tr ∈ B^{64}`,
 ///            `s_1 ∈ R^l` with coefficients in `[−η, η]`,
 ///            `s_2 ∈ R^k` with coefficients in `[−η, η]`,
 ///            `t_0 ∈ R^k` with coefficients in `[−2^{d-1}+1, 2^{d-1}]`.
@@ -159,7 +159,7 @@ pub(crate) fn sk_encode<const K: usize, const L: usize, const SK_LEN: usize>(
 ///
 /// **Input**:  Private key, `sk ∈ B^{32+32+64+32·((ℓ+k)·bitlen(2η)+d·k)}`
 ///             Security parameter `η` (eta) must be either 2 or 4.<br>
-/// **Output**: `ρ ∈ {0,1}^256`, `K ∈ {0,1}^256`, `tr ∈ {0,1}^512`,
+/// **Output**: `ρ ∈ B^{32}`, `K ∈ B^{32}`, `tr ∈ B^{64}`,
 ///             `s_1 ∈ R^ℓ`, `s_2 ∈ R^k`, `t_0 ∈ R^k` with coefficients in `[−2^{d−1}+1, 2^{d−1}]`.
 ///
 /// # Errors
@@ -168,13 +168,13 @@ pub(crate) fn sk_encode<const K: usize, const L: usize, const SK_LEN: usize>(
 pub(crate) fn sk_decode<const K: usize, const L: usize, const SK_LEN: usize>(
     eta: i32, sk: &[u8; SK_LEN],
 ) -> Result<(&[u8; 32], &[u8; 32], &[u8; 64], [R; L], [R; K], [R; K]), &'static str> {
+    const TOP: i32 = 1 << (D - 1);
     debug_assert!((eta == 2) || (eta == 4), "Alg 25: incorrect eta");
     debug_assert_eq!(
         SK_LEN,
         128 + 32 * ((K + L) * bit_length(2 * eta) + D as usize * K),
         "Alg 25: bad sk/config size"
     );
-    let top = 1 << (D - 1);
     let (mut s_1, mut s_2, mut t_0) = ([R0; L], [R0; K], [R0; K]);
 
     // 1: (rho, 𝐾, tr, 𝑦0 , … , 𝑦ℓ−1 , 𝑧0 , … , 𝑧𝑘−1 , 𝑤0 , … , 𝑤𝑘−1 ) ∈
@@ -211,7 +211,7 @@ pub(crate) fn sk_decode<const K: usize, const L: usize, const SK_LEN: usize>(
     for i in 0..K {
         //
         // 9: t0[i] ← BitUnpack(wi, 2^{d−1} - 1, 2^{d−1})   ▷ This is always in the correct range
-        t_0[i] = bit_unpack(&sk[start + i * step..start + (i + 1) * step], top - 1, top)?;
+        t_0[i] = bit_unpack(&sk[start + i * step..start + (i + 1) * step], TOP - 1, TOP)?;
 
         // 10: end for
     }
@@ -231,7 +231,7 @@ pub(crate) fn sk_decode<const K: usize, const L: usize, const SK_LEN: usize>(
 /// The `CTEST` generic is only passed through to the `hint_bit_pack()` leaf function
 /// such that this logic becomes constant-time.
 ///
-/// **Input**: `c_tilde ∈ {0,1}^2λ` (bits),
+/// **Input**: `c_tilde ∈ B^{λ/4}`,
 ///            `z ∈ R^ℓ` with coefficients in `[−1*γ_1 + 1, γ_1]`,
 ///            `h ∈ R^k_2`. <br>
 /// **Output**: Signature, `σ ∈ B^{λ/4+l·32·(1+bitlen(γ_1-1))+ω+k}`
 
@@ -7,7 +7,6 @@ use sha2::{Digest, Sha256, Sha512};
 use sha3::digest::{ExtendableOutput, Update, XofReader};
 use sha3::{Shake128, Shake256};
 
-
 /// # Function H(v,d) of section 3.7 item 1 at bottom of page 14.
 /// Takes a reference to a list of byte-slice references and runs them through Shake256.
 /// Returns a xof reader for extracting extendable output.
@@ -86,7 +85,7 @@ pub(crate) fn sample_in_ball<const CTEST: bool>(tau: i32, rho: &[u8]) -> R {
         // 13: end for
     }
 
-    // slightly redundant...
+    // slightly redundant, but fuzz target
     debug_assert!(
         c.0.iter().map(|&e| usize::from(e != 0)).sum::<usize>() == tau,
         "Alg 29: bad hamming weight (a)"
@@ -124,10 +123,10 @@ pub(crate) fn rej_ntt_poly<const CTEST: bool>(rhos: &[&[u8]]) -> T {
     while j < 256 {
         //
         // 5: (ctx, 𝑠) ← G.Squeeze(ctx, 3)
-        // 6: a_hat[j] ← CoefFromThreeBytes(H128(ρ)[[c]], H128(ρ)[[c + 1]], H128(ρ)[[c + 2]])
-        let mut h128pc = [0u8; 3];
-        xof.read(&mut h128pc); // implicit c += 3
-        let a_hat_j = coeff_from_three_bytes::<CTEST>(h128pc); // gets a result
+        // 6: 𝑎[𝑗] ← CoeffFromThreeBytes(𝑠[0], 𝑠[1], 𝑠[2])
+        let mut h5 = [0u8; 3];
+        xof.read(&mut h5); // implicit c += 3
+        let a_hat_j = coeff_from_three_bytes::<CTEST>(h5); // gets a result
 
         // 7: if a_hat[j] != ⊥ then
         if let Ok(res) = a_hat_j {
@@ -154,7 +153,7 @@ pub(crate) fn rej_ntt_poly<const CTEST: bool>(rhos: &[&[u8]]) -> T {
 /// The `CTEST` generic is only passed through to the `coef_from_half_byte()` leaf function such
 /// that this logic becomes constant-time.
 ///
-/// **Input**: A seed `ρ ∈B^66`. <br>
+/// **Input**: A seed `ρ ∈B^{66}`. <br>
 /// **Output**: A polynomial `a ∈ Rq`.
 pub(crate) fn rej_bounded_poly<const CTEST: bool>(eta: i32, rhos: &[&[u8]]) -> R {
     debug_assert_eq!(rhos.iter().map(|&i| i.len()).sum::<usize>(), 528 / 8, "Alg 31: bad rho size");
@@ -221,7 +220,7 @@ pub(crate) fn rej_bounded_poly<const CTEST: bool>(eta: i32, rhos: &[&[u8]]) -> R
 /// such that this logic becomes constant-time.
 ///
 /// **Input**: `ρ ∈ B^{32}`. <br>
-/// **Output**: Matrix `cap_a_hat ∈ (𝑇𝑞)^{𝑘×ℓ}`
+/// **Output**: Matrix `cap_a_hat ∈ 𝑇_𝑞^{𝑘×ℓ}`
 #[allow(clippy::cast_possible_truncation)] // s and r as u8
 pub(crate) fn expand_a<const CTEST: bool, const K: usize, const L: usize>(
     rho: &[u8; 32],
@@ -247,7 +246,7 @@ pub(crate) fn expand_a<const CTEST: bool, const K: usize, const L: usize>(
 /// The `CTEST` generic is only passed through to the `rej_bounded_poly()` leaf function
 /// such that this logic becomes constant-time.
 ///
-/// **Input**: `ρ ∈ B^64` <br>
+/// **Input**: `ρ ∈ B^{64}` <br>
 /// **Output**: Vectors `s1`, `s2` of polynomials in `R_q`.
 #[allow(clippy::cast_possible_truncation)] // r and r+L
 pub(crate) fn expand_s<const CTEST: bool, const K: usize, const L: usize>(
@@ -267,8 +266,8 @@ pub(crate) fn expand_s<const CTEST: bool, const K: usize, const L: usize>(
         core::array::from_fn(|r| rej_bounded_poly::<CTEST>(eta, &[rho, &[(r + L) as u8], &[0]]));
 
     // 7: return (s_1 , s_2)
-    debug_assert!(s1.iter().all(|r| is_in_range(r, eta, eta)), "Alg 27: s1 out of range");
-    debug_assert!(s2.iter().all(|r| is_in_range(r, eta, eta)), "Alg 27: s2 out of range");
+    debug_assert!(s1.iter().all(|r| is_in_range(r, eta, eta)), "Alg 33: s1 out of range");
+    debug_assert!(s2.iter().all(|r| is_in_range(r, eta, eta)), "Alg 33: s2 out of range");
     (s1, s2)
 }
 
@@ -277,8 +276,8 @@ pub(crate) fn expand_s<const CTEST: bool, const K: usize, const L: usize>(
 /// Samples a vector `s ∈ R^ℓ_q` such that each polynomial `s_j` has coefficients
 /// between `−γ_1 + 1` and `γ_1`. This function is not exposed to untrusted input.
 ///
-/// **Input**: A bit string `ρ ∈ {0,1}^512` and a non-negative integer `µ`. <br>
-/// **Output**: Vector `y ∈ R^ℓ_q`.
+/// **Input**: A bit string `ρ ∈ B^{64}` and a non-negative integer `µ`. <br>
+/// **Output**: Vector `y ∈ R^ℓ`.
 pub(crate) fn expand_mask<const L: usize>(gamma1: i32, rho: &[u8; 64], mu: u16) -> [R; L] {
     let mut y = [R0; L];
     let mut v = [0u8; 32 * 20]; // leaving a few bytes on the table
 
@@ -4,7 +4,7 @@ use crate::{Q, ZETA};
 // Some arith routines leverage dilithium https://github.com/PQClean/PQClean/tree/master/crypto_sign
 
 
-/// Algorithm 43 `BitRev8()` is not implemented; zetas are pulled from pre-computed table
+/// # Algorithm 43 `BitRev8()` is not implemented; zetas are pulled from pre-computed table
 /// `ZETA_TABLE_MONT`; see below (near end)
 
 /// # Macro ensure!()
@@ -44,8 +44,7 @@ pub(crate) const fn partial_reduce64(a: i64) -> i32 {
 }
 
 
-// TODO: need to experiment a little with `mul_red(32, 32)`
-#[allow(dead_code)]
+#[allow(dead_code)]  // I may come back to this and experiment more
 #[allow(clippy::cast_possible_truncation)]
 pub(crate) const fn partial_reduce64b(a: i64) -> i32 {
     const MM: i64 = ((1 << 64) / (Q as i128)) as i64;
@@ -86,9 +85,9 @@ pub(crate) const fn bit_length(x: i32) -> usize { x.ilog2() as usize + 1 }
 
 
 /// Mod +/- see definition on page 6.
-/// If α is a positive integer and m ∈ Z or m ∈ `Z_α` , then m mod± α denotes the unique
-/// element m′ ∈ Z in the range −α/2 < m′ ≤ α/2 such that m and m′ are congruent
-/// modulo α.  'ready to optimize'
+/// If `α` is a positive integer and `m ∈ Z` or `m ∈ Z_α` , then m mod± α denotes the unique
+/// element `m′ ∈ Z` in the range `−α/2 < m′ ≤ α/2` such that `m` and `m′` are congruent
+/// modulo `α`.  'ready to optimize'
 pub(crate) fn center_mod(m: i32) -> i32 {
     debug_assert!(m.abs() < 2_143_289_344, "center_mod input"); // for clarity; caught in full_reduce32
     let t = full_reduce32(m);
@@ -120,7 +119,7 @@ pub(crate) fn mat_vec_mul<const K: usize, const L: usize>(
 
 // Note Algorithm 44 has been dissolved into its place of use(s)
 
-/// Algorithm 46: `AddVectorNTT(v_hat, w_hat)` on page 45.
+/// # Algorithm 46: `AddVectorNTT(v_hat, w_hat)` on page 45.
 /// Computes the sum `v_hat + w_hat` of two vectors `v_hat`, `w_hat` over `𝑇_𝑞`.
 ///
 /// **Input**:  `ℓ ∈ ℕ, v_hat ∈ 𝑇_𝑞^ℓ , w_hat ∈ 𝑇_𝑞^ℓ`. <br>
@@ -151,7 +150,7 @@ pub(crate) fn infinity_norm<const ROW: usize>(w: &[R; ROW]) -> i32 {
 }
 
 
-/// Algorithm 49: MontgomeryReduce(𝑎) on page 50.
+/// # Algorithm 49: MontgomeryReduce(𝑎) on page 50.
 /// Computes 𝑎 ⋅ 2^{−32} mod 𝑞.
 ///
 /// **Input**:  Integer 𝑎 with −2^{31}⋅𝑞 ≤ 𝑎 ≤ 2^{31}⋅𝑞.
 
@@ -155,6 +155,7 @@ pub(crate) fn make_hint(gamma2: i32, z: Zq, r: Zq) -> bool {
 pub(crate) fn use_hint(gamma2: i32, h: Zq, r: Zq) -> Zq {
     //
     // 1: m ← (q− 1)/(2*γ_2)
+    // dissolved into steps 3 and 4 below
 
     // 2: (r1, r0) ← Decompose(r)
     let (r1, r0) = decompose(gamma2, r);