From 47b5607ba93a9e3bbb8aa43d1f17c589a0ebf95d Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Sat, 19 Apr 2025 23:44:22 +0200
Subject: [PATCH 01/43] passes local tests

---
 Cargo.toml                                    |   1 +
 libcrux-aesgcm/Cargo.toml                     |  38 +
 libcrux-aesgcm/src/aes_ctr.rs                 | 142 ++++
 libcrux-aesgcm/src/aes_generic.rs             |  53 ++
 libcrux-aesgcm/src/gf128_generic.rs           |  91 +++
 libcrux-aesgcm/src/lib.rs                     |   4 +
 libcrux-aesgcm/src/platform.rs                |  24 +
 libcrux-aesgcm/src/platform/portable.rs       |   4 +
 .../src/platform/portable/aes_core.rs         | 764 ++++++++++++++++++
 .../src/platform/portable/gf128_core.rs       |  71 ++
 10 files changed, 1192 insertions(+)
 create mode 100644 libcrux-aesgcm/Cargo.toml
 create mode 100644 libcrux-aesgcm/src/aes_ctr.rs
 create mode 100644 libcrux-aesgcm/src/aes_generic.rs
 create mode 100644 libcrux-aesgcm/src/gf128_generic.rs
 create mode 100644 libcrux-aesgcm/src/lib.rs
 create mode 100644 libcrux-aesgcm/src/platform.rs
 create mode 100644 libcrux-aesgcm/src/platform/portable.rs
 create mode 100644 libcrux-aesgcm/src/platform/portable/aes_core.rs
 create mode 100644 libcrux-aesgcm/src/platform/portable/gf128_core.rs
diff --git a/Cargo.toml b/Cargo.toml
index 528ebd552..5a30c20e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ members = [
     "libcrux-ml-kem",
     "libcrux-ml-kem/fuzz",
     "libcrux-sha3",
+    "libcrux-aesgcm",
     "libcrux-ml-dsa",
     "libcrux-intrinsics",
     "libcrux-kem",
diff --git a/libcrux-aesgcm/Cargo.toml b/libcrux-aesgcm/Cargo.toml
new file mode 100644
index 000000000..676734c17
--- /dev/null
+++ b/libcrux-aesgcm/Cargo.toml
@@ -0,0 +1,38 @@
+[package]
+name = "libcrux-aesgcm"
+version.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+edition.workspace = true
+repository.workspace = true
+readme = "README.md"
+description = "Libcrux AES-GCM implementation"
+exclude = ["/proofs", "/c.sh", "/c.yaml", "/tests/tv", "tests/cavp.rs"]
+
+[lib]
+bench = false # so libtest doesn't eat the arguments to criterion
+
+[dependencies]
+libcrux-platform = { version = "0.0.2-beta.2", path = "../sys/platform" }
+libcrux-intrinsics = { version = "0.0.2-beta.2", path = "../libcrux-intrinsics" }
+hax-lib = { version = "0.1.0-alpha.1", git = "https://github.com/hacspec/hax/" }
+
+[features]
+simd128 = []
+simd256 = []
+
+[[bench]]
+name = "aesgcm"
+harness = false
+
+[dev-dependencies]
+criterion = "0.5.1"
+hex = "0.4.3"
+rand = "0.8.5"
+cavp = { version = "0.0.2-beta.2", path = "../cavp" }
+pretty_env_logger = "0.5.0"
+rand_core = { version = "0.6" }
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)', 'cfg(eurydice)'] }
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
new file mode 100644
index 000000000..70a35dd4d
--- /dev/null
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -0,0 +1,142 @@
+#![allow(non_camel_case_types)]
+
+use crate::{aes_generic::*, platform::AESState};
+
+pub struct AES_CTR_Context<T: AESState, const NUM_KEYS: usize> {
+    pub(crate) keyex: ExtendedKey<T, NUM_KEYS>,
+    pub(crate) ctr_nonce: [u8; 16]
+}
+
+pub type AES128_CTR_Context<T> = AES_CTR_Context<T, 11>;
+
+pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES128_CTR_Context<T> {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 16);
+    let mut ctr_nonce = [0u8; 16];
+    ctr_nonce[0..12].copy_from_slice(nonce);
+    AES128_CTR_Context { 
+        keyex: aes128_key_expansion(key), 
+        ctr_nonce }
+}
+
+pub fn aes128_ctr_key_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, out: &mut [u8]) {
+    debug_assert!(out.len() == 16);
+    let mut st_init = ctx.ctr_nonce;
+    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+    let mut st = T::new();
+    st.load_block(&st_init);
+    block_cipher(&mut st, ctx.keyex);
+    st.store_block(out);
+}
+
+pub fn aes128_ctr_xor_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, inp:&[u8], out: &mut [u8]) {
+    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    let mut st_init = ctx.ctr_nonce;
+    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+    let mut st = T::new();
+    st.load_block(&st_init);
+    block_cipher(&mut st, ctx.keyex);
+    st.xor_block(inp, out);
+}
+
+pub fn aes128_ctr_encrypt<T: AESState>(key: &[u8], nonce: &[u8], ctr:u32, inp:&[u8], out: &mut [u8]) {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 16);
+    debug_assert!(inp.len() == out.len());
+    let ctx = aes128_ctr_init::<T>(key, nonce);
+    for i in 0..inp.len()/16 {
+        aes128_ctr_xor_block(&ctx, ctr.wrapping_add(i as u32), &inp[i*16..i*16+16], &mut out[i*16..i*16+16]);
+    }
+}
+
+pub type AES256_CTR_Context<T> = AES_CTR_Context<T, 15>;
+
+pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES256_CTR_Context<T> {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 32);
+    let mut ctr_nonce = [0u8; 16];
+    ctr_nonce[0..12].copy_from_slice(nonce);
+    AES256_CTR_Context { 
+        keyex: aes256_key_expansion(key), 
+        ctr_nonce }
+}
+
+pub fn aes256_ctr_key_block<T: AESState>(ctx: &AES256_CTR_Context<T>, ctr:u32, out: &mut [u8]) {
+    debug_assert!(out.len() == 16);
+    let mut st_init = ctx.ctr_nonce;
+    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+    let mut st = T::new();
+    st.load_block(&st_init);
+    block_cipher(&mut st, ctx.keyex);
+    st.store_block(out);
+}
+
+pub fn aes256_ctr_xor_block<T: AESState>(ctx: &AES256_CTR_Context<T>, ctr:u32, inp:&[u8], out: &mut [u8]) {
+    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    let mut st_init = ctx.ctr_nonce;
+    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+    let mut st = T::new();
+    st.load_block(&st_init);
+    block_cipher(&mut st, ctx.keyex);
+    st.xor_block(inp, out);
+}
+
+pub fn aes256_ctr_encrypt<T: AESState>(key: &[u8], nonce: &[u8], ctr:u32, inp:&[u8], out: &mut [u8]) {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 32);
+    debug_assert!(inp.len() == out.len());
+    let ctx = aes256_ctr_init::<T>(key, nonce);
+    let blocks = inp.len() / 16;
+    for i in 0..blocks {
+        aes256_ctr_xor_block(&ctx, ctr.wrapping_add(i as u32), &inp[i*16..i*16+16], &mut out[i*16..i*16+16]);
+    }
+    let last = inp.len() - inp.len() % 16;
+    if (last < inp.len()) {        
+        aes256_ctr_xor_block(&ctx, ctr.wrapping_add(blocks as u32), &inp[last..], &mut out[last..]);
+    }
+}
+
+#[cfg(test)]
+ mod test {
+    use crate::platform;
+
+    use super::{aes128_ctr_init, aes128_ctr_xor_block, aes128_ctr_encrypt};
+
+    const input: [u8; 32] = [
+            0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+            0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
+            0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
+            0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F];
+    const key: [u8; 16] = [
+        0x7E,0x24,0x06,0x78,0x17,0xFA,0xE0,0xD7,
+        0x43,0xD6,0xCE,0x1F,0x32,0x53,0x91,0x63];
+    const nonce: [u8; 12] = [
+        0x00,0x6C,0xB6,0xDB,0xC0,0x54,0x3B,0x59,
+        0xDA,0x48,0xD9,0x0B];
+    const expected: [u8; 32] = [
+        0x51,0x04,0xA1,0x06,0x16,0x8A,0x72,0xD9,
+        0x79,0x0D,0x41,0xEE,0x8E,0xDA,0xD3,0x88,
+        0xEB,0x2E,0x1E,0xFC,0x46,0xDA,0x57,0xC8,
+        0xFC,0xE6,0x30,0xDF,0x91,0x41,0xBE,0x28];
+
+    #[test]
+    fn  test_ctr_block () {        
+        let mut computed: [u8; 32] = [0u8; 32];
+        let ctx = aes128_ctr_init::<platform::portable::State>(&key, &nonce);
+        aes128_ctr_xor_block(&ctx, 1, &input[0..16], &mut computed[0..16]);
+        aes128_ctr_xor_block(&ctx, 2, &input[16..32], &mut computed[16..32]);
+        for i in 0..32{
+            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        }
+    }
+
+    #[test]
+    fn  test_ctr_encrypt () {        
+        let mut computed: [u8; 32] = [0u8; 32];
+        aes128_ctr_encrypt::<platform::portable::State>(&key, &nonce, 1, &input, &mut computed);
+        for i in 0..32{
+            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        }
+    }
+
+ }
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/aes_generic.rs b/libcrux-aesgcm/src/aes_generic.rs
new file mode 100644
index 000000000..c4c4228fd
--- /dev/null
+++ b/libcrux-aesgcm/src/aes_generic.rs
@@ -0,0 +1,53 @@
+use crate::platform::*;
+
+pub(crate) type ExtendedKey<T, const NUM_KEYS:usize> = [T; NUM_KEYS];
+
+const RCON: [u8; 11] = [
+    0x8d, 0x01, 0x02, 0x04,
+    0x08, 0x10, 0x20, 0x40,
+    0x80, 0x1b, 0x36
+];
+
+pub(crate) fn aes128_key_expansion<T:AESState>(key: &[u8]) -> ExtendedKey<T,11> {
+    debug_assert!(key.len() == 16);
+    let mut keyex = [T::new(); 11];
+    keyex[0].load_block(&key);
+    for i in 1..11 {
+        let prev = keyex[i-1];
+        keyex[i].aes_keygen_assist0(&prev,RCON[i]);
+        keyex[i].key_expansion_step(&prev);
+    }
+    keyex
+}
+
+pub(crate) fn aes256_key_expansion<T:AESState>(key: &[u8]) -> ExtendedKey<T,15> {
+    debug_assert!(key.len() == 32);
+    let mut keyex = [T::new(); 15];
+    keyex[0].load_block(&key[0..16]);
+    keyex[1].load_block(&key[16..32]);
+    for i in 2..14 {
+        let prev0 = keyex[i-2];
+        let prev1 = keyex[i-1];
+        keyex[i].aes_keygen_assist0(&prev1,RCON[i/2]);
+        keyex[i].key_expansion_step(&prev0);
+        let next0 = keyex[i];
+        keyex[i+1].aes_keygen_assist1(&next0);
+        keyex[i+1].key_expansion_step(&prev1);
+    }
+    let prev0 = keyex[12];
+    let prev1 = keyex[13];
+    keyex[14].aes_keygen_assist0(&prev1,RCON[7]);
+    keyex[14].key_expansion_step(&prev0);
+    keyex
+}
+    
+
+pub(crate) fn block_cipher<T:AESState, const NUM_KEYS:usize>(
+        st: &mut T, keyex: ExtendedKey<T, NUM_KEYS>) {
+    st.xor_key(&keyex[0]);
+    for i in 1..NUM_KEYS-1 {
+        st.aes_enc(&keyex[i]);
+    }
+    st.aes_enc_last(&keyex[NUM_KEYS-1]);
+}
+
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
new file mode 100644
index 000000000..7d4e3ad13
--- /dev/null
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -0,0 +1,91 @@
+use crate::platform::*;
+
+pub struct GF128State<T: GF128FieldElement> {
+    accumulator: T,
+    r: T
+}
+
+fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
+    debug_assert!(key.len() == 16);
+    GF128State { accumulator: T::zero(), r: T::load_elem(key) }
+}
+
+fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block:&[u8]) {
+    debug_assert!(block.len() == 16);
+    let block_elem = T::load_elem(block);
+    st.accumulator.add(&block_elem);
+    st.accumulator.mul(&st.r);   
+}
+
+fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_block:&[u8]) {
+    debug_assert!(partial_block.len() < 16);
+    let mut block = [0u8; 16];
+    block[0..partial_block.len()].copy_from_slice(partial_block);
+    gf128_update(st, &block);
+}
+
+fn gf128_emit<T: GF128FieldElement>(st: &GF128State<T>, out:&mut [u8]) {
+    debug_assert!(out.len() == 16);
+    st.accumulator.store_elem(out);
+}
+ 
+fn gf128<T: GF128FieldElement>(key: &[u8], inp:&[u8], out:&mut [u8]) {
+    debug_assert!(key.len() == 16);
+    debug_assert!(out.len() == 16);
+
+    let mut st = gf128_init::<T>(key);
+    let blocks = inp.len() / 16;
+    for i in 0..blocks {
+        gf128_update(&mut st, &inp[i*16..i*16+16]);
+    }
+    let last = inp.len() - inp.len() % 16;
+    if (last < inp.len()) {        
+        gf128_update_last(&mut st, &inp[last..]);
+    }
+    gf128_emit(&st, out);
+}
+
+
+#[cfg(test)]
+ mod test {
+    use crate::platform;
+
+    use super::gf128;
+
+    const input: [u8;132] = [
+        0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+        0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+        0xab,0xad,0xda,0xd2,0x00,0x00,0x00,0x00,
+        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+        0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,
+        0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,
+        0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,
+        0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,
+        0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,
+        0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,
+        0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,
+        0x44,0xae,0x7e,0x3f];
+
+    const key: [u8;16] = [
+        0xac,0xbe,0xf2,0x05,0x79,0xb4,0xb8,0xeb,
+        0xce,0x88,0x9b,0xac,0x87,0x32,0xda,0xd7];
+
+    const expected: [u8;16] = [
+        0xfb,0xba,0xaa,0x70,0xa0,0x73,0x6f,0xf9,
+        0xed,0x2f,0xc4,0x62,0xde,0x72,0x61,0xe0];
+    
+    
+    #[test]
+    fn  test_gf128 () {        
+        let mut computed: [u8; 16] = [0u8; 16];
+        gf128::<crate::platform::portable::FieldElement>(&key, &input, &mut computed);
+        for i in 0..16{
+            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        }
+    }
+ }
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
new file mode 100644
index 000000000..95d421bab
--- /dev/null
+++ b/libcrux-aesgcm/src/lib.rs
@@ -0,0 +1,4 @@
+mod platform;
+mod aes_generic;
+mod gf128_generic;
+pub mod aes_ctr;
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
new file mode 100644
index 000000000..694d0bd48
--- /dev/null
+++ b/libcrux-aesgcm/src/platform.rs
@@ -0,0 +1,24 @@
+  
+pub mod portable;
+
+pub trait AESState: Copy {
+    fn new() -> Self;
+    fn load_block(&mut self, b:&[u8]);
+    fn store_block(&self, out:&mut[u8]);
+    fn xor_block(&self, inp:&[u8], out:&mut[u8]);
+
+    fn xor_key(&mut self, key: &Self);
+    fn aes_enc(&mut self, key: &Self);
+    fn aes_enc_last(&mut self, key: &Self);
+    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8);
+    fn aes_keygen_assist1(&mut self, prev: &Self);
+    fn key_expansion_step(&mut self, prev:&Self);
+}
+
+pub trait GF128FieldElement: Copy {
+    fn zero() -> Self;
+    fn load_elem(b:&[u8]) -> Self;
+    fn store_elem(&self, b:&mut [u8]);
+    fn add(&mut self, other:&Self);
+    fn mul(&mut self, other:&Self);
+}
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/portable.rs b/libcrux-aesgcm/src/platform/portable.rs
new file mode 100644
index 000000000..9d14728fe
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/portable.rs
@@ -0,0 +1,4 @@
+mod aes_core;
+mod gf128_core;
+pub(crate) use aes_core::State;
+pub(crate) use gf128_core::FieldElement;
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
new file mode 100644
index 000000000..e89d4a284
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -0,0 +1,764 @@
+pub(crate) type State = [u16; 8];
+
+fn  new_state() -> State {
+    [0u16; 8]
+}
+
+fn  interleave_u8_1(i0:u8, i1:u8) -> u16 {
+    let mut x = i0 as u16;
+    x = (x | (x << 4)) & 0x0F0F;
+    x = (x | (x << 2)) & 0x3333;
+    x = (x | (x << 1)) & 0x5555;
+    let mut y = i1 as u16;
+    y = (y | (y << 4)) & 0x0F0F;
+    y = (y | (y << 2)) & 0x3333;
+    y = (y | (y << 1)) & 0x5555;
+    x | (y << 1)
+ }
+ 
+ fn  deinterleave_u8_1(i0:u16) -> (u8,u8) {
+    let mut x = i0 & 0x5555;
+    x = (x | (x >> 1)) & 0x3333;
+    x = (x | (x >> 2)) & 0x0F0F;
+    x = (x | (x >> 4)) & 0x00FF;
+    let mut y = (i0 >> 1) & 0x5555;
+    y = (y | (y >> 1)) & 0x3333;
+    y = (y | (y >> 2)) & 0x0F0F;
+    y = (y | (y >> 4)) & 0x00FF;
+    (x as u8, y as u8)
+ }
+
+ fn  interleave_u16_2(i0:u16, i1:u16) -> (u16,u16) {
+     let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333);
+     let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc);
+     (x,y)
+ }
+ 
+ 
+ fn  interleave_u16_4(i0:u16, i1:u16) -> (u16,u16) {
+     let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F);
+     let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0);
+     (x,y)
+ }
+ 
+ fn  interleave_u16_8(i0:u16, i1:u16) -> (u16,u16) {
+     let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF);
+     let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00);
+     (x,y)
+ }
+ 
+ fn  transpose_u8x16(input: &[u8;16], output: &mut [u16;8]) {
+     let o0 = interleave_u8_1(input[0], input[1]);
+     let o1 = interleave_u8_1(input[2], input[3]);
+     let o2 = interleave_u8_1(input[4], input[5]);
+     let o3 = interleave_u8_1(input[6], input[7]);
+     let o4 = interleave_u8_1(input[8], input[9]);
+     let o5 = interleave_u8_1(input[10], input[11]);
+     let o6 = interleave_u8_1(input[12], input[13]);
+     let o7 = interleave_u8_1(input[14], input[15]);
+     let (o0,o1) = interleave_u16_2(o0, o1);
+     let (o2,o3) = interleave_u16_2(o2, o3);
+     let (o4,o5) = interleave_u16_2(o4, o5);
+     let (o6,o7) = interleave_u16_2(o6, o7);
+     let (o0,o2) = interleave_u16_4(o0, o2);
+     let (o1,o3) = interleave_u16_4(o1, o3);
+     let (o4,o6) = interleave_u16_4(o4, o6);
+     let (o5,o7) = interleave_u16_4(o5, o7);
+     let (o0,o4) = interleave_u16_8(o0, o4);
+     let (o1,o5) = interleave_u16_8(o1, o5);
+     let (o2,o6) = interleave_u16_8(o2, o6);
+     let (o3,o7) = interleave_u16_8(o3, o7);
+     output[0] = o0;
+     output[1] = o1;
+     output[2] = o2;
+     output[3] = o3;
+     output[4] = o4;
+     output[5] = o5;
+     output[6] = o6;
+     output[7] = o7;
+ }
+ 
+ fn  transpose_u16x8(input: &[u16;8], output: &mut [u8]) {
+    let (i0,i4) = interleave_u16_8(input[0], input[4]);
+    let (i1,i5) = interleave_u16_8(input[1], input[5]);
+    let (i2,i6) = interleave_u16_8(input[2], input[6]);
+    let (i3,i7) = interleave_u16_8(input[3], input[7]);
+    let (i0,i2) = interleave_u16_4(i0, i2);
+    let (i1,i3) = interleave_u16_4(i1, i3);
+    let (i4,i6) = interleave_u16_4(i4, i6);
+    let (i5,i7) = interleave_u16_4(i5, i7);
+    let (i0,i1) = interleave_u16_2(i0, i1);
+    let (i2,i3) = interleave_u16_2(i2, i3);
+    let (i4,i5) = interleave_u16_2(i4, i5);
+    let (i6,i7) = interleave_u16_2(i6, i7);
+
+    let (o0,o1) = deinterleave_u8_1(i0);
+    let (o2,o3) = deinterleave_u8_1(i1);
+    let (o4,o5) = deinterleave_u8_1(i2);
+    let (o6,o7) = deinterleave_u8_1(i3);
+    let (o8,o9) = deinterleave_u8_1(i4);
+    let (o10,o11) = deinterleave_u8_1(i5);
+    let (o12,o13) = deinterleave_u8_1(i6);
+    let (o14,o15) = deinterleave_u8_1(i7);
+
+    output[0] = o0;
+    output[1] = o1;
+    output[2] = o2;
+    output[3] = o3;
+    output[4] = o4;
+    output[5] = o5;
+    output[6] = o6;
+    output[7] = o7;
+    output[8] = o8;
+    output[9] = o9;
+    output[10] = o10;
+    output[11] = o11;
+    output[12] = o12;
+    output[13] = o13;
+    output[14] = o14;
+    output[15] = o15;
+}
+
+
+fn  xnor(a:u16, b:u16) -> u16 {
+    !(a ^ b)
+}
+
+#[allow(non_snake_case)]
+fn  sub_bytes_state(st:&mut State) {
+    let U0 = st[7];
+    let U1 = st[6];
+    let U2 = st[5];
+    let U3 = st[4];
+    let U4 = st[3];
+    let U5 = st[2];
+    let U6 = st[1];
+    let U7 = st[0];
+
+    let T1 = U6 ^ U4;
+    let T2 = U3 ^ U0;
+    let T3 = U1 ^ U2;
+    let T4 = U7 ^ T3;
+    let T5 = T1 ^ T2;
+    let T6 = U1 ^ U5;
+    let T7 = U0 ^ U6;
+    let T8 = T1 ^ T6;
+    let T9 = U6 ^ T4;
+    let T10 = U3 ^ T4;
+    let T11 = U7 ^ T5;
+    let T12 = T5 ^ T6;
+    let T13 = U2 ^ U5;
+    let T14 = T3 ^ T5;
+    let T15 = U5 ^ T7;
+    let T16 = U0 ^ U5;
+    let T17 = U7 ^ T8;
+    let T18 = U6 ^ U5;
+    let T19 = T2 ^ T18;
+    let T20 = T4 ^ T15;
+    let T21 = T1 ^ T13;
+    let T22 = U0 ^ T4;
+    let T39 = T21 ^ T5;
+    let T40 = T21 ^ T7;
+    let T41 = T7 ^ T19;
+    let T42 = T16 ^ T14;
+    let T43 = T22 ^ T17;
+    let T44 = T19 & T5;
+    let T45 = T20 & T11;
+    let T46 = T12 ^ T44;
+    let T47 = T10 & U7;
+    let T48 = T47 ^ T44;
+    let T49 = T7 & T21;
+    let T50 = T9 & T4;
+    let T51 = T40 ^ T49;
+    let T52 = T22 & T17;
+    let T53 = T52 ^ T49;
+    let T54 = T2 & T8;
+    let T55 = T41 & T39;
+    let T56 = T55 ^ T54;
+    let T57 = T16 & T14;
+    let T58 = T57 ^ T54;
+    let T59 = T46 ^ T45;
+    let T60 = T48 ^ T42;
+    let T61 = T51 ^ T50;
+    let T62 = T53 ^ T58;
+    let T63 = T59 ^ T56;
+    let T64 = T60 ^ T58;
+    let T65 = T61 ^ T56;
+    let T66 = T62 ^ T43;
+    let T67 = T65 ^ T66;
+    let T68 = T65 & T63;
+    let T69 = T64 ^ T68;
+    let T70 = T63 ^ T64;
+    let T71 = T66 ^ T68;
+    let T72 = T71 & T70;
+    let T73 = T69 & T67;
+    let T74 = T63 & T66;
+    let T75 = T70 & T74;
+    let T76 = T70 ^ T68;
+    let T77 = T64 & T65;
+    let T78 = T67 & T77;
+    let T79 = T67 ^ T68;
+    let T80 = T64 ^ T72;
+    let T81 = T75 ^ T76;
+    let T82 = T66 ^ T73;
+    let T83 = T78 ^ T79;
+    let T84 = T81 ^ T83;
+    let T85 = T80 ^ T82;
+    let T86 = T80 ^ T81;
+    let T87 = T82 ^ T83;
+    let T88 = T85 ^ T84;
+    let T89 = T87 & T5;
+    let T90 = T83 & T11;
+    let T91 = T82 & U7;
+    let T92 = T86 & T21;
+    let T93 = T81 & T4;
+    let T94 = T80 & T17;
+    let T95 = T85 & T8;
+    let T96 = T88 & T39;
+    let T97 = T84 & T14;
+    let T98 = T87 & T19;
+    let T99 = T83 & T20;
+    let T100 = T82 & T10;
+    let T101 = T86 & T7;
+    let T102 = T81 & T9;
+    let T103 = T80 & T22;
+    let T104 = T85 & T2;
+    let T105 = T88 & T41;
+    let T106 = T84 & T16;
+    let T107 = T104 ^ T105;
+    let T108 = T93 ^ T99;
+    let T109 = T96 ^ T107;
+    let T110 = T98 ^ T108;
+    let T111 = T91 ^ T101;
+    let T112 = T89 ^ T92;
+    let T113 = T107 ^ T112;
+    let T114 = T90 ^ T110;
+    let T115 = T89 ^ T95;
+    let T116 = T94 ^ T102;
+    let T117 = T97 ^ T103;
+    let T118 = T91 ^ T114;
+    let T119 = T111 ^ T117;
+    let T120 = T100 ^ T108;
+    let T121 = T92 ^ T95;
+    let T122 = T110 ^ T121;
+    let T123 = T106 ^ T119;
+    let T124 = T104 ^ T115;
+    let T125 = T111 ^ T116;
+    let S0 = T109 ^ T122;
+    let S2 = xnor(T123,T124);
+    let T128 = T94 ^ T107;
+    let S3 = T113 ^ T114;
+    let S4 = T118 ^ T128;
+    let T131 = T93 ^ T101;
+    let T132 = T112 ^ T120;
+    let S7 = xnor(T113,T125);
+    let T134 = T97 ^ T116;
+    let T135 = T131 ^ T134;
+    let T136 = T93 ^ T115;
+    let S6 = xnor(T109,T135);
+    let T138 = T119 ^ T132;
+    let S5 = T109 ^ T138;
+    let T140 = T114 ^ T136;
+    let S1 = xnor(T109,T140);
+    
+    st[0] = S7;
+    st[1] = S6;
+    st[2] = S5;
+    st[3] = S4;
+    st[4] = S3;
+    st[5] = S2;
+    st[6] = S1;
+    st[7] = S0;
+}
+ 
+
+#[allow(non_snake_case)]
+fn  sub_bytes_inv_state(st:&mut State) {
+    let U0 = st[7];
+    let U1 = st[6];
+    let U2 = st[5];
+    let U3 = st[4];
+    let U4 = st[3];
+    let U5 = st[2];
+    let U6 = st[1];
+    let U7 = st[0];
+
+    let T23 = U0 ^ U3;
+    let T22 = xnor(U1, U3);
+    let T2 = xnor(U0, U1);
+    let T1 = U3 ^ U4;
+    let T24 = xnor(U4, U7);
+    let R5 = U6 ^ U7;
+    let T8 = xnor(U1, T23);
+    let T19 = T22 ^ R5;
+    let T9 = xnor(U7, T1);
+    let T10 = T2 ^ T24;
+    let T13 = T2 ^ R5;
+    let T3 = T1 ^ R5;
+    let T25 = xnor(U2, T1);
+    let R13 = U1 ^ U6;
+    let T17 = xnor(U2, T19);
+    let T20 = T24 ^ R13;
+    let T4 = U4 ^ T8;
+    let R17 = xnor(U2, U5);
+    let R18 = xnor(U5, U6);
+    let R19 = xnor(U2, U4);
+    let Y5 = U0 ^ R17;
+    let T6 = T22 ^ R17;
+    let T16 = R13 ^ R19;
+    let T27 = T1 ^ R18;
+    let T15 = T10 ^ T27;
+    let T14 = T10 ^ R18;
+    let T26 = T3 ^ T16;
+    let M1 = T13 & T6;
+    let M2 = T23 & T8;
+    let M3 = T14 ^ M1;
+    let M4 = T19 & Y5;
+    let M5 = M4 ^ M1;
+    let M6 = T3 & T16;
+    let M7 = T22 & T9;
+    let M8 = T26 ^ M6;
+    let M9 = T20 & T17;
+    let M10 = M9 ^ M6;
+    let M11 = T1 & T15;
+    let M12 = T4 & T27;
+    let M13 = M12 ^ M11;
+    let M14 = T2 & T10;
+    let M15 = M14 ^ M11;
+    let M16 = M3 ^ M2;
+    let M17 = M5 ^ T24;
+    let M18 = M8 ^ M7;
+    let M19 = M10 ^ M15;
+    let M20 = M16 ^ M13;
+    let M21 = M17 ^ M15;
+    let M22 = M18 ^ M13;
+    let M23 = M19 ^ T25;
+    let M24 = M22 ^ M23;
+    let M25 = M22 & M20;
+    let M26 = M21 ^ M25;
+    let M27 = M20 ^ M21;
+    let M28 = M23 ^ M25;
+    let M29 = M28 & M27;
+    let M30 = M26 & M24;
+    let M31 = M20 & M23;
+    let M32 = M27 & M31;
+    let M33 = M27 ^ M25;
+    let M34 = M21 & M22;
+    let M35 = M24 & M34;
+    let M36 = M24 ^ M25;
+    let M37 = M21 ^ M29;
+    let M38 = M32 ^ M33;
+    let M39 = M23 ^ M30;
+    let M40 = M35 ^ M36;
+    let M41 = M38 ^ M40;
+    let M42 = M37 ^ M39;
+    let M43 = M37 ^ M38;
+    let M44 = M39 ^ M40;
+    let M45 = M42 ^ M41;
+    let M46 = M44 & T6;
+    let M47 = M40 & T8;
+    let M48 = M39 & Y5;
+    let M49 = M43 & T16;
+    let M50 = M38 & T9;
+    let M51 = M37 & T17;
+    let M52 = M42 & T15;
+    let M53 = M45 & T27;
+    let M54 = M41 & T10;
+    let M55 = M44 & T13;
+    let M56 = M40 & T23;
+    let M57 = M39 & T19;
+    let M58 = M43 & T3;
+    let M59 = M38 & T22;
+    let M60 = M37 & T20;
+    let M61 = M42 & T1;
+    let M62 = M45 & T4;
+    let M63 = M41 & T2;
+    let P0 = M52 ^ M61;
+    let P1 = M58 ^ M59;
+    let P2 = M54 ^ M62;
+    let P3 = M47 ^ M50;
+    let P4 = M48 ^ M56;
+    let P5 = M46 ^ M51;
+    let P6 = M49 ^ M60;
+    let P7 = P0 ^ P1;
+    let P8 = M50 ^ M53;
+    let P9 = M55 ^ M63;
+    let P10 = M57 ^ P4;
+    let P11 = P0 ^ P3;
+    let P12 = M46 ^ M48;
+    let P13 = M49 ^ M51;
+    let P14 = M49 ^ M62;
+    let P15 = M54 ^ M59;
+    let P16 = M57 ^ M61;
+    let P17 = M58 ^ P2;
+    let P18 = M63 ^ P5;
+    let P19 = P2 ^ P3;
+    let P20 = P4 ^ P6;
+    let P22 = P2 ^ P7;
+    let P23 = P7 ^ P8;
+    let P24 = P5 ^ P7;
+    let P25 = P6 ^ P10;
+    let P26 = P9 ^ P11;
+    let P27 = P10 ^ P18;
+    let P28 = P11 ^ P25;
+    let P29 = P15 ^ P20;
+    let W0 = P13 ^ P22;
+    let W1 = P26 ^ P29;
+    let W2 = P17 ^ P28;
+    let W3 = P12 ^ P22;
+    let W4 = P23 ^ P27;
+    let W5 = P19 ^ P24;
+    let W6 = P14 ^ P23;
+    let W7 = P9 ^ P16;
+    
+    st[0] = W7;
+    st[1] = W6;
+    st[2] = W5;
+    st[3] = W4;
+    st[4] = W3;
+    st[5] = W2;
+    st[6] = W1;
+    st[7] = W0;
+}
+
+fn  shift_row_u16(input:u16) -> u16 {
+   (input & 0x1111) |
+   ((input & 0x2220) >> 4) |
+   ((input & 0x0002) << 12) |
+   ((input & 0x4400) >> 8) |
+   ((input & 0x0044) << 8) |
+   ((input & 0x8000) >> 12) |
+   ((input & 0x0888) << 4)
+}
+
+fn shift_rows_state(st: &mut State) {
+    st[0] = shift_row_u16(st[0]);
+    st[1] = shift_row_u16(st[1]);
+    st[2] = shift_row_u16(st[2]);
+    st[3] = shift_row_u16(st[3]);
+    st[4] = shift_row_u16(st[4]);
+    st[5] = shift_row_u16(st[5]);
+    st[6] = shift_row_u16(st[6]);
+    st[7] = shift_row_u16(st[7]);
+}
+
+fn mix_columns_state(st: &mut State) {
+    let mut last_col: u16 = 0;
+    for i in 0..8 {
+        let col = st[i] ^
+            (((st[i] & 0xeeee) >> 1)
+            | ((st[i] & 0x1111) << 3));
+        st[i] = st[i] ^ last_col ^ col ^
+                (((col & 0xcccc) >> 2)
+                | ((col & 0x3333) << 2));
+        last_col = col;
+    }
+    st[0] ^= last_col;
+    st[1] ^= last_col;
+    st[3] ^= last_col;
+    st[4] ^= last_col;
+}
+
+fn  xor_key1_state(st: &mut State, k:&State) {
+    st[0] ^= k[0];
+    st[1] ^= k[1];
+    st[2] ^= k[2];
+    st[3] ^= k[3];
+    st[4] ^= k[4];
+    st[5] ^= k[5];
+    st[6] ^= k[6];
+    st[7] ^= k[7];
+}
+
+fn  aes_enc(st: &mut State, key: &State) {
+    sub_bytes_state(st);
+    shift_rows_state(st);
+    mix_columns_state(st);
+    xor_key1_state(st, key)
+}
+
+fn  aes_enc_last(st: &mut State, key: &State) {
+    sub_bytes_state(st);
+    shift_rows_state(st);
+    xor_key1_state(st, key)
+}
+
+fn aes_keygen_assisti(rcon:u8, i:usize, u:u16) -> u16 {
+  let u3 = u & 0xf000;
+  let n = u3 >> 12;
+  let n = ((n >> 1) | (n << 3)) & 0x000f;
+  let ri = ((rcon >> i) & 1) as u16;
+  let n = n ^ ri;
+  let n = n << 12;
+  n ^ (u3 >> 4)
+}
+
+fn  aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
+  next.copy_from_slice(prev);
+  sub_bytes_state(next);
+  next[0] = aes_keygen_assisti(rcon, 0, next[0]);
+  next[1] = aes_keygen_assisti(rcon, 1, next[1]);
+  next[2] = aes_keygen_assisti(rcon, 2, next[2]);
+  next[3] = aes_keygen_assisti(rcon, 3, next[3]);
+  next[4] = aes_keygen_assisti(rcon, 4, next[4]);
+  next[5] = aes_keygen_assisti(rcon, 5, next[5]);
+  next[6] = aes_keygen_assisti(rcon, 6, next[6]);
+  next[7] = aes_keygen_assisti(rcon, 7, next[7]);
+}
+
+fn  aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8){
+    aes_keygen_assist(next, prev, rcon);
+    fn  aux(mut n:u16) -> u16 {
+      n &= 0xf000;
+      n ^= n >> 4;
+      n ^= n >> 8;
+      n
+    }
+    next[0] = aux(next[0]);   
+    next[1] = aux(next[1]);   
+    next[2] = aux(next[2]);   
+    next[3] = aux(next[3]);   
+    next[4] = aux(next[4]);   
+    next[5] = aux(next[5]);   
+    next[6] = aux(next[6]);   
+    next[7] = aux(next[7]);   
+}   
+
+
+fn  aes_keygen_assist1(next: &mut State, prev: &State){
+    aes_keygen_assist(next, prev, 0);
+    fn  aux(mut n:u16) -> u16 {
+      n &= 0x0f00;
+      n ^= (n << 4);
+      n ^= (n >> 8);
+      n
+    }
+    next[0] = aux(next[0]);   
+    next[1] = aux(next[1]);   
+    next[2] = aux(next[2]);   
+    next[3] = aux(next[3]);   
+    next[4] = aux(next[4]);   
+    next[5] = aux(next[5]);   
+    next[6] = aux(next[6]);   
+    next[7] = aux(next[7]);   
+}   
+
+fn key_expand1(p:u16, n:u16) -> u16 {
+  let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8)
+            ^ ((p & 0x000f) << 12);
+  n ^ p
+}
+
+fn  key_expansion_step(next: &mut State, prev: &State) {
+    next[0] = key_expand1(prev[0], next[0]);
+    next[1] = key_expand1(prev[1], next[1]);
+    next[2] = key_expand1(prev[2], next[2]);
+    next[3] = key_expand1(prev[3], next[3]);
+    next[4] = key_expand1(prev[4], next[4]);
+    next[5] = key_expand1(prev[5], next[5]);
+    next[6] = key_expand1(prev[6], next[6]);
+    next[7] = key_expand1(prev[7], next[7]);
+}
+
+impl crate::platform::AESState for State {
+    fn new() -> Self {
+        new_state()
+    }
+
+    fn load_block(&mut self, b:&[u8]) {
+        debug_assert!(b.len() == 16);
+        transpose_u8x16(b.try_into().unwrap(), self);
+    }
+
+    fn store_block(&self, out:&mut[u8]) {
+        debug_assert!(out.len() == 16);
+        transpose_u16x8(self, out);
+    }
+
+    fn xor_block(&self, inp:&[u8], out:&mut[u8]) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        let mut block = [0u8; 16];
+        self.store_block(&mut block);
+        for i in 0..inp.len() {
+            out[i] = inp[i] ^ block[i];
+        }
+    }
+
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);(self, key);
+    }
+
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8) {
+        aes_keygen_assist0(self, prev, rcon);
+    }
+
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    fn key_expansion_step(&mut self, prev:&Self) {
+        key_expansion_step(self, prev)
+    }
+}
+
+ #[cfg(test)]
+ mod test {
+    fn  sbox_fwd(s:u8) -> u8 {
+        match s {
+            0 => 0x63 ,   1 => 0x7c ,   2 => 0x77 ,   3 => 0x7b ,   4 => 0xf2 ,   5 => 0x6b ,   6 => 0x6f ,   7 => 0xc5
+        ,   8 => 0x30 ,   9 => 0x01 ,  10 => 0x67 ,  11 => 0x2b ,  12 => 0xfe ,  13 => 0xd7 ,  14 => 0xab ,  15 => 0x76
+        ,  16 => 0xca ,  17 => 0x82 ,  18 => 0xc9 ,  19 => 0x7d ,  20 => 0xfa ,  21 => 0x59 ,  22 => 0x47 ,  23 => 0xf0
+        ,  24 => 0xad ,  25 => 0xd4 ,  26 => 0xa2 ,  27 => 0xaf ,  28 => 0x9c ,  29 => 0xa4 ,  30 => 0x72 ,  31 => 0xc0
+        ,  32 => 0xb7 ,  33 => 0xfd ,  34 => 0x93 ,  35 => 0x26 ,  36 => 0x36 ,  37 => 0x3f ,  38 => 0xf7 ,  39 => 0xcc
+        ,  40 => 0x34 ,  41 => 0xa5 ,  42 => 0xe5 ,  43 => 0xf1 ,  44 => 0x71 ,  45 => 0xd8 ,  46 => 0x31 ,  47 => 0x15
+        ,  48 => 0x04 ,  49 => 0xc7 ,  50 => 0x23 ,  51 => 0xc3 ,  52 => 0x18 ,  53 => 0x96 ,  54 => 0x05 ,  55 => 0x9a
+        ,  56 => 0x07 ,  57 => 0x12 ,  58 => 0x80 ,  59 => 0xe2 ,  60 => 0xeb ,  61 => 0x27 ,  62 => 0xb2 ,  63 => 0x75
+        ,  64 => 0x09 ,  65 => 0x83 ,  66 => 0x2c ,  67 => 0x1a ,  68 => 0x1b ,  69 => 0x6e ,  70 => 0x5a ,  71 => 0xa0
+        ,  72 => 0x52 ,  73 => 0x3b ,  74 => 0xd6 ,  75 => 0xb3 ,  76 => 0x29 ,  77 => 0xe3 ,  78 => 0x2f ,  79 => 0x84
+        ,  80 => 0x53 ,  81 => 0xd1 ,  82 => 0x00 ,  83 => 0xed ,  84 => 0x20 ,  85 => 0xfc ,  86 => 0xb1 ,  87 => 0x5b
+        ,  88 => 0x6a ,  89 => 0xcb ,  90 => 0xbe ,  91 => 0x39 ,  92 => 0x4a ,  93 => 0x4c ,  94 => 0x58 ,  95 => 0xcf
+        ,  96 => 0xd0 ,  97 => 0xef ,  98 => 0xaa ,  99 => 0xfb , 100 => 0x43 , 101 => 0x4d , 102 => 0x33 , 103 => 0x85
+        , 104 => 0x45 , 105 => 0xf9 , 106 => 0x02 , 107 => 0x7f , 108 => 0x50 , 109 => 0x3c , 110 => 0x9f , 111 => 0xa8
+        , 112 => 0x51 , 113 => 0xa3 , 114 => 0x40 , 115 => 0x8f , 116 => 0x92 , 117 => 0x9d , 118 => 0x38 , 119 => 0xf5
+        , 120 => 0xbc , 121 => 0xb6 , 122 => 0xda , 123 => 0x21 , 124 => 0x10 , 125 => 0xff , 126 => 0xf3 , 127 => 0xd2
+        , 128 => 0xcd , 129 => 0x0c , 130 => 0x13 , 131 => 0xec , 132 => 0x5f , 133 => 0x97 , 134 => 0x44 , 135 => 0x17
+        , 136 => 0xc4 , 137 => 0xa7 , 138 => 0x7e , 139 => 0x3d , 140 => 0x64 , 141 => 0x5d , 142 => 0x19 , 143 => 0x73
+        , 144 => 0x60 , 145 => 0x81 , 146 => 0x4f , 147 => 0xdc , 148 => 0x22 , 149 => 0x2a , 150 => 0x90 , 151 => 0x88
+        , 152 => 0x46 , 153 => 0xee , 154 => 0xb8 , 155 => 0x14 , 156 => 0xde , 157 => 0x5e , 158 => 0x0b , 159 => 0xdb
+        , 160 => 0xe0 , 161 => 0x32 , 162 => 0x3a , 163 => 0x0a , 164 => 0x49 , 165 => 0x06 , 166 => 0x24 , 167 => 0x5c
+        , 168 => 0xc2 , 169 => 0xd3 , 170 => 0xac , 171 => 0x62 , 172 => 0x91 , 173 => 0x95 , 174 => 0xe4 , 175 => 0x79
+        , 176 => 0xe7 , 177 => 0xc8 , 178 => 0x37 , 179 => 0x6d , 180 => 0x8d , 181 => 0xd5 , 182 => 0x4e , 183 => 0xa9
+        , 184 => 0x6c , 185 => 0x56 , 186 => 0xf4 , 187 => 0xea , 188 => 0x65 , 189 => 0x7a , 190 => 0xae , 191 => 0x08
+        , 192 => 0xba , 193 => 0x78 , 194 => 0x25 , 195 => 0x2e , 196 => 0x1c , 197 => 0xa6 , 198 => 0xb4 , 199 => 0xc6
+        , 200 => 0xe8 , 201 => 0xdd , 202 => 0x74 , 203 => 0x1f , 204 => 0x4b , 205 => 0xbd , 206 => 0x8b , 207 => 0x8a
+        , 208 => 0x70 , 209 => 0x3e , 210 => 0xb5 , 211 => 0x66 , 212 => 0x48 , 213 => 0x03 , 214 => 0xf6 , 215 => 0x0e
+        , 216 => 0x61 , 217 => 0x35 , 218 => 0x57 , 219 => 0xb9 , 220 => 0x86 , 221 => 0xc1 , 222 => 0x1d , 223 => 0x9e
+        , 224 => 0xe1 , 225 => 0xf8 , 226 => 0x98 , 227 => 0x11 , 228 => 0x69 , 229 => 0xd9 , 230 => 0x8e , 231 => 0x94
+        , 232 => 0x9b , 233 => 0x1e , 234 => 0x87 , 235 => 0xe9 , 236 => 0xce , 237 => 0x55 , 238 => 0x28 , 239 => 0xdf
+        , 240 => 0x8c , 241 => 0xa1 , 242 => 0x89 , 243 => 0x0d , 244 => 0xbf , 245 => 0xe6 , 246 => 0x42 , 247 => 0x68
+        , 248 => 0x41 , 249 => 0x99 , 250 => 0x2d , 251 => 0x0f , 252 => 0xb0 , 253 => 0x54 , 254 => 0xbb , 255 => 0x16
+        }
+    }
+    
+    fn  sbox_inv(s:u8) -> u8 {
+        match s {
+           0 => 0x52,   1 => 0x09,   2 => 0x6a,   3 => 0xd5,   4 => 0x30,   5 => 0x36,   6 => 0xa5,   7 => 0x38
+       ,   8 => 0xbf,   9 => 0x40,  10 => 0xa3,  11 => 0x9e,  12 => 0x81,  13 => 0xf3,  14 => 0xd7,  15 => 0xfb
+       ,  16 => 0x7c,  17 => 0xe3,  18 => 0x39,  19 => 0x82,  20 => 0x9b,  21 => 0x2f,  22 => 0xff,  23 => 0x87
+       ,  24 => 0x34,  25 => 0x8e,  26 => 0x43,  27 => 0x44,  28 => 0xc4,  29 => 0xde,  30 => 0xe9,  31 => 0xcb
+       ,  32 => 0x54,  33 => 0x7b,  34 => 0x94,  35 => 0x32,  36 => 0xa6,  37 => 0xc2,  38 => 0x23,  39 => 0x3d
+       ,  40 => 0xee,  41 => 0x4c,  42 => 0x95,  43 => 0x0b,  44 => 0x42,  45 => 0xfa,  46 => 0xc3,  47 => 0x4e
+       ,  48 => 0x08,  49 => 0x2e,  50 => 0xa1,  51 => 0x66,  52 => 0x28,  53 => 0xd9,  54 => 0x24,  55 => 0xb2
+       ,  56 => 0x76,  57 => 0x5b,  58 => 0xa2,  59 => 0x49,  60 => 0x6d,  61 => 0x8b,  62 => 0xd1,  63 => 0x25
+       ,  64 => 0x72,  65 => 0xf8,  66 => 0xf6,  67 => 0x64,  68 => 0x86,  69 => 0x68,  70 => 0x98,  71 => 0x16
+       ,  72 => 0xd4,  73 => 0xa4,  74 => 0x5c,  75 => 0xcc,  76 => 0x5d,  77 => 0x65,  78 => 0xb6,  79 => 0x92
+       ,  80 => 0x6c,  81 => 0x70,  82 => 0x48,  83 => 0x50,  84 => 0xfd,  85 => 0xed,  86 => 0xb9,  87 => 0xda
+       ,  88 => 0x5e,  89 => 0x15,  90 => 0x46,  91 => 0x57,  92 => 0xa7,  93 => 0x8d,  94 => 0x9d,  95 => 0x84
+       ,  96 => 0x90,  97 => 0xd8,  98 => 0xab,  99 => 0x00, 100 => 0x8c, 101 => 0xbc, 102 => 0xd3, 103 => 0x0a
+       , 104 => 0xf7, 105 => 0xe4, 106 => 0x58, 107 => 0x05, 108 => 0xb8, 109 => 0xb3, 110 => 0x45, 111 => 0x06
+       , 112 => 0xd0, 113 => 0x2c, 114 => 0x1e, 115 => 0x8f, 116 => 0xca, 117 => 0x3f, 118 => 0x0f, 119 => 0x02
+       , 120 => 0xc1, 121 => 0xaf, 122 => 0xbd, 123 => 0x03, 124 => 0x01, 125 => 0x13, 126 => 0x8a, 127 => 0x6b
+       , 128 => 0x3a, 129 => 0x91, 130 => 0x11, 131 => 0x41, 132 => 0x4f, 133 => 0x67, 134 => 0xdc, 135 => 0xea
+       , 136 => 0x97, 137 => 0xf2, 138 => 0xcf, 139 => 0xce, 140 => 0xf0, 141 => 0xb4, 142 => 0xe6, 143 => 0x73
+       , 144 => 0x96, 145 => 0xac, 146 => 0x74, 147 => 0x22, 148 => 0xe7, 149 => 0xad, 150 => 0x35, 151 => 0x85
+       , 152 => 0xe2, 153 => 0xf9, 154 => 0x37, 155 => 0xe8, 156 => 0x1c, 157 => 0x75, 158 => 0xdf, 159 => 0x6e
+       , 160 => 0x47, 161 => 0xf1, 162 => 0x1a, 163 => 0x71, 164 => 0x1d, 165 => 0x29, 166 => 0xc5, 167 => 0x89
+       , 168 => 0x6f, 169 => 0xb7, 170 => 0x62, 171 => 0x0e, 172 => 0xaa, 173 => 0x18, 174 => 0xbe, 175 => 0x1b
+       , 176 => 0xfc, 177 => 0x56, 178 => 0x3e, 179 => 0x4b, 180 => 0xc6, 181 => 0xd2, 182 => 0x79, 183 => 0x20
+       , 184 => 0x9a, 185 => 0xdb, 186 => 0xc0, 187 => 0xfe, 188 => 0x78, 189 => 0xcd, 190 => 0x5a, 191 => 0xf4
+       , 192 => 0x1f, 193 => 0xdd, 194 => 0xa8, 195 => 0x33, 196 => 0x88, 197 => 0x07, 198 => 0xc7, 199 => 0x31
+       , 200 => 0xb1, 201 => 0x12, 202 => 0x10, 203 => 0x59, 204 => 0x27, 205 => 0x80, 206 => 0xec, 207 => 0x5f
+       , 208 => 0x60, 209 => 0x51, 210 => 0x7f, 211 => 0xa9, 212 => 0x19, 213 => 0xb5, 214 => 0x4a, 215 => 0x0d
+       , 216 => 0x2d, 217 => 0xe5, 218 => 0x7a, 219 => 0x9f, 220 => 0x93, 221 => 0xc9, 222 => 0x9c, 223 => 0xef
+       , 224 => 0xa0, 225 => 0xe0, 226 => 0x3b, 227 => 0x4d, 228 => 0xae, 229 => 0x2a, 230 => 0xf5, 231 => 0xb0
+       , 232 => 0xc8, 233 => 0xeb, 234 => 0xbb, 235 => 0x3c, 236 => 0x83, 237 => 0x53, 238 => 0x99, 239 => 0x61
+       , 240 => 0x17, 241 => 0x2b, 242 => 0x04, 243 => 0x7e, 244 => 0xba, 245 => 0x77, 246 => 0xd6, 247 => 0x26
+       , 248 => 0xe1, 249 => 0x69, 250 => 0x14, 251 => 0x63, 252 => 0x55, 253 => 0x21, 254 => 0x0c, 255 => 0x7d        }
+    }
+
+    use rand_core::{RngCore,OsRng};
+    
+    fn  get_bit_u8(x:&[u8], i:usize, j:usize) -> u8 {
+        (x[i] >> j) & 0x1 
+    }
+    
+    fn  get_bit_u16(x:&[u16], i:usize, j:usize) -> u8 {
+        ((x[j] >> i) & 0x1) as u8
+    }
+
+    #[test]
+    fn  test_transpose () {
+        let mut x = [0u8; 16];
+        OsRng.fill_bytes(&mut x);
+        let mut y = [0u16; 8];
+        super::transpose_u8x16(&x, &mut y);
+        for i in 0..16 {
+            for j in 0..8 {
+                if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
+                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y,i, j));
+                    assert!(false);
+                }
+                else {println!("transpose ok: {},{}", i, j);}
+            }
+        }
+        let mut z = [0u8; 16];
+        super::transpose_u16x8(&y, &mut z);
+        for i in 0..16 {
+            for j in 0..8 {
+                if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) {
+                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    println!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
+                    assert!(false);
+                }
+                else {println!("inv-transpose ok: {},{}", i, j);}
+            }
+        }
+    }
+
+    #[test]
+    fn  test_sbox () {
+        let mut x = [0u8; 16];
+        let mut y = [0u16; 8];
+        let mut w = [0u8; 16];
+        for i in 0..=255 {
+            x[0] = i;
+            x[9] = i;
+            super::transpose_u8x16(&x, &mut y);
+            super::sub_bytes_state(&mut y);
+            super::transpose_u16x8(&y, &mut w);
+            if w[0] != sbox_fwd(i as u8) {
+                println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
+                assert!(false);
+            }
+            else {println!("sbox ok {}", i)}
+        }
+    }
+
+    #[test]
+    fn  test_sbox_inv () {
+        let mut x = [0u8; 16];
+        let mut y = [0u16; 8];
+        let mut w = [0u8; 16];
+        for i in 0..=255 {
+            x[0] = i;
+            x[9] = i;
+            super::transpose_u8x16(&x, &mut y);
+            super::sub_bytes_inv_state(&mut y);
+            super::transpose_u16x8(&y, &mut w);
+            if w[0] != sbox_inv(i as u8) {
+                println!("sbox_inv[{}] = {}, should be {}", i, w[0], sbox_inv(i as u8));
+                assert!(false);
+            }
+            else {println!("sbox inv ok {}", i)}
+        }
+    }
+}
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/portable/gf128_core.rs b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
new file mode 100644
index 000000000..eca4b862a
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
@@ -0,0 +1,71 @@
+pub(crate) type FieldElement = u128;
+
+fn zero() -> FieldElement { 0 }
+fn load_elem(b:& [u8]) -> FieldElement {
+    debug_assert!(b.len() == 16);
+    u128::from_be_bytes(b.try_into().unwrap())
+}
+
+fn store_elem(elem:&FieldElement, b:&mut [u8]) {
+    debug_assert!(b.len() == 16);
+    b.copy_from_slice(&u128::to_be_bytes(*elem));
+}
+   
+fn add(elem: &FieldElement, other:&FieldElement) -> FieldElement {
+    elem ^ other
+}
+
+fn ith_bit_mask(elem: &FieldElement, i:usize) -> FieldElement {
+    debug_assert!(i < 128);
+    let bit:u16 = ((elem >> (127 - i)) as u16) & 0x1;
+    let bit_mask16 = (!bit).wrapping_add(1);
+    let bit_mask32 = (bit_mask16 as u32) ^ ((bit_mask16 as u32) << 16);
+    let bit_mask64 = (bit_mask32 as u64) ^ ((bit_mask32 as u64) << 32);
+    let bit_mask128 = (bit_mask64 as u128) ^ ((bit_mask64 as u128) << 64);
+    bit_mask128
+}
+
+const IRRED: FieldElement = 0xE100_0000_0000_0000_0000_0000_0000_0000;
+
+fn mul_x(elem: &mut FieldElement) {
+    let mask = ith_bit_mask(elem, 127);
+    *elem = (*elem >> 1) ^ (IRRED & mask)
+}
+
+fn mul_step(x: &FieldElement, y: &mut FieldElement, i:usize, result: &mut FieldElement) {
+    debug_assert!(i < 128);
+    let mask = ith_bit_mask(x, i);
+    *result ^= (*y & mask);
+    mul_x(y);
+}
+
+fn mul(x: &FieldElement, y:&FieldElement) -> FieldElement {
+    let mut result = 0;
+    let mut multiplicand = *y;
+    for i in 0..128{
+        mul_step(x, &mut multiplicand, i, &mut result)
+    }
+    result
+}
+
+impl crate::platform::GF128FieldElement for FieldElement {
+    fn zero() -> Self {
+        zero()
+    }
+
+    fn load_elem(b:&[u8]) -> Self {
+        load_elem(b)
+    }
+
+    fn store_elem(&self, b:&mut [u8]) {
+        store_elem(self, b);
+    }
+
+    fn add(&mut self, other:&Self) {
+        *self = add(self, other);
+    }
+
+    fn mul(&mut self, other:&Self) {
+        *self = mul(self, other)
+    }
+}

From d2217823fe1f25bda907097eeebbf651278108a6 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Sun, 20 Apr 2025 10:34:39 +0200
Subject: [PATCH 02/43] gcm portable

---
 libcrux-aesgcm/src/aes_ctr.rs                 | 321 ++++--
 libcrux-aesgcm/src/aes_generic.rs             |  38 +-
 libcrux-aesgcm/src/gf128_generic.rs           | 109 +-
 libcrux-aesgcm/src/lib.rs                     |   7 +-
 libcrux-aesgcm/src/platform.rs                |  19 +-
 libcrux-aesgcm/src/platform/portable.rs       |   2 +-
 .../src/platform/portable/aes_core.rs         | 958 +++++++++++++-----
 .../src/platform/portable/gf128_core.rs       |  30 +-
 8 files changed, 1054 insertions(+), 430 deletions(-)

diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index 70a35dd4d..eac446054 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -1,25 +1,24 @@
 #![allow(non_camel_case_types)]
 
 use crate::{aes_generic::*, platform::AESState};
-
 pub struct AES_CTR_Context<T: AESState, const NUM_KEYS: usize> {
     pub(crate) keyex: ExtendedKey<T, NUM_KEYS>,
-    pub(crate) ctr_nonce: [u8; 16]
+    pub(crate) ctr_nonce: [u8; 16],
 }
 
-pub type AES128_CTR_Context<T> = AES_CTR_Context<T, 11>;
-
-pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES128_CTR_Context<T> {
+fn aes_ctr_set_nonce<T: AESState, const NUM_KEYS: usize>(
+    ctx: &mut AES_CTR_Context<T, NUM_KEYS>,
+    nonce: &[u8],
+) {
     debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 16);
-    let mut ctr_nonce = [0u8; 16];
-    ctr_nonce[0..12].copy_from_slice(nonce);
-    AES128_CTR_Context { 
-        keyex: aes128_key_expansion(key), 
-        ctr_nonce }
+    ctx.ctr_nonce[0..12].copy_from_slice(nonce);
 }
 
-pub fn aes128_ctr_key_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, out: &mut [u8]) {
+fn aes_ctr_key_block<T: AESState, const NUM_KEYS: usize>(
+    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctr: u32,
+    out: &mut [u8],
+) {
     debug_assert!(out.len() == 16);
     let mut st_init = ctx.ctr_nonce;
     st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
@@ -29,7 +28,12 @@ pub fn aes128_ctr_key_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, o
     st.store_block(out);
 }
 
-pub fn aes128_ctr_xor_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, inp:&[u8], out: &mut [u8]) {
+fn aes_ctr_xor_block<T: AESState, const NUM_KEYS: usize>(
+    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
     debug_assert!(inp.len() == out.len() && inp.len() <= 16);
     let mut st_init = ctx.ctr_nonce;
     st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
@@ -39,104 +43,263 @@ pub fn aes128_ctr_xor_block<T: AESState>(ctx: &AES128_CTR_Context<T>, ctr:u32, i
     st.xor_block(inp, out);
 }
 
-pub fn aes128_ctr_encrypt<T: AESState>(key: &[u8], nonce: &[u8], ctr:u32, inp:&[u8], out: &mut [u8]) {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 16);
+fn aes_ctr_xor_blocks<T: AESState, const NUM_KEYS: usize>(
+    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
+    let blocks = inp.len() / 16;
+    for i in 0..blocks {
+        aes_ctr_xor_block(
+            &ctx,
+            ctr.wrapping_add(i as u32),
+            &inp[i * 16..i * 16 + 16],
+            &mut out[i * 16..i * 16 + 16],
+        );
+    }
+}
+
+fn aes_ctr_update<T: AESState, const NUM_KEYS: usize>(
+    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
     debug_assert!(inp.len() == out.len());
-    let ctx = aes128_ctr_init::<T>(key, nonce);
-    for i in 0..inp.len()/16 {
-        aes128_ctr_xor_block(&ctx, ctr.wrapping_add(i as u32), &inp[i*16..i*16+16], &mut out[i*16..i*16+16]);
+    let blocks = inp.len() / 16;
+    aes_ctr_xor_blocks(&ctx, ctr, &inp[0..blocks * 16], &mut out[0..blocks * 16]);
+    let last = inp.len() - inp.len() % 16;
+    if (last < inp.len()) {
+        aes_ctr_xor_block(
+            &ctx,
+            ctr.wrapping_add(blocks as u32),
+            &inp[last..],
+            &mut out[last..],
+        );
     }
 }
 
-pub type AES256_CTR_Context<T> = AES_CTR_Context<T, 15>;
+mod aes128_ctr {
+    use super::AES_CTR_Context;
+    use crate::{
+        aes_ctr::{
+            aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block,
+            aes_ctr_xor_blocks,
+        },
+        aes_generic::*,
+        platform::AESState,
+    };
+    pub type AES128_CTR_Context<T> = AES_CTR_Context<T, 11>;
 
-pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES256_CTR_Context<T> {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 32);
-    let mut ctr_nonce = [0u8; 16];
-    ctr_nonce[0..12].copy_from_slice(nonce);
-    AES256_CTR_Context { 
-        keyex: aes256_key_expansion(key), 
-        ctr_nonce }
-}
+    pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES128_CTR_Context<T> {
+        debug_assert!(nonce.len() == 12);
+        debug_assert!(key.len() == 16);
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..12].copy_from_slice(nonce);
+        AES128_CTR_Context {
+            keyex: aes128_key_expansion(key),
+            ctr_nonce,
+        }
+    }
 
-pub fn aes256_ctr_key_block<T: AESState>(ctx: &AES256_CTR_Context<T>, ctr:u32, out: &mut [u8]) {
-    debug_assert!(out.len() == 16);
-    let mut st_init = ctx.ctr_nonce;
-    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-    let mut st = T::new();
-    st.load_block(&st_init);
-    block_cipher(&mut st, ctx.keyex);
-    st.store_block(out);
-}
+    pub fn aes128_ctr_set_nonce<T: AESState>(ctx: &mut AES128_CTR_Context<T>, nonce: &[u8]) {
+        debug_assert!(nonce.len() == 12);
+        aes_ctr_set_nonce(ctx, nonce);
+    }
 
-pub fn aes256_ctr_xor_block<T: AESState>(ctx: &AES256_CTR_Context<T>, ctr:u32, inp:&[u8], out: &mut [u8]) {
-    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-    let mut st_init = ctx.ctr_nonce;
-    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-    let mut st = T::new();
-    st.load_block(&st_init);
-    block_cipher(&mut st, ctx.keyex);
-    st.xor_block(inp, out);
+    pub fn aes128_ctr_key_block<T: AESState>(
+        ctx: &AES128_CTR_Context<T>,
+        ctr: u32,
+        out: &mut [u8],
+    ) {
+        debug_assert!(out.len() == 16);
+        aes_ctr_key_block(ctx, ctr, out);
+    }
+
+    pub fn aes128_ctr_xor_block<T: AESState>(
+        ctx: &AES128_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        aes_ctr_xor_block(ctx, ctr, inp, out);
+    }
+
+    pub fn aes128_ctr_xor_blocks<T: AESState>(
+        ctx: &AES128_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
+        aes_ctr_xor_blocks(ctx, ctr, inp, out);
+    }
+
+    pub fn aes128_ctr_update<T: AESState>(
+        ctx: &AES128_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len());
+        aes_ctr_update(ctx, ctr, inp, out);
+    }
+
+    pub fn aes128_ctr_encrypt<T: AESState>(
+        key: &[u8],
+        nonce: &[u8],
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(nonce.len() == 12);
+        debug_assert!(key.len() == 16);
+        debug_assert!(inp.len() == out.len());
+        let ctx = aes128_ctr_init::<T>(key, nonce);
+        aes128_ctr_update(&ctx, ctr, inp, out);
+    }
 }
 
-pub fn aes256_ctr_encrypt<T: AESState>(key: &[u8], nonce: &[u8], ctr:u32, inp:&[u8], out: &mut [u8]) {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 32);
-    debug_assert!(inp.len() == out.len());
-    let ctx = aes256_ctr_init::<T>(key, nonce);
-    let blocks = inp.len() / 16;
-    for i in 0..blocks {
-        aes256_ctr_xor_block(&ctx, ctr.wrapping_add(i as u32), &inp[i*16..i*16+16], &mut out[i*16..i*16+16]);
+mod aes256_ctr {
+    use super::AES_CTR_Context;
+    use crate::{
+        aes_ctr::{
+            aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block,
+            aes_ctr_xor_blocks,
+        },
+        aes_generic::*,
+        platform::AESState,
+    };
+
+    pub type AES256_CTR_Context<T> = AES_CTR_Context<T, 15>;
+
+    pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES256_CTR_Context<T> {
+        debug_assert!(nonce.len() == 12);
+        debug_assert!(key.len() == 32);
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..12].copy_from_slice(nonce);
+        AES256_CTR_Context {
+            keyex: aes256_key_expansion(key),
+            ctr_nonce,
+        }
     }
-    let last = inp.len() - inp.len() % 16;
-    if (last < inp.len()) {        
-        aes256_ctr_xor_block(&ctx, ctr.wrapping_add(blocks as u32), &inp[last..], &mut out[last..]);
+
+    pub fn aes256_ctr_key_block<T: AESState>(
+        ctx: &AES256_CTR_Context<T>,
+        ctr: u32,
+        out: &mut [u8],
+    ) {
+        debug_assert!(out.len() == 16);
+        aes_ctr_key_block(ctx, ctr, out);
+    }
+
+    pub fn aes256_ctr_set_nonce<T: AESState>(ctx: &mut AES256_CTR_Context<T>, nonce: &[u8]) {
+        debug_assert!(nonce.len() == 12);
+        aes_ctr_set_nonce(ctx, nonce);
+    }
+
+    pub fn aes256_ctr_xor_block<T: AESState>(
+        ctx: &AES256_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        aes_ctr_xor_block(ctx, ctr, inp, out);
+    }
+
+    pub fn aes256_ctr_xor_blocks<T: AESState>(
+        ctx: &AES256_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
+        aes_ctr_xor_blocks(ctx, ctr, inp, out);
+    }
+
+    pub fn aes256_ctr_update<T: AESState>(
+        ctx: &AES256_CTR_Context<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len());
+        aes_ctr_update(ctx, ctr, inp, out);
+    }
+
+    pub fn aes256_ctr_encrypt<T: AESState>(
+        key: &[u8],
+        nonce: &[u8],
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(nonce.len() == 12);
+        debug_assert!(key.len() == 32);
+        debug_assert!(inp.len() == out.len());
+        let ctx = aes256_ctr_init::<T>(key, nonce);
+        aes256_ctr_update(&ctx, ctr, inp, out);
     }
 }
 
+pub use aes128_ctr::*;
+pub use aes256_ctr::*;
+
 #[cfg(test)]
- mod test {
+mod test {
     use crate::platform;
 
-    use super::{aes128_ctr_init, aes128_ctr_xor_block, aes128_ctr_encrypt};
+    use super::{aes128_ctr_encrypt, aes128_ctr_init, aes128_ctr_xor_block};
 
     const input: [u8; 32] = [
-            0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
-            0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
-            0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
-            0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F];
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
+        0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D,
+        0x1E, 0x1F,
+    ];
     const key: [u8; 16] = [
-        0x7E,0x24,0x06,0x78,0x17,0xFA,0xE0,0xD7,
-        0x43,0xD6,0xCE,0x1F,0x32,0x53,0x91,0x63];
+        0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91,
+        0x63,
+    ];
     const nonce: [u8; 12] = [
-        0x00,0x6C,0xB6,0xDB,0xC0,0x54,0x3B,0x59,
-        0xDA,0x48,0xD9,0x0B];
+        0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B,
+    ];
     const expected: [u8; 32] = [
-        0x51,0x04,0xA1,0x06,0x16,0x8A,0x72,0xD9,
-        0x79,0x0D,0x41,0xEE,0x8E,0xDA,0xD3,0x88,
-        0xEB,0x2E,0x1E,0xFC,0x46,0xDA,0x57,0xC8,
-        0xFC,0xE6,0x30,0xDF,0x91,0x41,0xBE,0x28];
+        0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3,
+        0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41,
+        0xBE, 0x28,
+    ];
 
     #[test]
-    fn  test_ctr_block () {        
+    fn test_ctr_block() {
         let mut computed: [u8; 32] = [0u8; 32];
         let ctx = aes128_ctr_init::<platform::portable::State>(&key, &nonce);
         aes128_ctr_xor_block(&ctx, 1, &input[0..16], &mut computed[0..16]);
         aes128_ctr_xor_block(&ctx, 2, &input[16..32], &mut computed[16..32]);
-        for i in 0..32{
-            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        for i in 0..32 {
+            if computed[i] != expected[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, expected[i], computed[i]
+                )
+            }
         }
     }
 
     #[test]
-    fn  test_ctr_encrypt () {        
+    fn test_ctr_encrypt() {
         let mut computed: [u8; 32] = [0u8; 32];
         aes128_ctr_encrypt::<platform::portable::State>(&key, &nonce, 1, &input, &mut computed);
-        for i in 0..32{
-            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        for i in 0..32 {
+            if computed[i] != expected[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, expected[i], computed[i]
+                )
+            }
         }
     }
-
- }
\ No newline at end of file
+}
diff --git a/libcrux-aesgcm/src/aes_generic.rs b/libcrux-aesgcm/src/aes_generic.rs
index c4c4228fd..862a03397 100644
--- a/libcrux-aesgcm/src/aes_generic.rs
+++ b/libcrux-aesgcm/src/aes_generic.rs
@@ -1,53 +1,51 @@
 use crate::platform::*;
 
-pub(crate) type ExtendedKey<T, const NUM_KEYS:usize> = [T; NUM_KEYS];
+pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 
 const RCON: [u8; 11] = [
-    0x8d, 0x01, 0x02, 0x04,
-    0x08, 0x10, 0x20, 0x40,
-    0x80, 0x1b, 0x36
+    0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
 ];
 
-pub(crate) fn aes128_key_expansion<T:AESState>(key: &[u8]) -> ExtendedKey<T,11> {
+pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11> {
     debug_assert!(key.len() == 16);
     let mut keyex = [T::new(); 11];
     keyex[0].load_block(&key);
     for i in 1..11 {
-        let prev = keyex[i-1];
-        keyex[i].aes_keygen_assist0(&prev,RCON[i]);
+        let prev = keyex[i - 1];
+        keyex[i].aes_keygen_assist0(&prev, RCON[i]);
         keyex[i].key_expansion_step(&prev);
     }
     keyex
 }
 
-pub(crate) fn aes256_key_expansion<T:AESState>(key: &[u8]) -> ExtendedKey<T,15> {
+pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15> {
     debug_assert!(key.len() == 32);
     let mut keyex = [T::new(); 15];
     keyex[0].load_block(&key[0..16]);
     keyex[1].load_block(&key[16..32]);
     for i in 2..14 {
-        let prev0 = keyex[i-2];
-        let prev1 = keyex[i-1];
-        keyex[i].aes_keygen_assist0(&prev1,RCON[i/2]);
+        let prev0 = keyex[i - 2];
+        let prev1 = keyex[i - 1];
+        keyex[i].aes_keygen_assist0(&prev1, RCON[i / 2]);
         keyex[i].key_expansion_step(&prev0);
         let next0 = keyex[i];
-        keyex[i+1].aes_keygen_assist1(&next0);
-        keyex[i+1].key_expansion_step(&prev1);
+        keyex[i + 1].aes_keygen_assist1(&next0);
+        keyex[i + 1].key_expansion_step(&prev1);
     }
     let prev0 = keyex[12];
     let prev1 = keyex[13];
-    keyex[14].aes_keygen_assist0(&prev1,RCON[7]);
+    keyex[14].aes_keygen_assist0(&prev1, RCON[7]);
     keyex[14].key_expansion_step(&prev0);
     keyex
 }
-    
 
-pub(crate) fn block_cipher<T:AESState, const NUM_KEYS:usize>(
-        st: &mut T, keyex: ExtendedKey<T, NUM_KEYS>) {
+pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
+    st: &mut T,
+    keyex: ExtendedKey<T, NUM_KEYS>,
+) {
     st.xor_key(&keyex[0]);
-    for i in 1..NUM_KEYS-1 {
+    for i in 1..NUM_KEYS - 1 {
         st.aes_enc(&keyex[i]);
     }
-    st.aes_enc_last(&keyex[NUM_KEYS-1]);
+    st.aes_enc_last(&keyex[NUM_KEYS - 1]);
 }
-
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 7d4e3ad13..07e139063 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -2,90 +2,101 @@ use crate::platform::*;
 
 pub struct GF128State<T: GF128FieldElement> {
     accumulator: T,
-    r: T
+    r: T,
 }
 
-fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
+pub fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
     debug_assert!(key.len() == 16);
-    GF128State { accumulator: T::zero(), r: T::load_elem(key) }
+    GF128State {
+        accumulator: T::zero(),
+        r: T::load_elem(key),
+    }
 }
 
-fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block:&[u8]) {
+pub fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block: &[u8]) {
     debug_assert!(block.len() == 16);
     let block_elem = T::load_elem(block);
     st.accumulator.add(&block_elem);
-    st.accumulator.mul(&st.r);   
+    st.accumulator.mul(&st.r);
+}
+
+pub fn gf128_update_blocks<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
+    debug_assert!(input.len() % 16 == 0);
+    let blocks = input.len() / 16;
+    for i in 0..blocks {
+        gf128_update(st, &input[i * 16..i * 16 + 16]);
+    }
 }
 
-fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_block:&[u8]) {
+pub fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_block: &[u8]) {
     debug_assert!(partial_block.len() < 16);
     let mut block = [0u8; 16];
     block[0..partial_block.len()].copy_from_slice(partial_block);
     gf128_update(st, &block);
 }
 
-fn gf128_emit<T: GF128FieldElement>(st: &GF128State<T>, out:&mut [u8]) {
+pub fn gf128_update_padded<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
+    let blocks = input.len() / 16;
+    gf128_update_blocks(st, &input[0..blocks * 16]);
+    let last = input.len() - input.len() % 16;
+    if last < input.len() {
+        gf128_update_last(st, &input[last..]);
+    }
+}
+
+pub fn gf128_emit<T: GF128FieldElement>(st: &GF128State<T>, out: &mut [u8]) {
     debug_assert!(out.len() == 16);
     st.accumulator.store_elem(out);
 }
- 
-fn gf128<T: GF128FieldElement>(key: &[u8], inp:&[u8], out:&mut [u8]) {
+
+pub fn gf128<T: GF128FieldElement>(key: &[u8], inp: &[u8], out: &mut [u8]) {
     debug_assert!(key.len() == 16);
     debug_assert!(out.len() == 16);
 
     let mut st = gf128_init::<T>(key);
-    let blocks = inp.len() / 16;
-    for i in 0..blocks {
-        gf128_update(&mut st, &inp[i*16..i*16+16]);
-    }
-    let last = inp.len() - inp.len() % 16;
-    if (last < inp.len()) {        
-        gf128_update_last(&mut st, &inp[last..]);
-    }
+    gf128_update_padded(&mut st, inp);
     gf128_emit(&st, out);
 }
 
-
 #[cfg(test)]
- mod test {
+mod test {
     use crate::platform;
 
     use super::gf128;
 
-    const input: [u8;132] = [
-        0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-        0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-        0xab,0xad,0xda,0xd2,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,
-        0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
-        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,
-        0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
-        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,
-        0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
-        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,
-        0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,
-        0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
-        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,
-        0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
-        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,
-        0x44,0xae,0x7e,0x3f];
+    const input: [u8; 132] = [
+        0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
+        0xef, 0xab, 0xad, 0xda, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65,
+        0x9e, 0x2a, 0x20, 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f,
+        0x6f, 0x74, 0x6b, 0xf4, 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x2d, 0xa3, 0xeb,
+        0xf1, 0xc5, 0xd8, 0x2c, 0xde, 0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, 0x5a, 0x8d,
+        0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, 0xee,
+        0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+        0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x44, 0xae, 0x7e, 0x3f,
+    ];
+
+    const key: [u8; 16] = [
+        0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda,
+        0xd7,
+    ];
 
-    const key: [u8;16] = [
-        0xac,0xbe,0xf2,0x05,0x79,0xb4,0xb8,0xeb,
-        0xce,0x88,0x9b,0xac,0x87,0x32,0xda,0xd7];
+    const expected: [u8; 16] = [
+        0xfb, 0xba, 0xaa, 0x70, 0xa0, 0x73, 0x6f, 0xf9, 0xed, 0x2f, 0xc4, 0x62, 0xde, 0x72, 0x61,
+        0xe0,
+    ];
 
-    const expected: [u8;16] = [
-        0xfb,0xba,0xaa,0x70,0xa0,0x73,0x6f,0xf9,
-        0xed,0x2f,0xc4,0x62,0xde,0x72,0x61,0xe0];
-    
-    
     #[test]
-    fn  test_gf128 () {        
+    fn test_gf128() {
         let mut computed: [u8; 16] = [0u8; 16];
         gf128::<crate::platform::portable::FieldElement>(&key, &input, &mut computed);
-        for i in 0..16{
-            if computed[i] != expected[i] {println!("mismatch at {}: expected is {}, computed is {}", i, expected[i], computed[i])}
+        for i in 0..16 {
+            if computed[i] != expected[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, expected[i], computed[i]
+                )
+            }
         }
     }
- }
+}
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 95d421bab..4c4ce69d6 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -1,4 +1,5 @@
-mod platform;
+pub mod aes_ctr;
+mod aes_gcm;
 mod aes_generic;
-mod gf128_generic;
-pub mod aes_ctr;
\ No newline at end of file
+pub mod gf128_generic;
+mod platform;
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 694d0bd48..25b961ccf 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -1,24 +1,23 @@
-  
 pub mod portable;
 
 pub trait AESState: Copy {
     fn new() -> Self;
-    fn load_block(&mut self, b:&[u8]);
-    fn store_block(&self, out:&mut[u8]);
-    fn xor_block(&self, inp:&[u8], out:&mut[u8]);
+    fn load_block(&mut self, b: &[u8]);
+    fn store_block(&self, out: &mut [u8]);
+    fn xor_block(&self, inp: &[u8], out: &mut [u8]);
 
     fn xor_key(&mut self, key: &Self);
     fn aes_enc(&mut self, key: &Self);
     fn aes_enc_last(&mut self, key: &Self);
     fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8);
     fn aes_keygen_assist1(&mut self, prev: &Self);
-    fn key_expansion_step(&mut self, prev:&Self);
+    fn key_expansion_step(&mut self, prev: &Self);
 }
 
 pub trait GF128FieldElement: Copy {
     fn zero() -> Self;
-    fn load_elem(b:&[u8]) -> Self;
-    fn store_elem(&self, b:&mut [u8]);
-    fn add(&mut self, other:&Self);
-    fn mul(&mut self, other:&Self);
-}
\ No newline at end of file
+    fn load_elem(b: &[u8]) -> Self;
+    fn store_elem(&self, b: &mut [u8]);
+    fn add(&mut self, other: &Self);
+    fn mul(&mut self, other: &Self);
+}
diff --git a/libcrux-aesgcm/src/platform/portable.rs b/libcrux-aesgcm/src/platform/portable.rs
index 9d14728fe..7fe9d7462 100644
--- a/libcrux-aesgcm/src/platform/portable.rs
+++ b/libcrux-aesgcm/src/platform/portable.rs
@@ -1,4 +1,4 @@
 mod aes_core;
 mod gf128_core;
 pub(crate) use aes_core::State;
-pub(crate) use gf128_core::FieldElement;
\ No newline at end of file
+pub(crate) use gf128_core::FieldElement;
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
index e89d4a284..8f2b10178 100644
--- a/libcrux-aesgcm/src/platform/portable/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -1,10 +1,10 @@
 pub(crate) type State = [u16; 8];
 
-fn  new_state() -> State {
+fn new_state() -> State {
     [0u16; 8]
 }
 
-fn  interleave_u8_1(i0:u8, i1:u8) -> u16 {
+fn interleave_u8_1(i0: u8, i1: u8) -> u16 {
     let mut x = i0 as u16;
     x = (x | (x << 4)) & 0x0F0F;
     x = (x | (x << 2)) & 0x3333;
@@ -14,9 +14,9 @@ fn  interleave_u8_1(i0:u8, i1:u8) -> u16 {
     y = (y | (y << 2)) & 0x3333;
     y = (y | (y << 1)) & 0x5555;
     x | (y << 1)
- }
- 
- fn  deinterleave_u8_1(i0:u16) -> (u8,u8) {
+}
+
+fn deinterleave_u8_1(i0: u16) -> (u8, u8) {
     let mut x = i0 & 0x5555;
     x = (x | (x >> 1)) & 0x3333;
     x = (x | (x >> 2)) & 0x0F0F;
@@ -26,80 +26,79 @@ fn  interleave_u8_1(i0:u8, i1:u8) -> u16 {
     y = (y | (y >> 2)) & 0x0F0F;
     y = (y | (y >> 4)) & 0x00FF;
     (x as u8, y as u8)
- }
-
- fn  interleave_u16_2(i0:u16, i1:u16) -> (u16,u16) {
-     let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333);
-     let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc);
-     (x,y)
- }
- 
- 
- fn  interleave_u16_4(i0:u16, i1:u16) -> (u16,u16) {
-     let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F);
-     let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0);
-     (x,y)
- }
- 
- fn  interleave_u16_8(i0:u16, i1:u16) -> (u16,u16) {
-     let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF);
-     let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00);
-     (x,y)
- }
- 
- fn  transpose_u8x16(input: &[u8;16], output: &mut [u16;8]) {
-     let o0 = interleave_u8_1(input[0], input[1]);
-     let o1 = interleave_u8_1(input[2], input[3]);
-     let o2 = interleave_u8_1(input[4], input[5]);
-     let o3 = interleave_u8_1(input[6], input[7]);
-     let o4 = interleave_u8_1(input[8], input[9]);
-     let o5 = interleave_u8_1(input[10], input[11]);
-     let o6 = interleave_u8_1(input[12], input[13]);
-     let o7 = interleave_u8_1(input[14], input[15]);
-     let (o0,o1) = interleave_u16_2(o0, o1);
-     let (o2,o3) = interleave_u16_2(o2, o3);
-     let (o4,o5) = interleave_u16_2(o4, o5);
-     let (o6,o7) = interleave_u16_2(o6, o7);
-     let (o0,o2) = interleave_u16_4(o0, o2);
-     let (o1,o3) = interleave_u16_4(o1, o3);
-     let (o4,o6) = interleave_u16_4(o4, o6);
-     let (o5,o7) = interleave_u16_4(o5, o7);
-     let (o0,o4) = interleave_u16_8(o0, o4);
-     let (o1,o5) = interleave_u16_8(o1, o5);
-     let (o2,o6) = interleave_u16_8(o2, o6);
-     let (o3,o7) = interleave_u16_8(o3, o7);
-     output[0] = o0;
-     output[1] = o1;
-     output[2] = o2;
-     output[3] = o3;
-     output[4] = o4;
-     output[5] = o5;
-     output[6] = o6;
-     output[7] = o7;
- }
- 
- fn  transpose_u16x8(input: &[u16;8], output: &mut [u8]) {
-    let (i0,i4) = interleave_u16_8(input[0], input[4]);
-    let (i1,i5) = interleave_u16_8(input[1], input[5]);
-    let (i2,i6) = interleave_u16_8(input[2], input[6]);
-    let (i3,i7) = interleave_u16_8(input[3], input[7]);
-    let (i0,i2) = interleave_u16_4(i0, i2);
-    let (i1,i3) = interleave_u16_4(i1, i3);
-    let (i4,i6) = interleave_u16_4(i4, i6);
-    let (i5,i7) = interleave_u16_4(i5, i7);
-    let (i0,i1) = interleave_u16_2(i0, i1);
-    let (i2,i3) = interleave_u16_2(i2, i3);
-    let (i4,i5) = interleave_u16_2(i4, i5);
-    let (i6,i7) = interleave_u16_2(i6, i7);
-
-    let (o0,o1) = deinterleave_u8_1(i0);
-    let (o2,o3) = deinterleave_u8_1(i1);
-    let (o4,o5) = deinterleave_u8_1(i2);
-    let (o6,o7) = deinterleave_u8_1(i3);
-    let (o8,o9) = deinterleave_u8_1(i4);
-    let (o10,o11) = deinterleave_u8_1(i5);
-    let (o12,o13) = deinterleave_u8_1(i6);
-    let (o14,o15) = deinterleave_u8_1(i7);
+}
+
+fn interleave_u16_2(i0: u16, i1: u16) -> (u16, u16) {
+    let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333);
+    let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc);
+    (x, y)
+}
+
+fn interleave_u16_4(i0: u16, i1: u16) -> (u16, u16) {
+    let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F);
+    let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0);
+    (x, y)
+}
+
+fn interleave_u16_8(i0: u16, i1: u16) -> (u16, u16) {
+    let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF);
+    let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00);
+    (x, y)
+}
+
+fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
+    let o0 = interleave_u8_1(input[0], input[1]);
+    let o1 = interleave_u8_1(input[2], input[3]);
+    let o2 = interleave_u8_1(input[4], input[5]);
+    let o3 = interleave_u8_1(input[6], input[7]);
+    let o4 = interleave_u8_1(input[8], input[9]);
+    let o5 = interleave_u8_1(input[10], input[11]);
+    let o6 = interleave_u8_1(input[12], input[13]);
+    let o7 = interleave_u8_1(input[14], input[15]);
+    let (o0, o1) = interleave_u16_2(o0, o1);
+    let (o2, o3) = interleave_u16_2(o2, o3);
+    let (o4, o5) = interleave_u16_2(o4, o5);
+    let (o6, o7) = interleave_u16_2(o6, o7);
+    let (o0, o2) = interleave_u16_4(o0, o2);
+    let (o1, o3) = interleave_u16_4(o1, o3);
+    let (o4, o6) = interleave_u16_4(o4, o6);
+    let (o5, o7) = interleave_u16_4(o5, o7);
+    let (o0, o4) = interleave_u16_8(o0, o4);
+    let (o1, o5) = interleave_u16_8(o1, o5);
+    let (o2, o6) = interleave_u16_8(o2, o6);
+    let (o3, o7) = interleave_u16_8(o3, o7);
+    output[0] = o0;
+    output[1] = o1;
+    output[2] = o2;
+    output[3] = o3;
+    output[4] = o4;
+    output[5] = o5;
+    output[6] = o6;
+    output[7] = o7;
+}
+
+fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
+    let (i0, i4) = interleave_u16_8(input[0], input[4]);
+    let (i1, i5) = interleave_u16_8(input[1], input[5]);
+    let (i2, i6) = interleave_u16_8(input[2], input[6]);
+    let (i3, i7) = interleave_u16_8(input[3], input[7]);
+    let (i0, i2) = interleave_u16_4(i0, i2);
+    let (i1, i3) = interleave_u16_4(i1, i3);
+    let (i4, i6) = interleave_u16_4(i4, i6);
+    let (i5, i7) = interleave_u16_4(i5, i7);
+    let (i0, i1) = interleave_u16_2(i0, i1);
+    let (i2, i3) = interleave_u16_2(i2, i3);
+    let (i4, i5) = interleave_u16_2(i4, i5);
+    let (i6, i7) = interleave_u16_2(i6, i7);
+
+    let (o0, o1) = deinterleave_u8_1(i0);
+    let (o2, o3) = deinterleave_u8_1(i1);
+    let (o4, o5) = deinterleave_u8_1(i2);
+    let (o6, o7) = deinterleave_u8_1(i3);
+    let (o8, o9) = deinterleave_u8_1(i4);
+    let (o10, o11) = deinterleave_u8_1(i5);
+    let (o12, o13) = deinterleave_u8_1(i6);
+    let (o14, o15) = deinterleave_u8_1(i7);
 
     output[0] = o0;
     output[1] = o1;
@@ -119,13 +118,12 @@ fn  interleave_u8_1(i0:u8, i1:u8) -> u16 {
     output[15] = o15;
 }
 
-
-fn  xnor(a:u16, b:u16) -> u16 {
+fn xnor(a: u16, b: u16) -> u16 {
     !(a ^ b)
 }
 
 #[allow(non_snake_case)]
-fn  sub_bytes_state(st:&mut State) {
+fn sub_bytes_state(st: &mut State) {
     let U0 = st[7];
     let U1 = st[6];
     let U2 = st[5];
@@ -245,22 +243,22 @@ fn  sub_bytes_state(st:&mut State) {
     let T124 = T104 ^ T115;
     let T125 = T111 ^ T116;
     let S0 = T109 ^ T122;
-    let S2 = xnor(T123,T124);
+    let S2 = xnor(T123, T124);
     let T128 = T94 ^ T107;
     let S3 = T113 ^ T114;
     let S4 = T118 ^ T128;
     let T131 = T93 ^ T101;
     let T132 = T112 ^ T120;
-    let S7 = xnor(T113,T125);
+    let S7 = xnor(T113, T125);
     let T134 = T97 ^ T116;
     let T135 = T131 ^ T134;
     let T136 = T93 ^ T115;
-    let S6 = xnor(T109,T135);
+    let S6 = xnor(T109, T135);
     let T138 = T119 ^ T132;
     let S5 = T109 ^ T138;
     let T140 = T114 ^ T136;
-    let S1 = xnor(T109,T140);
-    
+    let S1 = xnor(T109, T140);
+
     st[0] = S7;
     st[1] = S6;
     st[2] = S5;
@@ -270,10 +268,9 @@ fn  sub_bytes_state(st:&mut State) {
     st[6] = S1;
     st[7] = S0;
 }
- 
 
 #[allow(non_snake_case)]
-fn  sub_bytes_inv_state(st:&mut State) {
+fn sub_bytes_inv_state(st: &mut State) {
     let U0 = st[7];
     let U1 = st[6];
     let U2 = st[5];
@@ -410,7 +407,7 @@ fn  sub_bytes_inv_state(st:&mut State) {
     let W5 = P19 ^ P24;
     let W6 = P14 ^ P23;
     let W7 = P9 ^ P16;
-    
+
     st[0] = W7;
     st[1] = W6;
     st[2] = W5;
@@ -421,14 +418,14 @@ fn  sub_bytes_inv_state(st:&mut State) {
     st[7] = W0;
 }
 
-fn  shift_row_u16(input:u16) -> u16 {
-   (input & 0x1111) |
-   ((input & 0x2220) >> 4) |
-   ((input & 0x0002) << 12) |
-   ((input & 0x4400) >> 8) |
-   ((input & 0x0044) << 8) |
-   ((input & 0x8000) >> 12) |
-   ((input & 0x0888) << 4)
+fn shift_row_u16(input: u16) -> u16 {
+    (input & 0x1111)
+        | ((input & 0x2220) >> 4)
+        | ((input & 0x0002) << 12)
+        | ((input & 0x4400) >> 8)
+        | ((input & 0x0044) << 8)
+        | ((input & 0x8000) >> 12)
+        | ((input & 0x0888) << 4)
 }
 
 fn shift_rows_state(st: &mut State) {
@@ -445,12 +442,8 @@ fn shift_rows_state(st: &mut State) {
 fn mix_columns_state(st: &mut State) {
     let mut last_col: u16 = 0;
     for i in 0..8 {
-        let col = st[i] ^
-            (((st[i] & 0xeeee) >> 1)
-            | ((st[i] & 0x1111) << 3));
-        st[i] = st[i] ^ last_col ^ col ^
-                (((col & 0xcccc) >> 2)
-                | ((col & 0x3333) << 2));
+        let col = st[i] ^ (((st[i] & 0xeeee) >> 1) | ((st[i] & 0x1111) << 3));
+        st[i] = st[i] ^ last_col ^ col ^ (((col & 0xcccc) >> 2) | ((col & 0x3333) << 2));
         last_col = col;
     }
     st[0] ^= last_col;
@@ -459,7 +452,7 @@ fn mix_columns_state(st: &mut State) {
     st[4] ^= last_col;
 }
 
-fn  xor_key1_state(st: &mut State, k:&State) {
+fn xor_key1_state(st: &mut State, k: &State) {
     st[0] ^= k[0];
     st[1] ^= k[1];
     st[2] ^= k[2];
@@ -470,86 +463,84 @@ fn  xor_key1_state(st: &mut State, k:&State) {
     st[7] ^= k[7];
 }
 
-fn  aes_enc(st: &mut State, key: &State) {
+fn aes_enc(st: &mut State, key: &State) {
     sub_bytes_state(st);
     shift_rows_state(st);
     mix_columns_state(st);
     xor_key1_state(st, key)
 }
 
-fn  aes_enc_last(st: &mut State, key: &State) {
+fn aes_enc_last(st: &mut State, key: &State) {
     sub_bytes_state(st);
     shift_rows_state(st);
     xor_key1_state(st, key)
 }
 
-fn aes_keygen_assisti(rcon:u8, i:usize, u:u16) -> u16 {
-  let u3 = u & 0xf000;
-  let n = u3 >> 12;
-  let n = ((n >> 1) | (n << 3)) & 0x000f;
-  let ri = ((rcon >> i) & 1) as u16;
-  let n = n ^ ri;
-  let n = n << 12;
-  n ^ (u3 >> 4)
+fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 {
+    let u3 = u & 0xf000;
+    let n = u3 >> 12;
+    let n = ((n >> 1) | (n << 3)) & 0x000f;
+    let ri = ((rcon >> i) & 1) as u16;
+    let n = n ^ ri;
+    let n = n << 12;
+    n ^ (u3 >> 4)
 }
 
-fn  aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
-  next.copy_from_slice(prev);
-  sub_bytes_state(next);
-  next[0] = aes_keygen_assisti(rcon, 0, next[0]);
-  next[1] = aes_keygen_assisti(rcon, 1, next[1]);
-  next[2] = aes_keygen_assisti(rcon, 2, next[2]);
-  next[3] = aes_keygen_assisti(rcon, 3, next[3]);
-  next[4] = aes_keygen_assisti(rcon, 4, next[4]);
-  next[5] = aes_keygen_assisti(rcon, 5, next[5]);
-  next[6] = aes_keygen_assisti(rcon, 6, next[6]);
-  next[7] = aes_keygen_assisti(rcon, 7, next[7]);
+fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
+    next.copy_from_slice(prev);
+    sub_bytes_state(next);
+    next[0] = aes_keygen_assisti(rcon, 0, next[0]);
+    next[1] = aes_keygen_assisti(rcon, 1, next[1]);
+    next[2] = aes_keygen_assisti(rcon, 2, next[2]);
+    next[3] = aes_keygen_assisti(rcon, 3, next[3]);
+    next[4] = aes_keygen_assisti(rcon, 4, next[4]);
+    next[5] = aes_keygen_assisti(rcon, 5, next[5]);
+    next[6] = aes_keygen_assisti(rcon, 6, next[6]);
+    next[7] = aes_keygen_assisti(rcon, 7, next[7]);
 }
 
-fn  aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8){
+fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
-    fn  aux(mut n:u16) -> u16 {
-      n &= 0xf000;
-      n ^= n >> 4;
-      n ^= n >> 8;
-      n
+    fn aux(mut n: u16) -> u16 {
+        n &= 0xf000;
+        n ^= n >> 4;
+        n ^= n >> 8;
+        n
     }
-    next[0] = aux(next[0]);   
-    next[1] = aux(next[1]);   
-    next[2] = aux(next[2]);   
-    next[3] = aux(next[3]);   
-    next[4] = aux(next[4]);   
-    next[5] = aux(next[5]);   
-    next[6] = aux(next[6]);   
-    next[7] = aux(next[7]);   
-}   
-
-
-fn  aes_keygen_assist1(next: &mut State, prev: &State){
+    next[0] = aux(next[0]);
+    next[1] = aux(next[1]);
+    next[2] = aux(next[2]);
+    next[3] = aux(next[3]);
+    next[4] = aux(next[4]);
+    next[5] = aux(next[5]);
+    next[6] = aux(next[6]);
+    next[7] = aux(next[7]);
+}
+
+fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
-    fn  aux(mut n:u16) -> u16 {
-      n &= 0x0f00;
-      n ^= (n << 4);
-      n ^= (n >> 8);
-      n
+    fn aux(mut n: u16) -> u16 {
+        n &= 0x0f00;
+        n ^= (n << 4);
+        n ^= (n >> 8);
+        n
     }
-    next[0] = aux(next[0]);   
-    next[1] = aux(next[1]);   
-    next[2] = aux(next[2]);   
-    next[3] = aux(next[3]);   
-    next[4] = aux(next[4]);   
-    next[5] = aux(next[5]);   
-    next[6] = aux(next[6]);   
-    next[7] = aux(next[7]);   
-}   
-
-fn key_expand1(p:u16, n:u16) -> u16 {
-  let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8)
-            ^ ((p & 0x000f) << 12);
-  n ^ p
+    next[0] = aux(next[0]);
+    next[1] = aux(next[1]);
+    next[2] = aux(next[2]);
+    next[3] = aux(next[3]);
+    next[4] = aux(next[4]);
+    next[5] = aux(next[5]);
+    next[6] = aux(next[6]);
+    next[7] = aux(next[7]);
+}
+
+fn key_expand1(p: u16, n: u16) -> u16 {
+    let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8) ^ ((p & 0x000f) << 12);
+    n ^ p
 }
 
-fn  key_expansion_step(next: &mut State, prev: &State) {
+fn key_expansion_step(next: &mut State, prev: &State) {
     next[0] = key_expand1(prev[0], next[0]);
     next[1] = key_expand1(prev[1], next[1]);
     next[2] = key_expand1(prev[2], next[2]);
@@ -565,17 +556,17 @@ impl crate::platform::AESState for State {
         new_state()
     }
 
-    fn load_block(&mut self, b:&[u8]) {
+    fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
         transpose_u8x16(b.try_into().unwrap(), self);
     }
 
-    fn store_block(&self, out:&mut[u8]) {
+    fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
         transpose_u16x8(self, out);
     }
 
-    fn xor_block(&self, inp:&[u8], out:&mut[u8]) {
+    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
         let mut block = [0u8; 16];
         self.store_block(&mut block);
@@ -589,7 +580,8 @@ impl crate::platform::AESState for State {
     }
 
     fn aes_enc(&mut self, key: &Self) {
-        aes_enc(self, key);(self, key);
+        aes_enc(self, key);
+        (self, key);
     }
 
     fn aes_enc_last(&mut self, key: &Self) {
@@ -604,98 +596,547 @@ impl crate::platform::AESState for State {
         aes_keygen_assist1(self, prev);
     }
 
-    fn key_expansion_step(&mut self, prev:&Self) {
+    fn key_expansion_step(&mut self, prev: &Self) {
         key_expansion_step(self, prev)
     }
 }
 
- #[cfg(test)]
- mod test {
-    fn  sbox_fwd(s:u8) -> u8 {
+#[cfg(test)]
+mod test {
+    fn sbox_fwd(s: u8) -> u8 {
         match s {
-            0 => 0x63 ,   1 => 0x7c ,   2 => 0x77 ,   3 => 0x7b ,   4 => 0xf2 ,   5 => 0x6b ,   6 => 0x6f ,   7 => 0xc5
-        ,   8 => 0x30 ,   9 => 0x01 ,  10 => 0x67 ,  11 => 0x2b ,  12 => 0xfe ,  13 => 0xd7 ,  14 => 0xab ,  15 => 0x76
-        ,  16 => 0xca ,  17 => 0x82 ,  18 => 0xc9 ,  19 => 0x7d ,  20 => 0xfa ,  21 => 0x59 ,  22 => 0x47 ,  23 => 0xf0
-        ,  24 => 0xad ,  25 => 0xd4 ,  26 => 0xa2 ,  27 => 0xaf ,  28 => 0x9c ,  29 => 0xa4 ,  30 => 0x72 ,  31 => 0xc0
-        ,  32 => 0xb7 ,  33 => 0xfd ,  34 => 0x93 ,  35 => 0x26 ,  36 => 0x36 ,  37 => 0x3f ,  38 => 0xf7 ,  39 => 0xcc
-        ,  40 => 0x34 ,  41 => 0xa5 ,  42 => 0xe5 ,  43 => 0xf1 ,  44 => 0x71 ,  45 => 0xd8 ,  46 => 0x31 ,  47 => 0x15
-        ,  48 => 0x04 ,  49 => 0xc7 ,  50 => 0x23 ,  51 => 0xc3 ,  52 => 0x18 ,  53 => 0x96 ,  54 => 0x05 ,  55 => 0x9a
-        ,  56 => 0x07 ,  57 => 0x12 ,  58 => 0x80 ,  59 => 0xe2 ,  60 => 0xeb ,  61 => 0x27 ,  62 => 0xb2 ,  63 => 0x75
-        ,  64 => 0x09 ,  65 => 0x83 ,  66 => 0x2c ,  67 => 0x1a ,  68 => 0x1b ,  69 => 0x6e ,  70 => 0x5a ,  71 => 0xa0
-        ,  72 => 0x52 ,  73 => 0x3b ,  74 => 0xd6 ,  75 => 0xb3 ,  76 => 0x29 ,  77 => 0xe3 ,  78 => 0x2f ,  79 => 0x84
-        ,  80 => 0x53 ,  81 => 0xd1 ,  82 => 0x00 ,  83 => 0xed ,  84 => 0x20 ,  85 => 0xfc ,  86 => 0xb1 ,  87 => 0x5b
-        ,  88 => 0x6a ,  89 => 0xcb ,  90 => 0xbe ,  91 => 0x39 ,  92 => 0x4a ,  93 => 0x4c ,  94 => 0x58 ,  95 => 0xcf
-        ,  96 => 0xd0 ,  97 => 0xef ,  98 => 0xaa ,  99 => 0xfb , 100 => 0x43 , 101 => 0x4d , 102 => 0x33 , 103 => 0x85
-        , 104 => 0x45 , 105 => 0xf9 , 106 => 0x02 , 107 => 0x7f , 108 => 0x50 , 109 => 0x3c , 110 => 0x9f , 111 => 0xa8
-        , 112 => 0x51 , 113 => 0xa3 , 114 => 0x40 , 115 => 0x8f , 116 => 0x92 , 117 => 0x9d , 118 => 0x38 , 119 => 0xf5
-        , 120 => 0xbc , 121 => 0xb6 , 122 => 0xda , 123 => 0x21 , 124 => 0x10 , 125 => 0xff , 126 => 0xf3 , 127 => 0xd2
-        , 128 => 0xcd , 129 => 0x0c , 130 => 0x13 , 131 => 0xec , 132 => 0x5f , 133 => 0x97 , 134 => 0x44 , 135 => 0x17
-        , 136 => 0xc4 , 137 => 0xa7 , 138 => 0x7e , 139 => 0x3d , 140 => 0x64 , 141 => 0x5d , 142 => 0x19 , 143 => 0x73
-        , 144 => 0x60 , 145 => 0x81 , 146 => 0x4f , 147 => 0xdc , 148 => 0x22 , 149 => 0x2a , 150 => 0x90 , 151 => 0x88
-        , 152 => 0x46 , 153 => 0xee , 154 => 0xb8 , 155 => 0x14 , 156 => 0xde , 157 => 0x5e , 158 => 0x0b , 159 => 0xdb
-        , 160 => 0xe0 , 161 => 0x32 , 162 => 0x3a , 163 => 0x0a , 164 => 0x49 , 165 => 0x06 , 166 => 0x24 , 167 => 0x5c
-        , 168 => 0xc2 , 169 => 0xd3 , 170 => 0xac , 171 => 0x62 , 172 => 0x91 , 173 => 0x95 , 174 => 0xe4 , 175 => 0x79
-        , 176 => 0xe7 , 177 => 0xc8 , 178 => 0x37 , 179 => 0x6d , 180 => 0x8d , 181 => 0xd5 , 182 => 0x4e , 183 => 0xa9
-        , 184 => 0x6c , 185 => 0x56 , 186 => 0xf4 , 187 => 0xea , 188 => 0x65 , 189 => 0x7a , 190 => 0xae , 191 => 0x08
-        , 192 => 0xba , 193 => 0x78 , 194 => 0x25 , 195 => 0x2e , 196 => 0x1c , 197 => 0xa6 , 198 => 0xb4 , 199 => 0xc6
-        , 200 => 0xe8 , 201 => 0xdd , 202 => 0x74 , 203 => 0x1f , 204 => 0x4b , 205 => 0xbd , 206 => 0x8b , 207 => 0x8a
-        , 208 => 0x70 , 209 => 0x3e , 210 => 0xb5 , 211 => 0x66 , 212 => 0x48 , 213 => 0x03 , 214 => 0xf6 , 215 => 0x0e
-        , 216 => 0x61 , 217 => 0x35 , 218 => 0x57 , 219 => 0xb9 , 220 => 0x86 , 221 => 0xc1 , 222 => 0x1d , 223 => 0x9e
-        , 224 => 0xe1 , 225 => 0xf8 , 226 => 0x98 , 227 => 0x11 , 228 => 0x69 , 229 => 0xd9 , 230 => 0x8e , 231 => 0x94
-        , 232 => 0x9b , 233 => 0x1e , 234 => 0x87 , 235 => 0xe9 , 236 => 0xce , 237 => 0x55 , 238 => 0x28 , 239 => 0xdf
-        , 240 => 0x8c , 241 => 0xa1 , 242 => 0x89 , 243 => 0x0d , 244 => 0xbf , 245 => 0xe6 , 246 => 0x42 , 247 => 0x68
-        , 248 => 0x41 , 249 => 0x99 , 250 => 0x2d , 251 => 0x0f , 252 => 0xb0 , 253 => 0x54 , 254 => 0xbb , 255 => 0x16
+            0 => 0x63,
+            1 => 0x7c,
+            2 => 0x77,
+            3 => 0x7b,
+            4 => 0xf2,
+            5 => 0x6b,
+            6 => 0x6f,
+            7 => 0xc5,
+            8 => 0x30,
+            9 => 0x01,
+            10 => 0x67,
+            11 => 0x2b,
+            12 => 0xfe,
+            13 => 0xd7,
+            14 => 0xab,
+            15 => 0x76,
+            16 => 0xca,
+            17 => 0x82,
+            18 => 0xc9,
+            19 => 0x7d,
+            20 => 0xfa,
+            21 => 0x59,
+            22 => 0x47,
+            23 => 0xf0,
+            24 => 0xad,
+            25 => 0xd4,
+            26 => 0xa2,
+            27 => 0xaf,
+            28 => 0x9c,
+            29 => 0xa4,
+            30 => 0x72,
+            31 => 0xc0,
+            32 => 0xb7,
+            33 => 0xfd,
+            34 => 0x93,
+            35 => 0x26,
+            36 => 0x36,
+            37 => 0x3f,
+            38 => 0xf7,
+            39 => 0xcc,
+            40 => 0x34,
+            41 => 0xa5,
+            42 => 0xe5,
+            43 => 0xf1,
+            44 => 0x71,
+            45 => 0xd8,
+            46 => 0x31,
+            47 => 0x15,
+            48 => 0x04,
+            49 => 0xc7,
+            50 => 0x23,
+            51 => 0xc3,
+            52 => 0x18,
+            53 => 0x96,
+            54 => 0x05,
+            55 => 0x9a,
+            56 => 0x07,
+            57 => 0x12,
+            58 => 0x80,
+            59 => 0xe2,
+            60 => 0xeb,
+            61 => 0x27,
+            62 => 0xb2,
+            63 => 0x75,
+            64 => 0x09,
+            65 => 0x83,
+            66 => 0x2c,
+            67 => 0x1a,
+            68 => 0x1b,
+            69 => 0x6e,
+            70 => 0x5a,
+            71 => 0xa0,
+            72 => 0x52,
+            73 => 0x3b,
+            74 => 0xd6,
+            75 => 0xb3,
+            76 => 0x29,
+            77 => 0xe3,
+            78 => 0x2f,
+            79 => 0x84,
+            80 => 0x53,
+            81 => 0xd1,
+            82 => 0x00,
+            83 => 0xed,
+            84 => 0x20,
+            85 => 0xfc,
+            86 => 0xb1,
+            87 => 0x5b,
+            88 => 0x6a,
+            89 => 0xcb,
+            90 => 0xbe,
+            91 => 0x39,
+            92 => 0x4a,
+            93 => 0x4c,
+            94 => 0x58,
+            95 => 0xcf,
+            96 => 0xd0,
+            97 => 0xef,
+            98 => 0xaa,
+            99 => 0xfb,
+            100 => 0x43,
+            101 => 0x4d,
+            102 => 0x33,
+            103 => 0x85,
+            104 => 0x45,
+            105 => 0xf9,
+            106 => 0x02,
+            107 => 0x7f,
+            108 => 0x50,
+            109 => 0x3c,
+            110 => 0x9f,
+            111 => 0xa8,
+            112 => 0x51,
+            113 => 0xa3,
+            114 => 0x40,
+            115 => 0x8f,
+            116 => 0x92,
+            117 => 0x9d,
+            118 => 0x38,
+            119 => 0xf5,
+            120 => 0xbc,
+            121 => 0xb6,
+            122 => 0xda,
+            123 => 0x21,
+            124 => 0x10,
+            125 => 0xff,
+            126 => 0xf3,
+            127 => 0xd2,
+            128 => 0xcd,
+            129 => 0x0c,
+            130 => 0x13,
+            131 => 0xec,
+            132 => 0x5f,
+            133 => 0x97,
+            134 => 0x44,
+            135 => 0x17,
+            136 => 0xc4,
+            137 => 0xa7,
+            138 => 0x7e,
+            139 => 0x3d,
+            140 => 0x64,
+            141 => 0x5d,
+            142 => 0x19,
+            143 => 0x73,
+            144 => 0x60,
+            145 => 0x81,
+            146 => 0x4f,
+            147 => 0xdc,
+            148 => 0x22,
+            149 => 0x2a,
+            150 => 0x90,
+            151 => 0x88,
+            152 => 0x46,
+            153 => 0xee,
+            154 => 0xb8,
+            155 => 0x14,
+            156 => 0xde,
+            157 => 0x5e,
+            158 => 0x0b,
+            159 => 0xdb,
+            160 => 0xe0,
+            161 => 0x32,
+            162 => 0x3a,
+            163 => 0x0a,
+            164 => 0x49,
+            165 => 0x06,
+            166 => 0x24,
+            167 => 0x5c,
+            168 => 0xc2,
+            169 => 0xd3,
+            170 => 0xac,
+            171 => 0x62,
+            172 => 0x91,
+            173 => 0x95,
+            174 => 0xe4,
+            175 => 0x79,
+            176 => 0xe7,
+            177 => 0xc8,
+            178 => 0x37,
+            179 => 0x6d,
+            180 => 0x8d,
+            181 => 0xd5,
+            182 => 0x4e,
+            183 => 0xa9,
+            184 => 0x6c,
+            185 => 0x56,
+            186 => 0xf4,
+            187 => 0xea,
+            188 => 0x65,
+            189 => 0x7a,
+            190 => 0xae,
+            191 => 0x08,
+            192 => 0xba,
+            193 => 0x78,
+            194 => 0x25,
+            195 => 0x2e,
+            196 => 0x1c,
+            197 => 0xa6,
+            198 => 0xb4,
+            199 => 0xc6,
+            200 => 0xe8,
+            201 => 0xdd,
+            202 => 0x74,
+            203 => 0x1f,
+            204 => 0x4b,
+            205 => 0xbd,
+            206 => 0x8b,
+            207 => 0x8a,
+            208 => 0x70,
+            209 => 0x3e,
+            210 => 0xb5,
+            211 => 0x66,
+            212 => 0x48,
+            213 => 0x03,
+            214 => 0xf6,
+            215 => 0x0e,
+            216 => 0x61,
+            217 => 0x35,
+            218 => 0x57,
+            219 => 0xb9,
+            220 => 0x86,
+            221 => 0xc1,
+            222 => 0x1d,
+            223 => 0x9e,
+            224 => 0xe1,
+            225 => 0xf8,
+            226 => 0x98,
+            227 => 0x11,
+            228 => 0x69,
+            229 => 0xd9,
+            230 => 0x8e,
+            231 => 0x94,
+            232 => 0x9b,
+            233 => 0x1e,
+            234 => 0x87,
+            235 => 0xe9,
+            236 => 0xce,
+            237 => 0x55,
+            238 => 0x28,
+            239 => 0xdf,
+            240 => 0x8c,
+            241 => 0xa1,
+            242 => 0x89,
+            243 => 0x0d,
+            244 => 0xbf,
+            245 => 0xe6,
+            246 => 0x42,
+            247 => 0x68,
+            248 => 0x41,
+            249 => 0x99,
+            250 => 0x2d,
+            251 => 0x0f,
+            252 => 0xb0,
+            253 => 0x54,
+            254 => 0xbb,
+            255 => 0x16,
         }
     }
-    
-    fn  sbox_inv(s:u8) -> u8 {
+
+    fn sbox_inv(s: u8) -> u8 {
         match s {
-           0 => 0x52,   1 => 0x09,   2 => 0x6a,   3 => 0xd5,   4 => 0x30,   5 => 0x36,   6 => 0xa5,   7 => 0x38
-       ,   8 => 0xbf,   9 => 0x40,  10 => 0xa3,  11 => 0x9e,  12 => 0x81,  13 => 0xf3,  14 => 0xd7,  15 => 0xfb
-       ,  16 => 0x7c,  17 => 0xe3,  18 => 0x39,  19 => 0x82,  20 => 0x9b,  21 => 0x2f,  22 => 0xff,  23 => 0x87
-       ,  24 => 0x34,  25 => 0x8e,  26 => 0x43,  27 => 0x44,  28 => 0xc4,  29 => 0xde,  30 => 0xe9,  31 => 0xcb
-       ,  32 => 0x54,  33 => 0x7b,  34 => 0x94,  35 => 0x32,  36 => 0xa6,  37 => 0xc2,  38 => 0x23,  39 => 0x3d
-       ,  40 => 0xee,  41 => 0x4c,  42 => 0x95,  43 => 0x0b,  44 => 0x42,  45 => 0xfa,  46 => 0xc3,  47 => 0x4e
-       ,  48 => 0x08,  49 => 0x2e,  50 => 0xa1,  51 => 0x66,  52 => 0x28,  53 => 0xd9,  54 => 0x24,  55 => 0xb2
-       ,  56 => 0x76,  57 => 0x5b,  58 => 0xa2,  59 => 0x49,  60 => 0x6d,  61 => 0x8b,  62 => 0xd1,  63 => 0x25
-       ,  64 => 0x72,  65 => 0xf8,  66 => 0xf6,  67 => 0x64,  68 => 0x86,  69 => 0x68,  70 => 0x98,  71 => 0x16
-       ,  72 => 0xd4,  73 => 0xa4,  74 => 0x5c,  75 => 0xcc,  76 => 0x5d,  77 => 0x65,  78 => 0xb6,  79 => 0x92
-       ,  80 => 0x6c,  81 => 0x70,  82 => 0x48,  83 => 0x50,  84 => 0xfd,  85 => 0xed,  86 => 0xb9,  87 => 0xda
-       ,  88 => 0x5e,  89 => 0x15,  90 => 0x46,  91 => 0x57,  92 => 0xa7,  93 => 0x8d,  94 => 0x9d,  95 => 0x84
-       ,  96 => 0x90,  97 => 0xd8,  98 => 0xab,  99 => 0x00, 100 => 0x8c, 101 => 0xbc, 102 => 0xd3, 103 => 0x0a
-       , 104 => 0xf7, 105 => 0xe4, 106 => 0x58, 107 => 0x05, 108 => 0xb8, 109 => 0xb3, 110 => 0x45, 111 => 0x06
-       , 112 => 0xd0, 113 => 0x2c, 114 => 0x1e, 115 => 0x8f, 116 => 0xca, 117 => 0x3f, 118 => 0x0f, 119 => 0x02
-       , 120 => 0xc1, 121 => 0xaf, 122 => 0xbd, 123 => 0x03, 124 => 0x01, 125 => 0x13, 126 => 0x8a, 127 => 0x6b
-       , 128 => 0x3a, 129 => 0x91, 130 => 0x11, 131 => 0x41, 132 => 0x4f, 133 => 0x67, 134 => 0xdc, 135 => 0xea
-       , 136 => 0x97, 137 => 0xf2, 138 => 0xcf, 139 => 0xce, 140 => 0xf0, 141 => 0xb4, 142 => 0xe6, 143 => 0x73
-       , 144 => 0x96, 145 => 0xac, 146 => 0x74, 147 => 0x22, 148 => 0xe7, 149 => 0xad, 150 => 0x35, 151 => 0x85
-       , 152 => 0xe2, 153 => 0xf9, 154 => 0x37, 155 => 0xe8, 156 => 0x1c, 157 => 0x75, 158 => 0xdf, 159 => 0x6e
-       , 160 => 0x47, 161 => 0xf1, 162 => 0x1a, 163 => 0x71, 164 => 0x1d, 165 => 0x29, 166 => 0xc5, 167 => 0x89
-       , 168 => 0x6f, 169 => 0xb7, 170 => 0x62, 171 => 0x0e, 172 => 0xaa, 173 => 0x18, 174 => 0xbe, 175 => 0x1b
-       , 176 => 0xfc, 177 => 0x56, 178 => 0x3e, 179 => 0x4b, 180 => 0xc6, 181 => 0xd2, 182 => 0x79, 183 => 0x20
-       , 184 => 0x9a, 185 => 0xdb, 186 => 0xc0, 187 => 0xfe, 188 => 0x78, 189 => 0xcd, 190 => 0x5a, 191 => 0xf4
-       , 192 => 0x1f, 193 => 0xdd, 194 => 0xa8, 195 => 0x33, 196 => 0x88, 197 => 0x07, 198 => 0xc7, 199 => 0x31
-       , 200 => 0xb1, 201 => 0x12, 202 => 0x10, 203 => 0x59, 204 => 0x27, 205 => 0x80, 206 => 0xec, 207 => 0x5f
-       , 208 => 0x60, 209 => 0x51, 210 => 0x7f, 211 => 0xa9, 212 => 0x19, 213 => 0xb5, 214 => 0x4a, 215 => 0x0d
-       , 216 => 0x2d, 217 => 0xe5, 218 => 0x7a, 219 => 0x9f, 220 => 0x93, 221 => 0xc9, 222 => 0x9c, 223 => 0xef
-       , 224 => 0xa0, 225 => 0xe0, 226 => 0x3b, 227 => 0x4d, 228 => 0xae, 229 => 0x2a, 230 => 0xf5, 231 => 0xb0
-       , 232 => 0xc8, 233 => 0xeb, 234 => 0xbb, 235 => 0x3c, 236 => 0x83, 237 => 0x53, 238 => 0x99, 239 => 0x61
-       , 240 => 0x17, 241 => 0x2b, 242 => 0x04, 243 => 0x7e, 244 => 0xba, 245 => 0x77, 246 => 0xd6, 247 => 0x26
-       , 248 => 0xe1, 249 => 0x69, 250 => 0x14, 251 => 0x63, 252 => 0x55, 253 => 0x21, 254 => 0x0c, 255 => 0x7d        }
+            0 => 0x52,
+            1 => 0x09,
+            2 => 0x6a,
+            3 => 0xd5,
+            4 => 0x30,
+            5 => 0x36,
+            6 => 0xa5,
+            7 => 0x38,
+            8 => 0xbf,
+            9 => 0x40,
+            10 => 0xa3,
+            11 => 0x9e,
+            12 => 0x81,
+            13 => 0xf3,
+            14 => 0xd7,
+            15 => 0xfb,
+            16 => 0x7c,
+            17 => 0xe3,
+            18 => 0x39,
+            19 => 0x82,
+            20 => 0x9b,
+            21 => 0x2f,
+            22 => 0xff,
+            23 => 0x87,
+            24 => 0x34,
+            25 => 0x8e,
+            26 => 0x43,
+            27 => 0x44,
+            28 => 0xc4,
+            29 => 0xde,
+            30 => 0xe9,
+            31 => 0xcb,
+            32 => 0x54,
+            33 => 0x7b,
+            34 => 0x94,
+            35 => 0x32,
+            36 => 0xa6,
+            37 => 0xc2,
+            38 => 0x23,
+            39 => 0x3d,
+            40 => 0xee,
+            41 => 0x4c,
+            42 => 0x95,
+            43 => 0x0b,
+            44 => 0x42,
+            45 => 0xfa,
+            46 => 0xc3,
+            47 => 0x4e,
+            48 => 0x08,
+            49 => 0x2e,
+            50 => 0xa1,
+            51 => 0x66,
+            52 => 0x28,
+            53 => 0xd9,
+            54 => 0x24,
+            55 => 0xb2,
+            56 => 0x76,
+            57 => 0x5b,
+            58 => 0xa2,
+            59 => 0x49,
+            60 => 0x6d,
+            61 => 0x8b,
+            62 => 0xd1,
+            63 => 0x25,
+            64 => 0x72,
+            65 => 0xf8,
+            66 => 0xf6,
+            67 => 0x64,
+            68 => 0x86,
+            69 => 0x68,
+            70 => 0x98,
+            71 => 0x16,
+            72 => 0xd4,
+            73 => 0xa4,
+            74 => 0x5c,
+            75 => 0xcc,
+            76 => 0x5d,
+            77 => 0x65,
+            78 => 0xb6,
+            79 => 0x92,
+            80 => 0x6c,
+            81 => 0x70,
+            82 => 0x48,
+            83 => 0x50,
+            84 => 0xfd,
+            85 => 0xed,
+            86 => 0xb9,
+            87 => 0xda,
+            88 => 0x5e,
+            89 => 0x15,
+            90 => 0x46,
+            91 => 0x57,
+            92 => 0xa7,
+            93 => 0x8d,
+            94 => 0x9d,
+            95 => 0x84,
+            96 => 0x90,
+            97 => 0xd8,
+            98 => 0xab,
+            99 => 0x00,
+            100 => 0x8c,
+            101 => 0xbc,
+            102 => 0xd3,
+            103 => 0x0a,
+            104 => 0xf7,
+            105 => 0xe4,
+            106 => 0x58,
+            107 => 0x05,
+            108 => 0xb8,
+            109 => 0xb3,
+            110 => 0x45,
+            111 => 0x06,
+            112 => 0xd0,
+            113 => 0x2c,
+            114 => 0x1e,
+            115 => 0x8f,
+            116 => 0xca,
+            117 => 0x3f,
+            118 => 0x0f,
+            119 => 0x02,
+            120 => 0xc1,
+            121 => 0xaf,
+            122 => 0xbd,
+            123 => 0x03,
+            124 => 0x01,
+            125 => 0x13,
+            126 => 0x8a,
+            127 => 0x6b,
+            128 => 0x3a,
+            129 => 0x91,
+            130 => 0x11,
+            131 => 0x41,
+            132 => 0x4f,
+            133 => 0x67,
+            134 => 0xdc,
+            135 => 0xea,
+            136 => 0x97,
+            137 => 0xf2,
+            138 => 0xcf,
+            139 => 0xce,
+            140 => 0xf0,
+            141 => 0xb4,
+            142 => 0xe6,
+            143 => 0x73,
+            144 => 0x96,
+            145 => 0xac,
+            146 => 0x74,
+            147 => 0x22,
+            148 => 0xe7,
+            149 => 0xad,
+            150 => 0x35,
+            151 => 0x85,
+            152 => 0xe2,
+            153 => 0xf9,
+            154 => 0x37,
+            155 => 0xe8,
+            156 => 0x1c,
+            157 => 0x75,
+            158 => 0xdf,
+            159 => 0x6e,
+            160 => 0x47,
+            161 => 0xf1,
+            162 => 0x1a,
+            163 => 0x71,
+            164 => 0x1d,
+            165 => 0x29,
+            166 => 0xc5,
+            167 => 0x89,
+            168 => 0x6f,
+            169 => 0xb7,
+            170 => 0x62,
+            171 => 0x0e,
+            172 => 0xaa,
+            173 => 0x18,
+            174 => 0xbe,
+            175 => 0x1b,
+            176 => 0xfc,
+            177 => 0x56,
+            178 => 0x3e,
+            179 => 0x4b,
+            180 => 0xc6,
+            181 => 0xd2,
+            182 => 0x79,
+            183 => 0x20,
+            184 => 0x9a,
+            185 => 0xdb,
+            186 => 0xc0,
+            187 => 0xfe,
+            188 => 0x78,
+            189 => 0xcd,
+            190 => 0x5a,
+            191 => 0xf4,
+            192 => 0x1f,
+            193 => 0xdd,
+            194 => 0xa8,
+            195 => 0x33,
+            196 => 0x88,
+            197 => 0x07,
+            198 => 0xc7,
+            199 => 0x31,
+            200 => 0xb1,
+            201 => 0x12,
+            202 => 0x10,
+            203 => 0x59,
+            204 => 0x27,
+            205 => 0x80,
+            206 => 0xec,
+            207 => 0x5f,
+            208 => 0x60,
+            209 => 0x51,
+            210 => 0x7f,
+            211 => 0xa9,
+            212 => 0x19,
+            213 => 0xb5,
+            214 => 0x4a,
+            215 => 0x0d,
+            216 => 0x2d,
+            217 => 0xe5,
+            218 => 0x7a,
+            219 => 0x9f,
+            220 => 0x93,
+            221 => 0xc9,
+            222 => 0x9c,
+            223 => 0xef,
+            224 => 0xa0,
+            225 => 0xe0,
+            226 => 0x3b,
+            227 => 0x4d,
+            228 => 0xae,
+            229 => 0x2a,
+            230 => 0xf5,
+            231 => 0xb0,
+            232 => 0xc8,
+            233 => 0xeb,
+            234 => 0xbb,
+            235 => 0x3c,
+            236 => 0x83,
+            237 => 0x53,
+            238 => 0x99,
+            239 => 0x61,
+            240 => 0x17,
+            241 => 0x2b,
+            242 => 0x04,
+            243 => 0x7e,
+            244 => 0xba,
+            245 => 0x77,
+            246 => 0xd6,
+            247 => 0x26,
+            248 => 0xe1,
+            249 => 0x69,
+            250 => 0x14,
+            251 => 0x63,
+            252 => 0x55,
+            253 => 0x21,
+            254 => 0x0c,
+            255 => 0x7d,
+        }
     }
 
-    use rand_core::{RngCore,OsRng};
-    
-    fn  get_bit_u8(x:&[u8], i:usize, j:usize) -> u8 {
-        (x[i] >> j) & 0x1 
+    use rand_core::{OsRng, RngCore};
+
+    fn get_bit_u8(x: &[u8], i: usize, j: usize) -> u8 {
+        (x[i] >> j) & 0x1
     }
-    
-    fn  get_bit_u16(x:&[u16], i:usize, j:usize) -> u8 {
+
+    fn get_bit_u16(x: &[u16], i: usize, j: usize) -> u8 {
         ((x[j] >> i) & 0x1) as u8
     }
 
     #[test]
-    fn  test_transpose () {
+    fn test_transpose() {
         let mut x = [0u8; 16];
         OsRng.fill_bytes(&mut x);
         let mut y = [0u16; 8];
@@ -704,10 +1145,11 @@ impl crate::platform::AESState for State {
             for j in 0..8 {
                 if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
                     println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
-                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y,i, j));
+                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j));
                     assert!(false);
+                } else {
+                    println!("transpose ok: {},{}", i, j);
                 }
-                else {println!("transpose ok: {},{}", i, j);}
             }
         }
         let mut z = [0u8; 16];
@@ -718,14 +1160,15 @@ impl crate::platform::AESState for State {
                     println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
                     println!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
                     assert!(false);
+                } else {
+                    println!("inv-transpose ok: {},{}", i, j);
                 }
-                else {println!("inv-transpose ok: {},{}", i, j);}
             }
         }
     }
 
     #[test]
-    fn  test_sbox () {
+    fn test_sbox() {
         let mut x = [0u8; 16];
         let mut y = [0u16; 8];
         let mut w = [0u8; 16];
@@ -738,13 +1181,14 @@ impl crate::platform::AESState for State {
             if w[0] != sbox_fwd(i as u8) {
                 println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
                 assert!(false);
+            } else {
+                println!("sbox ok {}", i)
             }
-            else {println!("sbox ok {}", i)}
         }
     }
 
     #[test]
-    fn  test_sbox_inv () {
+    fn test_sbox_inv() {
         let mut x = [0u8; 16];
         let mut y = [0u16; 8];
         let mut w = [0u8; 16];
@@ -755,10 +1199,16 @@ impl crate::platform::AESState for State {
             super::sub_bytes_inv_state(&mut y);
             super::transpose_u16x8(&y, &mut w);
             if w[0] != sbox_inv(i as u8) {
-                println!("sbox_inv[{}] = {}, should be {}", i, w[0], sbox_inv(i as u8));
+                println!(
+                    "sbox_inv[{}] = {}, should be {}",
+                    i,
+                    w[0],
+                    sbox_inv(i as u8)
+                );
                 assert!(false);
+            } else {
+                println!("sbox inv ok {}", i)
             }
-            else {println!("sbox inv ok {}", i)}
         }
     }
-}
\ No newline at end of file
+}
diff --git a/libcrux-aesgcm/src/platform/portable/gf128_core.rs b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
index eca4b862a..40a2724ed 100644
--- a/libcrux-aesgcm/src/platform/portable/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
@@ -1,23 +1,25 @@
 pub(crate) type FieldElement = u128;
 
-fn zero() -> FieldElement { 0 }
-fn load_elem(b:& [u8]) -> FieldElement {
+fn zero() -> FieldElement {
+    0
+}
+fn load_elem(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
     u128::from_be_bytes(b.try_into().unwrap())
 }
 
-fn store_elem(elem:&FieldElement, b:&mut [u8]) {
+fn store_elem(elem: &FieldElement, b: &mut [u8]) {
     debug_assert!(b.len() == 16);
     b.copy_from_slice(&u128::to_be_bytes(*elem));
 }
-   
-fn add(elem: &FieldElement, other:&FieldElement) -> FieldElement {
+
+fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
     elem ^ other
 }
 
-fn ith_bit_mask(elem: &FieldElement, i:usize) -> FieldElement {
+fn ith_bit_mask(elem: &FieldElement, i: usize) -> FieldElement {
     debug_assert!(i < 128);
-    let bit:u16 = ((elem >> (127 - i)) as u16) & 0x1;
+    let bit: u16 = ((elem >> (127 - i)) as u16) & 0x1;
     let bit_mask16 = (!bit).wrapping_add(1);
     let bit_mask32 = (bit_mask16 as u32) ^ ((bit_mask16 as u32) << 16);
     let bit_mask64 = (bit_mask32 as u64) ^ ((bit_mask32 as u64) << 32);
@@ -32,17 +34,17 @@ fn mul_x(elem: &mut FieldElement) {
     *elem = (*elem >> 1) ^ (IRRED & mask)
 }
 
-fn mul_step(x: &FieldElement, y: &mut FieldElement, i:usize, result: &mut FieldElement) {
+fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut FieldElement) {
     debug_assert!(i < 128);
     let mask = ith_bit_mask(x, i);
     *result ^= (*y & mask);
     mul_x(y);
 }
 
-fn mul(x: &FieldElement, y:&FieldElement) -> FieldElement {
+fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
     let mut result = 0;
     let mut multiplicand = *y;
-    for i in 0..128{
+    for i in 0..128 {
         mul_step(x, &mut multiplicand, i, &mut result)
     }
     result
@@ -53,19 +55,19 @@ impl crate::platform::GF128FieldElement for FieldElement {
         zero()
     }
 
-    fn load_elem(b:&[u8]) -> Self {
+    fn load_elem(b: &[u8]) -> Self {
         load_elem(b)
     }
 
-    fn store_elem(&self, b:&mut [u8]) {
+    fn store_elem(&self, b: &mut [u8]) {
         store_elem(self, b);
     }
 
-    fn add(&mut self, other:&Self) {
+    fn add(&mut self, other: &Self) {
         *self = add(self, other);
     }
 
-    fn mul(&mut self, other:&Self) {
+    fn mul(&mut self, other: &Self) {
         *self = mul(self, other)
     }
 }

From 635639611bb4e8394920322a3a3cf4b4f9d50664 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Sun, 20 Apr 2025 10:35:06 +0200
Subject: [PATCH 03/43] gcm portable

---
 libcrux-aesgcm/src/aes_gcm.rs | 271 ++++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)
 create mode 100644 libcrux-aesgcm/src/aes_gcm.rs

diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/libcrux-aesgcm/src/aes_gcm.rs
new file mode 100644
index 000000000..007adaea8
--- /dev/null
+++ b/libcrux-aesgcm/src/aes_gcm.rs
@@ -0,0 +1,271 @@
+use crate::{
+    aes_ctr::{
+        aes128_ctr_encrypt, aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce,
+        aes128_ctr_update, aes128_ctr_xor_block, aes128_ctr_xor_blocks, AES128_CTR_Context,
+    },
+    gf128_generic::{
+        self, gf128_emit, gf128_init, gf128_update, gf128_update_blocks, gf128_update_last,
+        gf128_update_padded, GF128State,
+    },
+    platform::{AESState, GF128FieldElement},
+};
+
+#[allow(non_snake_case_types)]
+pub struct AES128_GCM_State<T: AESState, U: GF128FieldElement> {
+    aes_state: AES128_CTR_Context<T>,
+    gcm_state: GF128State<U>,
+    tag_mix: [u8; 16],
+}
+
+pub fn aes128_gcm_init<T: AESState, U: GF128FieldElement>(key: &[u8]) -> AES128_GCM_State<T, U> {
+    debug_assert!(key.len() == 16);
+    let nonce = [0u8; 12];
+    let mut gcm_key = [0u8; 16];
+    let tag_mix = [0u8; 16];
+    let aes_state = aes128_ctr_init(key, &nonce);
+    aes128_ctr_key_block(&aes_state, 0, &mut gcm_key);
+    let gcm_state = gf128_init(&gcm_key);
+    AES128_GCM_State {
+        aes_state,
+        gcm_state,
+        tag_mix,
+    }
+}
+
+pub fn aes128_gcm_set_nonce<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    nonce: &[u8],
+) {
+    debug_assert!(nonce.len() == 12);
+    aes128_ctr_set_nonce(&mut st.aes_state, nonce);
+    aes128_ctr_key_block(&st.aes_state, 1, &mut st.tag_mix);
+}
+
+pub fn aes128_gcm_encrypt<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    aad: &[u8],
+    plaintext: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    debug_assert!(ciphertext.len() == plaintext.len());
+    debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
+    debug_assert!(tag.len() == 16);
+    aes128_ctr_update(&st.aes_state, 2, plaintext, ciphertext);
+    gf128_update_padded(&mut st.gcm_state, aad);
+    gf128_update_padded(&mut st.gcm_state, ciphertext);
+    let mut last_block = [0u8; 16];
+    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+    gf128_update(&mut st.gcm_state, &last_block);
+    gf128_emit(&st.gcm_state, tag);
+    for i in 0..16 {
+        tag[i] ^= st.tag_mix[i];
+    }
+}
+
+pub struct DecryptError();
+
+pub fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    aad: &[u8],
+    ciphertext: &[u8],
+    tag: &[u8],
+    plaintext: &mut [u8],
+) -> Result<(), DecryptError> {
+    debug_assert!(plaintext.len() == ciphertext.len());
+    debug_assert!(ciphertext.len() / 16 <= u32::MAX as usize);
+    debug_assert!(tag.len() == 16);
+    gf128_update_padded(&mut st.gcm_state, aad);
+    gf128_update_padded(&mut st.gcm_state, ciphertext);
+    let mut last_block = [0u8; 16];
+    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+    gf128_update(&mut st.gcm_state, &last_block);
+    let mut computed_tag = [0u8; 16];
+    gf128_emit(&st.gcm_state, &mut computed_tag);
+    for i in 0..16 {
+        computed_tag[i] ^= st.tag_mix[i];
+    }
+    let mut eq_mask = 0u8;
+    for i in 0..16 {
+        eq_mask |= computed_tag[i] ^ tag[i];
+    }
+    if eq_mask == 0 {
+        aes128_ctr_update(&st.aes_state, 2, ciphertext, plaintext);
+        Ok(())
+    } else {
+        Err(DecryptError())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::platform::{self, portable};
+
+    use super::{aes128_gcm_encrypt, aes128_gcm_init, aes128_gcm_set_nonce};
+
+    const input1: [u8; 60] = [
+        0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26,
+        0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31,
+        0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49,
+        0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39,
+    ];
+    const key1: [u8; 16] = [
+        0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83,
+        0x08,
+    ];
+    const nonce1: [u8; 12] = [
+        0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88,
+    ];
+    const aad1: [u8; 20] = [
+        0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
+        0xef, 0xab, 0xad, 0xda, 0xd2,
+    ];
+    const expected1: [u8; 76] = [
+        0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4,
+        0x9c, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac,
+        0xa1, 0x2e, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac,
+        0x84, 0xaa, 0x05, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91,
+        0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a,
+        0x47,
+    ];
+
+    const input2: [u8; 652] = [
+        0x08, 0x00, 0x00, 0x1e, 0x00, 0x1c, 0x00, 0x0a, 0x00, 0x14, 0x00, 0x12, 0x00, 0x1d, 0x00,
+        0x17, 0x00, 0x18, 0x00, 0x19, 0x01, 0x00, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03, 0x01, 0x04,
+        0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x01, 0xb9, 0x00, 0x00, 0x01, 0xb5, 0x00, 0x01, 0xb0,
+        0x30, 0x82, 0x01, 0xac, 0x30, 0x82, 0x01, 0x15, 0xa0, 0x03, 0x02, 0x01, 0x02, 0x02, 0x01,
+        0x02, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, 0x05,
+        0x00, 0x30, 0x0e, 0x31, 0x0c, 0x30, 0x0a, 0x06, 0x03, 0x55, 0x04, 0x03, 0x13, 0x03, 0x72,
+        0x73, 0x61, 0x30, 0x1e, 0x17, 0x0d, 0x31, 0x36, 0x30, 0x37, 0x33, 0x30, 0x30, 0x31, 0x32,
+        0x33, 0x35, 0x39, 0x5a, 0x17, 0x0d, 0x32, 0x36, 0x30, 0x37, 0x33, 0x30, 0x30, 0x31, 0x32,
+        0x33, 0x35, 0x39, 0x5a, 0x30, 0x0e, 0x31, 0x0c, 0x30, 0x0a, 0x06, 0x03, 0x55, 0x04, 0x03,
+        0x13, 0x03, 0x72, 0x73, 0x61, 0x30, 0x81, 0x9f, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48,
+        0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, 0x00, 0x03, 0x81, 0x8d, 0x00, 0x30, 0x81, 0x89,
+        0x02, 0x81, 0x81, 0x00, 0xb4, 0xbb, 0x49, 0x8f, 0x82, 0x79, 0x30, 0x3d, 0x98, 0x08, 0x36,
+        0x39, 0x9b, 0x36, 0xc6, 0x98, 0x8c, 0x0c, 0x68, 0xde, 0x55, 0xe1, 0xbd, 0xb8, 0x26, 0xd3,
+        0x90, 0x1a, 0x24, 0x61, 0xea, 0xfd, 0x2d, 0xe4, 0x9a, 0x91, 0xd0, 0x15, 0xab, 0xbc, 0x9a,
+        0x95, 0x13, 0x7a, 0xce, 0x6c, 0x1a, 0xf1, 0x9e, 0xaa, 0x6a, 0xf9, 0x8c, 0x7c, 0xed, 0x43,
+        0x12, 0x09, 0x98, 0xe1, 0x87, 0xa8, 0x0e, 0xe0, 0xcc, 0xb0, 0x52, 0x4b, 0x1b, 0x01, 0x8c,
+        0x3e, 0x0b, 0x63, 0x26, 0x4d, 0x44, 0x9a, 0x6d, 0x38, 0xe2, 0x2a, 0x5f, 0xda, 0x43, 0x08,
+        0x46, 0x74, 0x80, 0x30, 0x53, 0x0e, 0xf0, 0x46, 0x1c, 0x8c, 0xa9, 0xd9, 0xef, 0xbf, 0xae,
+        0x8e, 0xa6, 0xd1, 0xd0, 0x3e, 0x2b, 0xd1, 0x93, 0xef, 0xf0, 0xab, 0x9a, 0x80, 0x02, 0xc4,
+        0x74, 0x28, 0xa6, 0xd3, 0x5a, 0x8d, 0x88, 0xd7, 0x9f, 0x7f, 0x1e, 0x3f, 0x02, 0x03, 0x01,
+        0x00, 0x01, 0xa3, 0x1a, 0x30, 0x18, 0x30, 0x09, 0x06, 0x03, 0x55, 0x1d, 0x13, 0x04, 0x02,
+        0x30, 0x00, 0x30, 0x0b, 0x06, 0x03, 0x55, 0x1d, 0x0f, 0x04, 0x04, 0x03, 0x02, 0x05, 0xa0,
+        0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, 0x05, 0x00,
+        0x03, 0x81, 0x81, 0x00, 0x85, 0xaa, 0xd2, 0xa0, 0xe5, 0xb9, 0x27, 0x6b, 0x90, 0x8c, 0x65,
+        0xf7, 0x3a, 0x72, 0x67, 0x17, 0x06, 0x18, 0xa5, 0x4c, 0x5f, 0x8a, 0x7b, 0x33, 0x7d, 0x2d,
+        0xf7, 0xa5, 0x94, 0x36, 0x54, 0x17, 0xf2, 0xea, 0xe8, 0xf8, 0xa5, 0x8c, 0x8f, 0x81, 0x72,
+        0xf9, 0x31, 0x9c, 0xf3, 0x6b, 0x7f, 0xd6, 0xc5, 0x5b, 0x80, 0xf2, 0x1a, 0x03, 0x01, 0x51,
+        0x56, 0x72, 0x60, 0x96, 0xfd, 0x33, 0x5e, 0x5e, 0x67, 0xf2, 0xdb, 0xf1, 0x02, 0x70, 0x2e,
+        0x60, 0x8c, 0xca, 0xe6, 0xbe, 0xc1, 0xfc, 0x63, 0xa4, 0x2a, 0x99, 0xbe, 0x5c, 0x3e, 0xb7,
+        0x10, 0x7c, 0x3c, 0x54, 0xe9, 0xb9, 0xeb, 0x2b, 0xd5, 0x20, 0x3b, 0x1c, 0x3b, 0x84, 0xe0,
+        0xa8, 0xb2, 0xf7, 0x59, 0x40, 0x9b, 0xa3, 0xea, 0xc9, 0xd9, 0x1d, 0x40, 0x2d, 0xcc, 0x0c,
+        0xc8, 0xf8, 0x96, 0x12, 0x29, 0xac, 0x91, 0x87, 0xb4, 0x2b, 0x4d, 0xe1, 0x00, 0x00, 0x0f,
+        0x00, 0x00, 0x84, 0x08, 0x04, 0x00, 0x80, 0x45, 0x47, 0xd6, 0x16, 0x8f, 0x25, 0x10, 0xc5,
+        0x50, 0xbd, 0x94, 0x9c, 0xd2, 0xbc, 0x63, 0x1f, 0xf1, 0x34, 0xfa, 0x10, 0xa8, 0x27, 0xff,
+        0x69, 0xb1, 0x66, 0xa6, 0xbd, 0x95, 0xe2, 0x49, 0xed, 0x0d, 0xaf, 0x57, 0x15, 0x92, 0xeb,
+        0xbe, 0x9f, 0xf1, 0x3d, 0xe6, 0xb0, 0x3a, 0xcc, 0x21, 0x81, 0x46, 0x78, 0x1f, 0x69, 0x3b,
+        0x5a, 0x69, 0x2b, 0x73, 0x19, 0xd7, 0x4f, 0xd2, 0xe5, 0x3b, 0x6a, 0x2d, 0xf0, 0xf6, 0x78,
+        0x5d, 0x62, 0x4f, 0x02, 0x4a, 0x44, 0x03, 0x0c, 0xa0, 0x0b, 0x86, 0x9a, 0xe8, 0x1a, 0x53,
+        0x2b, 0x19, 0xe4, 0x7e, 0x52, 0x5f, 0xf4, 0xa6, 0x2c, 0x51, 0xa5, 0x88, 0x9e, 0xb5, 0x65,
+        0xfe, 0xe2, 0x68, 0x59, 0x0d, 0x8a, 0x3c, 0xa3, 0xc1, 0xbc, 0x3b, 0xd5, 0x40, 0x4e, 0x39,
+        0x72, 0x0c, 0xa2, 0xea, 0xee, 0x30, 0x8f, 0x4e, 0x07, 0x00, 0x76, 0x1e, 0x98, 0x63, 0x89,
+        0x14, 0x00, 0x00, 0x20, 0x9e, 0xfe, 0xe0, 0x3e, 0xbf, 0xfb, 0xc0, 0xdc, 0x23, 0xd2, 0x6d,
+        0x95, 0x87, 0x44, 0xc0, 0x9e, 0x30, 0x00, 0x47, 0x7e, 0xff, 0x7a, 0xe3, 0x14, 0x8a, 0x50,
+        0xe5, 0x67, 0x00, 0x13, 0xaa, 0xaa, 0x16,
+    ];
+
+    const key2: [u8; 16] = [
+        0xfd, 0xa2, 0xa4, 0x40, 0x46, 0x70, 0x80, 0x8f, 0x49, 0x37, 0x47, 0x8b, 0x8b, 0x6e, 0x3f,
+        0xe1,
+    ];
+    const nonce2: [u8; 12] = [
+        0xb5, 0xf3, 0xa3, 0xfa, 0xe1, 0xcb, 0x25, 0xc9, 0xdc, 0xd7, 0x39, 0x93,
+    ];
+    const aad2: [u8; 0] = [];
+
+    const expected2: [u8; 668] = [
+        0xc1, 0xe6, 0x31, 0xf8, 0x1d, 0x2a, 0xf2, 0x21, 0xeb, 0xb6, 0xa9, 0x57, 0xf5, 0x8f, 0x3e,
+        0xe2, 0x66, 0x27, 0x26, 0x35, 0xe6, 0x7f, 0x99, 0xa7, 0x52, 0xf0, 0xdf, 0x08, 0xad, 0xeb,
+        0x33, 0xba, 0xb8, 0x61, 0x1e, 0x55, 0xf3, 0x3d, 0x72, 0xcf, 0x84, 0x38, 0x24, 0x61, 0xa8,
+        0xbf, 0xe0, 0xa6, 0x59, 0xba, 0x2d, 0xd1, 0x87, 0x3f, 0x6f, 0xcc, 0x70, 0x7a, 0x98, 0x41,
+        0xce, 0xfc, 0x1f, 0xb0, 0x35, 0x26, 0xb9, 0xca, 0x4f, 0xe3, 0x43, 0xe5, 0x80, 0x5e, 0x95,
+        0xa5, 0xc0, 0x1e, 0x56, 0x57, 0x06, 0x38, 0xa7, 0x6a, 0x4b, 0xc8, 0xfe, 0xb0, 0x7b, 0xe8,
+        0x79, 0xf9, 0x05, 0x68, 0x61, 0x7d, 0x90, 0x5f, 0xec, 0xd5, 0xb1, 0x61, 0x9f, 0xb8, 0xec,
+        0x4a, 0x66, 0x28, 0xd1, 0xbb, 0x2b, 0xb2, 0x24, 0xc4, 0x90, 0xff, 0x97, 0xa6, 0xc0, 0xe9,
+        0xac, 0xd0, 0x36, 0x04, 0xbc, 0x3a, 0x59, 0xd8, 0x6b, 0xda, 0xb4, 0xe0, 0x84, 0xc1, 0xc1,
+        0x45, 0x0f, 0x9c, 0x9d, 0x2a, 0xfe, 0xb1, 0x72, 0xc0, 0x72, 0x34, 0xd7, 0x39, 0x86, 0x8e,
+        0xbd, 0x62, 0xde, 0x20, 0x60, 0xa8, 0xde, 0x98, 0x94, 0x14, 0xa8, 0x29, 0x20, 0xda, 0xcd,
+        0x1c, 0xac, 0x0c, 0x6e, 0x72, 0xec, 0xd7, 0xf4, 0x01, 0x85, 0x74, 0xce, 0xac, 0xa6, 0xd2,
+        0x9f, 0x36, 0x1b, 0xc3, 0x7e, 0xe2, 0x88, 0x8b, 0x8e, 0x30, 0x2c, 0xa9, 0x56, 0x1a, 0x9d,
+        0xe9, 0x16, 0x3e, 0xdf, 0xa6, 0x6b, 0xad, 0xd4, 0x89, 0x48, 0x84, 0xc7, 0xb3, 0x59, 0xbc,
+        0xac, 0xae, 0x59, 0x08, 0x05, 0x1b, 0x37, 0x95, 0x2e, 0x10, 0xa4, 0x5f, 0xe7, 0x3f, 0xda,
+        0x12, 0x6e, 0xbd, 0x67, 0x57, 0x5f, 0x1b, 0xed, 0x8a, 0x99, 0x2a, 0x89, 0x47, 0x4d, 0x7d,
+        0xec, 0x1e, 0xed, 0x32, 0x78, 0x24, 0x12, 0x3a, 0x41, 0x4a, 0xdb, 0x66, 0xd5, 0xef, 0x7d,
+        0x08, 0x36, 0xff, 0x98, 0xc2, 0xcd, 0xd7, 0xfb, 0x07, 0x81, 0xe1, 0x92, 0xbf, 0x0c, 0x75,
+        0x68, 0xbf, 0x7d, 0x89, 0x0a, 0x51, 0xc3, 0x32, 0x87, 0x9b, 0x50, 0x37, 0xb2, 0x12, 0xd6,
+        0x22, 0x41, 0x2c, 0xa4, 0x8e, 0x83, 0x23, 0x81, 0x7b, 0xd6, 0xd7, 0x46, 0xee, 0xf6, 0x83,
+        0x84, 0x5c, 0xec, 0x4e, 0x3e, 0xf6, 0x4b, 0x3a, 0x18, 0xfc, 0xce, 0x51, 0x3e, 0xa9, 0x51,
+        0xf3, 0x36, 0x66, 0x93, 0xa7, 0xff, 0x49, 0x0d, 0x09, 0xd0, 0x8a, 0xb1, 0xf6, 0x3e, 0x13,
+        0x62, 0x5a, 0x54, 0x59, 0x61, 0x59, 0x9c, 0x0d, 0x9c, 0x7a, 0x09, 0x9d, 0x11, 0x63, 0xca,
+        0xd1, 0xb9, 0xbc, 0xf8, 0xe9, 0x17, 0xd7, 0x66, 0xb9, 0x88, 0x53, 0xef, 0x68, 0x77, 0x83,
+        0x4f, 0x89, 0x1d, 0xf1, 0x6b, 0xe1, 0xfc, 0xc9, 0xc1, 0x8e, 0xa1, 0x88, 0x2e, 0xa3, 0xf1,
+        0xf4, 0xb6, 0x43, 0x58, 0xe1, 0xb1, 0x46, 0xce, 0xbf, 0xb3, 0xe0, 0x2e, 0x15, 0x3f, 0xdb,
+        0x73, 0xaf, 0x26, 0x93, 0xf2, 0x2c, 0x6f, 0x59, 0x3f, 0xa4, 0x75, 0x38, 0x0b, 0xa6, 0x61,
+        0x17, 0x40, 0xad, 0x20, 0xe3, 0x19, 0xa6, 0x54, 0xac, 0x56, 0x84, 0x77, 0x52, 0x36, 0x16,
+        0x2e, 0x84, 0x47, 0xed, 0x80, 0x88, 0x61, 0xbf, 0xbd, 0xa6, 0xe1, 0x8e, 0xc9, 0x7a, 0xe0,
+        0x90, 0xbf, 0x70, 0x34, 0x75, 0xcf, 0xb9, 0x0f, 0xe2, 0x0a, 0x3c, 0x55, 0xbe, 0xf6, 0xf5,
+        0xeb, 0xa6, 0xe6, 0xa1, 0xda, 0x6a, 0x19, 0x96, 0xb8, 0xbd, 0xe4, 0x21, 0x80, 0x60, 0x8c,
+        0xa2, 0x27, 0x9d, 0xef, 0x8e, 0x81, 0x53, 0x89, 0x5c, 0xc8, 0x50, 0xdb, 0x64, 0x20, 0x56,
+        0x1c, 0x04, 0xb5, 0x72, 0x9c, 0xc6, 0x88, 0x34, 0x36, 0xea, 0x02, 0xee, 0x07, 0xeb, 0x9b,
+        0xae, 0xe2, 0xfb, 0x3a, 0x9e, 0x1b, 0xbd, 0xa8, 0x73, 0x0d, 0x6b, 0x22, 0x05, 0x76, 0xe2,
+        0x4d, 0xf7, 0x0a, 0xf6, 0x92, 0x8e, 0xb8, 0x65, 0xfe, 0xe8, 0xa1, 0xd1, 0xc0, 0xf1, 0x81,
+        0x8a, 0xca, 0x68, 0xd5, 0x00, 0x2a, 0xe4, 0xc6, 0x5b, 0x2f, 0x49, 0xc9, 0xe6, 0xe2, 0x1d,
+        0xcf, 0x76, 0x78, 0x4a, 0xdb, 0xd0, 0xe8, 0x87, 0xa3, 0x68, 0x32, 0xef, 0x85, 0xbe, 0xb1,
+        0x05, 0x87, 0xf1, 0x6c, 0x6f, 0xfe, 0x60, 0xd7, 0x45, 0x10, 0x59, 0xec, 0x7f, 0x10, 0x14,
+        0xc3, 0xef, 0xe1, 0x9e, 0x56, 0xae, 0xdb, 0x5a, 0xd3, 0x1a, 0x9f, 0x29, 0xdc, 0x44, 0x58,
+        0xcf, 0xbf, 0x0c, 0x70, 0x70, 0xc1, 0x75, 0xdc, 0xad, 0x46, 0xe1, 0x67, 0x52, 0x26, 0xb4,
+        0x7c, 0x07, 0x1a, 0xad, 0x31, 0x72, 0xeb, 0xd3, 0x3e, 0x45, 0xd7, 0x41, 0xcb, 0x91, 0x25,
+        0x3a, 0x01, 0xa6, 0x9a, 0xe3, 0xcc, 0x29, 0x2b, 0xce, 0x9c, 0x03, 0x24, 0x6a, 0xc9, 0x51,
+        0xe4, 0x5e, 0x97, 0xeb, 0xf0, 0x4a, 0x9d, 0x51, 0xfa, 0xb5, 0xcf, 0x06, 0xd9, 0x48, 0x5c,
+        0xce, 0x74, 0x6b, 0x1c, 0x07, 0x7b, 0xe6, 0x9a, 0xd1, 0x53, 0xf1, 0x65, 0x6e, 0xf8, 0x9f,
+        0xc7, 0xd1, 0xed, 0x8c, 0x3e, 0x2d, 0xa7, 0xa2,
+    ];
+
+    #[test]
+    fn test_gcm1() {
+        let mut computed1 = [0u8; 76];
+        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&key1);
+        aes128_gcm_set_nonce(&mut st, &nonce1);
+        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
+        aes128_gcm_encrypt(&mut st, &aad1, &input1, &mut ciphertext, &mut tag);
+        for i in 0..76 {
+            if computed1[i] != expected1[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, expected1[i], computed1[i]
+                )
+            }
+        }
+    }
+
+    #[test]
+    fn test_gcm2() {
+        let mut computed2 = [0u8; 668];
+        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&key2);
+        aes128_gcm_set_nonce(&mut st, &nonce2);
+        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
+        aes128_gcm_encrypt(&mut st, &aad2, &input2, &mut ciphertext, &mut tag);
+        for i in 0..668 {
+            if computed2[i] != expected2[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, expected2[i], computed2[i]
+                )
+            }
+        }
+    }
+}

From 6986f0285e84e19cb2e93ff0719d0b8ea2f492d7 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Sun, 20 Apr 2025 22:09:53 +0200
Subject: [PATCH 04/43] benching

---
 libcrux-aesgcm/Cargo.toml                     |   1 +
 libcrux-aesgcm/benches/aesgcm.rs              | 105 ++++++++++++++++
 libcrux-aesgcm/src/aes_ctr.rs                 |  64 +++++++---
 libcrux-aesgcm/src/aes_gcm.rs                 |  92 ++++++++++----
 libcrux-aesgcm/src/gf128_generic.rs           |  32 +++--
 libcrux-aesgcm/src/lib.rs                     |  38 +++++-
 libcrux-aesgcm/src/platform.rs                |   3 +
 libcrux-aesgcm/src/platform/neon.rs           |   4 +
 libcrux-aesgcm/src/platform/neon/aes_core.rs  | 118 ++++++++++++++++++
 .../src/platform/neon/gf128_core.rs           |  75 +++++++++++
 .../src/platform/portable/aes_core.rs         |   4 +-
 .../src/platform/portable/gf128_core.rs       |   3 +-
 12 files changed, 483 insertions(+), 56 deletions(-)
 create mode 100644 libcrux-aesgcm/benches/aesgcm.rs
 create mode 100644 libcrux-aesgcm/src/platform/neon.rs
 create mode 100644 libcrux-aesgcm/src/platform/neon/aes_core.rs
 create mode 100644 libcrux-aesgcm/src/platform/neon/gf128_core.rs

diff --git a/libcrux-aesgcm/Cargo.toml b/libcrux-aesgcm/Cargo.toml
index 676734c17..bd3335c56 100644
--- a/libcrux-aesgcm/Cargo.toml
+++ b/libcrux-aesgcm/Cargo.toml
@@ -33,6 +33,7 @@ rand = "0.8.5"
 cavp = { version = "0.0.2-beta.2", path = "../cavp" }
 pretty_env_logger = "0.5.0"
 rand_core = { version = "0.6" }
+aes-gcm = "0.10.3"
 
 [lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)', 'cfg(eurydice)'] }
diff --git a/libcrux-aesgcm/benches/aesgcm.rs b/libcrux-aesgcm/benches/aesgcm.rs
new file mode 100644
index 000000000..9a98e0d7a
--- /dev/null
+++ b/libcrux-aesgcm/benches/aesgcm.rs
@@ -0,0 +1,105 @@
+#![allow(non_snake_case)]
+use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
+
+
+pub fn randombytes(n: usize) -> Vec<u8> {
+    use rand::rngs::OsRng;
+    use rand::RngCore;
+
+    let mut bytes = vec![0u8; n];
+    OsRng.fill_bytes(&mut bytes);
+    bytes
+}
+
+pub fn fmt(x: usize) -> String {
+    let base = (x as f64).log(1024f64).floor() as usize;
+    let suffix = ["", "KB", "MB", "GB"];
+    format!("{} {}", x >> (10 * base), suffix[base])
+}
+
+
+macro_rules! impl_comp {
+    ($fun:ident, $portable_fun:expr, $neon_fun:expr, $rustcrypto_fun:expr) => {
+        // Comparing libcrux performance for different payload sizes and other implementations.
+        fn $fun(c: &mut Criterion) {
+            const PAYLOAD_SIZES: [usize; 3] = [128, 1024, 1024 * 1024 * 10];
+            
+            let mut group = c.benchmark_group(stringify!($fun).replace("_", " "));
+
+            for payload_size in PAYLOAD_SIZES.iter() {
+                group.throughput(Throughput::Bytes(*payload_size as u64));
+
+                group.bench_with_input(
+                    BenchmarkId::new("libcrux", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
+                            |(key,nonce,aad,payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $portable_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
+
+                #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+                group.bench_with_input(
+                    BenchmarkId::new("neon-aes-clmul", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
+                            |(key,nonce,aad,payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $neon_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
+                group.bench_with_input(
+                    BenchmarkId::new("rust-crypto", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
+                            |(key,nonce,aad,payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $rustcrypto_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+            }
+        }
+    };
+}
+
+use aes_gcm::{
+    aead::{Aead, AeadCore, KeyInit, OsRng},
+    Aes128Gcm, Nonce, Key // Or `Aes128Gcm`
+};
+
+fn rustcrypto_aes128_gcm_encrypt(key:&[u8], nonce:&[u8], aad:&[u8], plain:&[u8], ciphertext:&mut [u8], tag:&mut [u8]){
+    let cipher = Aes128Gcm::new(key.into());
+    let ctxt = cipher.encrypt(nonce.into(), plain).unwrap();
+    ciphertext.copy_from_slice(&ctxt[0..plain.len()]);
+    tag.copy_from_slice(&ctxt[plain.len()..]);
+}
+
+impl_comp!(AES128_GCM, libcrux_aesgcm::portable::aes128_gcm_encrypt, libcrux_aesgcm::neon::aes128_gcm_encrypt, rustcrypto_aes128_gcm_encrypt);
+
+fn benchmarks(c: &mut Criterion) {
+    AES128_GCM(c);
+}
+
+criterion_group!(benches, benchmarks);
+criterion_main!(benches);
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index eac446054..777afc992 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -71,7 +71,7 @@ fn aes_ctr_update<T: AESState, const NUM_KEYS: usize>(
     let blocks = inp.len() / 16;
     aes_ctr_xor_blocks(&ctx, ctr, &inp[0..blocks * 16], &mut out[0..blocks * 16]);
     let last = inp.len() - inp.len() % 16;
-    if (last < inp.len()) {
+    if last < inp.len() {
         aes_ctr_xor_block(
             &ctx,
             ctr.wrapping_add(blocks as u32),
@@ -255,19 +255,19 @@ mod test {
 
     use super::{aes128_ctr_encrypt, aes128_ctr_init, aes128_ctr_xor_block};
 
-    const input: [u8; 32] = [
+    const INPUT: [u8; 32] = [
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
         0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D,
         0x1E, 0x1F,
     ];
-    const key: [u8; 16] = [
+    const KEY: [u8; 16] = [
         0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91,
         0x63,
     ];
-    const nonce: [u8; 12] = [
+    const NONCE: [u8; 12] = [
         0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B,
     ];
-    const expected: [u8; 32] = [
+    const EXPECTED: [u8; 32] = [
         0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3,
         0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41,
         0xBE, 0x28,
@@ -276,15 +276,33 @@ mod test {
     #[test]
     fn test_ctr_block() {
         let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = aes128_ctr_init::<platform::portable::State>(&key, &nonce);
-        aes128_ctr_xor_block(&ctx, 1, &input[0..16], &mut computed[0..16]);
-        aes128_ctr_xor_block(&ctx, 2, &input[16..32], &mut computed[16..32]);
+        let ctx = aes128_ctr_init::<platform::portable::State>(&KEY, &NONCE);
+        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
         for i in 0..32 {
-            if computed[i] != expected[i] {
+            if computed[i] != EXPECTED[i] {
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
-                    i, expected[i], computed[i]
-                )
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn test_ctr_block_neon() {
+        let mut computed: [u8; 32] = [0u8; 32];
+        let ctx = aes128_ctr_init::<platform::neon::State>(&KEY, &NONCE);
+        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+        for i in 0..32 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
             }
         }
     }
@@ -292,13 +310,29 @@ mod test {
     #[test]
     fn test_ctr_encrypt() {
         let mut computed: [u8; 32] = [0u8; 32];
-        aes128_ctr_encrypt::<platform::portable::State>(&key, &nonce, 1, &input, &mut computed);
+        aes128_ctr_encrypt::<platform::portable::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+        for i in 0..32 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn test_ctr_encrypt_neon() {
+        let mut computed: [u8; 32] = [0u8; 32];
+        aes128_ctr_encrypt::<platform::neon::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
         for i in 0..32 {
-            if computed[i] != expected[i] {
+            if computed[i] != EXPECTED[i] {
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
-                    i, expected[i], computed[i]
-                )
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
             }
         }
     }
diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/libcrux-aesgcm/src/aes_gcm.rs
index 007adaea8..2933fa216 100644
--- a/libcrux-aesgcm/src/aes_gcm.rs
+++ b/libcrux-aesgcm/src/aes_gcm.rs
@@ -1,16 +1,16 @@
 use crate::{
     aes_ctr::{
-        aes128_ctr_encrypt, aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce,
-        aes128_ctr_update, aes128_ctr_xor_block, aes128_ctr_xor_blocks, AES128_CTR_Context,
+        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce,
+        aes128_ctr_update, AES128_CTR_Context,
     },
     gf128_generic::{
-        self, gf128_emit, gf128_init, gf128_update, gf128_update_blocks, gf128_update_last,
+        gf128_emit, gf128_init, gf128_update,
         gf128_update_padded, GF128State,
     },
     platform::{AESState, GF128FieldElement},
 };
 
-#[allow(non_snake_case_types)]
+#[allow(non_camel_case_types)]
 pub struct AES128_GCM_State<T: AESState, U: GF128FieldElement> {
     aes_state: AES128_CTR_Context<T>,
     gcm_state: GF128State<U>,
@@ -101,28 +101,28 @@ pub fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
 
 #[cfg(test)]
 mod test {
-    use crate::platform::{self, portable};
+    use crate::platform::{portable, neon};
 
     use super::{aes128_gcm_encrypt, aes128_gcm_init, aes128_gcm_set_nonce};
 
-    const input1: [u8; 60] = [
+    const INPUT1: [u8; 60] = [
         0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26,
         0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31,
         0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49,
         0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39,
     ];
-    const key1: [u8; 16] = [
+    const KEY1: [u8; 16] = [
         0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83,
         0x08,
     ];
-    const nonce1: [u8; 12] = [
+    const NONCE1: [u8; 12] = [
         0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88,
     ];
-    const aad1: [u8; 20] = [
+    const AAD1: [u8; 20] = [
         0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
         0xef, 0xab, 0xad, 0xda, 0xd2,
     ];
-    const expected1: [u8; 76] = [
+    const EXPECTED1: [u8; 76] = [
         0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4,
         0x9c, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac,
         0xa1, 0x2e, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac,
@@ -131,7 +131,7 @@ mod test {
         0x47,
     ];
 
-    const input2: [u8; 652] = [
+    const INPUT2: [u8; 652] = [
         0x08, 0x00, 0x00, 0x1e, 0x00, 0x1c, 0x00, 0x0a, 0x00, 0x14, 0x00, 0x12, 0x00, 0x1d, 0x00,
         0x17, 0x00, 0x18, 0x00, 0x19, 0x01, 0x00, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03, 0x01, 0x04,
         0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x01, 0xb9, 0x00, 0x00, 0x01, 0xb5, 0x00, 0x01, 0xb0,
@@ -178,16 +178,16 @@ mod test {
         0xe5, 0x67, 0x00, 0x13, 0xaa, 0xaa, 0x16,
     ];
 
-    const key2: [u8; 16] = [
+    const KEY2: [u8; 16] = [
         0xfd, 0xa2, 0xa4, 0x40, 0x46, 0x70, 0x80, 0x8f, 0x49, 0x37, 0x47, 0x8b, 0x8b, 0x6e, 0x3f,
         0xe1,
     ];
-    const nonce2: [u8; 12] = [
+    const NONCE2: [u8; 12] = [
         0xb5, 0xf3, 0xa3, 0xfa, 0xe1, 0xcb, 0x25, 0xc9, 0xdc, 0xd7, 0x39, 0x93,
     ];
-    const aad2: [u8; 0] = [];
+    const AAD2: [u8; 0] = [];
 
-    const expected2: [u8; 668] = [
+    const EXPECTED2: [u8; 668] = [
         0xc1, 0xe6, 0x31, 0xf8, 0x1d, 0x2a, 0xf2, 0x21, 0xeb, 0xb6, 0xa9, 0x57, 0xf5, 0x8f, 0x3e,
         0xe2, 0x66, 0x27, 0x26, 0x35, 0xe6, 0x7f, 0x99, 0xa7, 0x52, 0xf0, 0xdf, 0x08, 0xad, 0xeb,
         0x33, 0xba, 0xb8, 0x61, 0x1e, 0x55, 0xf3, 0x3d, 0x72, 0xcf, 0x84, 0x38, 0x24, 0x61, 0xa8,
@@ -238,16 +238,17 @@ mod test {
     #[test]
     fn test_gcm1() {
         let mut computed1 = [0u8; 76];
-        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&key1);
-        aes128_gcm_set_nonce(&mut st, &nonce1);
+        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY1);
+        aes128_gcm_set_nonce(&mut st, &NONCE1);
         let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes128_gcm_encrypt(&mut st, &aad1, &input1, &mut ciphertext, &mut tag);
+        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
         for i in 0..76 {
-            if computed1[i] != expected1[i] {
+            if computed1[i] != EXPECTED1[i] {
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
-                    i, expected1[i], computed1[i]
-                )
+                    i, EXPECTED1[i], computed1[i]
+                );
+                assert!(false);
             }
         }
     }
@@ -255,16 +256,53 @@ mod test {
     #[test]
     fn test_gcm2() {
         let mut computed2 = [0u8; 668];
-        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&key2);
-        aes128_gcm_set_nonce(&mut st, &nonce2);
+        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY2);
+        aes128_gcm_set_nonce(&mut st, &NONCE2);
         let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes128_gcm_encrypt(&mut st, &aad2, &input2, &mut ciphertext, &mut tag);
+        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
         for i in 0..668 {
-            if computed2[i] != expected2[i] {
+            if computed2[i] != EXPECTED2[i] {
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
-                    i, expected2[i], computed2[i]
-                )
+                    i, EXPECTED2[i], computed2[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn test_gcm1_neon() {
+        let mut computed1 = [0u8; 76];
+        let mut st = aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY1);
+        aes128_gcm_set_nonce(&mut st, &NONCE1);
+        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
+        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
+        for i in 0..76 {
+            if computed1[i] != EXPECTED1[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED1[i], computed1[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn test_gcm2_neon() {
+        let mut computed2 = [0u8; 668];
+        let mut st = aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY2);
+        aes128_gcm_set_nonce(&mut st, &NONCE2);
+        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
+        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
+        for i in 0..668 {
+            if computed2[i] != EXPECTED2[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED2[i], computed2[i]
+                );
+                assert!(false);
             }
         }
     }
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 07e139063..8e7a2215e 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -60,11 +60,9 @@ pub fn gf128<T: GF128FieldElement>(key: &[u8], inp: &[u8], out: &mut [u8]) {
 
 #[cfg(test)]
 mod test {
-    use crate::platform;
-
     use super::gf128;
 
-    const input: [u8; 132] = [
+    const INPUT: [u8; 132] = [
         0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
         0xef, 0xab, 0xad, 0xda, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65,
@@ -76,12 +74,12 @@ mod test {
         0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x44, 0xae, 0x7e, 0x3f,
     ];
 
-    const key: [u8; 16] = [
+    const KEY: [u8; 16] = [
         0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda,
         0xd7,
     ];
 
-    const expected: [u8; 16] = [
+    const EXPECTED: [u8; 16] = [
         0xfb, 0xba, 0xaa, 0x70, 0xa0, 0x73, 0x6f, 0xf9, 0xed, 0x2f, 0xc4, 0x62, 0xde, 0x72, 0x61,
         0xe0,
     ];
@@ -89,13 +87,29 @@ mod test {
     #[test]
     fn test_gf128() {
         let mut computed: [u8; 16] = [0u8; 16];
-        gf128::<crate::platform::portable::FieldElement>(&key, &input, &mut computed);
+        gf128::<crate::platform::portable::FieldElement>(&KEY, &INPUT, &mut computed);
+        for i in 0..16 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[test]
+    fn test_gf128_neon() {
+        let mut computed: [u8; 16] = [0u8; 16];
+        gf128::<crate::platform::neon::FieldElement>(&KEY, &INPUT, &mut computed);
         for i in 0..16 {
-            if computed[i] != expected[i] {
+            if computed[i] != EXPECTED[i] {
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
-                    i, expected[i], computed[i]
-                )
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
             }
         }
     }
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 4c4ce69d6..295dae202 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -1,5 +1,39 @@
-pub mod aes_ctr;
+mod aes_ctr;
 mod aes_gcm;
 mod aes_generic;
-pub mod gf128_generic;
+mod gf128_generic;
 mod platform;
+
+pub use aes_gcm::DecryptError;
+
+pub mod portable{
+    use crate::{aes_gcm::{self, DecryptError}, platform};
+
+    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
+        let mut st = aes_gcm::aes128_gcm_init::<platform::portable::State,platform::portable::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
+    }
+
+    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
+        let mut st = aes_gcm::aes128_gcm_init::<platform::portable::State,platform::portable::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+    }
+}
+
+pub mod neon{
+    use crate::{aes_gcm::{self, DecryptError}, platform};
+
+    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
+        let mut st = aes_gcm::aes128_gcm_init::<platform::neon::State,platform::neon::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
+    }
+
+    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
+        let mut st = aes_gcm::aes128_gcm_init::<platform::neon::State,platform::neon::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+    }
+}
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 25b961ccf..1174022b6 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -1,5 +1,8 @@
 pub mod portable;
 
+#[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+pub mod neon;
+
 pub trait AESState: Copy {
     fn new() -> Self;
     fn load_block(&mut self, b: &[u8]);
diff --git a/libcrux-aesgcm/src/platform/neon.rs b/libcrux-aesgcm/src/platform/neon.rs
new file mode 100644
index 000000000..7fe9d7462
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/neon.rs
@@ -0,0 +1,4 @@
+mod aes_core;
+mod gf128_core;
+pub(crate) use aes_core::State;
+pub(crate) use gf128_core::FieldElement;
diff --git a/libcrux-aesgcm/src/platform/neon/aes_core.rs b/libcrux-aesgcm/src/platform/neon/aes_core.rs
new file mode 100644
index 000000000..00b921402
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/neon/aes_core.rs
@@ -0,0 +1,118 @@
+use core::arch::aarch64::*;
+
+pub(crate) type State = uint8x16_t;
+
+fn new_state() -> State {
+    unsafe {vdupq_n_u8(0)}
+}
+
+fn xor_key1_state(st: &mut State, k: &State) {
+    unsafe {*st = veorq_u8(*st, *k)}
+}
+
+fn aes_enc(st: &mut State, key: &State) {
+    unsafe {*st = veorq_u8(vaesmcq_u8(vaeseq_u8(*st, vdupq_n_u8(0))),*key)}
+}
+
+fn aes_enc_last(st: &mut State, key: &State) {
+    unsafe {*st = veorq_u8(vaeseq_u8(*st, vdupq_n_u8(0)),*key)}
+}
+
+fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
+    unsafe {
+        let st = vaeseq_u8(*prev, vdupq_n_u8(0));
+        let mut tmp = [0u8; 16];
+        vst1q_u8(tmp.as_mut_ptr(), st);
+        let tmp_new = [tmp[4], tmp[1], tmp[14], tmp[11],
+                        tmp[1], tmp[14], tmp[11], tmp[4],
+                        tmp[12], tmp[9], tmp[6], tmp[3],
+                        tmp[9], tmp[6], tmp[3], tmp[12]];
+        let st_new = vld1q_u8(tmp_new.as_ptr());
+        let rcon_array = [0, rcon as u32, 0, rcon as u32];
+        let rcon_vec = vreinterpretq_u8_u32(vld1q_u32(rcon_array.as_ptr()));
+        *next = veorq_u8(st_new , rcon_vec);
+    }
+}
+
+fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
+    aes_keygen_assist(next, prev, rcon);
+    unsafe {*next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 3))}
+}
+
+fn aes_keygen_assist1(next: &mut State, prev: &State) {
+    aes_keygen_assist(next, prev, 0);
+    unsafe {*next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 2))}
+}
+
+
+fn key_expansion_step(next: &mut State, prev: &State) {
+    unsafe{
+        let zero = vdupq_n_u32(0);
+        let prev0 = vreinterpretq_u32_u8(*prev);
+        let prev1 = veorq_u32(prev0, vextq_u32(zero, prev0, 3));
+        let prev2 = veorq_u32(prev1, vextq_u32(zero, prev1, 3));
+        let prev3 = veorq_u32(prev2, vextq_u32(zero, prev2, 3));
+        *next = veorq_u8(*next,vreinterpretq_u8_u32(prev3));
+    }
+}
+
+impl crate::platform::AESState for State {
+    fn new() -> Self {
+        new_state()
+    }
+
+    fn load_block(&mut self, b: &[u8]) {
+        debug_assert!(b.len() == 16);
+        unsafe {*self = vld1q_u8(b.as_ptr())};
+    }
+
+    fn store_block(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+        unsafe {vst1q_u8(out.as_mut_ptr(), *self)}
+    }
+
+    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        let inp_vec = unsafe {vld1q_u8(inp.as_ptr()) };
+        let out_vec = unsafe {veorq_u8(inp_vec, *self)};
+        unsafe {vst1q_u8(out.as_mut_ptr(),out_vec)}
+    }
+
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);
+        (self, key);
+    }
+
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8) {
+        aes_keygen_assist0(self, prev, rcon);
+    }
+
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    fn key_expansion_step(&mut self, prev: &Self) {
+        key_expansion_step(self, prev)
+    }
+}
+
+
+#[test]
+fn test () {
+    unsafe {
+        let zero = vdupq_n_u32(0);
+        let arr : [u32;4] = [0,1,2,3];
+        let x = vld1q_u32(arr.as_ptr());
+        let y = vdupq_laneq_u32(x, 3);
+        let z = vextq_u32(zero, x, 3);
+        println!("arr:{:?}, x: {:?}, y: {:?}, z: {:?}",arr,x,y,z);
+    }
+}
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/neon/gf128_core.rs b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
new file mode 100644
index 000000000..f6bd7e344
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
@@ -0,0 +1,75 @@
+use core::arch::aarch64::*;
+
+#[derive(Clone,Copy)]
+pub struct FieldElement(pub u128);
+
+fn zero() -> FieldElement {
+    FieldElement(0)
+}
+
+fn load_elem(b: &[u8]) -> FieldElement {
+    debug_assert!(b.len() == 16);
+    FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
+}
+
+fn store_elem(elem: &FieldElement, b: &mut [u8]) {
+    debug_assert!(b.len() == 16);
+    b.copy_from_slice(&elem.0.to_be_bytes());
+}
+
+fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
+    FieldElement((*elem).0 ^ (*other).0)
+}
+
+fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+    let l0 = (*elem).0 as u64;
+    let h0 = ((*elem).0 >> 64) as u64;
+    let l1 = (*other).0 as u64;
+    let h1 = ((*other).0 >> 64) as u64;
+    let low : u128 = unsafe {vmull_p64(l0, l1)};
+    let m1 : u128 = unsafe {vmull_p64(l0, h1)};
+    let m2 : u128 = unsafe {vmull_p64(l1, h0)};
+    let high : u128 = unsafe {vmull_p64(h0, h1)};
+    let mid = m1 ^ m2;
+    let m0 = mid << 64;
+    let m1 = mid >> 64;
+    let low = low ^ m0;
+    let high = high ^ m1;
+    (FieldElement(high), FieldElement(low))
+}
+
+fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
+    let high = ((*high).0 << 1) ^ ((*low).0 >> 127);
+    let low = (*low).0 << 1;
+    let x0_0 = low << 64;
+    let x1_x0 = low ^ (x0_0 << 63) ^ (x0_0 << 62) ^ (x0_0 << 57);
+    let x1_x0 = x1_x0 ^ (x1_x0 >> 1) ^ (x1_x0 >> 2) ^ (x1_x0 >> 7);
+    FieldElement(x1_x0 ^ high)
+}
+
+fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
+    let (high,low) = mul_wide(x,y);
+    reduce(&high,&low)
+}
+
+impl crate::platform::GF128FieldElement for FieldElement {
+    fn zero() -> Self {
+        zero()
+    }
+
+    fn load_elem(b: &[u8]) -> Self {
+        load_elem(b)
+    }
+
+    fn store_elem(&self, b: &mut [u8]) {
+        store_elem(self, b);
+    }
+
+    fn add(&mut self, other: &Self) {
+        *self = add(self, other);
+    }
+
+    fn mul(&mut self, other: &Self) {
+        *self = mul(self, other)
+    }
+}
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
index 8f2b10178..2a27ccd84 100644
--- a/libcrux-aesgcm/src/platform/portable/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -521,8 +521,8 @@ fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
     fn aux(mut n: u16) -> u16 {
         n &= 0x0f00;
-        n ^= (n << 4);
-        n ^= (n >> 8);
+        n ^= n << 4;
+        n ^= n >> 8;
         n
     }
     next[0] = aux(next[0]);
diff --git a/libcrux-aesgcm/src/platform/portable/gf128_core.rs b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
index 40a2724ed..508bc601b 100644
--- a/libcrux-aesgcm/src/platform/portable/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/gf128_core.rs
@@ -3,6 +3,7 @@ pub(crate) type FieldElement = u128;
 fn zero() -> FieldElement {
     0
 }
+
 fn load_elem(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
     u128::from_be_bytes(b.try_into().unwrap())
@@ -37,7 +38,7 @@ fn mul_x(elem: &mut FieldElement) {
 fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut FieldElement) {
     debug_assert!(i < 128);
     let mask = ith_bit_mask(x, i);
-    *result ^= (*y & mask);
+    *result ^= *y & mask;
     mul_x(y);
 }
 

From 5ccd52bd3d012814f8825cdbaf4ead072b2a30f6 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Wed, 23 Apr 2025 17:20:58 +0200
Subject: [PATCH 05/43] wip

---
 libcrux-aesgcm/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 295dae202..2acea349f 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -1,8 +1,8 @@
-mod aes_ctr;
+pub mod aes_ctr;
 mod aes_gcm;
 mod aes_generic;
 mod gf128_generic;
-mod platform;
+pub mod platform;
 
 pub use aes_gcm::DecryptError;
 

From 632735dbb7308264b6249e7f9ed457ea92653448 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Wed, 23 Apr 2025 15:31:18 +0000
Subject: [PATCH 06/43] fixes for portability

---
 libcrux-aesgcm/Cargo.toml           | 6 +++---
 libcrux-aesgcm/src/aes_ctr.rs       | 2 ++
 libcrux-aesgcm/src/aes_gcm.rs       | 7 ++++++-
 libcrux-aesgcm/src/gf128_generic.rs | 1 +
 libcrux-aesgcm/src/lib.rs           | 1 +
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/libcrux-aesgcm/Cargo.toml b/libcrux-aesgcm/Cargo.toml
index bd3335c56..b2700c901 100644
--- a/libcrux-aesgcm/Cargo.toml
+++ b/libcrux-aesgcm/Cargo.toml
@@ -14,9 +14,9 @@ exclude = ["/proofs", "/c.sh", "/c.yaml", "/tests/tv", "tests/cavp.rs"]
 bench = false # so libtest doesn't eat the arguments to criterion
 
 [dependencies]
-libcrux-platform = { version = "0.0.2-beta.2", path = "../sys/platform" }
-libcrux-intrinsics = { version = "0.0.2-beta.2", path = "../libcrux-intrinsics" }
-hax-lib = { version = "0.1.0-alpha.1", git = "https://github.com/hacspec/hax/" }
+libcrux-platform = { version = "0.0.2", path = "../sys/platform" }
+libcrux-intrinsics = { version = "0.0.2", path = "../libcrux-intrinsics" }
+hax-lib = { version = "0.2", git = "https://github.com/cryspen/hax/" }
 
 [features]
 simd128 = []
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index 777afc992..5ab4bb041 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -290,6 +290,7 @@ mod test {
         }
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
     #[test]
     fn test_ctr_block_neon() {
         let mut computed: [u8; 32] = [0u8; 32];
@@ -322,6 +323,7 @@ mod test {
         }
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
     #[test]
     fn test_ctr_encrypt_neon() {
         let mut computed: [u8; 32] = [0u8; 32];
diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/libcrux-aesgcm/src/aes_gcm.rs
index 2933fa216..e1d6a3651 100644
--- a/libcrux-aesgcm/src/aes_gcm.rs
+++ b/libcrux-aesgcm/src/aes_gcm.rs
@@ -101,7 +101,7 @@ pub fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
 
 #[cfg(test)]
 mod test {
-    use crate::platform::{portable, neon};
+    use crate::platform::portable;
 
     use super::{aes128_gcm_encrypt, aes128_gcm_init, aes128_gcm_set_nonce};
 
@@ -270,7 +270,11 @@ mod test {
             }
         }
     }
+    
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    use crate::platform::neon;
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
     #[test]
     fn test_gcm1_neon() {
         let mut computed1 = [0u8; 76];
@@ -289,6 +293,7 @@ mod test {
         }
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
     #[test]
     fn test_gcm2_neon() {
         let mut computed2 = [0u8; 668];
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 8e7a2215e..f44635b54 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -99,6 +99,7 @@ mod test {
         }
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
     #[test]
     fn test_gf128_neon() {
         let mut computed: [u8; 16] = [0u8; 16];
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 2acea349f..82fe68050 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -22,6 +22,7 @@ pub mod portable{
     }
 }
 
+#[cfg(all(target_arch = "aarch64", target_feature="aes"))]
 pub mod neon{
     use crate::{aes_gcm::{self, DecryptError}, platform};
 

From 1d77af8b5d64e67ca32ebfb534fcc08fb2af0b3d Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Sun, 27 Apr 2025 17:26:05 +0000
Subject: [PATCH 07/43] rcon generic

---
 libcrux-aesgcm/src/aes_ctr.rs                 | 18 ++++++
 libcrux-aesgcm/src/aes_generic.rs             | 56 ++++++++++++++-----
 libcrux-aesgcm/src/platform.rs                |  5 +-
 libcrux-aesgcm/src/platform/intel_ni.rs       |  2 +
 .../src/platform/portable/aes_core.rs         |  4 +-
 5 files changed, 69 insertions(+), 16 deletions(-)
 create mode 100644 libcrux-aesgcm/src/platform/intel_ni.rs

diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index 5ab4bb041..785691e13 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -308,6 +308,24 @@ mod test {
         }
     }
 
+    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[test]
+    fn test_ctr_block_neon() {
+        let mut computed: [u8; 32] = [0u8; 32];
+        let ctx = aes128_ctr_init::<platform::neon::State>(&KEY, &NONCE);
+        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+        for i in 0..32 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
     #[test]
     fn test_ctr_encrypt() {
         let mut computed: [u8; 32] = [0u8; 32];
diff --git a/libcrux-aesgcm/src/aes_generic.rs b/libcrux-aesgcm/src/aes_generic.rs
index 862a03397..a3b041dae 100644
--- a/libcrux-aesgcm/src/aes_generic.rs
+++ b/libcrux-aesgcm/src/aes_generic.rs
@@ -10,11 +10,24 @@ pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11
     debug_assert!(key.len() == 16);
     let mut keyex = [T::new(); 11];
     keyex[0].load_block(&key);
-    for i in 1..11 {
-        let prev = keyex[i - 1];
-        keyex[i].aes_keygen_assist0(&prev, RCON[i]);
-        keyex[i].key_expansion_step(&prev);
+
+    macro_rules! expansion_step128 {
+        ($i:expr,$rcon:expr) => {
+            let prev = keyex[$i-1];
+            keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
+            keyex[$i].key_expansion_step(&prev);
+        }
     }
+    expansion_step128!(1,0x01);
+    expansion_step128!(2,0x02);
+    expansion_step128!(3,0x04);
+    expansion_step128!(4,0x08);
+    expansion_step128!(5,0x10);
+    expansion_step128!(6,0x20);
+    expansion_step128!(7,0x40);
+    expansion_step128!(8,0x80);
+    expansion_step128!(9,0x1b);
+    expansion_step128!(10,0x36);
     keyex
 }
 
@@ -23,18 +36,35 @@ pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15
     let mut keyex = [T::new(); 15];
     keyex[0].load_block(&key[0..16]);
     keyex[1].load_block(&key[16..32]);
-    for i in 2..14 {
-        let prev0 = keyex[i - 2];
-        let prev1 = keyex[i - 1];
-        keyex[i].aes_keygen_assist0(&prev1, RCON[i / 2]);
-        keyex[i].key_expansion_step(&prev0);
-        let next0 = keyex[i];
-        keyex[i + 1].aes_keygen_assist1(&next0);
-        keyex[i + 1].key_expansion_step(&prev1);
+
+    macro_rules! expansion_step256 {
+        ($i:expr,$rcon:expr) => {
+            let prev0 = keyex[$i-2];
+            let prev1 = keyex[$i-1];
+            keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
+            keyex[$i].key_expansion_step(&prev0);
+            let next0 = keyex[$i];
+            keyex[$i+1].aes_keygen_assist1(&next0);
+            keyex[$i+1].key_expansion_step(&prev1);
+        }
     }
+
+    expansion_step256!(2,0x01);
+    expansion_step256!(3,0x01);
+    expansion_step256!(4,0x02);
+    expansion_step256!(5,0x02);
+    expansion_step256!(6,0x04);
+    expansion_step256!(7,0x04);
+    expansion_step256!(8,0x08);
+    expansion_step256!(9,0x08);
+    expansion_step256!(10,0x10);
+    expansion_step256!(11,0x10);
+    expansion_step256!(12,0x20);
+    expansion_step256!(13,0x20);
+
     let prev0 = keyex[12];
     let prev1 = keyex[13];
-    keyex[14].aes_keygen_assist0(&prev1, RCON[7]);
+    keyex[14].aes_keygen_assist0::<0x40>(&prev1);
     keyex[14].key_expansion_step(&prev0);
     keyex
 }
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 1174022b6..9142fd2e9 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -3,6 +3,9 @@ pub mod portable;
 #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
 pub mod neon;
 
+//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+pub mod intel_ni;
+
 pub trait AESState: Copy {
     fn new() -> Self;
     fn load_block(&mut self, b: &[u8]);
@@ -12,7 +15,7 @@ pub trait AESState: Copy {
     fn xor_key(&mut self, key: &Self);
     fn aes_enc(&mut self, key: &Self);
     fn aes_enc_last(&mut self, key: &Self);
-    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8);
+    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self);
     fn aes_keygen_assist1(&mut self, prev: &Self);
     fn key_expansion_step(&mut self, prev: &Self);
 }
diff --git a/libcrux-aesgcm/src/platform/intel_ni.rs b/libcrux-aesgcm/src/platform/intel_ni.rs
new file mode 100644
index 000000000..329cdcbee
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/intel_ni.rs
@@ -0,0 +1,2 @@
+mod aes_core;
+mod gf128_core;
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
index 2a27ccd84..ca87e09f4 100644
--- a/libcrux-aesgcm/src/platform/portable/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -588,8 +588,8 @@ impl crate::platform::AESState for State {
         aes_enc_last(self, key);
     }
 
-    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8) {
-        aes_keygen_assist0(self, prev, rcon);
+    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0(self, prev, RCON as u8);
     }
 
     fn aes_keygen_assist1(&mut self, prev: &Self) {

From 169cadd598f9f1e32927653b85525843be0c16cf Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Mon, 28 Apr 2025 09:38:55 +0000
Subject: [PATCH 08/43] intel

---
 libcrux-aesgcm/src/aes_ctr.rs                 | 17 ++++++++
 libcrux-aesgcm/src/aes_gcm.rs                 | 41 +++++++++++++++++++
 libcrux-aesgcm/src/gf128_generic.rs           | 16 ++++++++
 libcrux-aesgcm/src/platform.rs                |  2 +-
 libcrux-aesgcm/src/platform/intel_ni.rs       |  4 +-
 .../src/platform/neon/gf128_core.rs           |  1 +
 6 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index 785691e13..b12ffdc4d 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -356,4 +356,21 @@ mod test {
             }
         }
     }
+
+    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[test]
+    fn test_ctr_encrypt_intel() {
+        let mut computed: [u8; 32] = [0u8; 32];
+        aes128_ctr_encrypt::<platform::intel_ni::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+        for i in 0..32 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+    
 }
diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/libcrux-aesgcm/src/aes_gcm.rs
index e1d6a3651..18294b441 100644
--- a/libcrux-aesgcm/src/aes_gcm.rs
+++ b/libcrux-aesgcm/src/aes_gcm.rs
@@ -311,4 +311,45 @@ mod test {
             }
         }
     }
+
+    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    use crate::platform::intel_ni;
+
+    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[test]
+    fn test_gcm1_intel() {
+        let mut computed1 = [0u8; 76];
+        let mut st = aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY1);
+        aes128_gcm_set_nonce(&mut st, &NONCE1);
+        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
+        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
+        for i in 0..76 {
+            if computed1[i] != EXPECTED1[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED1[i], computed1[i]
+                );
+                assert!(false);
+            }
+        }
+    }
+
+    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[test]
+    fn test_gcm2_intel() {
+        let mut computed2 = [0u8; 668];
+        let mut st = aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY2);
+        aes128_gcm_set_nonce(&mut st, &NONCE2);
+        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
+        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
+        for i in 0..668 {
+            if computed2[i] != EXPECTED2[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED2[i], computed2[i]
+                );
+                assert!(false);
+            }
+        }
+    }
 }
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index f44635b54..144837fab 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -114,4 +114,20 @@ mod test {
             }
         }
     }
+
+    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[test]
+    fn test_gf128_intel() {
+        let mut computed: [u8; 16] = [0u8; 16];
+        gf128::<crate::platform::intel_ni::FieldElement>(&KEY, &INPUT, &mut computed);
+        for i in 0..16 {
+            if computed[i] != EXPECTED[i] {
+                println!(
+                    "mismatch at {}: expected is {}, computed is {}",
+                    i, EXPECTED[i], computed[i]
+                );
+                assert!(false);
+            }
+        }
+    }
 }
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 9142fd2e9..02452382c 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -3,7 +3,7 @@ pub mod portable;
 #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
 pub mod neon;
 
-//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
 pub mod intel_ni;
 
 pub trait AESState: Copy {
diff --git a/libcrux-aesgcm/src/platform/intel_ni.rs b/libcrux-aesgcm/src/platform/intel_ni.rs
index 329cdcbee..9d14728fe 100644
--- a/libcrux-aesgcm/src/platform/intel_ni.rs
+++ b/libcrux-aesgcm/src/platform/intel_ni.rs
@@ -1,2 +1,4 @@
 mod aes_core;
-mod gf128_core;
\ No newline at end of file
+mod gf128_core;
+pub(crate) use aes_core::State;
+pub(crate) use gf128_core::FieldElement;
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/neon/gf128_core.rs b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
index f6bd7e344..2976fb355 100644
--- a/libcrux-aesgcm/src/platform/neon/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
@@ -73,3 +73,4 @@ impl crate::platform::GF128FieldElement for FieldElement {
         *self = mul(self, other)
     }
 }
+

From e21bb4fdea2a57ef5dd4d94cfc22cd2043c887e5 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Mon, 28 Apr 2025 09:49:20 +0000
Subject: [PATCH 09/43] intel-ni

---
 libcrux-aesgcm/benches/aesgcm.rs              |  21 +++-
 libcrux-aesgcm/src/lib.rs                     |  17 +++
 libcrux-aesgcm/src/platform.rs                |   2 +-
 .../src/platform/intel_ni/aes_core.rs         | 108 ++++++++++++++++++
 .../src/platform/intel_ni/gf128_core.rs       |  88 ++++++++++++++
 5 files changed, 233 insertions(+), 3 deletions(-)
 create mode 100644 libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
 create mode 100644 libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs

diff --git a/libcrux-aesgcm/benches/aesgcm.rs b/libcrux-aesgcm/benches/aesgcm.rs
index 9a98e0d7a..ddfaea713 100644
--- a/libcrux-aesgcm/benches/aesgcm.rs
+++ b/libcrux-aesgcm/benches/aesgcm.rs
@@ -19,7 +19,7 @@ pub fn fmt(x: usize) -> String {
 
 
 macro_rules! impl_comp {
-    ($fun:ident, $portable_fun:expr, $neon_fun:expr, $rustcrypto_fun:expr) => {
+    ($fun:ident, $portable_fun:expr, $neon_fun:expr, $intel_fun:expr, $rustcrypto_fun:expr) => {
         // Comparing libcrux performance for different payload sizes and other implementations.
         fn $fun(c: &mut Criterion) {
             const PAYLOAD_SIZES: [usize; 3] = [128, 1024, 1024 * 1024 * 10];
@@ -63,6 +63,23 @@ macro_rules! impl_comp {
                     },
                 );
 
+//                #[cfg(all(target_arch = "x86", target_feature="aes"))]
+                group.bench_with_input(
+                    BenchmarkId::new("intel-aes-clmul", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
+                            |(key,nonce,aad,payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $intel_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
                 group.bench_with_input(
                     BenchmarkId::new("rust-crypto", fmt(*payload_size)),
                     payload_size,
@@ -95,7 +112,7 @@ fn rustcrypto_aes128_gcm_encrypt(key:&[u8], nonce:&[u8], aad:&[u8], plain:&[u8],
     tag.copy_from_slice(&ctxt[plain.len()..]);
 }
 
-impl_comp!(AES128_GCM, libcrux_aesgcm::portable::aes128_gcm_encrypt, libcrux_aesgcm::neon::aes128_gcm_encrypt, rustcrypto_aes128_gcm_encrypt);
+impl_comp!(AES128_GCM, libcrux_aesgcm::portable::aes128_gcm_encrypt, libcrux_aesgcm::neon::aes128_gcm_encrypt, libcrux_aesgcm::intel_ni::aes128_gcm_encrypt, rustcrypto_aes128_gcm_encrypt);
 
 fn benchmarks(c: &mut Criterion) {
     AES128_GCM(c);
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 82fe68050..27db8b6f9 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -37,4 +37,21 @@ pub mod neon{
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
     }
+}
+
+//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+pub mod intel_ni{
+    use crate::{aes_gcm::{self, DecryptError}, platform};
+
+    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
+        let mut st = aes_gcm::aes128_gcm_init::<platform::intel_ni::State,platform::intel_ni::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
+    }
+
+    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
+        let mut st = aes_gcm::aes128_gcm_init::<platform::intel_ni::State,platform::intel_ni::FieldElement>(key);
+        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+    }
 }
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 02452382c..9142fd2e9 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -3,7 +3,7 @@ pub mod portable;
 #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
 pub mod neon;
 
-#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
 pub mod intel_ni;
 
 pub trait AESState: Copy {
diff --git a/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs b/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
new file mode 100644
index 000000000..6c20adc58
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
@@ -0,0 +1,108 @@
+use core::arch::x86_64::*;
+
+pub(crate) type State = __m128i;
+
+fn new_state() -> State {
+    unsafe { _mm_setzero_si128() }
+}
+
+fn xor_key1_state(st: &mut State, k: &State) {
+    unsafe { *st = _mm_xor_si128(*st, *k) }
+}
+
+fn aes_enc(st: &mut State, key: &State) {
+    unsafe { *st = _mm_aesenc_si128(*st, *key) }
+}
+
+fn aes_enc_last(st: &mut State, key: &State) {
+    unsafe { *st = _mm_aesenclast_si128(*st, *key) }
+}
+
+fn aes_keygen_assist<const RCON:i32>(next: &mut State, prev: &State) {
+    unsafe { *next = _mm_aeskeygenassist_si128::<RCON>(*prev)}
+}
+
+fn aes_keygen_assist0<const RCON:i32>(next: &mut State, prev: &State) {
+    aes_keygen_assist::<RCON>(next, prev);
+    unsafe { *next = _mm_shuffle_epi32(*next, 0xff) }
+}
+
+
+fn aes_keygen_assist1(next: &mut State, prev: &State) {
+    aes_keygen_assist::<0>(next, prev);
+    unsafe { *next = _mm_shuffle_epi32(*next, 0xaa) }
+}
+
+fn key_expansion_step(next: &mut State, prev: &State) {
+    unsafe{
+        let p0 = _mm_xor_si128(*prev, _mm_slli_si128(*prev,4));
+        let p1 = _mm_xor_si128(p0, _mm_slli_si128(p0,4));
+        let p2 = _mm_xor_si128(p1, _mm_slli_si128(p1,4));
+        *next = _mm_xor_si128(*next,p2);
+    }
+}
+
+
+impl crate::platform::AESState for State {
+    fn new() -> Self {
+        new_state()
+    }
+
+    fn load_block(&mut self, b: &[u8]) {
+        debug_assert!(b.len() == 16);
+        unsafe { *self = _mm_loadu_si128(b.as_ptr() as *const __m128i) };
+    }
+
+    fn store_block(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, *self) }
+    }
+
+    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        let inp_vec = unsafe { _mm_loadu_si128(inp.as_ptr() as *const __m128i) };
+        let out_vec = unsafe { _mm_xor_si128(inp_vec, *self) };
+        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, out_vec) }
+    }
+
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);
+        (self, key);
+    }
+
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0::<RCON>(self, prev);
+    }
+
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    fn key_expansion_step(&mut self, prev: &Self) {
+        key_expansion_step(self, prev)
+    }
+}
+
+#[test]
+fn test() {
+    unsafe {
+        let x = _mm_set_epi32(3,2,1,0);
+        let y = _mm_shuffle_epi32(x,0xaa);
+        let w = _mm_slli_si128(x,4);
+        let mut z:[i32; 4] = [0;4];
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, x);
+        println!("{:?}",z);
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, w);
+        println!("shift right 4 {:?}",z);
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, y);
+        println!("shuffle aa {:?}",z);
+    }
+}
\ No newline at end of file
diff --git a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
new file mode 100644
index 000000000..b58fb3c2f
--- /dev/null
+++ b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -0,0 +1,88 @@
+use core::arch::x86_64::*;
+
+#[derive(Clone,Copy)]
+pub struct FieldElement(pub u128);
+
+fn zero() -> FieldElement {
+    FieldElement(0)
+}
+
+fn load_elem(b: &[u8]) -> FieldElement {
+    debug_assert!(b.len() == 16);
+    FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
+}
+
+fn store_elem(elem: &FieldElement, b: &mut [u8]) {
+    debug_assert!(b.len() == 16);
+    b.copy_from_slice(&elem.0.to_be_bytes());
+}
+
+fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
+    FieldElement((*elem).0 ^ (*other).0)
+}
+
+fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+    let lhs : __m128i = unsafe { std::mem::transmute((*elem).0) };
+    let rhs : __m128i = unsafe { std::mem::transmute((*other).0) };
+    let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
+    let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
+    let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
+    let high = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x00) };
+    let mid = unsafe { _mm_xor_si128(mid0, mid1) };
+    let m0 = unsafe { _mm_srli_si128(mid, 8) };
+    let m1 = unsafe { _mm_slli_si128(mid, 8) };
+    let low = unsafe { _mm_xor_si128(low, m0) };
+    let high = unsafe { _mm_xor_si128(high, m1) };
+
+    let low128 : u128 = unsafe { std::mem::transmute(low) };
+    let high128 : u128 = unsafe { std::mem::transmute(high) };   
+   (FieldElement(low128), FieldElement(high128))
+}
+
+fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
+    let high = ((*high).0 << 1) ^ ((*low).0 >> 127);
+    let low = (*low).0 << 1;
+    let x0_0 = low << 64;
+    let x1_x0 = low ^ (x0_0 << 63) ^ (x0_0 << 62) ^ (x0_0 << 57);
+    let x1_x0 = x1_x0 ^ (x1_x0 >> 1) ^ (x1_x0 >> 2) ^ (x1_x0 >> 7);
+    FieldElement(x1_x0 ^ high)
+}
+
+fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
+    let (high,low) = mul_wide(x,y);
+    reduce(&high,&low)
+}
+
+impl crate::platform::GF128FieldElement for FieldElement {
+    fn zero() -> Self {
+        zero()
+    }
+
+    fn load_elem(b: &[u8]) -> Self {
+        load_elem(b)
+    }
+
+    fn store_elem(&self, b: &mut [u8]) {
+        store_elem(self, b);
+    }
+
+    fn add(&mut self, other: &Self) {
+        *self = add(self, other);
+    }
+
+    fn mul(&mut self, other: &Self) {
+        *self = mul(self, other)
+    }
+}
+
+#[test]
+fn test_transmute() {
+    let x = 1u128 << 64 ^ 2u128;
+    let xv : __m128i = unsafe { std::mem::transmute(x)};
+    let xv : __m128i = unsafe { _mm_slli_si128(xv,8)};
+    let x : u128 = unsafe { std::mem::transmute(xv)};
+    println!("trans {:x}", x);
+    let mut u64s = [0u64; 2];
+    unsafe { _mm_storeu_si128(u64s.as_mut_ptr() as *mut __m128i, xv)};
+    println!("store {:?}", u64s)
+}
\ No newline at end of file

From 749f32dec75ec6cbd1f3dfa81031a2044f66d84b Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Mon, 28 Apr 2025 09:54:28 +0000
Subject: [PATCH 10/43] flags

---
 libcrux-aesgcm/benches/aesgcm.rs                   | 2 +-
 libcrux-aesgcm/src/aes_ctr.rs                      | 2 +-
 libcrux-aesgcm/src/gf128_generic.rs                | 2 +-
 libcrux-aesgcm/src/platform.rs                     | 2 +-
 libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs | 2 ++
 5 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/libcrux-aesgcm/benches/aesgcm.rs b/libcrux-aesgcm/benches/aesgcm.rs
index ddfaea713..924433719 100644
--- a/libcrux-aesgcm/benches/aesgcm.rs
+++ b/libcrux-aesgcm/benches/aesgcm.rs
@@ -63,7 +63,7 @@ macro_rules! impl_comp {
                     },
                 );
 
-//                #[cfg(all(target_arch = "x86", target_feature="aes"))]
+                #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
                 group.bench_with_input(
                     BenchmarkId::new("intel-aes-clmul", fmt(*payload_size)),
                     payload_size,
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index b12ffdc4d..c562e101f 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -357,7 +357,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
     #[test]
     fn test_ctr_encrypt_intel() {
         let mut computed: [u8; 32] = [0u8; 32];
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 144837fab..1d2eecc8b 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -115,7 +115,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
     #[test]
     fn test_gf128_intel() {
         let mut computed: [u8; 16] = [0u8; 16];
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 9142fd2e9..8f93d2c55 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -3,7 +3,7 @@ pub mod portable;
 #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
 pub mod neon;
 
-//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+#[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
 pub mod intel_ni;
 
 pub trait AESState: Copy {
diff --git a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
index b58fb3c2f..6d80b94cd 100644
--- a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -1,5 +1,7 @@
 use core::arch::x86_64::*;
 
+// A lot of the code below is shared with NEON. Refactor!
+
 #[derive(Clone,Copy)]
 pub struct FieldElement(pub u128);
 

From 4e069642d11a99901a9bc8a7f8ec9d6a937de8b4 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Mon, 28 Apr 2025 11:01:12 +0100
Subject: [PATCH 11/43] fixed for arm

---
 libcrux-aesgcm/src/aes_ctr.rs                | 17 -----------------
 libcrux-aesgcm/src/lib.rs                    |  2 +-
 libcrux-aesgcm/src/platform/neon/aes_core.rs | 17 ++---------------
 3 files changed, 3 insertions(+), 33 deletions(-)

diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index c562e101f..6064a5158 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -308,23 +308,6 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
-    #[test]
-    fn test_ctr_block_neon() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = aes128_ctr_init::<platform::neon::State>(&KEY, &NONCE);
-        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
-        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
 
     #[test]
     fn test_ctr_encrypt() {
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 27db8b6f9..08f32d53b 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -39,7 +39,7 @@ pub mod neon{
     }
 }
 
-//#[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+#[cfg(all(target_arch = "x86_64"))] // REENABLE target_feature="aes"
 pub mod intel_ni{
     use crate::{aes_gcm::{self, DecryptError}, platform};
 
diff --git a/libcrux-aesgcm/src/platform/neon/aes_core.rs b/libcrux-aesgcm/src/platform/neon/aes_core.rs
index 00b921402..38e29a825 100644
--- a/libcrux-aesgcm/src/platform/neon/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/neon/aes_core.rs
@@ -91,8 +91,8 @@ impl crate::platform::AESState for State {
         aes_enc_last(self, key);
     }
 
-    fn aes_keygen_assist0(&mut self, prev: &Self, rcon: u8) {
-        aes_keygen_assist0(self, prev, rcon);
+    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0(self, prev, RCON as u8);
     }
 
     fn aes_keygen_assist1(&mut self, prev: &Self) {
@@ -103,16 +103,3 @@ impl crate::platform::AESState for State {
         key_expansion_step(self, prev)
     }
 }
-
-
-#[test]
-fn test () {
-    unsafe {
-        let zero = vdupq_n_u32(0);
-        let arr : [u32;4] = [0,1,2,3];
-        let x = vld1q_u32(arr.as_ptr());
-        let y = vdupq_laneq_u32(x, 3);
-        let z = vextq_u32(zero, x, 3);
-        println!("arr:{:?}, x: {:?}, y: {:?}, z: {:?}",arr,x,y,z);
-    }
-}
\ No newline at end of file

From c8da11ebf5e6d62e8914f6a45a92de33bf749ff0 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Mon, 28 Apr 2025 11:15:47 +0100
Subject: [PATCH 12/43] fmt

---
 libcrux-aesgcm/benches/aesgcm.rs              |  96 ++++++++++++----
 libcrux-aesgcm/src/aes_ctr.rs                 |   6 +-
 libcrux-aesgcm/src/aes_gcm.rs                 |  23 ++--
 libcrux-aesgcm/src/aes_generic.rs             |  58 +++++-----
 libcrux-aesgcm/src/gf128_generic.rs           |   2 +-
 libcrux-aesgcm/src/lib.rs                     | 105 ++++++++++++++----
 libcrux-aesgcm/src/platform.rs                |   4 +-
 libcrux-aesgcm/src/platform/intel_ni.rs       |   2 +-
 .../src/platform/intel_ni/aes_core.rs         |  36 +++---
 .../src/platform/intel_ni/gf128_core.rs       |  26 ++---
 libcrux-aesgcm/src/platform/neon/aes_core.rs  |  39 ++++---
 .../src/platform/neon/gf128_core.rs           |  15 ++-
 .../src/platform/portable/aes_core.rs         |   2 +-
 13 files changed, 262 insertions(+), 152 deletions(-)

diff --git a/libcrux-aesgcm/benches/aesgcm.rs b/libcrux-aesgcm/benches/aesgcm.rs
index 924433719..31f7ab0f6 100644
--- a/libcrux-aesgcm/benches/aesgcm.rs
+++ b/libcrux-aesgcm/benches/aesgcm.rs
@@ -1,7 +1,6 @@
 #![allow(non_snake_case)]
 use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
 
-
 pub fn randombytes(n: usize) -> Vec<u8> {
     use rand::rngs::OsRng;
     use rand::RngCore;
@@ -17,13 +16,12 @@ pub fn fmt(x: usize) -> String {
     format!("{} {}", x >> (10 * base), suffix[base])
 }
 
-
 macro_rules! impl_comp {
     ($fun:ident, $portable_fun:expr, $neon_fun:expr, $intel_fun:expr, $rustcrypto_fun:expr) => {
         // Comparing libcrux performance for different payload sizes and other implementations.
         fn $fun(c: &mut Criterion) {
             const PAYLOAD_SIZES: [usize; 3] = [128, 1024, 1024 * 1024 * 10];
-            
+
             let mut group = c.benchmark_group(stringify!($fun).replace("_", " "));
 
             for payload_size in PAYLOAD_SIZES.iter() {
@@ -34,29 +32,49 @@ macro_rules! impl_comp {
                     payload_size,
                     |b, payload_size| {
                         b.iter_batched(
-                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
-                            |(key,nonce,aad,payload)| {
+                            || {
+                                (
+                                    randombytes(16),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
                                 let mut ciphertext = vec![0; *payload_size];
                                 let mut tag = [0u8; 16];
-                                $portable_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                                $portable_fun(
+                                    &key,
+                                    &nonce,
+                                    &aad,
+                                    &payload,
+                                    &mut ciphertext,
+                                    &mut tag,
+                                );
                             },
                             BatchSize::SmallInput,
                         )
                     },
                 );
 
-
-                #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
                 group.bench_with_input(
                     BenchmarkId::new("neon-aes-clmul", fmt(*payload_size)),
                     payload_size,
                     |b, payload_size| {
                         b.iter_batched(
-                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
-                            |(key,nonce,aad,payload)| {
+                            || {
+                                (
+                                    randombytes(16),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
                                 let mut ciphertext = vec![0; *payload_size];
                                 let mut tag = [0u8; 16];
-                                $neon_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                                $neon_fun(&key, &nonce, &aad, &payload, &mut ciphertext, &mut tag);
                             },
                             BatchSize::SmallInput,
                         )
@@ -69,11 +87,18 @@ macro_rules! impl_comp {
                     payload_size,
                     |b, payload_size| {
                         b.iter_batched(
-                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
-                            |(key,nonce,aad,payload)| {
+                            || {
+                                (
+                                    randombytes(16),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
                                 let mut ciphertext = vec![0; *payload_size];
                                 let mut tag = [0u8; 16];
-                                $intel_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                                $intel_fun(&key, &nonce, &aad, &payload, &mut ciphertext, &mut tag);
                             },
                             BatchSize::SmallInput,
                         )
@@ -85,11 +110,25 @@ macro_rules! impl_comp {
                     payload_size,
                     |b, payload_size| {
                         b.iter_batched(
-                            || (randombytes(16), randombytes(12), randombytes(32), randombytes(*payload_size)),
-                            |(key,nonce,aad,payload)| {
+                            || {
+                                (
+                                    randombytes(16),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
                                 let mut ciphertext = vec![0; *payload_size];
                                 let mut tag = [0u8; 16];
-                                $rustcrypto_fun(&key,&nonce,&aad,&payload,&mut ciphertext, &mut tag);
+                                $rustcrypto_fun(
+                                    &key,
+                                    &nonce,
+                                    &aad,
+                                    &payload,
+                                    &mut ciphertext,
+                                    &mut tag,
+                                );
                             },
                             BatchSize::SmallInput,
                         )
@@ -102,21 +141,36 @@ macro_rules! impl_comp {
 
 use aes_gcm::{
     aead::{Aead, AeadCore, KeyInit, OsRng},
-    Aes128Gcm, Nonce, Key // Or `Aes128Gcm`
+    Aes128Gcm,
+    Key, // Or `Aes128Gcm`
+    Nonce,
 };
 
-fn rustcrypto_aes128_gcm_encrypt(key:&[u8], nonce:&[u8], aad:&[u8], plain:&[u8], ciphertext:&mut [u8], tag:&mut [u8]){
+fn rustcrypto_aes128_gcm_encrypt(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    plain: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
     let cipher = Aes128Gcm::new(key.into());
     let ctxt = cipher.encrypt(nonce.into(), plain).unwrap();
     ciphertext.copy_from_slice(&ctxt[0..plain.len()]);
     tag.copy_from_slice(&ctxt[plain.len()..]);
 }
 
-impl_comp!(AES128_GCM, libcrux_aesgcm::portable::aes128_gcm_encrypt, libcrux_aesgcm::neon::aes128_gcm_encrypt, libcrux_aesgcm::intel_ni::aes128_gcm_encrypt, rustcrypto_aes128_gcm_encrypt);
+impl_comp!(
+    AES128_GCM,
+    libcrux_aesgcm::portable::aes128_gcm_encrypt,
+    libcrux_aesgcm::neon::aes128_gcm_encrypt,
+    libcrux_aesgcm::intel_ni::aes128_gcm_encrypt,
+    rustcrypto_aes128_gcm_encrypt
+);
 
 fn benchmarks(c: &mut Criterion) {
     AES128_GCM(c);
 }
 
 criterion_group!(benches, benchmarks);
-criterion_main!(benches);
\ No newline at end of file
+criterion_main!(benches);
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index 6064a5158..f592df426 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -290,7 +290,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     #[test]
     fn test_ctr_block_neon() {
         let mut computed: [u8; 32] = [0u8; 32];
@@ -308,7 +308,6 @@ mod test {
         }
     }
 
-
     #[test]
     fn test_ctr_encrypt() {
         let mut computed: [u8; 32] = [0u8; 32];
@@ -324,7 +323,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     #[test]
     fn test_ctr_encrypt_neon() {
         let mut computed: [u8; 32] = [0u8; 32];
@@ -355,5 +354,4 @@ mod test {
             }
         }
     }
-    
 }
diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/libcrux-aesgcm/src/aes_gcm.rs
index 18294b441..97c6a39f6 100644
--- a/libcrux-aesgcm/src/aes_gcm.rs
+++ b/libcrux-aesgcm/src/aes_gcm.rs
@@ -1,12 +1,9 @@
 use crate::{
     aes_ctr::{
-        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce,
-        aes128_ctr_update, AES128_CTR_Context,
-    },
-    gf128_generic::{
-        gf128_emit, gf128_init, gf128_update,
-        gf128_update_padded, GF128State,
+        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce, aes128_ctr_update,
+        AES128_CTR_Context,
     },
+    gf128_generic::{gf128_emit, gf128_init, gf128_update, gf128_update_padded, GF128State},
     platform::{AESState, GF128FieldElement},
 };
 
@@ -270,11 +267,11 @@ mod test {
             }
         }
     }
-    
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     use crate::platform::neon;
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     #[test]
     fn test_gcm1_neon() {
         let mut computed1 = [0u8; 76];
@@ -293,7 +290,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     #[test]
     fn test_gcm2_neon() {
         let mut computed2 = [0u8; 668];
@@ -312,10 +309,10 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
     use crate::platform::intel_ni;
 
-    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
     #[test]
     fn test_gcm1_intel() {
         let mut computed1 = [0u8; 76];
@@ -334,7 +331,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64", target_feature="aes"))]
+    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
     #[test]
     fn test_gcm2_intel() {
         let mut computed2 = [0u8; 668];
diff --git a/libcrux-aesgcm/src/aes_generic.rs b/libcrux-aesgcm/src/aes_generic.rs
index a3b041dae..9155535c3 100644
--- a/libcrux-aesgcm/src/aes_generic.rs
+++ b/libcrux-aesgcm/src/aes_generic.rs
@@ -13,21 +13,21 @@ pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11
 
     macro_rules! expansion_step128 {
         ($i:expr,$rcon:expr) => {
-            let prev = keyex[$i-1];
+            let prev = keyex[$i - 1];
             keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
             keyex[$i].key_expansion_step(&prev);
-        }
+        };
     }
-    expansion_step128!(1,0x01);
-    expansion_step128!(2,0x02);
-    expansion_step128!(3,0x04);
-    expansion_step128!(4,0x08);
-    expansion_step128!(5,0x10);
-    expansion_step128!(6,0x20);
-    expansion_step128!(7,0x40);
-    expansion_step128!(8,0x80);
-    expansion_step128!(9,0x1b);
-    expansion_step128!(10,0x36);
+    expansion_step128!(1, 0x01);
+    expansion_step128!(2, 0x02);
+    expansion_step128!(3, 0x04);
+    expansion_step128!(4, 0x08);
+    expansion_step128!(5, 0x10);
+    expansion_step128!(6, 0x20);
+    expansion_step128!(7, 0x40);
+    expansion_step128!(8, 0x80);
+    expansion_step128!(9, 0x1b);
+    expansion_step128!(10, 0x36);
     keyex
 }
 
@@ -39,28 +39,28 @@ pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15
 
     macro_rules! expansion_step256 {
         ($i:expr,$rcon:expr) => {
-            let prev0 = keyex[$i-2];
-            let prev1 = keyex[$i-1];
+            let prev0 = keyex[$i - 2];
+            let prev1 = keyex[$i - 1];
             keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
             keyex[$i].key_expansion_step(&prev0);
             let next0 = keyex[$i];
-            keyex[$i+1].aes_keygen_assist1(&next0);
-            keyex[$i+1].key_expansion_step(&prev1);
-        }
+            keyex[$i + 1].aes_keygen_assist1(&next0);
+            keyex[$i + 1].key_expansion_step(&prev1);
+        };
     }
 
-    expansion_step256!(2,0x01);
-    expansion_step256!(3,0x01);
-    expansion_step256!(4,0x02);
-    expansion_step256!(5,0x02);
-    expansion_step256!(6,0x04);
-    expansion_step256!(7,0x04);
-    expansion_step256!(8,0x08);
-    expansion_step256!(9,0x08);
-    expansion_step256!(10,0x10);
-    expansion_step256!(11,0x10);
-    expansion_step256!(12,0x20);
-    expansion_step256!(13,0x20);
+    expansion_step256!(2, 0x01);
+    expansion_step256!(3, 0x01);
+    expansion_step256!(4, 0x02);
+    expansion_step256!(5, 0x02);
+    expansion_step256!(6, 0x04);
+    expansion_step256!(7, 0x04);
+    expansion_step256!(8, 0x08);
+    expansion_step256!(9, 0x08);
+    expansion_step256!(10, 0x10);
+    expansion_step256!(11, 0x10);
+    expansion_step256!(12, 0x20);
+    expansion_step256!(13, 0x20);
 
     let prev0 = keyex[12];
     let prev1 = keyex[13];
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 1d2eecc8b..68b4df82e 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -99,7 +99,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
     #[test]
     fn test_gf128_neon() {
         let mut computed: [u8; 16] = [0u8; 16];
diff --git a/libcrux-aesgcm/src/lib.rs b/libcrux-aesgcm/src/lib.rs
index 08f32d53b..f9a7cc646 100644
--- a/libcrux-aesgcm/src/lib.rs
+++ b/libcrux-aesgcm/src/lib.rs
@@ -6,52 +6,117 @@ pub mod platform;
 
 pub use aes_gcm::DecryptError;
 
-pub mod portable{
-    use crate::{aes_gcm::{self, DecryptError}, platform};
+pub mod portable {
+    use crate::{
+        aes_gcm::{self, DecryptError},
+        platform,
+    };
 
-    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
-        let mut st = aes_gcm::aes128_gcm_init::<platform::portable::State,platform::portable::FieldElement>(key);
+    pub fn aes128_gcm_encrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        plaintext: &[u8],
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        let mut st = aes_gcm::aes128_gcm_init::<
+            platform::portable::State,
+            platform::portable::FieldElement,
+        >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
     }
 
-    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
-        let mut st = aes_gcm::aes128_gcm_init::<platform::portable::State,platform::portable::FieldElement>(key);
+    pub fn aes128_gcm_decrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        let mut st = aes_gcm::aes128_gcm_init::<
+            platform::portable::State,
+            platform::portable::FieldElement,
+        >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
     }
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature="aes"))]
-pub mod neon{
-    use crate::{aes_gcm::{self, DecryptError}, platform};
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+pub mod neon {
+    use crate::{
+        aes_gcm::{self, DecryptError},
+        platform,
+    };
 
-    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
-        let mut st = aes_gcm::aes128_gcm_init::<platform::neon::State,platform::neon::FieldElement>(key);
+    pub fn aes128_gcm_encrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        plaintext: &[u8],
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        let mut st =
+            aes_gcm::aes128_gcm_init::<platform::neon::State, platform::neon::FieldElement>(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
     }
 
-    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
-        let mut st = aes_gcm::aes128_gcm_init::<platform::neon::State,platform::neon::FieldElement>(key);
+    pub fn aes128_gcm_decrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        let mut st =
+            aes_gcm::aes128_gcm_init::<platform::neon::State, platform::neon::FieldElement>(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
     }
 }
 
 #[cfg(all(target_arch = "x86_64"))] // REENABLE target_feature="aes"
-pub mod intel_ni{
-    use crate::{aes_gcm::{self, DecryptError}, platform};
+pub mod intel_ni {
+    use crate::{
+        aes_gcm::{self, DecryptError},
+        platform,
+    };
 
-    pub fn aes128_gcm_encrypt(key: &[u8], nonce: &[u8], aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]){
-        let mut st = aes_gcm::aes128_gcm_init::<platform::intel_ni::State,platform::intel_ni::FieldElement>(key);
+    pub fn aes128_gcm_encrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        plaintext: &[u8],
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        let mut st = aes_gcm::aes128_gcm_init::<
+            platform::intel_ni::State,
+            platform::intel_ni::FieldElement,
+        >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
     }
 
-    pub fn aes128_gcm_decrypt(key: &[u8], nonce: &[u8], aad: &[u8], ciphertext: &[u8], tag: &[u8], plaintext: &mut [u8]) -> Result<(), DecryptError>{
-        let mut st = aes_gcm::aes128_gcm_init::<platform::intel_ni::State,platform::intel_ni::FieldElement>(key);
+    pub fn aes128_gcm_decrypt(
+        key: &[u8],
+        nonce: &[u8],
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        let mut st = aes_gcm::aes128_gcm_init::<
+            platform::intel_ni::State,
+            platform::intel_ni::FieldElement,
+        >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
         aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
     }
-}
\ No newline at end of file
+}
diff --git a/libcrux-aesgcm/src/platform.rs b/libcrux-aesgcm/src/platform.rs
index 8f93d2c55..b46522746 100644
--- a/libcrux-aesgcm/src/platform.rs
+++ b/libcrux-aesgcm/src/platform.rs
@@ -1,6 +1,6 @@
 pub mod portable;
 
-#[cfg(all(target_arch = "aarch64", target_feature="aes"))]
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
 pub mod neon;
 
 #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
@@ -15,7 +15,7 @@ pub trait AESState: Copy {
     fn xor_key(&mut self, key: &Self);
     fn aes_enc(&mut self, key: &Self);
     fn aes_enc_last(&mut self, key: &Self);
-    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self);
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self);
     fn aes_keygen_assist1(&mut self, prev: &Self);
     fn key_expansion_step(&mut self, prev: &Self);
 }
diff --git a/libcrux-aesgcm/src/platform/intel_ni.rs b/libcrux-aesgcm/src/platform/intel_ni.rs
index 9d14728fe..7fe9d7462 100644
--- a/libcrux-aesgcm/src/platform/intel_ni.rs
+++ b/libcrux-aesgcm/src/platform/intel_ni.rs
@@ -1,4 +1,4 @@
 mod aes_core;
 mod gf128_core;
 pub(crate) use aes_core::State;
-pub(crate) use gf128_core::FieldElement;
\ No newline at end of file
+pub(crate) use gf128_core::FieldElement;
diff --git a/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs b/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
index 6c20adc58..327794bee 100644
--- a/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
@@ -18,31 +18,29 @@ fn aes_enc_last(st: &mut State, key: &State) {
     unsafe { *st = _mm_aesenclast_si128(*st, *key) }
 }
 
-fn aes_keygen_assist<const RCON:i32>(next: &mut State, prev: &State) {
-    unsafe { *next = _mm_aeskeygenassist_si128::<RCON>(*prev)}
+fn aes_keygen_assist<const RCON: i32>(next: &mut State, prev: &State) {
+    unsafe { *next = _mm_aeskeygenassist_si128::<RCON>(*prev) }
 }
 
-fn aes_keygen_assist0<const RCON:i32>(next: &mut State, prev: &State) {
+fn aes_keygen_assist0<const RCON: i32>(next: &mut State, prev: &State) {
     aes_keygen_assist::<RCON>(next, prev);
     unsafe { *next = _mm_shuffle_epi32(*next, 0xff) }
 }
 
-
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist::<0>(next, prev);
     unsafe { *next = _mm_shuffle_epi32(*next, 0xaa) }
 }
 
 fn key_expansion_step(next: &mut State, prev: &State) {
-    unsafe{
-        let p0 = _mm_xor_si128(*prev, _mm_slli_si128(*prev,4));
-        let p1 = _mm_xor_si128(p0, _mm_slli_si128(p0,4));
-        let p2 = _mm_xor_si128(p1, _mm_slli_si128(p1,4));
-        *next = _mm_xor_si128(*next,p2);
+    unsafe {
+        let p0 = _mm_xor_si128(*prev, _mm_slli_si128(*prev, 4));
+        let p1 = _mm_xor_si128(p0, _mm_slli_si128(p0, 4));
+        let p2 = _mm_xor_si128(p1, _mm_slli_si128(p1, 4));
+        *next = _mm_xor_si128(*next, p2);
     }
 }
 
-
 impl crate::platform::AESState for State {
     fn new() -> Self {
         new_state()
@@ -78,7 +76,7 @@ impl crate::platform::AESState for State {
         aes_enc_last(self, key);
     }
 
-    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0::<RCON>(self, prev);
     }
 
@@ -94,15 +92,15 @@ impl crate::platform::AESState for State {
 #[test]
 fn test() {
     unsafe {
-        let x = _mm_set_epi32(3,2,1,0);
-        let y = _mm_shuffle_epi32(x,0xaa);
-        let w = _mm_slli_si128(x,4);
-        let mut z:[i32; 4] = [0;4];
+        let x = _mm_set_epi32(3, 2, 1, 0);
+        let y = _mm_shuffle_epi32(x, 0xaa);
+        let w = _mm_slli_si128(x, 4);
+        let mut z: [i32; 4] = [0; 4];
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, x);
-        println!("{:?}",z);
+        println!("{:?}", z);
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, w);
-        println!("shift right 4 {:?}",z);
+        println!("shift right 4 {:?}", z);
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, y);
-        println!("shuffle aa {:?}",z);
+        println!("shuffle aa {:?}", z);
     }
-}
\ No newline at end of file
+}
diff --git a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
index 6d80b94cd..139aeb438 100644
--- a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -2,7 +2,7 @@ use core::arch::x86_64::*;
 
 // A lot of the code below is shared with NEON. Refactor!
 
-#[derive(Clone,Copy)]
+#[derive(Clone, Copy)]
 pub struct FieldElement(pub u128);
 
 fn zero() -> FieldElement {
@@ -24,8 +24,8 @@ fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
 }
 
 fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
-    let lhs : __m128i = unsafe { std::mem::transmute((*elem).0) };
-    let rhs : __m128i = unsafe { std::mem::transmute((*other).0) };
+    let lhs: __m128i = unsafe { std::mem::transmute((*elem).0) };
+    let rhs: __m128i = unsafe { std::mem::transmute((*other).0) };
     let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
     let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
     let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
@@ -36,9 +36,9 @@ fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldEl
     let low = unsafe { _mm_xor_si128(low, m0) };
     let high = unsafe { _mm_xor_si128(high, m1) };
 
-    let low128 : u128 = unsafe { std::mem::transmute(low) };
-    let high128 : u128 = unsafe { std::mem::transmute(high) };   
-   (FieldElement(low128), FieldElement(high128))
+    let low128: u128 = unsafe { std::mem::transmute(low) };
+    let high128: u128 = unsafe { std::mem::transmute(high) };
+    (FieldElement(low128), FieldElement(high128))
 }
 
 fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
@@ -51,8 +51,8 @@ fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
 }
 
 fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
-    let (high,low) = mul_wide(x,y);
-    reduce(&high,&low)
+    let (high, low) = mul_wide(x, y);
+    reduce(&high, &low)
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
@@ -80,11 +80,11 @@ impl crate::platform::GF128FieldElement for FieldElement {
 #[test]
 fn test_transmute() {
     let x = 1u128 << 64 ^ 2u128;
-    let xv : __m128i = unsafe { std::mem::transmute(x)};
-    let xv : __m128i = unsafe { _mm_slli_si128(xv,8)};
-    let x : u128 = unsafe { std::mem::transmute(xv)};
+    let xv: __m128i = unsafe { std::mem::transmute(x) };
+    let xv: __m128i = unsafe { _mm_slli_si128(xv, 8) };
+    let x: u128 = unsafe { std::mem::transmute(xv) };
     println!("trans {:x}", x);
     let mut u64s = [0u64; 2];
-    unsafe { _mm_storeu_si128(u64s.as_mut_ptr() as *mut __m128i, xv)};
+    unsafe { _mm_storeu_si128(u64s.as_mut_ptr() as *mut __m128i, xv) };
     println!("store {:?}", u64s)
-}
\ No newline at end of file
+}
diff --git a/libcrux-aesgcm/src/platform/neon/aes_core.rs b/libcrux-aesgcm/src/platform/neon/aes_core.rs
index 38e29a825..166810eee 100644
--- a/libcrux-aesgcm/src/platform/neon/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/neon/aes_core.rs
@@ -3,19 +3,19 @@ use core::arch::aarch64::*;
 pub(crate) type State = uint8x16_t;
 
 fn new_state() -> State {
-    unsafe {vdupq_n_u8(0)}
+    unsafe { vdupq_n_u8(0) }
 }
 
 fn xor_key1_state(st: &mut State, k: &State) {
-    unsafe {*st = veorq_u8(*st, *k)}
+    unsafe { *st = veorq_u8(*st, *k) }
 }
 
 fn aes_enc(st: &mut State, key: &State) {
-    unsafe {*st = veorq_u8(vaesmcq_u8(vaeseq_u8(*st, vdupq_n_u8(0))),*key)}
+    unsafe { *st = veorq_u8(vaesmcq_u8(vaeseq_u8(*st, vdupq_n_u8(0))), *key) }
 }
 
 fn aes_enc_last(st: &mut State, key: &State) {
-    unsafe {*st = veorq_u8(vaeseq_u8(*st, vdupq_n_u8(0)),*key)}
+    unsafe { *st = veorq_u8(vaeseq_u8(*st, vdupq_n_u8(0)), *key) }
 }
 
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
@@ -23,36 +23,35 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
         let st = vaeseq_u8(*prev, vdupq_n_u8(0));
         let mut tmp = [0u8; 16];
         vst1q_u8(tmp.as_mut_ptr(), st);
-        let tmp_new = [tmp[4], tmp[1], tmp[14], tmp[11],
-                        tmp[1], tmp[14], tmp[11], tmp[4],
-                        tmp[12], tmp[9], tmp[6], tmp[3],
-                        tmp[9], tmp[6], tmp[3], tmp[12]];
+        let tmp_new = [
+            tmp[4], tmp[1], tmp[14], tmp[11], tmp[1], tmp[14], tmp[11], tmp[4], tmp[12], tmp[9],
+            tmp[6], tmp[3], tmp[9], tmp[6], tmp[3], tmp[12],
+        ];
         let st_new = vld1q_u8(tmp_new.as_ptr());
         let rcon_array = [0, rcon as u32, 0, rcon as u32];
         let rcon_vec = vreinterpretq_u8_u32(vld1q_u32(rcon_array.as_ptr()));
-        *next = veorq_u8(st_new , rcon_vec);
+        *next = veorq_u8(st_new, rcon_vec);
     }
 }
 
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
-    unsafe {*next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 3))}
+    unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 3)) }
 }
 
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
-    unsafe {*next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 2))}
+    unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 2)) }
 }
 
-
 fn key_expansion_step(next: &mut State, prev: &State) {
-    unsafe{
+    unsafe {
         let zero = vdupq_n_u32(0);
         let prev0 = vreinterpretq_u32_u8(*prev);
         let prev1 = veorq_u32(prev0, vextq_u32(zero, prev0, 3));
         let prev2 = veorq_u32(prev1, vextq_u32(zero, prev1, 3));
         let prev3 = veorq_u32(prev2, vextq_u32(zero, prev2, 3));
-        *next = veorq_u8(*next,vreinterpretq_u8_u32(prev3));
+        *next = veorq_u8(*next, vreinterpretq_u8_u32(prev3));
     }
 }
 
@@ -63,19 +62,19 @@ impl crate::platform::AESState for State {
 
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
-        unsafe {*self = vld1q_u8(b.as_ptr())};
+        unsafe { *self = vld1q_u8(b.as_ptr()) };
     }
 
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
-        unsafe {vst1q_u8(out.as_mut_ptr(), *self)}
+        unsafe { vst1q_u8(out.as_mut_ptr(), *self) }
     }
 
     fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        let inp_vec = unsafe {vld1q_u8(inp.as_ptr()) };
-        let out_vec = unsafe {veorq_u8(inp_vec, *self)};
-        unsafe {vst1q_u8(out.as_mut_ptr(),out_vec)}
+        let inp_vec = unsafe { vld1q_u8(inp.as_ptr()) };
+        let out_vec = unsafe { veorq_u8(inp_vec, *self) };
+        unsafe { vst1q_u8(out.as_mut_ptr(), out_vec) }
     }
 
     fn xor_key(&mut self, key: &Self) {
@@ -91,7 +90,7 @@ impl crate::platform::AESState for State {
         aes_enc_last(self, key);
     }
 
-    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0(self, prev, RCON as u8);
     }
 
diff --git a/libcrux-aesgcm/src/platform/neon/gf128_core.rs b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
index 2976fb355..862b97139 100644
--- a/libcrux-aesgcm/src/platform/neon/gf128_core.rs
+++ b/libcrux-aesgcm/src/platform/neon/gf128_core.rs
@@ -1,6 +1,6 @@
 use core::arch::aarch64::*;
 
-#[derive(Clone,Copy)]
+#[derive(Clone, Copy)]
 pub struct FieldElement(pub u128);
 
 fn zero() -> FieldElement {
@@ -26,10 +26,10 @@ fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldEl
     let h0 = ((*elem).0 >> 64) as u64;
     let l1 = (*other).0 as u64;
     let h1 = ((*other).0 >> 64) as u64;
-    let low : u128 = unsafe {vmull_p64(l0, l1)};
-    let m1 : u128 = unsafe {vmull_p64(l0, h1)};
-    let m2 : u128 = unsafe {vmull_p64(l1, h0)};
-    let high : u128 = unsafe {vmull_p64(h0, h1)};
+    let low: u128 = unsafe { vmull_p64(l0, l1) };
+    let m1: u128 = unsafe { vmull_p64(l0, h1) };
+    let m2: u128 = unsafe { vmull_p64(l1, h0) };
+    let high: u128 = unsafe { vmull_p64(h0, h1) };
     let mid = m1 ^ m2;
     let m0 = mid << 64;
     let m1 = mid >> 64;
@@ -48,8 +48,8 @@ fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
 }
 
 fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
-    let (high,low) = mul_wide(x,y);
-    reduce(&high,&low)
+    let (high, low) = mul_wide(x, y);
+    reduce(&high, &low)
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
@@ -73,4 +73,3 @@ impl crate::platform::GF128FieldElement for FieldElement {
         *self = mul(self, other)
     }
 }
-
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
index ca87e09f4..ed92da2b8 100644
--- a/libcrux-aesgcm/src/platform/portable/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -588,7 +588,7 @@ impl crate::platform::AESState for State {
         aes_enc_last(self, key);
     }
 
-    fn aes_keygen_assist0<const RCON:i32>(&mut self, prev: &Self) {
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0(self, prev, RCON as u8);
     }
 

From f7d8fe2ddc165f0556b44edbd6c8e3ae642552fa Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 4 Jun 2025 09:50:59 +0200
Subject: [PATCH 13/43] fixups

---
 libcrux-aesgcm/Cargo.toml                     |   2 +-
 libcrux-aesgcm/src/aes_ctr.rs                 |   1 +
 libcrux-aesgcm/src/gf128_generic.rs           |   1 +
 .../src/platform/portable/aes_core.rs         | 333 +++++++++---------
 4 files changed, 179 insertions(+), 158 deletions(-)

diff --git a/libcrux-aesgcm/Cargo.toml b/libcrux-aesgcm/Cargo.toml
index b2700c901..fe71e9ba3 100644
--- a/libcrux-aesgcm/Cargo.toml
+++ b/libcrux-aesgcm/Cargo.toml
@@ -16,7 +16,7 @@ bench = false # so libtest doesn't eat the arguments to criterion
 [dependencies]
 libcrux-platform = { version = "0.0.2", path = "../sys/platform" }
 libcrux-intrinsics = { version = "0.0.2", path = "../libcrux-intrinsics" }
-hax-lib = { version = "0.2", git = "https://github.com/cryspen/hax/" }
+hax-lib.workspace = true
 
 [features]
 simd128 = []
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/libcrux-aesgcm/src/aes_ctr.rs
index f592df426..8f5f4a49e 100644
--- a/libcrux-aesgcm/src/aes_ctr.rs
+++ b/libcrux-aesgcm/src/aes_ctr.rs
@@ -28,6 +28,7 @@ fn aes_ctr_key_block<T: AESState, const NUM_KEYS: usize>(
     st.store_block(out);
 }
 
+#[inline(always)]
 fn aes_ctr_xor_block<T: AESState, const NUM_KEYS: usize>(
     ctx: &AES_CTR_Context<T, NUM_KEYS>,
     ctr: u32,
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/libcrux-aesgcm/src/gf128_generic.rs
index 68b4df82e..307f0ea34 100644
--- a/libcrux-aesgcm/src/gf128_generic.rs
+++ b/libcrux-aesgcm/src/gf128_generic.rs
@@ -13,6 +13,7 @@ pub fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
     }
 }
 
+#[inline(always)]
 pub fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block: &[u8]) {
     debug_assert!(block.len() == 16);
     let block_elem = T::load_elem(block);
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/libcrux-aesgcm/src/platform/portable/aes_core.rs
index ed92da2b8..dd5ee6ad7 100644
--- a/libcrux-aesgcm/src/platform/portable/aes_core.rs
+++ b/libcrux-aesgcm/src/platform/portable/aes_core.rs
@@ -4,6 +4,7 @@ fn new_state() -> State {
     [0u16; 8]
 }
 
+#[inline(always)]
 fn interleave_u8_1(i0: u8, i1: u8) -> u16 {
     let mut x = i0 as u16;
     x = (x | (x << 4)) & 0x0F0F;
@@ -16,6 +17,7 @@ fn interleave_u8_1(i0: u8, i1: u8) -> u16 {
     x | (y << 1)
 }
 
+#[inline(always)]
 fn deinterleave_u8_1(i0: u16) -> (u8, u8) {
     let mut x = i0 & 0x5555;
     x = (x | (x >> 1)) & 0x3333;
@@ -28,18 +30,21 @@ fn deinterleave_u8_1(i0: u16) -> (u8, u8) {
     (x as u8, y as u8)
 }
 
+#[inline(always)]
 fn interleave_u16_2(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333);
     let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc);
     (x, y)
 }
 
+#[inline(always)]
 fn interleave_u16_4(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F);
     let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0);
     (x, y)
 }
 
+#[inline(always)]
 fn interleave_u16_8(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF);
     let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00);
@@ -118,6 +123,7 @@ fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
     output[15] = o15;
 }
 
+#[inline(always)]
 fn xnor(a: u16, b: u16) -> u16 {
     !(a ^ b)
 }
@@ -269,155 +275,6 @@ fn sub_bytes_state(st: &mut State) {
     st[7] = S0;
 }
 
-#[allow(non_snake_case)]
-fn sub_bytes_inv_state(st: &mut State) {
-    let U0 = st[7];
-    let U1 = st[6];
-    let U2 = st[5];
-    let U3 = st[4];
-    let U4 = st[3];
-    let U5 = st[2];
-    let U6 = st[1];
-    let U7 = st[0];
-
-    let T23 = U0 ^ U3;
-    let T22 = xnor(U1, U3);
-    let T2 = xnor(U0, U1);
-    let T1 = U3 ^ U4;
-    let T24 = xnor(U4, U7);
-    let R5 = U6 ^ U7;
-    let T8 = xnor(U1, T23);
-    let T19 = T22 ^ R5;
-    let T9 = xnor(U7, T1);
-    let T10 = T2 ^ T24;
-    let T13 = T2 ^ R5;
-    let T3 = T1 ^ R5;
-    let T25 = xnor(U2, T1);
-    let R13 = U1 ^ U6;
-    let T17 = xnor(U2, T19);
-    let T20 = T24 ^ R13;
-    let T4 = U4 ^ T8;
-    let R17 = xnor(U2, U5);
-    let R18 = xnor(U5, U6);
-    let R19 = xnor(U2, U4);
-    let Y5 = U0 ^ R17;
-    let T6 = T22 ^ R17;
-    let T16 = R13 ^ R19;
-    let T27 = T1 ^ R18;
-    let T15 = T10 ^ T27;
-    let T14 = T10 ^ R18;
-    let T26 = T3 ^ T16;
-    let M1 = T13 & T6;
-    let M2 = T23 & T8;
-    let M3 = T14 ^ M1;
-    let M4 = T19 & Y5;
-    let M5 = M4 ^ M1;
-    let M6 = T3 & T16;
-    let M7 = T22 & T9;
-    let M8 = T26 ^ M6;
-    let M9 = T20 & T17;
-    let M10 = M9 ^ M6;
-    let M11 = T1 & T15;
-    let M12 = T4 & T27;
-    let M13 = M12 ^ M11;
-    let M14 = T2 & T10;
-    let M15 = M14 ^ M11;
-    let M16 = M3 ^ M2;
-    let M17 = M5 ^ T24;
-    let M18 = M8 ^ M7;
-    let M19 = M10 ^ M15;
-    let M20 = M16 ^ M13;
-    let M21 = M17 ^ M15;
-    let M22 = M18 ^ M13;
-    let M23 = M19 ^ T25;
-    let M24 = M22 ^ M23;
-    let M25 = M22 & M20;
-    let M26 = M21 ^ M25;
-    let M27 = M20 ^ M21;
-    let M28 = M23 ^ M25;
-    let M29 = M28 & M27;
-    let M30 = M26 & M24;
-    let M31 = M20 & M23;
-    let M32 = M27 & M31;
-    let M33 = M27 ^ M25;
-    let M34 = M21 & M22;
-    let M35 = M24 & M34;
-    let M36 = M24 ^ M25;
-    let M37 = M21 ^ M29;
-    let M38 = M32 ^ M33;
-    let M39 = M23 ^ M30;
-    let M40 = M35 ^ M36;
-    let M41 = M38 ^ M40;
-    let M42 = M37 ^ M39;
-    let M43 = M37 ^ M38;
-    let M44 = M39 ^ M40;
-    let M45 = M42 ^ M41;
-    let M46 = M44 & T6;
-    let M47 = M40 & T8;
-    let M48 = M39 & Y5;
-    let M49 = M43 & T16;
-    let M50 = M38 & T9;
-    let M51 = M37 & T17;
-    let M52 = M42 & T15;
-    let M53 = M45 & T27;
-    let M54 = M41 & T10;
-    let M55 = M44 & T13;
-    let M56 = M40 & T23;
-    let M57 = M39 & T19;
-    let M58 = M43 & T3;
-    let M59 = M38 & T22;
-    let M60 = M37 & T20;
-    let M61 = M42 & T1;
-    let M62 = M45 & T4;
-    let M63 = M41 & T2;
-    let P0 = M52 ^ M61;
-    let P1 = M58 ^ M59;
-    let P2 = M54 ^ M62;
-    let P3 = M47 ^ M50;
-    let P4 = M48 ^ M56;
-    let P5 = M46 ^ M51;
-    let P6 = M49 ^ M60;
-    let P7 = P0 ^ P1;
-    let P8 = M50 ^ M53;
-    let P9 = M55 ^ M63;
-    let P10 = M57 ^ P4;
-    let P11 = P0 ^ P3;
-    let P12 = M46 ^ M48;
-    let P13 = M49 ^ M51;
-    let P14 = M49 ^ M62;
-    let P15 = M54 ^ M59;
-    let P16 = M57 ^ M61;
-    let P17 = M58 ^ P2;
-    let P18 = M63 ^ P5;
-    let P19 = P2 ^ P3;
-    let P20 = P4 ^ P6;
-    let P22 = P2 ^ P7;
-    let P23 = P7 ^ P8;
-    let P24 = P5 ^ P7;
-    let P25 = P6 ^ P10;
-    let P26 = P9 ^ P11;
-    let P27 = P10 ^ P18;
-    let P28 = P11 ^ P25;
-    let P29 = P15 ^ P20;
-    let W0 = P13 ^ P22;
-    let W1 = P26 ^ P29;
-    let W2 = P17 ^ P28;
-    let W3 = P12 ^ P22;
-    let W4 = P23 ^ P27;
-    let W5 = P19 ^ P24;
-    let W6 = P14 ^ P23;
-    let W7 = P9 ^ P16;
-
-    st[0] = W7;
-    st[1] = W6;
-    st[2] = W5;
-    st[3] = W4;
-    st[4] = W3;
-    st[5] = W2;
-    st[6] = W1;
-    st[7] = W0;
-}
-
 fn shift_row_u16(input: u16) -> u16 {
     (input & 0x1111)
         | ((input & 0x2220) >> 4)
@@ -476,6 +333,7 @@ fn aes_enc_last(st: &mut State, key: &State) {
     xor_key1_state(st, key)
 }
 
+#[inline(always)]
 fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 {
     let u3 = u & 0xf000;
     let n = u3 >> 12;
@@ -501,12 +359,15 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
 
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
+
+    #[inline(always)]
     fn aux(mut n: u16) -> u16 {
         n &= 0xf000;
         n ^= n >> 4;
         n ^= n >> 8;
         n
     }
+
     next[0] = aux(next[0]);
     next[1] = aux(next[1]);
     next[2] = aux(next[2]);
@@ -519,12 +380,15 @@ fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
 
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
+
+    #[inline(always)]
     fn aux(mut n: u16) -> u16 {
         n &= 0x0f00;
         n ^= n << 4;
         n ^= n >> 8;
         n
     }
+
     next[0] = aux(next[0]);
     next[1] = aux(next[1]);
     next[2] = aux(next[2]);
@@ -535,6 +399,7 @@ fn aes_keygen_assist1(next: &mut State, prev: &State) {
     next[7] = aux(next[7]);
 }
 
+#[inline(always)]
 fn key_expand1(p: u16, n: u16) -> u16 {
     let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8) ^ ((p & 0x000f) << 12);
     n ^ p
@@ -566,6 +431,7 @@ impl crate::platform::AESState for State {
         transpose_u16x8(self, out);
     }
 
+    #[inline(always)]
     fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
         let mut block = [0u8; 16];
@@ -603,6 +469,157 @@ impl crate::platform::AESState for State {
 
 #[cfg(test)]
 mod test {
+    use super::*;
+
+    #[allow(non_snake_case)]
+    fn sub_bytes_inv_state(st: &mut State) {
+        let U0 = st[7];
+        let U1 = st[6];
+        let U2 = st[5];
+        let U3 = st[4];
+        let U4 = st[3];
+        let U5 = st[2];
+        let U6 = st[1];
+        let U7 = st[0];
+
+        let T23 = U0 ^ U3;
+        let T22 = xnor(U1, U3);
+        let T2 = xnor(U0, U1);
+        let T1 = U3 ^ U4;
+        let T24 = xnor(U4, U7);
+        let R5 = U6 ^ U7;
+        let T8 = xnor(U1, T23);
+        let T19 = T22 ^ R5;
+        let T9 = xnor(U7, T1);
+        let T10 = T2 ^ T24;
+        let T13 = T2 ^ R5;
+        let T3 = T1 ^ R5;
+        let T25 = xnor(U2, T1);
+        let R13 = U1 ^ U6;
+        let T17 = xnor(U2, T19);
+        let T20 = T24 ^ R13;
+        let T4 = U4 ^ T8;
+        let R17 = xnor(U2, U5);
+        let R18 = xnor(U5, U6);
+        let R19 = xnor(U2, U4);
+        let Y5 = U0 ^ R17;
+        let T6 = T22 ^ R17;
+        let T16 = R13 ^ R19;
+        let T27 = T1 ^ R18;
+        let T15 = T10 ^ T27;
+        let T14 = T10 ^ R18;
+        let T26 = T3 ^ T16;
+        let M1 = T13 & T6;
+        let M2 = T23 & T8;
+        let M3 = T14 ^ M1;
+        let M4 = T19 & Y5;
+        let M5 = M4 ^ M1;
+        let M6 = T3 & T16;
+        let M7 = T22 & T9;
+        let M8 = T26 ^ M6;
+        let M9 = T20 & T17;
+        let M10 = M9 ^ M6;
+        let M11 = T1 & T15;
+        let M12 = T4 & T27;
+        let M13 = M12 ^ M11;
+        let M14 = T2 & T10;
+        let M15 = M14 ^ M11;
+        let M16 = M3 ^ M2;
+        let M17 = M5 ^ T24;
+        let M18 = M8 ^ M7;
+        let M19 = M10 ^ M15;
+        let M20 = M16 ^ M13;
+        let M21 = M17 ^ M15;
+        let M22 = M18 ^ M13;
+        let M23 = M19 ^ T25;
+        let M24 = M22 ^ M23;
+        let M25 = M22 & M20;
+        let M26 = M21 ^ M25;
+        let M27 = M20 ^ M21;
+        let M28 = M23 ^ M25;
+        let M29 = M28 & M27;
+        let M30 = M26 & M24;
+        let M31 = M20 & M23;
+        let M32 = M27 & M31;
+        let M33 = M27 ^ M25;
+        let M34 = M21 & M22;
+        let M35 = M24 & M34;
+        let M36 = M24 ^ M25;
+        let M37 = M21 ^ M29;
+        let M38 = M32 ^ M33;
+        let M39 = M23 ^ M30;
+        let M40 = M35 ^ M36;
+        let M41 = M38 ^ M40;
+        let M42 = M37 ^ M39;
+        let M43 = M37 ^ M38;
+        let M44 = M39 ^ M40;
+        let M45 = M42 ^ M41;
+        let M46 = M44 & T6;
+        let M47 = M40 & T8;
+        let M48 = M39 & Y5;
+        let M49 = M43 & T16;
+        let M50 = M38 & T9;
+        let M51 = M37 & T17;
+        let M52 = M42 & T15;
+        let M53 = M45 & T27;
+        let M54 = M41 & T10;
+        let M55 = M44 & T13;
+        let M56 = M40 & T23;
+        let M57 = M39 & T19;
+        let M58 = M43 & T3;
+        let M59 = M38 & T22;
+        let M60 = M37 & T20;
+        let M61 = M42 & T1;
+        let M62 = M45 & T4;
+        let M63 = M41 & T2;
+        let P0 = M52 ^ M61;
+        let P1 = M58 ^ M59;
+        let P2 = M54 ^ M62;
+        let P3 = M47 ^ M50;
+        let P4 = M48 ^ M56;
+        let P5 = M46 ^ M51;
+        let P6 = M49 ^ M60;
+        let P7 = P0 ^ P1;
+        let P8 = M50 ^ M53;
+        let P9 = M55 ^ M63;
+        let P10 = M57 ^ P4;
+        let P11 = P0 ^ P3;
+        let P12 = M46 ^ M48;
+        let P13 = M49 ^ M51;
+        let P14 = M49 ^ M62;
+        let P15 = M54 ^ M59;
+        let P16 = M57 ^ M61;
+        let P17 = M58 ^ P2;
+        let P18 = M63 ^ P5;
+        let P19 = P2 ^ P3;
+        let P20 = P4 ^ P6;
+        let P22 = P2 ^ P7;
+        let P23 = P7 ^ P8;
+        let P24 = P5 ^ P7;
+        let P25 = P6 ^ P10;
+        let P26 = P9 ^ P11;
+        let P27 = P10 ^ P18;
+        let P28 = P11 ^ P25;
+        let P29 = P15 ^ P20;
+        let W0 = P13 ^ P22;
+        let W1 = P26 ^ P29;
+        let W2 = P17 ^ P28;
+        let W3 = P12 ^ P22;
+        let W4 = P23 ^ P27;
+        let W5 = P19 ^ P24;
+        let W6 = P14 ^ P23;
+        let W7 = P9 ^ P16;
+
+        st[0] = W7;
+        st[1] = W6;
+        st[2] = W5;
+        st[3] = W4;
+        st[4] = W3;
+        st[5] = W2;
+        st[6] = W1;
+        st[7] = W0;
+    }
+
     fn sbox_fwd(s: u8) -> u8 {
         match s {
             0 => 0x63,
@@ -1127,6 +1144,8 @@ mod test {
 
     use rand_core::{OsRng, RngCore};
 
+    use crate::platform::portable::aes_core::transpose_u8x16;
+
     fn get_bit_u8(x: &[u8], i: usize, j: usize) -> u8 {
         (x[i] >> j) & 0x1
     }
@@ -1140,7 +1159,7 @@ mod test {
         let mut x = [0u8; 16];
         OsRng.fill_bytes(&mut x);
         let mut y = [0u16; 8];
-        super::transpose_u8x16(&x, &mut y);
+        transpose_u8x16(&x, &mut y);
         for i in 0..16 {
             for j in 0..8 {
                 if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
@@ -1153,7 +1172,7 @@ mod test {
             }
         }
         let mut z = [0u8; 16];
-        super::transpose_u16x8(&y, &mut z);
+        transpose_u16x8(&y, &mut z);
         for i in 0..16 {
             for j in 0..8 {
                 if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) {
@@ -1175,9 +1194,9 @@ mod test {
         for i in 0..=255 {
             x[0] = i;
             x[9] = i;
-            super::transpose_u8x16(&x, &mut y);
-            super::sub_bytes_state(&mut y);
-            super::transpose_u16x8(&y, &mut w);
+            transpose_u8x16(&x, &mut y);
+            sub_bytes_state(&mut y);
+            transpose_u16x8(&y, &mut w);
             if w[0] != sbox_fwd(i as u8) {
                 println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
                 assert!(false);
@@ -1195,9 +1214,9 @@ mod test {
         for i in 0..=255 {
             x[0] = i;
             x[9] = i;
-            super::transpose_u8x16(&x, &mut y);
-            super::sub_bytes_inv_state(&mut y);
-            super::transpose_u16x8(&y, &mut w);
+            transpose_u8x16(&x, &mut y);
+            sub_bytes_inv_state(&mut y);
+            transpose_u16x8(&y, &mut w);
             if w[0] != sbox_inv(i as u8) {
                 println!(
                     "sbox_inv[{}] = {}, should be {}",

From b1f5e8172cbd8c28d51b4ef743af10b14c471cad Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Fri, 6 Jun 2025 14:43:31 +0200
Subject: [PATCH 14/43] move aesgmc

---
 {libcrux-aesgcm => aesgcm}/Cargo.toml                          | 0
 {libcrux-aesgcm => aesgcm}/benches/aesgcm.rs                   | 0
 {libcrux-aesgcm => aesgcm}/src/aes_ctr.rs                      | 0
 {libcrux-aesgcm => aesgcm}/src/aes_gcm.rs                      | 0
 {libcrux-aesgcm => aesgcm}/src/aes_generic.rs                  | 0
 {libcrux-aesgcm => aesgcm}/src/gf128_generic.rs                | 0
 {libcrux-aesgcm => aesgcm}/src/lib.rs                          | 0
 {libcrux-aesgcm => aesgcm}/src/platform.rs                     | 0
 {libcrux-aesgcm => aesgcm}/src/platform/intel_ni.rs            | 0
 {libcrux-aesgcm => aesgcm}/src/platform/intel_ni/aes_core.rs   | 0
 {libcrux-aesgcm => aesgcm}/src/platform/intel_ni/gf128_core.rs | 0
 {libcrux-aesgcm => aesgcm}/src/platform/neon.rs                | 0
 {libcrux-aesgcm => aesgcm}/src/platform/neon/aes_core.rs       | 0
 {libcrux-aesgcm => aesgcm}/src/platform/neon/gf128_core.rs     | 0
 {libcrux-aesgcm => aesgcm}/src/platform/portable.rs            | 0
 {libcrux-aesgcm => aesgcm}/src/platform/portable/aes_core.rs   | 0
 {libcrux-aesgcm => aesgcm}/src/platform/portable/gf128_core.rs | 0
 17 files changed, 0 insertions(+), 0 deletions(-)
 rename {libcrux-aesgcm => aesgcm}/Cargo.toml (100%)
 rename {libcrux-aesgcm => aesgcm}/benches/aesgcm.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/aes_ctr.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/aes_gcm.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/aes_generic.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/gf128_generic.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/lib.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/intel_ni.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/intel_ni/aes_core.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/intel_ni/gf128_core.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/neon.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/neon/aes_core.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/neon/gf128_core.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/portable.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/portable/aes_core.rs (100%)
 rename {libcrux-aesgcm => aesgcm}/src/platform/portable/gf128_core.rs (100%)

diff --git a/libcrux-aesgcm/Cargo.toml b/aesgcm/Cargo.toml
similarity index 100%
rename from libcrux-aesgcm/Cargo.toml
rename to aesgcm/Cargo.toml
diff --git a/libcrux-aesgcm/benches/aesgcm.rs b/aesgcm/benches/aesgcm.rs
similarity index 100%
rename from libcrux-aesgcm/benches/aesgcm.rs
rename to aesgcm/benches/aesgcm.rs
diff --git a/libcrux-aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
similarity index 100%
rename from libcrux-aesgcm/src/aes_ctr.rs
rename to aesgcm/src/aes_ctr.rs
diff --git a/libcrux-aesgcm/src/aes_gcm.rs b/aesgcm/src/aes_gcm.rs
similarity index 100%
rename from libcrux-aesgcm/src/aes_gcm.rs
rename to aesgcm/src/aes_gcm.rs
diff --git a/libcrux-aesgcm/src/aes_generic.rs b/aesgcm/src/aes_generic.rs
similarity index 100%
rename from libcrux-aesgcm/src/aes_generic.rs
rename to aesgcm/src/aes_generic.rs
diff --git a/libcrux-aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
similarity index 100%
rename from libcrux-aesgcm/src/gf128_generic.rs
rename to aesgcm/src/gf128_generic.rs
diff --git a/libcrux-aesgcm/src/lib.rs b/aesgcm/src/lib.rs
similarity index 100%
rename from libcrux-aesgcm/src/lib.rs
rename to aesgcm/src/lib.rs
diff --git a/libcrux-aesgcm/src/platform.rs b/aesgcm/src/platform.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform.rs
rename to aesgcm/src/platform.rs
diff --git a/libcrux-aesgcm/src/platform/intel_ni.rs b/aesgcm/src/platform/intel_ni.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/intel_ni.rs
rename to aesgcm/src/platform/intel_ni.rs
diff --git a/libcrux-aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/intel_ni/aes_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/intel_ni/aes_core.rs
rename to aesgcm/src/platform/intel_ni/aes_core.rs
diff --git a/libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/intel_ni/gf128_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/intel_ni/gf128_core.rs
rename to aesgcm/src/platform/intel_ni/gf128_core.rs
diff --git a/libcrux-aesgcm/src/platform/neon.rs b/aesgcm/src/platform/neon.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/neon.rs
rename to aesgcm/src/platform/neon.rs
diff --git a/libcrux-aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/neon/aes_core.rs
rename to aesgcm/src/platform/neon/aes_core.rs
diff --git a/libcrux-aesgcm/src/platform/neon/gf128_core.rs b/aesgcm/src/platform/neon/gf128_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/neon/gf128_core.rs
rename to aesgcm/src/platform/neon/gf128_core.rs
diff --git a/libcrux-aesgcm/src/platform/portable.rs b/aesgcm/src/platform/portable.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/portable.rs
rename to aesgcm/src/platform/portable.rs
diff --git a/libcrux-aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/portable/aes_core.rs
rename to aesgcm/src/platform/portable/aes_core.rs
diff --git a/libcrux-aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs
similarity index 100%
rename from libcrux-aesgcm/src/platform/portable/gf128_core.rs
rename to aesgcm/src/platform/portable/gf128_core.rs

From f39fcbc70c3809d87eb7f91b0464a94f2540342c Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Fri, 6 Jun 2025 15:43:13 +0200
Subject: [PATCH 15/43] cleanup

---
 Cargo.toml                                 |   2 +-
 aesgcm/Cargo.toml                          |   1 +
 aesgcm/src/aes_ctr.rs                      | 201 +++------------------
 aesgcm/src/aes_ctr/aes128_ctr.rs           |  85 +++++++++
 aesgcm/src/aes_ctr/aes256_ctr.rs           |  75 ++++++++
 aesgcm/src/aes_gcm.rs                      | 138 +++-----------
 aesgcm/src/aes_gcm/aes_gcm_128.rs          | 114 ++++++++++++
 aesgcm/src/aes_gcm/aes_gcm_256.rs          |   0
 aesgcm/src/aes_generic.rs                  |  16 +-
 aesgcm/src/gf128_generic.rs                |  26 ++-
 aesgcm/src/lib.rs                          |   7 +-
 aesgcm/src/platform/portable/aes_core.rs   |   4 +
 aesgcm/src/platform/portable/gf128_core.rs |   1 +
 aesgcm/tests/wycheproof.rs                 |  81 +++++++++
 14 files changed, 441 insertions(+), 310 deletions(-)
 create mode 100644 aesgcm/src/aes_ctr/aes128_ctr.rs
 create mode 100644 aesgcm/src/aes_ctr/aes256_ctr.rs
 create mode 100644 aesgcm/src/aes_gcm/aes_gcm_128.rs
 create mode 100644 aesgcm/src/aes_gcm/aes_gcm_256.rs
 create mode 100644 aesgcm/tests/wycheproof.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5a30c20e4..d7d1dd6d7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,6 @@
 [workspace]
 members = [
+    "aesgcm",
     "sys/hacl",
     "sys/libjade",
     "sys/platform",
@@ -10,7 +11,6 @@ members = [
     "libcrux-ml-kem",
     "libcrux-ml-kem/fuzz",
     "libcrux-sha3",
-    "libcrux-aesgcm",
     "libcrux-ml-dsa",
     "libcrux-intrinsics",
     "libcrux-kem",
diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
index fe71e9ba3..df460a3e0 100644
--- a/aesgcm/Cargo.toml
+++ b/aesgcm/Cargo.toml
@@ -34,6 +34,7 @@ cavp = { version = "0.0.2-beta.2", path = "../cavp" }
 pretty_env_logger = "0.5.0"
 rand_core = { version = "0.6" }
 aes-gcm = "0.10.3"
+wycheproof = "0.6.0"
 
 [lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)', 'cfg(eurydice)'] }
diff --git a/aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
index 8f5f4a49e..ae1fafc6d 100644
--- a/aesgcm/src/aes_ctr.rs
+++ b/aesgcm/src/aes_ctr.rs
@@ -1,51 +1,64 @@
-#![allow(non_camel_case_types)]
-
 use crate::{aes_generic::*, platform::AESState};
-pub struct AES_CTR_Context<T: AESState, const NUM_KEYS: usize> {
+
+mod aes128_ctr;
+// mod aes256_ctr; // TODO: use
+
+pub(crate) use aes128_ctr::*;
+// pub(crate) use aes256_ctr::*;
+
+pub struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
     pub(crate) keyex: ExtendedKey<T, NUM_KEYS>,
     pub(crate) ctr_nonce: [u8; 16],
 }
 
 fn aes_ctr_set_nonce<T: AESState, const NUM_KEYS: usize>(
-    ctx: &mut AES_CTR_Context<T, NUM_KEYS>,
+    ctx: &mut AesCtrContext<T, NUM_KEYS>,
     nonce: &[u8],
 ) {
     debug_assert!(nonce.len() == 12);
+
     ctx.ctr_nonce[0..12].copy_from_slice(nonce);
 }
 
 fn aes_ctr_key_block<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctx: &AesCtrContext<T, NUM_KEYS>,
     ctr: u32,
     out: &mut [u8],
 ) {
     debug_assert!(out.len() == 16);
+
     let mut st_init = ctx.ctr_nonce;
     st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
     let mut st = T::new();
+
     st.load_block(&st_init);
+
     block_cipher(&mut st, ctx.keyex);
+
     st.store_block(out);
 }
 
 #[inline(always)]
 fn aes_ctr_xor_block<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctx: &AesCtrContext<T, NUM_KEYS>,
     ctr: u32,
     inp: &[u8],
     out: &mut [u8],
 ) {
     debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+
     let mut st_init = ctx.ctr_nonce;
     st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
     let mut st = T::new();
     st.load_block(&st_init);
+
     block_cipher(&mut st, ctx.keyex);
+
     st.xor_block(inp, out);
 }
 
 fn aes_ctr_xor_blocks<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctx: &AesCtrContext<T, NUM_KEYS>,
     ctr: u32,
     inp: &[u8],
     out: &mut [u8],
@@ -63,14 +76,16 @@ fn aes_ctr_xor_blocks<T: AESState, const NUM_KEYS: usize>(
 }
 
 fn aes_ctr_update<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AES_CTR_Context<T, NUM_KEYS>,
+    ctx: &AesCtrContext<T, NUM_KEYS>,
     ctr: u32,
     inp: &[u8],
     out: &mut [u8],
 ) {
     debug_assert!(inp.len() == out.len());
+
     let blocks = inp.len() / 16;
     aes_ctr_xor_blocks(&ctx, ctr, &inp[0..blocks * 16], &mut out[0..blocks * 16]);
+
     let last = inp.len() - inp.len() % 16;
     if last < inp.len() {
         aes_ctr_xor_block(
@@ -82,179 +97,11 @@ fn aes_ctr_update<T: AESState, const NUM_KEYS: usize>(
     }
 }
 
-mod aes128_ctr {
-    use super::AES_CTR_Context;
-    use crate::{
-        aes_ctr::{
-            aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block,
-            aes_ctr_xor_blocks,
-        },
-        aes_generic::*,
-        platform::AESState,
-    };
-    pub type AES128_CTR_Context<T> = AES_CTR_Context<T, 11>;
-
-    pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES128_CTR_Context<T> {
-        debug_assert!(nonce.len() == 12);
-        debug_assert!(key.len() == 16);
-        let mut ctr_nonce = [0u8; 16];
-        ctr_nonce[0..12].copy_from_slice(nonce);
-        AES128_CTR_Context {
-            keyex: aes128_key_expansion(key),
-            ctr_nonce,
-        }
-    }
-
-    pub fn aes128_ctr_set_nonce<T: AESState>(ctx: &mut AES128_CTR_Context<T>, nonce: &[u8]) {
-        debug_assert!(nonce.len() == 12);
-        aes_ctr_set_nonce(ctx, nonce);
-    }
-
-    pub fn aes128_ctr_key_block<T: AESState>(
-        ctx: &AES128_CTR_Context<T>,
-        ctr: u32,
-        out: &mut [u8],
-    ) {
-        debug_assert!(out.len() == 16);
-        aes_ctr_key_block(ctx, ctr, out);
-    }
-
-    pub fn aes128_ctr_xor_block<T: AESState>(
-        ctx: &AES128_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        aes_ctr_xor_block(ctx, ctr, inp, out);
-    }
-
-    pub fn aes128_ctr_xor_blocks<T: AESState>(
-        ctx: &AES128_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
-        aes_ctr_xor_blocks(ctx, ctr, inp, out);
-    }
-
-    pub fn aes128_ctr_update<T: AESState>(
-        ctx: &AES128_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len());
-        aes_ctr_update(ctx, ctr, inp, out);
-    }
-
-    pub fn aes128_ctr_encrypt<T: AESState>(
-        key: &[u8],
-        nonce: &[u8],
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(nonce.len() == 12);
-        debug_assert!(key.len() == 16);
-        debug_assert!(inp.len() == out.len());
-        let ctx = aes128_ctr_init::<T>(key, nonce);
-        aes128_ctr_update(&ctx, ctr, inp, out);
-    }
-}
-
-mod aes256_ctr {
-    use super::AES_CTR_Context;
-    use crate::{
-        aes_ctr::{
-            aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block,
-            aes_ctr_xor_blocks,
-        },
-        aes_generic::*,
-        platform::AESState,
-    };
-
-    pub type AES256_CTR_Context<T> = AES_CTR_Context<T, 15>;
-
-    pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> AES256_CTR_Context<T> {
-        debug_assert!(nonce.len() == 12);
-        debug_assert!(key.len() == 32);
-        let mut ctr_nonce = [0u8; 16];
-        ctr_nonce[0..12].copy_from_slice(nonce);
-        AES256_CTR_Context {
-            keyex: aes256_key_expansion(key),
-            ctr_nonce,
-        }
-    }
-
-    pub fn aes256_ctr_key_block<T: AESState>(
-        ctx: &AES256_CTR_Context<T>,
-        ctr: u32,
-        out: &mut [u8],
-    ) {
-        debug_assert!(out.len() == 16);
-        aes_ctr_key_block(ctx, ctr, out);
-    }
-
-    pub fn aes256_ctr_set_nonce<T: AESState>(ctx: &mut AES256_CTR_Context<T>, nonce: &[u8]) {
-        debug_assert!(nonce.len() == 12);
-        aes_ctr_set_nonce(ctx, nonce);
-    }
-
-    pub fn aes256_ctr_xor_block<T: AESState>(
-        ctx: &AES256_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        aes_ctr_xor_block(ctx, ctr, inp, out);
-    }
-
-    pub fn aes256_ctr_xor_blocks<T: AESState>(
-        ctx: &AES256_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
-        aes_ctr_xor_blocks(ctx, ctr, inp, out);
-    }
-
-    pub fn aes256_ctr_update<T: AESState>(
-        ctx: &AES256_CTR_Context<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len());
-        aes_ctr_update(ctx, ctr, inp, out);
-    }
-
-    pub fn aes256_ctr_encrypt<T: AESState>(
-        key: &[u8],
-        nonce: &[u8],
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(nonce.len() == 12);
-        debug_assert!(key.len() == 32);
-        debug_assert!(inp.len() == out.len());
-        let ctx = aes256_ctr_init::<T>(key, nonce);
-        aes256_ctr_update(&ctx, ctr, inp, out);
-    }
-}
-
-pub use aes128_ctr::*;
-pub use aes256_ctr::*;
-
 #[cfg(test)]
 mod test {
     use crate::platform;
 
-    use super::{aes128_ctr_encrypt, aes128_ctr_init, aes128_ctr_xor_block};
+    use super::{aes128_ctr_init, test_utils::*};
 
     const INPUT: [u8; 32] = [
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
diff --git a/aesgcm/src/aes_ctr/aes128_ctr.rs b/aesgcm/src/aes_ctr/aes128_ctr.rs
new file mode 100644
index 000000000..be5baf8dd
--- /dev/null
+++ b/aesgcm/src/aes_ctr/aes128_ctr.rs
@@ -0,0 +1,85 @@
+use super::AesCtrContext;
+use crate::{
+    aes_ctr::{aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update},
+    aes_generic::*,
+    platform::AESState,
+};
+
+/// Type alias for the AES 128 ctr context
+pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, 11>;
+
+pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> Aes128CtrContext<T> {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 16);
+
+    let mut ctr_nonce = [0u8; 16];
+    ctr_nonce[0..12].copy_from_slice(nonce);
+
+    Aes128CtrContext {
+        keyex: aes128_key_expansion(key),
+        ctr_nonce,
+    }
+}
+
+pub fn aes128_ctr_set_nonce<T: AESState>(ctx: &mut Aes128CtrContext<T>, nonce: &[u8]) {
+    debug_assert!(nonce.len() == 12);
+
+    aes_ctr_set_nonce(ctx, nonce);
+}
+
+pub fn aes128_ctr_key_block<T: AESState>(ctx: &Aes128CtrContext<T>, ctr: u32, out: &mut [u8]) {
+    debug_assert!(out.len() == 16);
+
+    aes_ctr_key_block(ctx, ctr, out);
+}
+
+pub fn aes128_ctr_update<T: AESState>(
+    ctx: &Aes128CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len());
+
+    aes_ctr_update(ctx, ctr, inp, out);
+}
+
+#[cfg(test)]
+pub(crate) mod test_utils {
+    use super::*;
+    use crate::aes_ctr::aes_ctr_xor_block;
+
+    pub fn aes128_ctr_xor_block<T: AESState>(
+        ctx: &Aes128CtrContext<T>,
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+        aes_ctr_xor_block(ctx, ctr, inp, out);
+    }
+
+    // pub fn aes128_ctr_xor_blocks<T: AESState>(
+    //     ctx: &Aes128CtrContext<T>,
+    //     ctr: u32,
+    //     inp: &[u8],
+    //     out: &mut [u8],
+    // ) {
+    //     debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
+    //     aes_ctr_xor_blocks(ctx, ctr, inp, out);
+    // }
+
+    pub fn aes128_ctr_encrypt<T: AESState>(
+        key: &[u8],
+        nonce: &[u8],
+        ctr: u32,
+        inp: &[u8],
+        out: &mut [u8],
+    ) {
+        debug_assert!(nonce.len() == 12);
+        debug_assert!(key.len() == 16);
+        debug_assert!(inp.len() == out.len());
+        let ctx = aes128_ctr_init::<T>(key, nonce);
+        aes128_ctr_update(&ctx, ctr, inp, out);
+    }
+}
diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
new file mode 100644
index 000000000..0c29ef79b
--- /dev/null
+++ b/aesgcm/src/aes_ctr/aes256_ctr.rs
@@ -0,0 +1,75 @@
+use super::AesCtrContext;
+use crate::{
+    aes_ctr::{
+        aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block, aes_ctr_xor_blocks,
+    },
+    aes_generic::*,
+    platform::AESState,
+};
+
+pub type Aes256CtrContext<T> = AesCtrContext<T, 15>;
+
+pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> Aes256CtrContext<T> {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 32);
+    let mut ctr_nonce = [0u8; 16];
+    ctr_nonce[0..12].copy_from_slice(nonce);
+    Aes256CtrContext {
+        keyex: aes256_key_expansion(key),
+        ctr_nonce,
+    }
+}
+
+pub fn aes256_ctr_key_block<T: AESState>(ctx: &Aes256CtrContext<T>, ctr: u32, out: &mut [u8]) {
+    debug_assert!(out.len() == 16);
+    aes_ctr_key_block(ctx, ctr, out);
+}
+
+pub fn aes256_ctr_set_nonce<T: AESState>(ctx: &mut Aes256CtrContext<T>, nonce: &[u8]) {
+    debug_assert!(nonce.len() == 12);
+    aes_ctr_set_nonce(ctx, nonce);
+}
+
+pub fn aes256_ctr_xor_block<T: AESState>(
+    ctx: &Aes256CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    aes_ctr_xor_block(ctx, ctr, inp, out);
+}
+
+pub fn aes256_ctr_xor_blocks<T: AESState>(
+    ctx: &Aes256CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
+    aes_ctr_xor_blocks(ctx, ctr, inp, out);
+}
+
+pub fn aes256_ctr_update<T: AESState>(
+    ctx: &Aes256CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len());
+    aes_ctr_update(ctx, ctr, inp, out);
+}
+
+pub fn aes256_ctr_encrypt<T: AESState>(
+    key: &[u8],
+    nonce: &[u8],
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(nonce.len() == 12);
+    debug_assert!(key.len() == 32);
+    debug_assert!(inp.len() == out.len());
+    let ctx = aes256_ctr_init::<T>(key, nonce);
+    aes256_ctr_update(&ctx, ctr, inp, out);
+}
diff --git a/aesgcm/src/aes_gcm.rs b/aesgcm/src/aes_gcm.rs
index 97c6a39f6..3a7ca1610 100644
--- a/aesgcm/src/aes_gcm.rs
+++ b/aesgcm/src/aes_gcm.rs
@@ -1,107 +1,13 @@
-use crate::{
-    aes_ctr::{
-        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce, aes128_ctr_update,
-        AES128_CTR_Context,
-    },
-    gf128_generic::{gf128_emit, gf128_init, gf128_update, gf128_update_padded, GF128State},
-    platform::{AESState, GF128FieldElement},
-};
+pub(crate) mod aes_gcm_128;
+pub(crate) use aes_gcm_128::*;
 
-#[allow(non_camel_case_types)]
-pub struct AES128_GCM_State<T: AESState, U: GF128FieldElement> {
-    aes_state: AES128_CTR_Context<T>,
-    gcm_state: GF128State<U>,
-    tag_mix: [u8; 16],
-}
-
-pub fn aes128_gcm_init<T: AESState, U: GF128FieldElement>(key: &[u8]) -> AES128_GCM_State<T, U> {
-    debug_assert!(key.len() == 16);
-    let nonce = [0u8; 12];
-    let mut gcm_key = [0u8; 16];
-    let tag_mix = [0u8; 16];
-    let aes_state = aes128_ctr_init(key, &nonce);
-    aes128_ctr_key_block(&aes_state, 0, &mut gcm_key);
-    let gcm_state = gf128_init(&gcm_key);
-    AES128_GCM_State {
-        aes_state,
-        gcm_state,
-        tag_mix,
-    }
-}
-
-pub fn aes128_gcm_set_nonce<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    nonce: &[u8],
-) {
-    debug_assert!(nonce.len() == 12);
-    aes128_ctr_set_nonce(&mut st.aes_state, nonce);
-    aes128_ctr_key_block(&st.aes_state, 1, &mut st.tag_mix);
-}
-
-pub fn aes128_gcm_encrypt<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    aad: &[u8],
-    plaintext: &[u8],
-    ciphertext: &mut [u8],
-    tag: &mut [u8],
-) {
-    debug_assert!(ciphertext.len() == plaintext.len());
-    debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
-    debug_assert!(tag.len() == 16);
-    aes128_ctr_update(&st.aes_state, 2, plaintext, ciphertext);
-    gf128_update_padded(&mut st.gcm_state, aad);
-    gf128_update_padded(&mut st.gcm_state, ciphertext);
-    let mut last_block = [0u8; 16];
-    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-    gf128_update(&mut st.gcm_state, &last_block);
-    gf128_emit(&st.gcm_state, tag);
-    for i in 0..16 {
-        tag[i] ^= st.tag_mix[i];
-    }
-}
-
-pub struct DecryptError();
-
-pub fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    aad: &[u8],
-    ciphertext: &[u8],
-    tag: &[u8],
-    plaintext: &mut [u8],
-) -> Result<(), DecryptError> {
-    debug_assert!(plaintext.len() == ciphertext.len());
-    debug_assert!(ciphertext.len() / 16 <= u32::MAX as usize);
-    debug_assert!(tag.len() == 16);
-    gf128_update_padded(&mut st.gcm_state, aad);
-    gf128_update_padded(&mut st.gcm_state, ciphertext);
-    let mut last_block = [0u8; 16];
-    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-    gf128_update(&mut st.gcm_state, &last_block);
-    let mut computed_tag = [0u8; 16];
-    gf128_emit(&st.gcm_state, &mut computed_tag);
-    for i in 0..16 {
-        computed_tag[i] ^= st.tag_mix[i];
-    }
-    let mut eq_mask = 0u8;
-    for i in 0..16 {
-        eq_mask |= computed_tag[i] ^ tag[i];
-    }
-    if eq_mask == 0 {
-        aes128_ctr_update(&st.aes_state, 2, ciphertext, plaintext);
-        Ok(())
-    } else {
-        Err(DecryptError())
-    }
-}
+pub(crate)  mod aes_gcm_256;
 
 #[cfg(test)]
 mod test {
+    use super::aes_gcm_128;
     use crate::platform::portable;
 
-    use super::{aes128_gcm_encrypt, aes128_gcm_init, aes128_gcm_set_nonce};
-
     const INPUT1: [u8; 60] = [
         0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26,
         0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31,
@@ -235,10 +141,10 @@ mod test {
     #[test]
     fn test_gcm1() {
         let mut computed1 = [0u8; 76];
-        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY1);
-        aes128_gcm_set_nonce(&mut st, &NONCE1);
+        let mut st = aes_gcm_128::aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY1);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
         let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
         for i in 0..76 {
             if computed1[i] != EXPECTED1[i] {
                 println!(
@@ -253,10 +159,10 @@ mod test {
     #[test]
     fn test_gcm2() {
         let mut computed2 = [0u8; 668];
-        let mut st = aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY2);
-        aes128_gcm_set_nonce(&mut st, &NONCE2);
+        let mut st = aes_gcm_128::aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY2);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
         let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
         for i in 0..668 {
             if computed2[i] != EXPECTED2[i] {
                 println!(
@@ -275,10 +181,10 @@ mod test {
     #[test]
     fn test_gcm1_neon() {
         let mut computed1 = [0u8; 76];
-        let mut st = aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY1);
-        aes128_gcm_set_nonce(&mut st, &NONCE1);
+        let mut st = aes_gcm_128::aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY1);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
         let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
         for i in 0..76 {
             if computed1[i] != EXPECTED1[i] {
                 println!(
@@ -294,10 +200,10 @@ mod test {
     #[test]
     fn test_gcm2_neon() {
         let mut computed2 = [0u8; 668];
-        let mut st = aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY2);
-        aes128_gcm_set_nonce(&mut st, &NONCE2);
+        let mut st = aes_gcm_128::aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY2);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
         let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
         for i in 0..668 {
             if computed2[i] != EXPECTED2[i] {
                 println!(
@@ -316,10 +222,10 @@ mod test {
     #[test]
     fn test_gcm1_intel() {
         let mut computed1 = [0u8; 76];
-        let mut st = aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY1);
-        aes128_gcm_set_nonce(&mut st, &NONCE1);
+        let mut st = aes_gcm_128::aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY1);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
         let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
         for i in 0..76 {
             if computed1[i] != EXPECTED1[i] {
                 println!(
@@ -335,10 +241,10 @@ mod test {
     #[test]
     fn test_gcm2_intel() {
         let mut computed2 = [0u8; 668];
-        let mut st = aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY2);
-        aes128_gcm_set_nonce(&mut st, &NONCE2);
+        let mut st = aes_gcm_128::aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY2);
+        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
         let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
+        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
         for i in 0..668 {
             if computed2[i] != EXPECTED2[i] {
                 println!(
diff --git a/aesgcm/src/aes_gcm/aes_gcm_128.rs b/aesgcm/src/aes_gcm/aes_gcm_128.rs
new file mode 100644
index 000000000..8413c4140
--- /dev/null
+++ b/aesgcm/src/aes_gcm/aes_gcm_128.rs
@@ -0,0 +1,114 @@
+use crate::{
+    aes_ctr::{
+        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce, aes128_ctr_update,
+        Aes128CtrContext,
+    },
+    gf128_generic::{gf128_emit, gf128_init, gf128_update, gf128_update_padded, GF128State},
+    platform::{AESState, GF128FieldElement},
+};
+
+#[allow(non_camel_case_types)]
+pub(crate) struct AES128_GCM_State<T: AESState, U: GF128FieldElement> {
+    pub(crate) aes_state: Aes128CtrContext<T>,
+    pub(crate) gcm_state: GF128State<U>,
+    pub(crate) tag_mix: [u8; 16],
+}
+
+pub(crate) fn aes128_gcm_init<T: AESState, U: GF128FieldElement>(
+    key: &[u8],
+) -> AES128_GCM_State<T, U> {
+    debug_assert!(key.len() == 16);
+
+    let nonce = [0u8; 12];
+    let mut gcm_key = [0u8; 16];
+    let tag_mix = [0u8; 16];
+
+    let aes_state = aes128_ctr_init(key, &nonce);
+    aes128_ctr_key_block(&aes_state, 0, &mut gcm_key);
+    let gcm_state = gf128_init(&gcm_key);
+
+    AES128_GCM_State {
+        aes_state,
+        gcm_state,
+        tag_mix,
+    }
+}
+
+pub(crate) fn aes128_gcm_set_nonce<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    nonce: &[u8],
+) {
+    debug_assert!(nonce.len() == 12);
+
+    aes128_ctr_set_nonce(&mut st.aes_state, nonce);
+    aes128_ctr_key_block(&st.aes_state, 1, &mut st.tag_mix);
+}
+
+pub(crate) fn aes128_gcm_encrypt<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    aad: &[u8],
+    plaintext: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    debug_assert!(ciphertext.len() == plaintext.len());
+    debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
+    debug_assert!(tag.len() == 16);
+
+    aes128_ctr_update(&st.aes_state, 2, plaintext, ciphertext);
+
+    gf128_update_padded(&mut st.gcm_state, aad);
+    gf128_update_padded(&mut st.gcm_state, ciphertext);
+
+    let mut last_block = [0u8; 16];
+    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+    gf128_update(&mut st.gcm_state, &last_block);
+    gf128_emit(&st.gcm_state, tag);
+
+    for i in 0..16 {
+        tag[i] ^= st.tag_mix[i];
+    }
+}
+
+/// AES-GCM decryption error.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct DecryptError();
+
+pub(crate) fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
+    st: &mut AES128_GCM_State<T, U>,
+    aad: &[u8],
+    ciphertext: &[u8],
+    tag: &[u8],
+    plaintext: &mut [u8],
+) -> Result<(), DecryptError> {
+    debug_assert!(plaintext.len() == ciphertext.len());
+    debug_assert!(ciphertext.len() / 16 <= u32::MAX as usize);
+    debug_assert!(tag.len() == 16);
+
+    gf128_update_padded(&mut st.gcm_state, aad);
+    gf128_update_padded(&mut st.gcm_state, ciphertext);
+
+    let mut last_block = [0u8; 16];
+    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+    gf128_update(&mut st.gcm_state, &last_block);
+
+    let mut computed_tag = [0u8; 16];
+    gf128_emit(&st.gcm_state, &mut computed_tag);
+    for i in 0..16 {
+        computed_tag[i] ^= st.tag_mix[i];
+    }
+    let mut eq_mask = 0u8;
+    for i in 0..16 {
+        eq_mask |= computed_tag[i] ^ tag[i];
+    }
+    if eq_mask == 0 {
+        aes128_ctr_update(&st.aes_state, 2, ciphertext, plaintext);
+        Ok(())
+    } else {
+        Err(DecryptError())
+    }
+}
diff --git a/aesgcm/src/aes_gcm/aes_gcm_256.rs b/aesgcm/src/aes_gcm/aes_gcm_256.rs
new file mode 100644
index 000000000..e69de29bb
diff --git a/aesgcm/src/aes_generic.rs b/aesgcm/src/aes_generic.rs
index 9155535c3..561a506f2 100644
--- a/aesgcm/src/aes_generic.rs
+++ b/aesgcm/src/aes_generic.rs
@@ -2,12 +2,15 @@ use crate::platform::*;
 
 pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 
-const RCON: [u8; 11] = [
-    0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
-];
+// This is inlined into the key expansion below.
+// const RCON: [u8; 11] = [
+//     0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
+// ];
 
+/// 128 - Key expansion
 pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11> {
     debug_assert!(key.len() == 16);
+
     let mut keyex = [T::new(); 11];
     keyex[0].load_block(&key);
 
@@ -18,6 +21,7 @@ pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11
             keyex[$i].key_expansion_step(&prev);
         };
     }
+
     expansion_step128!(1, 0x01);
     expansion_step128!(2, 0x02);
     expansion_step128!(3, 0x04);
@@ -28,11 +32,16 @@ pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11
     expansion_step128!(8, 0x80);
     expansion_step128!(9, 0x1b);
     expansion_step128!(10, 0x36);
+
     keyex
 }
 
+/// 256 - Key expansion
+/// TODO: use
+#[allow(dead_code)]
 pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15> {
     debug_assert!(key.len() == 32);
+
     let mut keyex = [T::new(); 15];
     keyex[0].load_block(&key[0..16]);
     keyex[1].load_block(&key[16..32]);
@@ -43,6 +52,7 @@ pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15
             let prev1 = keyex[$i - 1];
             keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
             keyex[$i].key_expansion_step(&prev0);
+
             let next0 = keyex[$i];
             keyex[$i + 1].aes_keygen_assist1(&next0);
             keyex[$i + 1].key_expansion_step(&prev1);
diff --git a/aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
index 307f0ea34..047b7b332 100644
--- a/aesgcm/src/gf128_generic.rs
+++ b/aesgcm/src/gf128_generic.rs
@@ -7,6 +7,7 @@ pub struct GF128State<T: GF128FieldElement> {
 
 pub fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
     debug_assert!(key.len() == 16);
+
     GF128State {
         accumulator: T::zero(),
         r: T::load_elem(key),
@@ -16,6 +17,7 @@ pub fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
 #[inline(always)]
 pub fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block: &[u8]) {
     debug_assert!(block.len() == 16);
+
     let block_elem = T::load_elem(block);
     st.accumulator.add(&block_elem);
     st.accumulator.mul(&st.r);
@@ -23,6 +25,7 @@ pub fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block: &[u8])
 
 pub fn gf128_update_blocks<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
     debug_assert!(input.len() % 16 == 0);
+
     let blocks = input.len() / 16;
     for i in 0..blocks {
         gf128_update(st, &input[i * 16..i * 16 + 16]);
@@ -31,6 +34,7 @@ pub fn gf128_update_blocks<T: GF128FieldElement>(st: &mut GF128State<T>, input:
 
 pub fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_block: &[u8]) {
     debug_assert!(partial_block.len() < 16);
+
     let mut block = [0u8; 16];
     block[0..partial_block.len()].copy_from_slice(partial_block);
     gf128_update(st, &block);
@@ -39,6 +43,7 @@ pub fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_b
 pub fn gf128_update_padded<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
     let blocks = input.len() / 16;
     gf128_update_blocks(st, &input[0..blocks * 16]);
+
     let last = input.len() - input.len() % 16;
     if last < input.len() {
         gf128_update_last(st, &input[last..]);
@@ -47,21 +52,22 @@ pub fn gf128_update_padded<T: GF128FieldElement>(st: &mut GF128State<T>, input:
 
 pub fn gf128_emit<T: GF128FieldElement>(st: &GF128State<T>, out: &mut [u8]) {
     debug_assert!(out.len() == 16);
-    st.accumulator.store_elem(out);
-}
-
-pub fn gf128<T: GF128FieldElement>(key: &[u8], inp: &[u8], out: &mut [u8]) {
-    debug_assert!(key.len() == 16);
-    debug_assert!(out.len() == 16);
 
-    let mut st = gf128_init::<T>(key);
-    gf128_update_padded(&mut st, inp);
-    gf128_emit(&st, out);
+    st.accumulator.store_elem(out);
 }
 
 #[cfg(test)]
 mod test {
-    use super::gf128;
+    use super::*;
+
+    fn gf128<T: GF128FieldElement>(key: &[u8], inp: &[u8], out: &mut [u8]) {
+        debug_assert!(key.len() == 16);
+        debug_assert!(out.len() == 16);
+
+        let mut st = gf128_init::<T>(key);
+        gf128_update_padded(&mut st, inp);
+        gf128_emit(&st, out);
+    }
 
     const INPUT: [u8; 132] = [
         0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index f9a7cc646..79f4d8cb2 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -4,11 +4,11 @@ mod aes_generic;
 mod gf128_generic;
 pub mod platform;
 
-pub use aes_gcm::DecryptError;
+pub use aes_gcm::aes_gcm_128::DecryptError;
 
 pub mod portable {
     use crate::{
-        aes_gcm::{self, DecryptError},
+        aes_gcm::{self, aes_gcm_128::DecryptError},
         platform,
     };
 
@@ -25,6 +25,7 @@ pub mod portable {
             platform::portable::FieldElement,
         >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
+        eprintln!("tag: {tag:x?}");
         aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
     }
 
@@ -117,6 +118,6 @@ pub mod intel_ni {
             platform::intel_ni::FieldElement,
         >(key);
         aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+        aes_gcm::aes_gcm_128::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
     }
 }
diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
index dd5ee6ad7..a396e2dd6 100644
--- a/aesgcm/src/platform/portable/aes_core.rs
+++ b/aesgcm/src/platform/portable/aes_core.rs
@@ -423,19 +423,23 @@ impl crate::platform::AESState for State {
 
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
+
         transpose_u8x16(b.try_into().unwrap(), self);
     }
 
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
+
         transpose_u16x8(self, out);
     }
 
     #[inline(always)]
     fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+
         let mut block = [0u8; 16];
         self.store_block(&mut block);
+
         for i in 0..inp.len() {
             out[i] = inp[i] ^ block[i];
         }
diff --git a/aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs
index 508bc601b..d86375e53 100644
--- a/aesgcm/src/platform/portable/gf128_core.rs
+++ b/aesgcm/src/platform/portable/gf128_core.rs
@@ -6,6 +6,7 @@ fn zero() -> FieldElement {
 
 fn load_elem(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
+
     u128::from_be_bytes(b.try_into().unwrap())
 }
 
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
new file mode 100644
index 000000000..13ef4fdff
--- /dev/null
+++ b/aesgcm/tests/wycheproof.rs
@@ -0,0 +1,81 @@
+use wycheproof::TestResult;
+
+#[test]
+fn test() {
+    let test_set = wycheproof::aead::TestSet::load(wycheproof::aead::TestName::AesGcm).unwrap();
+
+    macro_rules! run {
+        ($encrypt:expr, $decrypt:expr, $test:expr, $key:expr, $nonce:expr, $aad:expr, $pt:expr) => {
+            let mut ciphertext = vec![0u8; $pt.len()];
+            let mut plaintext = vec![0u8; $pt.len()];
+            let mut tag = [0u8; 16];
+
+            $encrypt($key, $nonce, $aad, $pt, &mut ciphertext, &mut tag);
+            $decrypt($key, $nonce, $aad, &ciphertext, &tag, &mut plaintext).unwrap();
+
+            assert_eq!(plaintext.as_slice(), $pt.as_slice());
+
+            if $test.result == TestResult::Valid {
+                assert_eq!($test.ct.as_slice(), &ciphertext);
+                assert_eq!($test.tag.as_slice(), &tag);
+            } else {
+                let ct_ok = $test.ct.as_slice() == &ciphertext;
+                let tag_ok = $test.tag.as_slice() == &tag;
+                assert!(!ct_ok || !tag_ok);
+            }
+        };
+    }
+
+    for test_group in test_set.test_groups {
+        println!(
+            "* Group key size:{} tag size:{} nonce size:{}",
+            test_group.key_size, test_group.tag_size, test_group.nonce_size,
+        );
+
+        if test_group.nonce_size != 96 {
+            println!("  Skipping unsupported nonce size");
+            continue;
+        }
+
+        if test_group.key_size == 128 {
+            for test in test_group.tests {
+                println!("  Test {}", test.tc_id);
+                run!(
+                    libcrux_aesgcm::portable::aes128_gcm_encrypt,
+                    libcrux_aesgcm::portable::aes128_gcm_decrypt,
+                    test,
+                    &test.key,
+                    &test.nonce,
+                    &test.aad,
+                    &test.pt
+                );
+
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+                run!(
+                    libcrux_aesgcm::neon::aes128_gcm_encrypt,
+                    libcrux_aesgcm::neon::aes128_gcm_decrypt,
+                    test,
+                    &test.key,
+                    &test.nonce,
+                    &test.aad,
+                    &test.pt
+                );
+
+                #[cfg(all(target_arch = "x86_64"))]
+                run!(
+                    libcrux_aesgcm::intel_ni::aes128_gcm_encrypt,
+                    libcrux_aesgcm::intel_ni::aes128_gcm_decrypt,
+                    test,
+                    &test.key,
+                    &test.nonce,
+                    &test.aad,
+                    &test.pt
+                );
+            }
+        } else if test_group.key_size == 256 {
+            for _test in test_group.tests {
+                // TODO
+            }
+        }
+    }
+}

From 242edc4877b2b6c569dec53a0a9d4a56c0f2106b Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Thu, 19 Jun 2025 14:15:03 +0200
Subject: [PATCH 16/43] aesgcm: fixup intrinsics version

---
 aesgcm/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
index df460a3e0..68fe2adee 100644
--- a/aesgcm/Cargo.toml
+++ b/aesgcm/Cargo.toml
@@ -15,7 +15,7 @@ bench = false # so libtest doesn't eat the arguments to criterion
 
 [dependencies]
 libcrux-platform = { version = "0.0.2", path = "../sys/platform" }
-libcrux-intrinsics = { version = "0.0.2", path = "../libcrux-intrinsics" }
+libcrux-intrinsics = { version = "0.0.3-alpha.3", path = "../libcrux-intrinsics" }
 hax-lib.workspace = true
 
 [features]

From a5a3c0f5c3b451d021e708cec0f192e2e7c0dcf8 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Fri, 4 Jul 2025 15:46:20 +0200
Subject: [PATCH 17/43] cleanup

---
 Cargo.toml                                 |  39 +++-
 aesgcm/Cargo.toml                          |  15 +-
 aesgcm/benches/aesgcm.rs                   |   6 +-
 aesgcm/src/aes_ctr.rs                      | 142 +++++-------
 aesgcm/src/aes_ctr/aes128_ctr.rs           |  74 +++---
 aesgcm/src/aes_gcm.rs                      | 258 ---------------------
 aesgcm/src/aes_gcm/aes_gcm_128.rs          | 114 ---------
 aesgcm/src/aes_gcm_128.rs                  | 121 ++++++++++
 aesgcm/src/{aes_gcm => }/aes_gcm_256.rs    |   0
 aesgcm/src/aes_generic.rs                  |  35 ++-
 aesgcm/src/gf128_generic.rs                |  87 +++----
 aesgcm/src/lib.rs                          | 207 ++++++++++++++---
 aesgcm/src/platform.rs                     |  10 +-
 aesgcm/src/platform/neon/aes_core.rs       |   1 -
 aesgcm/src/platform/neon/gf128_core.rs     |  55 ++---
 aesgcm/src/platform/portable/aes_core.rs   |  22 +-
 aesgcm/src/platform/portable/gf128_core.rs |  42 ++--
 aesgcm/tests/wycheproof.rs                 |  87 ++++---
 libcrux-intrinsics/src/arm64.rs            |   6 +
 sys/pqclean/src/bindings.rs                |   2 +-
 traits/Cargo.toml                          |   2 +-
 traits/src/lib.rs                          |   3 +-
 22 files changed, 623 insertions(+), 705 deletions(-)
 delete mode 100644 aesgcm/src/aes_gcm.rs
 delete mode 100644 aesgcm/src/aes_gcm/aes_gcm_128.rs
 create mode 100644 aesgcm/src/aes_gcm_128.rs
 rename aesgcm/src/{aes_gcm => }/aes_gcm_256.rs (100%)

diff --git a/Cargo.toml b/Cargo.toml
index d7d1dd6d7..5ea08e6b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,6 +50,19 @@ allow-branch = ["main"]
 
 [workspace.dependencies]
 hax-lib = { version = "0.3.4" }
+libcrux-intrinsics = { version = "=0.0.3", path = "libcrux-intrinsics" }
+libcrux-aesgcm = { version = "=0.0.2", path = "aesgcm" }
+libcrux-traits = { version = "=0.0.3", path = "traits" }
+libcrux-hacl-rs = { version = "=0.0.3", path = "hacl-rs" }
+libcrux-hacl = { version = "=0.0.2", path = "sys/hacl" }
+libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
+libcrux-hkdf = { version = "=0.0.3", path = "libcrux-hkdf" }
+libcrux-hmac = { version = "=0.0.3", path = "libcrux-hmac" }
+libcrux-sha2 = { version = "=0.0.3", path = "sha2" }
+libcrux-ed25519 = { version = "=0.0.3", path = "ed25519" }
+libcrux-ecdh = { version = "=0.0.3", path = "libcrux-ecdh" }
+libcrux-ml-kem = { version = "=0.0.3", path = "libcrux-ml-kem" }
+libcrux-kem = { version = "=0.0.3", path = "libcrux-kem" }
 
 [package]
 name = "libcrux"
@@ -82,23 +95,27 @@ bench = false                               # so libtest doesn't eat the argumen
 libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
 
 [dependencies]
-libcrux-traits = { version = "=0.0.3", path = "traits" }
+libcrux-hacl-rs.workspace = true
 libcrux-chacha20poly1305 = { version = "=0.0.3", path = "chacha20poly1305" }
-libcrux-hacl-rs = { version = "=0.0.3", path = "hacl-rs" }
-libcrux-hacl = { version = "=0.0.2", path = "sys/hacl" }
-libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
-libcrux-hkdf = { version = "=0.0.3", path = "libcrux-hkdf" }
-libcrux-hmac = { version = "=0.0.3", path = "libcrux-hmac" }
-libcrux-sha2 = { version = "=0.0.3", path = "sha2" }
-libcrux-ed25519 = { version = "=0.0.3", path = "ed25519" }
-libcrux-ecdh = { version = "=0.0.3", path = "libcrux-ecdh" }
-libcrux-ml-kem = { version = "=0.0.3", path = "libcrux-ml-kem" }
-libcrux-kem = { version = "=0.0.3", path = "libcrux-kem" }
+libcrux-ml-kem.workspace = true
+libcrux-traits.workspace = true
+libcrux-hacl.workspace = true
+libcrux-platform.workspace = true
+libcrux-hkdf.workspace = true
+libcrux-hmac.workspace = true
+libcrux-sha2.workspace = true
+libcrux-ed25519.workspace = true
+libcrux-ecdh.workspace = true
+libcrux-kem.workspace = true
+
 rand = { version = "0.9" }
 log = { version = "0.4", optional = true }
+
 # WASM API
 wasm-bindgen = { version = "0.2.87", optional = true }
 getrandom = { version = "0.3", optional = true }
+
+# Proofs
 hax-lib.workspace = true
 
 [dev-dependencies]
diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
index 68fe2adee..13267aa9c 100644
--- a/aesgcm/Cargo.toml
+++ b/aesgcm/Cargo.toml
@@ -8,29 +8,32 @@ edition.workspace = true
 repository.workspace = true
 readme = "README.md"
 description = "Libcrux AES-GCM implementation"
-exclude = ["/proofs", "/c.sh", "/c.yaml", "/tests/tv", "tests/cavp.rs"]
+exclude = []
 
 [lib]
 bench = false # so libtest doesn't eat the arguments to criterion
 
 [dependencies]
-libcrux-platform = { version = "0.0.2", path = "../sys/platform" }
-libcrux-intrinsics = { version = "0.0.3-alpha.3", path = "../libcrux-intrinsics" }
-hax-lib.workspace = true
+libcrux-platform.workspace = true
+libcrux-intrinsics.workspace = true
+libcrux-traits.workspace = true
+
+rand = { version = "0.9", optional = true }
 
 [features]
+default = ["rand"]  # XXX: remove rand here when cleaning up
 simd128 = []
 simd256 = []
+rand = ["dep:rand"]
 
 [[bench]]
 name = "aesgcm"
 harness = false
 
 [dev-dependencies]
+cavp = { version = "0.0.2", path = "../cavp" }
 criterion = "0.5.1"
 hex = "0.4.3"
-rand = "0.8.5"
-cavp = { version = "0.0.2-beta.2", path = "../cavp" }
 pretty_env_logger = "0.5.0"
 rand_core = { version = "0.6" }
 aes-gcm = "0.10.3"
diff --git a/aesgcm/benches/aesgcm.rs b/aesgcm/benches/aesgcm.rs
index 31f7ab0f6..7d5c6c647 100644
--- a/aesgcm/benches/aesgcm.rs
+++ b/aesgcm/benches/aesgcm.rs
@@ -2,11 +2,8 @@
 use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
 
 pub fn randombytes(n: usize) -> Vec<u8> {
-    use rand::rngs::OsRng;
-    use rand::RngCore;
-
     let mut bytes = vec![0u8; n];
-    OsRng.fill_bytes(&mut bytes);
+    rand::rng().fill_bytes(&mut bytes);
     bytes
 }
 
@@ -145,6 +142,7 @@ use aes_gcm::{
     Key, // Or `Aes128Gcm`
     Nonce,
 };
+use rand::RngCore;
 
 fn rustcrypto_aes128_gcm_encrypt(
     key: &[u8],
diff --git a/aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
index ae1fafc6d..6538ac76c 100644
--- a/aesgcm/src/aes_ctr.rs
+++ b/aesgcm/src/aes_ctr.rs
@@ -1,4 +1,4 @@
-use crate::{aes_generic::*, platform::AESState};
+use crate::{aes_gcm_128, aes_generic::*, platform::AESState};
 
 mod aes128_ctr;
 // mod aes256_ctr; // TODO: use
@@ -6,102 +6,86 @@ mod aes128_ctr;
 pub(crate) use aes128_ctr::*;
 // pub(crate) use aes256_ctr::*;
 
-pub struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
-    pub(crate) keyex: ExtendedKey<T, NUM_KEYS>,
-    pub(crate) ctr_nonce: [u8; 16],
+const NONCE_LEN: usize = 16;
+
+/// Generic AES CTR context.
+pub(crate) struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
+    pub(crate) extended_key: ExtendedKey<T, NUM_KEYS>,
+    pub(crate) ctr_nonce: [u8; NONCE_LEN],
 }
 
-fn aes_ctr_set_nonce<T: AESState, const NUM_KEYS: usize>(
-    ctx: &mut AesCtrContext<T, NUM_KEYS>,
-    nonce: &[u8],
-) {
-    debug_assert!(nonce.len() == 12);
+impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
+    fn aes_ctr_set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == aes_gcm_128::NONCE_LEN);
 
-    ctx.ctr_nonce[0..12].copy_from_slice(nonce);
-}
+        self.ctr_nonce[0..aes_gcm_128::NONCE_LEN].copy_from_slice(nonce);
+    }
 
-fn aes_ctr_key_block<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AesCtrContext<T, NUM_KEYS>,
-    ctr: u32,
-    out: &mut [u8],
-) {
-    debug_assert!(out.len() == 16);
+    fn aes_ctr_key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
 
-    let mut st_init = ctx.ctr_nonce;
-    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-    let mut st = T::new();
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
 
-    st.load_block(&st_init);
+        st.load_block(&st_init);
 
-    block_cipher(&mut st, ctx.keyex);
+        block_cipher(&mut st, &self.extended_key);
 
-    st.store_block(out);
-}
+        st.store_block(out);
+    }
 
-#[inline(always)]
-fn aes_ctr_xor_block<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AesCtrContext<T, NUM_KEYS>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    #[inline(always)]
+    fn aes_ctr_xor_block(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= 16);
 
-    let mut st_init = ctx.ctr_nonce;
-    st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-    let mut st = T::new();
-    st.load_block(&st_init);
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
+        st.load_block(&st_init);
 
-    block_cipher(&mut st, ctx.keyex);
+        block_cipher(&mut st, &self.extended_key);
 
-    st.xor_block(inp, out);
-}
+        st.xor_block(input, out);
+    }
 
-fn aes_ctr_xor_blocks<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AesCtrContext<T, NUM_KEYS>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
-    let blocks = inp.len() / 16;
-    for i in 0..blocks {
-        aes_ctr_xor_block(
-            &ctx,
-            ctr.wrapping_add(i as u32),
-            &inp[i * 16..i * 16 + 16],
-            &mut out[i * 16..i * 16 + 16],
-        );
+    fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() % 16 == 0);
+        debug_assert!(input.len() / 16 < u32::MAX as usize);
+
+        let blocks = input.len() / 16;
+        for i in 0..blocks {
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(i as u32),
+                &input[i * 16..i * 16 + 16],
+                &mut out[i * 16..i * 16 + 16],
+            );
+        }
     }
-}
 
-fn aes_ctr_update<T: AESState, const NUM_KEYS: usize>(
-    ctx: &AesCtrContext<T, NUM_KEYS>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len());
-
-    let blocks = inp.len() / 16;
-    aes_ctr_xor_blocks(&ctx, ctr, &inp[0..blocks * 16], &mut out[0..blocks * 16]);
-
-    let last = inp.len() - inp.len() % 16;
-    if last < inp.len() {
-        aes_ctr_xor_block(
-            &ctx,
-            ctr.wrapping_add(blocks as u32),
-            &inp[last..],
-            &mut out[last..],
-        );
+    fn aes_ctr_update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        debug_assert!(input.len() / 16 < u32::MAX as usize);
+
+        let blocks = input.len() / 16;
+        self.aes_ctr_xor_blocks(ctr, &input[0..blocks * 16], &mut out[0..blocks * 16]);
+
+        let last = input.len() - input.len() % 16;
+        if last < input.len() {
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(blocks as u32),
+                &input[last..],
+                &mut out[last..],
+            );
+        }
     }
 }
 
 #[cfg(test)]
 mod test {
-    use crate::platform;
+    use crate::{aes_ctr::Aes128CtrContext, platform};
 
-    use super::{aes128_ctr_init, test_utils::*};
+    use super::test_utils::*;
 
     const INPUT: [u8; 32] = [
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
@@ -124,7 +108,7 @@ mod test {
     #[test]
     fn test_ctr_block() {
         let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = aes128_ctr_init::<platform::portable::State>(&KEY, &NONCE);
+        let ctx = Aes128CtrContext::<platform::portable::State>::init(&KEY, &NONCE);
         aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
         aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
         for i in 0..32 {
@@ -142,7 +126,7 @@ mod test {
     #[test]
     fn test_ctr_block_neon() {
         let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = aes128_ctr_init::<platform::neon::State>(&KEY, &NONCE);
+        let ctx = Aes128CtrContext::<platform::neon::State>::init(&KEY, &NONCE);
         aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
         aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
         for i in 0..32 {
@@ -187,7 +171,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
+    #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
     #[test]
     fn test_ctr_encrypt_intel() {
         let mut computed: [u8; 32] = [0u8; 32];
diff --git a/aesgcm/src/aes_ctr/aes128_ctr.rs b/aesgcm/src/aes_ctr/aes128_ctr.rs
index be5baf8dd..293221513 100644
--- a/aesgcm/src/aes_ctr/aes128_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes128_ctr.rs
@@ -1,6 +1,6 @@
 use super::AesCtrContext;
 use crate::{
-    aes_ctr::{aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update},
+    aes_gcm_128::{KEY_LEN, NONCE_LEN},
     aes_generic::*,
     platform::AESState,
 };
@@ -8,78 +8,64 @@ use crate::{
 /// Type alias for the AES 128 ctr context
 pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, 11>;
 
-pub fn aes128_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> Aes128CtrContext<T> {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 16);
+impl<T: AESState> Aes128CtrContext<T> {
+    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == KEY_LEN);
 
-    let mut ctr_nonce = [0u8; 16];
-    ctr_nonce[0..12].copy_from_slice(nonce);
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..12].copy_from_slice(nonce);
 
-    Aes128CtrContext {
-        keyex: aes128_key_expansion(key),
-        ctr_nonce,
+        Self {
+            extended_key: aes128_key_expansion(key),
+            ctr_nonce,
+        }
     }
-}
 
-pub fn aes128_ctr_set_nonce<T: AESState>(ctx: &mut Aes128CtrContext<T>, nonce: &[u8]) {
-    debug_assert!(nonce.len() == 12);
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
 
-    aes_ctr_set_nonce(ctx, nonce);
-}
+        self.aes_ctr_set_nonce(nonce);
+    }
 
-pub fn aes128_ctr_key_block<T: AESState>(ctx: &Aes128CtrContext<T>, ctr: u32, out: &mut [u8]) {
-    debug_assert!(out.len() == 16);
+    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == KEY_LEN);
 
-    aes_ctr_key_block(ctx, ctr, out);
-}
+        self.aes_ctr_key_block(ctr, out);
+    }
 
-pub fn aes128_ctr_update<T: AESState>(
-    ctx: &Aes128CtrContext<T>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len());
+    pub(crate) fn update(&self, ctr: u32, inp: &[u8], out: &mut [u8]) {
+        debug_assert!(inp.len() == out.len());
 
-    aes_ctr_update(ctx, ctr, inp, out);
+        self.aes_ctr_update(ctr, inp, out);
+    }
 }
 
 #[cfg(test)]
 pub(crate) mod test_utils {
     use super::*;
-    use crate::aes_ctr::aes_ctr_xor_block;
 
-    pub fn aes128_ctr_xor_block<T: AESState>(
+    pub(crate) fn aes128_ctr_xor_block<T: AESState>(
         ctx: &Aes128CtrContext<T>,
         ctr: u32,
         inp: &[u8],
         out: &mut [u8],
     ) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        aes_ctr_xor_block(ctx, ctr, inp, out);
+        ctx.aes_ctr_xor_block(ctr, inp, out);
     }
 
-    // pub fn aes128_ctr_xor_blocks<T: AESState>(
-    //     ctx: &Aes128CtrContext<T>,
-    //     ctr: u32,
-    //     inp: &[u8],
-    //     out: &mut [u8],
-    // ) {
-    //     debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
-    //     aes_ctr_xor_blocks(ctx, ctr, inp, out);
-    // }
-
-    pub fn aes128_ctr_encrypt<T: AESState>(
+    pub(crate) fn aes128_ctr_encrypt<T: AESState>(
         key: &[u8],
         nonce: &[u8],
         ctr: u32,
         inp: &[u8],
         out: &mut [u8],
     ) {
-        debug_assert!(nonce.len() == 12);
-        debug_assert!(key.len() == 16);
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == KEY_LEN);
         debug_assert!(inp.len() == out.len());
-        let ctx = aes128_ctr_init::<T>(key, nonce);
-        aes128_ctr_update(&ctx, ctr, inp, out);
+        let ctx = Aes128CtrContext::<T>::init(key, nonce);
+        ctx.update(ctr, inp, out);
     }
 }
diff --git a/aesgcm/src/aes_gcm.rs b/aesgcm/src/aes_gcm.rs
deleted file mode 100644
index 3a7ca1610..000000000
--- a/aesgcm/src/aes_gcm.rs
+++ /dev/null
@@ -1,258 +0,0 @@
-pub(crate) mod aes_gcm_128;
-pub(crate) use aes_gcm_128::*;
-
-pub(crate)  mod aes_gcm_256;
-
-#[cfg(test)]
-mod test {
-    use super::aes_gcm_128;
-    use crate::platform::portable;
-
-    const INPUT1: [u8; 60] = [
-        0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26,
-        0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31,
-        0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49,
-        0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39,
-    ];
-    const KEY1: [u8; 16] = [
-        0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83,
-        0x08,
-    ];
-    const NONCE1: [u8; 12] = [
-        0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88,
-    ];
-    const AAD1: [u8; 20] = [
-        0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
-        0xef, 0xab, 0xad, 0xda, 0xd2,
-    ];
-    const EXPECTED1: [u8; 76] = [
-        0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4,
-        0x9c, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac,
-        0xa1, 0x2e, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac,
-        0x84, 0xaa, 0x05, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91,
-        0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a,
-        0x47,
-    ];
-
-    const INPUT2: [u8; 652] = [
-        0x08, 0x00, 0x00, 0x1e, 0x00, 0x1c, 0x00, 0x0a, 0x00, 0x14, 0x00, 0x12, 0x00, 0x1d, 0x00,
-        0x17, 0x00, 0x18, 0x00, 0x19, 0x01, 0x00, 0x01, 0x01, 0x01, 0x02, 0x01, 0x03, 0x01, 0x04,
-        0x00, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x01, 0xb9, 0x00, 0x00, 0x01, 0xb5, 0x00, 0x01, 0xb0,
-        0x30, 0x82, 0x01, 0xac, 0x30, 0x82, 0x01, 0x15, 0xa0, 0x03, 0x02, 0x01, 0x02, 0x02, 0x01,
-        0x02, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, 0x05,
-        0x00, 0x30, 0x0e, 0x31, 0x0c, 0x30, 0x0a, 0x06, 0x03, 0x55, 0x04, 0x03, 0x13, 0x03, 0x72,
-        0x73, 0x61, 0x30, 0x1e, 0x17, 0x0d, 0x31, 0x36, 0x30, 0x37, 0x33, 0x30, 0x30, 0x31, 0x32,
-        0x33, 0x35, 0x39, 0x5a, 0x17, 0x0d, 0x32, 0x36, 0x30, 0x37, 0x33, 0x30, 0x30, 0x31, 0x32,
-        0x33, 0x35, 0x39, 0x5a, 0x30, 0x0e, 0x31, 0x0c, 0x30, 0x0a, 0x06, 0x03, 0x55, 0x04, 0x03,
-        0x13, 0x03, 0x72, 0x73, 0x61, 0x30, 0x81, 0x9f, 0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48,
-        0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01, 0x05, 0x00, 0x03, 0x81, 0x8d, 0x00, 0x30, 0x81, 0x89,
-        0x02, 0x81, 0x81, 0x00, 0xb4, 0xbb, 0x49, 0x8f, 0x82, 0x79, 0x30, 0x3d, 0x98, 0x08, 0x36,
-        0x39, 0x9b, 0x36, 0xc6, 0x98, 0x8c, 0x0c, 0x68, 0xde, 0x55, 0xe1, 0xbd, 0xb8, 0x26, 0xd3,
-        0x90, 0x1a, 0x24, 0x61, 0xea, 0xfd, 0x2d, 0xe4, 0x9a, 0x91, 0xd0, 0x15, 0xab, 0xbc, 0x9a,
-        0x95, 0x13, 0x7a, 0xce, 0x6c, 0x1a, 0xf1, 0x9e, 0xaa, 0x6a, 0xf9, 0x8c, 0x7c, 0xed, 0x43,
-        0x12, 0x09, 0x98, 0xe1, 0x87, 0xa8, 0x0e, 0xe0, 0xcc, 0xb0, 0x52, 0x4b, 0x1b, 0x01, 0x8c,
-        0x3e, 0x0b, 0x63, 0x26, 0x4d, 0x44, 0x9a, 0x6d, 0x38, 0xe2, 0x2a, 0x5f, 0xda, 0x43, 0x08,
-        0x46, 0x74, 0x80, 0x30, 0x53, 0x0e, 0xf0, 0x46, 0x1c, 0x8c, 0xa9, 0xd9, 0xef, 0xbf, 0xae,
-        0x8e, 0xa6, 0xd1, 0xd0, 0x3e, 0x2b, 0xd1, 0x93, 0xef, 0xf0, 0xab, 0x9a, 0x80, 0x02, 0xc4,
-        0x74, 0x28, 0xa6, 0xd3, 0x5a, 0x8d, 0x88, 0xd7, 0x9f, 0x7f, 0x1e, 0x3f, 0x02, 0x03, 0x01,
-        0x00, 0x01, 0xa3, 0x1a, 0x30, 0x18, 0x30, 0x09, 0x06, 0x03, 0x55, 0x1d, 0x13, 0x04, 0x02,
-        0x30, 0x00, 0x30, 0x0b, 0x06, 0x03, 0x55, 0x1d, 0x0f, 0x04, 0x04, 0x03, 0x02, 0x05, 0xa0,
-        0x30, 0x0d, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0b, 0x05, 0x00,
-        0x03, 0x81, 0x81, 0x00, 0x85, 0xaa, 0xd2, 0xa0, 0xe5, 0xb9, 0x27, 0x6b, 0x90, 0x8c, 0x65,
-        0xf7, 0x3a, 0x72, 0x67, 0x17, 0x06, 0x18, 0xa5, 0x4c, 0x5f, 0x8a, 0x7b, 0x33, 0x7d, 0x2d,
-        0xf7, 0xa5, 0x94, 0x36, 0x54, 0x17, 0xf2, 0xea, 0xe8, 0xf8, 0xa5, 0x8c, 0x8f, 0x81, 0x72,
-        0xf9, 0x31, 0x9c, 0xf3, 0x6b, 0x7f, 0xd6, 0xc5, 0x5b, 0x80, 0xf2, 0x1a, 0x03, 0x01, 0x51,
-        0x56, 0x72, 0x60, 0x96, 0xfd, 0x33, 0x5e, 0x5e, 0x67, 0xf2, 0xdb, 0xf1, 0x02, 0x70, 0x2e,
-        0x60, 0x8c, 0xca, 0xe6, 0xbe, 0xc1, 0xfc, 0x63, 0xa4, 0x2a, 0x99, 0xbe, 0x5c, 0x3e, 0xb7,
-        0x10, 0x7c, 0x3c, 0x54, 0xe9, 0xb9, 0xeb, 0x2b, 0xd5, 0x20, 0x3b, 0x1c, 0x3b, 0x84, 0xe0,
-        0xa8, 0xb2, 0xf7, 0x59, 0x40, 0x9b, 0xa3, 0xea, 0xc9, 0xd9, 0x1d, 0x40, 0x2d, 0xcc, 0x0c,
-        0xc8, 0xf8, 0x96, 0x12, 0x29, 0xac, 0x91, 0x87, 0xb4, 0x2b, 0x4d, 0xe1, 0x00, 0x00, 0x0f,
-        0x00, 0x00, 0x84, 0x08, 0x04, 0x00, 0x80, 0x45, 0x47, 0xd6, 0x16, 0x8f, 0x25, 0x10, 0xc5,
-        0x50, 0xbd, 0x94, 0x9c, 0xd2, 0xbc, 0x63, 0x1f, 0xf1, 0x34, 0xfa, 0x10, 0xa8, 0x27, 0xff,
-        0x69, 0xb1, 0x66, 0xa6, 0xbd, 0x95, 0xe2, 0x49, 0xed, 0x0d, 0xaf, 0x57, 0x15, 0x92, 0xeb,
-        0xbe, 0x9f, 0xf1, 0x3d, 0xe6, 0xb0, 0x3a, 0xcc, 0x21, 0x81, 0x46, 0x78, 0x1f, 0x69, 0x3b,
-        0x5a, 0x69, 0x2b, 0x73, 0x19, 0xd7, 0x4f, 0xd2, 0xe5, 0x3b, 0x6a, 0x2d, 0xf0, 0xf6, 0x78,
-        0x5d, 0x62, 0x4f, 0x02, 0x4a, 0x44, 0x03, 0x0c, 0xa0, 0x0b, 0x86, 0x9a, 0xe8, 0x1a, 0x53,
-        0x2b, 0x19, 0xe4, 0x7e, 0x52, 0x5f, 0xf4, 0xa6, 0x2c, 0x51, 0xa5, 0x88, 0x9e, 0xb5, 0x65,
-        0xfe, 0xe2, 0x68, 0x59, 0x0d, 0x8a, 0x3c, 0xa3, 0xc1, 0xbc, 0x3b, 0xd5, 0x40, 0x4e, 0x39,
-        0x72, 0x0c, 0xa2, 0xea, 0xee, 0x30, 0x8f, 0x4e, 0x07, 0x00, 0x76, 0x1e, 0x98, 0x63, 0x89,
-        0x14, 0x00, 0x00, 0x20, 0x9e, 0xfe, 0xe0, 0x3e, 0xbf, 0xfb, 0xc0, 0xdc, 0x23, 0xd2, 0x6d,
-        0x95, 0x87, 0x44, 0xc0, 0x9e, 0x30, 0x00, 0x47, 0x7e, 0xff, 0x7a, 0xe3, 0x14, 0x8a, 0x50,
-        0xe5, 0x67, 0x00, 0x13, 0xaa, 0xaa, 0x16,
-    ];
-
-    const KEY2: [u8; 16] = [
-        0xfd, 0xa2, 0xa4, 0x40, 0x46, 0x70, 0x80, 0x8f, 0x49, 0x37, 0x47, 0x8b, 0x8b, 0x6e, 0x3f,
-        0xe1,
-    ];
-    const NONCE2: [u8; 12] = [
-        0xb5, 0xf3, 0xa3, 0xfa, 0xe1, 0xcb, 0x25, 0xc9, 0xdc, 0xd7, 0x39, 0x93,
-    ];
-    const AAD2: [u8; 0] = [];
-
-    const EXPECTED2: [u8; 668] = [
-        0xc1, 0xe6, 0x31, 0xf8, 0x1d, 0x2a, 0xf2, 0x21, 0xeb, 0xb6, 0xa9, 0x57, 0xf5, 0x8f, 0x3e,
-        0xe2, 0x66, 0x27, 0x26, 0x35, 0xe6, 0x7f, 0x99, 0xa7, 0x52, 0xf0, 0xdf, 0x08, 0xad, 0xeb,
-        0x33, 0xba, 0xb8, 0x61, 0x1e, 0x55, 0xf3, 0x3d, 0x72, 0xcf, 0x84, 0x38, 0x24, 0x61, 0xa8,
-        0xbf, 0xe0, 0xa6, 0x59, 0xba, 0x2d, 0xd1, 0x87, 0x3f, 0x6f, 0xcc, 0x70, 0x7a, 0x98, 0x41,
-        0xce, 0xfc, 0x1f, 0xb0, 0x35, 0x26, 0xb9, 0xca, 0x4f, 0xe3, 0x43, 0xe5, 0x80, 0x5e, 0x95,
-        0xa5, 0xc0, 0x1e, 0x56, 0x57, 0x06, 0x38, 0xa7, 0x6a, 0x4b, 0xc8, 0xfe, 0xb0, 0x7b, 0xe8,
-        0x79, 0xf9, 0x05, 0x68, 0x61, 0x7d, 0x90, 0x5f, 0xec, 0xd5, 0xb1, 0x61, 0x9f, 0xb8, 0xec,
-        0x4a, 0x66, 0x28, 0xd1, 0xbb, 0x2b, 0xb2, 0x24, 0xc4, 0x90, 0xff, 0x97, 0xa6, 0xc0, 0xe9,
-        0xac, 0xd0, 0x36, 0x04, 0xbc, 0x3a, 0x59, 0xd8, 0x6b, 0xda, 0xb4, 0xe0, 0x84, 0xc1, 0xc1,
-        0x45, 0x0f, 0x9c, 0x9d, 0x2a, 0xfe, 0xb1, 0x72, 0xc0, 0x72, 0x34, 0xd7, 0x39, 0x86, 0x8e,
-        0xbd, 0x62, 0xde, 0x20, 0x60, 0xa8, 0xde, 0x98, 0x94, 0x14, 0xa8, 0x29, 0x20, 0xda, 0xcd,
-        0x1c, 0xac, 0x0c, 0x6e, 0x72, 0xec, 0xd7, 0xf4, 0x01, 0x85, 0x74, 0xce, 0xac, 0xa6, 0xd2,
-        0x9f, 0x36, 0x1b, 0xc3, 0x7e, 0xe2, 0x88, 0x8b, 0x8e, 0x30, 0x2c, 0xa9, 0x56, 0x1a, 0x9d,
-        0xe9, 0x16, 0x3e, 0xdf, 0xa6, 0x6b, 0xad, 0xd4, 0x89, 0x48, 0x84, 0xc7, 0xb3, 0x59, 0xbc,
-        0xac, 0xae, 0x59, 0x08, 0x05, 0x1b, 0x37, 0x95, 0x2e, 0x10, 0xa4, 0x5f, 0xe7, 0x3f, 0xda,
-        0x12, 0x6e, 0xbd, 0x67, 0x57, 0x5f, 0x1b, 0xed, 0x8a, 0x99, 0x2a, 0x89, 0x47, 0x4d, 0x7d,
-        0xec, 0x1e, 0xed, 0x32, 0x78, 0x24, 0x12, 0x3a, 0x41, 0x4a, 0xdb, 0x66, 0xd5, 0xef, 0x7d,
-        0x08, 0x36, 0xff, 0x98, 0xc2, 0xcd, 0xd7, 0xfb, 0x07, 0x81, 0xe1, 0x92, 0xbf, 0x0c, 0x75,
-        0x68, 0xbf, 0x7d, 0x89, 0x0a, 0x51, 0xc3, 0x32, 0x87, 0x9b, 0x50, 0x37, 0xb2, 0x12, 0xd6,
-        0x22, 0x41, 0x2c, 0xa4, 0x8e, 0x83, 0x23, 0x81, 0x7b, 0xd6, 0xd7, 0x46, 0xee, 0xf6, 0x83,
-        0x84, 0x5c, 0xec, 0x4e, 0x3e, 0xf6, 0x4b, 0x3a, 0x18, 0xfc, 0xce, 0x51, 0x3e, 0xa9, 0x51,
-        0xf3, 0x36, 0x66, 0x93, 0xa7, 0xff, 0x49, 0x0d, 0x09, 0xd0, 0x8a, 0xb1, 0xf6, 0x3e, 0x13,
-        0x62, 0x5a, 0x54, 0x59, 0x61, 0x59, 0x9c, 0x0d, 0x9c, 0x7a, 0x09, 0x9d, 0x11, 0x63, 0xca,
-        0xd1, 0xb9, 0xbc, 0xf8, 0xe9, 0x17, 0xd7, 0x66, 0xb9, 0x88, 0x53, 0xef, 0x68, 0x77, 0x83,
-        0x4f, 0x89, 0x1d, 0xf1, 0x6b, 0xe1, 0xfc, 0xc9, 0xc1, 0x8e, 0xa1, 0x88, 0x2e, 0xa3, 0xf1,
-        0xf4, 0xb6, 0x43, 0x58, 0xe1, 0xb1, 0x46, 0xce, 0xbf, 0xb3, 0xe0, 0x2e, 0x15, 0x3f, 0xdb,
-        0x73, 0xaf, 0x26, 0x93, 0xf2, 0x2c, 0x6f, 0x59, 0x3f, 0xa4, 0x75, 0x38, 0x0b, 0xa6, 0x61,
-        0x17, 0x40, 0xad, 0x20, 0xe3, 0x19, 0xa6, 0x54, 0xac, 0x56, 0x84, 0x77, 0x52, 0x36, 0x16,
-        0x2e, 0x84, 0x47, 0xed, 0x80, 0x88, 0x61, 0xbf, 0xbd, 0xa6, 0xe1, 0x8e, 0xc9, 0x7a, 0xe0,
-        0x90, 0xbf, 0x70, 0x34, 0x75, 0xcf, 0xb9, 0x0f, 0xe2, 0x0a, 0x3c, 0x55, 0xbe, 0xf6, 0xf5,
-        0xeb, 0xa6, 0xe6, 0xa1, 0xda, 0x6a, 0x19, 0x96, 0xb8, 0xbd, 0xe4, 0x21, 0x80, 0x60, 0x8c,
-        0xa2, 0x27, 0x9d, 0xef, 0x8e, 0x81, 0x53, 0x89, 0x5c, 0xc8, 0x50, 0xdb, 0x64, 0x20, 0x56,
-        0x1c, 0x04, 0xb5, 0x72, 0x9c, 0xc6, 0x88, 0x34, 0x36, 0xea, 0x02, 0xee, 0x07, 0xeb, 0x9b,
-        0xae, 0xe2, 0xfb, 0x3a, 0x9e, 0x1b, 0xbd, 0xa8, 0x73, 0x0d, 0x6b, 0x22, 0x05, 0x76, 0xe2,
-        0x4d, 0xf7, 0x0a, 0xf6, 0x92, 0x8e, 0xb8, 0x65, 0xfe, 0xe8, 0xa1, 0xd1, 0xc0, 0xf1, 0x81,
-        0x8a, 0xca, 0x68, 0xd5, 0x00, 0x2a, 0xe4, 0xc6, 0x5b, 0x2f, 0x49, 0xc9, 0xe6, 0xe2, 0x1d,
-        0xcf, 0x76, 0x78, 0x4a, 0xdb, 0xd0, 0xe8, 0x87, 0xa3, 0x68, 0x32, 0xef, 0x85, 0xbe, 0xb1,
-        0x05, 0x87, 0xf1, 0x6c, 0x6f, 0xfe, 0x60, 0xd7, 0x45, 0x10, 0x59, 0xec, 0x7f, 0x10, 0x14,
-        0xc3, 0xef, 0xe1, 0x9e, 0x56, 0xae, 0xdb, 0x5a, 0xd3, 0x1a, 0x9f, 0x29, 0xdc, 0x44, 0x58,
-        0xcf, 0xbf, 0x0c, 0x70, 0x70, 0xc1, 0x75, 0xdc, 0xad, 0x46, 0xe1, 0x67, 0x52, 0x26, 0xb4,
-        0x7c, 0x07, 0x1a, 0xad, 0x31, 0x72, 0xeb, 0xd3, 0x3e, 0x45, 0xd7, 0x41, 0xcb, 0x91, 0x25,
-        0x3a, 0x01, 0xa6, 0x9a, 0xe3, 0xcc, 0x29, 0x2b, 0xce, 0x9c, 0x03, 0x24, 0x6a, 0xc9, 0x51,
-        0xe4, 0x5e, 0x97, 0xeb, 0xf0, 0x4a, 0x9d, 0x51, 0xfa, 0xb5, 0xcf, 0x06, 0xd9, 0x48, 0x5c,
-        0xce, 0x74, 0x6b, 0x1c, 0x07, 0x7b, 0xe6, 0x9a, 0xd1, 0x53, 0xf1, 0x65, 0x6e, 0xf8, 0x9f,
-        0xc7, 0xd1, 0xed, 0x8c, 0x3e, 0x2d, 0xa7, 0xa2,
-    ];
-
-    #[test]
-    fn test_gcm1() {
-        let mut computed1 = [0u8; 76];
-        let mut st = aes_gcm_128::aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY1);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
-        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
-        for i in 0..76 {
-            if computed1[i] != EXPECTED1[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED1[i], computed1[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[test]
-    fn test_gcm2() {
-        let mut computed2 = [0u8; 668];
-        let mut st = aes_gcm_128::aes128_gcm_init::<portable::State, portable::FieldElement>(&KEY2);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
-        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
-        for i in 0..668 {
-            if computed2[i] != EXPECTED2[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED2[i], computed2[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    use crate::platform::neon;
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    #[test]
-    fn test_gcm1_neon() {
-        let mut computed1 = [0u8; 76];
-        let mut st = aes_gcm_128::aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY1);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
-        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
-        for i in 0..76 {
-            if computed1[i] != EXPECTED1[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED1[i], computed1[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    #[test]
-    fn test_gcm2_neon() {
-        let mut computed2 = [0u8; 668];
-        let mut st = aes_gcm_128::aes128_gcm_init::<neon::State, neon::FieldElement>(&KEY2);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
-        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
-        for i in 0..668 {
-            if computed2[i] != EXPECTED2[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED2[i], computed2[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
-    use crate::platform::intel_ni;
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
-    #[test]
-    fn test_gcm1_intel() {
-        let mut computed1 = [0u8; 76];
-        let mut st = aes_gcm_128::aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY1);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE1);
-        let (mut ciphertext, mut tag) = computed1.split_at_mut(60);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD1, &INPUT1, &mut ciphertext, &mut tag);
-        for i in 0..76 {
-            if computed1[i] != EXPECTED1[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED1[i], computed1[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "aes"))]
-    #[test]
-    fn test_gcm2_intel() {
-        let mut computed2 = [0u8; 668];
-        let mut st = aes_gcm_128::aes128_gcm_init::<intel_ni::State, intel_ni::FieldElement>(&KEY2);
-        aes_gcm_128::aes128_gcm_set_nonce(&mut st, &NONCE2);
-        let (mut ciphertext, mut tag) = computed2.split_at_mut(652);
-        aes_gcm_128::aes128_gcm_encrypt(&mut st, &AAD2, &INPUT2, &mut ciphertext, &mut tag);
-        for i in 0..668 {
-            if computed2[i] != EXPECTED2[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED2[i], computed2[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-}
diff --git a/aesgcm/src/aes_gcm/aes_gcm_128.rs b/aesgcm/src/aes_gcm/aes_gcm_128.rs
deleted file mode 100644
index 8413c4140..000000000
--- a/aesgcm/src/aes_gcm/aes_gcm_128.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-use crate::{
-    aes_ctr::{
-        aes128_ctr_init, aes128_ctr_key_block, aes128_ctr_set_nonce, aes128_ctr_update,
-        Aes128CtrContext,
-    },
-    gf128_generic::{gf128_emit, gf128_init, gf128_update, gf128_update_padded, GF128State},
-    platform::{AESState, GF128FieldElement},
-};
-
-#[allow(non_camel_case_types)]
-pub(crate) struct AES128_GCM_State<T: AESState, U: GF128FieldElement> {
-    pub(crate) aes_state: Aes128CtrContext<T>,
-    pub(crate) gcm_state: GF128State<U>,
-    pub(crate) tag_mix: [u8; 16],
-}
-
-pub(crate) fn aes128_gcm_init<T: AESState, U: GF128FieldElement>(
-    key: &[u8],
-) -> AES128_GCM_State<T, U> {
-    debug_assert!(key.len() == 16);
-
-    let nonce = [0u8; 12];
-    let mut gcm_key = [0u8; 16];
-    let tag_mix = [0u8; 16];
-
-    let aes_state = aes128_ctr_init(key, &nonce);
-    aes128_ctr_key_block(&aes_state, 0, &mut gcm_key);
-    let gcm_state = gf128_init(&gcm_key);
-
-    AES128_GCM_State {
-        aes_state,
-        gcm_state,
-        tag_mix,
-    }
-}
-
-pub(crate) fn aes128_gcm_set_nonce<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    nonce: &[u8],
-) {
-    debug_assert!(nonce.len() == 12);
-
-    aes128_ctr_set_nonce(&mut st.aes_state, nonce);
-    aes128_ctr_key_block(&st.aes_state, 1, &mut st.tag_mix);
-}
-
-pub(crate) fn aes128_gcm_encrypt<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    aad: &[u8],
-    plaintext: &[u8],
-    ciphertext: &mut [u8],
-    tag: &mut [u8],
-) {
-    debug_assert!(ciphertext.len() == plaintext.len());
-    debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
-    debug_assert!(tag.len() == 16);
-
-    aes128_ctr_update(&st.aes_state, 2, plaintext, ciphertext);
-
-    gf128_update_padded(&mut st.gcm_state, aad);
-    gf128_update_padded(&mut st.gcm_state, ciphertext);
-
-    let mut last_block = [0u8; 16];
-    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-    gf128_update(&mut st.gcm_state, &last_block);
-    gf128_emit(&st.gcm_state, tag);
-
-    for i in 0..16 {
-        tag[i] ^= st.tag_mix[i];
-    }
-}
-
-/// AES-GCM decryption error.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub struct DecryptError();
-
-pub(crate) fn aes128_gcm_decrypt<T: AESState, U: GF128FieldElement>(
-    st: &mut AES128_GCM_State<T, U>,
-    aad: &[u8],
-    ciphertext: &[u8],
-    tag: &[u8],
-    plaintext: &mut [u8],
-) -> Result<(), DecryptError> {
-    debug_assert!(plaintext.len() == ciphertext.len());
-    debug_assert!(ciphertext.len() / 16 <= u32::MAX as usize);
-    debug_assert!(tag.len() == 16);
-
-    gf128_update_padded(&mut st.gcm_state, aad);
-    gf128_update_padded(&mut st.gcm_state, ciphertext);
-
-    let mut last_block = [0u8; 16];
-    last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-    last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-    gf128_update(&mut st.gcm_state, &last_block);
-
-    let mut computed_tag = [0u8; 16];
-    gf128_emit(&st.gcm_state, &mut computed_tag);
-    for i in 0..16 {
-        computed_tag[i] ^= st.tag_mix[i];
-    }
-    let mut eq_mask = 0u8;
-    for i in 0..16 {
-        eq_mask |= computed_tag[i] ^ tag[i];
-    }
-    if eq_mask == 0 {
-        aes128_ctr_update(&st.aes_state, 2, ciphertext, plaintext);
-        Ok(())
-    } else {
-        Err(DecryptError())
-    }
-}
diff --git a/aesgcm/src/aes_gcm_128.rs b/aesgcm/src/aes_gcm_128.rs
new file mode 100644
index 000000000..38131faaf
--- /dev/null
+++ b/aesgcm/src/aes_gcm_128.rs
@@ -0,0 +1,121 @@
+#![allow(clippy::needless_range_loop)]
+
+use crate::{
+    aes_ctr::Aes128CtrContext,
+    aes_generic::AES_BLOCK_LEN,
+    gf128_generic::GF128State,
+    platform::{AESState, GF128FieldElement},
+    DecryptError,
+};
+
+/// Key length.
+pub(crate) const KEY_LEN: usize = 16;
+
+/// Tag length.
+pub(crate) const TAG_LEN: usize = 16;
+
+/// Nonce length.
+pub(crate) const NONCE_LEN: usize = 12;
+
+/// The AES-GCM 128 state
+pub(crate) struct State<T: AESState, U: GF128FieldElement> {
+    pub(crate) aes_state: Aes128CtrContext<T>,
+    pub(crate) gcm_state: GF128State<U>,
+    pub(crate) tag_mix: [u8; TAG_LEN],
+}
+
+impl<T: AESState, U: GF128FieldElement> State<T, U> {
+    /// Initialize the state
+    pub(crate) fn init(key: &[u8]) -> Self {
+        debug_assert!(key.len() == KEY_LEN);
+
+        let nonce = [0u8; NONCE_LEN];
+        let mut gcm_key = [0u8; KEY_LEN];
+        let tag_mix = [0u8; TAG_LEN];
+
+        let aes_state = Aes128CtrContext::<T>::init(key, &nonce);
+        aes_state.key_block(0, &mut gcm_key);
+        let gcm_state = GF128State::init(&gcm_key);
+
+        Self {
+            aes_state,
+            gcm_state,
+            tag_mix,
+        }
+    }
+
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+
+        self.aes_state.set_nonce(nonce);
+        self.aes_state.key_block(1, &mut self.tag_mix);
+    }
+
+    pub(crate) fn encrypt(
+        &mut self,
+        aad: &[u8],
+        plaintext: &[u8],
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        debug_assert!(ciphertext.len() == plaintext.len());
+        debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        self.aes_state.update(2, plaintext, ciphertext);
+
+        self.gcm_state.update_padded(aad);
+        self.gcm_state.update_padded(ciphertext);
+
+        let mut last_block = [0u8; AES_BLOCK_LEN];
+        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+        self.gcm_state.update(&last_block);
+        self.gcm_state.emit(tag);
+
+        for i in 0..16 {
+            tag[i] ^= self.tag_mix[i];
+        }
+    }
+
+    pub(crate) fn decrypt(
+        &mut self,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        debug_assert!(plaintext.len() == ciphertext.len());
+        debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        self.gcm_state.update_padded(aad);
+        self.gcm_state.update_padded(ciphertext);
+
+        let mut last_block = [0u8; AES_BLOCK_LEN];
+        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+        self.gcm_state.update(&last_block);
+
+        let mut computed_tag = [0u8; TAG_LEN];
+        self.gcm_state.emit(&mut computed_tag);
+
+        for i in 0..16 {
+            computed_tag[i] ^= self.tag_mix[i];
+        }
+
+        let mut eq_mask = 0u8;
+        for i in 0..16 {
+            eq_mask |= computed_tag[i] ^ tag[i];
+        }
+
+        if eq_mask == 0 {
+            self.aes_state.update(2, ciphertext, plaintext);
+            Ok(())
+        } else {
+            Err(DecryptError())
+        }
+    }
+}
diff --git a/aesgcm/src/aes_gcm/aes_gcm_256.rs b/aesgcm/src/aes_gcm_256.rs
similarity index 100%
rename from aesgcm/src/aes_gcm/aes_gcm_256.rs
rename to aesgcm/src/aes_gcm_256.rs
diff --git a/aesgcm/src/aes_generic.rs b/aesgcm/src/aes_generic.rs
index 561a506f2..b1e925c99 100644
--- a/aesgcm/src/aes_generic.rs
+++ b/aesgcm/src/aes_generic.rs
@@ -1,3 +1,7 @@
+#![allow(clippy::needless_range_loop)]
+
+use core::array::from_fn;
+
 use crate::platform::*;
 
 pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
@@ -7,16 +11,21 @@ pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 //     0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
 // ];
 
+/// AES block size
+pub(crate) const AES_BLOCK_LEN: usize = 16;
+
+const AES128_NUM_KEYS: usize = 11;
+
 /// 128 - Key expansion
-pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11> {
+pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, AES128_NUM_KEYS> {
     debug_assert!(key.len() == 16);
 
-    let mut keyex = [T::new(); 11];
-    keyex[0].load_block(&key);
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(key);
 
     macro_rules! expansion_step128 {
         ($i:expr,$rcon:expr) => {
-            let prev = keyex[$i - 1];
+            let prev = keyex[$i - 1].clone();
             keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
             keyex[$i].key_expansion_step(&prev);
         };
@@ -36,24 +45,26 @@ pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 11
     keyex
 }
 
+const AES256_NUM_KEYS: usize = 15;
+
 /// 256 - Key expansion
 /// TODO: use
 #[allow(dead_code)]
-pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15> {
+pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, AES256_NUM_KEYS> {
     debug_assert!(key.len() == 32);
 
-    let mut keyex = [T::new(); 15];
+    let mut keyex = from_fn(|_| T::new());
     keyex[0].load_block(&key[0..16]);
     keyex[1].load_block(&key[16..32]);
 
     macro_rules! expansion_step256 {
         ($i:expr,$rcon:expr) => {
-            let prev0 = keyex[$i - 2];
-            let prev1 = keyex[$i - 1];
+            let prev0 = keyex[$i - 2].clone();
+            let prev1 = keyex[$i - 1].clone();
             keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
             keyex[$i].key_expansion_step(&prev0);
 
-            let next0 = keyex[$i];
+            let next0 = keyex[$i].clone();
             keyex[$i + 1].aes_keygen_assist1(&next0);
             keyex[$i + 1].key_expansion_step(&prev1);
         };
@@ -72,8 +83,8 @@ pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15
     expansion_step256!(12, 0x20);
     expansion_step256!(13, 0x20);
 
-    let prev0 = keyex[12];
-    let prev1 = keyex[13];
+    let prev0 = keyex[12].clone();
+    let prev1 = keyex[13].clone();
     keyex[14].aes_keygen_assist0::<0x40>(&prev1);
     keyex[14].key_expansion_step(&prev0);
     keyex
@@ -81,7 +92,7 @@ pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, 15
 
 pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
     st: &mut T,
-    keyex: ExtendedKey<T, NUM_KEYS>,
+    keyex: &ExtendedKey<T, NUM_KEYS>,
 ) {
     st.xor_key(&keyex[0]);
     for i in 1..NUM_KEYS - 1 {
diff --git a/aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
index 047b7b332..59c456e93 100644
--- a/aesgcm/src/gf128_generic.rs
+++ b/aesgcm/src/gf128_generic.rs
@@ -1,72 +1,75 @@
-use crate::platform::*;
+use crate::{aes_gcm_128::KEY_LEN, aes_generic::AES_BLOCK_LEN, platform::*};
 
-pub struct GF128State<T: GF128FieldElement> {
+pub(crate) struct GF128State<T: GF128FieldElement> {
     accumulator: T,
     r: T,
 }
 
-pub fn gf128_init<T: GF128FieldElement>(key: &[u8]) -> GF128State<T> {
-    debug_assert!(key.len() == 16);
+impl<T: GF128FieldElement> GF128State<T> {
+    pub(crate) fn init(key: &[u8]) -> Self {
+        debug_assert!(key.len() == KEY_LEN);
 
-    GF128State {
-        accumulator: T::zero(),
-        r: T::load_elem(key),
+        Self {
+            accumulator: T::zero(),
+            r: T::load_element(key),
+        }
     }
-}
 
-#[inline(always)]
-pub fn gf128_update<T: GF128FieldElement>(st: &mut GF128State<T>, block: &[u8]) {
-    debug_assert!(block.len() == 16);
+    #[inline(always)]
+    pub(crate) fn update(&mut self, block: &[u8]) {
+        debug_assert!(block.len() == KEY_LEN);
 
-    let block_elem = T::load_elem(block);
-    st.accumulator.add(&block_elem);
-    st.accumulator.mul(&st.r);
-}
+        let block_elem = T::load_element(block);
+        self.accumulator.add(&block_elem);
+        self.accumulator.mul(&self.r);
+    }
 
-pub fn gf128_update_blocks<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
-    debug_assert!(input.len() % 16 == 0);
+    pub(crate) fn update_blocks(&mut self, input: &[u8]) {
+        debug_assert!(input.len() % 16 == 0);
 
-    let blocks = input.len() / 16;
-    for i in 0..blocks {
-        gf128_update(st, &input[i * 16..i * 16 + 16]);
+        let blocks = input.len() / AES_BLOCK_LEN;
+        for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
+            self.update(&input[offset..offset + AES_BLOCK_LEN]);
+        }
     }
-}
 
-pub fn gf128_update_last<T: GF128FieldElement>(st: &mut GF128State<T>, partial_block: &[u8]) {
-    debug_assert!(partial_block.len() < 16);
+    pub(crate) fn update_last(&mut self, partial_block: &[u8]) {
+        debug_assert!(partial_block.len() < 16);
 
-    let mut block = [0u8; 16];
-    block[0..partial_block.len()].copy_from_slice(partial_block);
-    gf128_update(st, &block);
-}
+        let mut block = [0u8; 16];
+        block[0..partial_block.len()].copy_from_slice(partial_block);
+        self.update(&block);
+    }
 
-pub fn gf128_update_padded<T: GF128FieldElement>(st: &mut GF128State<T>, input: &[u8]) {
-    let blocks = input.len() / 16;
-    gf128_update_blocks(st, &input[0..blocks * 16]);
+    pub(crate) fn update_padded(&mut self, input: &[u8]) {
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.update_blocks(&input[0..blocks * AES_BLOCK_LEN]);
 
-    let last = input.len() - input.len() % 16;
-    if last < input.len() {
-        gf128_update_last(st, &input[last..]);
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
+        if last < input.len() {
+            self.update_last(&input[last..]);
+        }
     }
-}
 
-pub fn gf128_emit<T: GF128FieldElement>(st: &GF128State<T>, out: &mut [u8]) {
-    debug_assert!(out.len() == 16);
+    pub(crate) fn emit(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
 
-    st.accumulator.store_elem(out);
+        self.accumulator.store_element(out);
+    }
 }
 
 #[cfg(test)]
 mod test {
     use super::*;
 
-    fn gf128<T: GF128FieldElement>(key: &[u8], inp: &[u8], out: &mut [u8]) {
+    fn gf128<T: GF128FieldElement>(key: &[u8], input: &[u8], out: &mut [u8]) {
         debug_assert!(key.len() == 16);
         debug_assert!(out.len() == 16);
 
-        let mut st = gf128_init::<T>(key);
-        gf128_update_padded(&mut st, inp);
-        gf128_emit(&st, out);
+        let mut st = GF128State::<T>::init(key);
+        st.update_padded(input);
+        st.emit(out);
     }
 
     const INPUT: [u8; 132] = [
@@ -122,7 +125,7 @@ mod test {
         }
     }
 
-    #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
+    #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
     #[test]
     fn test_gf128_intel() {
         let mut computed: [u8; 16] = [0u8; 16];
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 79f4d8cb2..63ddba4f4 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,16 +1,165 @@
-pub mod aes_ctr;
-mod aes_gcm;
+// XXX: make this conditional when cleaning up
+// #[cfg(feature = "rand")]
+use rand::CryptoRng;
+
+mod aes_ctr;
 mod aes_generic;
 mod gf128_generic;
-pub mod platform;
+mod platform;
+
+mod aes_gcm_128;
+mod aes_gcm_256;
+
+use libcrux_traits::aead::{Aead, Error};
+
+/// AES-GCM decryption error.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct DecryptError();
+
+/// AES-GCM 128.
+pub struct AesGcm128 {}
+
+/// Portable AES-GCM 128.
+pub struct PortableAesGcm128 {}
+
+/// Neon AES-GCM 128.
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+pub struct NeonAesGcm128 {}
+#[cfg(not(all(target_arch = "aarch64", target_feature = "aes")))]
+pub type NeonAesGcm128 = PortableAesGcm128;
+
+/// AES-NI AES-GCM 128.
+#[cfg(target_arch = "x86_64")]
+pub struct X64AesGcm128 {}
+#[cfg(not(target_arch = "x86_64"))]
+pub type X64AesGcm128 = PortableAesGcm128;
+
+impl Aead for AesGcm128 {
+    type Key = [u8; 16];
+    type Tag = [u8; 16];
+    type Nonce = [u8; 12];
+
+    // XXX: make this conditional when cleaning up
+    // #[cfg(feature = "rand")]
+    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
+        rng.fill_bytes(key);
+        Ok(())
+    }
+
+    fn encrypt(
+        ciphertext: &mut [u8],
+        tag: &mut Self::Tag,
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        plaintext: &[u8],
+    ) -> Result<(), Error> {
+        if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+            NeonAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+        } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+            X64AesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+        } else {
+            PortableAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+        }
+    }
+
+    fn decrypt(
+        plaintext: &mut [u8],
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &Self::Tag,
+    ) -> Result<(), Error> {
+        if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+            NeonAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+        } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+            X64AesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+        } else {
+            PortableAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+        }
+    }
+}
+
+impl Aead for PortableAesGcm128 {
+    type Key = [u8; 16];
+    type Tag = [u8; 16];
+    type Nonce = [u8; 12];
+
+    // XXX: make this conditional when cleaning up
+    // #[cfg(feature = "rand")]
+    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
+        rng.fill_bytes(key);
+        Ok(())
+    }
+
+    fn encrypt(
+        ciphertext: &mut [u8],
+        tag: &mut Self::Tag,
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        plaintext: &[u8],
+    ) -> Result<(), Error> {
+        portable::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+        Ok(())
+    }
+
+    fn decrypt(
+        plaintext: &mut [u8],
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &Self::Tag,
+    ) -> Result<(), Error> {
+        portable::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+            .map_err(|_| Error::Decrypt)
+    }
+}
+
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+impl Aead for NeonAesGcm128 {
+    type Key = [u8; 16];
+    type Tag = [u8; 16];
+    type Nonce = [u8; 12];
+
+    // XXX: make this conditional when cleaning up
+    // #[cfg(feature = "rand")]
+    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
+        rng.fill_bytes(key);
+        Ok(())
+    }
+
+    fn encrypt(
+        ciphertext: &mut [u8],
+        tag: &mut Self::Tag,
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        plaintext: &[u8],
+    ) -> Result<(), Error> {
+        neon::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+        Ok(())
+    }
 
-pub use aes_gcm::aes_gcm_128::DecryptError;
+    fn decrypt(
+        plaintext: &mut [u8],
+        key: &Self::Key,
+        nonce: &Self::Nonce,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &Self::Tag,
+    ) -> Result<(), Error> {
+        neon::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+            .map_err(|_| Error::Decrypt)
+    }
+}
 
 pub mod portable {
-    use crate::{
-        aes_gcm::{self, aes_gcm_128::DecryptError},
-        platform,
-    };
+    use crate::{aes_gcm_128, platform, DecryptError};
+
+    type State = aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>;
 
     pub fn aes128_gcm_encrypt(
         key: &[u8],
@@ -20,13 +169,9 @@ pub mod portable {
         ciphertext: &mut [u8],
         tag: &mut [u8],
     ) {
-        let mut st = aes_gcm::aes128_gcm_init::<
-            platform::portable::State,
-            platform::portable::FieldElement,
-        >(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        eprintln!("tag: {tag:x?}");
-        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
+        let mut st = State::init(key);
+        st.set_nonce(nonce);
+        st.encrypt(aad, plaintext, ciphertext, tag);
     }
 
     pub fn aes128_gcm_decrypt(
@@ -37,21 +182,17 @@ pub mod portable {
         tag: &[u8],
         plaintext: &mut [u8],
     ) -> Result<(), DecryptError> {
-        let mut st = aes_gcm::aes128_gcm_init::<
-            platform::portable::State,
-            platform::portable::FieldElement,
-        >(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+        let mut st = State::init(key);
+        st.set_nonce(nonce);
+        st.decrypt(aad, ciphertext, tag, plaintext)
     }
 }
 
 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
 pub mod neon {
-    use crate::{
-        aes_gcm::{self, DecryptError},
-        platform,
-    };
+    use crate::{platform, DecryptError};
+
+    type State = crate::aes_gcm_128::State<platform::neon::State, platform::neon::FieldElement>;
 
     pub fn aes128_gcm_encrypt(
         key: &[u8],
@@ -61,10 +202,9 @@ pub mod neon {
         ciphertext: &mut [u8],
         tag: &mut [u8],
     ) {
-        let mut st =
-            aes_gcm::aes128_gcm_init::<platform::neon::State, platform::neon::FieldElement>(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
+        let mut st = State::init(key);
+        st.set_nonce(nonce);
+        st.encrypt(aad, plaintext, ciphertext, tag);
     }
 
     pub fn aes128_gcm_decrypt(
@@ -75,14 +215,13 @@ pub mod neon {
         tag: &[u8],
         plaintext: &mut [u8],
     ) -> Result<(), DecryptError> {
-        let mut st =
-            aes_gcm::aes128_gcm_init::<platform::neon::State, platform::neon::FieldElement>(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
+        let mut st = State::init(key);
+        st.set_nonce(nonce);
+        st.decrypt(aad, ciphertext, tag, plaintext)
     }
 }
 
-#[cfg(all(target_arch = "x86_64"))] // REENABLE target_feature="aes"
+#[cfg(target_arch = "x86_64")] // REENABLE target_feature="aes"
 pub mod intel_ni {
     use crate::{
         aes_gcm::{self, DecryptError},
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
index b46522746..c249626be 100644
--- a/aesgcm/src/platform.rs
+++ b/aesgcm/src/platform.rs
@@ -3,10 +3,10 @@ pub mod portable;
 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
 pub mod neon;
 
-#[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
+#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
 pub mod intel_ni;
 
-pub trait AESState: Copy {
+pub trait AESState: Clone {
     fn new() -> Self;
     fn load_block(&mut self, b: &[u8]);
     fn store_block(&self, out: &mut [u8]);
@@ -20,10 +20,10 @@ pub trait AESState: Copy {
     fn key_expansion_step(&mut self, prev: &Self);
 }
 
-pub trait GF128FieldElement: Copy {
+pub trait GF128FieldElement {
     fn zero() -> Self;
-    fn load_elem(b: &[u8]) -> Self;
-    fn store_elem(&self, b: &mut [u8]);
+    fn load_element(bytes: &[u8]) -> Self;
+    fn store_element(&self, bytes: &mut [u8]);
     fn add(&mut self, other: &Self);
     fn mul(&mut self, other: &Self);
 }
diff --git a/aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
index 166810eee..8b6967c5d 100644
--- a/aesgcm/src/platform/neon/aes_core.rs
+++ b/aesgcm/src/platform/neon/aes_core.rs
@@ -83,7 +83,6 @@ impl crate::platform::AESState for State {
 
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
-        (self, key);
     }
 
     fn aes_enc_last(&mut self, key: &Self) {
diff --git a/aesgcm/src/platform/neon/gf128_core.rs b/aesgcm/src/platform/neon/gf128_core.rs
index 862b97139..af3e4be8b 100644
--- a/aesgcm/src/platform/neon/gf128_core.rs
+++ b/aesgcm/src/platform/neon/gf128_core.rs
@@ -1,4 +1,4 @@
-use core::arch::aarch64::*;
+use libcrux_intrinsics::arm64::*;
 
 #[derive(Clone, Copy)]
 pub struct FieldElement(pub u128);
@@ -7,49 +7,52 @@ fn zero() -> FieldElement {
     FieldElement(0)
 }
 
-fn load_elem(b: &[u8]) -> FieldElement {
+fn load_element(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
     FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
 }
 
-fn store_elem(elem: &FieldElement, b: &mut [u8]) {
-    debug_assert!(b.len() == 16);
-    b.copy_from_slice(&elem.0.to_be_bytes());
+fn store_element(element: &FieldElement, bytes: &mut [u8]) {
+    debug_assert!(bytes.len() == 16);
+    bytes.copy_from_slice(&element.0.to_be_bytes());
 }
 
-fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
-    FieldElement((*elem).0 ^ (*other).0)
+fn add(element: &mut FieldElement, other: &FieldElement) {
+    element.0 ^= other.0;
 }
 
-fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
-    let l0 = (*elem).0 as u64;
-    let h0 = ((*elem).0 >> 64) as u64;
-    let l1 = (*other).0 as u64;
-    let h1 = ((*other).0 >> 64) as u64;
-    let low: u128 = unsafe { vmull_p64(l0, l1) };
-    let m1: u128 = unsafe { vmull_p64(l0, h1) };
-    let m2: u128 = unsafe { vmull_p64(l1, h0) };
-    let high: u128 = unsafe { vmull_p64(h0, h1) };
+fn mul_wide(element: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+    let l0 = element.0 as u64;
+    let h0 = (element.0 >> 64) as u64;
+    let l1 = other.0 as u64;
+    let h1 = (other.0 >> 64) as u64;
+
+    let low: u128 = _vmull_p64(l0, l1);
+    let m1: u128 = _vmull_p64(l0, h1);
+    let m2: u128 = _vmull_p64(l1, h0);
+    let high: u128 = _vmull_p64(h0, h1);
+
     let mid = m1 ^ m2;
     let m0 = mid << 64;
     let m1 = mid >> 64;
     let low = low ^ m0;
     let high = high ^ m1;
+
     (FieldElement(high), FieldElement(low))
 }
 
 fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
-    let high = ((*high).0 << 1) ^ ((*low).0 >> 127);
-    let low = (*low).0 << 1;
+    let high = (high.0 << 1) ^ (low.0 >> 127);
+    let low = low.0 << 1;
     let x0_0 = low << 64;
     let x1_x0 = low ^ (x0_0 << 63) ^ (x0_0 << 62) ^ (x0_0 << 57);
     let x1_x0 = x1_x0 ^ (x1_x0 >> 1) ^ (x1_x0 >> 2) ^ (x1_x0 >> 7);
     FieldElement(x1_x0 ^ high)
 }
 
-fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
+fn mul(x: &mut FieldElement, y: &FieldElement) {
     let (high, low) = mul_wide(x, y);
-    reduce(&high, &low)
+    *x = reduce(&high, &low);
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
@@ -57,19 +60,19 @@ impl crate::platform::GF128FieldElement for FieldElement {
         zero()
     }
 
-    fn load_elem(b: &[u8]) -> Self {
-        load_elem(b)
+    fn load_element(b: &[u8]) -> Self {
+        load_element(b)
     }
 
-    fn store_elem(&self, b: &mut [u8]) {
-        store_elem(self, b);
+    fn store_element(&self, b: &mut [u8]) {
+        store_element(self, b);
     }
 
     fn add(&mut self, other: &Self) {
-        *self = add(self, other);
+        add(self, other);
     }
 
     fn mul(&mut self, other: &Self) {
-        *self = mul(self, other)
+        mul(self, other)
     }
 }
diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
index a396e2dd6..65bb6cc1e 100644
--- a/aesgcm/src/platform/portable/aes_core.rs
+++ b/aesgcm/src/platform/portable/aes_core.rs
@@ -1,3 +1,7 @@
+#![allow(clippy::needless_range_loop)]
+
+use crate::aes_generic::AES_BLOCK_LEN;
+
 pub(crate) type State = [u16; 8];
 
 fn new_state() -> State {
@@ -51,6 +55,7 @@ fn interleave_u16_8(i0: u16, i1: u16) -> (u16, u16) {
     (x, y)
 }
 
+#[inline(always)]
 fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
     let o0 = interleave_u8_1(input[0], input[1]);
     let o1 = interleave_u8_1(input[2], input[3]);
@@ -82,6 +87,7 @@ fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
     output[7] = o7;
 }
 
+#[inline(always)]
 fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
     let (i0, i4) = interleave_u16_8(input[0], input[4]);
     let (i1, i5) = interleave_u16_8(input[1], input[5]);
@@ -275,6 +281,7 @@ fn sub_bytes_state(st: &mut State) {
     st[7] = S0;
 }
 
+#[inline(always)]
 fn shift_row_u16(input: u16) -> u16 {
     (input & 0x1111)
         | ((input & 0x2220) >> 4)
@@ -296,6 +303,7 @@ fn shift_rows_state(st: &mut State) {
     st[7] = shift_row_u16(st[7]);
 }
 
+#[inline(always)]
 fn mix_columns_state(st: &mut State) {
     let mut last_col: u16 = 0;
     for i in 0..8 {
@@ -309,6 +317,7 @@ fn mix_columns_state(st: &mut State) {
     st[4] ^= last_col;
 }
 
+#[inline(always)]
 fn xor_key1_state(st: &mut State, k: &State) {
     st[0] ^= k[0];
     st[1] ^= k[1];
@@ -428,20 +437,20 @@ impl crate::platform::AESState for State {
     }
 
     fn store_block(&self, out: &mut [u8]) {
-        debug_assert!(out.len() == 16);
+        debug_assert!(out.len() == AES_BLOCK_LEN);
 
         transpose_u16x8(self, out);
     }
 
     #[inline(always)]
-    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
 
-        let mut block = [0u8; 16];
+        let mut block = [0u8; AES_BLOCK_LEN];
         self.store_block(&mut block);
 
-        for i in 0..inp.len() {
-            out[i] = inp[i] ^ block[i];
+        for i in 0..input.len() {
+            out[i] = input[i] ^ block[i];
         }
     }
 
@@ -451,7 +460,6 @@ impl crate::platform::AESState for State {
 
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
-        (self, key);
     }
 
     fn aes_enc_last(&mut self, key: &Self) {
diff --git a/aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs
index d86375e53..ff138ffa0 100644
--- a/aesgcm/src/platform/portable/gf128_core.rs
+++ b/aesgcm/src/platform/portable/gf128_core.rs
@@ -1,41 +1,49 @@
 pub(crate) type FieldElement = u128;
 
+#[inline(always)]
 fn zero() -> FieldElement {
     0
 }
 
-fn load_elem(b: &[u8]) -> FieldElement {
-    debug_assert!(b.len() == 16);
+#[inline(always)]
+fn load_element(bytes: &[u8]) -> FieldElement {
+    debug_assert!(bytes.len() == 16);
 
-    u128::from_be_bytes(b.try_into().unwrap())
+    u128::from_be_bytes(bytes.try_into().unwrap())
 }
 
-fn store_elem(elem: &FieldElement, b: &mut [u8]) {
-    debug_assert!(b.len() == 16);
-    b.copy_from_slice(&u128::to_be_bytes(*elem));
+#[inline(always)]
+fn store_element(element: &FieldElement, bytes: &mut [u8]) {
+    debug_assert!(bytes.len() == 16);
+    bytes.copy_from_slice(&u128::to_be_bytes(*element));
 }
 
-fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
-    elem ^ other
+#[inline(always)]
+fn add(element: &FieldElement, other: &FieldElement) -> FieldElement {
+    element ^ other
 }
 
+#[inline(always)]
 fn ith_bit_mask(elem: &FieldElement, i: usize) -> FieldElement {
     debug_assert!(i < 128);
+
     let bit: u16 = ((elem >> (127 - i)) as u16) & 0x1;
     let bit_mask16 = (!bit).wrapping_add(1);
     let bit_mask32 = (bit_mask16 as u32) ^ ((bit_mask16 as u32) << 16);
     let bit_mask64 = (bit_mask32 as u64) ^ ((bit_mask32 as u64) << 32);
-    let bit_mask128 = (bit_mask64 as u128) ^ ((bit_mask64 as u128) << 64);
-    bit_mask128
+
+    (bit_mask64 as u128) ^ ((bit_mask64 as u128) << 64)
 }
 
 const IRRED: FieldElement = 0xE100_0000_0000_0000_0000_0000_0000_0000;
 
+#[inline(always)]
 fn mul_x(elem: &mut FieldElement) {
     let mask = ith_bit_mask(elem, 127);
     *elem = (*elem >> 1) ^ (IRRED & mask)
 }
 
+#[inline(always)]
 fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut FieldElement) {
     debug_assert!(i < 128);
     let mask = ith_bit_mask(x, i);
@@ -43,6 +51,7 @@ fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut Field
     mul_x(y);
 }
 
+#[inline(always)]
 fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
     let mut result = 0;
     let mut multiplicand = *y;
@@ -53,22 +62,27 @@ fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
+    #[inline(always)]
     fn zero() -> Self {
         zero()
     }
 
-    fn load_elem(b: &[u8]) -> Self {
-        load_elem(b)
+    #[inline(always)]
+    fn load_element(bytes: &[u8]) -> Self {
+        load_element(bytes)
     }
 
-    fn store_elem(&self, b: &mut [u8]) {
-        store_elem(self, b);
+    #[inline(always)]
+    fn store_element(&self, bytes: &mut [u8]) {
+        store_element(self, bytes);
     }
 
+    #[inline(always)]
     fn add(&mut self, other: &Self) {
         *self = add(self, other);
     }
 
+    #[inline(always)]
     fn mul(&mut self, other: &Self) {
         *self = mul(self, other)
     }
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
index 13ef4fdff..c06916e06 100644
--- a/aesgcm/tests/wycheproof.rs
+++ b/aesgcm/tests/wycheproof.rs
@@ -1,29 +1,44 @@
-use wycheproof::TestResult;
+use libcrux_traits::aead::Aead;
+use wycheproof::{aead::Test, TestResult};
 
 #[test]
 fn test() {
     let test_set = wycheproof::aead::TestSet::load(wycheproof::aead::TestName::AesGcm).unwrap();
 
-    macro_rules! run {
-        ($encrypt:expr, $decrypt:expr, $test:expr, $key:expr, $nonce:expr, $aad:expr, $pt:expr) => {
-            let mut ciphertext = vec![0u8; $pt.len()];
-            let mut plaintext = vec![0u8; $pt.len()];
-            let mut tag = [0u8; 16];
+    fn run<Cipher: Aead<Tag = [u8; 16], Key = [u8; 16], Nonce = [u8; 12]>>(test: &Test) {
+        let mut ciphertext = vec![0u8; test.pt.len()];
+        let mut plaintext = vec![0u8; test.pt.len()];
+        let mut tag = [0u8; 16];
 
-            $encrypt($key, $nonce, $aad, $pt, &mut ciphertext, &mut tag);
-            $decrypt($key, $nonce, $aad, &ciphertext, &tag, &mut plaintext).unwrap();
+        Cipher::encrypt(
+            &mut ciphertext,
+            &mut tag,
+            test.key.as_ref().try_into().unwrap(),
+            test.nonce.as_ref().try_into().unwrap(),
+            &test.aad,
+            &test.pt,
+        )
+        .unwrap();
+        Cipher::decrypt(
+            &mut plaintext,
+            test.key.as_ref().try_into().unwrap(),
+            test.nonce.as_ref().try_into().unwrap(),
+            &test.aad,
+            &ciphertext,
+            tag.as_ref().try_into().unwrap(),
+        )
+        .unwrap();
 
-            assert_eq!(plaintext.as_slice(), $pt.as_slice());
+        assert_eq!(plaintext.as_slice(), test.pt.as_slice());
 
-            if $test.result == TestResult::Valid {
-                assert_eq!($test.ct.as_slice(), &ciphertext);
-                assert_eq!($test.tag.as_slice(), &tag);
-            } else {
-                let ct_ok = $test.ct.as_slice() == &ciphertext;
-                let tag_ok = $test.tag.as_slice() == &tag;
-                assert!(!ct_ok || !tag_ok);
-            }
-        };
+        if test.result == TestResult::Valid {
+            assert_eq!(test.ct.as_slice(), &ciphertext);
+            assert_eq!(test.tag.as_slice(), &tag);
+        } else {
+            let ct_ok = test.ct.as_slice() == &ciphertext;
+            let tag_ok = test.tag.as_slice() == &tag;
+            assert!(!ct_ok || !tag_ok);
+        }
     }
 
     for test_group in test_set.test_groups {
@@ -40,37 +55,19 @@ fn test() {
         if test_group.key_size == 128 {
             for test in test_group.tests {
                 println!("  Test {}", test.tc_id);
-                run!(
-                    libcrux_aesgcm::portable::aes128_gcm_encrypt,
-                    libcrux_aesgcm::portable::aes128_gcm_decrypt,
-                    test,
-                    &test.key,
-                    &test.nonce,
-                    &test.aad,
-                    &test.pt
-                );
+                // Multiplexing
+                run::<libcrux_aesgcm::AesGcm128>(&test);
+
+                // Portable
+                run::<libcrux_aesgcm::PortableAesGcm128>(&test);
 
+                // Neon
                 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-                run!(
-                    libcrux_aesgcm::neon::aes128_gcm_encrypt,
-                    libcrux_aesgcm::neon::aes128_gcm_decrypt,
-                    test,
-                    &test.key,
-                    &test.nonce,
-                    &test.aad,
-                    &test.pt
-                );
+                run::<libcrux_aesgcm::NeonAesGcm128>(&test);
 
+                // x64
                 #[cfg(all(target_arch = "x86_64"))]
-                run!(
-                    libcrux_aesgcm::intel_ni::aes128_gcm_encrypt,
-                    libcrux_aesgcm::intel_ni::aes128_gcm_decrypt,
-                    test,
-                    &test.key,
-                    &test.nonce,
-                    &test.aad,
-                    &test.pt
-                );
+                run::<libcrux_aesgcm::X64AesGcm128>(&test);
             }
         } else if test_group.key_size == 256 {
             for _test in test_group.tests {
diff --git a/libcrux-intrinsics/src/arm64.rs b/libcrux-intrinsics/src/arm64.rs
index f6bbefb04..c78871555 100644
--- a/libcrux-intrinsics/src/arm64.rs
+++ b/libcrux-intrinsics/src/arm64.rs
@@ -351,6 +351,7 @@ pub fn _vld1q_u16(ptr: &[u16]) -> uint16x8_t {
 pub fn _vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
     unsafe { vcleq_s16(a, b) }
 }
+
 #[inline(always)]
 pub fn _vaddvq_u16(a: uint16x8_t) -> u16 {
     unsafe { vaddvq_u16(a) }
@@ -431,3 +432,8 @@ pub fn _vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
     )))]
     _veorq_u64(a, _vbicq_u64(b, c))
 }
+
+#[inline(always)]
+pub fn _vmull_p64(a: u64, b: u64) -> u128 {
+    unsafe { vmull_p64(a, b) }
+}
diff --git a/sys/pqclean/src/bindings.rs b/sys/pqclean/src/bindings.rs
index 581711da3..84b3eef4f 100644
--- a/sys/pqclean/src/bindings.rs
+++ b/sys/pqclean/src/bindings.rs
@@ -1,4 +1,4 @@
-/* automatically generated by rust-bindgen 0.72.0 */
+/* automatically generated by rust-bindgen 0.72.1 */
 
 pub const SHAKE128_RATE: u32 = 168;
 pub const SHAKE256_RATE: u32 = 136;
diff --git a/traits/Cargo.toml b/traits/Cargo.toml
index 2c5549140..883515e4a 100644
--- a/traits/Cargo.toml
+++ b/traits/Cargo.toml
@@ -21,5 +21,5 @@ generic-tests = []
 alloc = []
 
 [dependencies]
-rand = { version = "0.9", default-features = false }
+rand = { version = "0.9", default-features = false, optional = true }
 libcrux-secrets = { version = "=0.0.3", path = "../secrets" }
diff --git a/traits/src/lib.rs b/traits/src/lib.rs
index 6bb77bcdc..090234e3c 100644
--- a/traits/src/lib.rs
+++ b/traits/src/lib.rs
@@ -1,6 +1,7 @@
 #![no_std]
 
-extern crate alloc;
+pub mod aead;
+pub mod kem;
 
 // NOTE: This Digest trait and the new `digest` trait APIs overlap to some extent.
 // See issue #1039

From 0083c9402da46935f2bbe22936369de1e2af40ba Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Tue, 8 Jul 2025 11:04:11 +0200
Subject: [PATCH 18/43] towards aes256

---
 aesgcm/src/aes_ctr.rs                    |  37 +-
 aesgcm/src/aes_ctr/aes128_ctr.rs         |  45 ++-
 aesgcm/src/aes_ctr/aes256_ctr.rs         | 142 +++----
 aesgcm/src/aes_gcm_128.rs                |  10 +-
 aesgcm/src/aes_gcm_256.rs                | 116 ++++++
 aesgcm/src/aes_generic.rs                |  80 ----
 aesgcm/src/gf128_generic.rs              |   4 +-
 aesgcm/src/lib.rs                        | 258 +++++++------
 aesgcm/src/platform.rs                   |   2 +-
 aesgcm/src/platform/portable/aes_core.rs | 344 +++++++++--------
 aesgcm/test.py                           | 455 +++++++++++++++++++++++
 aesgcm/tests/wycheproof.rs               |  64 +++-
 traits/Cargo.toml                        |   2 +-
 traits/src/lib.rs                        |   3 +-
 14 files changed, 1098 insertions(+), 464 deletions(-)
 create mode 100644 aesgcm/test.py

diff --git a/aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
index 6538ac76c..ed89ad2e5 100644
--- a/aesgcm/src/aes_ctr.rs
+++ b/aesgcm/src/aes_ctr.rs
@@ -1,10 +1,10 @@
-use crate::{aes_gcm_128, aes_generic::*, platform::AESState};
+use crate::{aes_generic::*, platform::AESState};
 
 mod aes128_ctr;
-// mod aes256_ctr; // TODO: use
+mod aes256_ctr;
 
 pub(crate) use aes128_ctr::*;
-// pub(crate) use aes256_ctr::*;
+pub(crate) use aes256_ctr::*;
 
 const NONCE_LEN: usize = 16;
 
@@ -16,13 +16,13 @@ pub(crate) struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
 
 impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
     fn aes_ctr_set_nonce(&mut self, nonce: &[u8]) {
-        debug_assert!(nonce.len() == aes_gcm_128::NONCE_LEN);
+        debug_assert!(nonce.len() == crate::NONCE_LEN);
 
-        self.ctr_nonce[0..aes_gcm_128::NONCE_LEN].copy_from_slice(nonce);
+        self.ctr_nonce[0..crate::NONCE_LEN].copy_from_slice(nonce);
     }
 
     fn aes_ctr_key_block(&self, ctr: u32, out: &mut [u8]) {
-        debug_assert!(out.len() == 16);
+        debug_assert!(out.len() == AES_BLOCK_LEN);
 
         let mut st_init = self.ctr_nonce;
         st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
@@ -37,7 +37,7 @@ impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
 
     #[inline(always)]
     fn aes_ctr_xor_block(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len() && input.len() <= 16);
+        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
 
         let mut st_init = self.ctr_nonce;
         st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
@@ -50,27 +50,32 @@ impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
     }
 
     fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len() && input.len() % 16 == 0);
-        debug_assert!(input.len() / 16 < u32::MAX as usize);
+        debug_assert!(input.len() == out.len() && input.len() % AES_BLOCK_LEN == 0);
+        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
 
-        let blocks = input.len() / 16;
+        let blocks = input.len() / AES_BLOCK_LEN;
         for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
             self.aes_ctr_xor_block(
                 ctr.wrapping_add(i as u32),
-                &input[i * 16..i * 16 + 16],
-                &mut out[i * 16..i * 16 + 16],
+                &input[offset..offset + AES_BLOCK_LEN],
+                &mut out[offset..offset + AES_BLOCK_LEN],
             );
         }
     }
 
     fn aes_ctr_update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
         debug_assert!(input.len() == out.len());
-        debug_assert!(input.len() / 16 < u32::MAX as usize);
+        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
 
-        let blocks = input.len() / 16;
-        self.aes_ctr_xor_blocks(ctr, &input[0..blocks * 16], &mut out[0..blocks * 16]);
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.aes_ctr_xor_blocks(
+            ctr,
+            &input[0..blocks * AES_BLOCK_LEN],
+            &mut out[0..blocks * AES_BLOCK_LEN],
+        );
 
-        let last = input.len() - input.len() % 16;
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
         if last < input.len() {
             self.aes_ctr_xor_block(
                 ctr.wrapping_add(blocks as u32),
diff --git a/aesgcm/src/aes_ctr/aes128_ctr.rs b/aesgcm/src/aes_ctr/aes128_ctr.rs
index 293221513..41b9f437c 100644
--- a/aesgcm/src/aes_ctr/aes128_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes128_ctr.rs
@@ -1,12 +1,12 @@
+use core::array::from_fn;
+
 use super::AesCtrContext;
-use crate::{
-    aes_gcm_128::{KEY_LEN, NONCE_LEN},
-    aes_generic::*,
-    platform::AESState,
-};
+use crate::{aes_gcm_128::KEY_LEN, aes_generic::*, platform::AESState, NONCE_LEN};
+
+pub(super) const NUM_KEYS: usize = 11;
 
-/// Type alias for the AES 128 ctr context
-pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, 11>;
+/// Type alias for the AES 128 ctr context.
+pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
 
 impl<T: AESState> Aes128CtrContext<T> {
     pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
@@ -17,7 +17,7 @@ impl<T: AESState> Aes128CtrContext<T> {
         ctr_nonce[0..12].copy_from_slice(nonce);
 
         Self {
-            extended_key: aes128_key_expansion(key),
+            extended_key: key_expansion(key),
             ctr_nonce,
         }
     }
@@ -41,6 +41,35 @@ impl<T: AESState> Aes128CtrContext<T> {
     }
 }
 
+/// 128 - Key expansion
+fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
+    debug_assert!(key.len() == KEY_LEN);
+
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(key);
+
+    macro_rules! expansion_step128 {
+        ($i:expr,$rcon:expr) => {
+            let prev = keyex[$i - 1].clone();
+            keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
+            keyex[$i].key_expansion_step(&prev);
+        };
+    }
+
+    expansion_step128!(1, 0x01);
+    expansion_step128!(2, 0x02);
+    expansion_step128!(3, 0x04);
+    expansion_step128!(4, 0x08);
+    expansion_step128!(5, 0x10);
+    expansion_step128!(6, 0x20);
+    expansion_step128!(7, 0x40);
+    expansion_step128!(8, 0x80);
+    expansion_step128!(9, 0x1b);
+    expansion_step128!(10, 0x36);
+
+    keyex
+}
+
 #[cfg(test)]
 pub(crate) mod test_utils {
     use super::*;
diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
index 0c29ef79b..a10ed5cff 100644
--- a/aesgcm/src/aes_ctr/aes256_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes256_ctr.rs
@@ -1,75 +1,89 @@
+use core::array::from_fn;
+
 use super::AesCtrContext;
-use crate::{
-    aes_ctr::{
-        aes_ctr_key_block, aes_ctr_set_nonce, aes_ctr_update, aes_ctr_xor_block, aes_ctr_xor_blocks,
-    },
-    aes_generic::*,
-    platform::AESState,
-};
-
-pub type Aes256CtrContext<T> = AesCtrContext<T, 15>;
-
-pub fn aes256_ctr_init<T: AESState>(key: &[u8], nonce: &[u8]) -> Aes256CtrContext<T> {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 32);
-    let mut ctr_nonce = [0u8; 16];
-    ctr_nonce[0..12].copy_from_slice(nonce);
-    Aes256CtrContext {
-        keyex: aes256_key_expansion(key),
-        ctr_nonce,
+use crate::{aes_gcm_256::KEY_LEN, aes_generic::*, platform::AESState, NONCE_LEN};
+
+pub(crate) const NUM_KEYS: usize = 15;
+
+/// Type alias for the AES 256 ctr context.
+pub(crate) type Aes256CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
+
+impl<T: AESState> Aes256CtrContext<T> {
+    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == KEY_LEN);
+
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..NONCE_LEN].copy_from_slice(nonce);
+
+        Self {
+            extended_key: key_expansion(key),
+            ctr_nonce,
+        }
     }
-}
 
-pub fn aes256_ctr_key_block<T: AESState>(ctx: &Aes256CtrContext<T>, ctr: u32, out: &mut [u8]) {
-    debug_assert!(out.len() == 16);
-    aes_ctr_key_block(ctx, ctr, out);
-}
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        self.aes_ctr_set_nonce(nonce);
+    }
 
-pub fn aes256_ctr_set_nonce<T: AESState>(ctx: &mut Aes256CtrContext<T>, nonce: &[u8]) {
-    debug_assert!(nonce.len() == 12);
-    aes_ctr_set_nonce(ctx, nonce);
-}
+    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
+        self.aes_ctr_key_block(ctr, out);
+    }
 
-pub fn aes256_ctr_xor_block<T: AESState>(
-    ctx: &Aes256CtrContext<T>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-    aes_ctr_xor_block(ctx, ctr, inp, out);
+    pub(crate) fn update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        self.aes_ctr_update(ctr, input, out);
+    }
 }
 
-pub fn aes256_ctr_xor_blocks<T: AESState>(
-    ctx: &Aes256CtrContext<T>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len() && inp.len() % 16 == 0);
-    aes_ctr_xor_blocks(ctx, ctr, inp, out);
-}
+/// 256 - Key expansion
+fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
+    debug_assert!(key.len() == KEY_LEN);
+
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(&key[0..16]);
+    keyex[1].load_block(&key[16..32]);
+
+    macro_rules! expansion_step256 {
+        ($i:expr,$rcon:expr) => {
+            let prev0 = keyex[$i - 2].clone();
+            let prev1 = keyex[$i - 1].clone();
+
+            keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
+            keyex[$i].key_expansion_step(&prev0);
+
+            // XXX: avoid clone
+            let next0 = keyex[$i].clone();
+            keyex[$i + 1].aes_keygen_assist1(&next0);
+            keyex[$i + 1].key_expansion_step(&prev1);
+        };
+    }
+
+    expansion_step256!(2, 0x01);
+    expansion_step256!(3, 0x01);
+    expansion_step256!(4, 0x02);
+    expansion_step256!(5, 0x02);
+    expansion_step256!(6, 0x04);
+    expansion_step256!(7, 0x04);
+    expansion_step256!(8, 0x08);
+    expansion_step256!(9, 0x08);
+    expansion_step256!(10, 0x10);
+    expansion_step256!(11, 0x10);
+    expansion_step256!(12, 0x20);
+    expansion_step256!(13, 0x20);
+
+    let prev0 = keyex[12].clone();
+    let prev1 = keyex[13].clone();
+    keyex[14].aes_keygen_assist0::<0x40>(&prev1);
+    keyex[14].key_expansion_step(&prev0);
 
-pub fn aes256_ctr_update<T: AESState>(
-    ctx: &Aes256CtrContext<T>,
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(inp.len() == out.len());
-    aes_ctr_update(ctx, ctr, inp, out);
+    keyex
 }
 
-pub fn aes256_ctr_encrypt<T: AESState>(
-    key: &[u8],
-    nonce: &[u8],
-    ctr: u32,
-    inp: &[u8],
-    out: &mut [u8],
-) {
-    debug_assert!(nonce.len() == 12);
-    debug_assert!(key.len() == 32);
-    debug_assert!(inp.len() == out.len());
-    let ctx = aes256_ctr_init::<T>(key, nonce);
-    aes256_ctr_update(&ctx, ctr, inp, out);
+fn print<T: AESState>(s: &str, keyex: &T) {
+    let mut tmp = [0u8; 16];
+    keyex.store_block(&mut tmp);
+    eprintln!("{s}: {:02x?}", tmp);
 }
diff --git a/aesgcm/src/aes_gcm_128.rs b/aesgcm/src/aes_gcm_128.rs
index 38131faaf..31fca7add 100644
--- a/aesgcm/src/aes_gcm_128.rs
+++ b/aesgcm/src/aes_gcm_128.rs
@@ -5,18 +5,12 @@ use crate::{
     aes_generic::AES_BLOCK_LEN,
     gf128_generic::GF128State,
     platform::{AESState, GF128FieldElement},
-    DecryptError,
+    DecryptError, NONCE_LEN, TAG_LEN,
 };
 
 /// Key length.
 pub(crate) const KEY_LEN: usize = 16;
 
-/// Tag length.
-pub(crate) const TAG_LEN: usize = 16;
-
-/// Nonce length.
-pub(crate) const NONCE_LEN: usize = 12;
-
 /// The AES-GCM 128 state
 pub(crate) struct State<T: AESState, U: GF128FieldElement> {
     pub(crate) aes_state: Aes128CtrContext<T>,
@@ -59,7 +53,7 @@ impl<T: AESState, U: GF128FieldElement> State<T, U> {
         tag: &mut [u8],
     ) {
         debug_assert!(ciphertext.len() == plaintext.len());
-        debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
+        debug_assert!(plaintext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
         debug_assert!(tag.len() == TAG_LEN);
 
         self.aes_state.update(2, plaintext, ciphertext);
diff --git a/aesgcm/src/aes_gcm_256.rs b/aesgcm/src/aes_gcm_256.rs
index e69de29bb..3616a0e6b 100644
--- a/aesgcm/src/aes_gcm_256.rs
+++ b/aesgcm/src/aes_gcm_256.rs
@@ -0,0 +1,116 @@
+#![allow(clippy::needless_range_loop)]
+
+use crate::{
+    aes_ctr::Aes256CtrContext,
+    aes_generic::AES_BLOCK_LEN,
+    gf128_generic::GF128State,
+    platform::{AESState, GF128FieldElement},
+    DecryptError, NONCE_LEN, TAG_LEN,
+};
+
+/// Key length.
+pub(crate) const KEY_LEN: usize = 32;
+pub(crate) const GCM_KEY_LEN: usize = 16;
+
+/// The AES-GCM 256 state
+pub(crate) struct State<T: AESState, U: GF128FieldElement> {
+    pub(crate) aes_state: Aes256CtrContext<T>,
+    pub(crate) gcm_state: GF128State<U>,
+    pub(crate) tag_mix: [u8; TAG_LEN],
+}
+
+impl<T: AESState, U: GF128FieldElement> State<T, U> {
+    /// Initialize the state
+    pub(crate) fn init(key: &[u8]) -> Self {
+        debug_assert!(key.len() == KEY_LEN);
+
+        let nonce = [0u8; NONCE_LEN];
+        let mut gcm_key = [0u8; GCM_KEY_LEN];
+        let tag_mix = [0u8; TAG_LEN];
+
+        let aes_state = Aes256CtrContext::<T>::init(key, &nonce);
+        aes_state.key_block(0, &mut gcm_key);
+        let gcm_state = GF128State::init(&gcm_key);
+
+        Self {
+            aes_state,
+            gcm_state,
+            tag_mix,
+        }
+    }
+
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+
+        self.aes_state.set_nonce(nonce);
+        self.aes_state.key_block(1, &mut self.tag_mix);
+    }
+
+    pub(crate) fn encrypt(
+        &mut self,
+        aad: &[u8],
+        plaintext: &[u8],
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        debug_assert!(ciphertext.len() == plaintext.len());
+        debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        self.aes_state.update(2, plaintext, ciphertext);
+
+        self.gcm_state.update_padded(aad);
+        self.gcm_state.update_padded(ciphertext);
+
+        let mut last_block = [0u8; AES_BLOCK_LEN];
+        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+        self.gcm_state.update(&last_block);
+        self.gcm_state.emit(tag);
+
+        for i in 0..16 {
+            tag[i] ^= self.tag_mix[i];
+        }
+    }
+
+    pub(crate) fn decrypt(
+        &mut self,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        debug_assert!(plaintext.len() == ciphertext.len());
+        debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        self.gcm_state.update_padded(aad);
+        self.gcm_state.update_padded(ciphertext);
+
+        let mut last_block = [0u8; AES_BLOCK_LEN];
+        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+        self.gcm_state.update(&last_block);
+
+        let mut computed_tag = [0u8; TAG_LEN];
+        self.gcm_state.emit(&mut computed_tag);
+
+        for i in 0..16 {
+            computed_tag[i] ^= self.tag_mix[i];
+        }
+
+        let mut eq_mask = 0u8;
+        for i in 0..16 {
+            eq_mask |= computed_tag[i] ^ tag[i];
+        }
+
+        if eq_mask == 0 {
+            self.aes_state.update(2, ciphertext, plaintext);
+            Ok(())
+        } else {
+            Err(DecryptError())
+        }
+    }
+}
diff --git a/aesgcm/src/aes_generic.rs b/aesgcm/src/aes_generic.rs
index b1e925c99..f508d00a5 100644
--- a/aesgcm/src/aes_generic.rs
+++ b/aesgcm/src/aes_generic.rs
@@ -1,7 +1,3 @@
-#![allow(clippy::needless_range_loop)]
-
-use core::array::from_fn;
-
 use crate::platform::*;
 
 pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
@@ -14,82 +10,6 @@ pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 /// AES block size
 pub(crate) const AES_BLOCK_LEN: usize = 16;
 
-const AES128_NUM_KEYS: usize = 11;
-
-/// 128 - Key expansion
-pub(crate) fn aes128_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, AES128_NUM_KEYS> {
-    debug_assert!(key.len() == 16);
-
-    let mut keyex = from_fn(|_| T::new());
-    keyex[0].load_block(key);
-
-    macro_rules! expansion_step128 {
-        ($i:expr,$rcon:expr) => {
-            let prev = keyex[$i - 1].clone();
-            keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
-            keyex[$i].key_expansion_step(&prev);
-        };
-    }
-
-    expansion_step128!(1, 0x01);
-    expansion_step128!(2, 0x02);
-    expansion_step128!(3, 0x04);
-    expansion_step128!(4, 0x08);
-    expansion_step128!(5, 0x10);
-    expansion_step128!(6, 0x20);
-    expansion_step128!(7, 0x40);
-    expansion_step128!(8, 0x80);
-    expansion_step128!(9, 0x1b);
-    expansion_step128!(10, 0x36);
-
-    keyex
-}
-
-const AES256_NUM_KEYS: usize = 15;
-
-/// 256 - Key expansion
-/// TODO: use
-#[allow(dead_code)]
-pub(crate) fn aes256_key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, AES256_NUM_KEYS> {
-    debug_assert!(key.len() == 32);
-
-    let mut keyex = from_fn(|_| T::new());
-    keyex[0].load_block(&key[0..16]);
-    keyex[1].load_block(&key[16..32]);
-
-    macro_rules! expansion_step256 {
-        ($i:expr,$rcon:expr) => {
-            let prev0 = keyex[$i - 2].clone();
-            let prev1 = keyex[$i - 1].clone();
-            keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
-            keyex[$i].key_expansion_step(&prev0);
-
-            let next0 = keyex[$i].clone();
-            keyex[$i + 1].aes_keygen_assist1(&next0);
-            keyex[$i + 1].key_expansion_step(&prev1);
-        };
-    }
-
-    expansion_step256!(2, 0x01);
-    expansion_step256!(3, 0x01);
-    expansion_step256!(4, 0x02);
-    expansion_step256!(5, 0x02);
-    expansion_step256!(6, 0x04);
-    expansion_step256!(7, 0x04);
-    expansion_step256!(8, 0x08);
-    expansion_step256!(9, 0x08);
-    expansion_step256!(10, 0x10);
-    expansion_step256!(11, 0x10);
-    expansion_step256!(12, 0x20);
-    expansion_step256!(13, 0x20);
-
-    let prev0 = keyex[12].clone();
-    let prev1 = keyex[13].clone();
-    keyex[14].aes_keygen_assist0::<0x40>(&prev1);
-    keyex[14].key_expansion_step(&prev0);
-    keyex
-}
-
 pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
     st: &mut T,
     keyex: &ExtendedKey<T, NUM_KEYS>,
diff --git a/aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
index 59c456e93..8198473ba 100644
--- a/aesgcm/src/gf128_generic.rs
+++ b/aesgcm/src/gf128_generic.rs
@@ -1,10 +1,12 @@
-use crate::{aes_gcm_128::KEY_LEN, aes_generic::AES_BLOCK_LEN, platform::*};
+use crate::{aes_generic::AES_BLOCK_LEN, platform::*};
 
 pub(crate) struct GF128State<T: GF128FieldElement> {
     accumulator: T,
     r: T,
 }
 
+const KEY_LEN: usize = 16;
+
 impl<T: GF128FieldElement> GF128State<T> {
     pub(crate) fn init(key: &[u8]) -> Self {
         debug_assert!(key.len() == KEY_LEN);
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 63ddba4f4..bb61f8d59 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,7 +1,3 @@
-// XXX: make this conditional when cleaning up
-// #[cfg(feature = "rand")]
-use rand::CryptoRng;
-
 mod aes_ctr;
 mod aes_generic;
 mod gf128_generic;
@@ -10,7 +6,7 @@ mod platform;
 mod aes_gcm_128;
 mod aes_gcm_256;
 
-use libcrux_traits::aead::{Aead, Error};
+pub use libcrux_traits::aead::arrayref::Aead;
 
 /// AES-GCM decryption error.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -34,134 +30,162 @@ pub struct X64AesGcm128 {}
 #[cfg(not(target_arch = "x86_64"))]
 pub type X64AesGcm128 = PortableAesGcm128;
 
-impl Aead for AesGcm128 {
-    type Key = [u8; 16];
-    type Tag = [u8; 16];
-    type Nonce = [u8; 12];
+/// Tag length.
+pub(crate) const TAG_LEN: usize = 16;
 
-    // XXX: make this conditional when cleaning up
-    // #[cfg(feature = "rand")]
-    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
-        rng.fill_bytes(key);
-        Ok(())
-    }
+/// Nonce length.
+pub(crate) const NONCE_LEN: usize = 12;
 
-    fn encrypt(
-        ciphertext: &mut [u8],
-        tag: &mut Self::Tag,
-        key: &Self::Key,
-        nonce: &Self::Nonce,
-        aad: &[u8],
-        plaintext: &[u8],
-    ) -> Result<(), Error> {
-        if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-            NeonAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-        } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-            X64AesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-        } else {
-            PortableAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+mod aes128 {
+    use super::*;
+    use aes_gcm_128::KEY_LEN;
+    use libcrux_traits::aead::arrayref::{DecryptError, EncryptError};
+
+    pub type Key = [u8; KEY_LEN];
+    pub type Tag = [u8; TAG_LEN];
+    pub type Nonce = [u8; NONCE_LEN];
+
+    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for AesGcm128 {
+        fn encrypt(
+            ciphertext: &mut [u8],
+            tag: &mut Tag,
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            plaintext: &[u8],
+        ) -> Result<(), EncryptError> {
+            if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+                NeonAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+            } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                X64AesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+            } else {
+                PortableAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+            }
         }
-    }
 
-    fn decrypt(
-        plaintext: &mut [u8],
-        key: &Self::Key,
-        nonce: &Self::Nonce,
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &Self::Tag,
-    ) -> Result<(), Error> {
-        if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-            NeonAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-        } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-            X64AesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-        } else {
-            PortableAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+        fn decrypt(
+            plaintext: &mut [u8],
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            ciphertext: &[u8],
+            tag: &Tag,
+        ) -> Result<(), DecryptError> {
+            if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+                NeonAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+            } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                X64AesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+            } else {
+                PortableAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+            }
         }
     }
-}
 
-impl Aead for PortableAesGcm128 {
-    type Key = [u8; 16];
-    type Tag = [u8; 16];
-    type Nonce = [u8; 12];
+    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for PortableAesGcm128 {
+        fn encrypt(
+            ciphertext: &mut [u8],
+            tag: &mut Tag,
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            plaintext: &[u8],
+        ) -> Result<(), EncryptError> {
+            portable::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+            Ok(())
+        }
 
-    // XXX: make this conditional when cleaning up
-    // #[cfg(feature = "rand")]
-    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
-        rng.fill_bytes(key);
-        Ok(())
+        fn decrypt(
+            plaintext: &mut [u8],
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            ciphertext: &[u8],
+            tag: &Tag,
+        ) -> Result<(), DecryptError> {
+            portable::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                .map_err(|_| DecryptError::InvalidTag)
+        }
     }
 
-    fn encrypt(
-        ciphertext: &mut [u8],
-        tag: &mut Self::Tag,
-        key: &Self::Key,
-        nonce: &Self::Nonce,
-        aad: &[u8],
-        plaintext: &[u8],
-    ) -> Result<(), Error> {
-        portable::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-        Ok(())
-    }
+    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for NeonAesGcm128 {
+        fn encrypt(
+            ciphertext: &mut [u8],
+            tag: &mut Tag,
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            plaintext: &[u8],
+        ) -> Result<(), EncryptError> {
+            neon::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+            Ok(())
+        }
 
-    fn decrypt(
-        plaintext: &mut [u8],
-        key: &Self::Key,
-        nonce: &Self::Nonce,
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &Self::Tag,
-    ) -> Result<(), Error> {
-        portable::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-            .map_err(|_| Error::Decrypt)
+        fn decrypt(
+            plaintext: &mut [u8],
+            key: &Key,
+            nonce: &Nonce,
+            aad: &[u8],
+            ciphertext: &[u8],
+            tag: &Tag,
+        ) -> Result<(), DecryptError> {
+            neon::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                .map_err(|_| DecryptError::InvalidTag)
+        }
     }
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-impl Aead for NeonAesGcm128 {
-    type Key = [u8; 16];
-    type Tag = [u8; 16];
-    type Nonce = [u8; 12];
-
-    // XXX: make this conditional when cleaning up
-    // #[cfg(feature = "rand")]
-    fn key_gen(key: &mut Self::Key, rng: &mut impl CryptoRng) -> Result<(), Error> {
-        rng.fill_bytes(key);
-        Ok(())
-    }
+pub mod portable {
+    use crate::{
+        aes_gcm_128::{self},
+        aes_gcm_256::{self},
+        platform, DecryptError, NONCE_LEN, TAG_LEN,
+    };
 
-    fn encrypt(
-        ciphertext: &mut [u8],
-        tag: &mut Self::Tag,
-        key: &Self::Key,
-        nonce: &Self::Nonce,
+    // XXX: It doesn't really make sense to have these states. We should abstract
+    // this differently
+
+    type Aes128State =
+        aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>;
+
+    type Aes256State =
+        aes_gcm_256::State<platform::portable::State, platform::portable::FieldElement>;
+
+    pub fn aes128_gcm_encrypt(
+        key: &[u8],
+        nonce: &[u8],
         aad: &[u8],
         plaintext: &[u8],
-    ) -> Result<(), Error> {
-        neon::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-        Ok(())
+        ciphertext: &mut [u8],
+        tag: &mut [u8],
+    ) {
+        debug_assert!(key.len() == aes_gcm_128::KEY_LEN);
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        let mut st = Aes128State::init(key);
+        st.set_nonce(nonce);
+        st.encrypt(aad, plaintext, ciphertext, tag);
     }
 
-    fn decrypt(
-        plaintext: &mut [u8],
-        key: &Self::Key,
-        nonce: &Self::Nonce,
+    pub fn aes128_gcm_decrypt(
+        key: &[u8],
+        nonce: &[u8],
         aad: &[u8],
         ciphertext: &[u8],
-        tag: &Self::Tag,
-    ) -> Result<(), Error> {
-        neon::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-            .map_err(|_| Error::Decrypt)
-    }
-}
-
-pub mod portable {
-    use crate::{aes_gcm_128, platform, DecryptError};
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError> {
+        debug_assert!(key.len() == aes_gcm_128::KEY_LEN);
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(tag.len() == TAG_LEN);
 
-    type State = aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>;
+        let mut st = Aes128State::init(key);
+        st.set_nonce(nonce);
+        st.decrypt(aad, ciphertext, tag, plaintext)
+    }
 
-    pub fn aes128_gcm_encrypt(
+    pub fn aes256_gcm_encrypt(
         key: &[u8],
         nonce: &[u8],
         aad: &[u8],
@@ -169,12 +193,16 @@ pub mod portable {
         ciphertext: &mut [u8],
         tag: &mut [u8],
     ) {
-        let mut st = State::init(key);
+        debug_assert!(key.len() == aes_gcm_256::KEY_LEN);
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        let mut st = Aes256State::init(key);
         st.set_nonce(nonce);
         st.encrypt(aad, plaintext, ciphertext, tag);
     }
 
-    pub fn aes128_gcm_decrypt(
+    pub fn aes256_gcm_decrypt(
         key: &[u8],
         nonce: &[u8],
         aad: &[u8],
@@ -182,7 +210,11 @@ pub mod portable {
         tag: &[u8],
         plaintext: &mut [u8],
     ) -> Result<(), DecryptError> {
-        let mut st = State::init(key);
+        debug_assert!(key.len() == aes_gcm_256::KEY_LEN);
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(tag.len() == TAG_LEN);
+
+        let mut st = Aes256State::init(key);
         st.set_nonce(nonce);
         st.decrypt(aad, ciphertext, tag, plaintext)
     }
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
index c249626be..29d2d4b97 100644
--- a/aesgcm/src/platform.rs
+++ b/aesgcm/src/platform.rs
@@ -6,7 +6,7 @@ pub mod neon;
 #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
 pub mod intel_ni;
 
-pub trait AESState: Clone {
+pub trait AESState: Clone + core::fmt::Debug {
     fn new() -> Self;
     fn load_block(&mut self, b: &[u8]);
     fn store_block(&self, out: &mut [u8]);
diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
index 65bb6cc1e..a8887b716 100644
--- a/aesgcm/src/platform/portable/aes_core.rs
+++ b/aesgcm/src/platform/portable/aes_core.rs
@@ -8,54 +8,62 @@ fn new_state() -> State {
     [0u16; 8]
 }
 
-#[inline(always)]
+#[inline]
 fn interleave_u8_1(i0: u8, i1: u8) -> u16 {
     let mut x = i0 as u16;
+
     x = (x | (x << 4)) & 0x0F0F;
     x = (x | (x << 2)) & 0x3333;
     x = (x | (x << 1)) & 0x5555;
+
     let mut y = i1 as u16;
+
     y = (y | (y << 4)) & 0x0F0F;
     y = (y | (y << 2)) & 0x3333;
     y = (y | (y << 1)) & 0x5555;
+
     x | (y << 1)
 }
 
-#[inline(always)]
+#[inline]
 fn deinterleave_u8_1(i0: u16) -> (u8, u8) {
     let mut x = i0 & 0x5555;
+
     x = (x | (x >> 1)) & 0x3333;
     x = (x | (x >> 2)) & 0x0F0F;
     x = (x | (x >> 4)) & 0x00FF;
+
     let mut y = (i0 >> 1) & 0x5555;
+
     y = (y | (y >> 1)) & 0x3333;
     y = (y | (y >> 2)) & 0x0F0F;
     y = (y | (y >> 4)) & 0x00FF;
+
     (x as u8, y as u8)
 }
 
-#[inline(always)]
+#[inline]
 fn interleave_u16_2(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333);
     let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc);
     (x, y)
 }
 
-#[inline(always)]
+#[inline]
 fn interleave_u16_4(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F);
     let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0);
     (x, y)
 }
 
-#[inline(always)]
+#[inline]
 fn interleave_u16_8(i0: u16, i1: u16) -> (u16, u16) {
     let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF);
     let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00);
     (x, y)
 }
 
-#[inline(always)]
+#[inline]
 fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
     let o0 = interleave_u8_1(input[0], input[1]);
     let o1 = interleave_u8_1(input[2], input[3]);
@@ -65,18 +73,22 @@ fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
     let o5 = interleave_u8_1(input[10], input[11]);
     let o6 = interleave_u8_1(input[12], input[13]);
     let o7 = interleave_u8_1(input[14], input[15]);
+
     let (o0, o1) = interleave_u16_2(o0, o1);
     let (o2, o3) = interleave_u16_2(o2, o3);
     let (o4, o5) = interleave_u16_2(o4, o5);
     let (o6, o7) = interleave_u16_2(o6, o7);
+
     let (o0, o2) = interleave_u16_4(o0, o2);
     let (o1, o3) = interleave_u16_4(o1, o3);
     let (o4, o6) = interleave_u16_4(o4, o6);
     let (o5, o7) = interleave_u16_4(o5, o7);
+
     let (o0, o4) = interleave_u16_8(o0, o4);
     let (o1, o5) = interleave_u16_8(o1, o5);
     let (o2, o6) = interleave_u16_8(o2, o6);
     let (o3, o7) = interleave_u16_8(o3, o7);
+
     output[0] = o0;
     output[1] = o1;
     output[2] = o2;
@@ -87,16 +99,18 @@ fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) {
     output[7] = o7;
 }
 
-#[inline(always)]
+#[inline]
 fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
     let (i0, i4) = interleave_u16_8(input[0], input[4]);
     let (i1, i5) = interleave_u16_8(input[1], input[5]);
     let (i2, i6) = interleave_u16_8(input[2], input[6]);
     let (i3, i7) = interleave_u16_8(input[3], input[7]);
+
     let (i0, i2) = interleave_u16_4(i0, i2);
     let (i1, i3) = interleave_u16_4(i1, i3);
     let (i4, i6) = interleave_u16_4(i4, i6);
     let (i5, i7) = interleave_u16_4(i5, i7);
+
     let (i0, i1) = interleave_u16_2(i0, i1);
     let (i2, i3) = interleave_u16_2(i2, i3);
     let (i4, i5) = interleave_u16_2(i4, i5);
@@ -107,6 +121,7 @@ fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
     let (o4, o5) = deinterleave_u8_1(i2);
     let (o6, o7) = deinterleave_u8_1(i3);
     let (o8, o9) = deinterleave_u8_1(i4);
+
     let (o10, o11) = deinterleave_u8_1(i5);
     let (o12, o13) = deinterleave_u8_1(i6);
     let (o14, o15) = deinterleave_u8_1(i7);
@@ -129,159 +144,163 @@ fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) {
     output[15] = o15;
 }
 
-#[inline(always)]
+#[inline]
 fn xnor(a: u16, b: u16) -> u16 {
     !(a ^ b)
 }
 
-#[allow(non_snake_case)]
 fn sub_bytes_state(st: &mut State) {
-    let U0 = st[7];
-    let U1 = st[6];
-    let U2 = st[5];
-    let U3 = st[4];
-    let U4 = st[3];
-    let U5 = st[2];
-    let U6 = st[1];
-    let U7 = st[0];
-
-    let T1 = U6 ^ U4;
-    let T2 = U3 ^ U0;
-    let T3 = U1 ^ U2;
-    let T4 = U7 ^ T3;
-    let T5 = T1 ^ T2;
-    let T6 = U1 ^ U5;
-    let T7 = U0 ^ U6;
-    let T8 = T1 ^ T6;
-    let T9 = U6 ^ T4;
-    let T10 = U3 ^ T4;
-    let T11 = U7 ^ T5;
-    let T12 = T5 ^ T6;
-    let T13 = U2 ^ U5;
-    let T14 = T3 ^ T5;
-    let T15 = U5 ^ T7;
-    let T16 = U0 ^ U5;
-    let T17 = U7 ^ T8;
-    let T18 = U6 ^ U5;
-    let T19 = T2 ^ T18;
-    let T20 = T4 ^ T15;
-    let T21 = T1 ^ T13;
-    let T22 = U0 ^ T4;
-    let T39 = T21 ^ T5;
-    let T40 = T21 ^ T7;
-    let T41 = T7 ^ T19;
-    let T42 = T16 ^ T14;
-    let T43 = T22 ^ T17;
-    let T44 = T19 & T5;
-    let T45 = T20 & T11;
-    let T46 = T12 ^ T44;
-    let T47 = T10 & U7;
-    let T48 = T47 ^ T44;
-    let T49 = T7 & T21;
-    let T50 = T9 & T4;
-    let T51 = T40 ^ T49;
-    let T52 = T22 & T17;
-    let T53 = T52 ^ T49;
-    let T54 = T2 & T8;
-    let T55 = T41 & T39;
-    let T56 = T55 ^ T54;
-    let T57 = T16 & T14;
-    let T58 = T57 ^ T54;
-    let T59 = T46 ^ T45;
-    let T60 = T48 ^ T42;
-    let T61 = T51 ^ T50;
-    let T62 = T53 ^ T58;
-    let T63 = T59 ^ T56;
-    let T64 = T60 ^ T58;
-    let T65 = T61 ^ T56;
-    let T66 = T62 ^ T43;
-    let T67 = T65 ^ T66;
-    let T68 = T65 & T63;
-    let T69 = T64 ^ T68;
-    let T70 = T63 ^ T64;
-    let T71 = T66 ^ T68;
-    let T72 = T71 & T70;
-    let T73 = T69 & T67;
-    let T74 = T63 & T66;
-    let T75 = T70 & T74;
-    let T76 = T70 ^ T68;
-    let T77 = T64 & T65;
-    let T78 = T67 & T77;
-    let T79 = T67 ^ T68;
-    let T80 = T64 ^ T72;
-    let T81 = T75 ^ T76;
-    let T82 = T66 ^ T73;
-    let T83 = T78 ^ T79;
-    let T84 = T81 ^ T83;
-    let T85 = T80 ^ T82;
-    let T86 = T80 ^ T81;
-    let T87 = T82 ^ T83;
-    let T88 = T85 ^ T84;
-    let T89 = T87 & T5;
-    let T90 = T83 & T11;
-    let T91 = T82 & U7;
-    let T92 = T86 & T21;
-    let T93 = T81 & T4;
-    let T94 = T80 & T17;
-    let T95 = T85 & T8;
-    let T96 = T88 & T39;
-    let T97 = T84 & T14;
-    let T98 = T87 & T19;
-    let T99 = T83 & T20;
-    let T100 = T82 & T10;
-    let T101 = T86 & T7;
-    let T102 = T81 & T9;
-    let T103 = T80 & T22;
-    let T104 = T85 & T2;
-    let T105 = T88 & T41;
-    let T106 = T84 & T16;
-    let T107 = T104 ^ T105;
-    let T108 = T93 ^ T99;
-    let T109 = T96 ^ T107;
-    let T110 = T98 ^ T108;
-    let T111 = T91 ^ T101;
-    let T112 = T89 ^ T92;
-    let T113 = T107 ^ T112;
-    let T114 = T90 ^ T110;
-    let T115 = T89 ^ T95;
-    let T116 = T94 ^ T102;
-    let T117 = T97 ^ T103;
-    let T118 = T91 ^ T114;
-    let T119 = T111 ^ T117;
-    let T120 = T100 ^ T108;
-    let T121 = T92 ^ T95;
-    let T122 = T110 ^ T121;
-    let T123 = T106 ^ T119;
-    let T124 = T104 ^ T115;
-    let T125 = T111 ^ T116;
-    let S0 = T109 ^ T122;
-    let S2 = xnor(T123, T124);
-    let T128 = T94 ^ T107;
-    let S3 = T113 ^ T114;
-    let S4 = T118 ^ T128;
-    let T131 = T93 ^ T101;
-    let T132 = T112 ^ T120;
-    let S7 = xnor(T113, T125);
-    let T134 = T97 ^ T116;
-    let T135 = T131 ^ T134;
-    let T136 = T93 ^ T115;
-    let S6 = xnor(T109, T135);
-    let T138 = T119 ^ T132;
-    let S5 = T109 ^ T138;
-    let T140 = T114 ^ T136;
-    let S1 = xnor(T109, T140);
-
-    st[0] = S7;
-    st[1] = S6;
-    st[2] = S5;
-    st[3] = S4;
-    st[4] = S3;
-    st[5] = S2;
-    st[6] = S1;
-    st[7] = S0;
+    let u0 = st[7];
+    let u1 = st[6];
+    let u2 = st[5];
+    let u3 = st[4];
+    let u4 = st[3];
+    let u5 = st[2];
+    let u6 = st[1];
+    let u7 = st[0];
+
+    let t1 = u6 ^ u4;
+    let t2 = u3 ^ u0;
+    let t3 = u1 ^ u2;
+    let t4 = u7 ^ t3;
+    let t5 = t1 ^ t2;
+    let t6 = u1 ^ u5;
+    let t7 = u0 ^ u6;
+    let t8 = t1 ^ t6;
+    let t9 = u6 ^ t4;
+    let t10 = u3 ^ t4;
+    let t11 = u7 ^ t5;
+    let t12 = t5 ^ t6;
+    let t13 = u2 ^ u5;
+    let t14 = t3 ^ t5;
+    let t15 = u5 ^ t7;
+    let t16 = u0 ^ u5;
+    let t17 = u7 ^ t8;
+    let t18 = u6 ^ u5;
+    let t19 = t2 ^ t18;
+    let t20 = t4 ^ t15;
+    let t21 = t1 ^ t13;
+    let t22 = u0 ^ t4;
+    let t39 = t21 ^ t5;
+    let t40 = t21 ^ t7;
+    let t41 = t7 ^ t19;
+    let t42 = t16 ^ t14;
+    let t43 = t22 ^ t17;
+    let t44 = t19 & t5;
+    let t45 = t20 & t11;
+    let t46 = t12 ^ t44;
+    let t47 = t10 & u7;
+    let t48 = t47 ^ t44;
+    let t49 = t7 & t21;
+    let t50 = t9 & t4;
+    let t51 = t40 ^ t49;
+    let t52 = t22 & t17;
+    let t53 = t52 ^ t49;
+    let t54 = t2 & t8;
+    let t55 = t41 & t39;
+    let t56 = t55 ^ t54;
+    let t57 = t16 & t14;
+    let t58 = t57 ^ t54;
+    let t59 = t46 ^ t45;
+    let t60 = t48 ^ t42;
+    let t61 = t51 ^ t50;
+    let t62 = t53 ^ t58;
+    let t63 = t59 ^ t56;
+    let t64 = t60 ^ t58;
+    let t65 = t61 ^ t56;
+    let t66 = t62 ^ t43;
+    let t67 = t65 ^ t66;
+    let t68 = t65 & t63;
+    let t69 = t64 ^ t68;
+    let t70 = t63 ^ t64;
+    let t71 = t66 ^ t68;
+    let t72 = t71 & t70;
+    let t73 = t69 & t67;
+    let t74 = t63 & t66;
+    let t75 = t70 & t74;
+    let t76 = t70 ^ t68;
+    let t77 = t64 & t65;
+    let t78 = t67 & t77;
+    let t79 = t67 ^ t68;
+    let t80 = t64 ^ t72;
+    let t81 = t75 ^ t76;
+    let t82 = t66 ^ t73;
+    let t83 = t78 ^ t79;
+    let t84 = t81 ^ t83;
+    let t85 = t80 ^ t82;
+    let t86 = t80 ^ t81;
+    let t87 = t82 ^ t83;
+    let t88 = t85 ^ t84;
+    let t89 = t87 & t5;
+    let t90 = t83 & t11;
+    let t91 = t82 & u7;
+    let t92 = t86 & t21;
+    let t93 = t81 & t4;
+    let t94 = t80 & t17;
+    let t95 = t85 & t8;
+    let t96 = t88 & t39;
+    let t97 = t84 & t14;
+    let t98 = t87 & t19;
+    let t99 = t83 & t20;
+    let t100 = t82 & t10;
+    let t101 = t86 & t7;
+    let t102 = t81 & t9;
+    let t103 = t80 & t22;
+    let t104 = t85 & t2;
+    let t105 = t88 & t41;
+    let t106 = t84 & t16;
+    let t107 = t104 ^ t105;
+    let t108 = t93 ^ t99;
+    let t109 = t96 ^ t107;
+    let t110 = t98 ^ t108;
+    let t111 = t91 ^ t101;
+    let t112 = t89 ^ t92;
+    let t113 = t107 ^ t112;
+    let t114 = t90 ^ t110;
+    let t115 = t89 ^ t95;
+    let t116 = t94 ^ t102;
+    let t117 = t97 ^ t103;
+    let t118 = t91 ^ t114;
+    let t119 = t111 ^ t117;
+    let t120 = t100 ^ t108;
+    let t121 = t92 ^ t95;
+    let t122 = t110 ^ t121;
+    let t123 = t106 ^ t119;
+    let t124 = t104 ^ t115;
+    let t125 = t111 ^ t116;
+
+    let t128 = t94 ^ t107;
+
+    let t131 = t93 ^ t101;
+    let t132 = t112 ^ t120;
+
+    let t134 = t97 ^ t116;
+    let t135 = t131 ^ t134;
+    let t136 = t93 ^ t115;
+
+    let t138 = t119 ^ t132;
+    let t140 = t114 ^ t136;
+
+    let s0 = t109 ^ t122;
+    let s2 = xnor(t123, t124);
+    let s3 = t113 ^ t114;
+    let s4 = t118 ^ t128;
+    let s7 = xnor(t113, t125);
+    let s6 = xnor(t109, t135);
+    let s5 = t109 ^ t138;
+    let s1 = xnor(t109, t140);
+
+    st[0] = s7;
+    st[1] = s6;
+    st[2] = s5;
+    st[3] = s4;
+    st[4] = s3;
+    st[5] = s2;
+    st[6] = s1;
+    st[7] = s0;
 }
 
-#[inline(always)]
+#[inline]
 fn shift_row_u16(input: u16) -> u16 {
     (input & 0x1111)
         | ((input & 0x2220) >> 4)
@@ -303,21 +322,23 @@ fn shift_rows_state(st: &mut State) {
     st[7] = shift_row_u16(st[7]);
 }
 
-#[inline(always)]
+#[inline]
 fn mix_columns_state(st: &mut State) {
     let mut last_col: u16 = 0;
+
     for i in 0..8 {
         let col = st[i] ^ (((st[i] & 0xeeee) >> 1) | ((st[i] & 0x1111) << 3));
         st[i] = st[i] ^ last_col ^ col ^ (((col & 0xcccc) >> 2) | ((col & 0x3333) << 2));
         last_col = col;
     }
+
     st[0] ^= last_col;
     st[1] ^= last_col;
     st[3] ^= last_col;
     st[4] ^= last_col;
 }
 
-#[inline(always)]
+#[inline]
 fn xor_key1_state(st: &mut State, k: &State) {
     st[0] ^= k[0];
     st[1] ^= k[1];
@@ -342,7 +363,7 @@ fn aes_enc_last(st: &mut State, key: &State) {
     xor_key1_state(st, key)
 }
 
-#[inline(always)]
+#[inline]
 fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 {
     let u3 = u & 0xf000;
     let n = u3 >> 12;
@@ -356,6 +377,7 @@ fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 {
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     next.copy_from_slice(prev);
     sub_bytes_state(next);
+
     next[0] = aes_keygen_assisti(rcon, 0, next[0]);
     next[1] = aes_keygen_assisti(rcon, 1, next[1]);
     next[2] = aes_keygen_assisti(rcon, 2, next[2]);
@@ -369,7 +391,7 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
 
-    #[inline(always)]
+    #[inline]
     fn aux(mut n: u16) -> u16 {
         n &= 0xf000;
         n ^= n >> 4;
@@ -390,7 +412,7 @@ fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
 
-    #[inline(always)]
+    #[inline]
     fn aux(mut n: u16) -> u16 {
         n &= 0x0f00;
         n ^= n << 4;
@@ -408,7 +430,7 @@ fn aes_keygen_assist1(next: &mut State, prev: &State) {
     next[7] = aux(next[7]);
 }
 
-#[inline(always)]
+#[inline]
 fn key_expand1(p: u16, n: u16) -> u16 {
     let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8) ^ ((p & 0x000f) << 12);
     n ^ p
@@ -437,12 +459,12 @@ impl crate::platform::AESState for State {
     }
 
     fn store_block(&self, out: &mut [u8]) {
-        debug_assert!(out.len() == AES_BLOCK_LEN);
+        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
 
         transpose_u16x8(self, out);
     }
 
-    #[inline(always)]
+    #[inline]
     fn xor_block(&self, input: &[u8], out: &mut [u8]) {
         debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
 
diff --git a/aesgcm/test.py b/aesgcm/test.py
new file mode 100644
index 000000000..f098e251b
--- /dev/null
+++ b/aesgcm/test.py
@@ -0,0 +1,455 @@
+import os
+# from cryptography.hazmat.primitives.ciphers import (
+#     Cipher, algorithms, modes
+# )
+# from cryptography.hazmat.backends import default_backend
+# from cryptography.exceptions import InvalidTag
+
+# --- AES Key Schedule Generation (for demonstration) ---
+
+# S-box: The substitution table for AES
+_S_BOX = (
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
+)
+
+# Rcon: The round constant word array
+_R_CON = (
+    0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
+)
+
+def _generate_and_print_key_schedule(key: bytes):
+    """
+    Generates and prints the AES-256 key schedule from a 32-byte key.
+    This is for educational purposes to show the key expansion process.
+    """
+    print("--- AES-256 Key Schedule Expansion ---")
+    
+    # AES-256 constants
+    Nk = 8  # Number of 32-bit words in the key
+    Nr = 14 # Number of rounds
+    
+    # The expanded key schedule will hold 4 * (14 + 1) = 60 words
+    w = [0] * (4 * (Nr + 1))
+
+    # The first Nk words are the original key
+    for i in range(Nk):
+        w[i] = int.from_bytes(key[i*4 : i*4+4], 'big')
+
+    # Generate the rest of the words for the schedule
+    for i in range(Nk, len(w)):
+        temp = w[i - 1]
+        print(f"{i}: {i % Nk == 0} | {i % Nk ==4}")
+        print(f"{temp:08x}")
+        if i % Nk == 0:
+            print(f"{i} % Nk == 0")
+            # Rotate the word
+            temp = ((temp << 8) & 0xffffffff) | (temp >> 24)
+            # Apply S-box to each byte
+            temp = (_S_BOX[(temp >> 24) & 0xff] << 24) | \
+                   (_S_BOX[(temp >> 16) & 0xff] << 16) | \
+                   (_S_BOX[(temp >>  8) & 0xff] <<  8) | \
+                   (_S_BOX[ temp        & 0xff])
+            # XOR with the round constant
+            temp ^= (_R_CON[i // Nk] << 24)
+        elif i % Nk == 4: # Extra S-box substitution for AES-256
+            print(f"{i} % Nk == 4")
+            temp = (_S_BOX[(temp >> 24) & 0xff] << 24) | \
+                   (_S_BOX[(temp >> 16) & 0xff] << 16) | \
+                   (_S_BOX[(temp >>  8) & 0xff] <<  8) | \
+                   (_S_BOX[ temp        & 0xff])
+            print(f"{temp:08x}")
+        
+        w[i] = w[i - Nk] ^ temp
+
+    # Print the round keys
+    for r in range(Nr + 1):
+        round_key_words = w[r*4 : r*4+4]
+        round_key_hex = "".join([f'{word:08x}' for word in round_key_words])
+        print(f"Round {r:2d} Key: {round_key_hex}")
+    print("------------------------------------")
+
+
+# class AES_GCM_256:
+#     """
+#     A toy implementation of AES-GCM-256 encryption and decryption.
+
+#     This class demonstrates the core components of an AES-GCM authenticated
+#     encryption scheme. It is for educational purposes and uses the `cryptography`
+#     library for the underlying cryptographic operations.
+#     """
+
+#     def __init__(self, key: bytes):
+#         """
+#         Initializes the cipher with a 256-bit (32-byte) key.
+
+#         Args:
+#             key: A 32-byte key.
+
+#         Raises:
+#             ValueError: If the key is not 32 bytes long.
+#         """
+#         if len(key) != 32:
+#             raise ValueError("Key must be 256 bits (32 bytes) for AES-GCM-256.")
+#         self.key = key
+#         self.backend = default_backend()
+
+#     @staticmethod
+#     def generate_key() -> bytes:
+#         """
+#         Generates a random 256-bit (32-byte) key suitable for AES-GCM-256.
+
+#         Returns:
+#             A 32-byte key.
+#         """
+#         return os.urandom(32)
+
+#     def encrypt(self, plaintext: bytes, associated_data: bytes, nonce: bytes) -> tuple[bytes, bytes]:
+#         """
+#         Encrypts plaintext and authenticates associated data using AES-GCM.
+
+#         Args:
+#             plaintext: The data to encrypt.
+#             associated_data: Additional data to authenticate but not encrypt.
+#             nonce: A 12-byte (96-bit) nonce. Should be unique for each encryption
+#                    with the same key.
+
+#         Returns:
+#             A tuple containing the ciphertext and the authentication tag.
+        
+#         Raises:
+#             ValueError: If the nonce is not 12 bytes long.
+#         """
+#         if len(nonce) != 12:
+#             raise ValueError("Nonce must be 96 bits (12 bytes) for AES-GCM.")
+
+#         # Create an AES-GCM cipher object
+#         cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce), backend=self.backend)
+#         encryptor = cipher.encryptor()
+
+#         # Add the associated data. This data is authenticated but not encrypted.
+#         encryptor.authenticate_additional_data(associated_data)
+
+#         # Encrypt the plaintext
+#         ciphertext = encryptor.update(plaintext) + encryptor.finalize()
+
+#         # The authentication tag is generated automatically and is available after finalization.
+#         tag = encryptor.tag
+
+#         return ciphertext, tag
+
+#     def decrypt(self, ciphertext: bytes, associated_data: bytes, nonce: bytes, tag: bytes) -> bytes:
+#         """
+#         Decrypts ciphertext and verifies the authentication tag using AES-GCM.
+
+#         Args:
+#             ciphertext: The encrypted data.
+#             associated_data: The associated data that was authenticated.
+#             nonce: The 12-byte nonce used during encryption.
+#             tag: The 16-byte authentication tag generated during encryption.
+
+#         Returns:
+#             The original plaintext if decryption and authentication are successful.
+
+#         Raises:
+#             cryptography.exceptions.InvalidTag: If the authentication fails.
+#             ValueError: If the nonce is not 12 bytes long.
+#         """
+#         if len(nonce) != 12:
+#             raise ValueError("Nonce must be 96 bits (12 bytes) for AES-GCM.")
+
+#         # Create an AES-GCM cipher object with the nonce and tag
+#         cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce, tag), backend=self.backend)
+#         decryptor = cipher.decryptor()
+
+#         # Add the associated data for authentication verification.
+#         decryptor.authenticate_additional_data(associated_data)
+
+#         # Decrypt the ciphertext.
+#         # An InvalidTag exception will be raised if the tag does not match.
+#         try:
+#             plaintext = decryptor.update(ciphertext) + decryptor.finalize()
+#             return plaintext
+#         except InvalidTag as e:
+#             # Re-raising with a more informative message can be helpful.
+#             print(f"Decryption failed: Invalid authentication tag.")
+#             raise
+#         except Exception as e:
+#             print(f"An unexpected error occurred during decryption: {e}")
+#             raise
+
+def run_tests():
+    """
+    A suite of tests to verify the AES_GCM_256 implementation.
+    """
+    print("--- Running AES-GCM-256 Tests ---")
+
+    # 1. Generate a key
+    key = "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
+    key = bytes.fromhex(key)
+    # aes_cipher = AES_GCM_256(key)
+    print(f"Generated Key (hex): {key.hex()}")
+    _generate_and_print_key_schedule(key)
+
+
+    # # 2. Define test data
+    # plaintext = b"This is a secret message that needs to be encrypted."
+    # associated_data = b"This is metadata that is authenticated but not secret."
+    # # A nonce should be unique for every encryption with the same key.
+    # # For this test, we'll generate a random 12-byte nonce.
+    # nonce = os.urandom(12)
+    
+    # print(f"Original Plaintext: {plaintext.decode()}")
+    # print(f"Associated Data: {associated_data.decode()}")
+    # print(f"Nonce (hex): {nonce.hex()}")
+
+    # # 3. Test successful encryption and decryption
+    # print("\n--- Test 1: Successful Encryption/Decryption ---")
+    # try:
+    #     ciphertext, tag = aes_cipher.encrypt(plaintext, associated_data, nonce)
+    #     print(f"Ciphertext (hex): {ciphertext.hex()}")
+    #     print(f"Authentication Tag (hex): {tag.hex()}")
+
+    #     decrypted_plaintext = aes_cipher.decrypt(ciphertext, associated_data, nonce, tag)
+    #     print(f"Decrypted Plaintext: {decrypted_plaintext.decode()}")
+
+    #     assert plaintext == decrypted_plaintext
+    #     print("SUCCESS: Decrypted plaintext matches original plaintext.")
+    # except Exception as e:
+    #     print(f"FAILURE: An unexpected error occurred: {e}")
+
+
+    # # 4. Test failure: incorrect tag
+    # print("\n--- Test 2: Decryption with Incorrect Tag ---")
+    # try:
+    #     invalid_tag = os.urandom(16) # A random, incorrect tag
+    #     print(f"Using incorrect tag (hex): {invalid_tag.hex()}")
+    #     aes_cipher.decrypt(ciphertext, associated_data, nonce, invalid_tag)
+    #     # The line above should raise an exception, so we should not reach here.
+    #     print("FAILURE: Decryption succeeded with an invalid tag.")
+    # except InvalidTag:
+    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
+    # except Exception as e:
+    #     print(f"FAILURE: An unexpected error occurred: {e}")
+
+    # # 5. Test failure: modified ciphertext
+    # print("\n--- Test 3: Decryption with Modified Ciphertext ---")
+    # try:
+    #     # Tamper with the ciphertext (flip the first byte)
+    #     modified_ciphertext = bytes([ciphertext[0] ^ 0xFF]) + ciphertext[1:]
+    #     print(f"Using modified ciphertext (hex): {modified_ciphertext.hex()}")
+    #     aes_cipher.decrypt(modified_ciphertext, associated_data, nonce, tag)
+    #     print("FAILURE: Decryption succeeded with modified ciphertext.")
+    # except InvalidTag:
+    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
+    # except Exception as e:
+    #     print(f"FAILURE: An unexpected error occurred: {e}")
+
+    # # 6. Test failure: modified associated data
+    # print("\n--- Test 4: Decryption with Modified Associated Data ---")
+    # try:
+    #     modified_ad = b"This is incorrect metadata."
+    #     print(f"Using modified AAD: {modified_ad.decode()}")
+    #     aes_cipher.decrypt(ciphertext, modified_ad, nonce, tag)
+    #     print("FAILURE: Decryption succeeded with modified associated data.")
+    # except InvalidTag:
+    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
+    # except Exception as e:
+    #     print(f"FAILURE: An unexpected error occurred: {e}")
+
+    # # 7. Test with user-provided specific vector
+    # print("\n--- Test 5: Specific Vector Test Case ---")
+    # try:
+    #     key_hex = "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
+    #     aad_hex = "00000000ffffffff"
+    #     plaintext_hex = "00010203040506070809"
+    #     # A fixed nonce is used for this specific test vector.
+    #     nonce_hex = "00112233445566778899aabb"
+
+    #     key_vec = bytes.fromhex(key_hex)
+    #     aad_vec = bytes.fromhex(aad_hex)
+    #     plaintext_vec = bytes.fromhex(plaintext_hex)
+    #     nonce_vec = bytes.fromhex(nonce_hex)
+
+    #     print(f"Using Key (hex): {key_vec.hex()}")
+    #     _generate_and_print_key_schedule(key_vec)
+    #     print(f"Using AAD (hex): {aad_vec.hex()}")
+    #     print(f"Using Plaintext (hex): {plaintext_vec.hex()}")
+    #     print(f"Using Nonce (hex): {nonce_vec.hex()}")
+
+    #     specific_cipher = AES_GCM_256(key_vec)
+    #     ciphertext_vec, tag_vec = specific_cipher.encrypt(plaintext_vec, aad_vec, nonce_vec)
+
+    #     print(f"Resulting Ciphertext (hex): {ciphertext_vec.hex()}")
+    #     print(f"Resulting Tag (hex): {tag_vec.hex()}")
+
+    #     decrypted_plaintext_vec = specific_cipher.decrypt(ciphertext_vec, aad_vec, nonce_vec, tag_vec)
+    #     print(f"Decrypted Plaintext (hex): {decrypted_plaintext_vec.hex()}")
+
+    #     assert plaintext_vec == decrypted_plaintext_vec
+    #     print("SUCCESS: Specific vector test passed. Decrypted plaintext matches original.")
+
+    # except Exception as e:
+    #     print(f"FAILURE: An unexpected error occurred in the specific vector test: {e}")
+        
+    print("\n--- All tests completed. ---")
+
+
+if __name__ == "__main__":
+    run_tests()
+
+# --- Running AES-GCM-256 Tests ---
+# Generated Key (hex): 92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20
+# --- AES-256 Key Schedule Expansion ---
+# 8: True | False
+# 7f123c20
+# 8 % Nk == 0
+# 9: False | False
+# 5a475431
+# 10: False | False
+# 128ad621
+# 11: False | False
+# 8047443b
+# 12: False | True
+# 2313274f
+# 12 % Nk == 4
+# 267dcc84
+# 13: False | False
+# 0fe778e6
+# 14: False | False
+# 068e6324
+# 15: False | False
+# 8d0931f5
+# 16: True | False
+# f21b0dd5
+# 16 % Nk == 0
+# 17: False | False
+# f79057b8
+# 18: False | False
+# e51a8199
+# 19: False | False
+# 655dc5a2
+# 20: False | True
+# 464ee2ed
+# 20 % Nk == 4
+# 5a2f9855
+# 21: False | False
+# 55c8e0b3
+# 22: False | False
+# 53468397
+# 23: False | False
+# de4fb262
+# 24: True | False
+# 2c54bfb7
+# 24 % Nk == 0
+# 25: False | False
+# d398fec9
+# 26: False | False
+# 36827f50
+# 27: False | False
+# 53dfbaf2
+# 28: False | True
+# 1591581f
+# 28 % Nk == 4
+# 59816ac0
+# 29: False | False
+# 0c498a73
+# 30: False | False
+# 5f0f09e4
+# 31: False | False
+# 8140bb86
+# 32: True | False
+# ad140431
+# 32 % Nk == 0
+# 33: False | False
+# 216a395c
+# 34: False | False
+# 17e8460c
+# 35: False | False
+# 4437fcfe
+# 36: False | True
+# 51a6a4e1
+# 36 % Nk == 4
+# d12449f8
+# 37: False | False
+# dd6dc38b
+# 38: False | False
+# 8262ca6f
+# 39: False | False
+# 032271e9
+# 40: True | False
+# ae3675d8
+# 40 % Nk == 0
+# 41: False | False
+# 34f758b8
+# 42: False | False
+# 231f1eb4
+# 43: False | False
+# 6728e24a
+# 44: False | True
+# 368e46ab
+# 44 % Nk == 4
+# 05195a62
+# 45: False | False
+# d87499e9
+# 46: False | False
+# 5a165386
+# 47: False | False
+# 5934226f
+# 48: True | False
+# f70257b7
+# 48 % Nk == 0
+# 49: False | False
+# 63acf1d0
+# 50: False | False
+# 40b3ef64
+# 51: False | False
+# 279b0d2e
+# 52: False | True
+# 11154b85
+# 52 % Nk == 4
+# 8259b397
+# 53: False | False
+# 5a2d2a7e
+# 54: False | False
+# 003b79f8
+# 55: False | False
+# 590f5b97
+# 56: True | False
+# ae0d0c20
+# 56 % Nk == 0
+# 57: False | False
+# f4524634
+# 58: False | False
+# b4e1a950
+# 59: False | False
+# 937aa47e
+# Round  0 Key: 92ace3e348cd821092cd921aa3546374
+# Round  1 Key: 299ab46209691bc28b8752d17f123c20
+# Round  2 Key: 5a475431128ad6218047443b2313274f
+# Round  3 Key: 0fe778e6068e63248d0931f5f21b0dd5
+# Round  4 Key: f79057b8e51a8199655dc5a2464ee2ed
+# Round  5 Key: 55c8e0b353468397de4fb2622c54bfb7
+# Round  6 Key: d398fec936827f5053dfbaf21591581f
+# Round  7 Key: 0c498a735f0f09e48140bb86ad140431
+# Round  8 Key: 216a395c17e8460c4437fcfe51a6a4e1
+# Round  9 Key: dd6dc38b8262ca6f032271e9ae3675d8
+# Round 10 Key: 34f758b8231f1eb46728e24a368e46ab
+# Round 11 Key: d87499e95a1653865934226ff70257b7
+# Round 12 Key: 63acf1d040b3ef64279b0d2e11154b85
+# Round 13 Key: 5a2d2a7e003b79f8590f5b97ae0d0c20
+# Round 14 Key: f4524634b4e1a950937aa47e826feffb
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
index c06916e06..1ac5b0bb1 100644
--- a/aesgcm/tests/wycheproof.rs
+++ b/aesgcm/tests/wycheproof.rs
@@ -1,11 +1,14 @@
-use libcrux_traits::aead::Aead;
+use libcrux_aesgcm::{
+    portable::{aes256_gcm_decrypt, aes256_gcm_encrypt},
+    Aead,
+};
 use wycheproof::{aead::Test, TestResult};
 
 #[test]
 fn test() {
     let test_set = wycheproof::aead::TestSet::load(wycheproof::aead::TestName::AesGcm).unwrap();
 
-    fn run<Cipher: Aead<Tag = [u8; 16], Key = [u8; 16], Nonce = [u8; 12]>>(test: &Test) {
+    fn run<const KEY_LEN: usize, Cipher: Aead<16, 16, 12>>(test: &Test) {
         let mut ciphertext = vec![0u8; test.pt.len()];
         let mut plaintext = vec![0u8; test.pt.len()];
         let mut tag = [0u8; 16];
@@ -54,24 +57,67 @@ fn test() {
 
         if test_group.key_size == 128 {
             for test in test_group.tests {
-                println!("  Test {}", test.tc_id);
+                println!("  Test AES-GCM 128 {}", test.tc_id);
                 // Multiplexing
-                run::<libcrux_aesgcm::AesGcm128>(&test);
+                run::<16, libcrux_aesgcm::AesGcm128>(&test);
 
                 // Portable
-                run::<libcrux_aesgcm::PortableAesGcm128>(&test);
+                run::<16, libcrux_aesgcm::PortableAesGcm128>(&test);
 
                 // Neon
                 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-                run::<libcrux_aesgcm::NeonAesGcm128>(&test);
+                run::<16, libcrux_aesgcm::NeonAesGcm128>(&test);
 
                 // x64
                 #[cfg(all(target_arch = "x86_64"))]
-                run::<libcrux_aesgcm::X64AesGcm128>(&test);
+                run::<16, libcrux_aesgcm::X64AesGcm128>(&test);
             }
         } else if test_group.key_size == 256 {
-            for _test in test_group.tests {
-                // TODO
+            for test in test_group.tests {
+                println!("  Test AES-GCM 256 {}", test.tc_id);
+                println!("    pt:    {:?}", &test.pt);
+                println!("    aad:   {:?}", &test.aad);
+                println!("    key:   {:?}", &test.key);
+                println!("    nonce: {:?}", &test.nonce);
+
+                let mut ciphertext = vec![0u8; test.pt.len()];
+                let mut plaintext = vec![0u8; test.pt.len()];
+                let mut tag = [0u8; 16];
+
+                aes256_gcm_encrypt(
+                    &test.key,
+                    &test.nonce,
+                    &test.aad,
+                    &test.pt,
+                    &mut ciphertext,
+                    &mut tag,
+                );
+                aes256_gcm_decrypt(
+                    &test.key,
+                    &test.nonce,
+                    &test.aad,
+                    &ciphertext,
+                    &tag,
+                    &mut plaintext,
+                )
+                .unwrap();
+
+                assert_eq!(plaintext.as_slice(), test.pt.as_slice());
+
+                if test.result == TestResult::Valid {
+                    assert_eq!(
+                        test.ct.as_slice(),
+                        &ciphertext,
+                        "\nExpected: {}\nGot: {}",
+                        hex::encode(test.ct.as_slice()),
+                        hex::encode(&ciphertext)
+                    );
+                    assert_eq!(test.tag.as_slice(), &tag);
+                } else {
+                    let ct_ok = test.ct.as_slice() == &ciphertext;
+                    let tag_ok = test.tag.as_slice() == &tag;
+                    assert!(!ct_ok || !tag_ok);
+                }
             }
         }
     }
diff --git a/traits/Cargo.toml b/traits/Cargo.toml
index 883515e4a..2c5549140 100644
--- a/traits/Cargo.toml
+++ b/traits/Cargo.toml
@@ -21,5 +21,5 @@ generic-tests = []
 alloc = []
 
 [dependencies]
-rand = { version = "0.9", default-features = false, optional = true }
+rand = { version = "0.9", default-features = false }
 libcrux-secrets = { version = "=0.0.3", path = "../secrets" }
diff --git a/traits/src/lib.rs b/traits/src/lib.rs
index 090234e3c..6bb77bcdc 100644
--- a/traits/src/lib.rs
+++ b/traits/src/lib.rs
@@ -1,7 +1,6 @@
 #![no_std]
 
-pub mod aead;
-pub mod kem;
+extern crate alloc;
 
 // NOTE: This Digest trait and the new `digest` trait APIs overlap to some extent.
 // See issue #1039

From 174ac2a59a52131676d3b188a133cc1c936bf6f1 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 3 Sep 2025 14:58:19 +0200
Subject: [PATCH 19/43] wip

---
 aesgcm/aes.py                    | 228 +++++++++++++++++++++++++++++++
 aesgcm/src/aes_ctr/aes256_ctr.rs |   4 +
 2 files changed, 232 insertions(+)
 create mode 100755 aesgcm/aes.py

diff --git a/aesgcm/aes.py b/aesgcm/aes.py
new file mode 100755
index 000000000..cfc091481
--- /dev/null
+++ b/aesgcm/aes.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+"""
+A bit-sliced implementation of the AES-256 key schedule in Python.
+
+This implementation is for educational purposes to demonstrate the bit-slicing
+technique. It is not intended for production use as it's not optimized for
+performance in the same way a C or assembly implementation would be.
+
+The key schedule logic has been refactored to use helper functions that
+mimic the style of CPU intrinsics like _mm_aeskeygenassist_si128.
+"""
+
+# AES constants
+RCON = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36]
+
+
+def bitslice_word(word):
+    """
+    Bitslices a 4-byte word into 8 32-bit planes.
+    Each byte of the input word corresponds to an 8-bit segment in the output planes.
+    """
+    planes = [0] * 8
+    for p in range(8):
+        plane = 0
+        for i in range(4):  # byte index
+            # If the p-th bit of byte i is set...
+            if (word[i] >> p) & 1:
+                # ...set all 8 bits corresponding to byte i in the plane.
+                plane |= 0xFF << (8 * (3 - i))
+        planes[p] = plane
+    return planes
+
+
+def unbitslice_word(planes):
+    """Unbitslices 8 32-bit planes back into a 4-byte word."""
+    word = [0] * 4
+    for i in range(4):  # byte index
+        byte = 0
+        # Mask to select the 8 bits corresponding to this byte
+        mask = 0xFF << (8 * (3 - i))
+        for p in range(8):  # plane index (and bit index of result)
+            # If the bits for this byte are set in plane p...
+            if planes[p] & mask:
+                # ...set the p-th bit of the result byte.
+                byte |= 1 << p
+        word[i] = byte
+    return word
+
+
+def sub_word_planes(x):
+    """
+    Bit-sliced AES S-box implementation operating on 32-bit planes.
+    This applies the S-box to all 4 bytes of a word in parallel.
+    """
+    x0, x1, x2, x3, x4, x5, x6, x7 = x
+    t2 = x6 ^ x1
+    t3 = x5 ^ x2
+    t4 = x4 ^ x3
+    t5 = t2 ^ t3
+    t6 = t4 ^ t5
+    t7 = x0 ^ t6
+    t8 = x7 ^ t6
+    t9 = t7 & t2
+    t10 = t5 ^ t8
+    t11 = t3 ^ t7
+    t12 = t11 & t10
+    t13 = t12 ^ t9
+    t14 = t4 ^ t8
+    t15 = t2 ^ t14
+    t16 = t7 ^ t15
+    t17 = t16 & t10
+    t18 = t17 ^ t9
+    t19 = x3 ^ x1
+    t20 = t19 & t16
+    t21 = t20 ^ t18
+    t22 = x7 ^ x5
+    t23 = x7 ^ x4
+    t24 = x7 ^ x2
+    t25 = t24 & t22
+    t26 = t25 ^ t21
+    t27 = x0 & t8
+    t28 = t27 ^ t26
+    t29 = t23 & t8
+    t30 = t29 ^ t28
+    y4 = t30
+    t32 = x0 ^ x5
+    t33 = x0 ^ x4
+    t34 = x0 ^ x1
+    t35 = t34 & t32
+    t36 = t35 ^ t30
+    t37 = t33 & t34
+    t38 = t37 ^ t36
+    y5 = t38
+    t40 = t22 ^ t32
+    t41 = y5 & t40
+    t42 = t41 ^ t36
+    y6 = t42
+    t44 = y4 & t33
+    t45 = t44 ^ t42
+    t46 = y5 ^ t23
+    t47 = t45 & t46
+    t48 = t47 ^ t38
+    y7 = t48
+    t50 = t13 & t32
+    t51 = t50 ^ t48
+    t52 = t21 ^ t33
+    t53 = t51 & t52
+    t54 = t53 ^ t45
+    y1 = t54
+    t56 = y6 ^ t22
+    t57 = y1 & t56
+    t58 = t57 ^ t51
+    t59 = y4 & t34
+    t60 = t59 ^ t58
+    y2 = t60
+    t62 = y7 ^ y5
+    t63 = y2 & t62
+    t64 = t63 ^ t58
+    t65 = y1 ^ y4
+    t66 = t64 & t65
+    t67 = t66 ^ t60
+    y3 = t67
+    t69 = y1 ^ y6
+    t70 = y3 & t69
+    t71 = t70 ^ t64
+    t72 = y2 ^ y7
+    t73 = t71 & t72
+    t74 = t73 ^ t67
+    y0 = t74
+    c = 0x63
+    c_planes = bitslice_word([c, c, c, c])
+    return [
+        y0 ^ c_planes[0],
+        y1 ^ c_planes[1],
+        y2 ^ c_planes[2],
+        y3 ^ c_planes[3],
+        y4 ^ c_planes[4],
+        y5 ^ c_planes[5],
+        y6 ^ c_planes[6],
+        y7 ^ c_planes[7],
+    ]
+
+
+def rot_word_planes(planes):
+    """Rotates the word (represented by planes) by 8 bits to the left."""
+    rotated_planes = []
+    for p in planes:
+        rotated_p = ((p << 8) & 0xFFFFFFFF) | (p >> 24)
+        rotated_planes.append(rotated_p)
+    return rotated_planes
+
+
+def aes_key_assist(word_planes, rcon_idx):
+    """
+    Mimics the _mm_aeskeygenassist_si128 intrinsic.
+    Performs RotWord, SubWord, and Rcon XOR for the key schedule.
+    """
+    rcon_val = RCON[rcon_idx]
+    # RotWord
+    rotated = rot_word_planes(word_planes)
+    # SubWord
+    subbed = sub_word_planes(rotated)
+    # XOR with RCON (applied only to the first byte)
+    rcon_word = [rcon_val, 0, 0, 0]
+    rcon_planes = bitslice_word(rcon_word)
+    result_planes = [subbed[p] ^ rcon_planes[p] for p in range(8)]
+    return result_planes
+
+
+def aes_256_key_schedule(key):
+    """
+    Generates the AES-256 key schedule from a 256-bit key.
+    The key should be a list of 32 bytes.
+    """
+    if len(key) != 32:
+        raise ValueError("Key must be 32 bytes (256 bits) long.")
+
+    # The key schedule is 60 words, stored in bitsliced form
+    w_planes = [None] * 60
+
+    # The first 8 words are the key itself, bitsliced
+    for i in range(8):
+        w_planes[i] = bitslice_word(key[i * 4 : i * 4 + 4])
+
+    for i in range(8, 60):
+        temp_planes = w_planes[i - 1]
+
+        if i % 8 == 0:
+            temp_planes = aes_key_assist(temp_planes, (i // 8) - 1)
+        elif i % 8 == 4:
+            # SubWord on w[i-1] for AES-256
+            temp_planes = sub_word_planes(temp_planes)
+
+        # XOR with w[i-8]
+        w_i_minus_8_planes = w_planes[i - 8]
+        w_planes[i] = [w_i_minus_8_planes[p] ^ temp_planes[p] for p in range(8)]
+
+    # Unslice the schedule for the final output
+    w_final = [unbitslice_word(p) for p in w_planes]
+    return w_final
+
+
+def main():
+    """Main function to demonstrate the key schedule."""
+    # Example 256-bit key (32 bytes)
+    key = bytes.fromhex(
+        "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
+    )
+    # [
+    #     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    #     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    #     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+    #     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+    # ]
+
+    print("Original Key:")
+    print(" ".join(f"{b:02x}" for b in key))
+    print("-" * 40)
+
+    key_schedule = aes_256_key_schedule(key)
+
+    print("Generated Key Schedule (60 words):")
+    for i, word in enumerate(key_schedule):
+        print(f"w[{i:2d}]: {' '.join(f'{b:02x}' for b in word)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
index a10ed5cff..c60cdd53d 100644
--- a/aesgcm/src/aes_ctr/aes256_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes256_ctr.rs
@@ -79,6 +79,10 @@ fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
     keyex[14].aes_keygen_assist0::<0x40>(&prev1);
     keyex[14].key_expansion_step(&prev0);
 
+    for i in 0..NUM_KEYS {
+        print(&format!("{i}"), &keyex[i]);
+    }
+
     keyex
 }
 

From f8a8e6943e3f50e123b9925d7dd3d1871a2a9b17 Mon Sep 17 00:00:00 2001
From: karthikbhargavan <karthik.bhargavan@gmail.com>
Date: Fri, 19 Sep 2025 08:30:34 +0200
Subject: [PATCH 20/43] aes256 fix

---
 aesgcm/src/aes_ctr/aes256_ctr.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
index c60cdd53d..eeb1daa35 100644
--- a/aesgcm/src/aes_ctr/aes256_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes256_ctr.rs
@@ -62,17 +62,11 @@ fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
     }
 
     expansion_step256!(2, 0x01);
-    expansion_step256!(3, 0x01);
     expansion_step256!(4, 0x02);
-    expansion_step256!(5, 0x02);
     expansion_step256!(6, 0x04);
-    expansion_step256!(7, 0x04);
     expansion_step256!(8, 0x08);
-    expansion_step256!(9, 0x08);
     expansion_step256!(10, 0x10);
-    expansion_step256!(11, 0x10);
     expansion_step256!(12, 0x20);
-    expansion_step256!(13, 0x20);
 
     let prev0 = keyex[12].clone();
     let prev1 = keyex[13].clone();

From e200882239c07b1b13bf5b58c526823845acbd26 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Fri, 19 Sep 2025 12:38:56 +0200
Subject: [PATCH 21/43] no_std

---
 aesgcm/Cargo.toml                             |   1 +
 aesgcm/aes.py                                 | 228 ------
 aesgcm/src/aes_ctr.rs                         |   4 +
 aesgcm/src/aes_ctr/aes256_ctr.rs              |  10 -
 aesgcm/src/gf128_generic.rs                   |   2 +
 aesgcm/src/lib.rs                             |   2 +
 aesgcm/src/platform/portable/aes_core.rs      | 764 +----------------
 aesgcm/src/platform/portable/aes_core/test.rs | 773 ++++++++++++++++++
 aesgcm/test.py                                | 455 -----------
 9 files changed, 783 insertions(+), 1456 deletions(-)
 delete mode 100755 aesgcm/aes.py
 create mode 100644 aesgcm/src/platform/portable/aes_core/test.rs
 delete mode 100644 aesgcm/test.py

diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
index 13267aa9c..3494c05e1 100644
--- a/aesgcm/Cargo.toml
+++ b/aesgcm/Cargo.toml
@@ -25,6 +25,7 @@ default = ["rand"]  # XXX: remove rand here when cleaning up
 simd128 = []
 simd256 = []
 rand = ["dep:rand"]
+std = []
 
 [[bench]]
 name = "aesgcm"
diff --git a/aesgcm/aes.py b/aesgcm/aes.py
deleted file mode 100755
index cfc091481..000000000
--- a/aesgcm/aes.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-A bit-sliced implementation of the AES-256 key schedule in Python.
-
-This implementation is for educational purposes to demonstrate the bit-slicing
-technique. It is not intended for production use as it's not optimized for
-performance in the same way a C or assembly implementation would be.
-
-The key schedule logic has been refactored to use helper functions that
-mimic the style of CPU intrinsics like _mm_aeskeygenassist_si128.
-"""
-
-# AES constants
-RCON = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36]
-
-
-def bitslice_word(word):
-    """
-    Bitslices a 4-byte word into 8 32-bit planes.
-    Each byte of the input word corresponds to an 8-bit segment in the output planes.
-    """
-    planes = [0] * 8
-    for p in range(8):
-        plane = 0
-        for i in range(4):  # byte index
-            # If the p-th bit of byte i is set...
-            if (word[i] >> p) & 1:
-                # ...set all 8 bits corresponding to byte i in the plane.
-                plane |= 0xFF << (8 * (3 - i))
-        planes[p] = plane
-    return planes
-
-
-def unbitslice_word(planes):
-    """Unbitslices 8 32-bit planes back into a 4-byte word."""
-    word = [0] * 4
-    for i in range(4):  # byte index
-        byte = 0
-        # Mask to select the 8 bits corresponding to this byte
-        mask = 0xFF << (8 * (3 - i))
-        for p in range(8):  # plane index (and bit index of result)
-            # If the bits for this byte are set in plane p...
-            if planes[p] & mask:
-                # ...set the p-th bit of the result byte.
-                byte |= 1 << p
-        word[i] = byte
-    return word
-
-
-def sub_word_planes(x):
-    """
-    Bit-sliced AES S-box implementation operating on 32-bit planes.
-    This applies the S-box to all 4 bytes of a word in parallel.
-    """
-    x0, x1, x2, x3, x4, x5, x6, x7 = x
-    t2 = x6 ^ x1
-    t3 = x5 ^ x2
-    t4 = x4 ^ x3
-    t5 = t2 ^ t3
-    t6 = t4 ^ t5
-    t7 = x0 ^ t6
-    t8 = x7 ^ t6
-    t9 = t7 & t2
-    t10 = t5 ^ t8
-    t11 = t3 ^ t7
-    t12 = t11 & t10
-    t13 = t12 ^ t9
-    t14 = t4 ^ t8
-    t15 = t2 ^ t14
-    t16 = t7 ^ t15
-    t17 = t16 & t10
-    t18 = t17 ^ t9
-    t19 = x3 ^ x1
-    t20 = t19 & t16
-    t21 = t20 ^ t18
-    t22 = x7 ^ x5
-    t23 = x7 ^ x4
-    t24 = x7 ^ x2
-    t25 = t24 & t22
-    t26 = t25 ^ t21
-    t27 = x0 & t8
-    t28 = t27 ^ t26
-    t29 = t23 & t8
-    t30 = t29 ^ t28
-    y4 = t30
-    t32 = x0 ^ x5
-    t33 = x0 ^ x4
-    t34 = x0 ^ x1
-    t35 = t34 & t32
-    t36 = t35 ^ t30
-    t37 = t33 & t34
-    t38 = t37 ^ t36
-    y5 = t38
-    t40 = t22 ^ t32
-    t41 = y5 & t40
-    t42 = t41 ^ t36
-    y6 = t42
-    t44 = y4 & t33
-    t45 = t44 ^ t42
-    t46 = y5 ^ t23
-    t47 = t45 & t46
-    t48 = t47 ^ t38
-    y7 = t48
-    t50 = t13 & t32
-    t51 = t50 ^ t48
-    t52 = t21 ^ t33
-    t53 = t51 & t52
-    t54 = t53 ^ t45
-    y1 = t54
-    t56 = y6 ^ t22
-    t57 = y1 & t56
-    t58 = t57 ^ t51
-    t59 = y4 & t34
-    t60 = t59 ^ t58
-    y2 = t60
-    t62 = y7 ^ y5
-    t63 = y2 & t62
-    t64 = t63 ^ t58
-    t65 = y1 ^ y4
-    t66 = t64 & t65
-    t67 = t66 ^ t60
-    y3 = t67
-    t69 = y1 ^ y6
-    t70 = y3 & t69
-    t71 = t70 ^ t64
-    t72 = y2 ^ y7
-    t73 = t71 & t72
-    t74 = t73 ^ t67
-    y0 = t74
-    c = 0x63
-    c_planes = bitslice_word([c, c, c, c])
-    return [
-        y0 ^ c_planes[0],
-        y1 ^ c_planes[1],
-        y2 ^ c_planes[2],
-        y3 ^ c_planes[3],
-        y4 ^ c_planes[4],
-        y5 ^ c_planes[5],
-        y6 ^ c_planes[6],
-        y7 ^ c_planes[7],
-    ]
-
-
-def rot_word_planes(planes):
-    """Rotates the word (represented by planes) by 8 bits to the left."""
-    rotated_planes = []
-    for p in planes:
-        rotated_p = ((p << 8) & 0xFFFFFFFF) | (p >> 24)
-        rotated_planes.append(rotated_p)
-    return rotated_planes
-
-
-def aes_key_assist(word_planes, rcon_idx):
-    """
-    Mimics the _mm_aeskeygenassist_si128 intrinsic.
-    Performs RotWord, SubWord, and Rcon XOR for the key schedule.
-    """
-    rcon_val = RCON[rcon_idx]
-    # RotWord
-    rotated = rot_word_planes(word_planes)
-    # SubWord
-    subbed = sub_word_planes(rotated)
-    # XOR with RCON (applied only to the first byte)
-    rcon_word = [rcon_val, 0, 0, 0]
-    rcon_planes = bitslice_word(rcon_word)
-    result_planes = [subbed[p] ^ rcon_planes[p] for p in range(8)]
-    return result_planes
-
-
-def aes_256_key_schedule(key):
-    """
-    Generates the AES-256 key schedule from a 256-bit key.
-    The key should be a list of 32 bytes.
-    """
-    if len(key) != 32:
-        raise ValueError("Key must be 32 bytes (256 bits) long.")
-
-    # The key schedule is 60 words, stored in bitsliced form
-    w_planes = [None] * 60
-
-    # The first 8 words are the key itself, bitsliced
-    for i in range(8):
-        w_planes[i] = bitslice_word(key[i * 4 : i * 4 + 4])
-
-    for i in range(8, 60):
-        temp_planes = w_planes[i - 1]
-
-        if i % 8 == 0:
-            temp_planes = aes_key_assist(temp_planes, (i // 8) - 1)
-        elif i % 8 == 4:
-            # SubWord on w[i-1] for AES-256
-            temp_planes = sub_word_planes(temp_planes)
-
-        # XOR with w[i-8]
-        w_i_minus_8_planes = w_planes[i - 8]
-        w_planes[i] = [w_i_minus_8_planes[p] ^ temp_planes[p] for p in range(8)]
-
-    # Unslice the schedule for the final output
-    w_final = [unbitslice_word(p) for p in w_planes]
-    return w_final
-
-
-def main():
-    """Main function to demonstrate the key schedule."""
-    # Example 256-bit key (32 bytes)
-    key = bytes.fromhex(
-        "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
-    )
-    # [
-    #     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    #     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    #     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-    #     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-    # ]
-
-    print("Original Key:")
-    print(" ".join(f"{b:02x}" for b in key))
-    print("-" * 40)
-
-    key_schedule = aes_256_key_schedule(key)
-
-    print("Generated Key Schedule (60 words):")
-    for i, word in enumerate(key_schedule):
-        print(f"w[{i:2d}]: {' '.join(f'{b:02x}' for b in word)}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
index ed89ad2e5..a498602b1 100644
--- a/aesgcm/src/aes_ctr.rs
+++ b/aesgcm/src/aes_ctr.rs
@@ -118,6 +118,7 @@ mod test {
         aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
         for i in 0..32 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
@@ -136,6 +137,7 @@ mod test {
         aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
         for i in 0..32 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
@@ -151,6 +153,7 @@ mod test {
         aes128_ctr_encrypt::<platform::portable::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
         for i in 0..32 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
@@ -167,6 +170,7 @@ mod test {
         aes128_ctr_encrypt::<platform::neon::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
         for i in 0..32 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
index eeb1daa35..e4bb42ace 100644
--- a/aesgcm/src/aes_ctr/aes256_ctr.rs
+++ b/aesgcm/src/aes_ctr/aes256_ctr.rs
@@ -73,15 +73,5 @@ fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
     keyex[14].aes_keygen_assist0::<0x40>(&prev1);
     keyex[14].key_expansion_step(&prev0);
 
-    for i in 0..NUM_KEYS {
-        print(&format!("{i}"), &keyex[i]);
-    }
-
     keyex
 }
-
-fn print<T: AESState>(s: &str, keyex: &T) {
-    let mut tmp = [0u8; 16];
-    keyex.store_block(&mut tmp);
-    eprintln!("{s}: {:02x?}", tmp);
-}
diff --git a/aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
index 8198473ba..4c26170d6 100644
--- a/aesgcm/src/gf128_generic.rs
+++ b/aesgcm/src/gf128_generic.rs
@@ -102,6 +102,7 @@ mod test {
         gf128::<crate::platform::portable::FieldElement>(&KEY, &INPUT, &mut computed);
         for i in 0..16 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
@@ -118,6 +119,7 @@ mod test {
         gf128::<crate::platform::neon::FieldElement>(&KEY, &INPUT, &mut computed);
         for i in 0..16 {
             if computed[i] != EXPECTED[i] {
+                #[cfg(feature = "std")]
                 println!(
                     "mismatch at {}: expected is {}, computed is {}",
                     i, EXPECTED[i], computed[i]
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index bb61f8d59..73e7f791d 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,3 +1,5 @@
+#![no_std]
+
 mod aes_ctr;
 mod aes_generic;
 mod gf128_generic;
diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
index a8887b716..7a6a1ec92 100644
--- a/aesgcm/src/platform/portable/aes_core.rs
+++ b/aesgcm/src/platform/portable/aes_core.rs
@@ -502,766 +502,4 @@ impl crate::platform::AESState for State {
 }
 
 #[cfg(test)]
-mod test {
-    use super::*;
-
-    #[allow(non_snake_case)]
-    fn sub_bytes_inv_state(st: &mut State) {
-        let U0 = st[7];
-        let U1 = st[6];
-        let U2 = st[5];
-        let U3 = st[4];
-        let U4 = st[3];
-        let U5 = st[2];
-        let U6 = st[1];
-        let U7 = st[0];
-
-        let T23 = U0 ^ U3;
-        let T22 = xnor(U1, U3);
-        let T2 = xnor(U0, U1);
-        let T1 = U3 ^ U4;
-        let T24 = xnor(U4, U7);
-        let R5 = U6 ^ U7;
-        let T8 = xnor(U1, T23);
-        let T19 = T22 ^ R5;
-        let T9 = xnor(U7, T1);
-        let T10 = T2 ^ T24;
-        let T13 = T2 ^ R5;
-        let T3 = T1 ^ R5;
-        let T25 = xnor(U2, T1);
-        let R13 = U1 ^ U6;
-        let T17 = xnor(U2, T19);
-        let T20 = T24 ^ R13;
-        let T4 = U4 ^ T8;
-        let R17 = xnor(U2, U5);
-        let R18 = xnor(U5, U6);
-        let R19 = xnor(U2, U4);
-        let Y5 = U0 ^ R17;
-        let T6 = T22 ^ R17;
-        let T16 = R13 ^ R19;
-        let T27 = T1 ^ R18;
-        let T15 = T10 ^ T27;
-        let T14 = T10 ^ R18;
-        let T26 = T3 ^ T16;
-        let M1 = T13 & T6;
-        let M2 = T23 & T8;
-        let M3 = T14 ^ M1;
-        let M4 = T19 & Y5;
-        let M5 = M4 ^ M1;
-        let M6 = T3 & T16;
-        let M7 = T22 & T9;
-        let M8 = T26 ^ M6;
-        let M9 = T20 & T17;
-        let M10 = M9 ^ M6;
-        let M11 = T1 & T15;
-        let M12 = T4 & T27;
-        let M13 = M12 ^ M11;
-        let M14 = T2 & T10;
-        let M15 = M14 ^ M11;
-        let M16 = M3 ^ M2;
-        let M17 = M5 ^ T24;
-        let M18 = M8 ^ M7;
-        let M19 = M10 ^ M15;
-        let M20 = M16 ^ M13;
-        let M21 = M17 ^ M15;
-        let M22 = M18 ^ M13;
-        let M23 = M19 ^ T25;
-        let M24 = M22 ^ M23;
-        let M25 = M22 & M20;
-        let M26 = M21 ^ M25;
-        let M27 = M20 ^ M21;
-        let M28 = M23 ^ M25;
-        let M29 = M28 & M27;
-        let M30 = M26 & M24;
-        let M31 = M20 & M23;
-        let M32 = M27 & M31;
-        let M33 = M27 ^ M25;
-        let M34 = M21 & M22;
-        let M35 = M24 & M34;
-        let M36 = M24 ^ M25;
-        let M37 = M21 ^ M29;
-        let M38 = M32 ^ M33;
-        let M39 = M23 ^ M30;
-        let M40 = M35 ^ M36;
-        let M41 = M38 ^ M40;
-        let M42 = M37 ^ M39;
-        let M43 = M37 ^ M38;
-        let M44 = M39 ^ M40;
-        let M45 = M42 ^ M41;
-        let M46 = M44 & T6;
-        let M47 = M40 & T8;
-        let M48 = M39 & Y5;
-        let M49 = M43 & T16;
-        let M50 = M38 & T9;
-        let M51 = M37 & T17;
-        let M52 = M42 & T15;
-        let M53 = M45 & T27;
-        let M54 = M41 & T10;
-        let M55 = M44 & T13;
-        let M56 = M40 & T23;
-        let M57 = M39 & T19;
-        let M58 = M43 & T3;
-        let M59 = M38 & T22;
-        let M60 = M37 & T20;
-        let M61 = M42 & T1;
-        let M62 = M45 & T4;
-        let M63 = M41 & T2;
-        let P0 = M52 ^ M61;
-        let P1 = M58 ^ M59;
-        let P2 = M54 ^ M62;
-        let P3 = M47 ^ M50;
-        let P4 = M48 ^ M56;
-        let P5 = M46 ^ M51;
-        let P6 = M49 ^ M60;
-        let P7 = P0 ^ P1;
-        let P8 = M50 ^ M53;
-        let P9 = M55 ^ M63;
-        let P10 = M57 ^ P4;
-        let P11 = P0 ^ P3;
-        let P12 = M46 ^ M48;
-        let P13 = M49 ^ M51;
-        let P14 = M49 ^ M62;
-        let P15 = M54 ^ M59;
-        let P16 = M57 ^ M61;
-        let P17 = M58 ^ P2;
-        let P18 = M63 ^ P5;
-        let P19 = P2 ^ P3;
-        let P20 = P4 ^ P6;
-        let P22 = P2 ^ P7;
-        let P23 = P7 ^ P8;
-        let P24 = P5 ^ P7;
-        let P25 = P6 ^ P10;
-        let P26 = P9 ^ P11;
-        let P27 = P10 ^ P18;
-        let P28 = P11 ^ P25;
-        let P29 = P15 ^ P20;
-        let W0 = P13 ^ P22;
-        let W1 = P26 ^ P29;
-        let W2 = P17 ^ P28;
-        let W3 = P12 ^ P22;
-        let W4 = P23 ^ P27;
-        let W5 = P19 ^ P24;
-        let W6 = P14 ^ P23;
-        let W7 = P9 ^ P16;
-
-        st[0] = W7;
-        st[1] = W6;
-        st[2] = W5;
-        st[3] = W4;
-        st[4] = W3;
-        st[5] = W2;
-        st[6] = W1;
-        st[7] = W0;
-    }
-
-    fn sbox_fwd(s: u8) -> u8 {
-        match s {
-            0 => 0x63,
-            1 => 0x7c,
-            2 => 0x77,
-            3 => 0x7b,
-            4 => 0xf2,
-            5 => 0x6b,
-            6 => 0x6f,
-            7 => 0xc5,
-            8 => 0x30,
-            9 => 0x01,
-            10 => 0x67,
-            11 => 0x2b,
-            12 => 0xfe,
-            13 => 0xd7,
-            14 => 0xab,
-            15 => 0x76,
-            16 => 0xca,
-            17 => 0x82,
-            18 => 0xc9,
-            19 => 0x7d,
-            20 => 0xfa,
-            21 => 0x59,
-            22 => 0x47,
-            23 => 0xf0,
-            24 => 0xad,
-            25 => 0xd4,
-            26 => 0xa2,
-            27 => 0xaf,
-            28 => 0x9c,
-            29 => 0xa4,
-            30 => 0x72,
-            31 => 0xc0,
-            32 => 0xb7,
-            33 => 0xfd,
-            34 => 0x93,
-            35 => 0x26,
-            36 => 0x36,
-            37 => 0x3f,
-            38 => 0xf7,
-            39 => 0xcc,
-            40 => 0x34,
-            41 => 0xa5,
-            42 => 0xe5,
-            43 => 0xf1,
-            44 => 0x71,
-            45 => 0xd8,
-            46 => 0x31,
-            47 => 0x15,
-            48 => 0x04,
-            49 => 0xc7,
-            50 => 0x23,
-            51 => 0xc3,
-            52 => 0x18,
-            53 => 0x96,
-            54 => 0x05,
-            55 => 0x9a,
-            56 => 0x07,
-            57 => 0x12,
-            58 => 0x80,
-            59 => 0xe2,
-            60 => 0xeb,
-            61 => 0x27,
-            62 => 0xb2,
-            63 => 0x75,
-            64 => 0x09,
-            65 => 0x83,
-            66 => 0x2c,
-            67 => 0x1a,
-            68 => 0x1b,
-            69 => 0x6e,
-            70 => 0x5a,
-            71 => 0xa0,
-            72 => 0x52,
-            73 => 0x3b,
-            74 => 0xd6,
-            75 => 0xb3,
-            76 => 0x29,
-            77 => 0xe3,
-            78 => 0x2f,
-            79 => 0x84,
-            80 => 0x53,
-            81 => 0xd1,
-            82 => 0x00,
-            83 => 0xed,
-            84 => 0x20,
-            85 => 0xfc,
-            86 => 0xb1,
-            87 => 0x5b,
-            88 => 0x6a,
-            89 => 0xcb,
-            90 => 0xbe,
-            91 => 0x39,
-            92 => 0x4a,
-            93 => 0x4c,
-            94 => 0x58,
-            95 => 0xcf,
-            96 => 0xd0,
-            97 => 0xef,
-            98 => 0xaa,
-            99 => 0xfb,
-            100 => 0x43,
-            101 => 0x4d,
-            102 => 0x33,
-            103 => 0x85,
-            104 => 0x45,
-            105 => 0xf9,
-            106 => 0x02,
-            107 => 0x7f,
-            108 => 0x50,
-            109 => 0x3c,
-            110 => 0x9f,
-            111 => 0xa8,
-            112 => 0x51,
-            113 => 0xa3,
-            114 => 0x40,
-            115 => 0x8f,
-            116 => 0x92,
-            117 => 0x9d,
-            118 => 0x38,
-            119 => 0xf5,
-            120 => 0xbc,
-            121 => 0xb6,
-            122 => 0xda,
-            123 => 0x21,
-            124 => 0x10,
-            125 => 0xff,
-            126 => 0xf3,
-            127 => 0xd2,
-            128 => 0xcd,
-            129 => 0x0c,
-            130 => 0x13,
-            131 => 0xec,
-            132 => 0x5f,
-            133 => 0x97,
-            134 => 0x44,
-            135 => 0x17,
-            136 => 0xc4,
-            137 => 0xa7,
-            138 => 0x7e,
-            139 => 0x3d,
-            140 => 0x64,
-            141 => 0x5d,
-            142 => 0x19,
-            143 => 0x73,
-            144 => 0x60,
-            145 => 0x81,
-            146 => 0x4f,
-            147 => 0xdc,
-            148 => 0x22,
-            149 => 0x2a,
-            150 => 0x90,
-            151 => 0x88,
-            152 => 0x46,
-            153 => 0xee,
-            154 => 0xb8,
-            155 => 0x14,
-            156 => 0xde,
-            157 => 0x5e,
-            158 => 0x0b,
-            159 => 0xdb,
-            160 => 0xe0,
-            161 => 0x32,
-            162 => 0x3a,
-            163 => 0x0a,
-            164 => 0x49,
-            165 => 0x06,
-            166 => 0x24,
-            167 => 0x5c,
-            168 => 0xc2,
-            169 => 0xd3,
-            170 => 0xac,
-            171 => 0x62,
-            172 => 0x91,
-            173 => 0x95,
-            174 => 0xe4,
-            175 => 0x79,
-            176 => 0xe7,
-            177 => 0xc8,
-            178 => 0x37,
-            179 => 0x6d,
-            180 => 0x8d,
-            181 => 0xd5,
-            182 => 0x4e,
-            183 => 0xa9,
-            184 => 0x6c,
-            185 => 0x56,
-            186 => 0xf4,
-            187 => 0xea,
-            188 => 0x65,
-            189 => 0x7a,
-            190 => 0xae,
-            191 => 0x08,
-            192 => 0xba,
-            193 => 0x78,
-            194 => 0x25,
-            195 => 0x2e,
-            196 => 0x1c,
-            197 => 0xa6,
-            198 => 0xb4,
-            199 => 0xc6,
-            200 => 0xe8,
-            201 => 0xdd,
-            202 => 0x74,
-            203 => 0x1f,
-            204 => 0x4b,
-            205 => 0xbd,
-            206 => 0x8b,
-            207 => 0x8a,
-            208 => 0x70,
-            209 => 0x3e,
-            210 => 0xb5,
-            211 => 0x66,
-            212 => 0x48,
-            213 => 0x03,
-            214 => 0xf6,
-            215 => 0x0e,
-            216 => 0x61,
-            217 => 0x35,
-            218 => 0x57,
-            219 => 0xb9,
-            220 => 0x86,
-            221 => 0xc1,
-            222 => 0x1d,
-            223 => 0x9e,
-            224 => 0xe1,
-            225 => 0xf8,
-            226 => 0x98,
-            227 => 0x11,
-            228 => 0x69,
-            229 => 0xd9,
-            230 => 0x8e,
-            231 => 0x94,
-            232 => 0x9b,
-            233 => 0x1e,
-            234 => 0x87,
-            235 => 0xe9,
-            236 => 0xce,
-            237 => 0x55,
-            238 => 0x28,
-            239 => 0xdf,
-            240 => 0x8c,
-            241 => 0xa1,
-            242 => 0x89,
-            243 => 0x0d,
-            244 => 0xbf,
-            245 => 0xe6,
-            246 => 0x42,
-            247 => 0x68,
-            248 => 0x41,
-            249 => 0x99,
-            250 => 0x2d,
-            251 => 0x0f,
-            252 => 0xb0,
-            253 => 0x54,
-            254 => 0xbb,
-            255 => 0x16,
-        }
-    }
-
-    fn sbox_inv(s: u8) -> u8 {
-        match s {
-            0 => 0x52,
-            1 => 0x09,
-            2 => 0x6a,
-            3 => 0xd5,
-            4 => 0x30,
-            5 => 0x36,
-            6 => 0xa5,
-            7 => 0x38,
-            8 => 0xbf,
-            9 => 0x40,
-            10 => 0xa3,
-            11 => 0x9e,
-            12 => 0x81,
-            13 => 0xf3,
-            14 => 0xd7,
-            15 => 0xfb,
-            16 => 0x7c,
-            17 => 0xe3,
-            18 => 0x39,
-            19 => 0x82,
-            20 => 0x9b,
-            21 => 0x2f,
-            22 => 0xff,
-            23 => 0x87,
-            24 => 0x34,
-            25 => 0x8e,
-            26 => 0x43,
-            27 => 0x44,
-            28 => 0xc4,
-            29 => 0xde,
-            30 => 0xe9,
-            31 => 0xcb,
-            32 => 0x54,
-            33 => 0x7b,
-            34 => 0x94,
-            35 => 0x32,
-            36 => 0xa6,
-            37 => 0xc2,
-            38 => 0x23,
-            39 => 0x3d,
-            40 => 0xee,
-            41 => 0x4c,
-            42 => 0x95,
-            43 => 0x0b,
-            44 => 0x42,
-            45 => 0xfa,
-            46 => 0xc3,
-            47 => 0x4e,
-            48 => 0x08,
-            49 => 0x2e,
-            50 => 0xa1,
-            51 => 0x66,
-            52 => 0x28,
-            53 => 0xd9,
-            54 => 0x24,
-            55 => 0xb2,
-            56 => 0x76,
-            57 => 0x5b,
-            58 => 0xa2,
-            59 => 0x49,
-            60 => 0x6d,
-            61 => 0x8b,
-            62 => 0xd1,
-            63 => 0x25,
-            64 => 0x72,
-            65 => 0xf8,
-            66 => 0xf6,
-            67 => 0x64,
-            68 => 0x86,
-            69 => 0x68,
-            70 => 0x98,
-            71 => 0x16,
-            72 => 0xd4,
-            73 => 0xa4,
-            74 => 0x5c,
-            75 => 0xcc,
-            76 => 0x5d,
-            77 => 0x65,
-            78 => 0xb6,
-            79 => 0x92,
-            80 => 0x6c,
-            81 => 0x70,
-            82 => 0x48,
-            83 => 0x50,
-            84 => 0xfd,
-            85 => 0xed,
-            86 => 0xb9,
-            87 => 0xda,
-            88 => 0x5e,
-            89 => 0x15,
-            90 => 0x46,
-            91 => 0x57,
-            92 => 0xa7,
-            93 => 0x8d,
-            94 => 0x9d,
-            95 => 0x84,
-            96 => 0x90,
-            97 => 0xd8,
-            98 => 0xab,
-            99 => 0x00,
-            100 => 0x8c,
-            101 => 0xbc,
-            102 => 0xd3,
-            103 => 0x0a,
-            104 => 0xf7,
-            105 => 0xe4,
-            106 => 0x58,
-            107 => 0x05,
-            108 => 0xb8,
-            109 => 0xb3,
-            110 => 0x45,
-            111 => 0x06,
-            112 => 0xd0,
-            113 => 0x2c,
-            114 => 0x1e,
-            115 => 0x8f,
-            116 => 0xca,
-            117 => 0x3f,
-            118 => 0x0f,
-            119 => 0x02,
-            120 => 0xc1,
-            121 => 0xaf,
-            122 => 0xbd,
-            123 => 0x03,
-            124 => 0x01,
-            125 => 0x13,
-            126 => 0x8a,
-            127 => 0x6b,
-            128 => 0x3a,
-            129 => 0x91,
-            130 => 0x11,
-            131 => 0x41,
-            132 => 0x4f,
-            133 => 0x67,
-            134 => 0xdc,
-            135 => 0xea,
-            136 => 0x97,
-            137 => 0xf2,
-            138 => 0xcf,
-            139 => 0xce,
-            140 => 0xf0,
-            141 => 0xb4,
-            142 => 0xe6,
-            143 => 0x73,
-            144 => 0x96,
-            145 => 0xac,
-            146 => 0x74,
-            147 => 0x22,
-            148 => 0xe7,
-            149 => 0xad,
-            150 => 0x35,
-            151 => 0x85,
-            152 => 0xe2,
-            153 => 0xf9,
-            154 => 0x37,
-            155 => 0xe8,
-            156 => 0x1c,
-            157 => 0x75,
-            158 => 0xdf,
-            159 => 0x6e,
-            160 => 0x47,
-            161 => 0xf1,
-            162 => 0x1a,
-            163 => 0x71,
-            164 => 0x1d,
-            165 => 0x29,
-            166 => 0xc5,
-            167 => 0x89,
-            168 => 0x6f,
-            169 => 0xb7,
-            170 => 0x62,
-            171 => 0x0e,
-            172 => 0xaa,
-            173 => 0x18,
-            174 => 0xbe,
-            175 => 0x1b,
-            176 => 0xfc,
-            177 => 0x56,
-            178 => 0x3e,
-            179 => 0x4b,
-            180 => 0xc6,
-            181 => 0xd2,
-            182 => 0x79,
-            183 => 0x20,
-            184 => 0x9a,
-            185 => 0xdb,
-            186 => 0xc0,
-            187 => 0xfe,
-            188 => 0x78,
-            189 => 0xcd,
-            190 => 0x5a,
-            191 => 0xf4,
-            192 => 0x1f,
-            193 => 0xdd,
-            194 => 0xa8,
-            195 => 0x33,
-            196 => 0x88,
-            197 => 0x07,
-            198 => 0xc7,
-            199 => 0x31,
-            200 => 0xb1,
-            201 => 0x12,
-            202 => 0x10,
-            203 => 0x59,
-            204 => 0x27,
-            205 => 0x80,
-            206 => 0xec,
-            207 => 0x5f,
-            208 => 0x60,
-            209 => 0x51,
-            210 => 0x7f,
-            211 => 0xa9,
-            212 => 0x19,
-            213 => 0xb5,
-            214 => 0x4a,
-            215 => 0x0d,
-            216 => 0x2d,
-            217 => 0xe5,
-            218 => 0x7a,
-            219 => 0x9f,
-            220 => 0x93,
-            221 => 0xc9,
-            222 => 0x9c,
-            223 => 0xef,
-            224 => 0xa0,
-            225 => 0xe0,
-            226 => 0x3b,
-            227 => 0x4d,
-            228 => 0xae,
-            229 => 0x2a,
-            230 => 0xf5,
-            231 => 0xb0,
-            232 => 0xc8,
-            233 => 0xeb,
-            234 => 0xbb,
-            235 => 0x3c,
-            236 => 0x83,
-            237 => 0x53,
-            238 => 0x99,
-            239 => 0x61,
-            240 => 0x17,
-            241 => 0x2b,
-            242 => 0x04,
-            243 => 0x7e,
-            244 => 0xba,
-            245 => 0x77,
-            246 => 0xd6,
-            247 => 0x26,
-            248 => 0xe1,
-            249 => 0x69,
-            250 => 0x14,
-            251 => 0x63,
-            252 => 0x55,
-            253 => 0x21,
-            254 => 0x0c,
-            255 => 0x7d,
-        }
-    }
-
-    use rand_core::{OsRng, RngCore};
-
-    use crate::platform::portable::aes_core::transpose_u8x16;
-
-    fn get_bit_u8(x: &[u8], i: usize, j: usize) -> u8 {
-        (x[i] >> j) & 0x1
-    }
-
-    fn get_bit_u16(x: &[u16], i: usize, j: usize) -> u8 {
-        ((x[j] >> i) & 0x1) as u8
-    }
-
-    #[test]
-    fn test_transpose() {
-        let mut x = [0u8; 16];
-        OsRng.fill_bytes(&mut x);
-        let mut y = [0u16; 8];
-        transpose_u8x16(&x, &mut y);
-        for i in 0..16 {
-            for j in 0..8 {
-                if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
-                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
-                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j));
-                    assert!(false);
-                } else {
-                    println!("transpose ok: {},{}", i, j);
-                }
-            }
-        }
-        let mut z = [0u8; 16];
-        transpose_u16x8(&y, &mut z);
-        for i in 0..16 {
-            for j in 0..8 {
-                if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) {
-                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
-                    println!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
-                    assert!(false);
-                } else {
-                    println!("inv-transpose ok: {},{}", i, j);
-                }
-            }
-        }
-    }
-
-    #[test]
-    fn test_sbox() {
-        let mut x = [0u8; 16];
-        let mut y = [0u16; 8];
-        let mut w = [0u8; 16];
-        for i in 0..=255 {
-            x[0] = i;
-            x[9] = i;
-            transpose_u8x16(&x, &mut y);
-            sub_bytes_state(&mut y);
-            transpose_u16x8(&y, &mut w);
-            if w[0] != sbox_fwd(i as u8) {
-                println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
-                assert!(false);
-            } else {
-                println!("sbox ok {}", i)
-            }
-        }
-    }
-
-    #[test]
-    fn test_sbox_inv() {
-        let mut x = [0u8; 16];
-        let mut y = [0u16; 8];
-        let mut w = [0u8; 16];
-        for i in 0..=255 {
-            x[0] = i;
-            x[9] = i;
-            transpose_u8x16(&x, &mut y);
-            sub_bytes_inv_state(&mut y);
-            transpose_u16x8(&y, &mut w);
-            if w[0] != sbox_inv(i as u8) {
-                println!(
-                    "sbox_inv[{}] = {}, should be {}",
-                    i,
-                    w[0],
-                    sbox_inv(i as u8)
-                );
-                assert!(false);
-            } else {
-                println!("sbox inv ok {}", i)
-            }
-        }
-    }
-}
+mod test;
diff --git a/aesgcm/src/platform/portable/aes_core/test.rs b/aesgcm/src/platform/portable/aes_core/test.rs
new file mode 100644
index 000000000..c624a9dfe
--- /dev/null
+++ b/aesgcm/src/platform/portable/aes_core/test.rs
@@ -0,0 +1,773 @@
+use super::*;
+
+#[allow(non_snake_case)]
+fn sub_bytes_inv_state(st: &mut State) {
+    let U0 = st[7];
+    let U1 = st[6];
+    let U2 = st[5];
+    let U3 = st[4];
+    let U4 = st[3];
+    let U5 = st[2];
+    let U6 = st[1];
+    let U7 = st[0];
+
+    let T23 = U0 ^ U3;
+    let T22 = xnor(U1, U3);
+    let T2 = xnor(U0, U1);
+    let T1 = U3 ^ U4;
+    let T24 = xnor(U4, U7);
+    let R5 = U6 ^ U7;
+    let T8 = xnor(U1, T23);
+    let T19 = T22 ^ R5;
+    let T9 = xnor(U7, T1);
+    let T10 = T2 ^ T24;
+    let T13 = T2 ^ R5;
+    let T3 = T1 ^ R5;
+    let T25 = xnor(U2, T1);
+    let R13 = U1 ^ U6;
+    let T17 = xnor(U2, T19);
+    let T20 = T24 ^ R13;
+    let T4 = U4 ^ T8;
+    let R17 = xnor(U2, U5);
+    let R18 = xnor(U5, U6);
+    let R19 = xnor(U2, U4);
+    let Y5 = U0 ^ R17;
+    let T6 = T22 ^ R17;
+    let T16 = R13 ^ R19;
+    let T27 = T1 ^ R18;
+    let T15 = T10 ^ T27;
+    let T14 = T10 ^ R18;
+    let T26 = T3 ^ T16;
+    let M1 = T13 & T6;
+    let M2 = T23 & T8;
+    let M3 = T14 ^ M1;
+    let M4 = T19 & Y5;
+    let M5 = M4 ^ M1;
+    let M6 = T3 & T16;
+    let M7 = T22 & T9;
+    let M8 = T26 ^ M6;
+    let M9 = T20 & T17;
+    let M10 = M9 ^ M6;
+    let M11 = T1 & T15;
+    let M12 = T4 & T27;
+    let M13 = M12 ^ M11;
+    let M14 = T2 & T10;
+    let M15 = M14 ^ M11;
+    let M16 = M3 ^ M2;
+    let M17 = M5 ^ T24;
+    let M18 = M8 ^ M7;
+    let M19 = M10 ^ M15;
+    let M20 = M16 ^ M13;
+    let M21 = M17 ^ M15;
+    let M22 = M18 ^ M13;
+    let M23 = M19 ^ T25;
+    let M24 = M22 ^ M23;
+    let M25 = M22 & M20;
+    let M26 = M21 ^ M25;
+    let M27 = M20 ^ M21;
+    let M28 = M23 ^ M25;
+    let M29 = M28 & M27;
+    let M30 = M26 & M24;
+    let M31 = M20 & M23;
+    let M32 = M27 & M31;
+    let M33 = M27 ^ M25;
+    let M34 = M21 & M22;
+    let M35 = M24 & M34;
+    let M36 = M24 ^ M25;
+    let M37 = M21 ^ M29;
+    let M38 = M32 ^ M33;
+    let M39 = M23 ^ M30;
+    let M40 = M35 ^ M36;
+    let M41 = M38 ^ M40;
+    let M42 = M37 ^ M39;
+    let M43 = M37 ^ M38;
+    let M44 = M39 ^ M40;
+    let M45 = M42 ^ M41;
+    let M46 = M44 & T6;
+    let M47 = M40 & T8;
+    let M48 = M39 & Y5;
+    let M49 = M43 & T16;
+    let M50 = M38 & T9;
+    let M51 = M37 & T17;
+    let M52 = M42 & T15;
+    let M53 = M45 & T27;
+    let M54 = M41 & T10;
+    let M55 = M44 & T13;
+    let M56 = M40 & T23;
+    let M57 = M39 & T19;
+    let M58 = M43 & T3;
+    let M59 = M38 & T22;
+    let M60 = M37 & T20;
+    let M61 = M42 & T1;
+    let M62 = M45 & T4;
+    let M63 = M41 & T2;
+    let P0 = M52 ^ M61;
+    let P1 = M58 ^ M59;
+    let P2 = M54 ^ M62;
+    let P3 = M47 ^ M50;
+    let P4 = M48 ^ M56;
+    let P5 = M46 ^ M51;
+    let P6 = M49 ^ M60;
+    let P7 = P0 ^ P1;
+    let P8 = M50 ^ M53;
+    let P9 = M55 ^ M63;
+    let P10 = M57 ^ P4;
+    let P11 = P0 ^ P3;
+    let P12 = M46 ^ M48;
+    let P13 = M49 ^ M51;
+    let P14 = M49 ^ M62;
+    let P15 = M54 ^ M59;
+    let P16 = M57 ^ M61;
+    let P17 = M58 ^ P2;
+    let P18 = M63 ^ P5;
+    let P19 = P2 ^ P3;
+    let P20 = P4 ^ P6;
+    let P22 = P2 ^ P7;
+    let P23 = P7 ^ P8;
+    let P24 = P5 ^ P7;
+    let P25 = P6 ^ P10;
+    let P26 = P9 ^ P11;
+    let P27 = P10 ^ P18;
+    let P28 = P11 ^ P25;
+    let P29 = P15 ^ P20;
+    let W0 = P13 ^ P22;
+    let W1 = P26 ^ P29;
+    let W2 = P17 ^ P28;
+    let W3 = P12 ^ P22;
+    let W4 = P23 ^ P27;
+    let W5 = P19 ^ P24;
+    let W6 = P14 ^ P23;
+    let W7 = P9 ^ P16;
+
+    st[0] = W7;
+    st[1] = W6;
+    st[2] = W5;
+    st[3] = W4;
+    st[4] = W3;
+    st[5] = W2;
+    st[6] = W1;
+    st[7] = W0;
+}
+
+fn sbox_fwd(s: u8) -> u8 {
+    match s {
+        0 => 0x63,
+        1 => 0x7c,
+        2 => 0x77,
+        3 => 0x7b,
+        4 => 0xf2,
+        5 => 0x6b,
+        6 => 0x6f,
+        7 => 0xc5,
+        8 => 0x30,
+        9 => 0x01,
+        10 => 0x67,
+        11 => 0x2b,
+        12 => 0xfe,
+        13 => 0xd7,
+        14 => 0xab,
+        15 => 0x76,
+        16 => 0xca,
+        17 => 0x82,
+        18 => 0xc9,
+        19 => 0x7d,
+        20 => 0xfa,
+        21 => 0x59,
+        22 => 0x47,
+        23 => 0xf0,
+        24 => 0xad,
+        25 => 0xd4,
+        26 => 0xa2,
+        27 => 0xaf,
+        28 => 0x9c,
+        29 => 0xa4,
+        30 => 0x72,
+        31 => 0xc0,
+        32 => 0xb7,
+        33 => 0xfd,
+        34 => 0x93,
+        35 => 0x26,
+        36 => 0x36,
+        37 => 0x3f,
+        38 => 0xf7,
+        39 => 0xcc,
+        40 => 0x34,
+        41 => 0xa5,
+        42 => 0xe5,
+        43 => 0xf1,
+        44 => 0x71,
+        45 => 0xd8,
+        46 => 0x31,
+        47 => 0x15,
+        48 => 0x04,
+        49 => 0xc7,
+        50 => 0x23,
+        51 => 0xc3,
+        52 => 0x18,
+        53 => 0x96,
+        54 => 0x05,
+        55 => 0x9a,
+        56 => 0x07,
+        57 => 0x12,
+        58 => 0x80,
+        59 => 0xe2,
+        60 => 0xeb,
+        61 => 0x27,
+        62 => 0xb2,
+        63 => 0x75,
+        64 => 0x09,
+        65 => 0x83,
+        66 => 0x2c,
+        67 => 0x1a,
+        68 => 0x1b,
+        69 => 0x6e,
+        70 => 0x5a,
+        71 => 0xa0,
+        72 => 0x52,
+        73 => 0x3b,
+        74 => 0xd6,
+        75 => 0xb3,
+        76 => 0x29,
+        77 => 0xe3,
+        78 => 0x2f,
+        79 => 0x84,
+        80 => 0x53,
+        81 => 0xd1,
+        82 => 0x00,
+        83 => 0xed,
+        84 => 0x20,
+        85 => 0xfc,
+        86 => 0xb1,
+        87 => 0x5b,
+        88 => 0x6a,
+        89 => 0xcb,
+        90 => 0xbe,
+        91 => 0x39,
+        92 => 0x4a,
+        93 => 0x4c,
+        94 => 0x58,
+        95 => 0xcf,
+        96 => 0xd0,
+        97 => 0xef,
+        98 => 0xaa,
+        99 => 0xfb,
+        100 => 0x43,
+        101 => 0x4d,
+        102 => 0x33,
+        103 => 0x85,
+        104 => 0x45,
+        105 => 0xf9,
+        106 => 0x02,
+        107 => 0x7f,
+        108 => 0x50,
+        109 => 0x3c,
+        110 => 0x9f,
+        111 => 0xa8,
+        112 => 0x51,
+        113 => 0xa3,
+        114 => 0x40,
+        115 => 0x8f,
+        116 => 0x92,
+        117 => 0x9d,
+        118 => 0x38,
+        119 => 0xf5,
+        120 => 0xbc,
+        121 => 0xb6,
+        122 => 0xda,
+        123 => 0x21,
+        124 => 0x10,
+        125 => 0xff,
+        126 => 0xf3,
+        127 => 0xd2,
+        128 => 0xcd,
+        129 => 0x0c,
+        130 => 0x13,
+        131 => 0xec,
+        132 => 0x5f,
+        133 => 0x97,
+        134 => 0x44,
+        135 => 0x17,
+        136 => 0xc4,
+        137 => 0xa7,
+        138 => 0x7e,
+        139 => 0x3d,
+        140 => 0x64,
+        141 => 0x5d,
+        142 => 0x19,
+        143 => 0x73,
+        144 => 0x60,
+        145 => 0x81,
+        146 => 0x4f,
+        147 => 0xdc,
+        148 => 0x22,
+        149 => 0x2a,
+        150 => 0x90,
+        151 => 0x88,
+        152 => 0x46,
+        153 => 0xee,
+        154 => 0xb8,
+        155 => 0x14,
+        156 => 0xde,
+        157 => 0x5e,
+        158 => 0x0b,
+        159 => 0xdb,
+        160 => 0xe0,
+        161 => 0x32,
+        162 => 0x3a,
+        163 => 0x0a,
+        164 => 0x49,
+        165 => 0x06,
+        166 => 0x24,
+        167 => 0x5c,
+        168 => 0xc2,
+        169 => 0xd3,
+        170 => 0xac,
+        171 => 0x62,
+        172 => 0x91,
+        173 => 0x95,
+        174 => 0xe4,
+        175 => 0x79,
+        176 => 0xe7,
+        177 => 0xc8,
+        178 => 0x37,
+        179 => 0x6d,
+        180 => 0x8d,
+        181 => 0xd5,
+        182 => 0x4e,
+        183 => 0xa9,
+        184 => 0x6c,
+        185 => 0x56,
+        186 => 0xf4,
+        187 => 0xea,
+        188 => 0x65,
+        189 => 0x7a,
+        190 => 0xae,
+        191 => 0x08,
+        192 => 0xba,
+        193 => 0x78,
+        194 => 0x25,
+        195 => 0x2e,
+        196 => 0x1c,
+        197 => 0xa6,
+        198 => 0xb4,
+        199 => 0xc6,
+        200 => 0xe8,
+        201 => 0xdd,
+        202 => 0x74,
+        203 => 0x1f,
+        204 => 0x4b,
+        205 => 0xbd,
+        206 => 0x8b,
+        207 => 0x8a,
+        208 => 0x70,
+        209 => 0x3e,
+        210 => 0xb5,
+        211 => 0x66,
+        212 => 0x48,
+        213 => 0x03,
+        214 => 0xf6,
+        215 => 0x0e,
+        216 => 0x61,
+        217 => 0x35,
+        218 => 0x57,
+        219 => 0xb9,
+        220 => 0x86,
+        221 => 0xc1,
+        222 => 0x1d,
+        223 => 0x9e,
+        224 => 0xe1,
+        225 => 0xf8,
+        226 => 0x98,
+        227 => 0x11,
+        228 => 0x69,
+        229 => 0xd9,
+        230 => 0x8e,
+        231 => 0x94,
+        232 => 0x9b,
+        233 => 0x1e,
+        234 => 0x87,
+        235 => 0xe9,
+        236 => 0xce,
+        237 => 0x55,
+        238 => 0x28,
+        239 => 0xdf,
+        240 => 0x8c,
+        241 => 0xa1,
+        242 => 0x89,
+        243 => 0x0d,
+        244 => 0xbf,
+        245 => 0xe6,
+        246 => 0x42,
+        247 => 0x68,
+        248 => 0x41,
+        249 => 0x99,
+        250 => 0x2d,
+        251 => 0x0f,
+        252 => 0xb0,
+        253 => 0x54,
+        254 => 0xbb,
+        255 => 0x16,
+    }
+}
+
+fn sbox_inv(s: u8) -> u8 {
+    match s {
+        0 => 0x52,
+        1 => 0x09,
+        2 => 0x6a,
+        3 => 0xd5,
+        4 => 0x30,
+        5 => 0x36,
+        6 => 0xa5,
+        7 => 0x38,
+        8 => 0xbf,
+        9 => 0x40,
+        10 => 0xa3,
+        11 => 0x9e,
+        12 => 0x81,
+        13 => 0xf3,
+        14 => 0xd7,
+        15 => 0xfb,
+        16 => 0x7c,
+        17 => 0xe3,
+        18 => 0x39,
+        19 => 0x82,
+        20 => 0x9b,
+        21 => 0x2f,
+        22 => 0xff,
+        23 => 0x87,
+        24 => 0x34,
+        25 => 0x8e,
+        26 => 0x43,
+        27 => 0x44,
+        28 => 0xc4,
+        29 => 0xde,
+        30 => 0xe9,
+        31 => 0xcb,
+        32 => 0x54,
+        33 => 0x7b,
+        34 => 0x94,
+        35 => 0x32,
+        36 => 0xa6,
+        37 => 0xc2,
+        38 => 0x23,
+        39 => 0x3d,
+        40 => 0xee,
+        41 => 0x4c,
+        42 => 0x95,
+        43 => 0x0b,
+        44 => 0x42,
+        45 => 0xfa,
+        46 => 0xc3,
+        47 => 0x4e,
+        48 => 0x08,
+        49 => 0x2e,
+        50 => 0xa1,
+        51 => 0x66,
+        52 => 0x28,
+        53 => 0xd9,
+        54 => 0x24,
+        55 => 0xb2,
+        56 => 0x76,
+        57 => 0x5b,
+        58 => 0xa2,
+        59 => 0x49,
+        60 => 0x6d,
+        61 => 0x8b,
+        62 => 0xd1,
+        63 => 0x25,
+        64 => 0x72,
+        65 => 0xf8,
+        66 => 0xf6,
+        67 => 0x64,
+        68 => 0x86,
+        69 => 0x68,
+        70 => 0x98,
+        71 => 0x16,
+        72 => 0xd4,
+        73 => 0xa4,
+        74 => 0x5c,
+        75 => 0xcc,
+        76 => 0x5d,
+        77 => 0x65,
+        78 => 0xb6,
+        79 => 0x92,
+        80 => 0x6c,
+        81 => 0x70,
+        82 => 0x48,
+        83 => 0x50,
+        84 => 0xfd,
+        85 => 0xed,
+        86 => 0xb9,
+        87 => 0xda,
+        88 => 0x5e,
+        89 => 0x15,
+        90 => 0x46,
+        91 => 0x57,
+        92 => 0xa7,
+        93 => 0x8d,
+        94 => 0x9d,
+        95 => 0x84,
+        96 => 0x90,
+        97 => 0xd8,
+        98 => 0xab,
+        99 => 0x00,
+        100 => 0x8c,
+        101 => 0xbc,
+        102 => 0xd3,
+        103 => 0x0a,
+        104 => 0xf7,
+        105 => 0xe4,
+        106 => 0x58,
+        107 => 0x05,
+        108 => 0xb8,
+        109 => 0xb3,
+        110 => 0x45,
+        111 => 0x06,
+        112 => 0xd0,
+        113 => 0x2c,
+        114 => 0x1e,
+        115 => 0x8f,
+        116 => 0xca,
+        117 => 0x3f,
+        118 => 0x0f,
+        119 => 0x02,
+        120 => 0xc1,
+        121 => 0xaf,
+        122 => 0xbd,
+        123 => 0x03,
+        124 => 0x01,
+        125 => 0x13,
+        126 => 0x8a,
+        127 => 0x6b,
+        128 => 0x3a,
+        129 => 0x91,
+        130 => 0x11,
+        131 => 0x41,
+        132 => 0x4f,
+        133 => 0x67,
+        134 => 0xdc,
+        135 => 0xea,
+        136 => 0x97,
+        137 => 0xf2,
+        138 => 0xcf,
+        139 => 0xce,
+        140 => 0xf0,
+        141 => 0xb4,
+        142 => 0xe6,
+        143 => 0x73,
+        144 => 0x96,
+        145 => 0xac,
+        146 => 0x74,
+        147 => 0x22,
+        148 => 0xe7,
+        149 => 0xad,
+        150 => 0x35,
+        151 => 0x85,
+        152 => 0xe2,
+        153 => 0xf9,
+        154 => 0x37,
+        155 => 0xe8,
+        156 => 0x1c,
+        157 => 0x75,
+        158 => 0xdf,
+        159 => 0x6e,
+        160 => 0x47,
+        161 => 0xf1,
+        162 => 0x1a,
+        163 => 0x71,
+        164 => 0x1d,
+        165 => 0x29,
+        166 => 0xc5,
+        167 => 0x89,
+        168 => 0x6f,
+        169 => 0xb7,
+        170 => 0x62,
+        171 => 0x0e,
+        172 => 0xaa,
+        173 => 0x18,
+        174 => 0xbe,
+        175 => 0x1b,
+        176 => 0xfc,
+        177 => 0x56,
+        178 => 0x3e,
+        179 => 0x4b,
+        180 => 0xc6,
+        181 => 0xd2,
+        182 => 0x79,
+        183 => 0x20,
+        184 => 0x9a,
+        185 => 0xdb,
+        186 => 0xc0,
+        187 => 0xfe,
+        188 => 0x78,
+        189 => 0xcd,
+        190 => 0x5a,
+        191 => 0xf4,
+        192 => 0x1f,
+        193 => 0xdd,
+        194 => 0xa8,
+        195 => 0x33,
+        196 => 0x88,
+        197 => 0x07,
+        198 => 0xc7,
+        199 => 0x31,
+        200 => 0xb1,
+        201 => 0x12,
+        202 => 0x10,
+        203 => 0x59,
+        204 => 0x27,
+        205 => 0x80,
+        206 => 0xec,
+        207 => 0x5f,
+        208 => 0x60,
+        209 => 0x51,
+        210 => 0x7f,
+        211 => 0xa9,
+        212 => 0x19,
+        213 => 0xb5,
+        214 => 0x4a,
+        215 => 0x0d,
+        216 => 0x2d,
+        217 => 0xe5,
+        218 => 0x7a,
+        219 => 0x9f,
+        220 => 0x93,
+        221 => 0xc9,
+        222 => 0x9c,
+        223 => 0xef,
+        224 => 0xa0,
+        225 => 0xe0,
+        226 => 0x3b,
+        227 => 0x4d,
+        228 => 0xae,
+        229 => 0x2a,
+        230 => 0xf5,
+        231 => 0xb0,
+        232 => 0xc8,
+        233 => 0xeb,
+        234 => 0xbb,
+        235 => 0x3c,
+        236 => 0x83,
+        237 => 0x53,
+        238 => 0x99,
+        239 => 0x61,
+        240 => 0x17,
+        241 => 0x2b,
+        242 => 0x04,
+        243 => 0x7e,
+        244 => 0xba,
+        245 => 0x77,
+        246 => 0xd6,
+        247 => 0x26,
+        248 => 0xe1,
+        249 => 0x69,
+        250 => 0x14,
+        251 => 0x63,
+        252 => 0x55,
+        253 => 0x21,
+        254 => 0x0c,
+        255 => 0x7d,
+    }
+}
+
+use rand_core::{OsRng, RngCore};
+
+use crate::platform::portable::aes_core::transpose_u8x16;
+
+fn get_bit_u8(x: &[u8], i: usize, j: usize) -> u8 {
+    (x[i] >> j) & 0x1
+}
+
+fn get_bit_u16(x: &[u16], i: usize, j: usize) -> u8 {
+    ((x[j] >> i) & 0x1) as u8
+}
+
+#[test]
+fn test_transpose() {
+    let mut x = [0u8; 16];
+    OsRng.fill_bytes(&mut x);
+    let mut y = [0u16; 8];
+    transpose_u8x16(&x, &mut y);
+    for i in 0..16 {
+        for j in 0..8 {
+            if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
+                #[cfg(feature = "std")]
+                {
+                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j));
+                }
+                assert!(false);
+            } else {
+                #[cfg(feature = "std")]
+                println!("transpose ok: {},{}", i, j);
+            }
+        }
+    }
+    let mut z = [0u8; 16];
+    transpose_u16x8(&y, &mut z);
+    for i in 0..16 {
+        for j in 0..8 {
+            if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) {
+                #[cfg(feature = "std")]
+                {
+                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    println!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
+                }
+                assert!(false);
+            } else {
+                #[cfg(feature = "std")]
+                println!("inv-transpose ok: {},{}", i, j);
+            }
+        }
+    }
+}
+
+#[test]
+fn test_sbox() {
+    let mut x = [0u8; 16];
+    let mut y = [0u16; 8];
+    let mut w = [0u8; 16];
+    for i in 0..=255 {
+        x[0] = i;
+        x[9] = i;
+        transpose_u8x16(&x, &mut y);
+        sub_bytes_state(&mut y);
+        transpose_u16x8(&y, &mut w);
+        if w[0] != sbox_fwd(i as u8) {
+            #[cfg(feature = "std")]
+            println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
+            assert!(false);
+        } else {
+            #[cfg(feature = "std")]
+            println!("sbox ok {}", i)
+        }
+    }
+}
+
+#[test]
+fn test_sbox_inv() {
+    let mut x = [0u8; 16];
+    let mut y = [0u16; 8];
+    let mut w = [0u8; 16];
+    for i in 0..=255 {
+        x[0] = i;
+        x[9] = i;
+        transpose_u8x16(&x, &mut y);
+        sub_bytes_inv_state(&mut y);
+        transpose_u16x8(&y, &mut w);
+        if w[0] != sbox_inv(i as u8) {
+            #[cfg(feature = "std")]
+            println!(
+                "sbox_inv[{}] = {}, should be {}",
+                i,
+                w[0],
+                sbox_inv(i as u8)
+            );
+            assert!(false);
+        } else {
+            #[cfg(feature = "std")]
+            println!("sbox inv ok {}", i)
+        }
+    }
+}
diff --git a/aesgcm/test.py b/aesgcm/test.py
deleted file mode 100644
index f098e251b..000000000
--- a/aesgcm/test.py
+++ /dev/null
@@ -1,455 +0,0 @@
-import os
-# from cryptography.hazmat.primitives.ciphers import (
-#     Cipher, algorithms, modes
-# )
-# from cryptography.hazmat.backends import default_backend
-# from cryptography.exceptions import InvalidTag
-
-# --- AES Key Schedule Generation (for demonstration) ---
-
-# S-box: The substitution table for AES
-_S_BOX = (
-    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
-)
-
-# Rcon: The round constant word array
-_R_CON = (
-    0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
-)
-
-def _generate_and_print_key_schedule(key: bytes):
-    """
-    Generates and prints the AES-256 key schedule from a 32-byte key.
-    This is for educational purposes to show the key expansion process.
-    """
-    print("--- AES-256 Key Schedule Expansion ---")
-    
-    # AES-256 constants
-    Nk = 8  # Number of 32-bit words in the key
-    Nr = 14 # Number of rounds
-    
-    # The expanded key schedule will hold 4 * (14 + 1) = 60 words
-    w = [0] * (4 * (Nr + 1))
-
-    # The first Nk words are the original key
-    for i in range(Nk):
-        w[i] = int.from_bytes(key[i*4 : i*4+4], 'big')
-
-    # Generate the rest of the words for the schedule
-    for i in range(Nk, len(w)):
-        temp = w[i - 1]
-        print(f"{i}: {i % Nk == 0} | {i % Nk ==4}")
-        print(f"{temp:08x}")
-        if i % Nk == 0:
-            print(f"{i} % Nk == 0")
-            # Rotate the word
-            temp = ((temp << 8) & 0xffffffff) | (temp >> 24)
-            # Apply S-box to each byte
-            temp = (_S_BOX[(temp >> 24) & 0xff] << 24) | \
-                   (_S_BOX[(temp >> 16) & 0xff] << 16) | \
-                   (_S_BOX[(temp >>  8) & 0xff] <<  8) | \
-                   (_S_BOX[ temp        & 0xff])
-            # XOR with the round constant
-            temp ^= (_R_CON[i // Nk] << 24)
-        elif i % Nk == 4: # Extra S-box substitution for AES-256
-            print(f"{i} % Nk == 4")
-            temp = (_S_BOX[(temp >> 24) & 0xff] << 24) | \
-                   (_S_BOX[(temp >> 16) & 0xff] << 16) | \
-                   (_S_BOX[(temp >>  8) & 0xff] <<  8) | \
-                   (_S_BOX[ temp        & 0xff])
-            print(f"{temp:08x}")
-        
-        w[i] = w[i - Nk] ^ temp
-
-    # Print the round keys
-    for r in range(Nr + 1):
-        round_key_words = w[r*4 : r*4+4]
-        round_key_hex = "".join([f'{word:08x}' for word in round_key_words])
-        print(f"Round {r:2d} Key: {round_key_hex}")
-    print("------------------------------------")
-
-
-# class AES_GCM_256:
-#     """
-#     A toy implementation of AES-GCM-256 encryption and decryption.
-
-#     This class demonstrates the core components of an AES-GCM authenticated
-#     encryption scheme. It is for educational purposes and uses the `cryptography`
-#     library for the underlying cryptographic operations.
-#     """
-
-#     def __init__(self, key: bytes):
-#         """
-#         Initializes the cipher with a 256-bit (32-byte) key.
-
-#         Args:
-#             key: A 32-byte key.
-
-#         Raises:
-#             ValueError: If the key is not 32 bytes long.
-#         """
-#         if len(key) != 32:
-#             raise ValueError("Key must be 256 bits (32 bytes) for AES-GCM-256.")
-#         self.key = key
-#         self.backend = default_backend()
-
-#     @staticmethod
-#     def generate_key() -> bytes:
-#         """
-#         Generates a random 256-bit (32-byte) key suitable for AES-GCM-256.
-
-#         Returns:
-#             A 32-byte key.
-#         """
-#         return os.urandom(32)
-
-#     def encrypt(self, plaintext: bytes, associated_data: bytes, nonce: bytes) -> tuple[bytes, bytes]:
-#         """
-#         Encrypts plaintext and authenticates associated data using AES-GCM.
-
-#         Args:
-#             plaintext: The data to encrypt.
-#             associated_data: Additional data to authenticate but not encrypt.
-#             nonce: A 12-byte (96-bit) nonce. Should be unique for each encryption
-#                    with the same key.
-
-#         Returns:
-#             A tuple containing the ciphertext and the authentication tag.
-        
-#         Raises:
-#             ValueError: If the nonce is not 12 bytes long.
-#         """
-#         if len(nonce) != 12:
-#             raise ValueError("Nonce must be 96 bits (12 bytes) for AES-GCM.")
-
-#         # Create an AES-GCM cipher object
-#         cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce), backend=self.backend)
-#         encryptor = cipher.encryptor()
-
-#         # Add the associated data. This data is authenticated but not encrypted.
-#         encryptor.authenticate_additional_data(associated_data)
-
-#         # Encrypt the plaintext
-#         ciphertext = encryptor.update(plaintext) + encryptor.finalize()
-
-#         # The authentication tag is generated automatically and is available after finalization.
-#         tag = encryptor.tag
-
-#         return ciphertext, tag
-
-#     def decrypt(self, ciphertext: bytes, associated_data: bytes, nonce: bytes, tag: bytes) -> bytes:
-#         """
-#         Decrypts ciphertext and verifies the authentication tag using AES-GCM.
-
-#         Args:
-#             ciphertext: The encrypted data.
-#             associated_data: The associated data that was authenticated.
-#             nonce: The 12-byte nonce used during encryption.
-#             tag: The 16-byte authentication tag generated during encryption.
-
-#         Returns:
-#             The original plaintext if decryption and authentication are successful.
-
-#         Raises:
-#             cryptography.exceptions.InvalidTag: If the authentication fails.
-#             ValueError: If the nonce is not 12 bytes long.
-#         """
-#         if len(nonce) != 12:
-#             raise ValueError("Nonce must be 96 bits (12 bytes) for AES-GCM.")
-
-#         # Create an AES-GCM cipher object with the nonce and tag
-#         cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce, tag), backend=self.backend)
-#         decryptor = cipher.decryptor()
-
-#         # Add the associated data for authentication verification.
-#         decryptor.authenticate_additional_data(associated_data)
-
-#         # Decrypt the ciphertext.
-#         # An InvalidTag exception will be raised if the tag does not match.
-#         try:
-#             plaintext = decryptor.update(ciphertext) + decryptor.finalize()
-#             return plaintext
-#         except InvalidTag as e:
-#             # Re-raising with a more informative message can be helpful.
-#             print(f"Decryption failed: Invalid authentication tag.")
-#             raise
-#         except Exception as e:
-#             print(f"An unexpected error occurred during decryption: {e}")
-#             raise
-
-def run_tests():
-    """
-    A suite of tests to verify the AES_GCM_256 implementation.
-    """
-    print("--- Running AES-GCM-256 Tests ---")
-
-    # 1. Generate a key
-    key = "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
-    key = bytes.fromhex(key)
-    # aes_cipher = AES_GCM_256(key)
-    print(f"Generated Key (hex): {key.hex()}")
-    _generate_and_print_key_schedule(key)
-
-
-    # # 2. Define test data
-    # plaintext = b"This is a secret message that needs to be encrypted."
-    # associated_data = b"This is metadata that is authenticated but not secret."
-    # # A nonce should be unique for every encryption with the same key.
-    # # For this test, we'll generate a random 12-byte nonce.
-    # nonce = os.urandom(12)
-    
-    # print(f"Original Plaintext: {plaintext.decode()}")
-    # print(f"Associated Data: {associated_data.decode()}")
-    # print(f"Nonce (hex): {nonce.hex()}")
-
-    # # 3. Test successful encryption and decryption
-    # print("\n--- Test 1: Successful Encryption/Decryption ---")
-    # try:
-    #     ciphertext, tag = aes_cipher.encrypt(plaintext, associated_data, nonce)
-    #     print(f"Ciphertext (hex): {ciphertext.hex()}")
-    #     print(f"Authentication Tag (hex): {tag.hex()}")
-
-    #     decrypted_plaintext = aes_cipher.decrypt(ciphertext, associated_data, nonce, tag)
-    #     print(f"Decrypted Plaintext: {decrypted_plaintext.decode()}")
-
-    #     assert plaintext == decrypted_plaintext
-    #     print("SUCCESS: Decrypted plaintext matches original plaintext.")
-    # except Exception as e:
-    #     print(f"FAILURE: An unexpected error occurred: {e}")
-
-
-    # # 4. Test failure: incorrect tag
-    # print("\n--- Test 2: Decryption with Incorrect Tag ---")
-    # try:
-    #     invalid_tag = os.urandom(16) # A random, incorrect tag
-    #     print(f"Using incorrect tag (hex): {invalid_tag.hex()}")
-    #     aes_cipher.decrypt(ciphertext, associated_data, nonce, invalid_tag)
-    #     # The line above should raise an exception, so we should not reach here.
-    #     print("FAILURE: Decryption succeeded with an invalid tag.")
-    # except InvalidTag:
-    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
-    # except Exception as e:
-    #     print(f"FAILURE: An unexpected error occurred: {e}")
-
-    # # 5. Test failure: modified ciphertext
-    # print("\n--- Test 3: Decryption with Modified Ciphertext ---")
-    # try:
-    #     # Tamper with the ciphertext (flip the first byte)
-    #     modified_ciphertext = bytes([ciphertext[0] ^ 0xFF]) + ciphertext[1:]
-    #     print(f"Using modified ciphertext (hex): {modified_ciphertext.hex()}")
-    #     aes_cipher.decrypt(modified_ciphertext, associated_data, nonce, tag)
-    #     print("FAILURE: Decryption succeeded with modified ciphertext.")
-    # except InvalidTag:
-    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
-    # except Exception as e:
-    #     print(f"FAILURE: An unexpected error occurred: {e}")
-
-    # # 6. Test failure: modified associated data
-    # print("\n--- Test 4: Decryption with Modified Associated Data ---")
-    # try:
-    #     modified_ad = b"This is incorrect metadata."
-    #     print(f"Using modified AAD: {modified_ad.decode()}")
-    #     aes_cipher.decrypt(ciphertext, modified_ad, nonce, tag)
-    #     print("FAILURE: Decryption succeeded with modified associated data.")
-    # except InvalidTag:
-    #     print(f"SUCCESS: Decryption failed as expected due to InvalidTag.")
-    # except Exception as e:
-    #     print(f"FAILURE: An unexpected error occurred: {e}")
-
-    # # 7. Test with user-provided specific vector
-    # print("\n--- Test 5: Specific Vector Test Case ---")
-    # try:
-    #     key_hex = "92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20"
-    #     aad_hex = "00000000ffffffff"
-    #     plaintext_hex = "00010203040506070809"
-    #     # A fixed nonce is used for this specific test vector.
-    #     nonce_hex = "00112233445566778899aabb"
-
-    #     key_vec = bytes.fromhex(key_hex)
-    #     aad_vec = bytes.fromhex(aad_hex)
-    #     plaintext_vec = bytes.fromhex(plaintext_hex)
-    #     nonce_vec = bytes.fromhex(nonce_hex)
-
-    #     print(f"Using Key (hex): {key_vec.hex()}")
-    #     _generate_and_print_key_schedule(key_vec)
-    #     print(f"Using AAD (hex): {aad_vec.hex()}")
-    #     print(f"Using Plaintext (hex): {plaintext_vec.hex()}")
-    #     print(f"Using Nonce (hex): {nonce_vec.hex()}")
-
-    #     specific_cipher = AES_GCM_256(key_vec)
-    #     ciphertext_vec, tag_vec = specific_cipher.encrypt(plaintext_vec, aad_vec, nonce_vec)
-
-    #     print(f"Resulting Ciphertext (hex): {ciphertext_vec.hex()}")
-    #     print(f"Resulting Tag (hex): {tag_vec.hex()}")
-
-    #     decrypted_plaintext_vec = specific_cipher.decrypt(ciphertext_vec, aad_vec, nonce_vec, tag_vec)
-    #     print(f"Decrypted Plaintext (hex): {decrypted_plaintext_vec.hex()}")
-
-    #     assert plaintext_vec == decrypted_plaintext_vec
-    #     print("SUCCESS: Specific vector test passed. Decrypted plaintext matches original.")
-
-    # except Exception as e:
-    #     print(f"FAILURE: An unexpected error occurred in the specific vector test: {e}")
-        
-    print("\n--- All tests completed. ---")
-
-
-if __name__ == "__main__":
-    run_tests()
-
-# --- Running AES-GCM-256 Tests ---
-# Generated Key (hex): 92ace3e348cd821092cd921aa3546374299ab46209691bc28b8752d17f123c20
-# --- AES-256 Key Schedule Expansion ---
-# 8: True | False
-# 7f123c20
-# 8 % Nk == 0
-# 9: False | False
-# 5a475431
-# 10: False | False
-# 128ad621
-# 11: False | False
-# 8047443b
-# 12: False | True
-# 2313274f
-# 12 % Nk == 4
-# 267dcc84
-# 13: False | False
-# 0fe778e6
-# 14: False | False
-# 068e6324
-# 15: False | False
-# 8d0931f5
-# 16: True | False
-# f21b0dd5
-# 16 % Nk == 0
-# 17: False | False
-# f79057b8
-# 18: False | False
-# e51a8199
-# 19: False | False
-# 655dc5a2
-# 20: False | True
-# 464ee2ed
-# 20 % Nk == 4
-# 5a2f9855
-# 21: False | False
-# 55c8e0b3
-# 22: False | False
-# 53468397
-# 23: False | False
-# de4fb262
-# 24: True | False
-# 2c54bfb7
-# 24 % Nk == 0
-# 25: False | False
-# d398fec9
-# 26: False | False
-# 36827f50
-# 27: False | False
-# 53dfbaf2
-# 28: False | True
-# 1591581f
-# 28 % Nk == 4
-# 59816ac0
-# 29: False | False
-# 0c498a73
-# 30: False | False
-# 5f0f09e4
-# 31: False | False
-# 8140bb86
-# 32: True | False
-# ad140431
-# 32 % Nk == 0
-# 33: False | False
-# 216a395c
-# 34: False | False
-# 17e8460c
-# 35: False | False
-# 4437fcfe
-# 36: False | True
-# 51a6a4e1
-# 36 % Nk == 4
-# d12449f8
-# 37: False | False
-# dd6dc38b
-# 38: False | False
-# 8262ca6f
-# 39: False | False
-# 032271e9
-# 40: True | False
-# ae3675d8
-# 40 % Nk == 0
-# 41: False | False
-# 34f758b8
-# 42: False | False
-# 231f1eb4
-# 43: False | False
-# 6728e24a
-# 44: False | True
-# 368e46ab
-# 44 % Nk == 4
-# 05195a62
-# 45: False | False
-# d87499e9
-# 46: False | False
-# 5a165386
-# 47: False | False
-# 5934226f
-# 48: True | False
-# f70257b7
-# 48 % Nk == 0
-# 49: False | False
-# 63acf1d0
-# 50: False | False
-# 40b3ef64
-# 51: False | False
-# 279b0d2e
-# 52: False | True
-# 11154b85
-# 52 % Nk == 4
-# 8259b397
-# 53: False | False
-# 5a2d2a7e
-# 54: False | False
-# 003b79f8
-# 55: False | False
-# 590f5b97
-# 56: True | False
-# ae0d0c20
-# 56 % Nk == 0
-# 57: False | False
-# f4524634
-# 58: False | False
-# b4e1a950
-# 59: False | False
-# 937aa47e
-# Round  0 Key: 92ace3e348cd821092cd921aa3546374
-# Round  1 Key: 299ab46209691bc28b8752d17f123c20
-# Round  2 Key: 5a475431128ad6218047443b2313274f
-# Round  3 Key: 0fe778e6068e63248d0931f5f21b0dd5
-# Round  4 Key: f79057b8e51a8199655dc5a2464ee2ed
-# Round  5 Key: 55c8e0b353468397de4fb2622c54bfb7
-# Round  6 Key: d398fec936827f5053dfbaf21591581f
-# Round  7 Key: 0c498a735f0f09e48140bb86ad140431
-# Round  8 Key: 216a395c17e8460c4437fcfe51a6a4e1
-# Round  9 Key: dd6dc38b8262ca6f032271e9ae3675d8
-# Round 10 Key: 34f758b8231f1eb46728e24a368e46ab
-# Round 11 Key: d87499e95a1653865934226ff70257b7
-# Round 12 Key: 63acf1d040b3ef64279b0d2e11154b85
-# Round 13 Key: 5a2d2a7e003b79f8590f5b97ae0d0c20
-# Round 14 Key: f4524634b4e1a950937aa47e826feffb

From ed96011b20f734f385903893b8256006534d1bef Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sat, 20 Sep 2025 10:47:33 +0200
Subject: [PATCH 22/43] cleanup

---
 aesgcm/benches/aesgcm.rs                   |  55 ++-
 aesgcm/src/{aes_generic.rs => aes.rs}      |   4 +
 aesgcm/src/aes_ctr.rs                      | 198 ---------
 aesgcm/src/aes_ctr/aes256_ctr.rs           |  77 ----
 aesgcm/src/aes_gcm_128.rs                  |  22 +-
 aesgcm/src/aes_gcm_256.rs                  |  22 +-
 aesgcm/src/ctr.rs                          | 103 +++++
 aesgcm/src/{aes_ctr => ctr}/aes128_ctr.rs  |  46 +-
 aesgcm/src/ctr/aes256_ctr.rs               | 107 +++++
 aesgcm/src/ctr/test128.rs                  | 131 ++++++
 aesgcm/src/gf128.rs                        |  75 ++++
 aesgcm/src/gf128/test.rs                   |  79 ++++
 aesgcm/src/gf128_generic.rs                | 147 -------
 aesgcm/src/lib.rs                          | 480 ++++++++++-----------
 aesgcm/src/platform.rs                     |  14 +-
 aesgcm/src/platform/intel_ni/aes_core.rs   |  21 +-
 aesgcm/src/platform/intel_ni/gf128_core.rs |  17 +-
 aesgcm/src/platform/neon/aes_core.rs       |  19 +
 aesgcm/src/platform/neon/gf128_core.rs     |  15 +-
 aesgcm/src/platform/portable/aes_core.rs   |  26 +-
 aesgcm/src/platform/portable/gf128_core.rs |  28 +-
 aesgcm/tests/wycheproof.rs                 |  56 +--
 22 files changed, 930 insertions(+), 812 deletions(-)
 rename aesgcm/src/{aes_generic.rs => aes.rs} (87%)
 delete mode 100644 aesgcm/src/aes_ctr.rs
 delete mode 100644 aesgcm/src/aes_ctr/aes256_ctr.rs
 create mode 100644 aesgcm/src/ctr.rs
 rename aesgcm/src/{aes_ctr => ctr}/aes128_ctr.rs (63%)
 create mode 100644 aesgcm/src/ctr/aes256_ctr.rs
 create mode 100644 aesgcm/src/ctr/test128.rs
 create mode 100644 aesgcm/src/gf128.rs
 create mode 100644 aesgcm/src/gf128/test.rs
 delete mode 100644 aesgcm/src/gf128_generic.rs

diff --git a/aesgcm/benches/aesgcm.rs b/aesgcm/benches/aesgcm.rs
index 7d5c6c647..5dda968a0 100644
--- a/aesgcm/benches/aesgcm.rs
+++ b/aesgcm/benches/aesgcm.rs
@@ -14,7 +14,7 @@ pub fn fmt(x: usize) -> String {
 }
 
 macro_rules! impl_comp {
-    ($fun:ident, $portable_fun:expr, $neon_fun:expr, $intel_fun:expr, $rustcrypto_fun:expr) => {
+    ($fun:ident, $keylen:literal, $portable_fun:expr, $neon_fun:expr, $intel_fun:expr, $rustcrypto_fun:expr) => {
         // Comparing libcrux performance for different payload sizes and other implementations.
         fn $fun(c: &mut Criterion) {
             const PAYLOAD_SIZES: [usize; 3] = [128, 1024, 1024 * 1024 * 10];
@@ -31,7 +31,7 @@ macro_rules! impl_comp {
                         b.iter_batched(
                             || {
                                 (
-                                    randombytes(16),
+                                    randombytes($keylen),
                                     randombytes(12),
                                     randombytes(32),
                                     randombytes(*payload_size),
@@ -62,7 +62,7 @@ macro_rules! impl_comp {
                         b.iter_batched(
                             || {
                                 (
-                                    randombytes(16),
+                                    randombytes($keylen),
                                     randombytes(12),
                                     randombytes(32),
                                     randombytes(*payload_size),
@@ -86,7 +86,7 @@ macro_rules! impl_comp {
                         b.iter_batched(
                             || {
                                 (
-                                    randombytes(16),
+                                    randombytes($keylen),
                                     randombytes(12),
                                     randombytes(32),
                                     randombytes(*payload_size),
@@ -109,7 +109,7 @@ macro_rules! impl_comp {
                         b.iter_batched(
                             || {
                                 (
-                                    randombytes(16),
+                                    randombytes($keylen),
                                     randombytes(12),
                                     randombytes(32),
                                     randombytes(*payload_size),
@@ -137,10 +137,8 @@ macro_rules! impl_comp {
 }
 
 use aes_gcm::{
-    aead::{Aead, AeadCore, KeyInit, OsRng},
-    Aes128Gcm,
-    Key, // Or `Aes128Gcm`
-    Nonce,
+    aead::{Aead, KeyInit, Payload},
+    Aes128Gcm, Aes256Gcm,
 };
 use rand::RngCore;
 
@@ -148,26 +146,51 @@ fn rustcrypto_aes128_gcm_encrypt(
     key: &[u8],
     nonce: &[u8],
     aad: &[u8],
-    plain: &[u8],
+    msg: &[u8],
     ciphertext: &mut [u8],
     tag: &mut [u8],
 ) {
     let cipher = Aes128Gcm::new(key.into());
-    let ctxt = cipher.encrypt(nonce.into(), plain).unwrap();
-    ciphertext.copy_from_slice(&ctxt[0..plain.len()]);
-    tag.copy_from_slice(&ctxt[plain.len()..]);
+    let ctxt = cipher.encrypt(nonce.into(), Payload { msg, aad }).unwrap();
+    ciphertext.copy_from_slice(&ctxt[0..msg.len()]);
+    tag.copy_from_slice(&ctxt[msg.len()..]);
+}
+
+// XXX: We could work with the traits here, but this is quicker for now.
+fn rustcrypto_aes256_gcm_encrypt(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    msg: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    let cipher = Aes256Gcm::new(key.into());
+    let ctxt = cipher.encrypt(nonce.into(), Payload { msg, aad }).unwrap();
+    ciphertext.copy_from_slice(&ctxt[0..msg.len()]);
+    tag.copy_from_slice(&ctxt[msg.len()..]);
 }
 
 impl_comp!(
     AES128_GCM,
-    libcrux_aesgcm::portable::aes128_gcm_encrypt,
-    libcrux_aesgcm::neon::aes128_gcm_encrypt,
-    libcrux_aesgcm::intel_ni::aes128_gcm_encrypt,
+    16,
+    libcrux_aesgcm::portable::aes_gcm_128::encrypt,
+    libcrux_aesgcm::neon::aes_gcm_128::encrypt,
+    libcrux_aesgcm::intel_ni::aes_gcm_128::encrypt,
     rustcrypto_aes128_gcm_encrypt
 );
+impl_comp!(
+    AES256_GCM,
+    32,
+    libcrux_aesgcm::portable::aes_gcm_256::encrypt,
+    libcrux_aesgcm::neon::aes_gcm_256::encrypt,
+    libcrux_aesgcm::intel_ni::aes_gcm_256::encrypt,
+    rustcrypto_aes256_gcm_encrypt
+);
 
 fn benchmarks(c: &mut Criterion) {
     AES128_GCM(c);
+    AES256_GCM(c);
 }
 
 criterion_group!(benches, benchmarks);
diff --git a/aesgcm/src/aes_generic.rs b/aesgcm/src/aes.rs
similarity index 87%
rename from aesgcm/src/aes_generic.rs
rename to aesgcm/src/aes.rs
index f508d00a5..b6e460763 100644
--- a/aesgcm/src/aes_generic.rs
+++ b/aesgcm/src/aes.rs
@@ -1,3 +1,5 @@
+//! The AES block cipher function.
+
 use crate::platform::*;
 
 pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
@@ -10,6 +12,8 @@ pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 /// AES block size
 pub(crate) const AES_BLOCK_LEN: usize = 16;
 
+/// The AES block cipher function.
+#[inline]
 pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
     st: &mut T,
     keyex: &ExtendedKey<T, NUM_KEYS>,
diff --git a/aesgcm/src/aes_ctr.rs b/aesgcm/src/aes_ctr.rs
deleted file mode 100644
index a498602b1..000000000
--- a/aesgcm/src/aes_ctr.rs
+++ /dev/null
@@ -1,198 +0,0 @@
-use crate::{aes_generic::*, platform::AESState};
-
-mod aes128_ctr;
-mod aes256_ctr;
-
-pub(crate) use aes128_ctr::*;
-pub(crate) use aes256_ctr::*;
-
-const NONCE_LEN: usize = 16;
-
-/// Generic AES CTR context.
-pub(crate) struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
-    pub(crate) extended_key: ExtendedKey<T, NUM_KEYS>,
-    pub(crate) ctr_nonce: [u8; NONCE_LEN],
-}
-
-impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
-    fn aes_ctr_set_nonce(&mut self, nonce: &[u8]) {
-        debug_assert!(nonce.len() == crate::NONCE_LEN);
-
-        self.ctr_nonce[0..crate::NONCE_LEN].copy_from_slice(nonce);
-    }
-
-    fn aes_ctr_key_block(&self, ctr: u32, out: &mut [u8]) {
-        debug_assert!(out.len() == AES_BLOCK_LEN);
-
-        let mut st_init = self.ctr_nonce;
-        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-        let mut st = T::new();
-
-        st.load_block(&st_init);
-
-        block_cipher(&mut st, &self.extended_key);
-
-        st.store_block(out);
-    }
-
-    #[inline(always)]
-    fn aes_ctr_xor_block(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
-
-        let mut st_init = self.ctr_nonce;
-        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
-        let mut st = T::new();
-        st.load_block(&st_init);
-
-        block_cipher(&mut st, &self.extended_key);
-
-        st.xor_block(input, out);
-    }
-
-    fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len() && input.len() % AES_BLOCK_LEN == 0);
-        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
-
-        let blocks = input.len() / AES_BLOCK_LEN;
-        for i in 0..blocks {
-            let offset = i * AES_BLOCK_LEN;
-            self.aes_ctr_xor_block(
-                ctr.wrapping_add(i as u32),
-                &input[offset..offset + AES_BLOCK_LEN],
-                &mut out[offset..offset + AES_BLOCK_LEN],
-            );
-        }
-    }
-
-    fn aes_ctr_update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len());
-        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
-
-        let blocks = input.len() / AES_BLOCK_LEN;
-        self.aes_ctr_xor_blocks(
-            ctr,
-            &input[0..blocks * AES_BLOCK_LEN],
-            &mut out[0..blocks * AES_BLOCK_LEN],
-        );
-
-        let last = input.len() - input.len() % AES_BLOCK_LEN;
-        if last < input.len() {
-            self.aes_ctr_xor_block(
-                ctr.wrapping_add(blocks as u32),
-                &input[last..],
-                &mut out[last..],
-            );
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use crate::{aes_ctr::Aes128CtrContext, platform};
-
-    use super::test_utils::*;
-
-    const INPUT: [u8; 32] = [
-        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
-        0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D,
-        0x1E, 0x1F,
-    ];
-    const KEY: [u8; 16] = [
-        0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91,
-        0x63,
-    ];
-    const NONCE: [u8; 12] = [
-        0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B,
-    ];
-    const EXPECTED: [u8; 32] = [
-        0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3,
-        0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41,
-        0xBE, 0x28,
-    ];
-
-    #[test]
-    fn test_ctr_block() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = Aes128CtrContext::<platform::portable::State>::init(&KEY, &NONCE);
-        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
-        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    #[test]
-    fn test_ctr_block_neon() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        let ctx = Aes128CtrContext::<platform::neon::State>::init(&KEY, &NONCE);
-        aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
-        aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[test]
-    fn test_ctr_encrypt() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        aes128_ctr_encrypt::<platform::portable::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    #[test]
-    fn test_ctr_encrypt_neon() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        aes128_ctr_encrypt::<platform::neon::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
-    #[test]
-    fn test_ctr_encrypt_intel() {
-        let mut computed: [u8; 32] = [0u8; 32];
-        aes128_ctr_encrypt::<platform::intel_ni::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
-        for i in 0..32 {
-            if computed[i] != EXPECTED[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-}
diff --git a/aesgcm/src/aes_ctr/aes256_ctr.rs b/aesgcm/src/aes_ctr/aes256_ctr.rs
deleted file mode 100644
index e4bb42ace..000000000
--- a/aesgcm/src/aes_ctr/aes256_ctr.rs
+++ /dev/null
@@ -1,77 +0,0 @@
-use core::array::from_fn;
-
-use super::AesCtrContext;
-use crate::{aes_gcm_256::KEY_LEN, aes_generic::*, platform::AESState, NONCE_LEN};
-
-pub(crate) const NUM_KEYS: usize = 15;
-
-/// Type alias for the AES 256 ctr context.
-pub(crate) type Aes256CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
-
-impl<T: AESState> Aes256CtrContext<T> {
-    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(key.len() == KEY_LEN);
-
-        let mut ctr_nonce = [0u8; 16];
-        ctr_nonce[0..NONCE_LEN].copy_from_slice(nonce);
-
-        Self {
-            extended_key: key_expansion(key),
-            ctr_nonce,
-        }
-    }
-
-    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
-        debug_assert!(nonce.len() == NONCE_LEN);
-        self.aes_ctr_set_nonce(nonce);
-    }
-
-    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
-        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
-        self.aes_ctr_key_block(ctr, out);
-    }
-
-    pub(crate) fn update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len());
-        self.aes_ctr_update(ctr, input, out);
-    }
-}
-
-/// 256 - Key expansion
-fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
-    debug_assert!(key.len() == KEY_LEN);
-
-    let mut keyex = from_fn(|_| T::new());
-    keyex[0].load_block(&key[0..16]);
-    keyex[1].load_block(&key[16..32]);
-
-    macro_rules! expansion_step256 {
-        ($i:expr,$rcon:expr) => {
-            let prev0 = keyex[$i - 2].clone();
-            let prev1 = keyex[$i - 1].clone();
-
-            keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
-            keyex[$i].key_expansion_step(&prev0);
-
-            // XXX: avoid clone
-            let next0 = keyex[$i].clone();
-            keyex[$i + 1].aes_keygen_assist1(&next0);
-            keyex[$i + 1].key_expansion_step(&prev1);
-        };
-    }
-
-    expansion_step256!(2, 0x01);
-    expansion_step256!(4, 0x02);
-    expansion_step256!(6, 0x04);
-    expansion_step256!(8, 0x08);
-    expansion_step256!(10, 0x10);
-    expansion_step256!(12, 0x20);
-
-    let prev0 = keyex[12].clone();
-    let prev1 = keyex[13].clone();
-    keyex[14].aes_keygen_assist0::<0x40>(&prev1);
-    keyex[14].key_expansion_step(&prev0);
-
-    keyex
-}
diff --git a/aesgcm/src/aes_gcm_128.rs b/aesgcm/src/aes_gcm_128.rs
index 31fca7add..3ddeb4eb3 100644
--- a/aesgcm/src/aes_gcm_128.rs
+++ b/aesgcm/src/aes_gcm_128.rs
@@ -1,9 +1,9 @@
 #![allow(clippy::needless_range_loop)]
 
 use crate::{
-    aes_ctr::Aes128CtrContext,
-    aes_generic::AES_BLOCK_LEN,
-    gf128_generic::GF128State,
+    aes::AES_BLOCK_LEN,
+    ctr::Aes128CtrContext,
+    gf128::GF128State,
     platform::{AESState, GF128FieldElement},
     DecryptError, NONCE_LEN, TAG_LEN,
 };
@@ -18,9 +18,9 @@ pub(crate) struct State<T: AESState, U: GF128FieldElement> {
     pub(crate) tag_mix: [u8; TAG_LEN],
 }
 
-impl<T: AESState, U: GF128FieldElement> State<T, U> {
+impl<T: AESState, U: GF128FieldElement> super::State for State<T, U> {
     /// Initialize the state
-    pub(crate) fn init(key: &[u8]) -> Self {
+    fn init(key: &[u8]) -> Self {
         debug_assert!(key.len() == KEY_LEN);
 
         let nonce = [0u8; NONCE_LEN];
@@ -38,20 +38,14 @@ impl<T: AESState, U: GF128FieldElement> State<T, U> {
         }
     }
 
-    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+    fn set_nonce(&mut self, nonce: &[u8]) {
         debug_assert!(nonce.len() == NONCE_LEN);
 
         self.aes_state.set_nonce(nonce);
         self.aes_state.key_block(1, &mut self.tag_mix);
     }
 
-    pub(crate) fn encrypt(
-        &mut self,
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
+    fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]) {
         debug_assert!(ciphertext.len() == plaintext.len());
         debug_assert!(plaintext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
         debug_assert!(tag.len() == TAG_LEN);
@@ -73,7 +67,7 @@ impl<T: AESState, U: GF128FieldElement> State<T, U> {
         }
     }
 
-    pub(crate) fn decrypt(
+    fn decrypt(
         &mut self,
         aad: &[u8],
         ciphertext: &[u8],
diff --git a/aesgcm/src/aes_gcm_256.rs b/aesgcm/src/aes_gcm_256.rs
index 3616a0e6b..1a12e2a99 100644
--- a/aesgcm/src/aes_gcm_256.rs
+++ b/aesgcm/src/aes_gcm_256.rs
@@ -1,9 +1,9 @@
 #![allow(clippy::needless_range_loop)]
 
 use crate::{
-    aes_ctr::Aes256CtrContext,
-    aes_generic::AES_BLOCK_LEN,
-    gf128_generic::GF128State,
+    aes::AES_BLOCK_LEN,
+    ctr::Aes256CtrContext,
+    gf128::GF128State,
     platform::{AESState, GF128FieldElement},
     DecryptError, NONCE_LEN, TAG_LEN,
 };
@@ -19,9 +19,9 @@ pub(crate) struct State<T: AESState, U: GF128FieldElement> {
     pub(crate) tag_mix: [u8; TAG_LEN],
 }
 
-impl<T: AESState, U: GF128FieldElement> State<T, U> {
+impl<T: AESState, U: GF128FieldElement> super::State for State<T, U> {
     /// Initialize the state
-    pub(crate) fn init(key: &[u8]) -> Self {
+    fn init(key: &[u8]) -> Self {
         debug_assert!(key.len() == KEY_LEN);
 
         let nonce = [0u8; NONCE_LEN];
@@ -39,20 +39,14 @@ impl<T: AESState, U: GF128FieldElement> State<T, U> {
         }
     }
 
-    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+    fn set_nonce(&mut self, nonce: &[u8]) {
         debug_assert!(nonce.len() == NONCE_LEN);
 
         self.aes_state.set_nonce(nonce);
         self.aes_state.key_block(1, &mut self.tag_mix);
     }
 
-    pub(crate) fn encrypt(
-        &mut self,
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
+    fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]) {
         debug_assert!(ciphertext.len() == plaintext.len());
         debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
         debug_assert!(tag.len() == TAG_LEN);
@@ -74,7 +68,7 @@ impl<T: AESState, U: GF128FieldElement> State<T, U> {
         }
     }
 
-    pub(crate) fn decrypt(
+    fn decrypt(
         &mut self,
         aad: &[u8],
         ciphertext: &[u8],
diff --git a/aesgcm/src/ctr.rs b/aesgcm/src/ctr.rs
new file mode 100644
index 000000000..6a8d9e321
--- /dev/null
+++ b/aesgcm/src/ctr.rs
@@ -0,0 +1,103 @@
+//! AES ctr mode implementation.
+//!
+//! This implementation is generic over the [`AESState`], which has different,
+//! platform dependent implementations.
+//!
+//! This get's instantiated in [`aes128_ctr`] and [`aes256_ctr`].
+
+use crate::{aes::*, platform::AESState};
+
+#[cfg(test)]
+mod test128;
+
+mod aes128_ctr;
+mod aes256_ctr;
+
+pub(crate) use aes128_ctr::*;
+pub(crate) use aes256_ctr::*;
+
+/// The ctr nonce length. This is different from the AES nonce length
+/// [`crate::NONCE_LEN`].
+const NONCE_LEN: usize = 16;
+
+/// Generic AES CTR context.
+pub(crate) struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
+    pub(crate) extended_key: ExtendedKey<T, NUM_KEYS>,
+    pub(crate) ctr_nonce: [u8; NONCE_LEN],
+}
+
+impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
+    #[inline]
+    fn aes_ctr_set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == crate::NONCE_LEN);
+
+        self.ctr_nonce[0..crate::NONCE_LEN].copy_from_slice(nonce);
+    }
+
+    #[inline]
+    fn aes_ctr_key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN);
+
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
+
+        st.load_block(&st_init);
+
+        block_cipher(&mut st, &self.extended_key);
+
+        st.store_block(out);
+    }
+
+    #[inline]
+    fn aes_ctr_xor_block(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
+
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
+        st.load_block(&st_init);
+
+        block_cipher(&mut st, &self.extended_key);
+
+        st.xor_block(input, out);
+    }
+
+    #[inline]
+    fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() % AES_BLOCK_LEN == 0);
+        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(i as u32),
+                &input[offset..offset + AES_BLOCK_LEN],
+                &mut out[offset..offset + AES_BLOCK_LEN],
+            );
+        }
+    }
+
+    #[inline]
+    fn aes_ctr_update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.aes_ctr_xor_blocks(
+            ctr,
+            &input[0..blocks * AES_BLOCK_LEN],
+            &mut out[0..blocks * AES_BLOCK_LEN],
+        );
+
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
+        if last < input.len() {
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(blocks as u32),
+                &input[last..],
+                &mut out[last..],
+            );
+        }
+    }
+}
diff --git a/aesgcm/src/aes_ctr/aes128_ctr.rs b/aesgcm/src/ctr/aes128_ctr.rs
similarity index 63%
rename from aesgcm/src/aes_ctr/aes128_ctr.rs
rename to aesgcm/src/ctr/aes128_ctr.rs
index 41b9f437c..2db42ebb5 100644
--- a/aesgcm/src/aes_ctr/aes128_ctr.rs
+++ b/aesgcm/src/ctr/aes128_ctr.rs
@@ -1,7 +1,9 @@
+//! AES128 ctr mode, generic over the platform [`AESState`].
+
 use core::array::from_fn;
 
 use super::AesCtrContext;
-use crate::{aes_gcm_128::KEY_LEN, aes_generic::*, platform::AESState, NONCE_LEN};
+use crate::{aes_gcm_128::KEY_LEN, aes::*, platform::AESState, NONCE_LEN};
 
 pub(super) const NUM_KEYS: usize = 11;
 
@@ -9,6 +11,7 @@ pub(super) const NUM_KEYS: usize = 11;
 pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
 
 impl<T: AESState> Aes128CtrContext<T> {
+    #[inline]
     pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
         debug_assert!(nonce.len() == NONCE_LEN);
         debug_assert!(key.len() == KEY_LEN);
@@ -22,18 +25,21 @@ impl<T: AESState> Aes128CtrContext<T> {
         }
     }
 
+    #[inline]
     pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
         debug_assert!(nonce.len() == NONCE_LEN);
 
         self.aes_ctr_set_nonce(nonce);
     }
 
+    #[inline]
     pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
         debug_assert!(out.len() == KEY_LEN);
 
         self.aes_ctr_key_block(ctr, out);
     }
 
+    #[inline]
     pub(crate) fn update(&self, ctr: u32, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len());
 
@@ -42,6 +48,7 @@ impl<T: AESState> Aes128CtrContext<T> {
 }
 
 /// 128 - Key expansion
+#[inline]
 fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
     debug_assert!(key.len() == KEY_LEN);
 
@@ -50,9 +57,11 @@ fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
 
     macro_rules! expansion_step128 {
         ($i:expr,$rcon:expr) => {
-            let prev = keyex[$i - 1].clone();
-            keyex[$i].aes_keygen_assist0::<$rcon>(&prev);
-            keyex[$i].key_expansion_step(&prev);
+            // For hax we could clone here instead.
+            // let prev = keyex[$i - 1].clone();
+            let (prev, current) = keyex.split_at_mut($i);
+            current[0].aes_keygen_assist0::<$rcon>(&prev[$i - 1]);
+            current[0].key_expansion_step(&prev[$i - 1]);
         };
     }
 
@@ -69,32 +78,3 @@ fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
 
     keyex
 }
-
-#[cfg(test)]
-pub(crate) mod test_utils {
-    use super::*;
-
-    pub(crate) fn aes128_ctr_xor_block<T: AESState>(
-        ctx: &Aes128CtrContext<T>,
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        ctx.aes_ctr_xor_block(ctr, inp, out);
-    }
-
-    pub(crate) fn aes128_ctr_encrypt<T: AESState>(
-        key: &[u8],
-        nonce: &[u8],
-        ctr: u32,
-        inp: &[u8],
-        out: &mut [u8],
-    ) {
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(key.len() == KEY_LEN);
-        debug_assert!(inp.len() == out.len());
-        let ctx = Aes128CtrContext::<T>::init(key, nonce);
-        ctx.update(ctr, inp, out);
-    }
-}
diff --git a/aesgcm/src/ctr/aes256_ctr.rs b/aesgcm/src/ctr/aes256_ctr.rs
new file mode 100644
index 000000000..ae1e3496c
--- /dev/null
+++ b/aesgcm/src/ctr/aes256_ctr.rs
@@ -0,0 +1,107 @@
+//! AES256 ctr mode, generic over the platform [`AESState`].
+
+use core::array::from_fn;
+
+use super::AesCtrContext;
+use crate::{aes_gcm_256::KEY_LEN, aes::*, platform::AESState, NONCE_LEN};
+
+pub(crate) const NUM_KEYS: usize = 15;
+
+/// Type alias for the AES 256 ctr context.
+pub(crate) type Aes256CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
+
+impl<T: AESState> Aes256CtrContext<T> {
+    #[inline]
+    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == KEY_LEN);
+
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..NONCE_LEN].copy_from_slice(nonce);
+
+        Self {
+            extended_key: key_expansion(key),
+            ctr_nonce,
+        }
+    }
+
+    #[inline]
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        self.aes_ctr_set_nonce(nonce);
+    }
+
+    #[inline]
+    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
+        self.aes_ctr_key_block(ctr, out);
+    }
+
+    #[inline]
+    pub(crate) fn update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        self.aes_ctr_update(ctr, input, out);
+    }
+}
+
+/// 256 - Key expansion
+#[inline]
+fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
+    debug_assert!(key.len() == KEY_LEN);
+
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(&key[0..16]);
+    keyex[1].load_block(&key[16..32]);
+
+    macro_rules! expansion_step256 {
+        ($i:expr,$rcon:expr) => {
+            // Split at $i to get the one we currently look at and the previous
+            // blocks.
+            let (prev, current) = keyex.split_at_mut($i);
+
+            // Split again to get the $i and $i + 1 states to operate on.
+            let (c0, c1) = current.split_at_mut(1);
+            let key_i = &mut c0[0];
+            let key_i_plus_1 = &mut c1[0];
+
+            key_i.aes_keygen_assist0::<$rcon>(&prev[$i - 1]);
+            key_i.key_expansion_step(&prev[$i - 2]);
+
+            key_i_plus_1.aes_keygen_assist1(&key_i);
+            key_i_plus_1.key_expansion_step(&prev[$i - 1]);
+
+            // The following is what will go through hax right now. But it
+            // requires copies that are really not necessary.
+            // let prev0 = keyex[$i - 2].clone();
+            // let prev1 = keyex[$i - 1].clone();
+
+            // keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
+            // keyex[$i].key_expansion_step(&prev0);
+
+            // let next0 = keyex[$i].clone();
+            // keyex[$i + 1].aes_keygen_assist1(&next0);
+            // keyex[$i + 1].key_expansion_step(&prev1);
+        };
+    }
+
+    expansion_step256!(2, 0x01);
+    expansion_step256!(4, 0x02);
+    expansion_step256!(6, 0x04);
+    expansion_step256!(8, 0x08);
+    expansion_step256!(10, 0x10);
+    expansion_step256!(12, 0x20);
+
+    let (prev0, tmp) = keyex.split_at_mut(13);
+    let (prev1, last) = tmp.split_at_mut(1);
+    // let prev0 = &mut prev0[12];
+    // let prev1 = &mut prev1[0];
+    // let last = &mut last[0];
+    // To get through hax right now we'd have to clone instead.
+    // let prev0 = keyex[12].clone();
+    // let prev1 = keyex[13].clone();
+    // let last = &mut keyex[NUM_KEYS - 1];
+    last[0].aes_keygen_assist0::<0x40>(&prev1[0]);
+    last[0].key_expansion_step(&prev0[12]);
+
+    keyex
+}
diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
new file mode 100644
index 000000000..439414a44
--- /dev/null
+++ b/aesgcm/src/ctr/test128.rs
@@ -0,0 +1,131 @@
+use crate::{
+    ctr::Aes128CtrContext,
+    aes_gcm_128::KEY_LEN,
+    platform::{self, AESState},
+    NONCE_LEN,
+};
+
+pub(crate) fn aes128_ctr_xor_block<T: AESState>(
+    ctx: &Aes128CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    ctx.aes_ctr_xor_block(ctr, inp, out);
+}
+
+pub(crate) fn aes128_ctr_encrypt<T: AESState>(
+    key: &[u8],
+    nonce: &[u8],
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(nonce.len() == NONCE_LEN);
+    debug_assert!(key.len() == KEY_LEN);
+    debug_assert!(inp.len() == out.len());
+    let ctx = Aes128CtrContext::<T>::init(key, nonce);
+    ctx.update(ctr, inp, out);
+}
+
+const INPUT: [u8; 32] = [
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+];
+const KEY: [u8; 16] = [
+    0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91, 0x63,
+];
+const NONCE: [u8; 12] = [
+    0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B,
+];
+const EXPECTED: [u8; 32] = [
+    0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3, 0x88,
+    0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41, 0xBE, 0x28,
+];
+
+#[test]
+fn test_ctr_block() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    let ctx = Aes128CtrContext::<platform::portable::State>::init(&KEY, &NONCE);
+    aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+    aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[test]
+fn test_ctr_block_neon() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    let ctx = Aes128CtrContext::<platform::neon::State>::init(&KEY, &NONCE);
+    aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+    aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[test]
+fn test_ctr_encrypt() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::portable::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[test]
+fn test_ctr_encrypt_neon() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::neon::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
+#[test]
+fn test_ctr_encrypt_intel() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::intel_ni::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
diff --git a/aesgcm/src/gf128.rs b/aesgcm/src/gf128.rs
new file mode 100644
index 000000000..26d4307dc
--- /dev/null
+++ b/aesgcm/src/gf128.rs
@@ -0,0 +1,75 @@
+//! Generic Gf128 implementation.
+//!
+//! Generic over platform dependent [`GF128FieldElement`].
+
+use crate::{aes::AES_BLOCK_LEN, platform::*};
+
+#[cfg(test)]
+mod test;
+
+/// Generic Gf128 state.
+pub(crate) struct GF128State<T: GF128FieldElement> {
+    accumulator: T,
+    r: T,
+}
+
+const KEY_LEN: usize = 16;
+
+impl<T: GF128FieldElement> GF128State<T> {
+    #[inline]
+    pub(crate) fn init(key: &[u8]) -> Self {
+        debug_assert!(key.len() == KEY_LEN);
+
+        Self {
+            accumulator: T::zero(),
+            r: T::load_element(key),
+        }
+    }
+
+    #[inline]
+    pub(crate) fn update(&mut self, block: &[u8]) {
+        debug_assert!(block.len() == KEY_LEN);
+
+        let block_elem = T::load_element(block);
+        self.accumulator.add(&block_elem);
+        self.accumulator.mul(&self.r);
+    }
+
+    #[inline]
+    pub(crate) fn update_blocks(&mut self, input: &[u8]) {
+        debug_assert!(input.len() % 16 == 0);
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
+            self.update(&input[offset..offset + AES_BLOCK_LEN]);
+        }
+    }
+
+    #[inline]
+    pub(crate) fn update_last(&mut self, partial_block: &[u8]) {
+        debug_assert!(partial_block.len() < 16);
+
+        let mut block = [0u8; 16];
+        block[0..partial_block.len()].copy_from_slice(partial_block);
+        self.update(&block);
+    }
+
+    #[inline]
+    pub(crate) fn update_padded(&mut self, input: &[u8]) {
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.update_blocks(&input[0..blocks * AES_BLOCK_LEN]);
+
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
+        if last < input.len() {
+            self.update_last(&input[last..]);
+        }
+    }
+
+    #[inline]
+    pub(crate) fn emit(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+
+        self.accumulator.store_element(out);
+    }
+}
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
new file mode 100644
index 000000000..b755bd68e
--- /dev/null
+++ b/aesgcm/src/gf128/test.rs
@@ -0,0 +1,79 @@
+use super::*;
+
+fn gf128<T: GF128FieldElement>(key: &[u8], input: &[u8], out: &mut [u8]) {
+    debug_assert!(key.len() == 16);
+    debug_assert!(out.len() == 16);
+
+    let mut st = GF128State::<T>::init(key);
+    st.update_padded(input);
+    st.emit(out);
+}
+
+const INPUT: [u8; 132] = [
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xab, 0xad, 0xda, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
+    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
+    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
+    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
+    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4, 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
+    0x44, 0xae, 0x7e, 0x3f,
+];
+
+const KEY: [u8; 16] = [
+    0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7,
+];
+
+const EXPECTED: [u8; 16] = [
+    0xfb, 0xba, 0xaa, 0x70, 0xa0, 0x73, 0x6f, 0xf9, 0xed, 0x2f, 0xc4, 0x62, 0xde, 0x72, 0x61, 0xe0,
+];
+
+#[test]
+fn test_gf128() {
+    let mut computed: [u8; 16] = [0u8; 16];
+    gf128::<crate::platform::portable::FieldElement>(&KEY, &INPUT, &mut computed);
+    for i in 0..16 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[test]
+fn test_gf128_neon() {
+    let mut computed: [u8; 16] = [0u8; 16];
+    gf128::<crate::platform::neon::FieldElement>(&KEY, &INPUT, &mut computed);
+    for i in 0..16 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
+#[test]
+fn test_gf128_intel() {
+    let mut computed: [u8; 16] = [0u8; 16];
+    gf128::<crate::platform::intel_ni::FieldElement>(&KEY, &INPUT, &mut computed);
+    for i in 0..16 {
+        if computed[i] != EXPECTED[i] {
+            println!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i, EXPECTED[i], computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
diff --git a/aesgcm/src/gf128_generic.rs b/aesgcm/src/gf128_generic.rs
deleted file mode 100644
index 4c26170d6..000000000
--- a/aesgcm/src/gf128_generic.rs
+++ /dev/null
@@ -1,147 +0,0 @@
-use crate::{aes_generic::AES_BLOCK_LEN, platform::*};
-
-pub(crate) struct GF128State<T: GF128FieldElement> {
-    accumulator: T,
-    r: T,
-}
-
-const KEY_LEN: usize = 16;
-
-impl<T: GF128FieldElement> GF128State<T> {
-    pub(crate) fn init(key: &[u8]) -> Self {
-        debug_assert!(key.len() == KEY_LEN);
-
-        Self {
-            accumulator: T::zero(),
-            r: T::load_element(key),
-        }
-    }
-
-    #[inline(always)]
-    pub(crate) fn update(&mut self, block: &[u8]) {
-        debug_assert!(block.len() == KEY_LEN);
-
-        let block_elem = T::load_element(block);
-        self.accumulator.add(&block_elem);
-        self.accumulator.mul(&self.r);
-    }
-
-    pub(crate) fn update_blocks(&mut self, input: &[u8]) {
-        debug_assert!(input.len() % 16 == 0);
-
-        let blocks = input.len() / AES_BLOCK_LEN;
-        for i in 0..blocks {
-            let offset = i * AES_BLOCK_LEN;
-            self.update(&input[offset..offset + AES_BLOCK_LEN]);
-        }
-    }
-
-    pub(crate) fn update_last(&mut self, partial_block: &[u8]) {
-        debug_assert!(partial_block.len() < 16);
-
-        let mut block = [0u8; 16];
-        block[0..partial_block.len()].copy_from_slice(partial_block);
-        self.update(&block);
-    }
-
-    pub(crate) fn update_padded(&mut self, input: &[u8]) {
-        let blocks = input.len() / AES_BLOCK_LEN;
-        self.update_blocks(&input[0..blocks * AES_BLOCK_LEN]);
-
-        let last = input.len() - input.len() % AES_BLOCK_LEN;
-        if last < input.len() {
-            self.update_last(&input[last..]);
-        }
-    }
-
-    pub(crate) fn emit(&self, out: &mut [u8]) {
-        debug_assert!(out.len() == 16);
-
-        self.accumulator.store_element(out);
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    fn gf128<T: GF128FieldElement>(key: &[u8], input: &[u8], out: &mut [u8]) {
-        debug_assert!(key.len() == 16);
-        debug_assert!(out.len() == 16);
-
-        let mut st = GF128State::<T>::init(key);
-        st.update_padded(input);
-        st.emit(out);
-    }
-
-    const INPUT: [u8; 132] = [
-        0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe,
-        0xef, 0xab, 0xad, 0xda, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65,
-        0x9e, 0x2a, 0x20, 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f,
-        0x6f, 0x74, 0x6b, 0xf4, 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x2d, 0xa3, 0xeb,
-        0xf1, 0xc5, 0xd8, 0x2c, 0xde, 0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, 0x5a, 0x8d,
-        0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, 0xee,
-        0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
-        0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x44, 0xae, 0x7e, 0x3f,
-    ];
-
-    const KEY: [u8; 16] = [
-        0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda,
-        0xd7,
-    ];
-
-    const EXPECTED: [u8; 16] = [
-        0xfb, 0xba, 0xaa, 0x70, 0xa0, 0x73, 0x6f, 0xf9, 0xed, 0x2f, 0xc4, 0x62, 0xde, 0x72, 0x61,
-        0xe0,
-    ];
-
-    #[test]
-    fn test_gf128() {
-        let mut computed: [u8; 16] = [0u8; 16];
-        gf128::<crate::platform::portable::FieldElement>(&KEY, &INPUT, &mut computed);
-        for i in 0..16 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    #[test]
-    fn test_gf128_neon() {
-        let mut computed: [u8; 16] = [0u8; 16];
-        gf128::<crate::platform::neon::FieldElement>(&KEY, &INPUT, &mut computed);
-        for i in 0..16 {
-            if computed[i] != EXPECTED[i] {
-                #[cfg(feature = "std")]
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-
-    #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
-    #[test]
-    fn test_gf128_intel() {
-        let mut computed: [u8; 16] = [0u8; 16];
-        gf128::<crate::platform::intel_ni::FieldElement>(&KEY, &INPUT, &mut computed);
-        for i in 0..16 {
-            if computed[i] != EXPECTED[i] {
-                println!(
-                    "mismatch at {}: expected is {}, computed is {}",
-                    i, EXPECTED[i], computed[i]
-                );
-                assert!(false);
-            }
-        }
-    }
-}
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 73e7f791d..081f62264 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,8 +1,8 @@
 #![no_std]
 
-mod aes_ctr;
-mod aes_generic;
-mod gf128_generic;
+mod aes;
+mod ctr;
+mod gf128;
 mod platform;
 
 mod aes_gcm_128;
@@ -10,6 +10,21 @@ mod aes_gcm_256;
 
 pub use libcrux_traits::aead::arrayref::Aead;
 
+/// Trait for an AES State.
+/// Implemented for 128 and 256.
+pub(crate) trait State {
+    fn init(key: &[u8]) -> Self;
+    fn set_nonce(&mut self, nonce: &[u8]);
+    fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]);
+    fn decrypt(
+        &mut self,
+        aad: &[u8],
+        ciphertext: &[u8],
+        tag: &[u8],
+        plaintext: &mut [u8],
+    ) -> Result<(), DecryptError>;
+}
+
 /// AES-GCM decryption error.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct DecryptError();
@@ -32,265 +47,244 @@ pub struct X64AesGcm128 {}
 #[cfg(not(target_arch = "x86_64"))]
 pub type X64AesGcm128 = PortableAesGcm128;
 
+/// AES-GCM 256.
+pub struct AesGcm256 {}
+
+/// Portable AES-GCM 256.
+pub struct PortableAesGcm256 {}
+
+/// Neon AES-GCM 256.
+#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+pub struct NeonAesGcm256 {}
+
+/// Neon AES-GCM 256.
+#[cfg(not(all(target_arch = "aarch64", target_feature = "aes")))]
+pub type NeonAesGcm256 = PortableAesGcm256;
+
+/// AES-NI AES-GCM 256.
+#[cfg(target_arch = "x86_64")]
+pub struct X64AesGcm256 {}
+
+/// AES-NI AES-GCM 256.
+#[cfg(not(target_arch = "x86_64"))]
+pub type X64AesGcm256 = PortableAesGcm256;
+
 /// Tag length.
 pub(crate) const TAG_LEN: usize = 16;
 
 /// Nonce length.
 pub(crate) const NONCE_LEN: usize = 12;
 
-mod aes128 {
-    use super::*;
-    use aes_gcm_128::KEY_LEN;
-    use libcrux_traits::aead::arrayref::{DecryptError, EncryptError};
-
-    pub type Key = [u8; KEY_LEN];
-    pub type Tag = [u8; TAG_LEN];
-    pub type Nonce = [u8; NONCE_LEN];
-
-    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for AesGcm128 {
-        fn encrypt(
-            ciphertext: &mut [u8],
-            tag: &mut Tag,
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            plaintext: &[u8],
-        ) -> Result<(), EncryptError> {
-            if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-                NeonAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-            } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-                X64AesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-            } else {
-                PortableAesGcm128::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-            }
-        }
+/// Generic AES-GCM encrypt.
+pub(crate) fn encrypt<S: State>(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    plaintext: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    // XXX: debug_assert!(key.len() == KEY_LEN);
+    debug_assert!(nonce.len() == NONCE_LEN);
+    debug_assert!(tag.len() == TAG_LEN);
+
+    let mut st = S::init(key);
+    st.set_nonce(nonce);
+    st.encrypt(aad, plaintext, ciphertext, tag);
+}
 
-        fn decrypt(
-            plaintext: &mut [u8],
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            ciphertext: &[u8],
-            tag: &Tag,
-        ) -> Result<(), DecryptError> {
-            if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-                NeonAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-            } else if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-                X64AesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-            } else {
-                PortableAesGcm128::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-            }
-        }
-    }
-
-    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for PortableAesGcm128 {
-        fn encrypt(
-            ciphertext: &mut [u8],
-            tag: &mut Tag,
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            plaintext: &[u8],
-        ) -> Result<(), EncryptError> {
-            portable::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-            Ok(())
-        }
+/// Generic AES-GCM decrypt.
+pub(crate) fn decrypt<S: State>(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    ciphertext: &[u8],
+    tag: &[u8],
+    plaintext: &mut [u8],
+) -> Result<(), DecryptError> {
+    // XXX: debug_assert!(key.len() == KEY_LEN);
+    debug_assert!(nonce.len() == NONCE_LEN);
+    debug_assert!(tag.len() == TAG_LEN);
+
+    let mut st = S::init(key);
+    st.set_nonce(nonce);
+    st.decrypt(aad, ciphertext, tag, plaintext)
+}
 
-        fn decrypt(
-            plaintext: &mut [u8],
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            ciphertext: &[u8],
-            tag: &Tag,
-        ) -> Result<(), DecryptError> {
-            portable::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-                .map_err(|_| DecryptError::InvalidTag)
-        }
-    }
-
-    #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-    impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for NeonAesGcm128 {
-        fn encrypt(
-            ciphertext: &mut [u8],
-            tag: &mut Tag,
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            plaintext: &[u8],
-        ) -> Result<(), EncryptError> {
-            neon::aes128_gcm_encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-            Ok(())
-        }
+/// Macro to instantiate the different variants, both 128/256 and platforms.
+macro_rules! pub_mod {
+    ($variant_comment:literal, $mod_name:ident, $state:ty) => {
+        #[doc = $variant_comment]
+        pub mod $mod_name {
+            use crate::$mod_name::KEY_LEN;
+            use crate::{platform, DecryptError};
+
+            type State = $state;
+
+            #[doc = $variant_comment]
+            /// encrypt.
+            pub fn encrypt(
+                key: &[u8],
+                nonce: &[u8],
+                aad: &[u8],
+                plaintext: &[u8],
+                ciphertext: &mut [u8],
+                tag: &mut [u8],
+            ) {
+                debug_assert!(key.len() == KEY_LEN);
+                crate::encrypt::<State>(key, nonce, aad, plaintext, ciphertext, tag);
+            }
 
-        fn decrypt(
-            plaintext: &mut [u8],
-            key: &Key,
-            nonce: &Nonce,
-            aad: &[u8],
-            ciphertext: &[u8],
-            tag: &Tag,
-        ) -> Result<(), DecryptError> {
-            neon::aes128_gcm_decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-                .map_err(|_| DecryptError::InvalidTag)
+            #[doc = $variant_comment]
+            /// decrypt.
+            pub fn decrypt(
+                key: &[u8],
+                nonce: &[u8],
+                aad: &[u8],
+                ciphertext: &[u8],
+                tag: &[u8],
+                plaintext: &mut [u8],
+            ) -> Result<(), DecryptError> {
+                debug_assert!(key.len() == KEY_LEN);
+                crate::decrypt::<State>(key, nonce, aad, ciphertext, tag, plaintext)
+            }
         }
-    }
+    };
 }
 
 pub mod portable {
-    use crate::{
-        aes_gcm_128::{self},
-        aes_gcm_256::{self},
-        platform, DecryptError, NONCE_LEN, TAG_LEN,
-    };
-
-    // XXX: It doesn't really make sense to have these states. We should abstract
-    // this differently
-
-    type Aes128State =
-        aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>;
-
-    type Aes256State =
-        aes_gcm_256::State<platform::portable::State, platform::portable::FieldElement>;
-
-    pub fn aes128_gcm_encrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
-        debug_assert!(key.len() == aes_gcm_128::KEY_LEN);
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        let mut st = Aes128State::init(key);
-        st.set_nonce(nonce);
-        st.encrypt(aad, plaintext, ciphertext, tag);
-    }
-
-    pub fn aes128_gcm_decrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        debug_assert!(key.len() == aes_gcm_128::KEY_LEN);
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        let mut st = Aes128State::init(key);
-        st.set_nonce(nonce);
-        st.decrypt(aad, ciphertext, tag, plaintext)
-    }
-
-    pub fn aes256_gcm_encrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
-        debug_assert!(key.len() == aes_gcm_256::KEY_LEN);
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        let mut st = Aes256State::init(key);
-        st.set_nonce(nonce);
-        st.encrypt(aad, plaintext, ciphertext, tag);
-    }
-
-    pub fn aes256_gcm_decrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        debug_assert!(key.len() == aes_gcm_256::KEY_LEN);
-        debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        let mut st = Aes256State::init(key);
-        st.set_nonce(nonce);
-        st.decrypt(aad, ciphertext, tag, plaintext)
-    }
+    pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>);
+    pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::portable::State, platform::portable::FieldElement>);
 }
 
 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
 pub mod neon {
-    use crate::{platform, DecryptError};
+    pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::neon::State, platform::neon::FieldElement>);
+    pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::neon::State, platform::neon::FieldElement>);
+}
 
-    type State = crate::aes_gcm_128::State<platform::neon::State, platform::neon::FieldElement>;
+/// Macro to implement the different structs and multiplexing.
+macro_rules! api {
+    ($mod_name:ident, $variant:ident, $multiplexing:ty, $portable:ident, $neon:ident, $x64:ident) => {
+        mod $mod_name {
+            use super::*;
+            use libcrux_traits::aead::arrayref::{DecryptError, EncryptError};
+            use $variant::KEY_LEN;
+
+            pub type Key = [u8; KEY_LEN];
+            pub type Tag = [u8; TAG_LEN];
+            pub type Nonce = [u8; NONCE_LEN];
+
+            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
+                fn encrypt(
+                    ciphertext: &mut [u8],
+                    tag: &mut Tag,
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    plaintext: &[u8],
+                ) -> Result<(), EncryptError> {
+                    if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+                        $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                    } else if libcrux_platform::simd256_support()
+                        && libcrux_platform::aes_ni_support()
+                    {
+                        $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                    } else {
+                        $portable::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                    }
+                }
+
+                fn decrypt(
+                    plaintext: &mut [u8],
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &Tag,
+                ) -> Result<(), DecryptError> {
+                    if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
+                        $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                    } else if libcrux_platform::simd256_support()
+                        && libcrux_platform::aes_ni_support()
+                    {
+                        $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                    } else {
+                        $portable::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                    }
+                }
+            }
 
-    pub fn aes128_gcm_encrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
-        let mut st = State::init(key);
-        st.set_nonce(nonce);
-        st.encrypt(aad, plaintext, ciphertext, tag);
-    }
-
-    pub fn aes128_gcm_decrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        let mut st = State::init(key);
-        st.set_nonce(nonce);
-        st.decrypt(aad, ciphertext, tag, plaintext)
-    }
-}
+            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
+                fn encrypt(
+                    ciphertext: &mut [u8],
+                    tag: &mut Tag,
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    plaintext: &[u8],
+                ) -> Result<(), EncryptError> {
+                    portable::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                    Ok(())
+                }
+
+                fn decrypt(
+                    plaintext: &mut [u8],
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &Tag,
+                ) -> Result<(), DecryptError> {
+                    portable::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                        .map_err(|_| DecryptError::InvalidTag)
+                }
+            }
 
-#[cfg(target_arch = "x86_64")] // REENABLE target_feature="aes"
-pub mod intel_ni {
-    use crate::{
-        aes_gcm::{self, DecryptError},
-        platform,
+            #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
+                fn encrypt(
+                    ciphertext: &mut [u8],
+                    tag: &mut Tag,
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    plaintext: &[u8],
+                ) -> Result<(), EncryptError> {
+                    neon::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                    Ok(())
+                }
+
+                fn decrypt(
+                    plaintext: &mut [u8],
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &Tag,
+                ) -> Result<(), DecryptError> {
+                    neon::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                        .map_err(|_| DecryptError::InvalidTag)
+                }
+            }
+        }
     };
-
-    pub fn aes128_gcm_encrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        plaintext: &[u8],
-        ciphertext: &mut [u8],
-        tag: &mut [u8],
-    ) {
-        let mut st = aes_gcm::aes128_gcm_init::<
-            platform::intel_ni::State,
-            platform::intel_ni::FieldElement,
-        >(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes128_gcm_encrypt(&mut st, aad, plaintext, ciphertext, tag);
-    }
-
-    pub fn aes128_gcm_decrypt(
-        key: &[u8],
-        nonce: &[u8],
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        let mut st = aes_gcm::aes128_gcm_init::<
-            platform::intel_ni::State,
-            platform::intel_ni::FieldElement,
-        >(key);
-        aes_gcm::aes128_gcm_set_nonce(&mut st, nonce);
-        aes_gcm::aes_gcm_128::aes128_gcm_decrypt(&mut st, aad, ciphertext, tag, plaintext)
-    }
 }
+
+api!(
+    aes128,
+    aes_gcm_128,
+    AesGcm128,
+    PortableAesGcm128,
+    NeonAesGcm128,
+    X64AesGcm128
+);
+
+api!(
+    aes256,
+    aes_gcm_256,
+    AesGcm256,
+    PortableAesGcm256,
+    NeonAesGcm256,
+    X64AesGcm256
+);
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
index 29d2d4b97..8657dd2c6 100644
--- a/aesgcm/src/platform.rs
+++ b/aesgcm/src/platform.rs
@@ -1,12 +1,15 @@
-pub mod portable;
+//! Traits for platform dependent implementations
+
+pub(crate) mod portable;
 
 #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
-pub mod neon;
+pub(crate) mod neon;
 
 #[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
-pub mod intel_ni;
+pub(crate) mod intel_ni;
 
-pub trait AESState: Clone + core::fmt::Debug {
+/// The AES state.
+pub(crate) trait AESState: Clone + core::fmt::Debug {
     fn new() -> Self;
     fn load_block(&mut self, b: &[u8]);
     fn store_block(&self, out: &mut [u8]);
@@ -20,7 +23,8 @@ pub trait AESState: Clone + core::fmt::Debug {
     fn key_expansion_step(&mut self, prev: &Self);
 }
 
-pub trait GF128FieldElement {
+/// A gf128 field element.
+pub(crate) trait GF128FieldElement {
     fn zero() -> Self;
     fn load_element(bytes: &[u8]) -> Self;
     fn store_element(&self, bytes: &mut [u8]);
diff --git a/aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/intel_ni/aes_core.rs
index 327794bee..d94eca869 100644
--- a/aesgcm/src/platform/intel_ni/aes_core.rs
+++ b/aesgcm/src/platform/intel_ni/aes_core.rs
@@ -1,37 +1,46 @@
 use core::arch::x86_64::*;
 
-pub(crate) type State = __m128i;
+/// The avx2 state.
+pub(super) type State = __m128i;
 
+#[inline]
 fn new_state() -> State {
     unsafe { _mm_setzero_si128() }
 }
 
+#[inline]
 fn xor_key1_state(st: &mut State, k: &State) {
     unsafe { *st = _mm_xor_si128(*st, *k) }
 }
 
+#[inline]
 fn aes_enc(st: &mut State, key: &State) {
     unsafe { *st = _mm_aesenc_si128(*st, *key) }
 }
 
+#[inline]
 fn aes_enc_last(st: &mut State, key: &State) {
     unsafe { *st = _mm_aesenclast_si128(*st, *key) }
 }
 
+#[inline]
 fn aes_keygen_assist<const RCON: i32>(next: &mut State, prev: &State) {
     unsafe { *next = _mm_aeskeygenassist_si128::<RCON>(*prev) }
 }
 
+#[inline]
 fn aes_keygen_assist0<const RCON: i32>(next: &mut State, prev: &State) {
     aes_keygen_assist::<RCON>(next, prev);
     unsafe { *next = _mm_shuffle_epi32(*next, 0xff) }
 }
 
+#[inline]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist::<0>(next, prev);
     unsafe { *next = _mm_shuffle_epi32(*next, 0xaa) }
 }
 
+#[inline]
 fn key_expansion_step(next: &mut State, prev: &State) {
     unsafe {
         let p0 = _mm_xor_si128(*prev, _mm_slli_si128(*prev, 4));
@@ -42,20 +51,24 @@ fn key_expansion_step(next: &mut State, prev: &State) {
 }
 
 impl crate::platform::AESState for State {
+    #[inline]
     fn new() -> Self {
         new_state()
     }
 
+    #[inline]
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
         unsafe { *self = _mm_loadu_si128(b.as_ptr() as *const __m128i) };
     }
 
+    #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
         unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, *self) }
     }
 
+    #[inline]
     fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
         let inp_vec = unsafe { _mm_loadu_si128(inp.as_ptr() as *const __m128i) };
@@ -63,27 +76,33 @@ impl crate::platform::AESState for State {
         unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, out_vec) }
     }
 
+    #[inline]
     fn xor_key(&mut self, key: &Self) {
         xor_key1_state(self, key);
     }
 
+    #[inline]
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
         (self, key);
     }
 
+    #[inline]
     fn aes_enc_last(&mut self, key: &Self) {
         aes_enc_last(self, key);
     }
 
+    #[inline]
     fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0::<RCON>(self, prev);
     }
 
+    #[inline]
     fn aes_keygen_assist1(&mut self, prev: &Self) {
         aes_keygen_assist1(self, prev);
     }
 
+    #[inline]
     fn key_expansion_step(&mut self, prev: &Self) {
         key_expansion_step(self, prev)
     }
diff --git a/aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/intel_ni/gf128_core.rs
index 139aeb438..7fd9f5a65 100644
--- a/aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -1,28 +1,34 @@
 use core::arch::x86_64::*;
 
-// A lot of the code below is shared with NEON. Refactor!
+// XXX: A lot of the code below is shared with NEON. Refactor!
 
+/// An avx2 gf128 field element.
 #[derive(Clone, Copy)]
-pub struct FieldElement(pub u128);
+pub(super) struct FieldElement(pub(super) u128);
 
+#[inline]
 fn zero() -> FieldElement {
     FieldElement(0)
 }
 
+#[inline]
 fn load_elem(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
     FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
 }
 
+#[inline]
 fn store_elem(elem: &FieldElement, b: &mut [u8]) {
     debug_assert!(b.len() == 16);
     b.copy_from_slice(&elem.0.to_be_bytes());
 }
 
+#[inline]
 fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
     FieldElement((*elem).0 ^ (*other).0)
 }
 
+#[inline]
 fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
     let lhs: __m128i = unsafe { std::mem::transmute((*elem).0) };
     let rhs: __m128i = unsafe { std::mem::transmute((*other).0) };
@@ -41,6 +47,7 @@ fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldEl
     (FieldElement(low128), FieldElement(high128))
 }
 
+#[inline]
 fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
     let high = ((*high).0 << 1) ^ ((*low).0 >> 127);
     let low = (*low).0 << 1;
@@ -50,28 +57,34 @@ fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
     FieldElement(x1_x0 ^ high)
 }
 
+#[inline]
 fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
     let (high, low) = mul_wide(x, y);
     reduce(&high, &low)
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
+    #[inline]
     fn zero() -> Self {
         zero()
     }
 
+    #[inline]
     fn load_elem(b: &[u8]) -> Self {
         load_elem(b)
     }
 
+    #[inline]
     fn store_elem(&self, b: &mut [u8]) {
         store_elem(self, b);
     }
 
+    #[inline]
     fn add(&mut self, other: &Self) {
         *self = add(self, other);
     }
 
+    #[inline]
     fn mul(&mut self, other: &Self) {
         *self = mul(self, other)
     }
diff --git a/aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
index 8b6967c5d..ed6553ff9 100644
--- a/aesgcm/src/platform/neon/aes_core.rs
+++ b/aesgcm/src/platform/neon/aes_core.rs
@@ -1,23 +1,29 @@
 use core::arch::aarch64::*;
 
+/// The Neon state
 pub(crate) type State = uint8x16_t;
 
+#[inline]
 fn new_state() -> State {
     unsafe { vdupq_n_u8(0) }
 }
 
+#[inline]
 fn xor_key1_state(st: &mut State, k: &State) {
     unsafe { *st = veorq_u8(*st, *k) }
 }
 
+#[inline]
 fn aes_enc(st: &mut State, key: &State) {
     unsafe { *st = veorq_u8(vaesmcq_u8(vaeseq_u8(*st, vdupq_n_u8(0))), *key) }
 }
 
+#[inline]
 fn aes_enc_last(st: &mut State, key: &State) {
     unsafe { *st = veorq_u8(vaeseq_u8(*st, vdupq_n_u8(0)), *key) }
 }
 
+#[inline]
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     unsafe {
         let st = vaeseq_u8(*prev, vdupq_n_u8(0));
@@ -34,16 +40,19 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     }
 }
 
+#[inline]
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
     unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 3)) }
 }
 
+#[inline]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
     unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 2)) }
 }
 
+#[inline]
 fn key_expansion_step(next: &mut State, prev: &State) {
     unsafe {
         let zero = vdupq_n_u32(0);
@@ -56,20 +65,24 @@ fn key_expansion_step(next: &mut State, prev: &State) {
 }
 
 impl crate::platform::AESState for State {
+    #[inline]
     fn new() -> Self {
         new_state()
     }
 
+    #[inline]
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
         unsafe { *self = vld1q_u8(b.as_ptr()) };
     }
 
+    #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
         unsafe { vst1q_u8(out.as_mut_ptr(), *self) }
     }
 
+    #[inline]
     fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
         debug_assert!(inp.len() == out.len() && inp.len() <= 16);
         let inp_vec = unsafe { vld1q_u8(inp.as_ptr()) };
@@ -77,26 +90,32 @@ impl crate::platform::AESState for State {
         unsafe { vst1q_u8(out.as_mut_ptr(), out_vec) }
     }
 
+    #[inline]
     fn xor_key(&mut self, key: &Self) {
         xor_key1_state(self, key);
     }
 
+    #[inline]
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
     }
 
+    #[inline]
     fn aes_enc_last(&mut self, key: &Self) {
         aes_enc_last(self, key);
     }
 
+    #[inline]
     fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0(self, prev, RCON as u8);
     }
 
+    #[inline]
     fn aes_keygen_assist1(&mut self, prev: &Self) {
         aes_keygen_assist1(self, prev);
     }
 
+    #[inline]
     fn key_expansion_step(&mut self, prev: &Self) {
         key_expansion_step(self, prev)
     }
diff --git a/aesgcm/src/platform/neon/gf128_core.rs b/aesgcm/src/platform/neon/gf128_core.rs
index af3e4be8b..db0174671 100644
--- a/aesgcm/src/platform/neon/gf128_core.rs
+++ b/aesgcm/src/platform/neon/gf128_core.rs
@@ -1,26 +1,32 @@
 use libcrux_intrinsics::arm64::*;
 
+/// A Neon gf128 field element
 #[derive(Clone, Copy)]
-pub struct FieldElement(pub u128);
+pub(crate) struct FieldElement(pub(crate) u128);
 
+#[inline]
 fn zero() -> FieldElement {
     FieldElement(0)
 }
 
+#[inline]
 fn load_element(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
     FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
 }
 
+#[inline]
 fn store_element(element: &FieldElement, bytes: &mut [u8]) {
     debug_assert!(bytes.len() == 16);
     bytes.copy_from_slice(&element.0.to_be_bytes());
 }
 
+#[inline]
 fn add(element: &mut FieldElement, other: &FieldElement) {
     element.0 ^= other.0;
 }
 
+#[inline]
 fn mul_wide(element: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
     let l0 = element.0 as u64;
     let h0 = (element.0 >> 64) as u64;
@@ -41,6 +47,7 @@ fn mul_wide(element: &FieldElement, other: &FieldElement) -> (FieldElement, Fiel
     (FieldElement(high), FieldElement(low))
 }
 
+#[inline]
 fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
     let high = (high.0 << 1) ^ (low.0 >> 127);
     let low = low.0 << 1;
@@ -50,28 +57,34 @@ fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
     FieldElement(x1_x0 ^ high)
 }
 
+#[inline]
 fn mul(x: &mut FieldElement, y: &FieldElement) {
     let (high, low) = mul_wide(x, y);
     *x = reduce(&high, &low);
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
+    #[inline]
     fn zero() -> Self {
         zero()
     }
 
+    #[inline]
     fn load_element(b: &[u8]) -> Self {
         load_element(b)
     }
 
+    #[inline]
     fn store_element(&self, b: &mut [u8]) {
         store_element(self, b);
     }
 
+    #[inline]
     fn add(&mut self, other: &Self) {
         add(self, other);
     }
 
+    #[inline]
     fn mul(&mut self, other: &Self) {
         mul(self, other)
     }
diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs
index 7a6a1ec92..7cc75160b 100644
--- a/aesgcm/src/platform/portable/aes_core.rs
+++ b/aesgcm/src/platform/portable/aes_core.rs
@@ -1,9 +1,13 @@
 #![allow(clippy::needless_range_loop)]
 
-use crate::aes_generic::AES_BLOCK_LEN;
+use crate::aes::AES_BLOCK_LEN;
+
+#[cfg(test)]
+mod test;
 
 pub(crate) type State = [u16; 8];
 
+#[inline]
 fn new_state() -> State {
     [0u16; 8]
 }
@@ -149,6 +153,7 @@ fn xnor(a: u16, b: u16) -> u16 {
     !(a ^ b)
 }
 
+#[inline]
 fn sub_bytes_state(st: &mut State) {
     let u0 = st[7];
     let u1 = st[6];
@@ -311,6 +316,7 @@ fn shift_row_u16(input: u16) -> u16 {
         | ((input & 0x0888) << 4)
 }
 
+#[inline]
 fn shift_rows_state(st: &mut State) {
     st[0] = shift_row_u16(st[0]);
     st[1] = shift_row_u16(st[1]);
@@ -350,6 +356,7 @@ fn xor_key1_state(st: &mut State, k: &State) {
     st[7] ^= k[7];
 }
 
+#[inline]
 fn aes_enc(st: &mut State, key: &State) {
     sub_bytes_state(st);
     shift_rows_state(st);
@@ -357,6 +364,7 @@ fn aes_enc(st: &mut State, key: &State) {
     xor_key1_state(st, key)
 }
 
+#[inline]
 fn aes_enc_last(st: &mut State, key: &State) {
     sub_bytes_state(st);
     shift_rows_state(st);
@@ -374,6 +382,7 @@ fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 {
     n ^ (u3 >> 4)
 }
 
+#[inline]
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     next.copy_from_slice(prev);
     sub_bytes_state(next);
@@ -388,6 +397,7 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     next[7] = aes_keygen_assisti(rcon, 7, next[7]);
 }
 
+#[inline]
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
 
@@ -409,6 +419,7 @@ fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     next[7] = aux(next[7]);
 }
 
+#[inline]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
 
@@ -436,6 +447,7 @@ fn key_expand1(p: u16, n: u16) -> u16 {
     n ^ p
 }
 
+#[inline]
 fn key_expansion_step(next: &mut State, prev: &State) {
     next[0] = key_expand1(prev[0], next[0]);
     next[1] = key_expand1(prev[1], next[1]);
@@ -448,16 +460,19 @@ fn key_expansion_step(next: &mut State, prev: &State) {
 }
 
 impl crate::platform::AESState for State {
+    #[inline]
     fn new() -> Self {
         new_state()
     }
 
+    #[inline]
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
 
         transpose_u8x16(b.try_into().unwrap(), self);
     }
 
+    #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
 
@@ -476,30 +491,33 @@ impl crate::platform::AESState for State {
         }
     }
 
+    #[inline]
     fn xor_key(&mut self, key: &Self) {
         xor_key1_state(self, key);
     }
 
+    #[inline]
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
     }
 
+    #[inline]
     fn aes_enc_last(&mut self, key: &Self) {
         aes_enc_last(self, key);
     }
 
+    #[inline]
     fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
         aes_keygen_assist0(self, prev, RCON as u8);
     }
 
+    #[inline]
     fn aes_keygen_assist1(&mut self, prev: &Self) {
         aes_keygen_assist1(self, prev);
     }
 
+    #[inline]
     fn key_expansion_step(&mut self, prev: &Self) {
         key_expansion_step(self, prev)
     }
 }
-
-#[cfg(test)]
-mod test;
diff --git a/aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs
index ff138ffa0..ce680d36d 100644
--- a/aesgcm/src/platform/portable/gf128_core.rs
+++ b/aesgcm/src/platform/portable/gf128_core.rs
@@ -1,29 +1,31 @@
+
+/// A portable gf128 field element.
 pub(crate) type FieldElement = u128;
 
-#[inline(always)]
+#[inline]
 fn zero() -> FieldElement {
     0
 }
 
-#[inline(always)]
+#[inline]
 fn load_element(bytes: &[u8]) -> FieldElement {
     debug_assert!(bytes.len() == 16);
 
     u128::from_be_bytes(bytes.try_into().unwrap())
 }
 
-#[inline(always)]
+#[inline]
 fn store_element(element: &FieldElement, bytes: &mut [u8]) {
     debug_assert!(bytes.len() == 16);
     bytes.copy_from_slice(&u128::to_be_bytes(*element));
 }
 
-#[inline(always)]
+#[inline]
 fn add(element: &FieldElement, other: &FieldElement) -> FieldElement {
     element ^ other
 }
 
-#[inline(always)]
+#[inline]
 fn ith_bit_mask(elem: &FieldElement, i: usize) -> FieldElement {
     debug_assert!(i < 128);
 
@@ -37,13 +39,13 @@ fn ith_bit_mask(elem: &FieldElement, i: usize) -> FieldElement {
 
 const IRRED: FieldElement = 0xE100_0000_0000_0000_0000_0000_0000_0000;
 
-#[inline(always)]
+#[inline]
 fn mul_x(elem: &mut FieldElement) {
     let mask = ith_bit_mask(elem, 127);
     *elem = (*elem >> 1) ^ (IRRED & mask)
 }
 
-#[inline(always)]
+#[inline]
 fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut FieldElement) {
     debug_assert!(i < 128);
     let mask = ith_bit_mask(x, i);
@@ -51,7 +53,7 @@ fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut Field
     mul_x(y);
 }
 
-#[inline(always)]
+#[inline]
 fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
     let mut result = 0;
     let mut multiplicand = *y;
@@ -62,27 +64,27 @@ fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
 }
 
 impl crate::platform::GF128FieldElement for FieldElement {
-    #[inline(always)]
+    #[inline]
     fn zero() -> Self {
         zero()
     }
 
-    #[inline(always)]
+    #[inline]
     fn load_element(bytes: &[u8]) -> Self {
         load_element(bytes)
     }
 
-    #[inline(always)]
+    #[inline]
     fn store_element(&self, bytes: &mut [u8]) {
         store_element(self, bytes);
     }
 
-    #[inline(always)]
+    #[inline]
     fn add(&mut self, other: &Self) {
         *self = add(self, other);
     }
 
-    #[inline(always)]
+    #[inline]
     fn mul(&mut self, other: &Self) {
         *self = mul(self, other)
     }
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
index 1ac5b0bb1..6f1ff7e7e 100644
--- a/aesgcm/tests/wycheproof.rs
+++ b/aesgcm/tests/wycheproof.rs
@@ -1,14 +1,11 @@
-use libcrux_aesgcm::{
-    portable::{aes256_gcm_decrypt, aes256_gcm_encrypt},
-    Aead,
-};
+use libcrux_aesgcm::Aead;
 use wycheproof::{aead::Test, TestResult};
 
 #[test]
 fn test() {
     let test_set = wycheproof::aead::TestSet::load(wycheproof::aead::TestName::AesGcm).unwrap();
 
-    fn run<const KEY_LEN: usize, Cipher: Aead<16, 16, 12>>(test: &Test) {
+    fn run<const KEY_LEN: usize, Cipher: Aead<KEY_LEN, 16, 12>>(test: &Test) {
         let mut ciphertext = vec![0u8; test.pt.len()];
         let mut plaintext = vec![0u8; test.pt.len()];
         let mut tag = [0u8; 16];
@@ -75,49 +72,20 @@ fn test() {
         } else if test_group.key_size == 256 {
             for test in test_group.tests {
                 println!("  Test AES-GCM 256 {}", test.tc_id);
-                println!("    pt:    {:?}", &test.pt);
-                println!("    aad:   {:?}", &test.aad);
-                println!("    key:   {:?}", &test.key);
-                println!("    nonce: {:?}", &test.nonce);
 
-                let mut ciphertext = vec![0u8; test.pt.len()];
-                let mut plaintext = vec![0u8; test.pt.len()];
-                let mut tag = [0u8; 16];
+                // Multiplexing
+                run::<32, libcrux_aesgcm::AesGcm256>(&test);
 
-                aes256_gcm_encrypt(
-                    &test.key,
-                    &test.nonce,
-                    &test.aad,
-                    &test.pt,
-                    &mut ciphertext,
-                    &mut tag,
-                );
-                aes256_gcm_decrypt(
-                    &test.key,
-                    &test.nonce,
-                    &test.aad,
-                    &ciphertext,
-                    &tag,
-                    &mut plaintext,
-                )
-                .unwrap();
+                // Portable
+                run::<32, libcrux_aesgcm::PortableAesGcm256>(&test);
 
-                assert_eq!(plaintext.as_slice(), test.pt.as_slice());
+                // Neon
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+                run::<32, libcrux_aesgcm::NeonAesGcm256>(&test);
 
-                if test.result == TestResult::Valid {
-                    assert_eq!(
-                        test.ct.as_slice(),
-                        &ciphertext,
-                        "\nExpected: {}\nGot: {}",
-                        hex::encode(test.ct.as_slice()),
-                        hex::encode(&ciphertext)
-                    );
-                    assert_eq!(test.tag.as_slice(), &tag);
-                } else {
-                    let ct_ok = test.ct.as_slice() == &ciphertext;
-                    let tag_ok = test.tag.as_slice() == &tag;
-                    assert!(!ct_ok || !tag_ok);
-                }
+                // x64
+                #[cfg(all(target_arch = "x86_64"))]
+                run::<32, libcrux_aesgcm::X64AesGcm256>(&test);
             }
         }
     }

From c8da23c9a5ac3543b149e2718280b4ec9e07fc22 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sat, 20 Sep 2025 11:04:49 +0200
Subject: [PATCH 23/43] less code duplication, but ugly

---
 aesgcm/src/aes.rs            |  3 ++
 aesgcm/src/aes_gcm_128.rs    | 94 ++----------------------------------
 aesgcm/src/aes_gcm_256.rs    | 93 ++---------------------------------
 aesgcm/src/ctr.rs            |  2 +-
 aesgcm/src/ctr/aes128_ctr.rs |  8 +--
 aesgcm/src/ctr/test128.rs    |  4 +-
 aesgcm/src/gf128.rs          |  2 +-
 aesgcm/src/lib.rs            |  1 +
 8 files changed, 19 insertions(+), 188 deletions(-)

diff --git a/aesgcm/src/aes.rs b/aesgcm/src/aes.rs
index b6e460763..cf688ea59 100644
--- a/aesgcm/src/aes.rs
+++ b/aesgcm/src/aes.rs
@@ -19,8 +19,11 @@ pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
     keyex: &ExtendedKey<T, NUM_KEYS>,
 ) {
     st.xor_key(&keyex[0]);
+
+    #[allow(clippy::needless_range_loop)]
     for i in 1..NUM_KEYS - 1 {
         st.aes_enc(&keyex[i]);
     }
+
     st.aes_enc_last(&keyex[NUM_KEYS - 1]);
 }
diff --git a/aesgcm/src/aes_gcm_128.rs b/aesgcm/src/aes_gcm_128.rs
index 3ddeb4eb3..64cd6d3c3 100644
--- a/aesgcm/src/aes_gcm_128.rs
+++ b/aesgcm/src/aes_gcm_128.rs
@@ -1,7 +1,8 @@
-#![allow(clippy::needless_range_loop)]
+//! AES-GCM 128
 
 use crate::{
     aes::AES_BLOCK_LEN,
+    aes_gcm::aesgcm,
     ctr::Aes128CtrContext,
     gf128::GF128State,
     platform::{AESState, GF128FieldElement},
@@ -10,6 +11,7 @@ use crate::{
 
 /// Key length.
 pub(crate) const KEY_LEN: usize = 16;
+pub(crate) const GCM_KEY_LEN: usize = 16;
 
 /// The AES-GCM 128 state
 pub(crate) struct State<T: AESState, U: GF128FieldElement> {
@@ -18,92 +20,4 @@ pub(crate) struct State<T: AESState, U: GF128FieldElement> {
     pub(crate) tag_mix: [u8; TAG_LEN],
 }
 
-impl<T: AESState, U: GF128FieldElement> super::State for State<T, U> {
-    /// Initialize the state
-    fn init(key: &[u8]) -> Self {
-        debug_assert!(key.len() == KEY_LEN);
-
-        let nonce = [0u8; NONCE_LEN];
-        let mut gcm_key = [0u8; KEY_LEN];
-        let tag_mix = [0u8; TAG_LEN];
-
-        let aes_state = Aes128CtrContext::<T>::init(key, &nonce);
-        aes_state.key_block(0, &mut gcm_key);
-        let gcm_state = GF128State::init(&gcm_key);
-
-        Self {
-            aes_state,
-            gcm_state,
-            tag_mix,
-        }
-    }
-
-    fn set_nonce(&mut self, nonce: &[u8]) {
-        debug_assert!(nonce.len() == NONCE_LEN);
-
-        self.aes_state.set_nonce(nonce);
-        self.aes_state.key_block(1, &mut self.tag_mix);
-    }
-
-    fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]) {
-        debug_assert!(ciphertext.len() == plaintext.len());
-        debug_assert!(plaintext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        self.aes_state.update(2, plaintext, ciphertext);
-
-        self.gcm_state.update_padded(aad);
-        self.gcm_state.update_padded(ciphertext);
-
-        let mut last_block = [0u8; AES_BLOCK_LEN];
-        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-        self.gcm_state.update(&last_block);
-        self.gcm_state.emit(tag);
-
-        for i in 0..16 {
-            tag[i] ^= self.tag_mix[i];
-        }
-    }
-
-    fn decrypt(
-        &mut self,
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        debug_assert!(plaintext.len() == ciphertext.len());
-        debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        self.gcm_state.update_padded(aad);
-        self.gcm_state.update_padded(ciphertext);
-
-        let mut last_block = [0u8; AES_BLOCK_LEN];
-        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-        self.gcm_state.update(&last_block);
-
-        let mut computed_tag = [0u8; TAG_LEN];
-        self.gcm_state.emit(&mut computed_tag);
-
-        for i in 0..16 {
-            computed_tag[i] ^= self.tag_mix[i];
-        }
-
-        let mut eq_mask = 0u8;
-        for i in 0..16 {
-            eq_mask |= computed_tag[i] ^ tag[i];
-        }
-
-        if eq_mask == 0 {
-            self.aes_state.update(2, ciphertext, plaintext);
-            Ok(())
-        } else {
-            Err(DecryptError())
-        }
-    }
-}
+aesgcm!(State<T, U>, Aes128CtrContext);
diff --git a/aesgcm/src/aes_gcm_256.rs b/aesgcm/src/aes_gcm_256.rs
index 1a12e2a99..070a31031 100644
--- a/aesgcm/src/aes_gcm_256.rs
+++ b/aesgcm/src/aes_gcm_256.rs
@@ -1,7 +1,8 @@
-#![allow(clippy::needless_range_loop)]
+//! AES-GCM 256
 
 use crate::{
     aes::AES_BLOCK_LEN,
+    aes_gcm::aesgcm,
     ctr::Aes256CtrContext,
     gf128::GF128State,
     platform::{AESState, GF128FieldElement},
@@ -19,92 +20,4 @@ pub(crate) struct State<T: AESState, U: GF128FieldElement> {
     pub(crate) tag_mix: [u8; TAG_LEN],
 }
 
-impl<T: AESState, U: GF128FieldElement> super::State for State<T, U> {
-    /// Initialize the state
-    fn init(key: &[u8]) -> Self {
-        debug_assert!(key.len() == KEY_LEN);
-
-        let nonce = [0u8; NONCE_LEN];
-        let mut gcm_key = [0u8; GCM_KEY_LEN];
-        let tag_mix = [0u8; TAG_LEN];
-
-        let aes_state = Aes256CtrContext::<T>::init(key, &nonce);
-        aes_state.key_block(0, &mut gcm_key);
-        let gcm_state = GF128State::init(&gcm_key);
-
-        Self {
-            aes_state,
-            gcm_state,
-            tag_mix,
-        }
-    }
-
-    fn set_nonce(&mut self, nonce: &[u8]) {
-        debug_assert!(nonce.len() == NONCE_LEN);
-
-        self.aes_state.set_nonce(nonce);
-        self.aes_state.key_block(1, &mut self.tag_mix);
-    }
-
-    fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]) {
-        debug_assert!(ciphertext.len() == plaintext.len());
-        debug_assert!(plaintext.len() / 16 <= u32::MAX as usize);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        self.aes_state.update(2, plaintext, ciphertext);
-
-        self.gcm_state.update_padded(aad);
-        self.gcm_state.update_padded(ciphertext);
-
-        let mut last_block = [0u8; AES_BLOCK_LEN];
-        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-        self.gcm_state.update(&last_block);
-        self.gcm_state.emit(tag);
-
-        for i in 0..16 {
-            tag[i] ^= self.tag_mix[i];
-        }
-    }
-
-    fn decrypt(
-        &mut self,
-        aad: &[u8],
-        ciphertext: &[u8],
-        tag: &[u8],
-        plaintext: &mut [u8],
-    ) -> Result<(), DecryptError> {
-        debug_assert!(plaintext.len() == ciphertext.len());
-        debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
-        debug_assert!(tag.len() == TAG_LEN);
-
-        self.gcm_state.update_padded(aad);
-        self.gcm_state.update_padded(ciphertext);
-
-        let mut last_block = [0u8; AES_BLOCK_LEN];
-        last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
-        last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
-
-        self.gcm_state.update(&last_block);
-
-        let mut computed_tag = [0u8; TAG_LEN];
-        self.gcm_state.emit(&mut computed_tag);
-
-        for i in 0..16 {
-            computed_tag[i] ^= self.tag_mix[i];
-        }
-
-        let mut eq_mask = 0u8;
-        for i in 0..16 {
-            eq_mask |= computed_tag[i] ^ tag[i];
-        }
-
-        if eq_mask == 0 {
-            self.aes_state.update(2, ciphertext, plaintext);
-            Ok(())
-        } else {
-            Err(DecryptError())
-        }
-    }
-}
+aesgcm!(State<T, U>, Aes256CtrContext);
diff --git a/aesgcm/src/ctr.rs b/aesgcm/src/ctr.rs
index 6a8d9e321..2d732616c 100644
--- a/aesgcm/src/ctr.rs
+++ b/aesgcm/src/ctr.rs
@@ -65,7 +65,7 @@ impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
 
     #[inline]
     fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
-        debug_assert!(input.len() == out.len() && input.len() % AES_BLOCK_LEN == 0);
+        debug_assert!(input.len() == out.len() && input.len().is_multiple_of(AES_BLOCK_LEN));
         debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
 
         let blocks = input.len() / AES_BLOCK_LEN;
diff --git a/aesgcm/src/ctr/aes128_ctr.rs b/aesgcm/src/ctr/aes128_ctr.rs
index 2db42ebb5..87a7e9134 100644
--- a/aesgcm/src/ctr/aes128_ctr.rs
+++ b/aesgcm/src/ctr/aes128_ctr.rs
@@ -3,7 +3,7 @@
 use core::array::from_fn;
 
 use super::AesCtrContext;
-use crate::{aes_gcm_128::KEY_LEN, aes::*, platform::AESState, NONCE_LEN};
+use crate::{aes::*, aes_gcm_128::GCM_KEY_LEN, platform::AESState, NONCE_LEN};
 
 pub(super) const NUM_KEYS: usize = 11;
 
@@ -14,7 +14,7 @@ impl<T: AESState> Aes128CtrContext<T> {
     #[inline]
     pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
         debug_assert!(nonce.len() == NONCE_LEN);
-        debug_assert!(key.len() == KEY_LEN);
+        debug_assert!(key.len() == GCM_KEY_LEN);
 
         let mut ctr_nonce = [0u8; 16];
         ctr_nonce[0..12].copy_from_slice(nonce);
@@ -34,7 +34,7 @@ impl<T: AESState> Aes128CtrContext<T> {
 
     #[inline]
     pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
-        debug_assert!(out.len() == KEY_LEN);
+        debug_assert!(out.len() == GCM_KEY_LEN);
 
         self.aes_ctr_key_block(ctr, out);
     }
@@ -50,7 +50,7 @@ impl<T: AESState> Aes128CtrContext<T> {
 /// 128 - Key expansion
 #[inline]
 fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
-    debug_assert!(key.len() == KEY_LEN);
+    debug_assert!(key.len() == GCM_KEY_LEN);
 
     let mut keyex = from_fn(|_| T::new());
     keyex[0].load_block(key);
diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
index 439414a44..7d5fb92d3 100644
--- a/aesgcm/src/ctr/test128.rs
+++ b/aesgcm/src/ctr/test128.rs
@@ -1,6 +1,6 @@
 use crate::{
+    aes_gcm_128::GCM_KEY_LEN,
     ctr::Aes128CtrContext,
-    aes_gcm_128::KEY_LEN,
     platform::{self, AESState},
     NONCE_LEN,
 };
@@ -23,7 +23,7 @@ pub(crate) fn aes128_ctr_encrypt<T: AESState>(
     out: &mut [u8],
 ) {
     debug_assert!(nonce.len() == NONCE_LEN);
-    debug_assert!(key.len() == KEY_LEN);
+    debug_assert!(key.len() == GCM_KEY_LEN);
     debug_assert!(inp.len() == out.len());
     let ctx = Aes128CtrContext::<T>::init(key, nonce);
     ctx.update(ctr, inp, out);
diff --git a/aesgcm/src/gf128.rs b/aesgcm/src/gf128.rs
index 26d4307dc..b17064ecb 100644
--- a/aesgcm/src/gf128.rs
+++ b/aesgcm/src/gf128.rs
@@ -37,7 +37,7 @@ impl<T: GF128FieldElement> GF128State<T> {
 
     #[inline]
     pub(crate) fn update_blocks(&mut self, input: &[u8]) {
-        debug_assert!(input.len() % 16 == 0);
+        debug_assert!(input.len().is_multiple_of(AES_BLOCK_LEN));
 
         let blocks = input.len() / AES_BLOCK_LEN;
         for i in 0..blocks {
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 081f62264..44762fdf0 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -5,6 +5,7 @@ mod ctr;
 mod gf128;
 mod platform;
 
+mod aes_gcm;
 mod aes_gcm_128;
 mod aes_gcm_256;
 

From f1c55f1e26907fcbef5fec117bdda0011341d97d Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sat, 20 Sep 2025 11:06:20 +0200
Subject: [PATCH 24/43] forgot a file

---
 aesgcm/src/aes_gcm.rs | 105 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 aesgcm/src/aes_gcm.rs

diff --git a/aesgcm/src/aes_gcm.rs b/aesgcm/src/aes_gcm.rs
new file mode 100644
index 000000000..25d5b2591
--- /dev/null
+++ b/aesgcm/src/aes_gcm.rs
@@ -0,0 +1,105 @@
+//! Implementation of AES-GCM
+
+/// Macro to instantiate the AES state.
+/// This should really be replaced by using traits everywhere.
+macro_rules! aesgcm {
+    ($state:ty, $context:ident) => {
+        impl<T: AESState, U: GF128FieldElement> super::State for $state {
+            /// Initialize the state
+            fn init(key: &[u8]) -> Self {
+                debug_assert!(key.len() == KEY_LEN);
+
+                let nonce = [0u8; NONCE_LEN];
+                let mut gcm_key = [0u8; GCM_KEY_LEN];
+                let tag_mix = [0u8; TAG_LEN];
+
+                let aes_state = $context::<T>::init(key, &nonce);
+                aes_state.key_block(0, &mut gcm_key);
+                let gcm_state = GF128State::init(&gcm_key);
+
+                Self {
+                    aes_state,
+                    gcm_state,
+                    tag_mix,
+                }
+            }
+
+            fn set_nonce(&mut self, nonce: &[u8]) {
+                debug_assert!(nonce.len() == NONCE_LEN);
+
+                self.aes_state.set_nonce(nonce);
+                self.aes_state.key_block(1, &mut self.tag_mix);
+            }
+
+            fn encrypt(
+                &mut self,
+                aad: &[u8],
+                plaintext: &[u8],
+                ciphertext: &mut [u8],
+                tag: &mut [u8],
+            ) {
+                debug_assert!(ciphertext.len() == plaintext.len());
+                debug_assert!(plaintext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
+                debug_assert!(tag.len() == TAG_LEN);
+
+                self.aes_state.update(2, plaintext, ciphertext);
+
+                self.gcm_state.update_padded(aad);
+                self.gcm_state.update_padded(ciphertext);
+
+                let mut last_block = [0u8; AES_BLOCK_LEN];
+                last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+                last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+                self.gcm_state.update(&last_block);
+                self.gcm_state.emit(tag);
+
+                for i in 0..16 {
+                    tag[i] ^= self.tag_mix[i];
+                }
+            }
+
+            fn decrypt(
+                &mut self,
+                aad: &[u8],
+                ciphertext: &[u8],
+                tag: &[u8],
+                plaintext: &mut [u8],
+            ) -> Result<(), DecryptError> {
+                debug_assert!(plaintext.len() == ciphertext.len());
+                debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize);
+                debug_assert!(tag.len() == TAG_LEN);
+
+                self.gcm_state.update_padded(aad);
+                self.gcm_state.update_padded(ciphertext);
+
+                let mut last_block = [0u8; AES_BLOCK_LEN];
+                last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes());
+                last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes());
+
+                self.gcm_state.update(&last_block);
+
+                let mut computed_tag = [0u8; TAG_LEN];
+                self.gcm_state.emit(&mut computed_tag);
+
+                for i in 0..16 {
+                    computed_tag[i] ^= self.tag_mix[i];
+                }
+
+                let mut eq_mask = 0u8;
+                for i in 0..16 {
+                    eq_mask |= computed_tag[i] ^ tag[i];
+                }
+
+                if eq_mask == 0 {
+                    self.aes_state.update(2, ciphertext, plaintext);
+                    Ok(())
+                } else {
+                    Err(DecryptError())
+                }
+            }
+        }
+    };
+}
+
+pub(crate) use aesgcm;

From 5cd727e71d9f4880eab0b50d792c48b99e079de3 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sun, 21 Sep 2025 08:10:20 +0200
Subject: [PATCH 25/43] fixup x64

---
 aesgcm/Cargo.toml                             |   3 +-
 aesgcm/benches/aesgcm.rs                      |   4 +-
 aesgcm/build.rs                               |  34 +++++
 aesgcm/src/ctr/test128.rs                     |  12 +-
 aesgcm/src/gf128/test.rs                      |   8 +-
 aesgcm/src/lib.rs                             | 140 ++++++++++++++++--
 aesgcm/src/platform.rs                        |   4 +-
 aesgcm/src/platform/intel_ni.rs               |   1 +
 aesgcm/src/platform/intel_ni/aes_core.rs      |  31 ++--
 aesgcm/src/platform/intel_ni/gf128_core.rs    |  33 +++--
 aesgcm/src/platform/neon.rs                   |   5 +-
 aesgcm/src/platform/neon/gf128_core.rs        |   2 +
 aesgcm/src/platform/portable.rs               |   5 +-
 aesgcm/src/platform/portable/aes_core/test.rs |  20 +--
 aesgcm/tests/wycheproof.rs                    |   1 +
 15 files changed, 234 insertions(+), 69 deletions(-)
 create mode 100644 aesgcm/build.rs

diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
index 3494c05e1..876708a07 100644
--- a/aesgcm/Cargo.toml
+++ b/aesgcm/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "libcrux-aesgcm"
+name = "libcrux_aesgcm"
 version.workspace = true
 authors.workspace = true
 license.workspace = true
@@ -32,6 +32,7 @@ name = "aesgcm"
 harness = false
 
 [dev-dependencies]
+libcrux_aesgcm = { version = "*", features = ["std"], path = "." }
 cavp = { version = "0.0.2", path = "../cavp" }
 criterion = "0.5.1"
 hex = "0.4.3"
diff --git a/aesgcm/benches/aesgcm.rs b/aesgcm/benches/aesgcm.rs
index 5dda968a0..1fe2a78d7 100644
--- a/aesgcm/benches/aesgcm.rs
+++ b/aesgcm/benches/aesgcm.rs
@@ -176,7 +176,7 @@ impl_comp!(
     16,
     libcrux_aesgcm::portable::aes_gcm_128::encrypt,
     libcrux_aesgcm::neon::aes_gcm_128::encrypt,
-    libcrux_aesgcm::intel_ni::aes_gcm_128::encrypt,
+    libcrux_aesgcm::x64::aes_gcm_128::encrypt,
     rustcrypto_aes128_gcm_encrypt
 );
 impl_comp!(
@@ -184,7 +184,7 @@ impl_comp!(
     32,
     libcrux_aesgcm::portable::aes_gcm_256::encrypt,
     libcrux_aesgcm::neon::aes_gcm_256::encrypt,
-    libcrux_aesgcm::intel_ni::aes_gcm_256::encrypt,
+    libcrux_aesgcm::x64::aes_gcm_256::encrypt,
     rustcrypto_aes256_gcm_encrypt
 );
 
diff --git a/aesgcm/build.rs b/aesgcm/build.rs
new file mode 100644
index 000000000..91a6fae70
--- /dev/null
+++ b/aesgcm/build.rs
@@ -0,0 +1,34 @@
+use std::env;
+
+fn main() {
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+    let disable_simd128 = read_env("LIBCRUX_DISABLE_SIMD128");
+    let disable_simd256 = read_env("LIBCRUX_DISABLE_SIMD256");
+
+    // Force a simd build. Make sure you know what you're doing.
+    let enable_simd128 = read_env("LIBCRUX_ENABLE_SIMD128");
+    let enable_simd256 = read_env("LIBCRUX_ENABLE_SIMD256");
+
+    let simd128_possible = target_arch == "aarch64";
+    if (simd128_possible || enable_simd128) && !disable_simd128 {
+        // We enable simd128 on all aarch64 builds.
+        println!("cargo:rustc-cfg=feature=\"simd128\"");
+    }
+    let simd126_possible = target_arch == "x86_64";
+    if (simd126_possible || enable_simd256) && !disable_simd256 {
+        // We enable simd256 on all x86_64 builds.
+        // Note that this doesn't mean the required CPU features are available.
+        // But the compiler will support them and the runtime checks ensure that
+        // it's only used when available.
+        //
+        // We don't enable this on x86 because it seems to generate invalid code.
+        println!("cargo:rustc-cfg=feature=\"simd256\"");
+    }
+}
+
+fn read_env(key: &str) -> bool {
+    match env::var(key) {
+        Ok(s) => s == "1" || s == "y" || s == "Y",
+        Err(_) => false,
+    }
+}
diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
index 7d5fb92d3..8d0380e17 100644
--- a/aesgcm/src/ctr/test128.rs
+++ b/aesgcm/src/ctr/test128.rs
@@ -53,7 +53,7 @@ fn test_ctr_block() {
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -72,7 +72,7 @@ fn test_ctr_block_neon() {
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -88,7 +88,7 @@ fn test_ctr_encrypt() {
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -105,7 +105,7 @@ fn test_ctr_encrypt_neon() {
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -114,14 +114,14 @@ fn test_ctr_encrypt_neon() {
     }
 }
 
-#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
+#[cfg(all(feature = "simd256", feature = "std"))]
 #[test]
 fn test_ctr_encrypt_intel() {
     let mut computed: [u8; 32] = [0u8; 32];
     aes128_ctr_encrypt::<platform::intel_ni::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
index b755bd68e..2896908b0 100644
--- a/aesgcm/src/gf128/test.rs
+++ b/aesgcm/src/gf128/test.rs
@@ -36,7 +36,7 @@ fn test_gf128() {
     for i in 0..16 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -53,7 +53,7 @@ fn test_gf128_neon() {
     for i in 0..16 {
         if computed[i] != EXPECTED[i] {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
@@ -62,14 +62,14 @@ fn test_gf128_neon() {
     }
 }
 
-#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
+#[cfg(all(feature = "simd256", feature = "std"))]
 #[test]
 fn test_gf128_intel() {
     let mut computed: [u8; 16] = [0u8; 16];
     gf128::<crate::platform::intel_ni::FieldElement>(&KEY, &INPUT, &mut computed);
     for i in 0..16 {
         if computed[i] != EXPECTED[i] {
-            println!(
+            std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
                 i, EXPECTED[i], computed[i]
             );
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 44762fdf0..c93fdde30 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,5 +1,8 @@
 #![no_std]
 
+#[cfg(feature = "std")]
+extern crate std;
+
 mod aes;
 mod ctr;
 mod gf128;
@@ -55,19 +58,19 @@ pub struct AesGcm256 {}
 pub struct PortableAesGcm256 {}
 
 /// Neon AES-GCM 256.
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 pub struct NeonAesGcm256 {}
 
 /// Neon AES-GCM 256.
-#[cfg(not(all(target_arch = "aarch64", target_feature = "aes")))]
+#[cfg(not(feature = "simd128"))]
 pub type NeonAesGcm256 = PortableAesGcm256;
 
 /// AES-NI AES-GCM 256.
-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "simd256")]
 pub struct X64AesGcm256 {}
 
 /// AES-NI AES-GCM 256.
-#[cfg(not(target_arch = "x86_64"))]
+#[cfg(not(feature = "simd256"))]
 pub type X64AesGcm256 = PortableAesGcm256;
 
 /// Tag length.
@@ -85,7 +88,6 @@ pub(crate) fn encrypt<S: State>(
     ciphertext: &mut [u8],
     tag: &mut [u8],
 ) {
-    // XXX: debug_assert!(key.len() == KEY_LEN);
     debug_assert!(nonce.len() == NONCE_LEN);
     debug_assert!(tag.len() == TAG_LEN);
 
@@ -103,7 +105,6 @@ pub(crate) fn decrypt<S: State>(
     tag: &[u8],
     plaintext: &mut [u8],
 ) -> Result<(), DecryptError> {
-    // XXX: debug_assert!(key.len() == KEY_LEN);
     debug_assert!(nonce.len() == NONCE_LEN);
     debug_assert!(tag.len() == TAG_LEN);
 
@@ -158,12 +159,88 @@ pub mod portable {
     pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::portable::State, platform::portable::FieldElement>);
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 pub mod neon {
     pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::neon::State, platform::neon::FieldElement>);
     pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::neon::State, platform::neon::FieldElement>);
 }
 
+#[cfg(feature = "simd256")]
+pub mod x64 {
+    // Here we don't use the `pub_mod` macro becaus we need to add target features
+    // onto the functions.
+    macro_rules! x64_pub_mod {
+        ($variant_comment:literal, $mod_name:ident, $state:ty) => {
+            #[doc = $variant_comment]
+            pub mod $mod_name {
+                use crate::$mod_name::KEY_LEN;
+                use crate::{platform, DecryptError};
+
+                type State = $state;
+
+                #[doc = $variant_comment]
+                /// encrypt.
+                pub fn encrypt(
+                    key: &[u8],
+                    nonce: &[u8],
+                    aad: &[u8],
+                    plaintext: &[u8],
+                    ciphertext: &mut [u8],
+                    tag: &mut [u8],
+                ) {
+                    debug_assert!(key.len() == KEY_LEN);
+
+                    #[inline]
+                    #[target_feature(enable = "avx2", enable = "aes")]
+                    unsafe fn inner(
+                        key: &[u8],
+                        nonce: &[u8],
+                        aad: &[u8],
+                        plaintext: &[u8],
+                        ciphertext: &mut [u8],
+                        tag: &mut [u8],
+                    ) {
+                        crate::encrypt::<State>(key, nonce, aad, plaintext, ciphertext, tag);
+                    }
+
+                    unsafe { inner(key, nonce, aad, plaintext, ciphertext, tag) };
+                }
+
+                #[doc = $variant_comment]
+                /// decrypt.
+                pub fn decrypt(
+                    key: &[u8],
+                    nonce: &[u8],
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &[u8],
+                    plaintext: &mut [u8],
+                ) -> Result<(), DecryptError> {
+                    debug_assert!(key.len() == KEY_LEN);
+
+                    #[inline]
+                    #[target_feature(enable = "avx2", enable = "aes")]
+                    unsafe fn inner(
+                        key: &[u8],
+                        nonce: &[u8],
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &[u8],
+                        plaintext: &mut [u8],
+                    ) -> Result<(), DecryptError> {
+                        crate::decrypt::<State>(key, nonce, aad, ciphertext, tag, plaintext)
+                    }
+
+                    unsafe { inner(key, nonce, aad, ciphertext, tag, plaintext) }
+                }
+            }
+        };
+    }
+
+    x64_pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::portable::State, platform::intel_ni::FieldElement>);
+    x64_pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::intel_ni::State, platform::intel_ni::FieldElement>);
+}
+
 /// Macro to implement the different structs and multiplexing.
 macro_rules! api {
     ($mod_name:ident, $variant:ident, $multiplexing:ty, $portable:ident, $neon:ident, $x64:ident) => {
@@ -185,12 +262,14 @@ macro_rules! api {
                     aad: &[u8],
                     plaintext: &[u8],
                 ) -> Result<(), EncryptError> {
-                    if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-                        $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-                    } else if libcrux_platform::simd256_support()
+                    // SIMD256 needs to come first because SIMD128 is true for
+                    // x64 as well, but we don't actually implement it.
+                    if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                        $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                    } else if libcrux_platform::simd128_support()
                         && libcrux_platform::aes_ni_support()
                     {
-                        $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
                     } else {
                         $portable::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
                     }
@@ -204,12 +283,14 @@ macro_rules! api {
                     ciphertext: &[u8],
                     tag: &Tag,
                 ) -> Result<(), DecryptError> {
-                    if libcrux_platform::simd128_support() && libcrux_platform::aes_ni_support() {
-                        $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-                    } else if libcrux_platform::simd256_support()
+                    // SIMD256 needs to come first because SIMD128 is true for
+                    // x64 as well, but we don't actually implement it.
+                    if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                        $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                    } else if libcrux_platform::simd128_support()
                         && libcrux_platform::aes_ni_support()
                     {
-                        $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
                     } else {
                         $portable::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
                     }
@@ -242,7 +323,7 @@ macro_rules! api {
                 }
             }
 
-            #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+            #[cfg(feature = "simd128")]
             impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
                 fn encrypt(
                     ciphertext: &mut [u8],
@@ -268,6 +349,33 @@ macro_rules! api {
                         .map_err(|_| DecryptError::InvalidTag)
                 }
             }
+
+            #[cfg(feature = "simd256")]
+            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $x64 {
+                fn encrypt(
+                    ciphertext: &mut [u8],
+                    tag: &mut Tag,
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    plaintext: &[u8],
+                ) -> Result<(), EncryptError> {
+                    x64::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                    Ok(())
+                }
+
+                fn decrypt(
+                    plaintext: &mut [u8],
+                    key: &Key,
+                    nonce: &Nonce,
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &Tag,
+                ) -> Result<(), DecryptError> {
+                    x64::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                        .map_err(|_| DecryptError::InvalidTag)
+                }
+            }
         }
     };
 }
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
index 8657dd2c6..9fba0d282 100644
--- a/aesgcm/src/platform.rs
+++ b/aesgcm/src/platform.rs
@@ -2,10 +2,10 @@
 
 pub(crate) mod portable;
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 pub(crate) mod neon;
 
-#[cfg(target_arch = "x86_64")] // ENABLE: target_feature="aes"
+#[cfg(feature = "simd256")]
 pub(crate) mod intel_ni;
 
 /// The AES state.
diff --git a/aesgcm/src/platform/intel_ni.rs b/aesgcm/src/platform/intel_ni.rs
index 7fe9d7462..2cb276213 100644
--- a/aesgcm/src/platform/intel_ni.rs
+++ b/aesgcm/src/platform/intel_ni.rs
@@ -1,4 +1,5 @@
 mod aes_core;
 mod gf128_core;
+
 pub(crate) use aes_core::State;
 pub(crate) use gf128_core::FieldElement;
diff --git a/aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/intel_ni/aes_core.rs
index d94eca869..6a363cca1 100644
--- a/aesgcm/src/platform/intel_ni/aes_core.rs
+++ b/aesgcm/src/platform/intel_ni/aes_core.rs
@@ -1,7 +1,7 @@
 use core::arch::x86_64::*;
 
 /// The avx2 state.
-pub(super) type State = __m128i;
+pub(crate) type State = __m128i;
 
 #[inline]
 fn new_state() -> State {
@@ -59,21 +59,30 @@ impl crate::platform::AESState for State {
     #[inline]
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
+
         unsafe { *self = _mm_loadu_si128(b.as_ptr() as *const __m128i) };
     }
 
     #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
-        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, *self) }
+
+        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, *self) };
     }
 
     #[inline]
-    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        let inp_vec = unsafe { _mm_loadu_si128(inp.as_ptr() as *const __m128i) };
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= 16);
+        // XXX: hot-fix to have enough input and output here.
+        let mut block_in = [0u8; 16];
+        let mut block_out = [0u8; 16];
+        block_in[0..input.len()].copy_from_slice(input);
+
+        let inp_vec = unsafe { _mm_loadu_si128(block_in.as_ptr() as *const __m128i) };
         let out_vec = unsafe { _mm_xor_si128(inp_vec, *self) };
-        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, out_vec) }
+        unsafe { _mm_storeu_si128(block_out.as_mut_ptr() as *mut __m128i, out_vec) };
+
+        out.copy_from_slice(&block_out[0..out.len()]);
     }
 
     #[inline]
@@ -108,6 +117,7 @@ impl crate::platform::AESState for State {
     }
 }
 
+#[cfg(feature = "std")]
 #[test]
 fn test() {
     unsafe {
@@ -116,10 +126,13 @@ fn test() {
         let w = _mm_slli_si128(x, 4);
         let mut z: [i32; 4] = [0; 4];
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, x);
-        println!("{:?}", z);
+
+        std::eprintln!("{:?}", z);
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, w);
-        println!("shift right 4 {:?}", z);
+
+        std::eprintln!("shift right 4 {:?}", z);
         _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, y);
-        println!("shuffle aa {:?}", z);
+
+        std::eprintln!("shuffle aa {:?}", z);
     }
 }
diff --git a/aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/intel_ni/gf128_core.rs
index 7fd9f5a65..28cf800b5 100644
--- a/aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -4,7 +4,7 @@ use core::arch::x86_64::*;
 
 /// An avx2 gf128 field element.
 #[derive(Clone, Copy)]
-pub(super) struct FieldElement(pub(super) u128);
+pub(crate) struct FieldElement(pub(super) u128);
 
 #[inline]
 fn zero() -> FieldElement {
@@ -12,14 +12,16 @@ fn zero() -> FieldElement {
 }
 
 #[inline]
-fn load_elem(b: &[u8]) -> FieldElement {
+fn load_element(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
+
     FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
 }
 
 #[inline]
-fn store_elem(elem: &FieldElement, b: &mut [u8]) {
+fn store_element(elem: &FieldElement, b: &mut [u8]) {
     debug_assert!(b.len() == 16);
+
     b.copy_from_slice(&elem.0.to_be_bytes());
 }
 
@@ -30,8 +32,8 @@ fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
 
 #[inline]
 fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
-    let lhs: __m128i = unsafe { std::mem::transmute((*elem).0) };
-    let rhs: __m128i = unsafe { std::mem::transmute((*other).0) };
+    let lhs: __m128i = unsafe { core::mem::transmute((*elem).0) };
+    let rhs: __m128i = unsafe { core::mem::transmute((*other).0) };
     let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
     let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
     let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
@@ -42,8 +44,8 @@ fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldEl
     let low = unsafe { _mm_xor_si128(low, m0) };
     let high = unsafe { _mm_xor_si128(high, m1) };
 
-    let low128: u128 = unsafe { std::mem::transmute(low) };
-    let high128: u128 = unsafe { std::mem::transmute(high) };
+    let low128: u128 = unsafe { core::mem::transmute(low) };
+    let high128: u128 = unsafe { core::mem::transmute(high) };
     (FieldElement(low128), FieldElement(high128))
 }
 
@@ -70,13 +72,13 @@ impl crate::platform::GF128FieldElement for FieldElement {
     }
 
     #[inline]
-    fn load_elem(b: &[u8]) -> Self {
-        load_elem(b)
+    fn load_element(b: &[u8]) -> Self {
+        load_element(b)
     }
 
     #[inline]
-    fn store_elem(&self, b: &mut [u8]) {
-        store_elem(self, b);
+    fn store_element(&self, b: &mut [u8]) {
+        store_element(self, b);
     }
 
     #[inline]
@@ -90,14 +92,15 @@ impl crate::platform::GF128FieldElement for FieldElement {
     }
 }
 
+#[cfg(feature = "std")]
 #[test]
 fn test_transmute() {
     let x = 1u128 << 64 ^ 2u128;
-    let xv: __m128i = unsafe { std::mem::transmute(x) };
+    let xv: __m128i = unsafe { core::mem::transmute(x) };
     let xv: __m128i = unsafe { _mm_slli_si128(xv, 8) };
-    let x: u128 = unsafe { std::mem::transmute(xv) };
-    println!("trans {:x}", x);
+    let x: u128 = unsafe { core::mem::transmute(xv) };
+    std::eprintln!("trans {:x}", x);
     let mut u64s = [0u64; 2];
     unsafe { _mm_storeu_si128(u64s.as_mut_ptr() as *mut __m128i, xv) };
-    println!("store {:?}", u64s)
+    std::eprintln!("store {:?}", u64s)
 }
diff --git a/aesgcm/src/platform/neon.rs b/aesgcm/src/platform/neon.rs
index 7fe9d7462..37a24363c 100644
--- a/aesgcm/src/platform/neon.rs
+++ b/aesgcm/src/platform/neon.rs
@@ -1,4 +1,5 @@
-mod aes_core;
-mod gf128_core;
 pub(crate) use aes_core::State;
 pub(crate) use gf128_core::FieldElement;
+
+mod aes_core;
+mod gf128_core;
diff --git a/aesgcm/src/platform/neon/gf128_core.rs b/aesgcm/src/platform/neon/gf128_core.rs
index db0174671..3c9594dc9 100644
--- a/aesgcm/src/platform/neon/gf128_core.rs
+++ b/aesgcm/src/platform/neon/gf128_core.rs
@@ -12,12 +12,14 @@ fn zero() -> FieldElement {
 #[inline]
 fn load_element(b: &[u8]) -> FieldElement {
     debug_assert!(b.len() == 16);
+
     FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
 }
 
 #[inline]
 fn store_element(element: &FieldElement, bytes: &mut [u8]) {
     debug_assert!(bytes.len() == 16);
+
     bytes.copy_from_slice(&element.0.to_be_bytes());
 }
 
diff --git a/aesgcm/src/platform/portable.rs b/aesgcm/src/platform/portable.rs
index 7fe9d7462..37a24363c 100644
--- a/aesgcm/src/platform/portable.rs
+++ b/aesgcm/src/platform/portable.rs
@@ -1,4 +1,5 @@
-mod aes_core;
-mod gf128_core;
 pub(crate) use aes_core::State;
 pub(crate) use gf128_core::FieldElement;
+
+mod aes_core;
+mod gf128_core;
diff --git a/aesgcm/src/platform/portable/aes_core/test.rs b/aesgcm/src/platform/portable/aes_core/test.rs
index c624a9dfe..c5524fc4f 100644
--- a/aesgcm/src/platform/portable/aes_core/test.rs
+++ b/aesgcm/src/platform/portable/aes_core/test.rs
@@ -694,13 +694,13 @@ fn test_transpose() {
             if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) {
                 #[cfg(feature = "std")]
                 {
-                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
-                    println!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j));
+                    std::eprintln!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    std::eprintln!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j));
                 }
                 assert!(false);
             } else {
                 #[cfg(feature = "std")]
-                println!("transpose ok: {},{}", i, j);
+                std::eprintln!("transpose ok: {},{}", i, j);
             }
         }
     }
@@ -711,13 +711,13 @@ fn test_transpose() {
             if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) {
                 #[cfg(feature = "std")]
                 {
-                    println!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
-                    println!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
+                    std::eprintln!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j));
+                    std::eprintln!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j));
                 }
                 assert!(false);
             } else {
                 #[cfg(feature = "std")]
-                println!("inv-transpose ok: {},{}", i, j);
+                std::eprintln!("inv-transpose ok: {},{}", i, j);
             }
         }
     }
@@ -736,11 +736,11 @@ fn test_sbox() {
         transpose_u16x8(&y, &mut w);
         if w[0] != sbox_fwd(i as u8) {
             #[cfg(feature = "std")]
-            println!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
+            std::eprintln!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8));
             assert!(false);
         } else {
             #[cfg(feature = "std")]
-            println!("sbox ok {}", i)
+            std::eprintln!("sbox ok {}", i)
         }
     }
 }
@@ -758,7 +758,7 @@ fn test_sbox_inv() {
         transpose_u16x8(&y, &mut w);
         if w[0] != sbox_inv(i as u8) {
             #[cfg(feature = "std")]
-            println!(
+            std::eprintln!(
                 "sbox_inv[{}] = {}, should be {}",
                 i,
                 w[0],
@@ -767,7 +767,7 @@ fn test_sbox_inv() {
             assert!(false);
         } else {
             #[cfg(feature = "std")]
-            println!("sbox inv ok {}", i)
+            std::eprintln!("sbox inv ok {}", i)
         }
     }
 }
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
index 6f1ff7e7e..66fb40c65 100644
--- a/aesgcm/tests/wycheproof.rs
+++ b/aesgcm/tests/wycheproof.rs
@@ -55,6 +55,7 @@ fn test() {
         if test_group.key_size == 128 {
             for test in test_group.tests {
                 println!("  Test AES-GCM 128 {}", test.tc_id);
+
                 // Multiplexing
                 run::<16, libcrux_aesgcm::AesGcm128>(&test);
 

From 97cf135776183b9257c60f869ac5fdd5a26bcacc Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sun, 21 Sep 2025 11:07:01 +0200
Subject: [PATCH 26/43] make gf128 x64 a little faster

---
 aesgcm/.gitignore                          |  1 +
 aesgcm/examples/bench.rs                   | 26 +++++++
 aesgcm/src/lib.rs                          |  2 +-
 aesgcm/src/platform/intel_ni/gf128_core.rs | 84 ++++++++++++++++++----
 4 files changed, 97 insertions(+), 16 deletions(-)
 create mode 100644 aesgcm/.gitignore
 create mode 100644 aesgcm/examples/bench.rs

diff --git a/aesgcm/.gitignore b/aesgcm/.gitignore
new file mode 100644
index 000000000..407088d6f
--- /dev/null
+++ b/aesgcm/.gitignore
@@ -0,0 +1 @@
+profile.json.gz
diff --git a/aesgcm/examples/bench.rs b/aesgcm/examples/bench.rs
new file mode 100644
index 000000000..0f823cbfd
--- /dev/null
+++ b/aesgcm/examples/bench.rs
@@ -0,0 +1,26 @@
+use libcrux_aesgcm::Aead;
+
+fn main() {
+    const PAYLOAD_SIZES: usize = 3045;
+
+    let key = [0x16; 16];
+    let nonce = [0x12; 12];
+
+    let aad = [0xff; 32];
+    let plaintext = [0xab; PAYLOAD_SIZES];
+
+    let mut ciphertext = vec![0; PAYLOAD_SIZES];
+    let mut tag = [0u8; 16];
+
+    for _ in 0..10000 {
+        libcrux_aesgcm::AesGcm128::encrypt(
+            &mut ciphertext,
+            &mut tag,
+            &key,
+            &nonce,
+            &aad,
+            &plaintext,
+        )
+        .unwrap();
+    }
+}
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index c93fdde30..84cd7fc6b 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -237,7 +237,7 @@ pub mod x64 {
         };
     }
 
-    x64_pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::portable::State, platform::intel_ni::FieldElement>);
+    x64_pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::intel_ni::State, platform::intel_ni::FieldElement>);
     x64_pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::intel_ni::State, platform::intel_ni::FieldElement>);
 }
 
diff --git a/aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/intel_ni/gf128_core.rs
index 28cf800b5..8d854b516 100644
--- a/aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -30,23 +30,77 @@ fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
     FieldElement((*elem).0 ^ (*other).0)
 }
 
+// #[inline]
+// fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+//     let lhs: __m128i = unsafe { core::mem::transmute((*elem).0) };
+//     let rhs: __m128i = unsafe { core::mem::transmute((*other).0) };
+
+//     let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
+//     let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
+//     let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
+//     let high = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x00) };
+//     let mid = unsafe { _mm_xor_si128(mid0, mid1) };
+//     let m0 = unsafe { _mm_srli_si128(mid, 8) };
+//     let m1 = unsafe { _mm_slli_si128(mid, 8) };
+//     let low = unsafe { _mm_xor_si128(low, m0) };
+//     let high = unsafe { _mm_xor_si128(high, m1) };
+
+//     let low128: u128 = unsafe { core::mem::transmute(low) };
+//     let high128: u128 = unsafe { core::mem::transmute(high) };
+
+//     (FieldElement(low128), FieldElement(high128))
+// }
+
+/// Performs a 128x128 to 256-bit carry-less multiplication.
+///
+/// This implementation uses the Karatsuba algorithm to reduce the number of expensive
+/// PCLMULQDQ instructions from 4 to 3. On most modern x64 CPUs (Intel Sandy
+/// Bridge and newer, AMD Zen and newer), this results in higher performance due to
+/// better utilization of execution ports and potentially lower overall latency.
+///
+/// @param elem The first 128-bit field element.
+/// @param other The second 128-bit field element.
+/// @returns A tuple `(high, low)` containing the 256-bit result.
 #[inline]
 fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
-    let lhs: __m128i = unsafe { core::mem::transmute((*elem).0) };
-    let rhs: __m128i = unsafe { core::mem::transmute((*other).0) };
-    let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
-    let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
-    let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
-    let high = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x00) };
-    let mid = unsafe { _mm_xor_si128(mid0, mid1) };
-    let m0 = unsafe { _mm_srli_si128(mid, 8) };
-    let m1 = unsafe { _mm_slli_si128(mid, 8) };
-    let low = unsafe { _mm_xor_si128(low, m0) };
-    let high = unsafe { _mm_xor_si128(high, m1) };
-
-    let low128: u128 = unsafe { core::mem::transmute(low) };
-    let high128: u128 = unsafe { core::mem::transmute(high) };
-    (FieldElement(low128), FieldElement(high128))
+    // Let the inputs be a = (a_hi << 64) | a_lo and b = (b_hi << 64) | b_lo.
+    // The product is (a_hi*b_hi << 128) + ((a_lo*b_hi ^ a_hi*b_lo) << 64) + a_lo*b_lo.
+    // The Karatsuba trick computes the middle term using the other two products:
+    // (a_lo*b_hi ^ a_hi*b_lo) = (a_lo^a_hi)*(b_lo^b_hi) ^ a_lo*b_lo ^ a_hi*b_hi
+
+    let a: __m128i = unsafe { core::mem::transmute(elem.0) };
+    let b: __m128i = unsafe { core::mem::transmute(other.0) };
+
+    // 1. Calculate the low and high 128-bit parts of the product in parallel.
+    //    p_lo = a_lo * b_lo
+    let p_lo = unsafe { _mm_clmulepi64_si128(a, b, 0x00) };
+    //    p_hi = a_hi * b_hi
+    let p_hi = unsafe { _mm_clmulepi64_si128(a, b, 0x11) };
+
+    // 2. Calculate the middle term using the third multiplication.
+    //    First, prepare the operands (a_lo^a_hi) and (b_lo^b_hi).
+    //    Using unpack instructions is an alternative to shuffling.
+    let a_xor = unsafe { _mm_xor_si128(_mm_unpackhi_epi64(a, a), _mm_unpacklo_epi64(a, a)) };
+    let b_xor = unsafe { _mm_xor_si128(_mm_unpackhi_epi64(b, b), _mm_unpacklo_epi64(b, b)) };
+
+    // Multiply the low 64-bit parts of the XORed results.
+    // p_mid_prod = (a_lo^a_hi) * (b_lo^b_hi)
+    let p_mid_prod = unsafe { _mm_clmulepi64_si128(a_xor, b_xor, 0x00) };
+
+    // Finish computing the middle term by XORing with p_lo and p_hi.
+    let p_mid = unsafe { _mm_xor_si128(_mm_xor_si128(p_mid_prod, p_lo), p_hi) };
+
+    // 3. Combine the parts to get the final 256-bit result.
+    //    The middle part is XORed at a 64-bit offset.
+    //    res_low  = p_lo ^ (p_mid << 64)
+    //    res_high = p_hi ^ (p_mid >> 64)
+    let res_low = unsafe { _mm_xor_si128(p_lo, _mm_slli_si128(p_mid, 8)) };
+    let res_high = unsafe { _mm_xor_si128(p_hi, _mm_srli_si128(p_mid, 8)) };
+
+    // The original function returned (high_part, low_part). We maintain that order.
+    let high_part: u128 = unsafe { core::mem::transmute(res_high) };
+    let low_part: u128 = unsafe { core::mem::transmute(res_low) };
+    (FieldElement(high_part), FieldElement(low_part))
 }
 
 #[inline]

From 7c5b857a7d5d2b49c5422030c321a2a47a82db63 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Sun, 21 Sep 2025 17:51:54 +0200
Subject: [PATCH 27/43] more cleanup

---
 aesgcm/src/lib.rs                          | 13 +++-
 aesgcm/src/platform/intel_ni/aes_core.rs   | 40 ++++++-----
 aesgcm/src/platform/intel_ni/gf128_core.rs | 50 +++++++++----
 libcrux-intrinsics/src/avx2.rs             | 81 ++++++++++++++++++++++
 4 files changed, 151 insertions(+), 33 deletions(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 84cd7fc6b..6aae73a99 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -1,4 +1,5 @@
 #![no_std]
+#![deny(unsafe_code)]
 
 #[cfg(feature = "std")]
 extern crate std;
@@ -192,6 +193,7 @@ pub mod x64 {
 
                     #[inline]
                     #[target_feature(enable = "avx2", enable = "aes")]
+                    #[allow(unsafe_code)]
                     unsafe fn inner(
                         key: &[u8],
                         nonce: &[u8],
@@ -203,7 +205,10 @@ pub mod x64 {
                         crate::encrypt::<State>(key, nonce, aad, plaintext, ciphertext, tag);
                     }
 
-                    unsafe { inner(key, nonce, aad, plaintext, ciphertext, tag) };
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        inner(key, nonce, aad, plaintext, ciphertext, tag)
+                    };
                 }
 
                 #[doc = $variant_comment]
@@ -220,6 +225,7 @@ pub mod x64 {
 
                     #[inline]
                     #[target_feature(enable = "avx2", enable = "aes")]
+                    #[allow(unsafe_code)]
                     unsafe fn inner(
                         key: &[u8],
                         nonce: &[u8],
@@ -231,7 +237,10 @@ pub mod x64 {
                         crate::decrypt::<State>(key, nonce, aad, ciphertext, tag, plaintext)
                     }
 
-                    unsafe { inner(key, nonce, aad, ciphertext, tag, plaintext) }
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        inner(key, nonce, aad, ciphertext, tag, plaintext)
+                    }
                 }
             }
         };
diff --git a/aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/intel_ni/aes_core.rs
index 6a363cca1..4ff6dbb2b 100644
--- a/aesgcm/src/platform/intel_ni/aes_core.rs
+++ b/aesgcm/src/platform/intel_ni/aes_core.rs
@@ -1,53 +1,56 @@
 use core::arch::x86_64::*;
 
+use libcrux_intrinsics::avx2::{
+    mm_aesenc_si128, mm_aesenclast_si128, mm_aeskeygenassist_si128, mm_loadu_si128,
+    mm_setzero_si128, mm_shuffle_epi32, mm_slli_si128, mm_storeu_si128_u8, mm_xor_si128,
+};
+
 /// The avx2 state.
 pub(crate) type State = __m128i;
 
 #[inline]
 fn new_state() -> State {
-    unsafe { _mm_setzero_si128() }
+    mm_setzero_si128()
 }
 
 #[inline]
 fn xor_key1_state(st: &mut State, k: &State) {
-    unsafe { *st = _mm_xor_si128(*st, *k) }
+    *st = mm_xor_si128(*st, *k);
 }
 
 #[inline]
 fn aes_enc(st: &mut State, key: &State) {
-    unsafe { *st = _mm_aesenc_si128(*st, *key) }
+    *st = mm_aesenc_si128(*st, *key);
 }
 
 #[inline]
 fn aes_enc_last(st: &mut State, key: &State) {
-    unsafe { *st = _mm_aesenclast_si128(*st, *key) }
+    *st = mm_aesenclast_si128(*st, *key);
 }
 
 #[inline]
 fn aes_keygen_assist<const RCON: i32>(next: &mut State, prev: &State) {
-    unsafe { *next = _mm_aeskeygenassist_si128::<RCON>(*prev) }
+    *next = mm_aeskeygenassist_si128::<RCON>(*prev);
 }
 
 #[inline]
 fn aes_keygen_assist0<const RCON: i32>(next: &mut State, prev: &State) {
     aes_keygen_assist::<RCON>(next, prev);
-    unsafe { *next = _mm_shuffle_epi32(*next, 0xff) }
+    *next = mm_shuffle_epi32::<0xff>(*next);
 }
 
 #[inline]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist::<0>(next, prev);
-    unsafe { *next = _mm_shuffle_epi32(*next, 0xaa) }
+    *next = mm_shuffle_epi32::<0xaa>(*next);
 }
 
 #[inline]
 fn key_expansion_step(next: &mut State, prev: &State) {
-    unsafe {
-        let p0 = _mm_xor_si128(*prev, _mm_slli_si128(*prev, 4));
-        let p1 = _mm_xor_si128(p0, _mm_slli_si128(p0, 4));
-        let p2 = _mm_xor_si128(p1, _mm_slli_si128(p1, 4));
-        *next = _mm_xor_si128(*next, p2);
-    }
+    let p0 = mm_xor_si128(*prev, mm_slli_si128::<4>(*prev));
+    let p1 = mm_xor_si128(p0, mm_slli_si128::<4>(p0));
+    let p2 = mm_xor_si128(p1, mm_slli_si128::<4>(p1));
+    *next = mm_xor_si128(*next, p2);
 }
 
 impl crate::platform::AESState for State {
@@ -60,14 +63,14 @@ impl crate::platform::AESState for State {
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
 
-        unsafe { *self = _mm_loadu_si128(b.as_ptr() as *const __m128i) };
+        *self = mm_loadu_si128(b);
     }
 
     #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
 
-        unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, *self) };
+        mm_storeu_si128_u8(out, *self);
     }
 
     #[inline]
@@ -78,9 +81,9 @@ impl crate::platform::AESState for State {
         let mut block_out = [0u8; 16];
         block_in[0..input.len()].copy_from_slice(input);
 
-        let inp_vec = unsafe { _mm_loadu_si128(block_in.as_ptr() as *const __m128i) };
-        let out_vec = unsafe { _mm_xor_si128(inp_vec, *self) };
-        unsafe { _mm_storeu_si128(block_out.as_mut_ptr() as *mut __m128i, out_vec) };
+        let inp_vec = mm_loadu_si128(&block_in);
+        let out_vec = mm_xor_si128(inp_vec, *self);
+        mm_storeu_si128_u8(&mut block_out, out_vec);
 
         out.copy_from_slice(&block_out[0..out.len()]);
     }
@@ -118,6 +121,7 @@ impl crate::platform::AESState for State {
 }
 
 #[cfg(feature = "std")]
+#[allow(unsafe_code)]
 #[test]
 fn test() {
     unsafe {
diff --git a/aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/intel_ni/gf128_core.rs
index 8d854b516..919d7c6d2 100644
--- a/aesgcm/src/platform/intel_ni/gf128_core.rs
+++ b/aesgcm/src/platform/intel_ni/gf128_core.rs
@@ -1,11 +1,33 @@
 use core::arch::x86_64::*;
 
+use libcrux_intrinsics::avx2::{
+    mm_clmulepi64_si128, mm_slli_si128, mm_srli_si128, mm_unpackhi_epi64, mm_unpacklo_epi64,
+    mm_xor_si128,
+};
+
 // XXX: A lot of the code below is shared with NEON. Refactor!
 
 /// An avx2 gf128 field element.
 #[derive(Clone, Copy)]
+#[repr(transparent)]
 pub(crate) struct FieldElement(pub(super) u128);
 
+impl FieldElement {
+    /// Transmute `u128` and `__m128i`.
+    #[inline]
+    #[allow(unsafe_code)]
+    fn transmute(&self) -> __m128i {
+        unsafe { core::mem::transmute(self.0) }
+    }
+
+    /// Convert a vec to self.
+    #[inline]
+    #[allow(unsafe_code)]
+    fn from_vec128(vec: __m128i) -> Self {
+        unsafe { core::mem::transmute(vec) }
+    }
+}
+
 #[inline]
 fn zero() -> FieldElement {
     FieldElement(0)
@@ -68,39 +90,40 @@ fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldEl
     // The Karatsuba trick computes the middle term using the other two products:
     // (a_lo*b_hi ^ a_hi*b_lo) = (a_lo^a_hi)*(b_lo^b_hi) ^ a_lo*b_lo ^ a_hi*b_hi
 
-    let a: __m128i = unsafe { core::mem::transmute(elem.0) };
-    let b: __m128i = unsafe { core::mem::transmute(other.0) };
+    let a: __m128i = elem.transmute();
+    let b: __m128i = other.transmute();
 
     // 1. Calculate the low and high 128-bit parts of the product in parallel.
     //    p_lo = a_lo * b_lo
-    let p_lo = unsafe { _mm_clmulepi64_si128(a, b, 0x00) };
+    let p_lo = mm_clmulepi64_si128::<0x00>(a, b);
     //    p_hi = a_hi * b_hi
-    let p_hi = unsafe { _mm_clmulepi64_si128(a, b, 0x11) };
+    let p_hi = mm_clmulepi64_si128::<0x11>(a, b);
 
     // 2. Calculate the middle term using the third multiplication.
     //    First, prepare the operands (a_lo^a_hi) and (b_lo^b_hi).
     //    Using unpack instructions is an alternative to shuffling.
-    let a_xor = unsafe { _mm_xor_si128(_mm_unpackhi_epi64(a, a), _mm_unpacklo_epi64(a, a)) };
-    let b_xor = unsafe { _mm_xor_si128(_mm_unpackhi_epi64(b, b), _mm_unpacklo_epi64(b, b)) };
+    let a_xor = mm_xor_si128(mm_unpackhi_epi64(a, a), mm_unpacklo_epi64(a, a));
+    let b_xor = mm_xor_si128(mm_unpackhi_epi64(b, b), mm_unpacklo_epi64(b, b));
 
     // Multiply the low 64-bit parts of the XORed results.
     // p_mid_prod = (a_lo^a_hi) * (b_lo^b_hi)
-    let p_mid_prod = unsafe { _mm_clmulepi64_si128(a_xor, b_xor, 0x00) };
+    let p_mid_prod = mm_clmulepi64_si128::<0x00>(a_xor, b_xor);
 
     // Finish computing the middle term by XORing with p_lo and p_hi.
-    let p_mid = unsafe { _mm_xor_si128(_mm_xor_si128(p_mid_prod, p_lo), p_hi) };
+    let p_mid = mm_xor_si128(mm_xor_si128(p_mid_prod, p_lo), p_hi);
 
     // 3. Combine the parts to get the final 256-bit result.
     //    The middle part is XORed at a 64-bit offset.
     //    res_low  = p_lo ^ (p_mid << 64)
     //    res_high = p_hi ^ (p_mid >> 64)
-    let res_low = unsafe { _mm_xor_si128(p_lo, _mm_slli_si128(p_mid, 8)) };
-    let res_high = unsafe { _mm_xor_si128(p_hi, _mm_srli_si128(p_mid, 8)) };
+    let res_low = mm_xor_si128(p_lo, mm_slli_si128::<8>(p_mid));
+    let res_high = mm_xor_si128(p_hi, mm_srli_si128::<8>(p_mid));
 
     // The original function returned (high_part, low_part). We maintain that order.
-    let high_part: u128 = unsafe { core::mem::transmute(res_high) };
-    let low_part: u128 = unsafe { core::mem::transmute(res_low) };
-    (FieldElement(high_part), FieldElement(low_part))
+    (
+        FieldElement::from_vec128(res_high),
+        FieldElement::from_vec128(res_low),
+    )
 }
 
 #[inline]
@@ -146,6 +169,7 @@ impl crate::platform::GF128FieldElement for FieldElement {
     }
 }
 
+#[allow(unsafe_code)]
 #[cfg(feature = "std")]
 #[test]
 fn test_transmute() {
diff --git a/libcrux-intrinsics/src/avx2.rs b/libcrux-intrinsics/src/avx2.rs
index 9acb73ddb..b04a71d87 100644
--- a/libcrux-intrinsics/src/avx2.rs
+++ b/libcrux-intrinsics/src/avx2.rs
@@ -57,6 +57,16 @@ pub fn mm_storeu_si128(output: &mut [i16], vector: Vec128) {
     }
 }
 
+#[hax_lib::opaque]
+#[inline(always)]
+pub fn mm_storeu_si128_u8(output: &mut [u8], vector: Vec128) {
+    #[cfg(not(hax))]
+    debug_assert!(output.len() >= 8);
+    unsafe {
+        _mm_storeu_si128(output.as_mut_ptr() as *mut Vec128, vector);
+    }
+}
+
 #[hax_lib::opaque]
 #[inline(always)]
 pub fn mm_storeu_si128_i32(output: &mut [i32], vector: Vec128) {
@@ -115,6 +125,11 @@ pub fn mm256_setzero_si256() -> Vec256 {
     unsafe { _mm256_setzero_si256() }
 }
 
+#[inline(always)]
+pub fn mm_setzero_si128() -> Vec128 {
+    unsafe { _mm_setzero_si128() }
+}
+
 #[inline(always)]
 pub fn mm256_set_m128i(hi: Vec128, lo: Vec128) -> Vec256 {
     unsafe { _mm256_set_m128i(hi, lo) }
@@ -439,6 +454,12 @@ pub fn mm256_xor_si256(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_xor_si256(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_xor_si128(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_xor_si128(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_srai_epi16<const SHIFT_BY: i32>(vector: Vec256) -> Vec256 {
@@ -504,6 +525,22 @@ pub fn mm256_slli_epi32<const SHIFT_BY: i32>(vector: Vec256) -> Vec256 {
     unsafe { _mm256_slli_epi32::<SHIFT_BY>(vector) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_slli_si128<const SHIFT_BY: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(SHIFT_BY >= 0 && SHIFT_BY < 16);
+    unsafe { _mm_slli_si128::<SHIFT_BY>(vector) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_srli_si128<const SHIFT_BY: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(SHIFT_BY >= 0 && SHIFT_BY < 16);
+    unsafe { _mm_srli_si128::<SHIFT_BY>(vector) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm_shuffle_epi8(vector: Vec128, control: Vec128) -> Vec128 {
@@ -524,6 +561,14 @@ pub fn mm256_shuffle_epi32<const CONTROL: i32>(vector: Vec256) -> Vec256 {
     unsafe { _mm256_shuffle_epi32::<CONTROL>(vector) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_shuffle_epi32<const CONTROL: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(CONTROL >= 0 && CONTROL < 256);
+    unsafe { _mm_shuffle_epi32::<CONTROL>(vector) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_permute4x64_epi64<const CONTROL: i32>(vector: Vec256) -> Vec256 {
@@ -538,6 +583,12 @@ pub fn mm256_unpackhi_epi64(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_unpackhi_epi64(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_unpackhi_epi64(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_unpackhi_epi64(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_unpacklo_epi32(lhs: Vec256, rhs: Vec256) -> Vec256 {
@@ -701,8 +752,38 @@ pub fn mm256_unpacklo_epi64(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_unpacklo_epi64(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_unpacklo_epi64(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_unpacklo_epi64(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_permute2x128_si256<const IMM8: i32>(a: Vec256, b: Vec256) -> Vec256 {
     unsafe { _mm256_permute2x128_si256::<IMM8>(a, b) }
 }
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_clmulepi64_si128<const IMM8: i32>(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_clmulepi64_si128(a, b, IMM8) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aesenc_si128(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_aesenc_si128(a, b) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aesenclast_si128(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_aesenclast_si128(a, b) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aeskeygenassist_si128<const RCON: i32>(a: Vec128) -> Vec128 {
+    unsafe { _mm_aeskeygenassist_si128(a, RCON) }
+}

From 2631ea7e8e1352137e40690217d41a7f24f4b876 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Mon, 22 Sep 2025 15:36:41 +0200
Subject: [PATCH 28/43] fixup neon

---
 aesgcm/src/platform/neon/aes_core.rs | 110 ++++++++++++++++-----------
 libcrux-intrinsics/src/arm64.rs      |  59 ++++++++++++++
 2 files changed, 126 insertions(+), 43 deletions(-)

diff --git a/aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
index ed6553ff9..709d6e38b 100644
--- a/aesgcm/src/platform/neon/aes_core.rs
+++ b/aesgcm/src/platform/neon/aes_core.rs
@@ -1,122 +1,146 @@
-use core::arch::aarch64::*;
+use libcrux_intrinsics::arm64::{
+    _uint8x16_t, _vaeseq_u8, _vaesmcq_u8, _vdupq_laneq_u32, _vdupq_n_u32, _vdupq_n_u8, _veorq_u32,
+    _veorq_u8, _vextq_u32, _vld1q_u32, _vld1q_u8, _vreinterpretq_u32_u8, _vreinterpretq_u8_u32,
+    _vst1q_u8,
+};
 
 /// The Neon state
-pub(crate) type State = uint8x16_t;
+pub(crate) type State = _uint8x16_t;
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn new_state() -> State {
-    unsafe { vdupq_n_u8(0) }
+    _vdupq_n_u8(0)
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn xor_key1_state(st: &mut State, k: &State) {
-    unsafe { *st = veorq_u8(*st, *k) }
+    *st = _veorq_u8(*st, *k);
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn aes_enc(st: &mut State, key: &State) {
-    unsafe { *st = veorq_u8(vaesmcq_u8(vaeseq_u8(*st, vdupq_n_u8(0))), *key) }
+    *st = _veorq_u8(_vaesmcq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0))), *key);
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn aes_enc_last(st: &mut State, key: &State) {
-    unsafe { *st = veorq_u8(vaeseq_u8(*st, vdupq_n_u8(0)), *key) }
+    *st = _veorq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0)), *key)
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
-    unsafe {
-        let st = vaeseq_u8(*prev, vdupq_n_u8(0));
-        let mut tmp = [0u8; 16];
-        vst1q_u8(tmp.as_mut_ptr(), st);
-        let tmp_new = [
-            tmp[4], tmp[1], tmp[14], tmp[11], tmp[1], tmp[14], tmp[11], tmp[4], tmp[12], tmp[9],
-            tmp[6], tmp[3], tmp[9], tmp[6], tmp[3], tmp[12],
-        ];
-        let st_new = vld1q_u8(tmp_new.as_ptr());
-        let rcon_array = [0, rcon as u32, 0, rcon as u32];
-        let rcon_vec = vreinterpretq_u8_u32(vld1q_u32(rcon_array.as_ptr()));
-        *next = veorq_u8(st_new, rcon_vec);
-    }
+    let st = _vaeseq_u8(*prev, _vdupq_n_u8(0));
+    let mut tmp = [0u8; 16];
+    _vst1q_u8(&mut tmp, st);
+    let tmp_new = [
+        tmp[4], tmp[1], tmp[14], tmp[11], tmp[1], tmp[14], tmp[11], tmp[4], tmp[12], tmp[9],
+        tmp[6], tmp[3], tmp[9], tmp[6], tmp[3], tmp[12],
+    ];
+    let st_new = _vld1q_u8(&tmp_new);
+    let rcon_array = [0, rcon as u32, 0, rcon as u32];
+    let rcon_vec = _vreinterpretq_u8_u32(_vld1q_u32(&rcon_array));
+    *next = _veorq_u8(st_new, rcon_vec);
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
-    unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 3)) }
+    *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<3>(_vreinterpretq_u32_u8(*next)))
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
-    unsafe { *next = vreinterpretq_u8_u32(vdupq_laneq_u32(vreinterpretq_u32_u8(*next), 2)) }
+    *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<2>(_vreinterpretq_u32_u8(*next)));
 }
 
 #[inline]
+#[target_feature(enable = "neon")]
 fn key_expansion_step(next: &mut State, prev: &State) {
-    unsafe {
-        let zero = vdupq_n_u32(0);
-        let prev0 = vreinterpretq_u32_u8(*prev);
-        let prev1 = veorq_u32(prev0, vextq_u32(zero, prev0, 3));
-        let prev2 = veorq_u32(prev1, vextq_u32(zero, prev1, 3));
-        let prev3 = veorq_u32(prev2, vextq_u32(zero, prev2, 3));
-        *next = veorq_u8(*next, vreinterpretq_u8_u32(prev3));
-    }
+    let zero = _vdupq_n_u32(0);
+    let prev0 = _vreinterpretq_u32_u8(*prev);
+    let prev1 = _veorq_u32(prev0, _vextq_u32::<3>(zero, prev0));
+    let prev2 = _veorq_u32(prev1, _vextq_u32::<3>(zero, prev1));
+    let prev3 = _veorq_u32(prev2, _vextq_u32::<3>(zero, prev2));
+    *next = _veorq_u8(*next, _vreinterpretq_u8_u32(prev3));
 }
 
 impl crate::platform::AESState for State {
     #[inline]
+    #[allow(unsafe_code)]
     fn new() -> Self {
-        new_state()
+        unsafe { new_state() }
     }
 
     #[inline]
     fn load_block(&mut self, b: &[u8]) {
         debug_assert!(b.len() == 16);
-        unsafe { *self = vld1q_u8(b.as_ptr()) };
+        *self = _vld1q_u8(b);
     }
 
     #[inline]
     fn store_block(&self, out: &mut [u8]) {
         debug_assert!(out.len() == 16);
-        unsafe { vst1q_u8(out.as_mut_ptr(), *self) }
+        _vst1q_u8(out, *self);
     }
 
     #[inline]
-    fn xor_block(&self, inp: &[u8], out: &mut [u8]) {
-        debug_assert!(inp.len() == out.len() && inp.len() <= 16);
-        let inp_vec = unsafe { vld1q_u8(inp.as_ptr()) };
-        let out_vec = unsafe { veorq_u8(inp_vec, *self) };
-        unsafe { vst1q_u8(out.as_mut_ptr(), out_vec) }
+    #[allow(unsafe_code)]
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= 16);
+        // XXX: hot-fix to have enough input and output here.
+        // For some reason this doesn't fail even if we don't do this.
+        let mut block_in = [0u8; 16];
+        let mut block_out = [0u8; 16];
+        block_in[0..input.len()].copy_from_slice(input);
+
+        let inp_vec = _vld1q_u8(&block_in);
+        let out_vec = unsafe { _veorq_u8(inp_vec, *self) };
+        _vst1q_u8(&mut block_out, out_vec);
+
+        out.copy_from_slice(&block_out[0..out.len()]);
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn xor_key(&mut self, key: &Self) {
-        xor_key1_state(self, key);
+        unsafe { xor_key1_state(self, key) };
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn aes_enc(&mut self, key: &Self) {
-        aes_enc(self, key);
+        unsafe { aes_enc(self, key) };
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn aes_enc_last(&mut self, key: &Self) {
-        aes_enc_last(self, key);
+        unsafe { aes_enc_last(self, key) };
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
-        aes_keygen_assist0(self, prev, RCON as u8);
+        unsafe { aes_keygen_assist0(self, prev, RCON as u8) };
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn aes_keygen_assist1(&mut self, prev: &Self) {
-        aes_keygen_assist1(self, prev);
+        unsafe { aes_keygen_assist1(self, prev) };
     }
 
     #[inline]
+    #[allow(unsafe_code)]
     fn key_expansion_step(&mut self, prev: &Self) {
-        key_expansion_step(self, prev)
+        unsafe { key_expansion_step(self, prev) }
     }
 }
diff --git a/libcrux-intrinsics/src/arm64.rs b/libcrux-intrinsics/src/arm64.rs
index c78871555..5cd5b913c 100644
--- a/libcrux-intrinsics/src/arm64.rs
+++ b/libcrux-intrinsics/src/arm64.rs
@@ -4,6 +4,7 @@ use core::arch::aarch64::*;
 pub type _int16x8_t = int16x8_t;
 pub type _uint32x4_t = uint32x4_t;
 pub type _uint64x2_t = uint64x2_t;
+pub type _uint8x16_t = uint8x16_t;
 
 #[inline(always)]
 pub fn _vdupq_n_s16(i: i16) -> int16x8_t {
@@ -171,6 +172,18 @@ pub fn _vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
     unsafe { vreinterpretq_u32_s32(a) }
 }
 
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+    vreinterpretq_u32_u8(a)
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t {
+    vreinterpretq_u8_u32(a)
+}
+
 #[inline(always)]
 pub fn _vshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
     unsafe { vshrq_n_u32::<N>(a) }
@@ -270,6 +283,12 @@ pub fn _vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
 pub fn _vld1q_u8(ptr: &[u8]) -> uint8x16_t {
     unsafe { vld1q_u8(ptr.as_ptr()) }
 }
+
+#[inline]
+pub fn _vld1q_u32(ptr: &[u32]) -> uint32x4_t {
+    unsafe { vld1q_u32(ptr.as_ptr()) }
+}
+
 #[inline(always)]
 pub fn _vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t {
     unsafe { vreinterpretq_u8_s16(a) }
@@ -375,6 +394,18 @@ pub fn _vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
     _veorq_u64(a, _veorq_u64(_vshlq_n_u64::<1>(b), _vshrq_n_u64::<63>(b)))
 }
 
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    veorq_u32(a, b)
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    vextq_u32(a, b, N)
+}
+
 #[inline(always)]
 pub fn _veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
     #[cfg(all(
@@ -437,3 +468,31 @@ pub fn _vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
 pub fn _vmull_p64(a: u64, b: u64) -> u128 {
     unsafe { vmull_p64(a, b) }
 }
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    veorq_u8(a, b)
+}
+
+#[inline]
+pub fn _vaesmcq_u8(data: uint8x16_t) -> uint8x16_t {
+    unsafe { vaesmcq_u8(data) }
+}
+
+#[inline]
+pub fn _vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+    unsafe { vaeseq_u8(data, key) }
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _vdupq_n_u8(value: u8) -> uint8x16_t {
+    vdupq_n_u8(value)
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+pub fn _vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+    vdupq_laneq_u32(a, N)
+}

From 214e81e05fd6659d74ac8080f0ea98ec5d44e8cb Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Mon, 22 Sep 2025 15:52:02 +0200
Subject: [PATCH 29/43] simple aes fuzz

---
 Cargo.toml                             |  1 +
 aesgcm/fuzz/.gitignore                 |  4 ++++
 aesgcm/fuzz/Cargo.toml                 | 28 ++++++++++++++++++++++++++
 aesgcm/fuzz/fuzz_targets/encrypt128.rs | 28 ++++++++++++++++++++++++++
 aesgcm/fuzz/fuzz_targets/encrypt256.rs | 28 ++++++++++++++++++++++++++
 5 files changed, 89 insertions(+)
 create mode 100644 aesgcm/fuzz/.gitignore
 create mode 100644 aesgcm/fuzz/Cargo.toml
 create mode 100644 aesgcm/fuzz/fuzz_targets/encrypt128.rs
 create mode 100644 aesgcm/fuzz/fuzz_targets/encrypt256.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5ea08e6b5..5b28a5296 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,7 @@
 [workspace]
 members = [
     "aesgcm",
+    "aesgcm/fuzz",
     "sys/hacl",
     "sys/libjade",
     "sys/platform",
diff --git a/aesgcm/fuzz/.gitignore b/aesgcm/fuzz/.gitignore
new file mode 100644
index 000000000..1a45eee77
--- /dev/null
+++ b/aesgcm/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/aesgcm/fuzz/Cargo.toml b/aesgcm/fuzz/Cargo.toml
new file mode 100644
index 000000000..6f6710321
--- /dev/null
+++ b/aesgcm/fuzz/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "libcrux_aesgcm-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.libcrux_aesgcm]
+path = ".."
+
+[[bin]]
+name = "encrypt128"
+path = "fuzz_targets/encrypt.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "encrypt256"
+path = "fuzz_targets/encrypt256.rs"
+test = false
+doc = false
+bench = false
diff --git a/aesgcm/fuzz/fuzz_targets/encrypt128.rs b/aesgcm/fuzz/fuzz_targets/encrypt128.rs
new file mode 100644
index 000000000..0daa622a8
--- /dev/null
+++ b/aesgcm/fuzz/fuzz_targets/encrypt128.rs
@@ -0,0 +1,28 @@
+#![no_main]
+
+use libcrux_aesgcm::Aead;
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    if data.len() < 16 + 12 + 7 {
+        // We want at least a key, nonce, and a few input bytes.
+        return;
+    }
+
+    let key = &data[0..16];
+    let nonce = &data[16..16 + 12];
+    let aad = &data[16 + 12..16 + 12 + 5];
+
+    let mut ctxt = vec![0u8; data.len()];
+    let mut tag = [0u8; 16];
+    libcrux_aesgcm::PortableAesGcm128::encrypt(
+        &mut ctxt,
+        &mut tag,
+        key.try_into().unwrap(),
+        nonce.try_into().unwrap(),
+        aad,
+        &data,
+    )
+    .unwrap();
+});
diff --git a/aesgcm/fuzz/fuzz_targets/encrypt256.rs b/aesgcm/fuzz/fuzz_targets/encrypt256.rs
new file mode 100644
index 000000000..528634f9c
--- /dev/null
+++ b/aesgcm/fuzz/fuzz_targets/encrypt256.rs
@@ -0,0 +1,28 @@
+#![no_main]
+
+use libcrux_aesgcm::Aead;
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    if data.len() < 32 + 12 + 7 {
+        // We want at least a key, nonce, and a few input bytes.
+        return;
+    }
+
+    let key = &data[0..32];
+    let nonce = &data[32..32 + 12];
+    let aad = &data[32 + 12..32 + 12 + 5];
+
+    let mut ctxt = vec![0u8; data.len()];
+    let mut tag = [0u8; 16];
+    libcrux_aesgcm::PortableAesGcm256::encrypt(
+        &mut ctxt,
+        &mut tag,
+        key.try_into().unwrap(),
+        nonce.try_into().unwrap(),
+        aad,
+        &data,
+    )
+    .unwrap();
+});

From 2ca8e7c1d3c50ba638035333eadc6ad2d39f0c73 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Mon, 22 Sep 2025 16:41:53 +0200
Subject: [PATCH 30/43] fixup

---
 aesgcm/fuzz/Cargo.toml | 2 +-
 aesgcm/src/lib.rs      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aesgcm/fuzz/Cargo.toml b/aesgcm/fuzz/Cargo.toml
index 6f6710321..ab0ba8f8f 100644
--- a/aesgcm/fuzz/Cargo.toml
+++ b/aesgcm/fuzz/Cargo.toml
@@ -15,7 +15,7 @@ path = ".."
 
 [[bin]]
 name = "encrypt128"
-path = "fuzz_targets/encrypt.rs"
+path = "fuzz_targets/encrypt128.rs"
 test = false
 doc = false
 bench = false
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 6aae73a99..cf2647841 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -168,7 +168,7 @@ pub mod neon {
 
 #[cfg(feature = "simd256")]
 pub mod x64 {
-    // Here we don't use the `pub_mod` macro becaus we need to add target features
+    // Here we don't use the `pub_mod` macro because we need to add target features
     // onto the functions.
     macro_rules! x64_pub_mod {
         ($variant_comment:literal, $mod_name:ident, $state:ty) => {

From 20c0510a4747cb063c0050a17439e0e85adb1ec4 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 08:45:34 +0200
Subject: [PATCH 31/43] addressing review comments

---
 Cargo.toml                                  |  3 +-
 aesgcm/README.md                            | 12 ++++++++
 aesgcm/build.rs                             |  4 +--
 aesgcm/src/aes.rs                           |  5 ----
 aesgcm/src/ctr.rs                           |  5 +++-
 aesgcm/src/gf128.rs                         |  2 +-
 aesgcm/src/lib.rs                           |  8 +++---
 aesgcm/src/platform.rs                      |  2 +-
 aesgcm/src/platform/intel_ni/aes_core.rs    |  1 -
 aesgcm/src/platform/neon/aes_core.rs        | 32 ++++++---------------
 aesgcm/src/platform/{intel_ni.rs => x64.rs} |  0
 libcrux-intrinsics/src/arm64.rs             | 27 +++++++----------
 12 files changed, 44 insertions(+), 57 deletions(-)
 create mode 100644 aesgcm/README.md
 rename aesgcm/src/platform/{intel_ni.rs => x64.rs} (100%)

diff --git a/Cargo.toml b/Cargo.toml
index 5b28a5296..c507579fa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,6 +53,7 @@ allow-branch = ["main"]
 hax-lib = { version = "0.3.4" }
 libcrux-intrinsics = { version = "=0.0.3", path = "libcrux-intrinsics" }
 libcrux-aesgcm = { version = "=0.0.2", path = "aesgcm" }
+libcrux-chacha20poly1305 = { version = "=0.0.3", path = "chacha20poly1305" }
 libcrux-traits = { version = "=0.0.3", path = "traits" }
 libcrux-hacl-rs = { version = "=0.0.3", path = "hacl-rs" }
 libcrux-hacl = { version = "=0.0.2", path = "sys/hacl" }
@@ -97,7 +98,7 @@ libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
 
 [dependencies]
 libcrux-hacl-rs.workspace = true
-libcrux-chacha20poly1305 = { version = "=0.0.3", path = "chacha20poly1305" }
+libcrux-chacha20poly1305.workspace = true
 libcrux-ml-kem.workspace = true
 libcrux-traits.workspace = true
 libcrux-hacl.workspace = true
diff --git a/aesgcm/README.md b/aesgcm/README.md
new file mode 100644
index 000000000..0d708942f
--- /dev/null
+++ b/aesgcm/README.md
@@ -0,0 +1,12 @@
+# AES-GCM
+
+![pre-verification]
+
+This crate implements AES-GCM 128 and 256
+
+It provides 
+- a portable, bit-sliced implementation
+- an x64 optimised implementation using AES-NI
+- an Aarch64 optimised implementation using the AES instructions
+
+[pre-verification]: https://img.shields.io/badge/pre_verification-orange.svg?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48IS0tIFVwbG9hZGVkIHRvOiBTVkcgUmVwbywgd3d3LnN2Z3JlcG8uY29tLCBHZW5lcmF0b3I6IFNWRyBSZXBvIE1peGVyIFRvb2xzIC0tPg0KPHN2ZyB3aWR0aD0iODAwcHgiIGhlaWdodD0iODAwcHgiIHZpZXdCb3g9IjAgMCAyNCAyNCIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4NCjxwYXRoIGQ9Ik05IDEySDE1TTIwIDEyQzIwIDE2LjQ2MTEgMTQuNTQgMTkuNjkzNyAxMi42NDE0IDIwLjY4M0MxMi40MzYxIDIwLjc5IDEyLjMzMzQgMjAuODQzNSAxMi4xOTEgMjAuODcxMkMxMi4wOCAyMC44OTI4IDExLjkyIDIwLjg5MjggMTEuODA5IDIwLjg3MTJDMTEuNjY2NiAyMC44NDM1IDExLjU2MzkgMjAuNzkgMTEuMzU4NiAyMC42ODNDOS40NTk5NiAxOS42OTM3IDQgMTYuNDYxMSA0IDEyVjguMjE3NTlDNCA3LjQxODA4IDQgNy4wMTgzMyA0LjEzMDc2IDYuNjc0N0M0LjI0NjI3IDYuMzcxMTMgNC40MzM5OCA2LjEwMDI3IDQuNjc3NjYgNS44ODU1MkM0Ljk1MzUgNS42NDI0MyA1LjMyNzggNS41MDIwNyA2LjA3NjQgNS4yMjEzNEwxMS40MzgyIDMuMjEwNjdDMTEuNjQ2MSAzLjEzMjcxIDExLjc1IDMuMDkzNzMgMTEuODU3IDMuMDc4MjdDMTEuOTUxOCAzLjA2NDU3IDEyLjA0ODIgMy4wNjQ1NyAxMi4xNDMgMy4wNzgyN0MxMi4yNSAzLjA5MzczIDEyLjM1MzkgMy4xMzI3MSAxMi41NjE4IDMuMjEwNjdMMTcuOTIzNiA1LjIyMTM0QzE4LjY3MjIgNS41MDIwNyAxOS4wNDY1IDUuNjQyNDMgMTkuMzIyMyA1Ljg4NTUyQzE5LjU2NiA2LjEwMDI3IDE5Ljc1MzcgNi4zNzExMyAxOS44NjkyIDYuNjc0N0MyMCA3LjAxODMzIDIwIDcuNDE4MDggMjAgOC4yMTc1OVYxMloiIHN0cm9rZT0iIzAwMDAwMCIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4NCjwvc3ZnPg==
diff --git a/aesgcm/build.rs b/aesgcm/build.rs
index 91a6fae70..abc7a8b78 100644
--- a/aesgcm/build.rs
+++ b/aesgcm/build.rs
@@ -14,8 +14,8 @@ fn main() {
         // We enable simd128 on all aarch64 builds.
         println!("cargo:rustc-cfg=feature=\"simd128\"");
     }
-    let simd126_possible = target_arch == "x86_64";
-    if (simd126_possible || enable_simd256) && !disable_simd256 {
+    let simd256_possible = target_arch == "x86_64";
+    if (simd256_possible || enable_simd256) && !disable_simd256 {
         // We enable simd256 on all x86_64 builds.
         // Note that this doesn't mean the required CPU features are available.
         // But the compiler will support them and the runtime checks ensure that
diff --git a/aesgcm/src/aes.rs b/aesgcm/src/aes.rs
index cf688ea59..121a63200 100644
--- a/aesgcm/src/aes.rs
+++ b/aesgcm/src/aes.rs
@@ -4,11 +4,6 @@ use crate::platform::*;
 
 pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
 
-// This is inlined into the key expansion below.
-// const RCON: [u8; 11] = [
-//     0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
-// ];
-
 /// AES block size
 pub(crate) const AES_BLOCK_LEN: usize = 16;
 
diff --git a/aesgcm/src/ctr.rs b/aesgcm/src/ctr.rs
index 2d732616c..ea8fb4bb4 100644
--- a/aesgcm/src/ctr.rs
+++ b/aesgcm/src/ctr.rs
@@ -66,7 +66,10 @@ impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
     #[inline]
     fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
         debug_assert!(input.len() == out.len() && input.len().is_multiple_of(AES_BLOCK_LEN));
-        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
+        // If input.len() / AES_BLOCK_LEN == u32::MAX - 1 and we start with
+        // ctr == 2 then we'll wrap to 0 below and we'll repeat the initial key
+        // block
+        debug_assert!(input.len() / AES_BLOCK_LEN < (u32::MAX - 1) as usize);
 
         let blocks = input.len() / AES_BLOCK_LEN;
         for i in 0..blocks {
diff --git a/aesgcm/src/gf128.rs b/aesgcm/src/gf128.rs
index b17064ecb..fe2c4d0cc 100644
--- a/aesgcm/src/gf128.rs
+++ b/aesgcm/src/gf128.rs
@@ -13,7 +13,7 @@ pub(crate) struct GF128State<T: GF128FieldElement> {
     r: T,
 }
 
-const KEY_LEN: usize = 16;
+const KEY_LEN: usize = AES_BLOCK_LEN;
 
 impl<T: GF128FieldElement> GF128State<T> {
     #[inline]
diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index cf2647841..71843e511 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -41,9 +41,9 @@ pub struct AesGcm128 {}
 pub struct PortableAesGcm128 {}
 
 /// Neon AES-GCM 128.
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 pub struct NeonAesGcm128 {}
-#[cfg(not(all(target_arch = "aarch64", target_feature = "aes")))]
+#[cfg(not(feature = "simd128"))]
 pub type NeonAesGcm128 = PortableAesGcm128;
 
 /// AES-NI AES-GCM 128.
@@ -246,8 +246,8 @@ pub mod x64 {
         };
     }
 
-    x64_pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::intel_ni::State, platform::intel_ni::FieldElement>);
-    x64_pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::intel_ni::State, platform::intel_ni::FieldElement>);
+    x64_pub_mod!(r"AES-GCM 128 ", aes_gcm_128, crate::aes_gcm_128::State<platform::x64::State, platform::x64::FieldElement>);
+    x64_pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::x64::State, platform::x64::FieldElement>);
 }
 
 /// Macro to implement the different structs and multiplexing.
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
index 9fba0d282..9c622c81a 100644
--- a/aesgcm/src/platform.rs
+++ b/aesgcm/src/platform.rs
@@ -6,7 +6,7 @@ pub(crate) mod portable;
 pub(crate) mod neon;
 
 #[cfg(feature = "simd256")]
-pub(crate) mod intel_ni;
+pub(crate) mod x64;
 
 /// The AES state.
 pub(crate) trait AESState: Clone + core::fmt::Debug {
diff --git a/aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/intel_ni/aes_core.rs
index 4ff6dbb2b..3ab931900 100644
--- a/aesgcm/src/platform/intel_ni/aes_core.rs
+++ b/aesgcm/src/platform/intel_ni/aes_core.rs
@@ -96,7 +96,6 @@ impl crate::platform::AESState for State {
     #[inline]
     fn aes_enc(&mut self, key: &Self) {
         aes_enc(self, key);
-        (self, key);
     }
 
     #[inline]
diff --git a/aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
index 709d6e38b..674b620c9 100644
--- a/aesgcm/src/platform/neon/aes_core.rs
+++ b/aesgcm/src/platform/neon/aes_core.rs
@@ -8,31 +8,26 @@ use libcrux_intrinsics::arm64::{
 pub(crate) type State = _uint8x16_t;
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn new_state() -> State {
     _vdupq_n_u8(0)
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn xor_key1_state(st: &mut State, k: &State) {
     *st = _veorq_u8(*st, *k);
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn aes_enc(st: &mut State, key: &State) {
     *st = _veorq_u8(_vaesmcq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0))), *key);
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn aes_enc_last(st: &mut State, key: &State) {
     *st = _veorq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0)), *key)
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
     let st = _vaeseq_u8(*prev, _vdupq_n_u8(0));
     let mut tmp = [0u8; 16];
@@ -48,21 +43,18 @@ fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
     aes_keygen_assist(next, prev, rcon);
     *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<3>(_vreinterpretq_u32_u8(*next)))
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn aes_keygen_assist1(next: &mut State, prev: &State) {
     aes_keygen_assist(next, prev, 0);
     *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<2>(_vreinterpretq_u32_u8(*next)));
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 fn key_expansion_step(next: &mut State, prev: &State) {
     let zero = _vdupq_n_u32(0);
     let prev0 = _vreinterpretq_u32_u8(*prev);
@@ -74,9 +66,8 @@ fn key_expansion_step(next: &mut State, prev: &State) {
 
 impl crate::platform::AESState for State {
     #[inline]
-    #[allow(unsafe_code)]
     fn new() -> Self {
-        unsafe { new_state() }
+        new_state()
     }
 
     #[inline]
@@ -92,7 +83,6 @@ impl crate::platform::AESState for State {
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn xor_block(&self, input: &[u8], out: &mut [u8]) {
         debug_assert!(input.len() == out.len() && input.len() <= 16);
         // XXX: hot-fix to have enough input and output here.
@@ -102,45 +92,39 @@ impl crate::platform::AESState for State {
         block_in[0..input.len()].copy_from_slice(input);
 
         let inp_vec = _vld1q_u8(&block_in);
-        let out_vec = unsafe { _veorq_u8(inp_vec, *self) };
+        let out_vec = _veorq_u8(inp_vec, *self);
         _vst1q_u8(&mut block_out, out_vec);
 
         out.copy_from_slice(&block_out[0..out.len()]);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn xor_key(&mut self, key: &Self) {
-        unsafe { xor_key1_state(self, key) };
+        xor_key1_state(self, key);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn aes_enc(&mut self, key: &Self) {
-        unsafe { aes_enc(self, key) };
+        aes_enc(self, key);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn aes_enc_last(&mut self, key: &Self) {
-        unsafe { aes_enc_last(self, key) };
+        aes_enc_last(self, key);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
-        unsafe { aes_keygen_assist0(self, prev, RCON as u8) };
+        aes_keygen_assist0(self, prev, RCON as u8);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn aes_keygen_assist1(&mut self, prev: &Self) {
-        unsafe { aes_keygen_assist1(self, prev) };
+        aes_keygen_assist1(self, prev);
     }
 
     #[inline]
-    #[allow(unsafe_code)]
     fn key_expansion_step(&mut self, prev: &Self) {
-        unsafe { key_expansion_step(self, prev) }
+        key_expansion_step(self, prev);
     }
 }
diff --git a/aesgcm/src/platform/intel_ni.rs b/aesgcm/src/platform/x64.rs
similarity index 100%
rename from aesgcm/src/platform/intel_ni.rs
rename to aesgcm/src/platform/x64.rs
diff --git a/libcrux-intrinsics/src/arm64.rs b/libcrux-intrinsics/src/arm64.rs
index 5cd5b913c..94d8abccd 100644
--- a/libcrux-intrinsics/src/arm64.rs
+++ b/libcrux-intrinsics/src/arm64.rs
@@ -172,16 +172,14 @@ pub fn _vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
     unsafe { vreinterpretq_u32_s32(a) }
 }
 
-#[inline]
-#[target_feature(enable = "neon")]
+#[inline(always)]
 pub fn _vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
-    vreinterpretq_u32_u8(a)
+    unsafe { vreinterpretq_u32_u8(a) }
 }
 
-#[inline]
-#[target_feature(enable = "neon")]
+#[inline(always)]
 pub fn _vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t {
-    vreinterpretq_u8_u32(a)
+    unsafe { vreinterpretq_u8_u32(a) }
 }
 
 #[inline(always)]
@@ -284,7 +282,7 @@ pub fn _vld1q_u8(ptr: &[u8]) -> uint8x16_t {
     unsafe { vld1q_u8(ptr.as_ptr()) }
 }
 
-#[inline]
+#[inline(always)]
 pub fn _vld1q_u32(ptr: &[u32]) -> uint32x4_t {
     unsafe { vld1q_u32(ptr.as_ptr()) }
 }
@@ -395,15 +393,13 @@ pub fn _vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 pub fn _veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    veorq_u32(a, b)
+    unsafe { veorq_u32(a, b) }
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 pub fn _vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    vextq_u32(a, b, N)
+    unsafe { vextq_u32(a, b, N) }
 }
 
 #[inline(always)]
@@ -470,9 +466,8 @@ pub fn _vmull_p64(a: u64, b: u64) -> u128 {
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 pub fn _veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    veorq_u8(a, b)
+    unsafe { veorq_u8(a, b) }
 }
 
 #[inline]
@@ -486,13 +481,11 @@ pub fn _vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 pub fn _vdupq_n_u8(value: u8) -> uint8x16_t {
-    vdupq_n_u8(value)
+    unsafe { vdupq_n_u8(value) }
 }
 
 #[inline]
-#[target_feature(enable = "neon")]
 pub fn _vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
-    vdupq_laneq_u32(a, N)
+    unsafe { vdupq_laneq_u32(a, N) }
 }

From af194e989c52b75fbd5ddb62c9047c7674c9a02d Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 08:52:10 +0200
Subject: [PATCH 32/43] fix x64; add ci

---
 .github/workflows/aes.yml                     | 169 ++++++++++++++++++
 .../platform/{intel_ni => x64}/aes_core.rs    |   0
 .../platform/{intel_ni => x64}/gf128_core.rs  |   0
 3 files changed, 169 insertions(+)
 create mode 100644 .github/workflows/aes.yml
 rename aesgcm/src/platform/{intel_ni => x64}/aes_core.rs (100%)
 rename aesgcm/src/platform/{intel_ni => x64}/gf128_core.rs (100%)

diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml
new file mode 100644
index 000000000..e4635625c
--- /dev/null
+++ b/.github/workflows/aes.yml
@@ -0,0 +1,169 @@
+name: AES-GCM
+
+on:
+  merge_group:
+  pull_request:
+    branches: ["main", "dev", "*"]
+    paths:
+      - "aesgcm/**"
+  workflow_dispatch:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        bits: [32, 64]
+        os:
+          - macos-latest # macos-15 on apple silicon
+          - ubuntu-latest
+          - windows-latest
+        exclude:
+          - bits: 32
+            os: "macos-latest"
+
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+      - uses: taiki-e/install-action@cargo-hack
+
+      - name: Update dependencies
+        run: cargo update
+
+      - run: echo "RUST_TARGET_FLAG=" > $GITHUB_ENV
+        if: ${{ matrix.bits == 64 }}
+
+      - run: echo 'EXCLUDE_FEATURES=--exclude-features simd256' > $GITHUB_ENV
+        if: ${{ matrix.os == 'macos-latest' }}
+
+      - run: echo 'EXCLUDE_FEATURES=--exclude-features simd128' > $GITHUB_ENV
+        if: ${{ matrix.os != 'macos-latest' }}
+
+      - name: 🛠️ Setup Rust Nightly
+        run: rustup toolchain install nightly
+
+      - name: 🛠️ Setup Ubuntu x86
+        if: ${{ matrix.bits == 32 &&  matrix.os == 'ubuntu-latest' }}
+        run: |
+          rustup target add i686-unknown-linux-gnu
+          sudo apt-get update
+          sudo apt-get install -y gcc-multilib g++-multilib
+
+      # Set up 32 bit systems
+
+      - name: 🛠️ Config Windows x86
+        run: echo "RUST_TARGET_FLAG=--target=i686-pc-windows-msvc" > $GITHUB_ENV
+        if: ${{ matrix.bits == 32 && matrix.os == 'windows-latest' }}
+
+      - name: 🛠️ Config Linux x86
+        run: |
+          echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu" > $GITHUB_ENV
+        if: ${{ matrix.bits == 32 && matrix.os == 'ubuntu-latest' }}
+
+      # Build ...
+
+      - name: 🔨 Build
+        run: |
+          rustc --print=cfg
+          cargo build --verbose $RUST_TARGET_FLAG
+
+      - name: 🔨 Build Release
+        run: cargo build --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻 Asan MacOS
+        if: ${{ matrix.os == 'macos-latest' }}
+        run: RUSTDOCFLAGS=-Zsanitizer=address RUSTFLAGS=-Zsanitizer=address cargo +nightly test --release --target aarch64-apple-darwin
+
+      # Test ...
+
+      - name: 🏃🏻‍♀️ Test
+        run: |
+          cargo clean
+          cargo test --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Release
+        run: |
+          cargo clean
+          cargo test --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Portable
+        run: |
+          cargo clean
+          LIBCRUX_DISABLE_SIMD128=1 LIBCRUX_DISABLE_SIMD256=1 cargo test --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Portable Release
+        run: |
+          cargo clean
+          LIBCRUX_DISABLE_SIMD128=1 LIBCRUX_DISABLE_SIMD256=1 cargo test --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Kyber
+        run: |
+          cargo clean
+          cargo test ,kyber --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Cargo Test Features
+        if: ${{ matrix.bits == 64 }}
+        run: |
+          cargo clean
+          cargo hack test --each-feature $EXCLUDE_FEATURES --verbose $RUST_TARGET_FLAG
+
+  build-intel-macos:
+    runs-on: macos-13
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Update dependencies
+        run: cargo update
+
+      - name: 🔨 Build
+        run: |
+          rustc --print=cfg
+          cargo build --verbose
+
+  fuzz:
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - macos-latest # macos-15
+          - ubuntu-latest
+
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: 🛠️ Setup Rust Nightly
+        run: |
+          rustup toolchain install nightly
+          cargo install cargo-fuzz
+
+      - name: 🛠️ Update dependencies
+        run: cargo update
+
+      - name: 🏃🏻‍♀️ Encrypt256
+        run: CARGO_PROFILE_RELEASE_LTO=false cargo +nightly fuzz run encrypt128 -- -runs=100000
+
+      - name: 🏃🏻‍♀️ Encrypt256
+        run: CARGO_PROFILE_RELEASE_LTO=false cargo +nightly fuzz run encrypt256 -- -runs=100000
diff --git a/aesgcm/src/platform/intel_ni/aes_core.rs b/aesgcm/src/platform/x64/aes_core.rs
similarity index 100%
rename from aesgcm/src/platform/intel_ni/aes_core.rs
rename to aesgcm/src/platform/x64/aes_core.rs
diff --git a/aesgcm/src/platform/intel_ni/gf128_core.rs b/aesgcm/src/platform/x64/gf128_core.rs
similarity index 100%
rename from aesgcm/src/platform/intel_ni/gf128_core.rs
rename to aesgcm/src/platform/x64/gf128_core.rs

From b123be0abecd7dd0ac2a81a4322e23d070e40157 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 08:53:13 +0200
Subject: [PATCH 33/43] rustfmt

---
 aesgcm/src/ctr/aes256_ctr.rs               |  2 +-
 aesgcm/src/ctr/test128.rs                  | 20 +++++++++++++++-----
 aesgcm/src/gf128/test.rs                   | 12 +++++++++---
 aesgcm/src/platform/portable/gf128_core.rs |  1 -
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/aesgcm/src/ctr/aes256_ctr.rs b/aesgcm/src/ctr/aes256_ctr.rs
index ae1e3496c..7009b138f 100644
--- a/aesgcm/src/ctr/aes256_ctr.rs
+++ b/aesgcm/src/ctr/aes256_ctr.rs
@@ -3,7 +3,7 @@
 use core::array::from_fn;
 
 use super::AesCtrContext;
-use crate::{aes_gcm_256::KEY_LEN, aes::*, platform::AESState, NONCE_LEN};
+use crate::{aes::*, aes_gcm_256::KEY_LEN, platform::AESState, NONCE_LEN};
 
 pub(crate) const NUM_KEYS: usize = 15;
 
diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
index 8d0380e17..6de3bcb19 100644
--- a/aesgcm/src/ctr/test128.rs
+++ b/aesgcm/src/ctr/test128.rs
@@ -55,7 +55,9 @@ fn test_ctr_block() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -74,7 +76,9 @@ fn test_ctr_block_neon() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -90,7 +94,9 @@ fn test_ctr_encrypt() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -107,7 +113,9 @@ fn test_ctr_encrypt_neon() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -123,7 +131,9 @@ fn test_ctr_encrypt_intel() {
         if computed[i] != EXPECTED[i] {
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
index 2896908b0..52faf18c4 100644
--- a/aesgcm/src/gf128/test.rs
+++ b/aesgcm/src/gf128/test.rs
@@ -38,7 +38,9 @@ fn test_gf128() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -55,7 +57,9 @@ fn test_gf128_neon() {
             #[cfg(feature = "std")]
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
@@ -71,7 +75,9 @@ fn test_gf128_intel() {
         if computed[i] != EXPECTED[i] {
             std::eprintln!(
                 "mismatch at {}: expected is {}, computed is {}",
-                i, EXPECTED[i], computed[i]
+                i,
+                EXPECTED[i],
+                computed[i]
             );
             assert!(false);
         }
diff --git a/aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs
index ce680d36d..d2523efa9 100644
--- a/aesgcm/src/platform/portable/gf128_core.rs
+++ b/aesgcm/src/platform/portable/gf128_core.rs
@@ -1,4 +1,3 @@
-
 /// A portable gf128 field element.
 pub(crate) type FieldElement = u128;
 

From a6c25e378c842ac734413929333ba933634d4c3e Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 08:56:20 +0200
Subject: [PATCH 34/43] fixup x64 again

---
 aesgcm/src/ctr/test128.rs | 2 +-
 aesgcm/src/gf128/test.rs  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
index 6de3bcb19..141b821ba 100644
--- a/aesgcm/src/ctr/test128.rs
+++ b/aesgcm/src/ctr/test128.rs
@@ -126,7 +126,7 @@ fn test_ctr_encrypt_neon() {
 #[test]
 fn test_ctr_encrypt_intel() {
     let mut computed: [u8; 32] = [0u8; 32];
-    aes128_ctr_encrypt::<platform::intel_ni::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    aes128_ctr_encrypt::<platform::x64::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
     for i in 0..32 {
         if computed[i] != EXPECTED[i] {
             std::eprintln!(
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
index 52faf18c4..d137f71ac 100644
--- a/aesgcm/src/gf128/test.rs
+++ b/aesgcm/src/gf128/test.rs
@@ -70,7 +70,7 @@ fn test_gf128_neon() {
 #[test]
 fn test_gf128_intel() {
     let mut computed: [u8; 16] = [0u8; 16];
-    gf128::<crate::platform::intel_ni::FieldElement>(&KEY, &INPUT, &mut computed);
+    gf128::<crate::platform::x64::FieldElement>(&KEY, &INPUT, &mut computed);
     for i in 0..16 {
         if computed[i] != EXPECTED[i] {
             std::eprintln!(

From 1b462cf4571c5155a0f0a8777c8045b7a6dd580d Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 09:40:10 +0200
Subject: [PATCH 35/43] x64 fixup

---
 aesgcm/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 71843e511..d0b19bf7e 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -47,9 +47,9 @@ pub struct NeonAesGcm128 {}
 pub type NeonAesGcm128 = PortableAesGcm128;
 
 /// AES-NI AES-GCM 128.
-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "simd256")]
 pub struct X64AesGcm128 {}
-#[cfg(not(target_arch = "x86_64"))]
+#[cfg(not(feature = "simd256"))]
 pub type X64AesGcm128 = PortableAesGcm128;
 
 /// AES-GCM 256.

From a882d9098e61f3b0cd5aa2331d85b76c8f1f1070 Mon Sep 17 00:00:00 2001
From: Franziskus Kiefer <franziskuskiefer@gmail.com>
Date: Wed, 24 Sep 2025 09:51:15 +0200
Subject: [PATCH 36/43] aarch64 test fixup

---
 aesgcm/src/ctr/test128.rs | 4 ++--
 aesgcm/src/gf128/test.rs  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
index 141b821ba..7a2c8ba2e 100644
--- a/aesgcm/src/ctr/test128.rs
+++ b/aesgcm/src/ctr/test128.rs
@@ -64,7 +64,7 @@ fn test_ctr_block() {
     }
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 #[test]
 fn test_ctr_block_neon() {
     let mut computed: [u8; 32] = [0u8; 32];
@@ -103,7 +103,7 @@ fn test_ctr_encrypt() {
     }
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 #[test]
 fn test_ctr_encrypt_neon() {
     let mut computed: [u8; 32] = [0u8; 32];
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
index d137f71ac..a44b8b425 100644
--- a/aesgcm/src/gf128/test.rs
+++ b/aesgcm/src/gf128/test.rs
@@ -47,7 +47,7 @@ fn test_gf128() {
     }
 }
 
-#[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+#[cfg(feature = "simd128")]
 #[test]
 fn test_gf128_neon() {
     let mut computed: [u8; 16] = [0u8; 16];

From 1c9818609597af0c983abf8df1a25acebae4969c Mon Sep 17 00:00:00 2001
From: Jonas Schneider-Bensch <jonas@cryspen.com>
Date: Wed, 24 Sep 2025 11:03:21 +0200
Subject: [PATCH 37/43] Extend core models by AES & related intrinsics

---
 .../core-models/src/core_arch/x86.rs          | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/fstar-helpers/core-models/src/core_arch/x86.rs b/fstar-helpers/core-models/src/core_arch/x86.rs
index aa4740856..f2f729e0d 100644
--- a/fstar-helpers/core-models/src/core_arch/x86.rs
+++ b/fstar-helpers/core-models/src/core_arch/x86.rs
@@ -260,6 +260,48 @@ pub mod sse2 {
     pub fn _mm_movemask_epi8(_: __m128i) -> i32 {
         unimplemented!()
     }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
+    #[hax_lib::opaque]
+    pub fn _mm_unpacklo_epi64(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
+    #[hax_lib::opaque]
+    pub fn _mm_unpackhi_epi64(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+    #[hax_lib::opaque]
+    pub fn _mm_shuffle_epi32<const IMM8: i32>(_: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_srli_si128<const IMM8: i32>(_: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_slli_si128<const IMM8: i32>(_: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_xor_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_setzero_si128() -> __m128i {
+        unimplemented!()
+    }
 }
 
 pub use avx::*;
@@ -696,6 +738,34 @@ pub mod avx2 {
     }
 }
 
+pub use other::*;
+pub mod other {
+    use super::*;
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aeskeygenassist_si128(_: __m128i, _: i32) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aesenclast_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aesenc_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_clmulepi64_si128(_: __m128i, _: __m128i, _: i32) -> __m128i {
+        unimplemented!()
+    }
+}
 /// Rewrite lemmas
 const _: () = {
     #[hax_lib::fstar::before("[@@ $REWRITE_RULE ]")]

From 0004188941ac3c69ecffface4ffd26ef8a527040 Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 13:21:33 +0200
Subject: [PATCH 38/43] implement `libcrux_traits` `typed_owned` and
 `typed_refs` `Aead` traits

---
 aesgcm/src/lib.rs | 62 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index d0b19bf7e..7a473819d 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -13,6 +13,9 @@ mod aes_gcm;
 mod aes_gcm_128;
 mod aes_gcm_256;
 
+use libcrux_traits::aead::{arrayref, consts, typed_owned};
+
+// TODO: should this trait be re-exported here?
 pub use libcrux_traits::aead::arrayref::Aead;
 
 /// Trait for an AES State.
@@ -35,40 +38,48 @@ pub(crate) trait State {
 pub struct DecryptError();
 
 /// AES-GCM 128.
-pub struct AesGcm128 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct AesGcm128;
 
 /// Portable AES-GCM 128.
-pub struct PortableAesGcm128 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct PortableAesGcm128;
 
 /// Neon AES-GCM 128.
 #[cfg(feature = "simd128")]
-pub struct NeonAesGcm128 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct NeonAesGcm128;
 #[cfg(not(feature = "simd128"))]
 pub type NeonAesGcm128 = PortableAesGcm128;
 
 /// AES-NI AES-GCM 128.
 #[cfg(feature = "simd256")]
-pub struct X64AesGcm128 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct X64AesGcm128;
 #[cfg(not(feature = "simd256"))]
 pub type X64AesGcm128 = PortableAesGcm128;
 
 /// AES-GCM 256.
-pub struct AesGcm256 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct AesGcm256;
 
 /// Portable AES-GCM 256.
-pub struct PortableAesGcm256 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct PortableAesGcm256;
 
 /// Neon AES-GCM 256.
 #[cfg(feature = "simd128")]
-pub struct NeonAesGcm256 {}
+#[derive(Clone, Copy, PartialEq)]
+pub struct NeonAesGcm256;
 
 /// Neon AES-GCM 256.
 #[cfg(not(feature = "simd128"))]
 pub type NeonAesGcm256 = PortableAesGcm256;
 
 /// AES-NI AES-GCM 256.
+#[derive(Clone, Copy, PartialEq)]
 #[cfg(feature = "simd256")]
-pub struct X64AesGcm256 {}
+pub struct X64AesGcm256;
 
 /// AES-NI AES-GCM 256.
 #[cfg(not(feature = "simd256"))]
@@ -250,6 +261,23 @@ pub mod x64 {
     x64_pub_mod!(r"AES-GCM 256 ", aes_gcm_256, crate::aes_gcm_256::State<platform::x64::State, platform::x64::FieldElement>);
 }
 
+/// Macro to implement the libcrux_traits public API traits
+///
+/// For the blanket impl of `typed_refs::Aead` to take place,
+/// the `$type` must implement `Copy` and `PartialEq`.
+macro_rules! impl_traits_public_api {
+    ($type:ty, $keylen:expr, $taglen:expr, $noncelen:expr ) => {
+        // prerequisite for typed_owned::Aead
+        impl consts::AeadConsts for $type {
+            const KEY_LEN: usize = KEY_LEN;
+            const TAG_LEN: usize = TAG_LEN;
+            const NONCE_LEN: usize = NONCE_LEN;
+        }
+        // implement typed_owned::Aead
+        typed_owned::impl_aead_typed_owned!($type, KEY_LEN, TAG_LEN, NONCE_LEN);
+    };
+}
+
 /// Macro to implement the different structs and multiplexing.
 macro_rules! api {
     ($mod_name:ident, $variant:ident, $multiplexing:ty, $portable:ident, $neon:ident, $x64:ident) => {
@@ -262,7 +290,9 @@ macro_rules! api {
             pub type Tag = [u8; TAG_LEN];
             pub type Nonce = [u8; NONCE_LEN];
 
-            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
+            impl_traits_public_api!($multiplexing, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
                 fn encrypt(
                     ciphertext: &mut [u8],
                     tag: &mut Tag,
@@ -306,7 +336,9 @@ macro_rules! api {
                 }
             }
 
-            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
+            impl_traits_public_api!($portable, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
                 fn encrypt(
                     ciphertext: &mut [u8],
                     tag: &mut Tag,
@@ -333,7 +365,10 @@ macro_rules! api {
             }
 
             #[cfg(feature = "simd128")]
-            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
+            impl_traits_public_api!($neon, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            #[cfg(feature = "simd128")]
+            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
                 fn encrypt(
                     ciphertext: &mut [u8],
                     tag: &mut Tag,
@@ -360,7 +395,10 @@ macro_rules! api {
             }
 
             #[cfg(feature = "simd256")]
-            impl Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $x64 {
+            impl_traits_public_api!($x64, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            #[cfg(feature = "simd256")]
+            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $x64 {
                 fn encrypt(
                     ciphertext: &mut [u8],
                     tag: &mut Tag,

From 87156abd5f9ba2b381cc59c2d6fb73025c87b1a1 Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 13:37:12 +0200
Subject: [PATCH 39/43] add tests

---
 aesgcm/src/lib.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 7a473819d..8f06f26bf 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -444,3 +444,57 @@ api!(
     NeonAesGcm256,
     X64AesGcm256
 );
+
+#[cfg(test)]
+mod tests {
+    use libcrux_traits::aead::consts;
+    use libcrux_traits::aead::typed_owned;
+    use libcrux_traits::aead::typed_refs;
+
+    type Key = typed_owned::Key<super::AesGcm128>;
+    type Nonce = typed_owned::Nonce<super::AesGcm128>;
+    type Tag = typed_owned::Tag<super::AesGcm128>;
+
+    #[test]
+    fn test_key_centric_owned() {
+        use consts::AeadConsts as _;
+
+        use super::AesGcm128;
+
+        let k: Key = [0; AesGcm128::KEY_LEN].into();
+        let nonce: Nonce = [0; AesGcm128::NONCE_LEN].into();
+        let mut tag: Tag = [0; AesGcm128::TAG_LEN].into();
+
+        let pt = b"the quick brown fox jumps over the lazy dog";
+        let mut ct = [0; 43];
+        let mut pt_out = [0; 43];
+
+        k.encrypt(&mut ct, &mut tag, &nonce, b"", pt).unwrap();
+        k.decrypt(&mut pt_out, &nonce, b"", &ct, &tag).unwrap();
+        assert_eq!(pt, &pt_out);
+    }
+
+    #[test]
+    fn test_key_centric_refs() {
+        use consts::AeadConsts as _;
+        use typed_refs::Aead as _;
+
+        use super::AesGcm128;
+
+        let algo = AesGcm128;
+
+        let mut tag_bytes = [0; AesGcm128::TAG_LEN];
+        let key = algo.new_key(&[0; AesGcm128::KEY_LEN]).unwrap();
+        let tag = algo.new_tag_mut(&mut tag_bytes).unwrap();
+        let nonce = algo.new_nonce(&[0; AesGcm128::NONCE_LEN]).unwrap();
+
+        let pt = b"the quick brown fox jumps over the lazy dog";
+        let mut ct = [0; 43];
+        let mut pt_out = [0; 43];
+
+        key.encrypt(&mut ct, tag, nonce, b"", pt).unwrap();
+        let tag = algo.new_tag(&tag_bytes).unwrap();
+        key.decrypt(&mut pt_out, nonce, b"", &ct, tag).unwrap();
+        assert_eq!(pt, &pt_out);
+    }
+}

From c5399503af2240fbd70f1af3b3fc1d342f71271c Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 13:41:13 +0200
Subject: [PATCH 40/43] implement slice trait

---
 aesgcm/src/lib.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 8f06f26bf..e8c81a49a 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -13,7 +13,7 @@ mod aes_gcm;
 mod aes_gcm_128;
 mod aes_gcm_256;
 
-use libcrux_traits::aead::{arrayref, consts, typed_owned};
+use libcrux_traits::aead::{arrayref, consts, slice, typed_owned};
 
 // TODO: should this trait be re-exported here?
 pub use libcrux_traits::aead::arrayref::Aead;
@@ -290,6 +290,10 @@ macro_rules! api {
             pub type Tag = [u8; TAG_LEN];
             pub type Nonce = [u8; NONCE_LEN];
 
+            // implement `libcrux_traits` slice trait
+            slice::impl_aead_slice_trait!($multiplexing => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            // implement `libcrux_traits` public API traits
             impl_traits_public_api!($multiplexing, KEY_LEN, TAG_LEN, NONCE_LEN);
 
             impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
@@ -336,6 +340,10 @@ macro_rules! api {
                 }
             }
 
+            // implement `libcrux_traits` slice trait
+            slice::impl_aead_slice_trait!($portable => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            // implement `libcrux_traits` public API traits
             impl_traits_public_api!($portable, KEY_LEN, TAG_LEN, NONCE_LEN);
 
             impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
@@ -364,6 +372,11 @@ macro_rules! api {
                 }
             }
 
+            // implement `libcrux_traits` slice trait
+            #[cfg(feature = "simd128")]
+            slice::impl_aead_slice_trait!($neon => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            // implement `libcrux_traits` public API traits
             #[cfg(feature = "simd128")]
             impl_traits_public_api!($neon, KEY_LEN, TAG_LEN, NONCE_LEN);
 
@@ -394,6 +407,11 @@ macro_rules! api {
                 }
             }
 
+            // implement `libcrux_traits` slice trait
+            #[cfg(feature = "simd256")]
+            slice::impl_aead_slice_trait!($x64 => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+            // implement `libcrux_traits` public API traits
             #[cfg(feature = "simd256")]
             impl_traits_public_api!($x64, KEY_LEN, TAG_LEN, NONCE_LEN);
 

From 9456055c86fb0c3d2b3edf3ffc243422e56283fb Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 15:42:08 +0200
Subject: [PATCH 41/43] derive `Eq` for algorithm structs

---
 aesgcm/src/lib.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index e8c81a49a..702737edc 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -38,38 +38,38 @@ pub(crate) trait State {
 pub struct DecryptError();
 
 /// AES-GCM 128.
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct AesGcm128;
 
 /// Portable AES-GCM 128.
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct PortableAesGcm128;
 
 /// Neon AES-GCM 128.
 #[cfg(feature = "simd128")]
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct NeonAesGcm128;
 #[cfg(not(feature = "simd128"))]
 pub type NeonAesGcm128 = PortableAesGcm128;
 
 /// AES-NI AES-GCM 128.
 #[cfg(feature = "simd256")]
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct X64AesGcm128;
 #[cfg(not(feature = "simd256"))]
 pub type X64AesGcm128 = PortableAesGcm128;
 
 /// AES-GCM 256.
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct AesGcm256;
 
 /// Portable AES-GCM 256.
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct PortableAesGcm256;
 
 /// Neon AES-GCM 256.
 #[cfg(feature = "simd128")]
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 pub struct NeonAesGcm256;
 
 /// Neon AES-GCM 256.
@@ -77,7 +77,7 @@ pub struct NeonAesGcm256;
 pub type NeonAesGcm256 = PortableAesGcm256;
 
 /// AES-NI AES-GCM 256.
-#[derive(Clone, Copy, PartialEq)]
+#[derive(Clone, Copy, PartialEq, Eq)]
 #[cfg(feature = "simd256")]
 pub struct X64AesGcm256;
 

From de0f1d1084699008b2071f8040b9b5d2622f3ac5 Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 15:52:13 +0200
Subject: [PATCH 42/43] move tests

---
 aesgcm/src/lib.rs           | 54 -------------------------------------
 aesgcm/tests/key_centric.rs | 48 +++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 54 deletions(-)
 create mode 100644 aesgcm/tests/key_centric.rs

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 702737edc..6c4a8e66e 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -462,57 +462,3 @@ api!(
     NeonAesGcm256,
     X64AesGcm256
 );
-
-#[cfg(test)]
-mod tests {
-    use libcrux_traits::aead::consts;
-    use libcrux_traits::aead::typed_owned;
-    use libcrux_traits::aead::typed_refs;
-
-    type Key = typed_owned::Key<super::AesGcm128>;
-    type Nonce = typed_owned::Nonce<super::AesGcm128>;
-    type Tag = typed_owned::Tag<super::AesGcm128>;
-
-    #[test]
-    fn test_key_centric_owned() {
-        use consts::AeadConsts as _;
-
-        use super::AesGcm128;
-
-        let k: Key = [0; AesGcm128::KEY_LEN].into();
-        let nonce: Nonce = [0; AesGcm128::NONCE_LEN].into();
-        let mut tag: Tag = [0; AesGcm128::TAG_LEN].into();
-
-        let pt = b"the quick brown fox jumps over the lazy dog";
-        let mut ct = [0; 43];
-        let mut pt_out = [0; 43];
-
-        k.encrypt(&mut ct, &mut tag, &nonce, b"", pt).unwrap();
-        k.decrypt(&mut pt_out, &nonce, b"", &ct, &tag).unwrap();
-        assert_eq!(pt, &pt_out);
-    }
-
-    #[test]
-    fn test_key_centric_refs() {
-        use consts::AeadConsts as _;
-        use typed_refs::Aead as _;
-
-        use super::AesGcm128;
-
-        let algo = AesGcm128;
-
-        let mut tag_bytes = [0; AesGcm128::TAG_LEN];
-        let key = algo.new_key(&[0; AesGcm128::KEY_LEN]).unwrap();
-        let tag = algo.new_tag_mut(&mut tag_bytes).unwrap();
-        let nonce = algo.new_nonce(&[0; AesGcm128::NONCE_LEN]).unwrap();
-
-        let pt = b"the quick brown fox jumps over the lazy dog";
-        let mut ct = [0; 43];
-        let mut pt_out = [0; 43];
-
-        key.encrypt(&mut ct, tag, nonce, b"", pt).unwrap();
-        let tag = algo.new_tag(&tag_bytes).unwrap();
-        key.decrypt(&mut pt_out, nonce, b"", &ct, tag).unwrap();
-        assert_eq!(pt, &pt_out);
-    }
-}
diff --git a/aesgcm/tests/key_centric.rs b/aesgcm/tests/key_centric.rs
new file mode 100644
index 000000000..3e3551d99
--- /dev/null
+++ b/aesgcm/tests/key_centric.rs
@@ -0,0 +1,48 @@
+use libcrux_traits::aead::consts;
+use libcrux_traits::aead::typed_owned;
+use libcrux_traits::aead::typed_refs;
+
+use libcrux_aesgcm::AesGcm128;
+
+type Key = typed_owned::Key<AesGcm128>;
+type Nonce = typed_owned::Nonce<AesGcm128>;
+type Tag = typed_owned::Tag<AesGcm128>;
+
+#[test]
+fn test_key_centric_owned() {
+    use consts::AeadConsts as _;
+
+    let k: Key = [0; AesGcm128::KEY_LEN].into();
+    let nonce: Nonce = [0; AesGcm128::NONCE_LEN].into();
+    let mut tag: Tag = [0; AesGcm128::TAG_LEN].into();
+
+    let pt = b"the quick brown fox jumps over the lazy dog";
+    let mut ct = [0; 43];
+    let mut pt_out = [0; 43];
+
+    k.encrypt(&mut ct, &mut tag, &nonce, b"", pt).unwrap();
+    k.decrypt(&mut pt_out, &nonce, b"", &ct, &tag).unwrap();
+    assert_eq!(pt, &pt_out);
+}
+
+#[test]
+fn test_key_centric_refs() {
+    use consts::AeadConsts as _;
+    use typed_refs::Aead as _;
+
+    let algo = AesGcm128;
+
+    let mut tag_bytes = [0; AesGcm128::TAG_LEN];
+    let key = algo.new_key(&[0; AesGcm128::KEY_LEN]).unwrap();
+    let tag = algo.new_tag_mut(&mut tag_bytes).unwrap();
+    let nonce = algo.new_nonce(&[0; AesGcm128::NONCE_LEN]).unwrap();
+
+    let pt = b"the quick brown fox jumps over the lazy dog";
+    let mut ct = [0; 43];
+    let mut pt_out = [0; 43];
+
+    key.encrypt(&mut ct, tag, nonce, b"", pt).unwrap();
+    let tag = algo.new_tag(&tag_bytes).unwrap();
+    key.decrypt(&mut pt_out, nonce, b"", &ct, tag).unwrap();
+    assert_eq!(pt, &pt_out);
+}

From c509b34db0aa76ccdd56290431339b4bbdfefff9 Mon Sep 17 00:00:00 2001
From: wysiwys <clara@cryspen.com>
Date: Wed, 24 Sep 2025 15:57:22 +0200
Subject: [PATCH 43/43] move trait implementations to own modules

---
 aesgcm/src/lib.rs | 260 ++++++++++++++++++++++++----------------------
 1 file changed, 136 insertions(+), 124 deletions(-)

diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs
index 6c4a8e66e..f289c67d1 100644
--- a/aesgcm/src/lib.rs
+++ b/aesgcm/src/lib.rs
@@ -290,155 +290,167 @@ macro_rules! api {
             pub type Tag = [u8; TAG_LEN];
             pub type Nonce = [u8; NONCE_LEN];
 
-            // implement `libcrux_traits` slice trait
-            slice::impl_aead_slice_trait!($multiplexing => KEY_LEN, TAG_LEN, NONCE_LEN);
+            mod _libcrux_traits_apis_multiplex {
+                use super::*;
 
-            // implement `libcrux_traits` public API traits
-            impl_traits_public_api!($multiplexing, KEY_LEN, TAG_LEN, NONCE_LEN);
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($multiplexing => KEY_LEN, TAG_LEN, NONCE_LEN);
 
-            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
-                fn encrypt(
-                    ciphertext: &mut [u8],
-                    tag: &mut Tag,
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    plaintext: &[u8],
-                ) -> Result<(), EncryptError> {
-                    // SIMD256 needs to come first because SIMD128 is true for
-                    // x64 as well, but we don't actually implement it.
-                    if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-                        $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-                    } else if libcrux_platform::simd128_support()
-                        && libcrux_platform::aes_ni_support()
-                    {
-                        $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
-                    } else {
-                        $portable::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($multiplexing, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $multiplexing {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        // SIMD256 needs to come first because SIMD128 is true for
+                        // x64 as well, but we don't actually implement it.
+                        if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                            $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        } else if libcrux_platform::simd128_support()
+                            && libcrux_platform::aes_ni_support()
+                        {
+                            $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        } else {
+                            $portable::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        }
                     }
-                }
 
-                fn decrypt(
-                    plaintext: &mut [u8],
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    ciphertext: &[u8],
-                    tag: &Tag,
-                ) -> Result<(), DecryptError> {
-                    // SIMD256 needs to come first because SIMD128 is true for
-                    // x64 as well, but we don't actually implement it.
-                    if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
-                        $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-                    } else if libcrux_platform::simd128_support()
-                        && libcrux_platform::aes_ni_support()
-                    {
-                        $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
-                    } else {
-                        $portable::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        // SIMD256 needs to come first because SIMD128 is true for
+                        // x64 as well, but we don't actually implement it.
+                        if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                            $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        } else if libcrux_platform::simd128_support()
+                            && libcrux_platform::aes_ni_support()
+                        {
+                            $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        } else {
+                            $portable::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        }
                     }
                 }
             }
 
-            // implement `libcrux_traits` slice trait
-            slice::impl_aead_slice_trait!($portable => KEY_LEN, TAG_LEN, NONCE_LEN);
+            mod _libcrux_traits_apis_portable {
+                use super::*;
 
-            // implement `libcrux_traits` public API traits
-            impl_traits_public_api!($portable, KEY_LEN, TAG_LEN, NONCE_LEN);
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($portable => KEY_LEN, TAG_LEN, NONCE_LEN);
 
-            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
-                fn encrypt(
-                    ciphertext: &mut [u8],
-                    tag: &mut Tag,
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    plaintext: &[u8],
-                ) -> Result<(), EncryptError> {
-                    portable::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-                    Ok(())
-                }
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($portable, KEY_LEN, TAG_LEN, NONCE_LEN);
 
-                fn decrypt(
-                    plaintext: &mut [u8],
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    ciphertext: &[u8],
-                    tag: &Tag,
-                ) -> Result<(), DecryptError> {
-                    portable::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-                        .map_err(|_| DecryptError::InvalidTag)
+                impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $portable {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        portable::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        portable::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
                 }
             }
 
-            // implement `libcrux_traits` slice trait
             #[cfg(feature = "simd128")]
-            slice::impl_aead_slice_trait!($neon => KEY_LEN, TAG_LEN, NONCE_LEN);
+            mod _libcrux_traits_apis_neon {
+                use super::*;
 
-            // implement `libcrux_traits` public API traits
-            #[cfg(feature = "simd128")]
-            impl_traits_public_api!($neon, KEY_LEN, TAG_LEN, NONCE_LEN);
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($neon => KEY_LEN, TAG_LEN, NONCE_LEN);
 
-            #[cfg(feature = "simd128")]
-            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
-                fn encrypt(
-                    ciphertext: &mut [u8],
-                    tag: &mut Tag,
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    plaintext: &[u8],
-                ) -> Result<(), EncryptError> {
-                    neon::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-                    Ok(())
-                }
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($neon, KEY_LEN, TAG_LEN, NONCE_LEN);
 
-                fn decrypt(
-                    plaintext: &mut [u8],
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    ciphertext: &[u8],
-                    tag: &Tag,
-                ) -> Result<(), DecryptError> {
-                    neon::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-                        .map_err(|_| DecryptError::InvalidTag)
+                impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $neon {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        neon::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        neon::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
                 }
             }
 
-            // implement `libcrux_traits` slice trait
             #[cfg(feature = "simd256")]
-            slice::impl_aead_slice_trait!($x64 => KEY_LEN, TAG_LEN, NONCE_LEN);
+            mod _libcrux_traits_api_x64 {
+                use super::*;
 
-            // implement `libcrux_traits` public API traits
-            #[cfg(feature = "simd256")]
-            impl_traits_public_api!($x64, KEY_LEN, TAG_LEN, NONCE_LEN);
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($x64 => KEY_LEN, TAG_LEN, NONCE_LEN);
 
-            #[cfg(feature = "simd256")]
-            impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $x64 {
-                fn encrypt(
-                    ciphertext: &mut [u8],
-                    tag: &mut Tag,
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    plaintext: &[u8],
-                ) -> Result<(), EncryptError> {
-                    x64::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
-                    Ok(())
-                }
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($x64, KEY_LEN, TAG_LEN, NONCE_LEN);
 
-                fn decrypt(
-                    plaintext: &mut [u8],
-                    key: &Key,
-                    nonce: &Nonce,
-                    aad: &[u8],
-                    ciphertext: &[u8],
-                    tag: &Tag,
-                ) -> Result<(), DecryptError> {
-                    x64::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
-                        .map_err(|_| DecryptError::InvalidTag)
+                impl arrayref::Aead<KEY_LEN, TAG_LEN, NONCE_LEN> for $x64 {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        x64::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        x64::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
                 }
             }
         }