diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml
new file mode 100644
index 000000000..e4635625c
--- /dev/null
+++ b/.github/workflows/aes.yml
@@ -0,0 +1,169 @@
+name: AES-GCM
+
+on:
+  merge_group:
+  pull_request:
+    branches: ["main", "dev", "*"]
+    paths:
+      - "aesgcm/**"
+  workflow_dispatch:
+
+env:
+  CARGO_TERM_COLOR: always
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        bits: [32, 64]
+        os:
+          - macos-latest # macos-15 on apple silicon
+          - ubuntu-latest
+          - windows-latest
+        exclude:
+          - bits: 32
+            os: "macos-latest"
+
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+      - uses: taiki-e/install-action@cargo-hack
+
+      - name: Update dependencies
+        run: cargo update
+
+      - run: echo "RUST_TARGET_FLAG=" >> $GITHUB_ENV
+        if: ${{ matrix.bits == 64 }}
+
+      - run: echo 'EXCLUDE_FEATURES=--exclude-features simd256' >> $GITHUB_ENV
+        if: ${{ matrix.os == 'macos-latest' }}
+
+      - run: echo 'EXCLUDE_FEATURES=--exclude-features simd128' >> $GITHUB_ENV
+        if: ${{ matrix.os != 'macos-latest' }}
+
+      - name: 🛠️ Setup Rust Nightly
+        run: rustup toolchain install nightly
+
+      - name: 🛠️ Setup Ubuntu x86
+        if: ${{ matrix.bits == 32 && matrix.os == 'ubuntu-latest' }}
+        run: |
+          rustup target add i686-unknown-linux-gnu
+          sudo apt-get update
+          sudo apt-get install -y gcc-multilib g++-multilib
+
+      # Set up 32 bit systems
+
+      - name: 🛠️ Config Windows x86
+        run: echo "RUST_TARGET_FLAG=--target=i686-pc-windows-msvc" >> $GITHUB_ENV
+        if: ${{ matrix.bits == 32 && matrix.os == 'windows-latest' }}
+
+      - name: 🛠️ Config Linux x86
+        run: |
+          echo "RUST_TARGET_FLAG=--target=i686-unknown-linux-gnu" >> $GITHUB_ENV
+        if: ${{ matrix.bits == 32 && matrix.os == 'ubuntu-latest' }}
+
+      # Build ...
+
+      - name: 🔨 Build
+        run: |
+          rustc --print=cfg
+          cargo build --verbose $RUST_TARGET_FLAG
+
+      - name: 🔨 Build Release
+        run: cargo build --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻 Asan MacOS
+        if: ${{ matrix.os == 'macos-latest' }}
+        run: RUSTDOCFLAGS=-Zsanitizer=address RUSTFLAGS=-Zsanitizer=address cargo +nightly test --release --target aarch64-apple-darwin
+
+      # Test ...
+
+      - name: 🏃🏻‍♀️ Test
+        run: |
+          cargo clean
+          cargo test --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Release
+        run: |
+          cargo clean
+          cargo test --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Portable
+        run: |
+          cargo clean
+          LIBCRUX_DISABLE_SIMD128=1 LIBCRUX_DISABLE_SIMD256=1 cargo test --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Portable Release
+        run: |
+          cargo clean
+          LIBCRUX_DISABLE_SIMD128=1 LIBCRUX_DISABLE_SIMD256=1 cargo test --verbose --release $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Test Kyber
+        run: |
+          cargo clean
+          cargo test --features kyber --verbose $RUST_TARGET_FLAG
+
+      - name: 🏃🏻‍♀️ Cargo Test Features
+        if: ${{ matrix.bits == 64 }}
+        run: |
+          cargo clean
+          cargo hack test --each-feature $EXCLUDE_FEATURES --verbose $RUST_TARGET_FLAG
+
+  build-intel-macos:
+    runs-on: macos-13
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Update dependencies
+        run: cargo update
+
+      - name: 🔨 Build
+        run: |
+          rustc --print=cfg
+          cargo build --verbose
+
+  fuzz:
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - macos-latest # macos-15
+          - ubuntu-latest
+
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: aesgcm
+
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: 🛠️ Setup Rust Nightly
+        run: |
+          rustup toolchain install nightly
+          cargo install cargo-fuzz
+
+      - name: 🛠️ Update dependencies
+        run: cargo update
+
+      - name: 🏃🏻‍♀️ Encrypt128
+        run: CARGO_PROFILE_RELEASE_LTO=false cargo +nightly fuzz run encrypt128 -- -runs=100000
+
+      - name: 🏃🏻‍♀️ Encrypt256
+        run: CARGO_PROFILE_RELEASE_LTO=false cargo +nightly fuzz run encrypt256 -- -runs=100000
diff --git a/Cargo.toml b/Cargo.toml
index 528ebd552..c507579fa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,7 @@
 [workspace]
 members = [
+    "aesgcm",
+    "aesgcm/fuzz",
     "sys/hacl",
     "sys/libjade",
     "sys/platform",
@@ -49,6 +51,20 @@ allow-branch = ["main"]
 
 [workspace.dependencies]
 hax-lib = { version = "0.3.4" }
+libcrux-intrinsics = { version = "=0.0.3", path = "libcrux-intrinsics" }
+libcrux-aesgcm = { version = "=0.0.2", path = "aesgcm" }
+libcrux-chacha20poly1305 = { version = "=0.0.3", path = "chacha20poly1305" }
+libcrux-traits = { version = "=0.0.3", path = "traits" }
+libcrux-hacl-rs = { version = "=0.0.3", path = "hacl-rs" }
+libcrux-hacl = { version = "=0.0.2", path = "sys/hacl" }
+libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
+libcrux-hkdf = { version = "=0.0.3", path = "libcrux-hkdf" }
+libcrux-hmac = { version = "=0.0.3", path = "libcrux-hmac" }
+libcrux-sha2 = { version = "=0.0.3", path = "sha2" }
+libcrux-ed25519 = { version = "=0.0.3", path = "ed25519" }
+libcrux-ecdh = { version = "=0.0.3", path = "libcrux-ecdh" }
+libcrux-ml-kem = { version = "=0.0.3", path = "libcrux-ml-kem" }
+libcrux-kem = { version = "=0.0.3", path = "libcrux-kem" }
 
 [package]
 name = "libcrux"
@@ -81,23 +97,27 @@ bench = false # so libtest doesn't eat the argumen
 libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
 
 [dependencies]
-libcrux-traits = { version = "=0.0.3", path = "traits" }
-libcrux-chacha20poly1305 = { version = "=0.0.3", path = "chacha20poly1305" }
-libcrux-hacl-rs = { version = "=0.0.3", path = "hacl-rs" }
-libcrux-hacl = { version = "=0.0.2", path = "sys/hacl" }
-libcrux-platform = { version = "=0.0.2", path = "sys/platform" }
-libcrux-hkdf = { version = "=0.0.3", path = "libcrux-hkdf" }
-libcrux-hmac = { version = "=0.0.3", path = "libcrux-hmac" }
-libcrux-sha2 = { version = "=0.0.3", path = "sha2" }
-libcrux-ed25519 = { version = "=0.0.3", path = "ed25519" }
-libcrux-ecdh = { version = "=0.0.3", path = "libcrux-ecdh" }
-libcrux-ml-kem = { version = "=0.0.3", path = "libcrux-ml-kem" }
-libcrux-kem = { version = "=0.0.3", path = "libcrux-kem" }
+libcrux-hacl-rs.workspace = true
+libcrux-chacha20poly1305.workspace = true
+libcrux-ml-kem.workspace = true
+libcrux-traits.workspace = true
+libcrux-hacl.workspace = true
+libcrux-platform.workspace = true
+libcrux-hkdf.workspace = true
+libcrux-hmac.workspace = true
+libcrux-sha2.workspace = true
+libcrux-ed25519.workspace = true
+libcrux-ecdh.workspace = true
+libcrux-kem.workspace = true
+
 rand = { version = "0.9" }
 log = { version = "0.4", optional = true }
+
 # WASM API
 wasm-bindgen = { version = "0.2.87", optional = true }
 getrandom = { version = "0.3", optional = true }
+
+# Proofs
 hax-lib.workspace = true
 
 [dev-dependencies]
diff --git a/aesgcm/.gitignore b/aesgcm/.gitignore
new file mode 100644
index 000000000..407088d6f
--- /dev/null
+++ b/aesgcm/.gitignore
@@ -0,0 +1 @@
+profile.json.gz
diff --git a/aesgcm/Cargo.toml b/aesgcm/Cargo.toml
new file mode 100644
index 000000000..876708a07
--- /dev/null
+++ b/aesgcm/Cargo.toml
@@ -0,0 +1,45 @@
+[package]
+name = "libcrux-aesgcm"
+version.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+edition.workspace = true
+repository.workspace = true
+readme = "README.md"
+description = "Libcrux AES-GCM implementation"
+exclude = []
+
+[lib]
+bench = false # so libtest doesn't eat the arguments to criterion
+
+[dependencies]
+libcrux-platform.workspace = true
+libcrux-intrinsics.workspace = true
+libcrux-traits.workspace = true
+
+rand = { version = "0.9", optional = true }
+
+[features]
+default = ["rand"] # XXX: remove rand here when cleaning up
+simd128 = []
+simd256 = []
+rand = ["dep:rand"]
+std = []
+
+[[bench]]
+name = "aesgcm"
+harness = false
+
+[dev-dependencies]
+libcrux-aesgcm = { version = "*", features = ["std"], path = "." }
+cavp = { version = "0.0.2", path = "../cavp" }
+criterion = "0.5.1"
+hex = "0.4.3"
+pretty_env_logger = "0.5.0"
+rand_core = { version = "0.6" }
+aes-gcm = "0.10.3"
+wycheproof = "0.6.0"
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(hax)', 'cfg(eurydice)'] }
diff --git a/aesgcm/README.md b/aesgcm/README.md
new file mode 100644
index 000000000..0d708942f
--- /dev/null
+++ b/aesgcm/README.md
@@ -0,0 +1,12 @@
+# AES-GCM
+
+![pre-verification]
+
+This crate implements AES-GCM 128 and 256.
+
+It provides
+- a portable, bit-sliced implementation
+- an x64 optimised implementation using AES-NI
+- an AArch64 optimised implementation using the AES instructions
+
+[pre-verification]: https://img.shields.io/badge/pre_verification-orange.svg?style=for-the-badge&logo=data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48IS0tIFVwbG9hZGVkIHRvOiBTVkcgUmVwbywgd3d3LnN2Z3JlcG8uY29tLCBHZW5lcmF0b3I6IFNWRyBSZXBvIE1peGVyIFRvb2xzIC0tPg0KPHN2ZyB3aWR0aD0iODAwcHgiIGhlaWdodD0iODAwcHgiIHZpZXdCb3g9IjAgMCAyNCAyNCIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4NCjxwYXRoIGQ9Ik05IDEySDE1TTIwIDEyQzIwIDE2LjQ2MTEgMTQuNTQgMTkuNjkzNyAxMi42NDE0IDIwLjY4M0MxMi40MzYxIDIwLjc5IDEyLjMzMzQgMjAuODQzNSAxMi4xOTEgMjAuODcxMkMxMi4wOCAyMC44OTI4IDExLjkyIDIwLjg5MjggMTEuODA5IDIwLjg3MTJDMTEuNjY2NiAyMC44NDM1IDExLjU2MzkgMjAuNzkgMTEuMzU4NiAyMC42ODNDOS40NTk5NiAxOS42OTM3IDQgMTYuNDYxMSA0IDEyVjguMjE3NTlDNCA3LjQxODA4IDQgNy4wMTgzMyA0LjEzMDc2IDYuNjc0N0M0LjI0NjI3IDYuMzcxMTMgNC40MzM5OCA2LjEwMDI3IDQuNjc3NjYgNS44ODU1MkM0Ljk1MzUgNS42NDI0MyA1LjMyNzggNS41MDIwNyA2LjA3NjQgNS4yMjEzNEwxMS40MzgyIDMuMjEwNjdDMTEuNjQ2MSAzLjEzMjcxIDExLjc1IDMuMDkzNzMgMTEuODU3IDMuMDc4MjdDMTEuOTUxOCAzLjA2NDU3IDEyLjA0ODIgMy4wNjQ1NyAxMi4xNDMgMy4wNzgyN0MxMi4yNSAzLjA5MzczIDEyLjM1MzkgMy4xMzI3MSAxMi41NjE4IDMuMjEwNjdMMTcuOTIzNiA1LjIyMTM0QzE4LjY3MjIgNS41MDIwNyAxOS4wNDY1IDUuNjQyNDMgMTkuMzIyMyA1Ljg4NTUyQzE5LjU2NiA2LjEwMDI3IDE5Ljc1MzcgNi4zNzExMyAxOS44NjkyIDYuNjc0N0MyMCA3LjAxODMzIDIwIDcuNDE4MDggMjAgOC4yMTc1OVYxMloiIHN0cm9rZT0iIzAwMDAwMCIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiLz4NCjwvc3ZnPg==
diff --git a/aesgcm/benches/aesgcm.rs b/aesgcm/benches/aesgcm.rs
new file mode 100644
index 000000000..1fe2a78d7
--- /dev/null
+++ b/aesgcm/benches/aesgcm.rs
@@ -0,0 +1,197 @@
+#![allow(non_snake_case)]
+use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
+
+pub fn randombytes(n: usize) -> Vec<u8> {
+    let mut bytes = vec![0u8; n];
+    rand::rng().fill_bytes(&mut bytes);
+    bytes
+}
+
+pub fn fmt(x: usize) -> String {
+    let base = (x as f64).log(1024f64).floor() as usize;
+    let suffix = ["", "KB", "MB", "GB"];
+    format!("{} {}", x >> (10 * base), suffix[base])
+}
+
+macro_rules! impl_comp {
+    ($fun:ident, $keylen:literal, $portable_fun:expr, $neon_fun:expr, $intel_fun:expr, $rustcrypto_fun:expr) => {
+        // Comparing libcrux performance for different payload sizes and other implementations.
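+        // Each `$*_fun` argument is expected to have the same shape, roughly
+        // `fn(key: &[u8], nonce: &[u8], aad: &[u8], msg: &[u8], ciphertext: &mut [u8], tag: &mut [u8])`,
+        // as e.g. `libcrux_aesgcm::portable::aes_gcm_128::encrypt` and the
+        // `rustcrypto_*` wrappers at the bottom of this file.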
+        fn $fun(c: &mut Criterion) {
+            const PAYLOAD_SIZES: [usize; 3] = [128, 1024, 1024 * 1024 * 10];
+
+            let mut group = c.benchmark_group(stringify!($fun).replace("_", " "));
+
+            for payload_size in PAYLOAD_SIZES.iter() {
+                group.throughput(Throughput::Bytes(*payload_size as u64));
+
+                group.bench_with_input(
+                    BenchmarkId::new("libcrux", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || {
+                                (
+                                    randombytes($keylen),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $portable_fun(
+                                    &key,
+                                    &nonce,
+                                    &aad,
+                                    &payload,
+                                    &mut ciphertext,
+                                    &mut tag,
+                                );
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+                group.bench_with_input(
+                    BenchmarkId::new("neon-aes-clmul", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || {
+                                (
+                                    randombytes($keylen),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $neon_fun(&key, &nonce, &aad, &payload, &mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
+                #[cfg(all(target_arch = "x86_64"))] // ENABLE: target_feature="aes"
+                group.bench_with_input(
+                    BenchmarkId::new("intel-aes-clmul", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || {
+                                (
+                                    randombytes($keylen),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $intel_fun(&key, &nonce, &aad, &payload, &mut ciphertext, &mut tag);
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+
+                group.bench_with_input(
+                    BenchmarkId::new("rust-crypto", fmt(*payload_size)),
+                    payload_size,
+                    |b, payload_size| {
+                        b.iter_batched(
+                            || {
+                                (
+                                    randombytes($keylen),
+                                    randombytes(12),
+                                    randombytes(32),
+                                    randombytes(*payload_size),
+                                )
+                            },
+                            |(key, nonce, aad, payload)| {
+                                let mut ciphertext = vec![0; *payload_size];
+                                let mut tag = [0u8; 16];
+                                $rustcrypto_fun(
+                                    &key,
+                                    &nonce,
+                                    &aad,
+                                    &payload,
+                                    &mut ciphertext,
+                                    &mut tag,
+                                );
+                            },
+                            BatchSize::SmallInput,
+                        )
+                    },
+                );
+            }
+        }
+    };
+}
+
+use aes_gcm::{
+    aead::{Aead, KeyInit, Payload},
+    Aes128Gcm, Aes256Gcm,
+};
+use rand::RngCore;
+
+fn rustcrypto_aes128_gcm_encrypt(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    msg: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    let cipher = Aes128Gcm::new(key.into());
+    let ctxt = cipher.encrypt(nonce.into(), Payload { msg, aad }).unwrap();
+    ciphertext.copy_from_slice(&ctxt[0..msg.len()]);
+    tag.copy_from_slice(&ctxt[msg.len()..]);
+}
+
+// XXX: We could work with the traits here, but this is quicker for now.
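+// A trait-based version could collapse the two wrappers into one generic
+// helper; a rough sketch (hypothetical, not used in this benchmark):
+//
+// fn rustcrypto_encrypt<C: Aead + KeyInit>(
+//     key: &[u8], nonce: &[u8], aad: &[u8], msg: &[u8],
+//     ciphertext: &mut [u8], tag: &mut [u8],
+// ) {
+//     let cipher = C::new(key.into());
+//     let ctxt = cipher.encrypt(nonce.into(), Payload { msg, aad }).unwrap();
+//     ciphertext.copy_from_slice(&ctxt[0..msg.len()]);
+//     tag.copy_from_slice(&ctxt[msg.len()..]);
+// }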
+fn rustcrypto_aes256_gcm_encrypt(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    msg: &[u8],
+    ciphertext: &mut [u8],
+    tag: &mut [u8],
+) {
+    let cipher = Aes256Gcm::new(key.into());
+    let ctxt = cipher.encrypt(nonce.into(), Payload { msg, aad }).unwrap();
+    ciphertext.copy_from_slice(&ctxt[0..msg.len()]);
+    tag.copy_from_slice(&ctxt[msg.len()..]);
+}
+
+impl_comp!(
+    AES128_GCM,
+    16,
+    libcrux_aesgcm::portable::aes_gcm_128::encrypt,
+    libcrux_aesgcm::neon::aes_gcm_128::encrypt,
+    libcrux_aesgcm::x64::aes_gcm_128::encrypt,
+    rustcrypto_aes128_gcm_encrypt
+);
+impl_comp!(
+    AES256_GCM,
+    32,
+    libcrux_aesgcm::portable::aes_gcm_256::encrypt,
+    libcrux_aesgcm::neon::aes_gcm_256::encrypt,
+    libcrux_aesgcm::x64::aes_gcm_256::encrypt,
+    rustcrypto_aes256_gcm_encrypt
+);
+
+fn benchmarks(c: &mut Criterion) {
+    AES128_GCM(c);
+    AES256_GCM(c);
+}
+
+criterion_group!(benches, benchmarks);
+criterion_main!(benches);
diff --git a/aesgcm/build.rs b/aesgcm/build.rs
new file mode 100644
index 000000000..abc7a8b78
--- /dev/null
+++ b/aesgcm/build.rs
@@ -0,0 +1,34 @@
+use std::env;
+
+fn main() {
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+    let disable_simd128 = read_env("LIBCRUX_DISABLE_SIMD128");
+    let disable_simd256 = read_env("LIBCRUX_DISABLE_SIMD256");
+
+    // Force a simd build. Make sure you know what you're doing.
+    let enable_simd128 = read_env("LIBCRUX_ENABLE_SIMD128");
+    let enable_simd256 = read_env("LIBCRUX_ENABLE_SIMD256");
+
+    let simd128_possible = target_arch == "aarch64";
+    if (simd128_possible || enable_simd128) && !disable_simd128 {
+        // We enable simd128 on all aarch64 builds.
+        println!("cargo:rustc-cfg=feature=\"simd128\"");
+    }
+    let simd256_possible = target_arch == "x86_64";
+    if (simd256_possible || enable_simd256) && !disable_simd256 {
+        // We enable simd256 on all x86_64 builds.
+        // Note that this doesn't mean the required CPU features are available.
+        // But the compiler will support them and the runtime checks ensure that
+        // it's only used when available.
+        //
+        // We don't enable this on x86 because it seems to generate invalid code.
+        println!("cargo:rustc-cfg=feature=\"simd256\"");
+    }
+}
+
+fn read_env(key: &str) -> bool {
+    match env::var(key) {
+        Ok(s) => s == "1" || s == "y" || s == "Y",
+        Err(_) => false,
+    }
+}
diff --git a/aesgcm/examples/bench.rs b/aesgcm/examples/bench.rs
new file mode 100644
index 000000000..0f823cbfd
--- /dev/null
+++ b/aesgcm/examples/bench.rs
@@ -0,0 +1,26 @@
+use libcrux_aesgcm::Aead;
+
+fn main() {
+    const PAYLOAD_SIZES: usize = 3045;
+
+    let key = [0x16; 16];
+    let nonce = [0x12; 12];
+
+    let aad = [0xff; 32];
+    let plaintext = [0xab; PAYLOAD_SIZES];
+
+    let mut ciphertext = vec![0; PAYLOAD_SIZES];
+    let mut tag = [0u8; 16];
+
+    for _ in 0..10000 {
+        libcrux_aesgcm::AesGcm128::encrypt(
+            &mut ciphertext,
+            &mut tag,
+            &key,
+            &nonce,
+            &aad,
+            &plaintext,
+        )
+        .unwrap();
+    }
+}
diff --git a/aesgcm/fuzz/.gitignore b/aesgcm/fuzz/.gitignore
new file mode 100644
index 000000000..1a45eee77
--- /dev/null
+++ b/aesgcm/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/aesgcm/fuzz/Cargo.toml b/aesgcm/fuzz/Cargo.toml
new file mode 100644
index 000000000..ab0ba8f8f
--- /dev/null
+++ b/aesgcm/fuzz/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "libcrux-aesgcm-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.libcrux-aesgcm]
+path = ".."
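+
+# The fuzz targets below are run with cargo-fuzz, e.g.
+#   cargo +nightly fuzz run encrypt128 -- -runs=100000
+# (this is what the fuzz job in .github/workflows/aes.yml does).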
+
+[[bin]]
+name = "encrypt128"
+path = "fuzz_targets/encrypt128.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "encrypt256"
+path = "fuzz_targets/encrypt256.rs"
+test = false
+doc = false
+bench = false
diff --git a/aesgcm/fuzz/fuzz_targets/encrypt128.rs b/aesgcm/fuzz/fuzz_targets/encrypt128.rs
new file mode 100644
index 000000000..0daa622a8
--- /dev/null
+++ b/aesgcm/fuzz/fuzz_targets/encrypt128.rs
@@ -0,0 +1,28 @@
+#![no_main]
+
+use libcrux_aesgcm::Aead;
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    if data.len() < 16 + 12 + 7 {
+        // We want at least a key, nonce, and a few input bytes.
+        return;
+    }
+
+    let key = &data[0..16];
+    let nonce = &data[16..16 + 12];
+    let aad = &data[16 + 12..16 + 12 + 5];
+
+    let mut ctxt = vec![0u8; data.len()];
+    let mut tag = [0u8; 16];
+    libcrux_aesgcm::PortableAesGcm128::encrypt(
+        &mut ctxt,
+        &mut tag,
+        key.try_into().unwrap(),
+        nonce.try_into().unwrap(),
+        aad,
+        &data,
+    )
+    .unwrap();
+});
diff --git a/aesgcm/fuzz/fuzz_targets/encrypt256.rs b/aesgcm/fuzz/fuzz_targets/encrypt256.rs
new file mode 100644
index 000000000..528634f9c
--- /dev/null
+++ b/aesgcm/fuzz/fuzz_targets/encrypt256.rs
@@ -0,0 +1,28 @@
+#![no_main]
+
+use libcrux_aesgcm::Aead;
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    if data.len() < 32 + 12 + 7 {
+        // We want at least a key, nonce, and a few input bytes.
+        return;
+    }
+
+    let key = &data[0..32];
+    let nonce = &data[32..32 + 12];
+    let aad = &data[32 + 12..32 + 12 + 5];
+
+    let mut ctxt = vec![0u8; data.len()];
+    let mut tag = [0u8; 16];
+    libcrux_aesgcm::PortableAesGcm256::encrypt(
+        &mut ctxt,
+        &mut tag,
+        key.try_into().unwrap(),
+        nonce.try_into().unwrap(),
+        aad,
+        &data,
+    )
+    .unwrap();
+});
diff --git a/aesgcm/src/aes.rs b/aesgcm/src/aes.rs
new file mode 100644
index 000000000..121a63200
--- /dev/null
+++ b/aesgcm/src/aes.rs
@@ -0,0 +1,24 @@
+//! The AES block cipher function.
+
+use crate::platform::*;
+
+pub(crate) type ExtendedKey<T, const NUM_KEYS: usize> = [T; NUM_KEYS];
+
+/// AES block size
+pub(crate) const AES_BLOCK_LEN: usize = 16;
+
+/// The AES block cipher function.
+#[inline]
+pub(crate) fn block_cipher<T: AESState, const NUM_KEYS: usize>(
+    st: &mut T,
+    keyex: &ExtendedKey<T, NUM_KEYS>,
+) {
+    st.xor_key(&keyex[0]);
+
+    #[allow(clippy::needless_range_loop)]
+    for i in 1..NUM_KEYS - 1 {
+        st.aes_enc(&keyex[i]);
+    }
+
+    st.aes_enc_last(&keyex[NUM_KEYS - 1]);
+}
diff --git a/aesgcm/src/aes_gcm.rs b/aesgcm/src/aes_gcm.rs
new file mode 100644
index 000000000..25d5b2591
--- /dev/null
+++ b/aesgcm/src/aes_gcm.rs
@@ -0,0 +1,105 @@
+//! Implementation of AES-GCM
+
+/// Macro to instantiate the AES state.
+/// This should really be replaced by using traits everywhere.
+macro_rules! aesgcm {
aesgcm { + ($state:ty, $context:ident) => { + impl super::State for $state { + /// Initialize the state + fn init(key: &[u8]) -> Self { + debug_assert!(key.len() == KEY_LEN); + + let nonce = [0u8; NONCE_LEN]; + let mut gcm_key = [0u8; GCM_KEY_LEN]; + let tag_mix = [0u8; TAG_LEN]; + + let aes_state = $context::::init(key, &nonce); + aes_state.key_block(0, &mut gcm_key); + let gcm_state = GF128State::init(&gcm_key); + + Self { + aes_state, + gcm_state, + tag_mix, + } + } + + fn set_nonce(&mut self, nonce: &[u8]) { + debug_assert!(nonce.len() == NONCE_LEN); + + self.aes_state.set_nonce(nonce); + self.aes_state.key_block(1, &mut self.tag_mix); + } + + fn encrypt( + &mut self, + aad: &[u8], + plaintext: &[u8], + ciphertext: &mut [u8], + tag: &mut [u8], + ) { + debug_assert!(ciphertext.len() == plaintext.len()); + debug_assert!(plaintext.len() / AES_BLOCK_LEN <= u32::MAX as usize); + debug_assert!(tag.len() == TAG_LEN); + + self.aes_state.update(2, plaintext, ciphertext); + + self.gcm_state.update_padded(aad); + self.gcm_state.update_padded(ciphertext); + + let mut last_block = [0u8; AES_BLOCK_LEN]; + last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes()); + last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes()); + + self.gcm_state.update(&last_block); + self.gcm_state.emit(tag); + + for i in 0..16 { + tag[i] ^= self.tag_mix[i]; + } + } + + fn decrypt( + &mut self, + aad: &[u8], + ciphertext: &[u8], + tag: &[u8], + plaintext: &mut [u8], + ) -> Result<(), DecryptError> { + debug_assert!(plaintext.len() == ciphertext.len()); + debug_assert!(ciphertext.len() / AES_BLOCK_LEN <= u32::MAX as usize); + debug_assert!(tag.len() == TAG_LEN); + + self.gcm_state.update_padded(aad); + self.gcm_state.update_padded(ciphertext); + + let mut last_block = [0u8; AES_BLOCK_LEN]; + last_block[0..8].copy_from_slice(&((aad.len() as u64) * 8).to_be_bytes()); + last_block[8..16].copy_from_slice(&((plaintext.len() as u64) * 8).to_be_bytes()); + + self.gcm_state.update(&last_block); + + let mut computed_tag = [0u8; TAG_LEN]; + self.gcm_state.emit(&mut computed_tag); + + for i in 0..16 { + computed_tag[i] ^= self.tag_mix[i]; + } + + let mut eq_mask = 0u8; + for i in 0..16 { + eq_mask |= computed_tag[i] ^ tag[i]; + } + + if eq_mask == 0 { + self.aes_state.update(2, ciphertext, plaintext); + Ok(()) + } else { + Err(DecryptError()) + } + } + } + }; +} + +pub(crate) use aesgcm; diff --git a/aesgcm/src/aes_gcm_128.rs b/aesgcm/src/aes_gcm_128.rs new file mode 100644 index 000000000..64cd6d3c3 --- /dev/null +++ b/aesgcm/src/aes_gcm_128.rs @@ -0,0 +1,23 @@ +//! AES-GCM 128 + +use crate::{ + aes::AES_BLOCK_LEN, + aes_gcm::aesgcm, + ctr::Aes128CtrContext, + gf128::GF128State, + platform::{AESState, GF128FieldElement}, + DecryptError, NONCE_LEN, TAG_LEN, +}; + +/// Key length. +pub(crate) const KEY_LEN: usize = 16; +pub(crate) const GCM_KEY_LEN: usize = 16; + +/// The AES-GCM 128 state +pub(crate) struct State { + pub(crate) aes_state: Aes128CtrContext, + pub(crate) gcm_state: GF128State, + pub(crate) tag_mix: [u8; TAG_LEN], +} + +aesgcm!(State, Aes128CtrContext); diff --git a/aesgcm/src/aes_gcm_256.rs b/aesgcm/src/aes_gcm_256.rs new file mode 100644 index 000000000..070a31031 --- /dev/null +++ b/aesgcm/src/aes_gcm_256.rs @@ -0,0 +1,23 @@ +//! AES-GCM 256 + +use crate::{ + aes::AES_BLOCK_LEN, + aes_gcm::aesgcm, + ctr::Aes256CtrContext, + gf128::GF128State, + platform::{AESState, GF128FieldElement}, + DecryptError, NONCE_LEN, TAG_LEN, +}; + +/// Key length. 
+pub(crate) const KEY_LEN: usize = 32;
+pub(crate) const GCM_KEY_LEN: usize = 16;
+
+/// The AES-GCM 256 state
+pub(crate) struct State<T: AESState, FE: GF128FieldElement> {
+    pub(crate) aes_state: Aes256CtrContext<T>,
+    pub(crate) gcm_state: GF128State<FE>,
+    pub(crate) tag_mix: [u8; TAG_LEN],
+}
+
+aesgcm!(State, Aes256CtrContext);
diff --git a/aesgcm/src/ctr.rs b/aesgcm/src/ctr.rs
new file mode 100644
index 000000000..ea8fb4bb4
--- /dev/null
+++ b/aesgcm/src/ctr.rs
@@ -0,0 +1,106 @@
+//! AES ctr mode implementation.
+//!
+//! This implementation is generic over the [`AESState`], which has different,
+//! platform dependent implementations.
+//!
+//! This gets instantiated in [`aes128_ctr`] and [`aes256_ctr`].
+
+use crate::{aes::*, platform::AESState};
+
+#[cfg(test)]
+mod test128;
+
+mod aes128_ctr;
+mod aes256_ctr;
+
+pub(crate) use aes128_ctr::*;
+pub(crate) use aes256_ctr::*;
+
+/// The ctr nonce length. This is different from the AES nonce length
+/// [`crate::NONCE_LEN`].
+const NONCE_LEN: usize = 16;
+
+/// Generic AES CTR context.
+pub(crate) struct AesCtrContext<T: AESState, const NUM_KEYS: usize> {
+    pub(crate) extended_key: ExtendedKey<T, NUM_KEYS>,
+    pub(crate) ctr_nonce: [u8; NONCE_LEN],
+}
+
+impl<T: AESState, const NUM_KEYS: usize> AesCtrContext<T, NUM_KEYS> {
+    #[inline]
+    fn aes_ctr_set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == crate::NONCE_LEN);
+
+        self.ctr_nonce[0..crate::NONCE_LEN].copy_from_slice(nonce);
+    }
+
+    #[inline]
+    fn aes_ctr_key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN);
+
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
+
+        st.load_block(&st_init);
+
+        block_cipher(&mut st, &self.extended_key);
+
+        st.store_block(out);
+    }
+
+    #[inline]
+    fn aes_ctr_xor_block(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
+
+        let mut st_init = self.ctr_nonce;
+        st_init[12..16].copy_from_slice(&ctr.to_be_bytes());
+        let mut st = T::new();
+        st.load_block(&st_init);
+
+        block_cipher(&mut st, &self.extended_key);
+
+        st.xor_block(input, out);
+    }
+
+    #[inline]
+    fn aes_ctr_xor_blocks(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len().is_multiple_of(AES_BLOCK_LEN));
+        // If input.len() / AES_BLOCK_LEN == u32::MAX - 1 and we start with
+        // ctr == 2 then we'll wrap to 0 below and we'll repeat the initial key
+        // block
+        debug_assert!(input.len() / AES_BLOCK_LEN < (u32::MAX - 1) as usize);
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(i as u32),
+                &input[offset..offset + AES_BLOCK_LEN],
+                &mut out[offset..offset + AES_BLOCK_LEN],
+            );
+        }
+    }
+
+    #[inline]
+    fn aes_ctr_update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        debug_assert!(input.len() / AES_BLOCK_LEN < u32::MAX as usize);
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.aes_ctr_xor_blocks(
+            ctr,
+            &input[0..blocks * AES_BLOCK_LEN],
+            &mut out[0..blocks * AES_BLOCK_LEN],
+        );
+
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
+        if last < input.len() {
+            self.aes_ctr_xor_block(
+                ctr.wrapping_add(blocks as u32),
+                &input[last..],
+                &mut out[last..],
+            );
+        }
+    }
+}
diff --git a/aesgcm/src/ctr/aes128_ctr.rs b/aesgcm/src/ctr/aes128_ctr.rs
new file mode 100644
index 000000000..87a7e9134
--- /dev/null
+++ b/aesgcm/src/ctr/aes128_ctr.rs
@@ -0,0 +1,80 @@
+//! AES128 ctr mode, generic over the platform [`AESState`].
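+//!
+//! Rough sketch of the mode as implemented by [`AesCtrContext`]: for the
+//! 12-byte nonce `n` and a 32-bit big-endian block counter `c`, keystream
+//! block `i` is `AES_k(n || be32(c + i))`, and ciphertext block `i` is
+//! `plaintext_i ^ keystream_i`.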
+
+use core::array::from_fn;
+
+use super::AesCtrContext;
+use crate::{aes::*, aes_gcm_128::GCM_KEY_LEN, platform::AESState, NONCE_LEN};
+
+pub(super) const NUM_KEYS: usize = 11;
+
+/// Type alias for the AES 128 ctr context.
+pub(crate) type Aes128CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
+
+impl<T: AESState> Aes128CtrContext<T> {
+    #[inline]
+    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == GCM_KEY_LEN);
+
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..12].copy_from_slice(nonce);
+
+        Self {
+            extended_key: key_expansion(key),
+            ctr_nonce,
+        }
+    }
+
+    #[inline]
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+
+        self.aes_ctr_set_nonce(nonce);
+    }
+
+    #[inline]
+    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == GCM_KEY_LEN);
+
+        self.aes_ctr_key_block(ctr, out);
+    }
+
+    #[inline]
+    pub(crate) fn update(&self, ctr: u32, inp: &[u8], out: &mut [u8]) {
+        debug_assert!(inp.len() == out.len());
+
+        self.aes_ctr_update(ctr, inp, out);
+    }
+}
+
+/// 128 - Key expansion
+#[inline]
+fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
+    debug_assert!(key.len() == GCM_KEY_LEN);
+
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(key);
+
+    macro_rules! expansion_step128 {
+        ($i:expr,$rcon:expr) => {
+            // For hax we could clone here instead.
+            // let prev = keyex[$i - 1].clone();
+            let (prev, current) = keyex.split_at_mut($i);
+            current[0].aes_keygen_assist0::<$rcon>(&prev[$i - 1]);
+            current[0].key_expansion_step(&prev[$i - 1]);
+        };
+    }
+
+    expansion_step128!(1, 0x01);
+    expansion_step128!(2, 0x02);
+    expansion_step128!(3, 0x04);
+    expansion_step128!(4, 0x08);
+    expansion_step128!(5, 0x10);
+    expansion_step128!(6, 0x20);
+    expansion_step128!(7, 0x40);
+    expansion_step128!(8, 0x80);
+    expansion_step128!(9, 0x1b);
+    expansion_step128!(10, 0x36);
+
+    keyex
+}
diff --git a/aesgcm/src/ctr/aes256_ctr.rs b/aesgcm/src/ctr/aes256_ctr.rs
new file mode 100644
index 000000000..7009b138f
--- /dev/null
+++ b/aesgcm/src/ctr/aes256_ctr.rs
@@ -0,0 +1,107 @@
+//! AES256 ctr mode, generic over the platform [`AESState`].
+
+use core::array::from_fn;
+
+use super::AesCtrContext;
+use crate::{aes::*, aes_gcm_256::KEY_LEN, platform::AESState, NONCE_LEN};
+
+pub(crate) const NUM_KEYS: usize = 15;
+
+/// Type alias for the AES 256 ctr context.
+pub(crate) type Aes256CtrContext<T> = AesCtrContext<T, NUM_KEYS>;
+
+impl<T: AESState> Aes256CtrContext<T> {
+    #[inline]
+    pub(crate) fn init(key: &[u8], nonce: &[u8]) -> Self {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        debug_assert!(key.len() == KEY_LEN);
+
+        let mut ctr_nonce = [0u8; 16];
+        ctr_nonce[0..NONCE_LEN].copy_from_slice(nonce);
+
+        Self {
+            extended_key: key_expansion(key),
+            ctr_nonce,
+        }
+    }
+
+    #[inline]
+    pub(crate) fn set_nonce(&mut self, nonce: &[u8]) {
+        debug_assert!(nonce.len() == NONCE_LEN);
+        self.aes_ctr_set_nonce(nonce);
+    }
+
+    #[inline]
+    pub(crate) fn key_block(&self, ctr: u32, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
+        self.aes_ctr_key_block(ctr, out);
+    }
+
+    #[inline]
+    pub(crate) fn update(&self, ctr: u32, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len());
+        self.aes_ctr_update(ctr, input, out);
+    }
+}
+
+/// 256 - Key expansion
+#[inline]
+fn key_expansion<T: AESState>(key: &[u8]) -> ExtendedKey<T, NUM_KEYS> {
+    debug_assert!(key.len() == KEY_LEN);
+
+    let mut keyex = from_fn(|_| T::new());
+    keyex[0].load_block(&key[0..16]);
+    keyex[1].load_block(&key[16..32]);
+
+    macro_rules! expansion_step256 {
+        ($i:expr,$rcon:expr) => {
+            // Split at $i to get the one we currently look at and the previous
+            // blocks.
+            let (prev, current) = keyex.split_at_mut($i);
+
+            // Split again to get the $i and $i + 1 states to operate on.
+            let (c0, c1) = current.split_at_mut(1);
+            let key_i = &mut c0[0];
+            let key_i_plus_1 = &mut c1[0];
+
+            key_i.aes_keygen_assist0::<$rcon>(&prev[$i - 1]);
+            key_i.key_expansion_step(&prev[$i - 2]);
+
+            key_i_plus_1.aes_keygen_assist1(&key_i);
+            key_i_plus_1.key_expansion_step(&prev[$i - 1]);
+
+            // The following is what will go through hax right now. But it
+            // requires copies that are really not necessary.
+            // let prev0 = keyex[$i - 2].clone();
+            // let prev1 = keyex[$i - 1].clone();
+
+            // keyex[$i].aes_keygen_assist0::<$rcon>(&prev1);
+            // keyex[$i].key_expansion_step(&prev0);
+
+            // let next0 = keyex[$i].clone();
+            // keyex[$i + 1].aes_keygen_assist1(&next0);
+            // keyex[$i + 1].key_expansion_step(&prev1);
+        };
+    }
+
+    expansion_step256!(2, 0x01);
+    expansion_step256!(4, 0x02);
+    expansion_step256!(6, 0x04);
+    expansion_step256!(8, 0x08);
+    expansion_step256!(10, 0x10);
+    expansion_step256!(12, 0x20);
+
+    let (prev0, tmp) = keyex.split_at_mut(13);
+    let (prev1, last) = tmp.split_at_mut(1);
+    // let prev0 = &mut prev0[12];
+    // let prev1 = &mut prev1[0];
+    // let last = &mut last[0];
+    // To get through hax right now we'd have to clone instead.
+    // let prev0 = keyex[12].clone();
+    // let prev1 = keyex[13].clone();
+    // let last = &mut keyex[NUM_KEYS - 1];
+    last[0].aes_keygen_assist0::<0x40>(&prev1[0]);
+    last[0].key_expansion_step(&prev0[12]);
+
+    keyex
+}
diff --git a/aesgcm/src/ctr/test128.rs b/aesgcm/src/ctr/test128.rs
new file mode 100644
index 000000000..7a2c8ba2e
--- /dev/null
+++ b/aesgcm/src/ctr/test128.rs
@@ -0,0 +1,141 @@
+use crate::{
+    aes_gcm_128::GCM_KEY_LEN,
+    ctr::Aes128CtrContext,
+    platform::{self, AESState},
+    NONCE_LEN,
+};
+
+pub(crate) fn aes128_ctr_xor_block<T: AESState>(
+    ctx: &Aes128CtrContext<T>,
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(inp.len() == out.len() && inp.len() <= 16);
+    ctx.aes_ctr_xor_block(ctr, inp, out);
+}
+
+pub(crate) fn aes128_ctr_encrypt<T: AESState>(
+    key: &[u8],
+    nonce: &[u8],
+    ctr: u32,
+    inp: &[u8],
+    out: &mut [u8],
+) {
+    debug_assert!(nonce.len() == NONCE_LEN);
+    debug_assert!(key.len() == GCM_KEY_LEN);
+    debug_assert!(inp.len() == out.len());
+    let ctx = Aes128CtrContext::<T>::init(key, nonce);
+    ctx.update(ctr, inp, out);
+}
+
+const INPUT: [u8; 32] = [
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+];
+const KEY: [u8; 16] = [
+    0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91, 0x63,
+];
+const NONCE: [u8; 12] = [
+    0x00, 0x6C, 0xB6, 0xDB, 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B,
+];
+const EXPECTED: [u8; 32] = [
+    0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3, 0x88,
+    0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41, 0xBE, 0x28,
+];
+
+#[test]
+fn test_ctr_block() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    let ctx = Aes128CtrContext::<platform::portable::State>::init(&KEY, &NONCE);
+    aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+    aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(feature = "simd128")]
+#[test]
+fn test_ctr_block_neon() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    let ctx = Aes128CtrContext::<platform::neon::State>::init(&KEY, &NONCE);
+    aes128_ctr_xor_block(&ctx, 1, &INPUT[0..16], &mut computed[0..16]);
+    aes128_ctr_xor_block(&ctx, 2, &INPUT[16..32], &mut computed[16..32]);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[test]
+fn test_ctr_encrypt() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::portable::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(feature = "simd128")]
+#[test]
+fn test_ctr_encrypt_neon() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::neon::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(all(feature = "simd256", feature = "std"))]
+#[test]
+fn test_ctr_encrypt_intel() {
+    let mut computed: [u8; 32] = [0u8; 32];
+    aes128_ctr_encrypt::<platform::x64::State>(&KEY, &NONCE, 1, &INPUT, &mut computed);
+    for i in 0..32 {
+        if computed[i] != EXPECTED[i] {
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
diff --git a/aesgcm/src/gf128.rs b/aesgcm/src/gf128.rs
new file mode 100644
index 000000000..fe2c4d0cc
--- /dev/null
+++ b/aesgcm/src/gf128.rs
@@ -0,0 +1,75 @@
+//! Generic Gf128 implementation.
+//!
+//! Generic over platform dependent [`GF128FieldElement`].
+
+use crate::{aes::AES_BLOCK_LEN, platform::*};
+
+#[cfg(test)]
+mod test;
+
+/// Generic Gf128 state.
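+///
+/// Holds the GHASH key `r` and a running accumulator: each 16-byte block `b`
+/// updates the state as `acc = (acc ^ b) * r`, with multiplication in
+/// GF(2^128) modulo the GHASH polynomial `x^128 + x^7 + x^2 + x + 1`.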
+pub(crate) struct GF128State<T: GF128FieldElement> {
+    accumulator: T,
+    r: T,
+}
+
+const KEY_LEN: usize = AES_BLOCK_LEN;
+
+impl<T: GF128FieldElement> GF128State<T> {
+    #[inline]
+    pub(crate) fn init(key: &[u8]) -> Self {
+        debug_assert!(key.len() == KEY_LEN);
+
+        Self {
+            accumulator: T::zero(),
+            r: T::load_element(key),
+        }
+    }
+
+    #[inline]
+    pub(crate) fn update(&mut self, block: &[u8]) {
+        debug_assert!(block.len() == KEY_LEN);
+
+        let block_elem = T::load_element(block);
+        self.accumulator.add(&block_elem);
+        self.accumulator.mul(&self.r);
+    }
+
+    #[inline]
+    pub(crate) fn update_blocks(&mut self, input: &[u8]) {
+        debug_assert!(input.len().is_multiple_of(AES_BLOCK_LEN));
+
+        let blocks = input.len() / AES_BLOCK_LEN;
+        for i in 0..blocks {
+            let offset = i * AES_BLOCK_LEN;
+            self.update(&input[offset..offset + AES_BLOCK_LEN]);
+        }
+    }
+
+    #[inline]
+    pub(crate) fn update_last(&mut self, partial_block: &[u8]) {
+        debug_assert!(partial_block.len() < 16);
+
+        let mut block = [0u8; 16];
+        block[0..partial_block.len()].copy_from_slice(partial_block);
+        self.update(&block);
+    }
+
+    #[inline]
+    pub(crate) fn update_padded(&mut self, input: &[u8]) {
+        let blocks = input.len() / AES_BLOCK_LEN;
+        self.update_blocks(&input[0..blocks * AES_BLOCK_LEN]);
+
+        let last = input.len() - input.len() % AES_BLOCK_LEN;
+        if last < input.len() {
+            self.update_last(&input[last..]);
+        }
+    }
+
+    #[inline]
+    pub(crate) fn emit(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+
+        self.accumulator.store_element(out);
+    }
+}
diff --git a/aesgcm/src/gf128/test.rs b/aesgcm/src/gf128/test.rs
new file mode 100644
index 000000000..a44b8b425
--- /dev/null
+++ b/aesgcm/src/gf128/test.rs
@@ -0,0 +1,85 @@
+use super::*;
+
+fn gf128<T: GF128FieldElement>(key: &[u8], input: &[u8], out: &mut [u8]) {
+    debug_assert!(key.len() == 16);
+    debug_assert!(out.len() == 16);
+
+    let mut st = GF128State::<T>::init(key);
+    st.update_padded(input);
+    st.emit(out);
+}
+
+const INPUT: [u8; 132] = [
+    0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+    0xab, 0xad, 0xda, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1, 0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20,
+    0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19, 0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4,
+    0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45, 0x2d, 0xa3, 0xeb, 0xf1, 0xc5, 0xd8, 0x2c, 0xde,
+    0xa2, 0x41, 0x89, 0x97, 0x20, 0x0e, 0xf8, 0x2e, 0x5a, 0x8d, 0xef, 0x2f, 0x0c, 0x9e, 0x53, 0xf1,
+    0xf7, 0x5d, 0x78, 0x53, 0x65, 0x9e, 0x2a, 0x20, 0xee, 0xb2, 0xb2, 0x2a, 0xaf, 0xde, 0x64, 0x19,
+    0xa0, 0x58, 0xab, 0x4f, 0x6f, 0x74, 0x6b, 0xf4, 0x0f, 0xc0, 0xc3, 0xb7, 0x80, 0xf2, 0x44, 0x45,
+    0x44, 0xae, 0x7e, 0x3f,
+];
+
+const KEY: [u8; 16] = [
+    0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7,
+];
+
+const EXPECTED: [u8; 16] = [
+    0xfb, 0xba, 0xaa, 0x70, 0xa0, 0x73, 0x6f, 0xf9, 0xed, 0x2f, 0xc4, 0x62, 0xde, 0x72, 0x61, 0xe0,
+];
+
+#[test]
+fn test_gf128() {
+    let mut computed: [u8; 16] = [0u8; 16];
+    gf128::<portable::FieldElement>(&KEY, &INPUT, &mut computed);
+    for i in 0..16 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
+            std::eprintln!(
+                "mismatch at {}: expected is {}, computed is {}",
+                i,
+                EXPECTED[i],
+                computed[i]
+            );
+            assert!(false);
+        }
+    }
+}
+
+#[cfg(feature = "simd128")]
+#[test]
+fn test_gf128_neon() {
+    let mut computed: [u8; 16] = [0u8; 16];
+    gf128::<neon::FieldElement>(&KEY, &INPUT, &mut computed);
+    for i in 0..16 {
+        if computed[i] != EXPECTED[i] {
+            #[cfg(feature = "std")]
"std")] + std::eprintln!( + "mismatch at {}: expected is {}, computed is {}", + i, + EXPECTED[i], + computed[i] + ); + assert!(false); + } + } +} + +#[cfg(all(feature = "simd256", feature = "std"))] +#[test] +fn test_gf128_intel() { + let mut computed: [u8; 16] = [0u8; 16]; + gf128::(&KEY, &INPUT, &mut computed); + for i in 0..16 { + if computed[i] != EXPECTED[i] { + std::eprintln!( + "mismatch at {}: expected is {}, computed is {}", + i, + EXPECTED[i], + computed[i] + ); + assert!(false); + } + } +} diff --git a/aesgcm/src/lib.rs b/aesgcm/src/lib.rs new file mode 100644 index 000000000..f289c67d1 --- /dev/null +++ b/aesgcm/src/lib.rs @@ -0,0 +1,476 @@ +#![no_std] +#![deny(unsafe_code)] + +#[cfg(feature = "std")] +extern crate std; + +mod aes; +mod ctr; +mod gf128; +mod platform; + +mod aes_gcm; +mod aes_gcm_128; +mod aes_gcm_256; + +use libcrux_traits::aead::{arrayref, consts, slice, typed_owned}; + +// TODO: should this trait be re-exported here? +pub use libcrux_traits::aead::arrayref::Aead; + +/// Trait for an AES State. +/// Implemented for 128 and 256. +pub(crate) trait State { + fn init(key: &[u8]) -> Self; + fn set_nonce(&mut self, nonce: &[u8]); + fn encrypt(&mut self, aad: &[u8], plaintext: &[u8], ciphertext: &mut [u8], tag: &mut [u8]); + fn decrypt( + &mut self, + aad: &[u8], + ciphertext: &[u8], + tag: &[u8], + plaintext: &mut [u8], + ) -> Result<(), DecryptError>; +} + +/// AES-GCM decryption error. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DecryptError(); + +/// AES-GCM 128. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct AesGcm128; + +/// Portable AES-GCM 128. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct PortableAesGcm128; + +/// Neon AES-GCM 128. +#[cfg(feature = "simd128")] +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct NeonAesGcm128; +#[cfg(not(feature = "simd128"))] +pub type NeonAesGcm128 = PortableAesGcm128; + +/// AES-NI AES-GCM 128. +#[cfg(feature = "simd256")] +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct X64AesGcm128; +#[cfg(not(feature = "simd256"))] +pub type X64AesGcm128 = PortableAesGcm128; + +/// AES-GCM 256. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct AesGcm256; + +/// Portable AES-GCM 256. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct PortableAesGcm256; + +/// Neon AES-GCM 256. +#[cfg(feature = "simd128")] +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct NeonAesGcm256; + +/// Neon AES-GCM 256. +#[cfg(not(feature = "simd128"))] +pub type NeonAesGcm256 = PortableAesGcm256; + +/// AES-NI AES-GCM 256. +#[derive(Clone, Copy, PartialEq, Eq)] +#[cfg(feature = "simd256")] +pub struct X64AesGcm256; + +/// AES-NI AES-GCM 256. +#[cfg(not(feature = "simd256"))] +pub type X64AesGcm256 = PortableAesGcm256; + +/// Tag length. +pub(crate) const TAG_LEN: usize = 16; + +/// Nonce length. +pub(crate) const NONCE_LEN: usize = 12; + +/// Generic AES-GCM encrypt. +pub(crate) fn encrypt( + key: &[u8], + nonce: &[u8], + aad: &[u8], + plaintext: &[u8], + ciphertext: &mut [u8], + tag: &mut [u8], +) { + debug_assert!(nonce.len() == NONCE_LEN); + debug_assert!(tag.len() == TAG_LEN); + + let mut st = S::init(key); + st.set_nonce(nonce); + st.encrypt(aad, plaintext, ciphertext, tag); +} + +/// Generic AES-GCM decrypt. 
+pub(crate) fn decrypt<S: State>(
+    key: &[u8],
+    nonce: &[u8],
+    aad: &[u8],
+    ciphertext: &[u8],
+    tag: &[u8],
+    plaintext: &mut [u8],
+) -> Result<(), DecryptError> {
+    debug_assert!(nonce.len() == NONCE_LEN);
+    debug_assert!(tag.len() == TAG_LEN);
+
+    let mut st = S::init(key);
+    st.set_nonce(nonce);
+    st.decrypt(aad, ciphertext, tag, plaintext)
+}
+
+/// Macro to instantiate the different variants, both 128/256 and platforms.
+macro_rules! pub_mod {
+    ($variant_comment:literal, $mod_name:ident, $state:ty) => {
+        #[doc = $variant_comment]
+        pub mod $mod_name {
+            use crate::$mod_name::KEY_LEN;
+            use crate::{platform, DecryptError};
+
+            type State = $state;
+
+            #[doc = $variant_comment]
+            /// encrypt.
+            pub fn encrypt(
+                key: &[u8],
+                nonce: &[u8],
+                aad: &[u8],
+                plaintext: &[u8],
+                ciphertext: &mut [u8],
+                tag: &mut [u8],
+            ) {
+                debug_assert!(key.len() == KEY_LEN);
+                crate::encrypt::<State>(key, nonce, aad, plaintext, ciphertext, tag);
+            }
+
+            #[doc = $variant_comment]
+            /// decrypt.
+            pub fn decrypt(
+                key: &[u8],
+                nonce: &[u8],
+                aad: &[u8],
+                ciphertext: &[u8],
+                tag: &[u8],
+                plaintext: &mut [u8],
+            ) -> Result<(), DecryptError> {
+                debug_assert!(key.len() == KEY_LEN);
+                crate::decrypt::<State>(key, nonce, aad, ciphertext, tag, plaintext)
+            }
+        }
+    };
+}
+
+pub mod portable {
+    pub_mod!(
+        r"AES-GCM 128 ",
+        aes_gcm_128,
+        crate::aes_gcm_128::State<platform::portable::State, platform::portable::FieldElement>
+    );
+    pub_mod!(
+        r"AES-GCM 256 ",
+        aes_gcm_256,
+        crate::aes_gcm_256::State<platform::portable::State, platform::portable::FieldElement>
+    );
+}
+
+#[cfg(feature = "simd128")]
+pub mod neon {
+    pub_mod!(
+        r"AES-GCM 128 ",
+        aes_gcm_128,
+        crate::aes_gcm_128::State<platform::neon::State, platform::neon::FieldElement>
+    );
+    pub_mod!(
+        r"AES-GCM 256 ",
+        aes_gcm_256,
+        crate::aes_gcm_256::State<platform::neon::State, platform::neon::FieldElement>
+    );
+}
+
+#[cfg(feature = "simd256")]
+pub mod x64 {
+    // Here we don't use the `pub_mod` macro because we need to add target features
+    // onto the functions.
+    macro_rules! x64_pub_mod {
+        ($variant_comment:literal, $mod_name:ident, $state:ty) => {
+            #[doc = $variant_comment]
+            pub mod $mod_name {
+                use crate::$mod_name::KEY_LEN;
+                use crate::{platform, DecryptError};
+
+                type State = $state;
+
+                #[doc = $variant_comment]
+                /// encrypt.
+                pub fn encrypt(
+                    key: &[u8],
+                    nonce: &[u8],
+                    aad: &[u8],
+                    plaintext: &[u8],
+                    ciphertext: &mut [u8],
+                    tag: &mut [u8],
+                ) {
+                    debug_assert!(key.len() == KEY_LEN);
+
+                    #[inline]
+                    #[target_feature(enable = "avx2", enable = "aes")]
+                    #[allow(unsafe_code)]
+                    unsafe fn inner(
+                        key: &[u8],
+                        nonce: &[u8],
+                        aad: &[u8],
+                        plaintext: &[u8],
+                        ciphertext: &mut [u8],
+                        tag: &mut [u8],
+                    ) {
+                        crate::encrypt::<State>(key, nonce, aad, plaintext, ciphertext, tag);
+                    }
+
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        inner(key, nonce, aad, plaintext, ciphertext, tag)
+                    };
+                }
+
+                #[doc = $variant_comment]
+                /// decrypt.
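+                /// Dispatches to an inner function compiled with the `avx2` and
+                /// `aes` target features; callers must make sure these CPU
+                /// features are available (the multiplexing API in `lib.rs`
+                /// checks this at runtime).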
+                pub fn decrypt(
+                    key: &[u8],
+                    nonce: &[u8],
+                    aad: &[u8],
+                    ciphertext: &[u8],
+                    tag: &[u8],
+                    plaintext: &mut [u8],
+                ) -> Result<(), DecryptError> {
+                    debug_assert!(key.len() == KEY_LEN);
+
+                    #[inline]
+                    #[target_feature(enable = "avx2", enable = "aes")]
+                    #[allow(unsafe_code)]
+                    unsafe fn inner(
+                        key: &[u8],
+                        nonce: &[u8],
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &[u8],
+                        plaintext: &mut [u8],
+                    ) -> Result<(), DecryptError> {
+                        crate::decrypt::<State>(key, nonce, aad, ciphertext, tag, plaintext)
+                    }
+
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        inner(key, nonce, aad, ciphertext, tag, plaintext)
+                    }
+                }
+            }
+        };
+    }
+
+    x64_pub_mod!(
+        r"AES-GCM 128 ",
+        aes_gcm_128,
+        crate::aes_gcm_128::State<platform::x64::State, platform::x64::FieldElement>
+    );
+    x64_pub_mod!(
+        r"AES-GCM 256 ",
+        aes_gcm_256,
+        crate::aes_gcm_256::State<platform::x64::State, platform::x64::FieldElement>
+    );
+}
+
+/// Macro to implement the libcrux_traits public API traits
+///
+/// For the blanket impl of `typed_refs::Aead` to take place,
+/// the `$type` must implement `Copy` and `PartialEq`.
+macro_rules! impl_traits_public_api {
+    ($type:ty, $keylen:expr, $taglen:expr, $noncelen:expr ) => {
+        // prerequisite for typed_owned::Aead
+        impl consts::AeadConsts for $type {
+            const KEY_LEN: usize = KEY_LEN;
+            const TAG_LEN: usize = TAG_LEN;
+            const NONCE_LEN: usize = NONCE_LEN;
+        }
+        // implement typed_owned::Aead
+        typed_owned::impl_aead_typed_owned!($type, KEY_LEN, TAG_LEN, NONCE_LEN);
+    };
+}
+
+/// Macro to implement the different structs and multiplexing.
+macro_rules! api {
+    ($mod_name:ident, $variant:ident, $multiplexing:ty, $portable:ident, $neon:ident, $x64:ident) => {
+        mod $mod_name {
+            use super::*;
+            use libcrux_traits::aead::arrayref::{DecryptError, EncryptError};
+            use $variant::KEY_LEN;
+
+            pub type Key = [u8; KEY_LEN];
+            pub type Tag = [u8; TAG_LEN];
+            pub type Nonce = [u8; NONCE_LEN];
+
+            mod _libcrux_traits_apis_multiplex {
+                use super::*;
+
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($multiplexing => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($multiplexing, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                impl arrayref::Aead for $multiplexing {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        // SIMD256 needs to come first because SIMD128 is true for
+                        // x64 as well, but we don't actually implement it.
+                        if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                            $x64::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        } else if libcrux_platform::simd128_support()
+                            && libcrux_platform::aes_ni_support()
+                        {
+                            $neon::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        } else {
+                            $portable::encrypt(ciphertext, tag, key, nonce, aad, plaintext)
+                        }
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        // SIMD256 needs to come first because SIMD128 is true for
+                        // x64 as well, but we don't actually implement it.
+                        if libcrux_platform::simd256_support() && libcrux_platform::aes_ni_support() {
+                            $x64::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        } else if libcrux_platform::simd128_support()
+                            && libcrux_platform::aes_ni_support()
+                        {
+                            $neon::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        } else {
+                            $portable::decrypt(plaintext, key, nonce, aad, ciphertext, tag)
+                        }
+                    }
+                }
+            }
+
+            mod _libcrux_traits_apis_portable {
+                use super::*;
+
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($portable => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($portable, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                impl arrayref::Aead for $portable {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        portable::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        portable::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
+                }
+            }
+
+            #[cfg(feature = "simd128")]
+            mod _libcrux_traits_apis_neon {
+                use super::*;
+
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($neon => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($neon, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                impl arrayref::Aead for $neon {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        neon::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        neon::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
+                }
+            }
+
+            #[cfg(feature = "simd256")]
+            mod _libcrux_traits_api_x64 {
+                use super::*;
+
+                // implement `libcrux_traits` slice trait
+                slice::impl_aead_slice_trait!($x64 => KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                // implement `libcrux_traits` public API traits
+                impl_traits_public_api!($x64, KEY_LEN, TAG_LEN, NONCE_LEN);
+
+                impl arrayref::Aead for $x64 {
+                    fn encrypt(
+                        ciphertext: &mut [u8],
+                        tag: &mut Tag,
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        plaintext: &[u8],
+                    ) -> Result<(), EncryptError> {
+                        x64::$variant::encrypt(key, nonce, aad, plaintext, ciphertext, tag);
+                        Ok(())
+                    }
+
+                    fn decrypt(
+                        plaintext: &mut [u8],
+                        key: &Key,
+                        nonce: &Nonce,
+                        aad: &[u8],
+                        ciphertext: &[u8],
+                        tag: &Tag,
+                    ) -> Result<(), DecryptError> {
+                        x64::$variant::decrypt(key, nonce, aad, ciphertext, tag, plaintext)
+                            .map_err(|_| DecryptError::InvalidTag)
+                    }
+                }
+            }
+        }
+    };
+}
+
+api!(
+    aes128,
+    aes_gcm_128,
+    AesGcm128,
+    PortableAesGcm128,
+    NeonAesGcm128,
+    X64AesGcm128
+);
+
+api!(
+    aes256,
+    aes_gcm_256,
+    AesGcm256,
+    PortableAesGcm256,
+    NeonAesGcm256,
+    X64AesGcm256
+);
diff --git a/aesgcm/src/platform.rs b/aesgcm/src/platform.rs
new file mode 100644
index 000000000..9c622c81a
--- /dev/null
+++ b/aesgcm/src/platform.rs
@@ -0,0 +1,33 @@
+//! Traits for platform dependent implementations
+
+pub(crate) mod portable;
+
+#[cfg(feature = "simd128")]
+pub(crate) mod neon;
+
+#[cfg(feature = "simd256")]
+pub(crate) mod x64;
+
+/// The AES state.
+pub(crate) trait AESState: Clone + core::fmt::Debug {
+    fn new() -> Self;
+    fn load_block(&mut self, b: &[u8]);
+    fn store_block(&self, out: &mut [u8]);
+    fn xor_block(&self, inp: &[u8], out: &mut [u8]);
+
+    fn xor_key(&mut self, key: &Self);
+    fn aes_enc(&mut self, key: &Self);
+    fn aes_enc_last(&mut self, key: &Self);
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self);
+    fn aes_keygen_assist1(&mut self, prev: &Self);
+    fn key_expansion_step(&mut self, prev: &Self);
+}
+
+/// A gf128 field element.
+pub(crate) trait GF128FieldElement {
+    fn zero() -> Self;
+    fn load_element(bytes: &[u8]) -> Self;
+    fn store_element(&self, bytes: &mut [u8]);
+    fn add(&mut self, other: &Self);
+    fn mul(&mut self, other: &Self);
+}
diff --git a/aesgcm/src/platform/neon.rs b/aesgcm/src/platform/neon.rs
new file mode 100644
index 000000000..37a24363c
--- /dev/null
+++ b/aesgcm/src/platform/neon.rs
@@ -0,0 +1,5 @@
+pub(crate) use aes_core::State;
+pub(crate) use gf128_core::FieldElement;
+
+mod aes_core;
+mod gf128_core;
diff --git a/aesgcm/src/platform/neon/aes_core.rs b/aesgcm/src/platform/neon/aes_core.rs
new file mode 100644
index 000000000..674b620c9
--- /dev/null
+++ b/aesgcm/src/platform/neon/aes_core.rs
@@ -0,0 +1,130 @@
+use libcrux_intrinsics::arm64::{
+    _uint8x16_t, _vaeseq_u8, _vaesmcq_u8, _vdupq_laneq_u32, _vdupq_n_u32, _vdupq_n_u8, _veorq_u32,
+    _veorq_u8, _vextq_u32, _vld1q_u32, _vld1q_u8, _vreinterpretq_u32_u8, _vreinterpretq_u8_u32,
+    _vst1q_u8,
+};
+
+/// The Neon state
+pub(crate) type State = _uint8x16_t;
+
+#[inline]
+fn new_state() -> State {
+    _vdupq_n_u8(0)
+}
+
+#[inline]
+fn xor_key1_state(st: &mut State, k: &State) {
+    *st = _veorq_u8(*st, *k);
+}
+
+#[inline]
+fn aes_enc(st: &mut State, key: &State) {
+    *st = _veorq_u8(_vaesmcq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0))), *key);
+}
+
+#[inline]
+fn aes_enc_last(st: &mut State, key: &State) {
+    *st = _veorq_u8(_vaeseq_u8(*st, _vdupq_n_u8(0)), *key)
+}
+
+#[inline]
+fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) {
+    let st = _vaeseq_u8(*prev, _vdupq_n_u8(0));
+    let mut tmp = [0u8; 16];
+    _vst1q_u8(&mut tmp, st);
+    let tmp_new = [
+        tmp[4], tmp[1], tmp[14], tmp[11], tmp[1], tmp[14], tmp[11], tmp[4], tmp[12], tmp[9],
+        tmp[6], tmp[3], tmp[9], tmp[6], tmp[3], tmp[12],
+    ];
+    let st_new = _vld1q_u8(&tmp_new);
+    let rcon_array = [0, rcon as u32, 0, rcon as u32];
+    let rcon_vec = _vreinterpretq_u8_u32(_vld1q_u32(&rcon_array));
+    *next = _veorq_u8(st_new, rcon_vec);
+}
+
+#[inline]
+fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) {
+    aes_keygen_assist(next, prev, rcon);
+    *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<3>(_vreinterpretq_u32_u8(*next)))
+}
+
+#[inline]
+fn aes_keygen_assist1(next: &mut State, prev: &State) {
+    aes_keygen_assist(next, prev, 0);
+    *next = _vreinterpretq_u8_u32(_vdupq_laneq_u32::<2>(_vreinterpretq_u32_u8(*next)));
+}
+
+#[inline]
+fn key_expansion_step(next: &mut State, prev: &State) {
+    let zero = _vdupq_n_u32(0);
+    let prev0 = _vreinterpretq_u32_u8(*prev);
+    let prev1 = _veorq_u32(prev0, _vextq_u32::<3>(zero, prev0));
+    let prev2 = _veorq_u32(prev1, _vextq_u32::<3>(zero, prev1));
+    let prev3 = _veorq_u32(prev2, _vextq_u32::<3>(zero, prev2));
+    *next = _veorq_u8(*next, _vreinterpretq_u8_u32(prev3));
+}
+
+impl crate::platform::AESState for State {
+    #[inline]
+    fn new() -> Self {
+        new_state()
+    }
+
+    #[inline]
+    fn load_block(&mut self, b: &[u8]) {
+        debug_assert!(b.len() == 16);
+        *self = _vld1q_u8(b);
+    }
+
+    #[inline]
+    fn store_block(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+        _vst1q_u8(out, *self);
+    }
+
+    #[inline]
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= 16);
+        // XXX: hot-fix to have enough input and output here.
+        // For some reason this doesn't fail even if we don't do this.
+        let mut block_in = [0u8; 16];
+        let mut block_out = [0u8; 16];
+        block_in[0..input.len()].copy_from_slice(input);
+
+        let inp_vec = _vld1q_u8(&block_in);
+        let out_vec = _veorq_u8(inp_vec, *self);
+        _vst1q_u8(&mut block_out, out_vec);
+
+        out.copy_from_slice(&block_out[0..out.len()]);
+    }
+
+    #[inline]
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    #[inline]
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);
+    }
+
+    #[inline]
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    #[inline]
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0(self, prev, RCON as u8);
+    }
+
+    #[inline]
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    #[inline]
+    fn key_expansion_step(&mut self, prev: &Self) {
+        key_expansion_step(self, prev);
+    }
+}
diff --git a/aesgcm/src/platform/neon/gf128_core.rs b/aesgcm/src/platform/neon/gf128_core.rs
new file mode 100644
index 000000000..3c9594dc9
--- /dev/null
+++ b/aesgcm/src/platform/neon/gf128_core.rs
@@ -0,0 +1,93 @@
+use libcrux_intrinsics::arm64::*;
+
+/// A Neon gf128 field element
+#[derive(Clone, Copy)]
+pub(crate) struct FieldElement(pub(crate) u128);
+
+#[inline]
+fn zero() -> FieldElement {
+    FieldElement(0)
+}
+
+#[inline]
+fn load_element(b: &[u8]) -> FieldElement {
+    debug_assert!(b.len() == 16);
+
+    FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
+}
+
+#[inline]
+fn store_element(element: &FieldElement, bytes: &mut [u8]) {
+    debug_assert!(bytes.len() == 16);
+
+    bytes.copy_from_slice(&element.0.to_be_bytes());
+}
+
+#[inline]
+fn add(element: &mut FieldElement, other: &FieldElement) {
+    element.0 ^= other.0;
+}
+
+#[inline]
+fn mul_wide(element: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+    let l0 = element.0 as u64;
+    let h0 = (element.0 >> 64) as u64;
+    let l1 = other.0 as u64;
+    let h1 = (other.0 >> 64) as u64;
+
+    let low: u128 = _vmull_p64(l0, l1);
+    let m1: u128 = _vmull_p64(l0, h1);
+    let m2: u128 = _vmull_p64(l1, h0);
+    let high: u128 = _vmull_p64(h0, h1);
+
+    let mid = m1 ^ m2;
+    let m0 = mid << 64;
+    let m1 = mid >> 64;
+    let low = low ^ m0;
+    let high = high ^ m1;
+
+    (FieldElement(high), FieldElement(low))
+}
+
+#[inline]
+fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
+    let high = (high.0 << 1) ^ (low.0 >> 127);
+    let low = low.0 << 1;
+    let x0_0 = low << 64;
+    let x1_x0 = low ^ (x0_0 << 63) ^ (x0_0 << 62) ^ (x0_0 << 57);
+    let x1_x0 = x1_x0 ^ (x1_x0 >> 1) ^ (x1_x0 >> 2) ^ (x1_x0 >> 7);
+    FieldElement(x1_x0 ^ high)
+}
+
+#[inline]
+fn mul(x: &mut FieldElement, y: &FieldElement) {
+    let (high, low) = mul_wide(x, y);
+    *x = reduce(&high, &low);
+}
+
+impl crate::platform::GF128FieldElement for FieldElement {
+    #[inline]
+    fn zero() -> Self {
+        zero()
+    }
+
+    #[inline]
+    fn load_element(b: &[u8]) -> Self {
+        load_element(b)
+    }
+
+    #[inline]
+    fn store_element(&self, b: &mut [u8]) {
+        store_element(self, b);
+    }
+
+    #[inline]
+    fn add(&mut self, other: &Self) {
+        add(self, other);
other); + } + + #[inline] + fn mul(&mut self, other: &Self) { + mul(self, other) + } +} diff --git a/aesgcm/src/platform/portable.rs b/aesgcm/src/platform/portable.rs new file mode 100644 index 000000000..37a24363c --- /dev/null +++ b/aesgcm/src/platform/portable.rs @@ -0,0 +1,5 @@ +pub(crate) use aes_core::State; +pub(crate) use gf128_core::FieldElement; + +mod aes_core; +mod gf128_core; diff --git a/aesgcm/src/platform/portable/aes_core.rs b/aesgcm/src/platform/portable/aes_core.rs new file mode 100644 index 000000000..7cc75160b --- /dev/null +++ b/aesgcm/src/platform/portable/aes_core.rs @@ -0,0 +1,523 @@ +#![allow(clippy::needless_range_loop)] + +use crate::aes::AES_BLOCK_LEN; + +#[cfg(test)] +mod test; + +pub(crate) type State = [u16; 8]; + +#[inline] +fn new_state() -> State { + [0u16; 8] +} + +#[inline] +fn interleave_u8_1(i0: u8, i1: u8) -> u16 { + let mut x = i0 as u16; + + x = (x | (x << 4)) & 0x0F0F; + x = (x | (x << 2)) & 0x3333; + x = (x | (x << 1)) & 0x5555; + + let mut y = i1 as u16; + + y = (y | (y << 4)) & 0x0F0F; + y = (y | (y << 2)) & 0x3333; + y = (y | (y << 1)) & 0x5555; + + x | (y << 1) +} + +#[inline] +fn deinterleave_u8_1(i0: u16) -> (u8, u8) { + let mut x = i0 & 0x5555; + + x = (x | (x >> 1)) & 0x3333; + x = (x | (x >> 2)) & 0x0F0F; + x = (x | (x >> 4)) & 0x00FF; + + let mut y = (i0 >> 1) & 0x5555; + + y = (y | (y >> 1)) & 0x3333; + y = (y | (y >> 2)) & 0x0F0F; + y = (y | (y >> 4)) & 0x00FF; + + (x as u8, y as u8) +} + +#[inline] +fn interleave_u16_2(i0: u16, i1: u16) -> (u16, u16) { + let x = ((i1 & 0x3333) << 2) | (i0 & 0x3333); + let y = ((i0 & 0xcccc) >> 2) | (i1 & 0xcccc); + (x, y) +} + +#[inline] +fn interleave_u16_4(i0: u16, i1: u16) -> (u16, u16) { + let x = ((i1 & 0x0F0F) << 4) | (i0 & 0x0F0F); + let y = ((i0 & 0xF0F0) >> 4) | (i1 & 0xF0F0); + (x, y) +} + +#[inline] +fn interleave_u16_8(i0: u16, i1: u16) -> (u16, u16) { + let x = ((i1 & 0x00FF) << 8) | (i0 & 0x00FF); + let y = ((i0 & 0xFF00) >> 8) | (i1 & 0xFF00); + (x, y) +} + +#[inline] +fn transpose_u8x16(input: &[u8; 16], output: &mut [u16; 8]) { + let o0 = interleave_u8_1(input[0], input[1]); + let o1 = interleave_u8_1(input[2], input[3]); + let o2 = interleave_u8_1(input[4], input[5]); + let o3 = interleave_u8_1(input[6], input[7]); + let o4 = interleave_u8_1(input[8], input[9]); + let o5 = interleave_u8_1(input[10], input[11]); + let o6 = interleave_u8_1(input[12], input[13]); + let o7 = interleave_u8_1(input[14], input[15]); + + let (o0, o1) = interleave_u16_2(o0, o1); + let (o2, o3) = interleave_u16_2(o2, o3); + let (o4, o5) = interleave_u16_2(o4, o5); + let (o6, o7) = interleave_u16_2(o6, o7); + + let (o0, o2) = interleave_u16_4(o0, o2); + let (o1, o3) = interleave_u16_4(o1, o3); + let (o4, o6) = interleave_u16_4(o4, o6); + let (o5, o7) = interleave_u16_4(o5, o7); + + let (o0, o4) = interleave_u16_8(o0, o4); + let (o1, o5) = interleave_u16_8(o1, o5); + let (o2, o6) = interleave_u16_8(o2, o6); + let (o3, o7) = interleave_u16_8(o3, o7); + + output[0] = o0; + output[1] = o1; + output[2] = o2; + output[3] = o3; + output[4] = o4; + output[5] = o5; + output[6] = o6; + output[7] = o7; +} + +#[inline] +fn transpose_u16x8(input: &[u16; 8], output: &mut [u8]) { + let (i0, i4) = interleave_u16_8(input[0], input[4]); + let (i1, i5) = interleave_u16_8(input[1], input[5]); + let (i2, i6) = interleave_u16_8(input[2], input[6]); + let (i3, i7) = interleave_u16_8(input[3], input[7]); + + let (i0, i2) = interleave_u16_4(i0, i2); + let (i1, i3) = interleave_u16_4(i1, i3); + let (i4, i6) = interleave_u16_4(i4, i6); 
+ let (i5, i7) = interleave_u16_4(i5, i7); + + let (i0, i1) = interleave_u16_2(i0, i1); + let (i2, i3) = interleave_u16_2(i2, i3); + let (i4, i5) = interleave_u16_2(i4, i5); + let (i6, i7) = interleave_u16_2(i6, i7); + + let (o0, o1) = deinterleave_u8_1(i0); + let (o2, o3) = deinterleave_u8_1(i1); + let (o4, o5) = deinterleave_u8_1(i2); + let (o6, o7) = deinterleave_u8_1(i3); + let (o8, o9) = deinterleave_u8_1(i4); + + let (o10, o11) = deinterleave_u8_1(i5); + let (o12, o13) = deinterleave_u8_1(i6); + let (o14, o15) = deinterleave_u8_1(i7); + + output[0] = o0; + output[1] = o1; + output[2] = o2; + output[3] = o3; + output[4] = o4; + output[5] = o5; + output[6] = o6; + output[7] = o7; + output[8] = o8; + output[9] = o9; + output[10] = o10; + output[11] = o11; + output[12] = o12; + output[13] = o13; + output[14] = o14; + output[15] = o15; +} + +#[inline] +fn xnor(a: u16, b: u16) -> u16 { + !(a ^ b) +} + +#[inline] +fn sub_bytes_state(st: &mut State) { + let u0 = st[7]; + let u1 = st[6]; + let u2 = st[5]; + let u3 = st[4]; + let u4 = st[3]; + let u5 = st[2]; + let u6 = st[1]; + let u7 = st[0]; + + let t1 = u6 ^ u4; + let t2 = u3 ^ u0; + let t3 = u1 ^ u2; + let t4 = u7 ^ t3; + let t5 = t1 ^ t2; + let t6 = u1 ^ u5; + let t7 = u0 ^ u6; + let t8 = t1 ^ t6; + let t9 = u6 ^ t4; + let t10 = u3 ^ t4; + let t11 = u7 ^ t5; + let t12 = t5 ^ t6; + let t13 = u2 ^ u5; + let t14 = t3 ^ t5; + let t15 = u5 ^ t7; + let t16 = u0 ^ u5; + let t17 = u7 ^ t8; + let t18 = u6 ^ u5; + let t19 = t2 ^ t18; + let t20 = t4 ^ t15; + let t21 = t1 ^ t13; + let t22 = u0 ^ t4; + let t39 = t21 ^ t5; + let t40 = t21 ^ t7; + let t41 = t7 ^ t19; + let t42 = t16 ^ t14; + let t43 = t22 ^ t17; + let t44 = t19 & t5; + let t45 = t20 & t11; + let t46 = t12 ^ t44; + let t47 = t10 & u7; + let t48 = t47 ^ t44; + let t49 = t7 & t21; + let t50 = t9 & t4; + let t51 = t40 ^ t49; + let t52 = t22 & t17; + let t53 = t52 ^ t49; + let t54 = t2 & t8; + let t55 = t41 & t39; + let t56 = t55 ^ t54; + let t57 = t16 & t14; + let t58 = t57 ^ t54; + let t59 = t46 ^ t45; + let t60 = t48 ^ t42; + let t61 = t51 ^ t50; + let t62 = t53 ^ t58; + let t63 = t59 ^ t56; + let t64 = t60 ^ t58; + let t65 = t61 ^ t56; + let t66 = t62 ^ t43; + let t67 = t65 ^ t66; + let t68 = t65 & t63; + let t69 = t64 ^ t68; + let t70 = t63 ^ t64; + let t71 = t66 ^ t68; + let t72 = t71 & t70; + let t73 = t69 & t67; + let t74 = t63 & t66; + let t75 = t70 & t74; + let t76 = t70 ^ t68; + let t77 = t64 & t65; + let t78 = t67 & t77; + let t79 = t67 ^ t68; + let t80 = t64 ^ t72; + let t81 = t75 ^ t76; + let t82 = t66 ^ t73; + let t83 = t78 ^ t79; + let t84 = t81 ^ t83; + let t85 = t80 ^ t82; + let t86 = t80 ^ t81; + let t87 = t82 ^ t83; + let t88 = t85 ^ t84; + let t89 = t87 & t5; + let t90 = t83 & t11; + let t91 = t82 & u7; + let t92 = t86 & t21; + let t93 = t81 & t4; + let t94 = t80 & t17; + let t95 = t85 & t8; + let t96 = t88 & t39; + let t97 = t84 & t14; + let t98 = t87 & t19; + let t99 = t83 & t20; + let t100 = t82 & t10; + let t101 = t86 & t7; + let t102 = t81 & t9; + let t103 = t80 & t22; + let t104 = t85 & t2; + let t105 = t88 & t41; + let t106 = t84 & t16; + let t107 = t104 ^ t105; + let t108 = t93 ^ t99; + let t109 = t96 ^ t107; + let t110 = t98 ^ t108; + let t111 = t91 ^ t101; + let t112 = t89 ^ t92; + let t113 = t107 ^ t112; + let t114 = t90 ^ t110; + let t115 = t89 ^ t95; + let t116 = t94 ^ t102; + let t117 = t97 ^ t103; + let t118 = t91 ^ t114; + let t119 = t111 ^ t117; + let t120 = t100 ^ t108; + let t121 = t92 ^ t95; + let t122 = t110 ^ t121; + let t123 = t106 ^ t119; + let 
t124 = t104 ^ t115; + let t125 = t111 ^ t116; + + let t128 = t94 ^ t107; + + let t131 = t93 ^ t101; + let t132 = t112 ^ t120; + + let t134 = t97 ^ t116; + let t135 = t131 ^ t134; + let t136 = t93 ^ t115; + + let t138 = t119 ^ t132; + let t140 = t114 ^ t136; + + let s0 = t109 ^ t122; + let s2 = xnor(t123, t124); + let s3 = t113 ^ t114; + let s4 = t118 ^ t128; + let s7 = xnor(t113, t125); + let s6 = xnor(t109, t135); + let s5 = t109 ^ t138; + let s1 = xnor(t109, t140); + + st[0] = s7; + st[1] = s6; + st[2] = s5; + st[3] = s4; + st[4] = s3; + st[5] = s2; + st[6] = s1; + st[7] = s0; +} + +#[inline] +fn shift_row_u16(input: u16) -> u16 { + (input & 0x1111) + | ((input & 0x2220) >> 4) + | ((input & 0x0002) << 12) + | ((input & 0x4400) >> 8) + | ((input & 0x0044) << 8) + | ((input & 0x8000) >> 12) + | ((input & 0x0888) << 4) +} + +#[inline] +fn shift_rows_state(st: &mut State) { + st[0] = shift_row_u16(st[0]); + st[1] = shift_row_u16(st[1]); + st[2] = shift_row_u16(st[2]); + st[3] = shift_row_u16(st[3]); + st[4] = shift_row_u16(st[4]); + st[5] = shift_row_u16(st[5]); + st[6] = shift_row_u16(st[6]); + st[7] = shift_row_u16(st[7]); +} + +#[inline] +fn mix_columns_state(st: &mut State) { + let mut last_col: u16 = 0; + + for i in 0..8 { + let col = st[i] ^ (((st[i] & 0xeeee) >> 1) | ((st[i] & 0x1111) << 3)); + st[i] = st[i] ^ last_col ^ col ^ (((col & 0xcccc) >> 2) | ((col & 0x3333) << 2)); + last_col = col; + } + + st[0] ^= last_col; + st[1] ^= last_col; + st[3] ^= last_col; + st[4] ^= last_col; +} + +#[inline] +fn xor_key1_state(st: &mut State, k: &State) { + st[0] ^= k[0]; + st[1] ^= k[1]; + st[2] ^= k[2]; + st[3] ^= k[3]; + st[4] ^= k[4]; + st[5] ^= k[5]; + st[6] ^= k[6]; + st[7] ^= k[7]; +} + +#[inline] +fn aes_enc(st: &mut State, key: &State) { + sub_bytes_state(st); + shift_rows_state(st); + mix_columns_state(st); + xor_key1_state(st, key) +} + +#[inline] +fn aes_enc_last(st: &mut State, key: &State) { + sub_bytes_state(st); + shift_rows_state(st); + xor_key1_state(st, key) +} + +#[inline] +fn aes_keygen_assisti(rcon: u8, i: usize, u: u16) -> u16 { + let u3 = u & 0xf000; + let n = u3 >> 12; + let n = ((n >> 1) | (n << 3)) & 0x000f; + let ri = ((rcon >> i) & 1) as u16; + let n = n ^ ri; + let n = n << 12; + n ^ (u3 >> 4) +} + +#[inline] +fn aes_keygen_assist(next: &mut State, prev: &State, rcon: u8) { + next.copy_from_slice(prev); + sub_bytes_state(next); + + next[0] = aes_keygen_assisti(rcon, 0, next[0]); + next[1] = aes_keygen_assisti(rcon, 1, next[1]); + next[2] = aes_keygen_assisti(rcon, 2, next[2]); + next[3] = aes_keygen_assisti(rcon, 3, next[3]); + next[4] = aes_keygen_assisti(rcon, 4, next[4]); + next[5] = aes_keygen_assisti(rcon, 5, next[5]); + next[6] = aes_keygen_assisti(rcon, 6, next[6]); + next[7] = aes_keygen_assisti(rcon, 7, next[7]); +} + +#[inline] +fn aes_keygen_assist0(next: &mut State, prev: &State, rcon: u8) { + aes_keygen_assist(next, prev, rcon); + + #[inline] + fn aux(mut n: u16) -> u16 { + n &= 0xf000; + n ^= n >> 4; + n ^= n >> 8; + n + } + + next[0] = aux(next[0]); + next[1] = aux(next[1]); + next[2] = aux(next[2]); + next[3] = aux(next[3]); + next[4] = aux(next[4]); + next[5] = aux(next[5]); + next[6] = aux(next[6]); + next[7] = aux(next[7]); +} + +#[inline] +fn aes_keygen_assist1(next: &mut State, prev: &State) { + aes_keygen_assist(next, prev, 0); + + #[inline] + fn aux(mut n: u16) -> u16 { + n &= 0x0f00; + n ^= n << 4; + n ^= n >> 8; + n + } + + next[0] = aux(next[0]); + next[1] = aux(next[1]); + next[2] = aux(next[2]); + next[3] = aux(next[3]); + next[4] = 
aux(next[4]);
+    next[5] = aux(next[5]);
+    next[6] = aux(next[6]);
+    next[7] = aux(next[7]);
+}
+
+#[inline]
+fn key_expand1(p: u16, n: u16) -> u16 {
+    let p = p ^ ((p & 0x0fff) << 4) ^ ((p & 0x00ff) << 8) ^ ((p & 0x000f) << 12);
+    n ^ p
+}
+
+#[inline]
+fn key_expansion_step(next: &mut State, prev: &State) {
+    next[0] = key_expand1(prev[0], next[0]);
+    next[1] = key_expand1(prev[1], next[1]);
+    next[2] = key_expand1(prev[2], next[2]);
+    next[3] = key_expand1(prev[3], next[3]);
+    next[4] = key_expand1(prev[4], next[4]);
+    next[5] = key_expand1(prev[5], next[5]);
+    next[6] = key_expand1(prev[6], next[6]);
+    next[7] = key_expand1(prev[7], next[7]);
+}
+
+impl crate::platform::AESState for State {
+    #[inline]
+    fn new() -> Self {
+        new_state()
+    }
+
+    #[inline]
+    fn load_block(&mut self, b: &[u8]) {
+        debug_assert!(b.len() == 16);
+
+        transpose_u8x16(b.try_into().unwrap(), self);
+    }
+
+    #[inline]
+    fn store_block(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == AES_BLOCK_LEN, "out.len() = {}", out.len());
+
+        transpose_u16x8(self, out);
+    }
+
+    #[inline]
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= AES_BLOCK_LEN);
+
+        let mut block = [0u8; AES_BLOCK_LEN];
+        self.store_block(&mut block);
+
+        for i in 0..input.len() {
+            out[i] = input[i] ^ block[i];
+        }
+    }
+
+    #[inline]
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    #[inline]
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);
+    }
+
+    #[inline]
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    #[inline]
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0(self, prev, RCON as u8);
+    }
+
+    #[inline]
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    #[inline]
+    fn key_expansion_step(&mut self, prev: &Self) {
+        key_expansion_step(self, prev)
+    }
+}
diff --git a/aesgcm/src/platform/portable/aes_core/test.rs b/aesgcm/src/platform/portable/aes_core/test.rs
new file mode 100644
index 000000000..c5524fc4f
--- /dev/null
+++ b/aesgcm/src/platform/portable/aes_core/test.rs
@@ -0,0 +1,773 @@
+use super::*;
+
+#[allow(non_snake_case)]
+fn sub_bytes_inv_state(st: &mut State) {
+    let U0 = st[7];
+    let U1 = st[6];
+    let U2 = st[5];
+    let U3 = st[4];
+    let U4 = st[3];
+    let U5 = st[2];
+    let U6 = st[1];
+    let U7 = st[0];
+
+    let T23 = U0 ^ U3;
+    let T22 = xnor(U1, U3);
+    let T2 = xnor(U0, U1);
+    let T1 = U3 ^ U4;
+    let T24 = xnor(U4, U7);
+    let R5 = U6 ^ U7;
+    let T8 = xnor(U1, T23);
+    let T19 = T22 ^ R5;
+    let T9 = xnor(U7, T1);
+    let T10 = T2 ^ T24;
+    let T13 = T2 ^ R5;
+    let T3 = T1 ^ R5;
+    let T25 = xnor(U2, T1);
+    let R13 = U1 ^ U6;
+    let T17 = xnor(U2, T19);
+    let T20 = T24 ^ R13;
+    let T4 = U4 ^ T8;
+    let R17 = xnor(U2, U5);
+    let R18 = xnor(U5, U6);
+    let R19 = xnor(U2, U4);
+    let Y5 = U0 ^ R17;
+    let T6 = T22 ^ R17;
+    let T16 = R13 ^ R19;
+    let T27 = T1 ^ R18;
+    let T15 = T10 ^ T27;
+    let T14 = T10 ^ R18;
+    let T26 = T3 ^ T16;
+    let M1 = T13 & T6;
+    let M2 = T23 & T8;
+    let M3 = T14 ^ M1;
+    let M4 = T19 & Y5;
+    let M5 = M4 ^ M1;
+    let M6 = T3 & T16;
+    let M7 = T22 & T9;
+    let M8 = T26 ^ M6;
+    let M9 = T20 & T17;
+    let M10 = M9 ^ M6;
+    let M11 = T1 & T15;
+    let M12 = T4 & T27;
+    let M13 = M12 ^ M11;
+    let M14 = T2 & T10;
+    let M15 = M14 ^ M11;
+    let M16 = M3 ^ M2;
+    let M17 = M5 ^ T24;
+    let M18 = M8 ^ M7;
+    let M19 = M10 ^ M15;
+    let M20 = M16 ^ M13;
+    let M21 = M17 ^ M15;
+    let M22 = M18 ^ M13;
+    let M23 = M19 ^ T25;
+    let M24 = M22 ^ M23;
+    let M25 = M22 &
M20; + let M26 = M21 ^ M25; + let M27 = M20 ^ M21; + let M28 = M23 ^ M25; + let M29 = M28 & M27; + let M30 = M26 & M24; + let M31 = M20 & M23; + let M32 = M27 & M31; + let M33 = M27 ^ M25; + let M34 = M21 & M22; + let M35 = M24 & M34; + let M36 = M24 ^ M25; + let M37 = M21 ^ M29; + let M38 = M32 ^ M33; + let M39 = M23 ^ M30; + let M40 = M35 ^ M36; + let M41 = M38 ^ M40; + let M42 = M37 ^ M39; + let M43 = M37 ^ M38; + let M44 = M39 ^ M40; + let M45 = M42 ^ M41; + let M46 = M44 & T6; + let M47 = M40 & T8; + let M48 = M39 & Y5; + let M49 = M43 & T16; + let M50 = M38 & T9; + let M51 = M37 & T17; + let M52 = M42 & T15; + let M53 = M45 & T27; + let M54 = M41 & T10; + let M55 = M44 & T13; + let M56 = M40 & T23; + let M57 = M39 & T19; + let M58 = M43 & T3; + let M59 = M38 & T22; + let M60 = M37 & T20; + let M61 = M42 & T1; + let M62 = M45 & T4; + let M63 = M41 & T2; + let P0 = M52 ^ M61; + let P1 = M58 ^ M59; + let P2 = M54 ^ M62; + let P3 = M47 ^ M50; + let P4 = M48 ^ M56; + let P5 = M46 ^ M51; + let P6 = M49 ^ M60; + let P7 = P0 ^ P1; + let P8 = M50 ^ M53; + let P9 = M55 ^ M63; + let P10 = M57 ^ P4; + let P11 = P0 ^ P3; + let P12 = M46 ^ M48; + let P13 = M49 ^ M51; + let P14 = M49 ^ M62; + let P15 = M54 ^ M59; + let P16 = M57 ^ M61; + let P17 = M58 ^ P2; + let P18 = M63 ^ P5; + let P19 = P2 ^ P3; + let P20 = P4 ^ P6; + let P22 = P2 ^ P7; + let P23 = P7 ^ P8; + let P24 = P5 ^ P7; + let P25 = P6 ^ P10; + let P26 = P9 ^ P11; + let P27 = P10 ^ P18; + let P28 = P11 ^ P25; + let P29 = P15 ^ P20; + let W0 = P13 ^ P22; + let W1 = P26 ^ P29; + let W2 = P17 ^ P28; + let W3 = P12 ^ P22; + let W4 = P23 ^ P27; + let W5 = P19 ^ P24; + let W6 = P14 ^ P23; + let W7 = P9 ^ P16; + + st[0] = W7; + st[1] = W6; + st[2] = W5; + st[3] = W4; + st[4] = W3; + st[5] = W2; + st[6] = W1; + st[7] = W0; +} + +fn sbox_fwd(s: u8) -> u8 { + match s { + 0 => 0x63, + 1 => 0x7c, + 2 => 0x77, + 3 => 0x7b, + 4 => 0xf2, + 5 => 0x6b, + 6 => 0x6f, + 7 => 0xc5, + 8 => 0x30, + 9 => 0x01, + 10 => 0x67, + 11 => 0x2b, + 12 => 0xfe, + 13 => 0xd7, + 14 => 0xab, + 15 => 0x76, + 16 => 0xca, + 17 => 0x82, + 18 => 0xc9, + 19 => 0x7d, + 20 => 0xfa, + 21 => 0x59, + 22 => 0x47, + 23 => 0xf0, + 24 => 0xad, + 25 => 0xd4, + 26 => 0xa2, + 27 => 0xaf, + 28 => 0x9c, + 29 => 0xa4, + 30 => 0x72, + 31 => 0xc0, + 32 => 0xb7, + 33 => 0xfd, + 34 => 0x93, + 35 => 0x26, + 36 => 0x36, + 37 => 0x3f, + 38 => 0xf7, + 39 => 0xcc, + 40 => 0x34, + 41 => 0xa5, + 42 => 0xe5, + 43 => 0xf1, + 44 => 0x71, + 45 => 0xd8, + 46 => 0x31, + 47 => 0x15, + 48 => 0x04, + 49 => 0xc7, + 50 => 0x23, + 51 => 0xc3, + 52 => 0x18, + 53 => 0x96, + 54 => 0x05, + 55 => 0x9a, + 56 => 0x07, + 57 => 0x12, + 58 => 0x80, + 59 => 0xe2, + 60 => 0xeb, + 61 => 0x27, + 62 => 0xb2, + 63 => 0x75, + 64 => 0x09, + 65 => 0x83, + 66 => 0x2c, + 67 => 0x1a, + 68 => 0x1b, + 69 => 0x6e, + 70 => 0x5a, + 71 => 0xa0, + 72 => 0x52, + 73 => 0x3b, + 74 => 0xd6, + 75 => 0xb3, + 76 => 0x29, + 77 => 0xe3, + 78 => 0x2f, + 79 => 0x84, + 80 => 0x53, + 81 => 0xd1, + 82 => 0x00, + 83 => 0xed, + 84 => 0x20, + 85 => 0xfc, + 86 => 0xb1, + 87 => 0x5b, + 88 => 0x6a, + 89 => 0xcb, + 90 => 0xbe, + 91 => 0x39, + 92 => 0x4a, + 93 => 0x4c, + 94 => 0x58, + 95 => 0xcf, + 96 => 0xd0, + 97 => 0xef, + 98 => 0xaa, + 99 => 0xfb, + 100 => 0x43, + 101 => 0x4d, + 102 => 0x33, + 103 => 0x85, + 104 => 0x45, + 105 => 0xf9, + 106 => 0x02, + 107 => 0x7f, + 108 => 0x50, + 109 => 0x3c, + 110 => 0x9f, + 111 => 0xa8, + 112 => 0x51, + 113 => 0xa3, + 114 => 0x40, + 115 => 0x8f, + 116 => 0x92, + 117 => 0x9d, + 118 => 0x38, + 119 => 0xf5, + 120 => 0xbc, + 121 
=> 0xb6, + 122 => 0xda, + 123 => 0x21, + 124 => 0x10, + 125 => 0xff, + 126 => 0xf3, + 127 => 0xd2, + 128 => 0xcd, + 129 => 0x0c, + 130 => 0x13, + 131 => 0xec, + 132 => 0x5f, + 133 => 0x97, + 134 => 0x44, + 135 => 0x17, + 136 => 0xc4, + 137 => 0xa7, + 138 => 0x7e, + 139 => 0x3d, + 140 => 0x64, + 141 => 0x5d, + 142 => 0x19, + 143 => 0x73, + 144 => 0x60, + 145 => 0x81, + 146 => 0x4f, + 147 => 0xdc, + 148 => 0x22, + 149 => 0x2a, + 150 => 0x90, + 151 => 0x88, + 152 => 0x46, + 153 => 0xee, + 154 => 0xb8, + 155 => 0x14, + 156 => 0xde, + 157 => 0x5e, + 158 => 0x0b, + 159 => 0xdb, + 160 => 0xe0, + 161 => 0x32, + 162 => 0x3a, + 163 => 0x0a, + 164 => 0x49, + 165 => 0x06, + 166 => 0x24, + 167 => 0x5c, + 168 => 0xc2, + 169 => 0xd3, + 170 => 0xac, + 171 => 0x62, + 172 => 0x91, + 173 => 0x95, + 174 => 0xe4, + 175 => 0x79, + 176 => 0xe7, + 177 => 0xc8, + 178 => 0x37, + 179 => 0x6d, + 180 => 0x8d, + 181 => 0xd5, + 182 => 0x4e, + 183 => 0xa9, + 184 => 0x6c, + 185 => 0x56, + 186 => 0xf4, + 187 => 0xea, + 188 => 0x65, + 189 => 0x7a, + 190 => 0xae, + 191 => 0x08, + 192 => 0xba, + 193 => 0x78, + 194 => 0x25, + 195 => 0x2e, + 196 => 0x1c, + 197 => 0xa6, + 198 => 0xb4, + 199 => 0xc6, + 200 => 0xe8, + 201 => 0xdd, + 202 => 0x74, + 203 => 0x1f, + 204 => 0x4b, + 205 => 0xbd, + 206 => 0x8b, + 207 => 0x8a, + 208 => 0x70, + 209 => 0x3e, + 210 => 0xb5, + 211 => 0x66, + 212 => 0x48, + 213 => 0x03, + 214 => 0xf6, + 215 => 0x0e, + 216 => 0x61, + 217 => 0x35, + 218 => 0x57, + 219 => 0xb9, + 220 => 0x86, + 221 => 0xc1, + 222 => 0x1d, + 223 => 0x9e, + 224 => 0xe1, + 225 => 0xf8, + 226 => 0x98, + 227 => 0x11, + 228 => 0x69, + 229 => 0xd9, + 230 => 0x8e, + 231 => 0x94, + 232 => 0x9b, + 233 => 0x1e, + 234 => 0x87, + 235 => 0xe9, + 236 => 0xce, + 237 => 0x55, + 238 => 0x28, + 239 => 0xdf, + 240 => 0x8c, + 241 => 0xa1, + 242 => 0x89, + 243 => 0x0d, + 244 => 0xbf, + 245 => 0xe6, + 246 => 0x42, + 247 => 0x68, + 248 => 0x41, + 249 => 0x99, + 250 => 0x2d, + 251 => 0x0f, + 252 => 0xb0, + 253 => 0x54, + 254 => 0xbb, + 255 => 0x16, + } +} + +fn sbox_inv(s: u8) -> u8 { + match s { + 0 => 0x52, + 1 => 0x09, + 2 => 0x6a, + 3 => 0xd5, + 4 => 0x30, + 5 => 0x36, + 6 => 0xa5, + 7 => 0x38, + 8 => 0xbf, + 9 => 0x40, + 10 => 0xa3, + 11 => 0x9e, + 12 => 0x81, + 13 => 0xf3, + 14 => 0xd7, + 15 => 0xfb, + 16 => 0x7c, + 17 => 0xe3, + 18 => 0x39, + 19 => 0x82, + 20 => 0x9b, + 21 => 0x2f, + 22 => 0xff, + 23 => 0x87, + 24 => 0x34, + 25 => 0x8e, + 26 => 0x43, + 27 => 0x44, + 28 => 0xc4, + 29 => 0xde, + 30 => 0xe9, + 31 => 0xcb, + 32 => 0x54, + 33 => 0x7b, + 34 => 0x94, + 35 => 0x32, + 36 => 0xa6, + 37 => 0xc2, + 38 => 0x23, + 39 => 0x3d, + 40 => 0xee, + 41 => 0x4c, + 42 => 0x95, + 43 => 0x0b, + 44 => 0x42, + 45 => 0xfa, + 46 => 0xc3, + 47 => 0x4e, + 48 => 0x08, + 49 => 0x2e, + 50 => 0xa1, + 51 => 0x66, + 52 => 0x28, + 53 => 0xd9, + 54 => 0x24, + 55 => 0xb2, + 56 => 0x76, + 57 => 0x5b, + 58 => 0xa2, + 59 => 0x49, + 60 => 0x6d, + 61 => 0x8b, + 62 => 0xd1, + 63 => 0x25, + 64 => 0x72, + 65 => 0xf8, + 66 => 0xf6, + 67 => 0x64, + 68 => 0x86, + 69 => 0x68, + 70 => 0x98, + 71 => 0x16, + 72 => 0xd4, + 73 => 0xa4, + 74 => 0x5c, + 75 => 0xcc, + 76 => 0x5d, + 77 => 0x65, + 78 => 0xb6, + 79 => 0x92, + 80 => 0x6c, + 81 => 0x70, + 82 => 0x48, + 83 => 0x50, + 84 => 0xfd, + 85 => 0xed, + 86 => 0xb9, + 87 => 0xda, + 88 => 0x5e, + 89 => 0x15, + 90 => 0x46, + 91 => 0x57, + 92 => 0xa7, + 93 => 0x8d, + 94 => 0x9d, + 95 => 0x84, + 96 => 0x90, + 97 => 0xd8, + 98 => 0xab, + 99 => 0x00, + 100 => 0x8c, + 101 => 0xbc, + 102 => 0xd3, + 103 => 0x0a, + 104 => 0xf7, + 105 => 0xe4, + 106 
=> 0x58, + 107 => 0x05, + 108 => 0xb8, + 109 => 0xb3, + 110 => 0x45, + 111 => 0x06, + 112 => 0xd0, + 113 => 0x2c, + 114 => 0x1e, + 115 => 0x8f, + 116 => 0xca, + 117 => 0x3f, + 118 => 0x0f, + 119 => 0x02, + 120 => 0xc1, + 121 => 0xaf, + 122 => 0xbd, + 123 => 0x03, + 124 => 0x01, + 125 => 0x13, + 126 => 0x8a, + 127 => 0x6b, + 128 => 0x3a, + 129 => 0x91, + 130 => 0x11, + 131 => 0x41, + 132 => 0x4f, + 133 => 0x67, + 134 => 0xdc, + 135 => 0xea, + 136 => 0x97, + 137 => 0xf2, + 138 => 0xcf, + 139 => 0xce, + 140 => 0xf0, + 141 => 0xb4, + 142 => 0xe6, + 143 => 0x73, + 144 => 0x96, + 145 => 0xac, + 146 => 0x74, + 147 => 0x22, + 148 => 0xe7, + 149 => 0xad, + 150 => 0x35, + 151 => 0x85, + 152 => 0xe2, + 153 => 0xf9, + 154 => 0x37, + 155 => 0xe8, + 156 => 0x1c, + 157 => 0x75, + 158 => 0xdf, + 159 => 0x6e, + 160 => 0x47, + 161 => 0xf1, + 162 => 0x1a, + 163 => 0x71, + 164 => 0x1d, + 165 => 0x29, + 166 => 0xc5, + 167 => 0x89, + 168 => 0x6f, + 169 => 0xb7, + 170 => 0x62, + 171 => 0x0e, + 172 => 0xaa, + 173 => 0x18, + 174 => 0xbe, + 175 => 0x1b, + 176 => 0xfc, + 177 => 0x56, + 178 => 0x3e, + 179 => 0x4b, + 180 => 0xc6, + 181 => 0xd2, + 182 => 0x79, + 183 => 0x20, + 184 => 0x9a, + 185 => 0xdb, + 186 => 0xc0, + 187 => 0xfe, + 188 => 0x78, + 189 => 0xcd, + 190 => 0x5a, + 191 => 0xf4, + 192 => 0x1f, + 193 => 0xdd, + 194 => 0xa8, + 195 => 0x33, + 196 => 0x88, + 197 => 0x07, + 198 => 0xc7, + 199 => 0x31, + 200 => 0xb1, + 201 => 0x12, + 202 => 0x10, + 203 => 0x59, + 204 => 0x27, + 205 => 0x80, + 206 => 0xec, + 207 => 0x5f, + 208 => 0x60, + 209 => 0x51, + 210 => 0x7f, + 211 => 0xa9, + 212 => 0x19, + 213 => 0xb5, + 214 => 0x4a, + 215 => 0x0d, + 216 => 0x2d, + 217 => 0xe5, + 218 => 0x7a, + 219 => 0x9f, + 220 => 0x93, + 221 => 0xc9, + 222 => 0x9c, + 223 => 0xef, + 224 => 0xa0, + 225 => 0xe0, + 226 => 0x3b, + 227 => 0x4d, + 228 => 0xae, + 229 => 0x2a, + 230 => 0xf5, + 231 => 0xb0, + 232 => 0xc8, + 233 => 0xeb, + 234 => 0xbb, + 235 => 0x3c, + 236 => 0x83, + 237 => 0x53, + 238 => 0x99, + 239 => 0x61, + 240 => 0x17, + 241 => 0x2b, + 242 => 0x04, + 243 => 0x7e, + 244 => 0xba, + 245 => 0x77, + 246 => 0xd6, + 247 => 0x26, + 248 => 0xe1, + 249 => 0x69, + 250 => 0x14, + 251 => 0x63, + 252 => 0x55, + 253 => 0x21, + 254 => 0x0c, + 255 => 0x7d, + } +} + +use rand_core::{OsRng, RngCore}; + +use crate::platform::portable::aes_core::transpose_u8x16; + +fn get_bit_u8(x: &[u8], i: usize, j: usize) -> u8 { + (x[i] >> j) & 0x1 +} + +fn get_bit_u16(x: &[u16], i: usize, j: usize) -> u8 { + ((x[j] >> i) & 0x1) as u8 +} + +#[test] +fn test_transpose() { + let mut x = [0u8; 16]; + OsRng.fill_bytes(&mut x); + let mut y = [0u16; 8]; + transpose_u8x16(&x, &mut y); + for i in 0..16 { + for j in 0..8 { + if get_bit_u8(&x, i, j) != get_bit_u16(&y, i, j) { + #[cfg(feature = "std")] + { + std::eprintln!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j)); + std::eprintln!("y[{},{}] = {}", i, j, get_bit_u16(&y, i, j)); + } + assert!(false); + } else { + #[cfg(feature = "std")] + std::eprintln!("transpose ok: {},{}", i, j); + } + } + } + let mut z = [0u8; 16]; + transpose_u16x8(&y, &mut z); + for i in 0..16 { + for j in 0..8 { + if get_bit_u8(&x, i, j) != get_bit_u8(&z, i, j) { + #[cfg(feature = "std")] + { + std::eprintln!("x[{},{}] = {}", i, j, get_bit_u8(&x, i, j)); + std::eprintln!("z[{},{}] = {}", i, j, get_bit_u8(&z, i, j)); + } + assert!(false); + } else { + #[cfg(feature = "std")] + std::eprintln!("inv-transpose ok: {},{}", i, j); + } + } + } +} + +#[test] +fn test_sbox() { + let mut x = [0u8; 16]; + let mut y = [0u16; 8]; + let mut w = [0u8; 16]; + 
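+    // Feed each byte value through the bitsliced S-box (at two different
+    // byte positions of the state) and compare position 0 against the
+    // reference lookup table.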
for i in 0..=255 { + x[0] = i; + x[9] = i; + transpose_u8x16(&x, &mut y); + sub_bytes_state(&mut y); + transpose_u16x8(&y, &mut w); + if w[0] != sbox_fwd(i as u8) { + #[cfg(feature = "std")] + std::eprintln!("sbox[{}] = {}, should be {}", i, w[0], sbox_fwd(i as u8)); + assert!(false); + } else { + #[cfg(feature = "std")] + std::eprintln!("sbox ok {}", i) + } + } +} + +#[test] +fn test_sbox_inv() { + let mut x = [0u8; 16]; + let mut y = [0u16; 8]; + let mut w = [0u8; 16]; + for i in 0..=255 { + x[0] = i; + x[9] = i; + transpose_u8x16(&x, &mut y); + sub_bytes_inv_state(&mut y); + transpose_u16x8(&y, &mut w); + if w[0] != sbox_inv(i as u8) { + #[cfg(feature = "std")] + std::eprintln!( + "sbox_inv[{}] = {}, should be {}", + i, + w[0], + sbox_inv(i as u8) + ); + assert!(false); + } else { + #[cfg(feature = "std")] + std::eprintln!("sbox inv ok {}", i) + } + } +} diff --git a/aesgcm/src/platform/portable/gf128_core.rs b/aesgcm/src/platform/portable/gf128_core.rs new file mode 100644 index 000000000..d2523efa9 --- /dev/null +++ b/aesgcm/src/platform/portable/gf128_core.rs @@ -0,0 +1,90 @@ +/// A portable gf128 field element. +pub(crate) type FieldElement = u128; + +#[inline] +fn zero() -> FieldElement { + 0 +} + +#[inline] +fn load_element(bytes: &[u8]) -> FieldElement { + debug_assert!(bytes.len() == 16); + + u128::from_be_bytes(bytes.try_into().unwrap()) +} + +#[inline] +fn store_element(element: &FieldElement, bytes: &mut [u8]) { + debug_assert!(bytes.len() == 16); + bytes.copy_from_slice(&u128::to_be_bytes(*element)); +} + +#[inline] +fn add(element: &FieldElement, other: &FieldElement) -> FieldElement { + element ^ other +} + +#[inline] +fn ith_bit_mask(elem: &FieldElement, i: usize) -> FieldElement { + debug_assert!(i < 128); + + let bit: u16 = ((elem >> (127 - i)) as u16) & 0x1; + let bit_mask16 = (!bit).wrapping_add(1); + let bit_mask32 = (bit_mask16 as u32) ^ ((bit_mask16 as u32) << 16); + let bit_mask64 = (bit_mask32 as u64) ^ ((bit_mask32 as u64) << 32); + + (bit_mask64 as u128) ^ ((bit_mask64 as u128) << 64) +} + +const IRRED: FieldElement = 0xE100_0000_0000_0000_0000_0000_0000_0000; + +#[inline] +fn mul_x(elem: &mut FieldElement) { + let mask = ith_bit_mask(elem, 127); + *elem = (*elem >> 1) ^ (IRRED & mask) +} + +#[inline] +fn mul_step(x: &FieldElement, y: &mut FieldElement, i: usize, result: &mut FieldElement) { + debug_assert!(i < 128); + let mask = ith_bit_mask(x, i); + *result ^= *y & mask; + mul_x(y); +} + +#[inline] +fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement { + let mut result = 0; + let mut multiplicand = *y; + for i in 0..128 { + mul_step(x, &mut multiplicand, i, &mut result) + } + result +} + +impl crate::platform::GF128FieldElement for FieldElement { + #[inline] + fn zero() -> Self { + zero() + } + + #[inline] + fn load_element(bytes: &[u8]) -> Self { + load_element(bytes) + } + + #[inline] + fn store_element(&self, bytes: &mut [u8]) { + store_element(self, bytes); + } + + #[inline] + fn add(&mut self, other: &Self) { + *self = add(self, other); + } + + #[inline] + fn mul(&mut self, other: &Self) { + *self = mul(self, other) + } +} diff --git a/aesgcm/src/platform/x64.rs b/aesgcm/src/platform/x64.rs new file mode 100644 index 000000000..2cb276213 --- /dev/null +++ b/aesgcm/src/platform/x64.rs @@ -0,0 +1,5 @@ +mod aes_core; +mod gf128_core; + +pub(crate) use aes_core::State; +pub(crate) use gf128_core::FieldElement; diff --git a/aesgcm/src/platform/x64/aes_core.rs b/aesgcm/src/platform/x64/aes_core.rs new file mode 100644 index 000000000..3ab931900 --- 
/dev/null
+++ b/aesgcm/src/platform/x64/aes_core.rs
@@ -0,0 +1,141 @@
+use core::arch::x86_64::*;
+
+use libcrux_intrinsics::avx2::{
+    mm_aesenc_si128, mm_aesenclast_si128, mm_aeskeygenassist_si128, mm_loadu_si128,
+    mm_setzero_si128, mm_shuffle_epi32, mm_slli_si128, mm_storeu_si128_u8, mm_xor_si128,
+};
+
+/// The avx2 state.
+pub(crate) type State = __m128i;
+
+#[inline]
+fn new_state() -> State {
+    mm_setzero_si128()
+}
+
+#[inline]
+fn xor_key1_state(st: &mut State, k: &State) {
+    *st = mm_xor_si128(*st, *k);
+}
+
+#[inline]
+fn aes_enc(st: &mut State, key: &State) {
+    *st = mm_aesenc_si128(*st, *key);
+}
+
+#[inline]
+fn aes_enc_last(st: &mut State, key: &State) {
+    *st = mm_aesenclast_si128(*st, *key);
+}
+
+#[inline]
+fn aes_keygen_assist<const RCON: i32>(next: &mut State, prev: &State) {
+    *next = mm_aeskeygenassist_si128::<RCON>(*prev);
+}
+
+#[inline]
+fn aes_keygen_assist0<const RCON: i32>(next: &mut State, prev: &State) {
+    aes_keygen_assist::<RCON>(next, prev);
+    *next = mm_shuffle_epi32::<0xff>(*next);
+}
+
+#[inline]
+fn aes_keygen_assist1(next: &mut State, prev: &State) {
+    aes_keygen_assist::<0>(next, prev);
+    *next = mm_shuffle_epi32::<0xaa>(*next);
+}
+
+#[inline]
+fn key_expansion_step(next: &mut State, prev: &State) {
+    let p0 = mm_xor_si128(*prev, mm_slli_si128::<4>(*prev));
+    let p1 = mm_xor_si128(p0, mm_slli_si128::<4>(p0));
+    let p2 = mm_xor_si128(p1, mm_slli_si128::<4>(p1));
+    *next = mm_xor_si128(*next, p2);
+}
+
+impl crate::platform::AESState for State {
+    #[inline]
+    fn new() -> Self {
+        new_state()
+    }
+
+    #[inline]
+    fn load_block(&mut self, b: &[u8]) {
+        debug_assert!(b.len() == 16);
+
+        *self = mm_loadu_si128(b);
+    }
+
+    #[inline]
+    fn store_block(&self, out: &mut [u8]) {
+        debug_assert!(out.len() == 16);
+
+        mm_storeu_si128_u8(out, *self);
+    }
+
+    #[inline]
+    fn xor_block(&self, input: &[u8], out: &mut [u8]) {
+        debug_assert!(input.len() == out.len() && input.len() <= 16);
+        // XXX: hot-fix: route partial blocks through full 16-byte buffers so
+        // the unaligned vector load/store below always have enough input and
+        // output.
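+        // E.g. for a 5-byte final block, `block_in` holds the 5 message bytes
+        // followed by 11 zeros; the XOR below runs over all 16 bytes, and only
+        // the first 5 bytes of `block_out` are copied back into `out`.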
+        let mut block_in = [0u8; 16];
+        let mut block_out = [0u8; 16];
+        block_in[0..input.len()].copy_from_slice(input);
+
+        let inp_vec = mm_loadu_si128(&block_in);
+        let out_vec = mm_xor_si128(inp_vec, *self);
+        mm_storeu_si128_u8(&mut block_out, out_vec);
+
+        out.copy_from_slice(&block_out[0..out.len()]);
+    }
+
+    #[inline]
+    fn xor_key(&mut self, key: &Self) {
+        xor_key1_state(self, key);
+    }
+
+    #[inline]
+    fn aes_enc(&mut self, key: &Self) {
+        aes_enc(self, key);
+    }
+
+    #[inline]
+    fn aes_enc_last(&mut self, key: &Self) {
+        aes_enc_last(self, key);
+    }
+
+    #[inline]
+    fn aes_keygen_assist0<const RCON: i32>(&mut self, prev: &Self) {
+        aes_keygen_assist0::<RCON>(self, prev);
+    }
+
+    #[inline]
+    fn aes_keygen_assist1(&mut self, prev: &Self) {
+        aes_keygen_assist1(self, prev);
+    }
+
+    #[inline]
+    fn key_expansion_step(&mut self, prev: &Self) {
+        key_expansion_step(self, prev)
+    }
+}
+
+#[cfg(feature = "std")]
+#[allow(unsafe_code)]
+#[test]
+fn test() {
+    unsafe {
+        let x = _mm_set_epi32(3, 2, 1, 0);
+        let y = _mm_shuffle_epi32(x, 0xaa);
+        let w = _mm_slli_si128(x, 4);
+        let mut z: [i32; 4] = [0; 4];
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, x);
+
+        std::eprintln!("{:?}", z);
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, w);
+
+        std::eprintln!("slli_si128 by 4 bytes {:?}", z);
+        _mm_storeu_si128(z.as_mut_ptr() as *mut __m128i, y);
+
+        std::eprintln!("shuffle aa {:?}", z);
+    }
+}
diff --git a/aesgcm/src/platform/x64/gf128_core.rs b/aesgcm/src/platform/x64/gf128_core.rs
new file mode 100644
index 000000000..919d7c6d2
--- /dev/null
+++ b/aesgcm/src/platform/x64/gf128_core.rs
@@ -0,0 +1,184 @@
+use core::arch::x86_64::*;
+
+use libcrux_intrinsics::avx2::{
+    mm_clmulepi64_si128, mm_slli_si128, mm_srli_si128, mm_unpackhi_epi64, mm_unpacklo_epi64,
+    mm_xor_si128,
+};
+
+// XXX: A lot of the code below is shared with NEON. Refactor!
+
+/// An avx2 gf128 field element.
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+pub(crate) struct FieldElement(pub(super) u128);
+
+impl FieldElement {
+    /// Transmute `u128` and `__m128i`.
+    #[inline]
+    #[allow(unsafe_code)]
+    fn transmute(&self) -> __m128i {
+        unsafe { core::mem::transmute(self.0) }
+    }
+
+    /// Convert a vec to self.
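+    /// (Like `transmute` above, this is a plain bit copy: `u128` and
+    /// `__m128i` are both 128 bits wide.)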
+    #[inline]
+    #[allow(unsafe_code)]
+    fn from_vec128(vec: __m128i) -> Self {
+        unsafe { core::mem::transmute(vec) }
+    }
+}
+
+#[inline]
+fn zero() -> FieldElement {
+    FieldElement(0)
+}
+
+#[inline]
+fn load_element(b: &[u8]) -> FieldElement {
+    debug_assert!(b.len() == 16);
+
+    FieldElement(u128::from_be_bytes(b.try_into().unwrap()))
+}
+
+#[inline]
+fn store_element(elem: &FieldElement, b: &mut [u8]) {
+    debug_assert!(b.len() == 16);
+
+    b.copy_from_slice(&elem.0.to_be_bytes());
+}
+
+#[inline]
+fn add(elem: &FieldElement, other: &FieldElement) -> FieldElement {
+    FieldElement((*elem).0 ^ (*other).0)
+}
+
+// #[inline]
+// fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+//     let lhs: __m128i = unsafe { core::mem::transmute((*elem).0) };
+//     let rhs: __m128i = unsafe { core::mem::transmute((*other).0) };
+
+//     let low = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x11) };
+//     let mid0 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x10) };
+//     let mid1 = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x01) };
+//     let high = unsafe { _mm_clmulepi64_si128(lhs, rhs, 0x00) };
+//     let mid = unsafe { _mm_xor_si128(mid0, mid1) };
+//     let m0 = unsafe { _mm_srli_si128(mid, 8) };
+//     let m1 = unsafe { _mm_slli_si128(mid, 8) };
+//     let low = unsafe { _mm_xor_si128(low, m0) };
+//     let high = unsafe { _mm_xor_si128(high, m1) };
+
+//     let low128: u128 = unsafe { core::mem::transmute(low) };
+//     let high128: u128 = unsafe { core::mem::transmute(high) };
+
+//     (FieldElement(low128), FieldElement(high128))
+// }
+
+/// Performs a 128x128 to 256-bit carry-less multiplication.
+///
+/// This implementation uses the Karatsuba algorithm to reduce the number of expensive
+/// PCLMULQDQ instructions from 4 to 3. On most modern x64 CPUs (Intel Sandy
+/// Bridge and newer, AMD Zen and newer), this results in higher performance due to
+/// better utilization of execution ports and potentially lower overall latency.
+///
+/// Takes the field elements `elem` and `other` and returns a tuple
+/// `(high, low)` containing the 256-bit result.
+#[inline]
+fn mul_wide(elem: &FieldElement, other: &FieldElement) -> (FieldElement, FieldElement) {
+    // Let the inputs be a = (a_hi << 64) | a_lo and b = (b_hi << 64) | b_lo.
+    // The product is (a_hi*b_hi << 128) + ((a_lo*b_hi ^ a_hi*b_lo) << 64) + a_lo*b_lo.
+    // The Karatsuba trick computes the middle term using the other two products:
+    // (a_lo*b_hi ^ a_hi*b_lo) = (a_lo^a_hi)*(b_lo^b_hi) ^ a_lo*b_lo ^ a_hi*b_hi
+
+    let a: __m128i = elem.transmute();
+    let b: __m128i = other.transmute();
+
+    // 1. Calculate the low and high 128-bit parts of the product in parallel.
+    // p_lo = a_lo * b_lo
+    let p_lo = mm_clmulepi64_si128::<0x00>(a, b);
+    // p_hi = a_hi * b_hi
+    let p_hi = mm_clmulepi64_si128::<0x11>(a, b);
+
+    // 2. Calculate the middle term using the third multiplication.
+    // First, prepare the operands (a_lo^a_hi) and (b_lo^b_hi).
+    // Using unpack instructions is an alternative to shuffling.
+    let a_xor = mm_xor_si128(mm_unpackhi_epi64(a, a), mm_unpacklo_epi64(a, a));
+    let b_xor = mm_xor_si128(mm_unpackhi_epi64(b, b), mm_unpacklo_epi64(b, b));
+
+    // Multiply the low 64-bit parts of the XORed results.
+    // p_mid_prod = (a_lo^a_hi) * (b_lo^b_hi)
+    let p_mid_prod = mm_clmulepi64_si128::<0x00>(a_xor, b_xor);
+
+    // Finish computing the middle term by XORing with p_lo and p_hi.
+    let p_mid = mm_xor_si128(mm_xor_si128(p_mid_prod, p_lo), p_hi);
+
+    // 3. Combine the parts to get the final 256-bit result.
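+    // (Worked example, illustrative: for a = x^64, i.e. a_hi = 1, a_lo = 0,
+    // and b = 1, the three products are p_lo = 0, p_hi = 0 and
+    // p_mid = 1*1 ^ 0 ^ 0 = 1, so the recombination below gives
+    // res_low = 1 << 64 = x^64 and res_high = 0, as expected.)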
+    // The middle part is XORed at a 64-bit offset.
+    // res_low  = p_lo ^ (p_mid << 64)
+    // res_high = p_hi ^ (p_mid >> 64)
+    let res_low = mm_xor_si128(p_lo, mm_slli_si128::<8>(p_mid));
+    let res_high = mm_xor_si128(p_hi, mm_srli_si128::<8>(p_mid));
+
+    // The original function returned (high_part, low_part). We maintain that order.
+    (
+        FieldElement::from_vec128(res_high),
+        FieldElement::from_vec128(res_low),
+    )
+}
+
+#[inline]
+fn reduce(high: &FieldElement, low: &FieldElement) -> FieldElement {
+    let high = ((*high).0 << 1) ^ ((*low).0 >> 127);
+    let low = (*low).0 << 1;
+    let x0_0 = low << 64;
+    let x1_x0 = low ^ (x0_0 << 63) ^ (x0_0 << 62) ^ (x0_0 << 57);
+    let x1_x0 = x1_x0 ^ (x1_x0 >> 1) ^ (x1_x0 >> 2) ^ (x1_x0 >> 7);
+    FieldElement(x1_x0 ^ high)
+}
+
+#[inline]
+fn mul(x: &FieldElement, y: &FieldElement) -> FieldElement {
+    let (high, low) = mul_wide(x, y);
+    reduce(&high, &low)
+}
+
+impl crate::platform::GF128FieldElement for FieldElement {
+    #[inline]
+    fn zero() -> Self {
+        zero()
+    }
+
+    #[inline]
+    fn load_element(b: &[u8]) -> Self {
+        load_element(b)
+    }
+
+    #[inline]
+    fn store_element(&self, b: &mut [u8]) {
+        store_element(self, b);
+    }
+
+    #[inline]
+    fn add(&mut self, other: &Self) {
+        *self = add(self, other);
+    }
+
+    #[inline]
+    fn mul(&mut self, other: &Self) {
+        *self = mul(self, other)
+    }
+}
+
+#[allow(unsafe_code)]
+#[cfg(feature = "std")]
+#[test]
+fn test_transmute() {
+    let x = 1u128 << 64 ^ 2u128;
+    let xv: __m128i = unsafe { core::mem::transmute(x) };
+    let xv: __m128i = unsafe { _mm_slli_si128(xv, 8) };
+    let x: u128 = unsafe { core::mem::transmute(xv) };
+    std::eprintln!("trans {:x}", x);
+    let mut u64s = [0u64; 2];
+    unsafe { _mm_storeu_si128(u64s.as_mut_ptr() as *mut __m128i, xv) };
+    std::eprintln!("store {:?}", u64s)
+}
diff --git a/aesgcm/tests/key_centric.rs b/aesgcm/tests/key_centric.rs
new file mode 100644
index 000000000..3e3551d99
--- /dev/null
+++ b/aesgcm/tests/key_centric.rs
@@ -0,0 +1,48 @@
+use libcrux_traits::aead::consts;
+use libcrux_traits::aead::typed_owned;
+use libcrux_traits::aead::typed_refs;
+
+use libcrux_aesgcm::AesGcm128;
+
+type Key = typed_owned::Key<AesGcm128>;
+type Nonce = typed_owned::Nonce<AesGcm128>;
+type Tag = typed_owned::Tag<AesGcm128>;
+
+#[test]
+fn test_key_centric_owned() {
+    use consts::AeadConsts as _;
+
+    let k: Key = [0; AesGcm128::KEY_LEN].into();
+    let nonce: Nonce = [0; AesGcm128::NONCE_LEN].into();
+    let mut tag: Tag = [0; AesGcm128::TAG_LEN].into();
+
+    let pt = b"the quick brown fox jumps over the lazy dog";
+    let mut ct = [0; 43];
+    let mut pt_out = [0; 43];
+
+    k.encrypt(&mut ct, &mut tag, &nonce, b"", pt).unwrap();
+    k.decrypt(&mut pt_out, &nonce, b"", &ct, &tag).unwrap();
+    assert_eq!(pt, &pt_out);
+}
+
+#[test]
+fn test_key_centric_refs() {
+    use consts::AeadConsts as _;
+    use typed_refs::Aead as _;
+
+    let algo = AesGcm128;
+
+    let mut tag_bytes = [0; AesGcm128::TAG_LEN];
+    let key = algo.new_key(&[0; AesGcm128::KEY_LEN]).unwrap();
+    let tag = algo.new_tag_mut(&mut tag_bytes).unwrap();
+    let nonce = algo.new_nonce(&[0; AesGcm128::NONCE_LEN]).unwrap();
+
+    let pt = b"the quick brown fox jumps over the lazy dog";
+    let mut ct = [0; 43];
+    let mut pt_out = [0; 43];
+
+    key.encrypt(&mut ct, tag, nonce, b"", pt).unwrap();
+    let tag = algo.new_tag(&tag_bytes).unwrap();
+    key.decrypt(&mut pt_out, nonce, b"", &ct, tag).unwrap();
+    assert_eq!(pt, &pt_out);
+}
diff --git a/aesgcm/tests/wycheproof.rs b/aesgcm/tests/wycheproof.rs
new file mode 100644
index 000000000..66fb40c65
--- /dev/null
+++ b/aesgcm/tests/wycheproof.rs
@@ -0,0 +1,93 @@
+use libcrux_aesgcm::Aead;
+use wycheproof::{aead::Test, TestResult};
+
+#[test]
+fn test() {
+    let test_set = wycheproof::aead::TestSet::load(wycheproof::aead::TestName::AesGcm).unwrap();
+
+    fn run<const KEY_LEN: usize, Cipher: Aead<KEY_LEN, 16, 12>>(test: &Test) {
+        let mut ciphertext = vec![0u8; test.pt.len()];
+        let mut plaintext = vec![0u8; test.pt.len()];
+        let mut tag = [0u8; 16];
+
+        Cipher::encrypt(
+            &mut ciphertext,
+            &mut tag,
+            test.key.as_ref().try_into().unwrap(),
+            test.nonce.as_ref().try_into().unwrap(),
+            &test.aad,
+            &test.pt,
+        )
+        .unwrap();
+        Cipher::decrypt(
+            &mut plaintext,
+            test.key.as_ref().try_into().unwrap(),
+            test.nonce.as_ref().try_into().unwrap(),
+            &test.aad,
+            &ciphertext,
+            tag.as_ref().try_into().unwrap(),
+        )
+        .unwrap();
+
+        assert_eq!(plaintext.as_slice(), test.pt.as_slice());
+
+        if test.result == TestResult::Valid {
+            assert_eq!(test.ct.as_slice(), &ciphertext);
+            assert_eq!(test.tag.as_slice(), &tag);
+        } else {
+            let ct_ok = test.ct.as_slice() == &ciphertext;
+            let tag_ok = test.tag.as_slice() == &tag;
+            assert!(!ct_ok || !tag_ok);
+        }
+    }
+
+    for test_group in test_set.test_groups {
+        println!(
+            "* Group key size:{} tag size:{} nonce size:{}",
+            test_group.key_size, test_group.tag_size, test_group.nonce_size,
+        );
+
+        if test_group.nonce_size != 96 {
+            println!("  Skipping unsupported nonce size");
+            continue;
+        }
+
+        if test_group.key_size == 128 {
+            for test in test_group.tests {
+                println!("  Test AES-GCM 128 {}", test.tc_id);
+
+                // Multiplexing
+                run::<16, libcrux_aesgcm::AesGcm128>(&test);
+
+                // Portable
+                run::<16, libcrux_aesgcm::PortableAesGcm128>(&test);
+
+                // Neon
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+                run::<16, libcrux_aesgcm::NeonAesGcm128>(&test);
+
+                // x64
+                #[cfg(all(target_arch = "x86_64"))]
+                run::<16, libcrux_aesgcm::X64AesGcm128>(&test);
+            }
+        } else if test_group.key_size == 256 {
+            for test in test_group.tests {
+                println!("  Test AES-GCM 256 {}", test.tc_id);
+
+                // Multiplexing
+                run::<32, libcrux_aesgcm::AesGcm256>(&test);
+
+                // Portable
+                run::<32, libcrux_aesgcm::PortableAesGcm256>(&test);
+
+                // Neon
+                #[cfg(all(target_arch = "aarch64", target_feature = "aes"))]
+                run::<32, libcrux_aesgcm::NeonAesGcm256>(&test);
+
+                // x64
+                #[cfg(all(target_arch = "x86_64"))]
+                run::<32, libcrux_aesgcm::X64AesGcm256>(&test);
+            }
+        }
+    }
+}
diff --git a/fstar-helpers/core-models/src/core_arch/x86.rs b/fstar-helpers/core-models/src/core_arch/x86.rs
index aa4740856..f2f729e0d 100644
--- a/fstar-helpers/core-models/src/core_arch/x86.rs
+++ b/fstar-helpers/core-models/src/core_arch/x86.rs
@@ -260,6 +260,48 @@ pub mod sse2 {
     pub fn _mm_movemask_epi8(_: __m128i) -> i32 {
         unimplemented!()
     }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
+    #[hax_lib::opaque]
+    pub fn _mm_unpacklo_epi64(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
+    #[hax_lib::opaque]
+    pub fn _mm_unpackhi_epi64(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+    #[hax_lib::opaque]
+    pub fn _mm_shuffle_epi32<const IMM8: i32>(_: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_srli_si128<const IMM8: i32>(_: __m128i) -> __m128i
{
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_slli_si128<const IMM8: i32>(_: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_xor_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_setzero_si128() -> __m128i {
+        unimplemented!()
+    }
 }
 
 pub use avx::*;
@@ -696,6 +738,34 @@ pub mod avx2 {
     }
 }
 
+pub use other::*;
+pub mod other {
+    use super::*;
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aeskeygenassist_si128(_: __m128i, _: i32) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aesenclast_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_aesenc_si128(_: __m128i, _: __m128i) -> __m128i {
+        unimplemented!()
+    }
+
+    /// [Intel Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128)
+    #[hax_lib::opaque]
+    pub fn _mm_clmulepi64_si128(_: __m128i, _: __m128i, _: i32) -> __m128i {
+        unimplemented!()
+    }
+}
 /// Rewrite lemmas
 const _: () = {
     #[hax_lib::fstar::before("[@@ $REWRITE_RULE ]")]
diff --git a/libcrux-intrinsics/src/arm64.rs b/libcrux-intrinsics/src/arm64.rs
index f6bbefb04..94d8abccd 100644
--- a/libcrux-intrinsics/src/arm64.rs
+++ b/libcrux-intrinsics/src/arm64.rs
@@ -4,6 +4,7 @@ use core::arch::aarch64::*;
 pub type _int16x8_t = int16x8_t;
 pub type _uint32x4_t = uint32x4_t;
 pub type _uint64x2_t = uint64x2_t;
+pub type _uint8x16_t = uint8x16_t;
 
 #[inline(always)]
 pub fn _vdupq_n_s16(i: i16) -> int16x8_t {
@@ -171,6 +172,16 @@ pub fn _vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
     unsafe { vreinterpretq_u32_s32(a) }
 }
 
+#[inline(always)]
+pub fn _vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+    unsafe { vreinterpretq_u32_u8(a) }
+}
+
+#[inline(always)]
+pub fn _vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t {
+    unsafe { vreinterpretq_u8_u32(a) }
+}
+
 #[inline(always)]
 pub fn _vshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
     unsafe { vshrq_n_u32::<N>(a) }
@@ -270,6 +281,12 @@ pub fn _vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
 pub fn _vld1q_u8(ptr: &[u8]) -> uint8x16_t {
     unsafe { vld1q_u8(ptr.as_ptr()) }
 }
+
+#[inline(always)]
+pub fn _vld1q_u32(ptr: &[u32]) -> uint32x4_t {
+    unsafe { vld1q_u32(ptr.as_ptr()) }
+}
+
 #[inline(always)]
 pub fn _vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t {
     unsafe { vreinterpretq_u8_s16(a) }
@@ -351,6 +368,7 @@ pub fn _vld1q_u16(ptr: &[u16]) -> uint16x8_t {
 pub fn _vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
     unsafe { vcleq_s16(a, b) }
 }
+
 #[inline(always)]
 pub fn _vaddvq_u16(a: uint16x8_t) -> u16 {
     unsafe { vaddvq_u16(a) }
@@ -374,6 +392,16 @@ pub fn _vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
     _veorq_u64(a, _veorq_u64(_vshlq_n_u64::<1>(b), _vshrq_n_u64::<63>(b)))
 }
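+// NEON helpers for the AES-GCM backend: vector XOR/extract/duplicate, the AES
+// round primitives (`vaeseq_u8`, `vaesmcq_u8`), and the 64x64-bit carry-less
+// multiply (`vmull_p64`).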
+#[inline]
+pub fn _veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    unsafe { veorq_u32(a, b) }
+}
+
+#[inline]
+pub fn _vextq_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    unsafe { vextq_u32(a, b, N) }
+}
+
 #[inline(always)]
 pub fn _veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
     #[cfg(all(
@@ -431,3 +459,33 @@ pub fn _vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t {
     )))]
     _veorq_u64(a, _vbicq_u64(b, c))
 }
+
+#[inline(always)]
+pub fn _vmull_p64(a: u64, b: u64) -> u128 {
+    unsafe { vmull_p64(a, b) }
+}
+
+#[inline]
+pub fn _veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    unsafe { veorq_u8(a, b) }
+}
+
+#[inline]
+pub fn _vaesmcq_u8(data: uint8x16_t) -> uint8x16_t {
+    unsafe { vaesmcq_u8(data) }
+}
+
+#[inline]
+pub fn _vaeseq_u8(data: uint8x16_t, key: uint8x16_t) -> uint8x16_t {
+    unsafe { vaeseq_u8(data, key) }
+}
+
+#[inline]
+pub fn _vdupq_n_u8(value: u8) -> uint8x16_t {
+    unsafe { vdupq_n_u8(value) }
+}
+
+#[inline]
+pub fn _vdupq_laneq_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+    unsafe { vdupq_laneq_u32(a, N) }
+}
diff --git a/libcrux-intrinsics/src/avx2.rs b/libcrux-intrinsics/src/avx2.rs
index 9acb73ddb..b04a71d87 100644
--- a/libcrux-intrinsics/src/avx2.rs
+++ b/libcrux-intrinsics/src/avx2.rs
@@ -57,6 +57,16 @@ pub fn mm_storeu_si128(output: &mut [i16], vector: Vec128) {
     }
 }
 
+#[hax_lib::opaque]
+#[inline(always)]
+pub fn mm_storeu_si128_u8(output: &mut [u8], vector: Vec128) {
+    #[cfg(not(hax))]
+    debug_assert!(output.len() >= 16);
+    unsafe {
+        _mm_storeu_si128(output.as_mut_ptr() as *mut Vec128, vector);
+    }
+}
+
 #[hax_lib::opaque]
 #[inline(always)]
 pub fn mm_storeu_si128_i32(output: &mut [i32], vector: Vec128) {
@@ -115,6 +125,11 @@ pub fn mm256_setzero_si256() -> Vec256 {
     unsafe { _mm256_setzero_si256() }
 }
 
+#[inline(always)]
+pub fn mm_setzero_si128() -> Vec128 {
+    unsafe { _mm_setzero_si128() }
+}
+
 #[inline(always)]
 pub fn mm256_set_m128i(hi: Vec128, lo: Vec128) -> Vec256 {
     unsafe { _mm256_set_m128i(hi, lo) }
@@ -439,6 +454,12 @@ pub fn mm256_xor_si256(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_xor_si256(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_xor_si128(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_xor_si128(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_srai_epi16<const SHIFT_BY: i32>(vector: Vec256) -> Vec256 {
@@ -504,6 +525,22 @@ pub fn mm256_slli_epi32<const SHIFT_BY: i32>(vector: Vec256) -> Vec256 {
     unsafe { _mm256_slli_epi32::<SHIFT_BY>(vector) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_slli_si128<const SHIFT_BY: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(SHIFT_BY >= 0 && SHIFT_BY < 16);
+    unsafe { _mm_slli_si128::<SHIFT_BY>(vector) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_srli_si128<const SHIFT_BY: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(SHIFT_BY >= 0 && SHIFT_BY < 16);
+    unsafe { _mm_srli_si128::<SHIFT_BY>(vector) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm_shuffle_epi8(vector: Vec128, control: Vec128) -> Vec128 {
@@ -524,6 +561,14 @@ pub fn mm256_shuffle_epi32<const CONTROL: i32>(vector: Vec256) -> Vec256 {
     unsafe { _mm256_shuffle_epi32::<CONTROL>(vector) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_shuffle_epi32<const CONTROL: i32>(vector: Vec128) -> Vec128 {
+    #[cfg(not(hax))]
+    debug_assert!(CONTROL >= 0 && CONTROL < 256);
+    unsafe { _mm_shuffle_epi32::<CONTROL>(vector) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_permute4x64_epi64<const CONTROL: i32>(vector: Vec256) -> Vec256 {
@@ -538,6 +583,12 @@ pub fn mm256_unpackhi_epi64(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_unpackhi_epi64(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_unpackhi_epi64(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_unpackhi_epi64(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_unpacklo_epi32(lhs: Vec256, rhs: Vec256) -> Vec256 {
@@ -701,8 +752,38 @@ pub fn mm256_unpacklo_epi64(lhs: Vec256, rhs: Vec256) -> Vec256 {
     unsafe { _mm256_unpacklo_epi64(lhs, rhs) }
 }
 
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_unpacklo_epi64(lhs: Vec128, rhs: Vec128) -> Vec128 {
+    unsafe { _mm_unpacklo_epi64(lhs, rhs) }
+}
+
 #[inline(always)]
 #[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
 pub fn mm256_permute2x128_si256<const IMM8: i32>(a: Vec256, b: Vec256) -> Vec256 {
     unsafe { _mm256_permute2x128_si256::<IMM8>(a, b) }
 }
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_clmulepi64_si128<const IMM8: i32>(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_clmulepi64_si128(a, b, IMM8) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aesenc_si128(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_aesenc_si128(a, b) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aesenclast_si128(a: Vec128, b: Vec128) -> Vec128 {
+    unsafe { _mm_aesenclast_si128(a, b) }
+}
+
+#[inline(always)]
+#[hax_lib::fstar::before(r#"[@@ "opaque_to_smt"]"#)]
+pub fn mm_aeskeygenassist_si128<const RCON: i32>(a: Vec128) -> Vec128 {
+    unsafe { _mm_aeskeygenassist_si128(a, RCON) }
+}
diff --git a/sys/pqclean/src/bindings.rs b/sys/pqclean/src/bindings.rs
index 581711da3..84b3eef4f 100644
--- a/sys/pqclean/src/bindings.rs
+++ b/sys/pqclean/src/bindings.rs
@@ -1,4 +1,4 @@
-/* automatically generated by rust-bindgen 0.72.0 */
+/* automatically generated by rust-bindgen 0.72.1 */
 
 pub const SHAKE128_RATE: u32 = 168;
 pub const SHAKE256_RATE: u32 = 136;