From a31fc6878372de38c6a1f2a81e5d657834f97cad Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Fri, 11 Oct 2024 17:45:33 +0100
Subject: [PATCH 01/17] Minor size improvements

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 33 ++++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 3501837d3a1d..4581abb99e6e 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -140,8 +140,7 @@ static void chacha20_block(const uint32_t initial_state[16],
 
 void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx)
 {
-    mbedtls_platform_zeroize(ctx->state, sizeof(ctx->state));
-    mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8));
+    mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context));
 
     /* Initially, there's no keystream bytes available */
     ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES;
@@ -164,14 +163,18 @@ int mbedtls_chacha20_setkey(mbedtls_chacha20_context *ctx,
     ctx->state[3] = 0x6b206574;
 
     /* Set key */
-    ctx->state[4]  = MBEDTLS_GET_UINT32_LE(key, 0);
-    ctx->state[5]  = MBEDTLS_GET_UINT32_LE(key, 4);
-    ctx->state[6]  = MBEDTLS_GET_UINT32_LE(key, 8);
-    ctx->state[7]  = MBEDTLS_GET_UINT32_LE(key, 12);
-    ctx->state[8]  = MBEDTLS_GET_UINT32_LE(key, 16);
-    ctx->state[9]  = MBEDTLS_GET_UINT32_LE(key, 20);
-    ctx->state[10] = MBEDTLS_GET_UINT32_LE(key, 24);
-    ctx->state[11] = MBEDTLS_GET_UINT32_LE(key, 28);
+    if (MBEDTLS_IS_BIG_ENDIAN) {
+        ctx->state[4]  = MBEDTLS_GET_UINT32_LE(key, 0);
+        ctx->state[5]  = MBEDTLS_GET_UINT32_LE(key, 4);
+        ctx->state[6]  = MBEDTLS_GET_UINT32_LE(key, 8);
+        ctx->state[7]  = MBEDTLS_GET_UINT32_LE(key, 12);
+        ctx->state[8]  = MBEDTLS_GET_UINT32_LE(key, 16);
+        ctx->state[9]  = MBEDTLS_GET_UINT32_LE(key, 20);
+        ctx->state[10] = MBEDTLS_GET_UINT32_LE(key, 24);
+        ctx->state[11] = MBEDTLS_GET_UINT32_LE(key, 28);
+    } else {
+        memcpy(&ctx->state[4], key, 32);
+    }
 
     return 0;
 }
@@ -184,9 +187,13 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx,
     ctx->state[12] = counter;
 
     /* Nonce */
-    ctx->state[13] = MBEDTLS_GET_UINT32_LE(nonce, 0);
-    ctx->state[14] = MBEDTLS_GET_UINT32_LE(nonce, 4);
-    ctx->state[15] = MBEDTLS_GET_UINT32_LE(nonce, 8);
+    if (MBEDTLS_IS_BIG_ENDIAN) {
+        ctx->state[13] = MBEDTLS_GET_UINT32_LE(nonce, 0);
+        ctx->state[14] = MBEDTLS_GET_UINT32_LE(nonce, 4);
+        ctx->state[15] = MBEDTLS_GET_UINT32_LE(nonce, 8);
+    } else {
+        memcpy(&ctx->state[13], nonce, 12);
+    }
 
     mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8));
 

From 2cd29ee2ce126ba695a3a7d564023a9904fd53a2 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Tue, 15 Oct 2024 23:31:15 +0100
Subject: [PATCH 02/17] Small code-size improvement

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 .../drivers/builtin/include/mbedtls/chacha20.h   |  6 +++---
 tf-psa-crypto/drivers/builtin/src/chacha20.c     | 16 ++++++----------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
index ab7195e1c2c3..c7df5bd8a946 100644
--- a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
+++ b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
@@ -34,9 +34,9 @@ extern "C" {
 #endif
 
 typedef struct mbedtls_chacha20_context {
-    uint32_t MBEDTLS_PRIVATE(state)[16];          /*! The state (before round operations). */
-    uint8_t  MBEDTLS_PRIVATE(keystream8)[64];     /*! Leftover keystream bytes. */
-    size_t MBEDTLS_PRIVATE(keystream_bytes_used); /*! Number of keystream bytes already used. */
+    uint32_t MBEDTLS_PRIVATE(state)[16];               /*! The state (before round operations). */
+    uint8_t  MBEDTLS_PRIVATE(keystream8)[64];          /*! Leftover keystream bytes. */
+    size_t MBEDTLS_PRIVATE(keystream_bytes_remaining); /*! Number of not-used keystream bytes */
 }
 mbedtls_chacha20_context;
 
diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 4581abb99e6e..cf27a59d86d4 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -141,9 +141,6 @@ static void chacha20_block(const uint32_t initial_state[16],
 void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx)
 {
     mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context));
-
-    /* Initially, there's no keystream bytes available */
-    ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES;
 }
 
 void mbedtls_chacha20_free(mbedtls_chacha20_context *ctx)
@@ -195,10 +192,8 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx,
         memcpy(&ctx->state[13], nonce, 12);
     }
 
-    mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8));
-
     /* Initially, there's no keystream bytes available */
-    ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES;
+    ctx->keystream_bytes_remaining = 0U;
 
     return 0;
 }
@@ -211,11 +206,12 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     size_t offset = 0U;
 
     /* Use leftover keystream bytes, if available */
-    while (size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) {
+    while (size > 0U && ctx->keystream_bytes_remaining > 0U) {
         output[offset] = input[offset]
-                         ^ ctx->keystream8[ctx->keystream_bytes_used];
+                         ^ ctx->keystream8[CHACHA20_BLOCK_SIZE_BYTES -
+                                           ctx->keystream_bytes_remaining];
 
-        ctx->keystream_bytes_used++;
+        ctx->keystream_bytes_remaining--;
         offset++;
         size--;
     }
@@ -240,7 +236,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
 
         mbedtls_xor(output + offset, input + offset, ctx->keystream8, size);
 
-        ctx->keystream_bytes_used = size;
+        ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size;
 
     }
 

From 8c29c34aa699b1aabb32a376c79da872bbfa012e Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Sun, 13 Oct 2024 20:48:54 +0100
Subject: [PATCH 03/17] Simple Neon implementation

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 116 ++++++++++++++++++-
 1 file changed, 113 insertions(+), 3 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index cf27a59d86d4..1816a38397a0 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -22,13 +22,121 @@
 
 #include "mbedtls/platform.h"
 
-#define ROTL32(value, amount) \
-    ((uint32_t) ((value) << (amount)) | ((value) >> (32 - (amount))))
-
 #define CHACHA20_CTR_INDEX (12U)
 
 #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U)
 
+#if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
+
+// Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11,
+// and some more recent versions.
+
+// Define rotate-left operations that rotate within each 32-bit element in a 128-bit vector.
+static inline uint32x4_t chacha20_neon_vrotlq_16_u32(uint32x4_t v)
+{
+    return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(v)));
+}
+
+static inline uint32x4_t chacha20_neon_vrotlq_12_u32(uint32x4_t v)
+{
+    uint32x4_t x = vshlq_n_u32(v, 12);
+    return vsriq_n_u32(x, v, 20);
+}
+
+static inline uint32x4_t chacha20_neon_vrotlq_8_u32(uint32x4_t v)
+{
+    uint32x4_t result;
+#if defined(MBEDTLS_ARCH_IS_ARM64)
+    // This implementation is slightly faster, but only supported on 64-bit Arm
+    // Table look-up which results in an 8-bit rotate-left within each 32-bit element
+    const uint8_t    tbl_rotl8[16] = { 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 };
+    const uint8x16_t vrotl8_tbl = vld1q_u8(tbl_rotl8);
+    result = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(v), vrotl8_tbl));
+#else
+    uint32x4_t a = vshlq_n_u32(v, 8);
+    result = vsriq_n_u32(a, v, 24);
+#endif
+    return result;
+}
+
+static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v)
+{
+    uint32x4_t x = vshlq_n_u32(v, 7);
+    return vsriq_n_u32(x, v, 25);
+}
+
+static inline void chacha20_block(const uint32_t initial_state[16], unsigned char keystream[64])
+{
+    /* Load state into NEON registers */
+    uint32x4_t a = vld1q_u32(&initial_state[0]);
+    uint32x4_t b = vld1q_u32(&initial_state[4]);
+    uint32x4_t c = vld1q_u32(&initial_state[8]);
+    uint32x4_t d = vld1q_u32(&initial_state[12]);
+
+    // capture initial values for use after the main loop
+    const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d;
+
+    for (int i = 0; i < 10; i++) {
+        a = vaddq_u32(a, b);   // a += b
+        d = veorq_u32(d, a);   // d ^= a
+        d = chacha20_neon_vrotlq_16_u32(d);  // d <<<= 16
+
+        c = vaddq_u32(c, d);   // c += d
+        b = veorq_u32(b, c);   // b ^= c
+        b = chacha20_neon_vrotlq_12_u32(b);  // b <<<= 12
+
+        a = vaddq_u32(a, b);   // a += b
+        d = veorq_u32(d, a);   // d ^= a
+        d = chacha20_neon_vrotlq_8_u32(d);   // d <<<= 8
+
+        c = vaddq_u32(c, d);   // c += d
+        b = veorq_u32(b, c);   // b ^= c
+        b = chacha20_neon_vrotlq_7_u32(b);   // b <<<= 7
+
+        // re-order b, c and d for the diagonal rounds
+        b = vextq_u32(b, b, 1); // b now holds positions 5,6,7,4
+        c = vextq_u32(c, c, 2); // 10, 11, 8, 9
+        d = vextq_u32(d, d, 3); // 15, 12, 13, 14
+
+        a = vaddq_u32(a, b);   // a += b
+        d = veorq_u32(d, a);   // d ^= a
+        d = chacha20_neon_vrotlq_16_u32(d);  // d <<<= 16
+
+        c = vaddq_u32(c, d);   // c += d
+        b = veorq_u32(b, c);   // b ^= c
+        b = chacha20_neon_vrotlq_12_u32(b);  // b <<<= 12
+
+        a = vaddq_u32(a, b);   // a += b
+        d = veorq_u32(d, a);   // d ^= a
+        d = chacha20_neon_vrotlq_8_u32(d);   // d <<<= 8
+
+        c = vaddq_u32(c, d);   // c += d
+        b = veorq_u32(b, c);   // b ^= c
+        b = chacha20_neon_vrotlq_7_u32(b);   // b <<<= 7
+
+        // restore element order in b, c, d
+        b = vextq_u32(b, b, 3);
+        c = vextq_u32(c, c, 2);
+        d = vextq_u32(d, d, 1);
+    }
+
+    a = vaddq_u32(a, a1);
+    b = vaddq_u32(b, b1);
+    c = vaddq_u32(c, c1);
+    d = vaddq_u32(d, d1);
+
+    /* Store into keystream */
+    vst1q_u8(keystream + 0,  vreinterpretq_u8_u32(a));
+    vst1q_u8(keystream + 16, vreinterpretq_u8_u32(b));
+    vst1q_u8(keystream + 32, vreinterpretq_u8_u32(c));
+    vst1q_u8(keystream + 48, vreinterpretq_u8_u32(d));
+}
+
+#else
+
+#define ROTL32(value, amount) \
+    ((uint32_t) ((value) << (amount)) | ((value) >> (32 - (amount))))
+
 /**
  * \brief           ChaCha20 quarter round operation.
  *
@@ -138,6 +246,8 @@ static void chacha20_block(const uint32_t initial_state[16],
     mbedtls_platform_zeroize(working_state, sizeof(working_state));
 }
 
+#endif
+
 void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx)
 {
     mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context));

From 59453af46a2116b30e7c4d968abb0fa3e39c0005 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Sun, 13 Oct 2024 20:53:28 +0100
Subject: [PATCH 04/17] Move xor into Neon implementation

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 59 +++++++++++++++-----
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 1816a38397a0..e684e3dc4172 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -65,15 +65,13 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v)
     return vsriq_n_u32(x, v, 25);
 }
 
-static inline void chacha20_block(const uint32_t initial_state[16], unsigned char keystream[64])
+static inline void chacha20_block(uint32x4_t a,
+                                       uint32x4_t b,
+                                       uint32x4_t c,
+                                       uint32x4_t d,
+                                       uint8_t *output,
+                                       const uint8_t *input)
 {
-    /* Load state into NEON registers */
-    uint32x4_t a = vld1q_u32(&initial_state[0]);
-    uint32x4_t b = vld1q_u32(&initial_state[4]);
-    uint32x4_t c = vld1q_u32(&initial_state[8]);
-    uint32x4_t d = vld1q_u32(&initial_state[12]);
-
-    // capture initial values for use after the main loop
     const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d;
 
     for (int i = 0; i < 10; i++) {
@@ -125,11 +123,10 @@ static inline void chacha20_block(const uint32_t initial_state[16], unsigned cha
     c = vaddq_u32(c, c1);
     d = vaddq_u32(d, d1);
 
-    /* Store into keystream */
-    vst1q_u8(keystream + 0,  vreinterpretq_u8_u32(a));
-    vst1q_u8(keystream + 16, vreinterpretq_u8_u32(b));
-    vst1q_u8(keystream + 32, vreinterpretq_u8_u32(c));
-    vst1q_u8(keystream + 48, vreinterpretq_u8_u32(d));
+    vst1q_u8(output + 0,  veorq_u8(vld1q_u8(input + 0),  vreinterpretq_u8_u32(a)));
+    vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(b)));
+    vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(c)));
+    vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(d)));
 }
 
 #else
@@ -326,6 +323,41 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
         size--;
     }
 
+#if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
+    /* Load state into NEON registers */
+    uint32x4_t a = vld1q_u32(&ctx->state[0]);
+    uint32x4_t b = vld1q_u32(&ctx->state[4]);
+    uint32x4_t c = vld1q_u32(&ctx->state[8]);
+    uint32x4_t d = vld1q_u32(&ctx->state[12]);
+
+    const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
+    const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
+
+    /* Process full blocks */
+    while (size >= CHACHA20_BLOCK_SIZE_BYTES) {
+        chacha20_block(a, b, c, d, output + offset, input + offset);
+
+        d = vaddq_u32(d, inc_const);
+
+        offset += CHACHA20_BLOCK_SIZE_BYTES;
+        size   -= CHACHA20_BLOCK_SIZE_BYTES;
+    }
+
+    /* Last (partial) block */
+    if (size > 0U) {
+        /* Generate new keystream block and increment counter */
+        memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES);
+        chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8);
+        d = vaddq_u32(d, inc_const);
+
+        mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size);
+
+        ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size;
+    }
+
+    /* Capture state */
+    vst1q_u32(&ctx->state[12], d);
+#else
     /* Process full blocks */
     while (size >= CHACHA20_BLOCK_SIZE_BYTES) {
         /* Generate new keystream block and increment counter */
@@ -349,6 +381,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
         ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size;
 
     }
+#endif
 
     return 0;
 }

From b0a9055c7a54f937be48aef1c22f07078428f699 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Sun, 13 Oct 2024 23:30:06 +0100
Subject: [PATCH 05/17] Introduce chacha20_neon_inc_counter

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index e684e3dc4172..5c89b39ab59a 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -65,6 +65,14 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v)
     return vsriq_n_u32(x, v, 25);
 }
 
+// Increment the 32-bit element within v that corresponds to the ChaCha20 counter
+static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v)
+{
+     const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
+     const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
+     return vaddq_u32(v, inc_const);
+}
+
 static inline void chacha20_block(uint32x4_t a,
                                        uint32x4_t b,
                                        uint32x4_t c,
@@ -330,14 +338,11 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     uint32x4_t c = vld1q_u32(&ctx->state[8]);
     uint32x4_t d = vld1q_u32(&ctx->state[12]);
 
-    const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
-    const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
-
     /* Process full blocks */
     while (size >= CHACHA20_BLOCK_SIZE_BYTES) {
         chacha20_block(a, b, c, d, output + offset, input + offset);
 
-        d = vaddq_u32(d, inc_const);
+        d = chacha20_neon_inc_counter(d);
 
         offset += CHACHA20_BLOCK_SIZE_BYTES;
         size   -= CHACHA20_BLOCK_SIZE_BYTES;
@@ -348,7 +353,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
         /* Generate new keystream block and increment counter */
         memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES);
         chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8);
-        d = vaddq_u32(d, inc_const);
+        d = chacha20_neon_inc_counter(d);
 
         mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size);
 

From 05f9e328af77d644cfb1893e04db09d9d3f2a678 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Sun, 13 Oct 2024 23:38:42 +0100
Subject: [PATCH 06/17] Tidy up Neon state into a struct

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 112 ++++++++++---------
 1 file changed, 57 insertions(+), 55 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 5c89b39ab59a..32929df76f5d 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -73,68 +73,69 @@ static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v)
      return vaddq_u32(v, inc_const);
 }
 
-static inline void chacha20_block(uint32x4_t a,
-                                       uint32x4_t b,
-                                       uint32x4_t c,
-                                       uint32x4_t d,
-                                       uint8_t *output,
-                                       const uint8_t *input)
+typedef struct {
+    uint32x4_t a, b, c, d;
+} chacha20_neon_regs_t;
+
+static inline void chacha20_block(chacha20_neon_regs_t r,
+                                  uint8_t *output,
+                                  const uint8_t *input)
 {
-    const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d;
+    const chacha20_neon_regs_t r_original = r;
 
     for (int i = 0; i < 10; i++) {
-        a = vaddq_u32(a, b);   // a += b
-        d = veorq_u32(d, a);   // d ^= a
-        d = chacha20_neon_vrotlq_16_u32(d);  // d <<<= 16
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
 
-        c = vaddq_u32(c, d);   // c += d
-        b = veorq_u32(b, c);   // b ^= c
-        b = chacha20_neon_vrotlq_12_u32(b);  // b <<<= 12
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
 
-        a = vaddq_u32(a, b);   // a += b
-        d = veorq_u32(d, a);   // d ^= a
-        d = chacha20_neon_vrotlq_8_u32(d);   // d <<<= 8
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
 
-        c = vaddq_u32(c, d);   // c += d
-        b = veorq_u32(b, c);   // b ^= c
-        b = chacha20_neon_vrotlq_7_u32(b);   // b <<<= 7
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
 
         // re-order b, c and d for the diagonal rounds
-        b = vextq_u32(b, b, 1); // b now holds positions 5,6,7,4
-        c = vextq_u32(c, c, 2); // 10, 11, 8, 9
-        d = vextq_u32(d, d, 3); // 15, 12, 13, 14
+        r.b = vextq_u32(r.b, r.b, 1);                 // r.b now holds positions 5,6,7,4
+        r.c = vextq_u32(r.c, r.c, 2);                 // 10, 11, 8, 9
+        r.d = vextq_u32(r.d, r.d, 3);                 // 15, 12, 13, 14
 
-        a = vaddq_u32(a, b);   // a += b
-        d = veorq_u32(d, a);   // d ^= a
-        d = chacha20_neon_vrotlq_16_u32(d);  // d <<<= 16
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
 
-        c = vaddq_u32(c, d);   // c += d
-        b = veorq_u32(b, c);   // b ^= c
-        b = chacha20_neon_vrotlq_12_u32(b);  // b <<<= 12
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
 
-        a = vaddq_u32(a, b);   // a += b
-        d = veorq_u32(d, a);   // d ^= a
-        d = chacha20_neon_vrotlq_8_u32(d);   // d <<<= 8
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
 
-        c = vaddq_u32(c, d);   // c += d
-        b = veorq_u32(b, c);   // b ^= c
-        b = chacha20_neon_vrotlq_7_u32(b);   // b <<<= 7
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
 
         // restore element order in b, c, d
-        b = vextq_u32(b, b, 3);
-        c = vextq_u32(c, c, 2);
-        d = vextq_u32(d, d, 1);
+        r.b = vextq_u32(r.b, r.b, 3);
+        r.c = vextq_u32(r.c, r.c, 2);
+        r.d = vextq_u32(r.d, r.d, 1);
     }
 
-    a = vaddq_u32(a, a1);
-    b = vaddq_u32(b, b1);
-    c = vaddq_u32(c, c1);
-    d = vaddq_u32(d, d1);
-
-    vst1q_u8(output + 0,  veorq_u8(vld1q_u8(input + 0),  vreinterpretq_u8_u32(a)));
-    vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(b)));
-    vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(c)));
-    vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(d)));
+    r.a = vaddq_u32(r.a, r_original.a);
+    r.b = vaddq_u32(r.b, r_original.b);
+    r.c = vaddq_u32(r.c, r_original.c);
+    r.d = vaddq_u32(r.d, r_original.d);
+    
+    vst1q_u8(output + 0,  veorq_u8(vld1q_u8(input + 0),  vreinterpretq_u8_u32(r.a)));
+    vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(r.b)));
+    vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(r.c)));
+    vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(r.d)));
 }
 
 #else
@@ -333,16 +334,17 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
 
 #if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
     /* Load state into NEON registers */
-    uint32x4_t a = vld1q_u32(&ctx->state[0]);
-    uint32x4_t b = vld1q_u32(&ctx->state[4]);
-    uint32x4_t c = vld1q_u32(&ctx->state[8]);
-    uint32x4_t d = vld1q_u32(&ctx->state[12]);
+    chacha20_neon_regs_t state;
+    state.a = vld1q_u32(&ctx->state[0]);
+    state.b = vld1q_u32(&ctx->state[4]);
+    state.c = vld1q_u32(&ctx->state[8]);
+    state.d = vld1q_u32(&ctx->state[12]);
 
     /* Process full blocks */
     while (size >= CHACHA20_BLOCK_SIZE_BYTES) {
-        chacha20_block(a, b, c, d, output + offset, input + offset);
+        chacha20_block(state, output + offset, input + offset);
 
-        d = chacha20_neon_inc_counter(d);
+        state.d = chacha20_neon_inc_counter(state.d);
 
         offset += CHACHA20_BLOCK_SIZE_BYTES;
         size   -= CHACHA20_BLOCK_SIZE_BYTES;
@@ -352,8 +354,8 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     if (size > 0U) {
         /* Generate new keystream block and increment counter */
         memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES);
-        chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8);
-        d = chacha20_neon_inc_counter(d);
+        chacha20_block(state, ctx->keystream8, ctx->keystream8);
+        state.d = chacha20_neon_inc_counter(state.d);
 
         mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size);
 
@@ -361,7 +363,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     }
 
     /* Capture state */
-    vst1q_u32(&ctx->state[12], d);
+    vst1q_u32(&ctx->state[12], state.d);
 #else
     /* Process full blocks */
     while (size >= CHACHA20_BLOCK_SIZE_BYTES) {

From 864fed715eb5a52d12896976c03d1779d65d2b5a Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Sun, 13 Oct 2024 23:05:04 +0100
Subject: [PATCH 07/17] Refactor to prepare for multiblock

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 154 +++++++++++--------
 1 file changed, 89 insertions(+), 65 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 32929df76f5d..192656deab56 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -27,7 +27,6 @@
 #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U)
 
 #if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
-
 // Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11,
 // and some more recent versions.
 
@@ -68,74 +67,101 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v)
 // Increment the 32-bit element within v that corresponds to the ChaCha20 counter
 static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v)
 {
-     const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
-     const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
-     return vaddq_u32(v, inc_const);
+    const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
+    const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
+    return vaddq_u32(v, inc_const);
 }
 
 typedef struct {
     uint32x4_t a, b, c, d;
 } chacha20_neon_regs_t;
 
-static inline void chacha20_block(chacha20_neon_regs_t r,
-                                  uint8_t *output,
-                                  const uint8_t *input)
+static inline chacha20_neon_regs_t chacha20_neon_singlepass(chacha20_neon_regs_t r)
 {
-    const chacha20_neon_regs_t r_original = r;
-
-    for (int i = 0; i < 10; i++) {
-        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-        r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
-
-        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-        r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
-
-        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-        r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
-
-        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-        r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
-
-        // re-order b, c and d for the diagonal rounds
-        r.b = vextq_u32(r.b, r.b, 1);                 // r.b now holds positions 5,6,7,4
-        r.c = vextq_u32(r.c, r.c, 2);                 // 10, 11, 8, 9
-        r.d = vextq_u32(r.d, r.d, 3);                 // 15, 12, 13, 14
-
-        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-        r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
-
-        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-        r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
-
-        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-        r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
-
-        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-        r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
-
-        // restore element order in b, c, d
-        r.b = vextq_u32(r.b, r.b, 3);
-        r.c = vextq_u32(r.c, r.c, 2);
-        r.d = vextq_u32(r.d, r.d, 1);
-    }
+    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+    r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
+
+    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+    r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
+
+    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+    r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
+
+    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+    r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
+
+    // re-order b, c and d for the diagonal rounds
+    r.b = vextq_u32(r.b, r.b, 1);                 // r.b now holds positions 5,6,7,4
+    r.c = vextq_u32(r.c, r.c, 2);                 // 10, 11, 8, 9
+    r.d = vextq_u32(r.d, r.d, 3);                 // 15, 12, 13, 14
+
+    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+    r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
+
+    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+    r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
 
+    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+    r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
+
+    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+    r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
+
+    // restore element order in b, c, d
+    r.b = vextq_u32(r.b, r.b, 3);
+    r.c = vextq_u32(r.c, r.c, 2);
+    r.d = vextq_u32(r.d, r.d, 1);
+
+    return r;
+}
+
+static inline void chacha20_neon_finish_block(chacha20_neon_regs_t r,
+                                              chacha20_neon_regs_t r_original,
+                                              uint8_t **output,
+                                              const uint8_t **input)
+{
     r.a = vaddq_u32(r.a, r_original.a);
     r.b = vaddq_u32(r.b, r_original.b);
     r.c = vaddq_u32(r.c, r_original.c);
     r.d = vaddq_u32(r.d, r_original.d);
-    
-    vst1q_u8(output + 0,  veorq_u8(vld1q_u8(input + 0),  vreinterpretq_u8_u32(r.a)));
-    vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(r.b)));
-    vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(r.c)));
-    vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(r.d)));
+
+    vst1q_u8(*output + 0,  veorq_u8(vld1q_u8(*input + 0),  vreinterpretq_u8_u32(r.a)));
+    vst1q_u8(*output + 16, veorq_u8(vld1q_u8(*input + 16), vreinterpretq_u8_u32(r.b)));
+    vst1q_u8(*output + 32, veorq_u8(vld1q_u8(*input + 32), vreinterpretq_u8_u32(r.c)));
+    vst1q_u8(*output + 48, veorq_u8(vld1q_u8(*input + 48), vreinterpretq_u8_u32(r.d)));
+
+    *input += CHACHA20_BLOCK_SIZE_BYTES;
+    *output += CHACHA20_BLOCK_SIZE_BYTES;
+}
+
+static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original,
+                                              uint8_t *output,
+                                              const uint8_t *input,
+                                              size_t blocks)
+{
+    for (;;) {
+        chacha20_neon_regs_t r[1];
+
+        r[0] = r_original;
+
+        for (unsigned i = 0; i < 10; i++) {
+            r[0] = chacha20_neon_singlepass(r[0]);
+        }
+
+        chacha20_neon_finish_block(r[0], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+    }
 }
 
 #else
@@ -341,21 +367,19 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     state.d = vld1q_u32(&ctx->state[12]);
 
     /* Process full blocks */
-    while (size >= CHACHA20_BLOCK_SIZE_BYTES) {
-        chacha20_block(state, output + offset, input + offset);
+    if (size >= CHACHA20_BLOCK_SIZE_BYTES) {
+        size_t blocks = size / CHACHA20_BLOCK_SIZE_BYTES;
+        state.d = chacha20_neon_blocks(state, output + offset, input + offset, blocks);
 
-        state.d = chacha20_neon_inc_counter(state.d);
-
-        offset += CHACHA20_BLOCK_SIZE_BYTES;
-        size   -= CHACHA20_BLOCK_SIZE_BYTES;
+        offset += CHACHA20_BLOCK_SIZE_BYTES * blocks;
+        size   -= CHACHA20_BLOCK_SIZE_BYTES * blocks;
     }
 
     /* Last (partial) block */
     if (size > 0U) {
         /* Generate new keystream block and increment counter */
         memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES);
-        chacha20_block(state, ctx->keystream8, ctx->keystream8);
-        state.d = chacha20_neon_inc_counter(state.d);
+        state.d = chacha20_neon_blocks(state, ctx->keystream8, ctx->keystream8, 1);
 
         mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size);
 

From 79381040a44a12210ae816ad2c15c52e7fac5f37 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Tue, 15 Oct 2024 23:37:27 +0100
Subject: [PATCH 08/17] Add multiblock support

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 105 ++++++++++++++++++-
 1 file changed, 100 insertions(+), 5 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 192656deab56..28cd6318e22f 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -26,9 +26,46 @@
 
 #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U)
 
-#if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
-// Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11,
-// and some more recent versions.
+/*
+ * The Neon implementation can be configured to process multiple blocks in parallel; increasing the
+ * number of blocks gains a lot of performance, but adds on average around 250 bytes of code size
+ * for each additional block.
+ *
+ * This is controlled by setting MBEDTLS_CHACHA20_NEON_MULTIBLOCK in the range [0..6] (0 selects
+ * the scalar implementation; 1 selects single-block Neon; 2..6 select multi-block Neon).
+ *
+ * The default (i.e., if MBEDTLS_CHACHA20_NEON_MULTIBLOCK is not set) selects the fastest variant
+ * which has better code size than the scalar implementation (based on testing for Aarch64 on clang
+ * and gcc).
+ *
+ * Size & performance notes for Neon implementation from informal tests on Aarch64
+ * (applies to both gcc and clang except as noted):
+ *   - When single-block is selected, this saves around 400-550 bytes of code-size c.f. the scalar
+ *     implementation
+ *   - Multi-block Neon is smaller and faster than scalar (up to 2 blocks for gcc, 3 for clang)
+ *   - Code size increases consistently with number of blocks
+ *   - Performance increases with number of blocks (except at 5 which is slightly slower than 4)
+ *   - Performance is within a few % for gcc vs clang at all settings
+ *   - Performance at 4 blocks roughly matches our hardware accelerated AES-GCM impl with
+ *     better code size
+ *   - Performance is worse at 7 or more blocks, due to running out of Neon registers
+ */
+
+#if !defined(MBEDTLS_HAVE_NEON_INTRINSICS)
+// Select scalar implementation if Neon not available
+    #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 0
+#elif !defined(MBEDTLS_CHACHA20_NEON_MULTIBLOCK)
+// By default, select the best performing option that is smaller than the scalar implementation.
+    #if defined(MBEDTLS_COMPILER_IS_GCC)
+        #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2
+    #else
+        #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 3
+    #endif
+#endif
+
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0
+// Tested on all combinations of Armv7 arm/thumb2; Armv8 arm/thumb2/aarch64; Armv8 aarch64_be on
+// clang 14, gcc 11, and some more recent versions.
 
 // Define rotate-left operations that rotate within each 32-bit element in a 128-bit vector.
 static inline uint32x4_t chacha20_neon_vrotlq_16_u32(uint32x4_t v)
@@ -142,18 +179,41 @@ static inline void chacha20_neon_finish_block(chacha20_neon_regs_t r,
     *output += CHACHA20_BLOCK_SIZE_BYTES;
 }
 
+// Prevent gcc from rolling up the (manually unrolled) interleaved block loops
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
 static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original,
                                               uint8_t *output,
                                               const uint8_t *input,
                                               size_t blocks)
 {
+    // Assuming 32 regs, with 4 for original values plus 4 for scratch, with 4 regs per block,
+    // we should be able to process up to 24/4 = 6 blocks simultaneously.
+    // Testing confirms that perf indeed increases with more blocks, and then falls off after 6.
+
     for (;;) {
-        chacha20_neon_regs_t r[1];
+        chacha20_neon_regs_t r[6];
 
+        // It's essential to unroll these loops to benefit from interleaving multiple blocks.
+        // If MBEDTLS_CHACHA20_NEON_MULTIBLOCK < 6, gcc and clang will optimise away the unused bits
         r[0] = r_original;
+        r[1] = r_original;
+        r[2] = r_original;
+        r[3] = r_original;
+        r[4] = r_original;
+        r[5] = r_original;
+        r[1].d = chacha20_neon_inc_counter(r[0].d);
+        r[2].d = chacha20_neon_inc_counter(r[1].d);
+        r[3].d = chacha20_neon_inc_counter(r[2].d);
+        r[4].d = chacha20_neon_inc_counter(r[3].d);
+        r[5].d = chacha20_neon_inc_counter(r[4].d);
 
         for (unsigned i = 0; i < 10; i++) {
             r[0] = chacha20_neon_singlepass(r[0]);
+            r[1] = chacha20_neon_singlepass(r[1]);
+            r[2] = chacha20_neon_singlepass(r[2]);
+            r[3] = chacha20_neon_singlepass(r[3]);
+            r[4] = chacha20_neon_singlepass(r[4]);
+            r[5] = chacha20_neon_singlepass(r[5]);
         }
 
         chacha20_neon_finish_block(r[0], r_original, &output, &input);
@@ -161,6 +221,41 @@ static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original,
         if (--blocks == 0) {
             return r_original.d;
         }
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 2
+        chacha20_neon_finish_block(r[1], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+#endif
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 3
+        chacha20_neon_finish_block(r[2], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+#endif
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 4
+        chacha20_neon_finish_block(r[3], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+#endif
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 5
+        chacha20_neon_finish_block(r[4], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+#endif
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 6
+        chacha20_neon_finish_block(r[5], r_original, &output, &input);
+        r_original.d = chacha20_neon_inc_counter(r_original.d);
+        if (--blocks == 0) {
+            return r_original.d;
+        }
+#endif
     }
 }
 
@@ -358,7 +453,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
         size--;
     }
 
-#if defined(MBEDTLS_HAVE_NEON_INTRINSICS)
+#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0
     /* Load state into NEON registers */
     chacha20_neon_regs_t state;
     state.a = vld1q_u32(&ctx->state[0]);

From f9c926881dad5086c9f0deb5c25b983b8d58be03 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Tue, 15 Oct 2024 23:37:38 +0100
Subject: [PATCH 09/17] Code size improvement

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 70 ++++++++------------
 1 file changed, 29 insertions(+), 41 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 28cd6318e22f..0ae5f7c7e9f8 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -115,47 +115,35 @@ typedef struct {
 
 static inline chacha20_neon_regs_t chacha20_neon_singlepass(chacha20_neon_regs_t r)
 {
-    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-    r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
-
-    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-    r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
-
-    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-    r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
-
-    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-    r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
-
-    // re-order b, c and d for the diagonal rounds
-    r.b = vextq_u32(r.b, r.b, 1);                 // r.b now holds positions 5,6,7,4
-    r.c = vextq_u32(r.c, r.c, 2);                 // 10, 11, 8, 9
-    r.d = vextq_u32(r.d, r.d, 3);                 // 15, 12, 13, 14
-
-    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-    r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
-
-    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-    r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
-
-    r.a = vaddq_u32(r.a, r.b);                    // r.a += b
-    r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
-    r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
-
-    r.c = vaddq_u32(r.c, r.d);                    // r.c += d
-    r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
-    r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
-
-    // restore element order in b, c, d
-    r.b = vextq_u32(r.b, r.b, 3);
-    r.c = vextq_u32(r.c, r.c, 2);
-    r.d = vextq_u32(r.d, r.d, 1);
+    for (unsigned i = 0; i < 2; i++) {
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_16_u32(r.d);       // r.d <<<= 16
+
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_12_u32(r.b);       // r.b <<<= 12
+
+        r.a = vaddq_u32(r.a, r.b);                    // r.a += b
+        r.d = veorq_u32(r.d, r.a);                    // r.d ^= a
+        r.d = chacha20_neon_vrotlq_8_u32(r.d);        // r.d <<<= 8
+
+        r.c = vaddq_u32(r.c, r.d);                    // r.c += d
+        r.b = veorq_u32(r.b, r.c);                    // r.b ^= c
+        r.b = chacha20_neon_vrotlq_7_u32(r.b);        // r.b <<<= 7
+
+        if (i == 0) {
+            // re-order b, c and d for the diagonal rounds
+            r.b = vextq_u32(r.b, r.b, 1);                 // r.b now holds positions 5,6,7,4
+            r.c = vextq_u32(r.c, r.c, 2);                 // 10, 11, 8, 9
+            r.d = vextq_u32(r.d, r.d, 3);                 // 15, 12, 13, 14
+        } else {
+            // restore element order in b, c, d
+            r.b = vextq_u32(r.b, r.b, 3);
+            r.c = vextq_u32(r.c, r.c, 2);
+            r.d = vextq_u32(r.d, r.d, 1);
+        }
+    }
 
     return r;
 }

From 50292e82aa5d660b2e4a45d1457416a30411a256 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Fri, 11 Oct 2024 18:02:24 +0100
Subject: [PATCH 10/17] add Changelog entry

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 ChangeLog.d/chacha20-neon.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 ChangeLog.d/chacha20-neon.txt

diff --git a/ChangeLog.d/chacha20-neon.txt b/ChangeLog.d/chacha20-neon.txt
new file mode 100644
index 000000000000..946941bb65b3
--- /dev/null
+++ b/ChangeLog.d/chacha20-neon.txt
@@ -0,0 +1,3 @@
+   * ChaCha20 size and performance: add a Neon implementation of ChaCha20 for
+     Thumb2 and 32 and 64-bit Arm, for Armv7 onwards. At default settings,
+     this improves performance by around 2x to 2.3x on Aarch64.

From a8e9a884bd5582f9d5a9adf22472380b45864d2b Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Tue, 15 Oct 2024 23:23:32 +0100
Subject: [PATCH 11/17] Test all ChaCha20 Neon scalar and multiblock variations

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 .../components-configuration-crypto.sh        | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/scripts/components-configuration-crypto.sh b/tests/scripts/components-configuration-crypto.sh
index 74ebb793d70e..82190f1130f8 100644
--- a/tests/scripts/components-configuration-crypto.sh
+++ b/tests/scripts/components-configuration-crypto.sh
@@ -2368,6 +2368,42 @@ END
     ./tf-psa-crypto/tests/test_suite_shax
 }
 
+
+support_test_chacha20_variations () {
+    case $(uname -m) in
+        aarch64) true;;
+        *) false;;
+    esac
+}
+
+component_test_chacha20_neon_variations () {
+    msg "ChaCha20 Neon scalar and multiblock variations"
+
+    # define minimal config sufficient to test ChaCha20
+     cat > include/mbedtls/mbedtls_config.h << END
+         #define MBEDTLS_AES_C
+         #define MBEDTLS_CHACHA20_C
+         #define MBEDTLS_ENTROPY_C
+         #define MBEDTLS_CTR_DRBG_C
+         #define MBEDTLS_PSA_CRYPTO_C
+         #define MBEDTLS_PSA_CRYPTO_CONFIG
+         #define MBEDTLS_SELF_TEST
+END
+
+    cat > tf-psa-crypto/include/psa/crypto_config.h << END
+        #define PSA_WANT_ALG_SHA_256   1
+END
+
+    make clean
+    for x in 0 1 2 3 4 5 6; do
+        msg "multiblock = $x"
+        make clean
+        make -C tests ../tf-psa-crypto/tests/test_suite_chacha20 CFLAGS="-DMBEDTLS_CHACHA20_NEON_MULTIBLOCK=$x"
+        ./tf-psa-crypto/tests/test_suite_chacha20
+    done
+}
+
+
 support_build_aes_aesce_armcc () {
     support_build_armcc
 }

From cd36f2546ce4d0291a5e8785a4c4a8b33c501e04 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 16 Oct 2024 09:17:14 +0100
Subject: [PATCH 12/17] Fix changelog

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 ChangeLog.d/chacha20-neon.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ChangeLog.d/chacha20-neon.txt b/ChangeLog.d/chacha20-neon.txt
index 946941bb65b3..220d58eb900e 100644
--- a/ChangeLog.d/chacha20-neon.txt
+++ b/ChangeLog.d/chacha20-neon.txt
@@ -1,3 +1,4 @@
+Features
    * ChaCha20 size and performance: add a Neon implementation of ChaCha20 for
      Thumb2 and 32 and 64-bit Arm, for Armv7 onwards. At default settings,
      this improves performance by around 2x to 2.3x on Aarch64.

From c55b1593f5def33caa5d56701fc5e16c3e642de9 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 16 Oct 2024 09:23:08 +0100
Subject: [PATCH 13/17] Fix ABI break

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 .../drivers/builtin/include/mbedtls/chacha20.h      |  6 +++---
 tf-psa-crypto/drivers/builtin/src/chacha20.c        | 13 ++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
index c7df5bd8a946..ab7195e1c2c3 100644
--- a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
+++ b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h
@@ -34,9 +34,9 @@ extern "C" {
 #endif
 
 typedef struct mbedtls_chacha20_context {
-    uint32_t MBEDTLS_PRIVATE(state)[16];               /*! The state (before round operations). */
-    uint8_t  MBEDTLS_PRIVATE(keystream8)[64];          /*! Leftover keystream bytes. */
-    size_t MBEDTLS_PRIVATE(keystream_bytes_remaining); /*! Number of not-used keystream bytes */
+    uint32_t MBEDTLS_PRIVATE(state)[16];          /*! The state (before round operations). */
+    uint8_t  MBEDTLS_PRIVATE(keystream8)[64];     /*! Leftover keystream bytes. */
+    size_t MBEDTLS_PRIVATE(keystream_bytes_used); /*! Number of keystream bytes already used. */
 }
 mbedtls_chacha20_context;
 
diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 0ae5f7c7e9f8..569491d1d208 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -418,7 +418,7 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx,
     }
 
     /* Initially, there's no keystream bytes available */
-    ctx->keystream_bytes_remaining = 0U;
+    ctx->keystream_bytes_used = 0U;
 
     return 0;
 }
@@ -431,12 +431,11 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     size_t offset = 0U;
 
     /* Use leftover keystream bytes, if available */
-    while (size > 0U && ctx->keystream_bytes_remaining > 0U) {
-        output[offset] = input[offset]
-                         ^ ctx->keystream8[CHACHA20_BLOCK_SIZE_BYTES -
-                                           ctx->keystream_bytes_remaining];
+    while (size > 0U && ctx->keystream_bytes_used > 0U &&
+           ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) {
+        output[offset] = input[offset] ^ ctx->keystream8[ctx->keystream_bytes_used];
 
-        ctx->keystream_bytes_remaining--;
+        ctx->keystream_bytes_used = (ctx->keystream_bytes_used + 1) % CHACHA20_BLOCK_SIZE_BYTES;
         offset++;
         size--;
     }
@@ -466,7 +465,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
 
         mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size);
 
-        ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size;
+        ctx->keystream_bytes_used = size;
     }
 
     /* Capture state */

From 512f2e509aaf0eff5a9510dcb236706d8851333a Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 16 Oct 2024 10:24:44 +0100
Subject: [PATCH 14/17] Fix ABI break (revert to original behaviour)

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index 569491d1d208..a62b731182ea 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -366,6 +366,9 @@ static void chacha20_block(const uint32_t initial_state[16],
 void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx)
 {
     mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context));
+
+    /* Initially, there's no keystream bytes available */
+    ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES;
 }
 
 void mbedtls_chacha20_free(mbedtls_chacha20_context *ctx)
@@ -418,7 +421,7 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx,
     }
 
     /* Initially, there's no keystream bytes available */
-    ctx->keystream_bytes_used = 0U;
+    ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES;
 
     return 0;
 }
@@ -431,11 +434,10 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
     size_t offset = 0U;
 
     /* Use leftover keystream bytes, if available */
-    while (size > 0U && ctx->keystream_bytes_used > 0U &&
-           ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) {
+    while (size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) {
         output[offset] = input[offset] ^ ctx->keystream8[ctx->keystream_bytes_used];
 
-        ctx->keystream_bytes_used = (ctx->keystream_bytes_used + 1) % CHACHA20_BLOCK_SIZE_BYTES;
+        ctx->keystream_bytes_used++;
         offset++;
         size--;
     }
@@ -491,7 +493,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx,
 
         mbedtls_xor(output + offset, input + offset, ctx->keystream8, size);
 
-        ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size;
+        ctx->keystream_bytes_used = size;
 
     }
 #endif

From 6f632fc99e61266e5bb2cedf7f7ca9d3696c6736 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Thu, 17 Oct 2024 09:16:13 +0100
Subject: [PATCH 15/17] Size: improve counter increment operation

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index a62b731182ea..a4a12add6f6e 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -104,9 +104,12 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v)
 // Increment the 32-bit element within v that corresponds to the ChaCha20 counter
 static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v)
 {
-    const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 };
-    const uint32x4_t inc_const = vld1q_u32(inc_const_scalar);
-    return vaddq_u32(v, inc_const);
+    if (MBEDTLS_IS_BIG_ENDIAN) {
+        v[3]++;
+    } else {
+        v[0]++;
+    }
+    return v;
 }
 
 typedef struct {

From 405cbc7017617516f029532ed51fbcb79ebe70aa Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 23 Oct 2024 14:33:32 +0530
Subject: [PATCH 16/17] Add MBEDTLS_ARCH_IS_THUMB

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 include/mbedtls/build_info.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/mbedtls/build_info.h b/include/mbedtls/build_info.h
index 2b48e50341e2..4f2b22ce87a4 100644
--- a/include/mbedtls/build_info.h
+++ b/include/mbedtls/build_info.h
@@ -50,6 +50,11 @@
 #define MBEDTLS_ARCH_IS_ARM32
 #endif
 
+#if !defined(MBEDTLS_ARCH_IS_THUMB) && \
+    defined(_M_ARMT) || defined(__thumb__) || defined(__thumb2__)
+#define MBEDTLS_ARCH_IS_THUMB
+#endif
+
 #if !defined(MBEDTLS_ARCH_IS_X64) && \
     (defined(__amd64__) || defined(__x86_64__) || \
     ((defined(_M_X64) || defined(_M_AMD64)) && !defined(_M_ARM64EC)))

From a8c28e5c39c736cfebf6930673011ef40c6bf5d2 Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 23 Oct 2024 14:33:48 +0530
Subject: [PATCH 17/17] Adjust defaults for Thumb to keep size down

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 tf-psa-crypto/drivers/builtin/src/chacha20.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c
index a4a12add6f6e..7affe324a867 100644
--- a/tf-psa-crypto/drivers/builtin/src/chacha20.c
+++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c
@@ -56,12 +56,21 @@
     #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 0
 #elif !defined(MBEDTLS_CHACHA20_NEON_MULTIBLOCK)
 // By default, select the best performing option that is smaller than the scalar implementation.
+#if defined(MBEDTLS_ARCH_IS_THUMB)
+// For Thumb, we need a smaller multiblock settting to be smaller than the scalar implementation
+    #if defined(MBEDTLS_COMPILER_IS_GCC)
+        #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 1
+    #else
+        #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2
+    #endif
+#else // arm or aarch64
     #if defined(MBEDTLS_COMPILER_IS_GCC)
         #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2
     #else
         #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 3
     #endif
 #endif
+#endif
 
 #if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0
 // Tested on all combinations of Armv7 arm/thumb2; Armv8 arm/thumb2/aarch64; Armv8 aarch64_be on