From a31fc6878372de38c6a1f2a81e5d657834f97cad Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 11 Oct 2024 17:45:33 +0100 Subject: [PATCH 01/17] Minor size improvements Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 33 ++++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 3501837d3a1d..4581abb99e6e 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -140,8 +140,7 @@ static void chacha20_block(const uint32_t initial_state[16], void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx) { - mbedtls_platform_zeroize(ctx->state, sizeof(ctx->state)); - mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8)); + mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context)); /* Initially, there's no keystream bytes available */ ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES; @@ -164,14 +163,18 @@ int mbedtls_chacha20_setkey(mbedtls_chacha20_context *ctx, ctx->state[3] = 0x6b206574; /* Set key */ - ctx->state[4] = MBEDTLS_GET_UINT32_LE(key, 0); - ctx->state[5] = MBEDTLS_GET_UINT32_LE(key, 4); - ctx->state[6] = MBEDTLS_GET_UINT32_LE(key, 8); - ctx->state[7] = MBEDTLS_GET_UINT32_LE(key, 12); - ctx->state[8] = MBEDTLS_GET_UINT32_LE(key, 16); - ctx->state[9] = MBEDTLS_GET_UINT32_LE(key, 20); - ctx->state[10] = MBEDTLS_GET_UINT32_LE(key, 24); - ctx->state[11] = MBEDTLS_GET_UINT32_LE(key, 28); + if (MBEDTLS_IS_BIG_ENDIAN) { + ctx->state[4] = MBEDTLS_GET_UINT32_LE(key, 0); + ctx->state[5] = MBEDTLS_GET_UINT32_LE(key, 4); + ctx->state[6] = MBEDTLS_GET_UINT32_LE(key, 8); + ctx->state[7] = MBEDTLS_GET_UINT32_LE(key, 12); + ctx->state[8] = MBEDTLS_GET_UINT32_LE(key, 16); + ctx->state[9] = MBEDTLS_GET_UINT32_LE(key, 20); + ctx->state[10] = MBEDTLS_GET_UINT32_LE(key, 24); + ctx->state[11] = MBEDTLS_GET_UINT32_LE(key, 28); + } else { + memcpy(&ctx->state[4], key, 32); + } return 0; } @@ -184,9 +187,13 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx, ctx->state[12] = counter; /* Nonce */ - ctx->state[13] = MBEDTLS_GET_UINT32_LE(nonce, 0); - ctx->state[14] = MBEDTLS_GET_UINT32_LE(nonce, 4); - ctx->state[15] = MBEDTLS_GET_UINT32_LE(nonce, 8); + if (MBEDTLS_IS_BIG_ENDIAN) { + ctx->state[13] = MBEDTLS_GET_UINT32_LE(nonce, 0); + ctx->state[14] = MBEDTLS_GET_UINT32_LE(nonce, 4); + ctx->state[15] = MBEDTLS_GET_UINT32_LE(nonce, 8); + } else { + memcpy(&ctx->state[13], nonce, 12); + } mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8)); From 2cd29ee2ce126ba695a3a7d564023a9904fd53a2 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 15 Oct 2024 23:31:15 +0100 Subject: [PATCH 02/17] Small code-size improvement Signed-off-by: Dave Rodgman --- .../drivers/builtin/include/mbedtls/chacha20.h | 6 +++--- tf-psa-crypto/drivers/builtin/src/chacha20.c | 16 ++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h index ab7195e1c2c3..c7df5bd8a946 100644 --- a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h +++ b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h @@ -34,9 +34,9 @@ extern "C" { #endif typedef struct mbedtls_chacha20_context { - uint32_t MBEDTLS_PRIVATE(state)[16]; /*! The state (before round operations). */ - uint8_t MBEDTLS_PRIVATE(keystream8)[64]; /*! Leftover keystream bytes. */ - size_t MBEDTLS_PRIVATE(keystream_bytes_used); /*! Number of keystream bytes already used. */ + uint32_t MBEDTLS_PRIVATE(state)[16]; /*! The state (before round operations). */ + uint8_t MBEDTLS_PRIVATE(keystream8)[64]; /*! Leftover keystream bytes. */ + size_t MBEDTLS_PRIVATE(keystream_bytes_remaining); /*! Number of not-used keystream bytes */ } mbedtls_chacha20_context; diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 4581abb99e6e..cf27a59d86d4 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -141,9 +141,6 @@ static void chacha20_block(const uint32_t initial_state[16], void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx) { mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context)); - - /* Initially, there's no keystream bytes available */ - ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES; } void mbedtls_chacha20_free(mbedtls_chacha20_context *ctx) @@ -195,10 +192,8 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx, memcpy(&ctx->state[13], nonce, 12); } - mbedtls_platform_zeroize(ctx->keystream8, sizeof(ctx->keystream8)); - /* Initially, there's no keystream bytes available */ - ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES; + ctx->keystream_bytes_remaining = 0U; return 0; } @@ -211,11 +206,12 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, size_t offset = 0U; /* Use leftover keystream bytes, if available */ - while (size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) { + while (size > 0U && ctx->keystream_bytes_remaining > 0U) { output[offset] = input[offset] - ^ ctx->keystream8[ctx->keystream_bytes_used]; + ^ ctx->keystream8[CHACHA20_BLOCK_SIZE_BYTES - + ctx->keystream_bytes_remaining]; - ctx->keystream_bytes_used++; + ctx->keystream_bytes_remaining--; offset++; size--; } @@ -240,7 +236,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, mbedtls_xor(output + offset, input + offset, ctx->keystream8, size); - ctx->keystream_bytes_used = size; + ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size; } From 8c29c34aa699b1aabb32a376c79da872bbfa012e Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 13 Oct 2024 20:48:54 +0100 Subject: [PATCH 03/17] Simple Neon implementation Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 116 ++++++++++++++++++- 1 file changed, 113 insertions(+), 3 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index cf27a59d86d4..1816a38397a0 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -22,13 +22,121 @@ #include "mbedtls/platform.h" -#define ROTL32(value, amount) \ - ((uint32_t) ((value) << (amount)) | ((value) >> (32 - (amount)))) - #define CHACHA20_CTR_INDEX (12U) #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U) +#if defined(MBEDTLS_HAVE_NEON_INTRINSICS) + +// Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11, +// and some more recent versions. + +// Define rotate-left operations that rotate within each 32-bit element in a 128-bit vector. +static inline uint32x4_t chacha20_neon_vrotlq_16_u32(uint32x4_t v) +{ + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(v))); +} + +static inline uint32x4_t chacha20_neon_vrotlq_12_u32(uint32x4_t v) +{ + uint32x4_t x = vshlq_n_u32(v, 12); + return vsriq_n_u32(x, v, 20); +} + +static inline uint32x4_t chacha20_neon_vrotlq_8_u32(uint32x4_t v) +{ + uint32x4_t result; +#if defined(MBEDTLS_ARCH_IS_ARM64) + // This implementation is slightly faster, but only supported on 64-bit Arm + // Table look-up which results in an 8-bit rotate-left within each 32-bit element + const uint8_t tbl_rotl8[16] = { 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 }; + const uint8x16_t vrotl8_tbl = vld1q_u8(tbl_rotl8); + result = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(v), vrotl8_tbl)); +#else + uint32x4_t a = vshlq_n_u32(v, 8); + result = vsriq_n_u32(a, v, 24); +#endif + return result; +} + +static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v) +{ + uint32x4_t x = vshlq_n_u32(v, 7); + return vsriq_n_u32(x, v, 25); +} + +static inline void chacha20_block(const uint32_t initial_state[16], unsigned char keystream[64]) +{ + /* Load state into NEON registers */ + uint32x4_t a = vld1q_u32(&initial_state[0]); + uint32x4_t b = vld1q_u32(&initial_state[4]); + uint32x4_t c = vld1q_u32(&initial_state[8]); + uint32x4_t d = vld1q_u32(&initial_state[12]); + + // capture initial values for use after the main loop + const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d; + + for (int i = 0; i < 10; i++) { + a = vaddq_u32(a, b); // a += b + d = veorq_u32(d, a); // d ^= a + d = chacha20_neon_vrotlq_16_u32(d); // d <<<= 16 + + c = vaddq_u32(c, d); // c += d + b = veorq_u32(b, c); // b ^= c + b = chacha20_neon_vrotlq_12_u32(b); // b <<<= 12 + + a = vaddq_u32(a, b); // a += b + d = veorq_u32(d, a); // d ^= a + d = chacha20_neon_vrotlq_8_u32(d); // d <<<= 8 + + c = vaddq_u32(c, d); // c += d + b = veorq_u32(b, c); // b ^= c + b = chacha20_neon_vrotlq_7_u32(b); // b <<<= 7 + + // re-order b, c and d for the diagonal rounds + b = vextq_u32(b, b, 1); // b now holds positions 5,6,7,4 + c = vextq_u32(c, c, 2); // 10, 11, 8, 9 + d = vextq_u32(d, d, 3); // 15, 12, 13, 14 + + a = vaddq_u32(a, b); // a += b + d = veorq_u32(d, a); // d ^= a + d = chacha20_neon_vrotlq_16_u32(d); // d <<<= 16 + + c = vaddq_u32(c, d); // c += d + b = veorq_u32(b, c); // b ^= c + b = chacha20_neon_vrotlq_12_u32(b); // b <<<= 12 + + a = vaddq_u32(a, b); // a += b + d = veorq_u32(d, a); // d ^= a + d = chacha20_neon_vrotlq_8_u32(d); // d <<<= 8 + + c = vaddq_u32(c, d); // c += d + b = veorq_u32(b, c); // b ^= c + b = chacha20_neon_vrotlq_7_u32(b); // b <<<= 7 + + // restore element order in b, c, d + b = vextq_u32(b, b, 3); + c = vextq_u32(c, c, 2); + d = vextq_u32(d, d, 1); + } + + a = vaddq_u32(a, a1); + b = vaddq_u32(b, b1); + c = vaddq_u32(c, c1); + d = vaddq_u32(d, d1); + + /* Store into keystream */ + vst1q_u8(keystream + 0, vreinterpretq_u8_u32(a)); + vst1q_u8(keystream + 16, vreinterpretq_u8_u32(b)); + vst1q_u8(keystream + 32, vreinterpretq_u8_u32(c)); + vst1q_u8(keystream + 48, vreinterpretq_u8_u32(d)); +} + +#else + +#define ROTL32(value, amount) \ + ((uint32_t) ((value) << (amount)) | ((value) >> (32 - (amount)))) + /** * \brief ChaCha20 quarter round operation. * @@ -138,6 +246,8 @@ static void chacha20_block(const uint32_t initial_state[16], mbedtls_platform_zeroize(working_state, sizeof(working_state)); } +#endif + void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx) { mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context)); From 59453af46a2116b30e7c4d968abb0fa3e39c0005 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 13 Oct 2024 20:53:28 +0100 Subject: [PATCH 04/17] Move xor into Neon implementation Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 59 +++++++++++++++----- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 1816a38397a0..e684e3dc4172 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -65,15 +65,13 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v) return vsriq_n_u32(x, v, 25); } -static inline void chacha20_block(const uint32_t initial_state[16], unsigned char keystream[64]) +static inline void chacha20_block(uint32x4_t a, + uint32x4_t b, + uint32x4_t c, + uint32x4_t d, + uint8_t *output, + const uint8_t *input) { - /* Load state into NEON registers */ - uint32x4_t a = vld1q_u32(&initial_state[0]); - uint32x4_t b = vld1q_u32(&initial_state[4]); - uint32x4_t c = vld1q_u32(&initial_state[8]); - uint32x4_t d = vld1q_u32(&initial_state[12]); - - // capture initial values for use after the main loop const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d; for (int i = 0; i < 10; i++) { @@ -125,11 +123,10 @@ static inline void chacha20_block(const uint32_t initial_state[16], unsigned cha c = vaddq_u32(c, c1); d = vaddq_u32(d, d1); - /* Store into keystream */ - vst1q_u8(keystream + 0, vreinterpretq_u8_u32(a)); - vst1q_u8(keystream + 16, vreinterpretq_u8_u32(b)); - vst1q_u8(keystream + 32, vreinterpretq_u8_u32(c)); - vst1q_u8(keystream + 48, vreinterpretq_u8_u32(d)); + vst1q_u8(output + 0, veorq_u8(vld1q_u8(input + 0), vreinterpretq_u8_u32(a))); + vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(b))); + vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(c))); + vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(d))); } #else @@ -326,6 +323,41 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, size--; } +#if defined(MBEDTLS_HAVE_NEON_INTRINSICS) + /* Load state into NEON registers */ + uint32x4_t a = vld1q_u32(&ctx->state[0]); + uint32x4_t b = vld1q_u32(&ctx->state[4]); + uint32x4_t c = vld1q_u32(&ctx->state[8]); + uint32x4_t d = vld1q_u32(&ctx->state[12]); + + const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; + const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); + + /* Process full blocks */ + while (size >= CHACHA20_BLOCK_SIZE_BYTES) { + chacha20_block(a, b, c, d, output + offset, input + offset); + + d = vaddq_u32(d, inc_const); + + offset += CHACHA20_BLOCK_SIZE_BYTES; + size -= CHACHA20_BLOCK_SIZE_BYTES; + } + + /* Last (partial) block */ + if (size > 0U) { + /* Generate new keystream block and increment counter */ + memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES); + chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8); + d = vaddq_u32(d, inc_const); + + mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size); + + ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size; + } + + /* Capture state */ + vst1q_u32(&ctx->state[12], d); +#else /* Process full blocks */ while (size >= CHACHA20_BLOCK_SIZE_BYTES) { /* Generate new keystream block and increment counter */ @@ -349,6 +381,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size; } +#endif return 0; } From b0a9055c7a54f937be48aef1c22f07078428f699 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 13 Oct 2024 23:30:06 +0100 Subject: [PATCH 05/17] Introduce chacha20_neon_inc_counter Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index e684e3dc4172..5c89b39ab59a 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -65,6 +65,14 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v) return vsriq_n_u32(x, v, 25); } +// Increment the 32-bit element within v that corresponds to the ChaCha20 counter +static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v) +{ + const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; + const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); + return vaddq_u32(v, inc_const); +} + static inline void chacha20_block(uint32x4_t a, uint32x4_t b, uint32x4_t c, @@ -330,14 +338,11 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, uint32x4_t c = vld1q_u32(&ctx->state[8]); uint32x4_t d = vld1q_u32(&ctx->state[12]); - const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; - const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); - /* Process full blocks */ while (size >= CHACHA20_BLOCK_SIZE_BYTES) { chacha20_block(a, b, c, d, output + offset, input + offset); - d = vaddq_u32(d, inc_const); + d = chacha20_neon_inc_counter(d); offset += CHACHA20_BLOCK_SIZE_BYTES; size -= CHACHA20_BLOCK_SIZE_BYTES; @@ -348,7 +353,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, /* Generate new keystream block and increment counter */ memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES); chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8); - d = vaddq_u32(d, inc_const); + d = chacha20_neon_inc_counter(d); mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size); From 05f9e328af77d644cfb1893e04db09d9d3f2a678 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 13 Oct 2024 23:38:42 +0100 Subject: [PATCH 06/17] Tidy up Neon state into a struct Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 112 ++++++++++--------- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 5c89b39ab59a..32929df76f5d 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -73,68 +73,69 @@ static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v) return vaddq_u32(v, inc_const); } -static inline void chacha20_block(uint32x4_t a, - uint32x4_t b, - uint32x4_t c, - uint32x4_t d, - uint8_t *output, - const uint8_t *input) +typedef struct { + uint32x4_t a, b, c, d; +} chacha20_neon_regs_t; + +static inline void chacha20_block(chacha20_neon_regs_t r, + uint8_t *output, + const uint8_t *input) { - const uint32x4_t a1 = a, b1 = b, c1 = c, d1 = d; + const chacha20_neon_regs_t r_original = r; for (int i = 0; i < 10; i++) { - a = vaddq_u32(a, b); // a += b - d = veorq_u32(d, a); // d ^= a - d = chacha20_neon_vrotlq_16_u32(d); // d <<<= 16 + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - c = vaddq_u32(c, d); // c += d - b = veorq_u32(b, c); // b ^= c - b = chacha20_neon_vrotlq_12_u32(b); // b <<<= 12 + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - a = vaddq_u32(a, b); // a += b - d = veorq_u32(d, a); // d ^= a - d = chacha20_neon_vrotlq_8_u32(d); // d <<<= 8 + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - c = vaddq_u32(c, d); // c += d - b = veorq_u32(b, c); // b ^= c - b = chacha20_neon_vrotlq_7_u32(b); // b <<<= 7 + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 // re-order b, c and d for the diagonal rounds - b = vextq_u32(b, b, 1); // b now holds positions 5,6,7,4 - c = vextq_u32(c, c, 2); // 10, 11, 8, 9 - d = vextq_u32(d, d, 3); // 15, 12, 13, 14 + r.b = vextq_u32(r.b, r.b, 1); // r.b now holds positions 5,6,7,4 + r.c = vextq_u32(r.c, r.c, 2); // 10, 11, 8, 9 + r.d = vextq_u32(r.d, r.d, 3); // 15, 12, 13, 14 - a = vaddq_u32(a, b); // a += b - d = veorq_u32(d, a); // d ^= a - d = chacha20_neon_vrotlq_16_u32(d); // d <<<= 16 + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - c = vaddq_u32(c, d); // c += d - b = veorq_u32(b, c); // b ^= c - b = chacha20_neon_vrotlq_12_u32(b); // b <<<= 12 + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - a = vaddq_u32(a, b); // a += b - d = veorq_u32(d, a); // d ^= a - d = chacha20_neon_vrotlq_8_u32(d); // d <<<= 8 + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - c = vaddq_u32(c, d); // c += d - b = veorq_u32(b, c); // b ^= c - b = chacha20_neon_vrotlq_7_u32(b); // b <<<= 7 + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 // restore element order in b, c, d - b = vextq_u32(b, b, 3); - c = vextq_u32(c, c, 2); - d = vextq_u32(d, d, 1); + r.b = vextq_u32(r.b, r.b, 3); + r.c = vextq_u32(r.c, r.c, 2); + r.d = vextq_u32(r.d, r.d, 1); } - a = vaddq_u32(a, a1); - b = vaddq_u32(b, b1); - c = vaddq_u32(c, c1); - d = vaddq_u32(d, d1); - - vst1q_u8(output + 0, veorq_u8(vld1q_u8(input + 0), vreinterpretq_u8_u32(a))); - vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(b))); - vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(c))); - vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(d))); + r.a = vaddq_u32(r.a, r_original.a); + r.b = vaddq_u32(r.b, r_original.b); + r.c = vaddq_u32(r.c, r_original.c); + r.d = vaddq_u32(r.d, r_original.d); + + vst1q_u8(output + 0, veorq_u8(vld1q_u8(input + 0), vreinterpretq_u8_u32(r.a))); + vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(r.b))); + vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(r.c))); + vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(r.d))); } #else @@ -333,16 +334,17 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, #if defined(MBEDTLS_HAVE_NEON_INTRINSICS) /* Load state into NEON registers */ - uint32x4_t a = vld1q_u32(&ctx->state[0]); - uint32x4_t b = vld1q_u32(&ctx->state[4]); - uint32x4_t c = vld1q_u32(&ctx->state[8]); - uint32x4_t d = vld1q_u32(&ctx->state[12]); + chacha20_neon_regs_t state; + state.a = vld1q_u32(&ctx->state[0]); + state.b = vld1q_u32(&ctx->state[4]); + state.c = vld1q_u32(&ctx->state[8]); + state.d = vld1q_u32(&ctx->state[12]); /* Process full blocks */ while (size >= CHACHA20_BLOCK_SIZE_BYTES) { - chacha20_block(a, b, c, d, output + offset, input + offset); + chacha20_block(state, output + offset, input + offset); - d = chacha20_neon_inc_counter(d); + state.d = chacha20_neon_inc_counter(state.d); offset += CHACHA20_BLOCK_SIZE_BYTES; size -= CHACHA20_BLOCK_SIZE_BYTES; @@ -352,8 +354,8 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, if (size > 0U) { /* Generate new keystream block and increment counter */ memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES); - chacha20_block(a, b, c, d, ctx->keystream8, ctx->keystream8); - d = chacha20_neon_inc_counter(d); + chacha20_block(state, ctx->keystream8, ctx->keystream8); + state.d = chacha20_neon_inc_counter(state.d); mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size); @@ -361,7 +363,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, } /* Capture state */ - vst1q_u32(&ctx->state[12], d); + vst1q_u32(&ctx->state[12], state.d); #else /* Process full blocks */ while (size >= CHACHA20_BLOCK_SIZE_BYTES) { From 864fed715eb5a52d12896976c03d1779d65d2b5a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Sun, 13 Oct 2024 23:05:04 +0100 Subject: [PATCH 07/17] Refactor to prepare for multiblock Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 154 +++++++++++-------- 1 file changed, 89 insertions(+), 65 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 32929df76f5d..192656deab56 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -27,7 +27,6 @@ #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U) #if defined(MBEDTLS_HAVE_NEON_INTRINSICS) - // Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11, // and some more recent versions. @@ -68,74 +67,101 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v) // Increment the 32-bit element within v that corresponds to the ChaCha20 counter static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v) { - const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; - const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); - return vaddq_u32(v, inc_const); + const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; + const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); + return vaddq_u32(v, inc_const); } typedef struct { uint32x4_t a, b, c, d; } chacha20_neon_regs_t; -static inline void chacha20_block(chacha20_neon_regs_t r, - uint8_t *output, - const uint8_t *input) +static inline chacha20_neon_regs_t chacha20_neon_singlepass(chacha20_neon_regs_t r) { - const chacha20_neon_regs_t r_original = r; - - for (int i = 0; i < 10; i++) { - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 - - // re-order b, c and d for the diagonal rounds - r.b = vextq_u32(r.b, r.b, 1); // r.b now holds positions 5,6,7,4 - r.c = vextq_u32(r.c, r.c, 2); // 10, 11, 8, 9 - r.d = vextq_u32(r.d, r.d, 3); // 15, 12, 13, 14 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 - - // restore element order in b, c, d - r.b = vextq_u32(r.b, r.b, 3); - r.c = vextq_u32(r.c, r.c, 2); - r.d = vextq_u32(r.d, r.d, 1); - } + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 + + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 + + // re-order b, c and d for the diagonal rounds + r.b = vextq_u32(r.b, r.b, 1); // r.b now holds positions 5,6,7,4 + r.c = vextq_u32(r.c, r.c, 2); // 10, 11, 8, 9 + r.d = vextq_u32(r.d, r.d, 3); // 15, 12, 13, 14 + + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 + + // restore element order in b, c, d + r.b = vextq_u32(r.b, r.b, 3); + r.c = vextq_u32(r.c, r.c, 2); + r.d = vextq_u32(r.d, r.d, 1); + + return r; +} + +static inline void chacha20_neon_finish_block(chacha20_neon_regs_t r, + chacha20_neon_regs_t r_original, + uint8_t **output, + const uint8_t **input) +{ r.a = vaddq_u32(r.a, r_original.a); r.b = vaddq_u32(r.b, r_original.b); r.c = vaddq_u32(r.c, r_original.c); r.d = vaddq_u32(r.d, r_original.d); - - vst1q_u8(output + 0, veorq_u8(vld1q_u8(input + 0), vreinterpretq_u8_u32(r.a))); - vst1q_u8(output + 16, veorq_u8(vld1q_u8(input + 16), vreinterpretq_u8_u32(r.b))); - vst1q_u8(output + 32, veorq_u8(vld1q_u8(input + 32), vreinterpretq_u8_u32(r.c))); - vst1q_u8(output + 48, veorq_u8(vld1q_u8(input + 48), vreinterpretq_u8_u32(r.d))); + + vst1q_u8(*output + 0, veorq_u8(vld1q_u8(*input + 0), vreinterpretq_u8_u32(r.a))); + vst1q_u8(*output + 16, veorq_u8(vld1q_u8(*input + 16), vreinterpretq_u8_u32(r.b))); + vst1q_u8(*output + 32, veorq_u8(vld1q_u8(*input + 32), vreinterpretq_u8_u32(r.c))); + vst1q_u8(*output + 48, veorq_u8(vld1q_u8(*input + 48), vreinterpretq_u8_u32(r.d))); + + *input += CHACHA20_BLOCK_SIZE_BYTES; + *output += CHACHA20_BLOCK_SIZE_BYTES; +} + +static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original, + uint8_t *output, + const uint8_t *input, + size_t blocks) +{ + for (;;) { + chacha20_neon_regs_t r[1]; + + r[0] = r_original; + + for (unsigned i = 0; i < 10; i++) { + r[0] = chacha20_neon_singlepass(r[0]); + } + + chacha20_neon_finish_block(r[0], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } + } } #else @@ -341,21 +367,19 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, state.d = vld1q_u32(&ctx->state[12]); /* Process full blocks */ - while (size >= CHACHA20_BLOCK_SIZE_BYTES) { - chacha20_block(state, output + offset, input + offset); + if (size >= CHACHA20_BLOCK_SIZE_BYTES) { + size_t blocks = size / CHACHA20_BLOCK_SIZE_BYTES; + state.d = chacha20_neon_blocks(state, output + offset, input + offset, blocks); - state.d = chacha20_neon_inc_counter(state.d); - - offset += CHACHA20_BLOCK_SIZE_BYTES; - size -= CHACHA20_BLOCK_SIZE_BYTES; + offset += CHACHA20_BLOCK_SIZE_BYTES * blocks; + size -= CHACHA20_BLOCK_SIZE_BYTES * blocks; } /* Last (partial) block */ if (size > 0U) { /* Generate new keystream block and increment counter */ memset(ctx->keystream8, 0, CHACHA20_BLOCK_SIZE_BYTES); - chacha20_block(state, ctx->keystream8, ctx->keystream8); - state.d = chacha20_neon_inc_counter(state.d); + state.d = chacha20_neon_blocks(state, ctx->keystream8, ctx->keystream8, 1); mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size); From 79381040a44a12210ae816ad2c15c52e7fac5f37 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 15 Oct 2024 23:37:27 +0100 Subject: [PATCH 08/17] Add multiblock support Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 105 ++++++++++++++++++- 1 file changed, 100 insertions(+), 5 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 192656deab56..28cd6318e22f 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -26,9 +26,46 @@ #define CHACHA20_BLOCK_SIZE_BYTES (4U * 16U) -#if defined(MBEDTLS_HAVE_NEON_INTRINSICS) -// Tested on all combinations of armv7 arm/thumb2; armv8 arm/thumb2/aarch64 on clang 14, gcc 11, -// and some more recent versions. +/* + * The Neon implementation can be configured to process multiple blocks in parallel; increasing the + * number of blocks gains a lot of performance, but adds on average around 250 bytes of code size + * for each additional block. + * + * This is controlled by setting MBEDTLS_CHACHA20_NEON_MULTIBLOCK in the range [0..6] (0 selects + * the scalar implementation; 1 selects single-block Neon; 2..6 select multi-block Neon). + * + * The default (i.e., if MBEDTLS_CHACHA20_NEON_MULTIBLOCK is not set) selects the fastest variant + * which has better code size than the scalar implementation (based on testing for Aarch64 on clang + * and gcc). + * + * Size & performance notes for Neon implementation from informal tests on Aarch64 + * (applies to both gcc and clang except as noted): + * - When single-block is selected, this saves around 400-550 bytes of code-size c.f. the scalar + * implementation + * - Multi-block Neon is smaller and faster than scalar (up to 2 blocks for gcc, 3 for clang) + * - Code size increases consistently with number of blocks + * - Performance increases with number of blocks (except at 5 which is slightly slower than 4) + * - Performance is within a few % for gcc vs clang at all settings + * - Performance at 4 blocks roughly matches our hardware accelerated AES-GCM impl with + * better code size + * - Performance is worse at 7 or more blocks, due to running out of Neon registers + */ + +#if !defined(MBEDTLS_HAVE_NEON_INTRINSICS) +// Select scalar implementation if Neon not available + #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 0 +#elif !defined(MBEDTLS_CHACHA20_NEON_MULTIBLOCK) +// By default, select the best performing option that is smaller than the scalar implementation. + #if defined(MBEDTLS_COMPILER_IS_GCC) + #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2 + #else + #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 3 + #endif +#endif + +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0 +// Tested on all combinations of Armv7 arm/thumb2; Armv8 arm/thumb2/aarch64; Armv8 aarch64_be on +// clang 14, gcc 11, and some more recent versions. // Define rotate-left operations that rotate within each 32-bit element in a 128-bit vector. static inline uint32x4_t chacha20_neon_vrotlq_16_u32(uint32x4_t v) @@ -142,18 +179,41 @@ static inline void chacha20_neon_finish_block(chacha20_neon_regs_t r, *output += CHACHA20_BLOCK_SIZE_BYTES; } +// Prevent gcc from rolling up the (manually unrolled) interleaved block loops +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original, uint8_t *output, const uint8_t *input, size_t blocks) { + // Assuming 32 regs, with 4 for original values plus 4 for scratch, with 4 regs per block, + // we should be able to process up to 24/4 = 6 blocks simultaneously. + // Testing confirms that perf indeed increases with more blocks, and then falls off after 6. + for (;;) { - chacha20_neon_regs_t r[1]; + chacha20_neon_regs_t r[6]; + // It's essential to unroll these loops to benefit from interleaving multiple blocks. + // If MBEDTLS_CHACHA20_NEON_MULTIBLOCK < 6, gcc and clang will optimise away the unused bits r[0] = r_original; + r[1] = r_original; + r[2] = r_original; + r[3] = r_original; + r[4] = r_original; + r[5] = r_original; + r[1].d = chacha20_neon_inc_counter(r[0].d); + r[2].d = chacha20_neon_inc_counter(r[1].d); + r[3].d = chacha20_neon_inc_counter(r[2].d); + r[4].d = chacha20_neon_inc_counter(r[3].d); + r[5].d = chacha20_neon_inc_counter(r[4].d); for (unsigned i = 0; i < 10; i++) { r[0] = chacha20_neon_singlepass(r[0]); + r[1] = chacha20_neon_singlepass(r[1]); + r[2] = chacha20_neon_singlepass(r[2]); + r[3] = chacha20_neon_singlepass(r[3]); + r[4] = chacha20_neon_singlepass(r[4]); + r[5] = chacha20_neon_singlepass(r[5]); } chacha20_neon_finish_block(r[0], r_original, &output, &input); @@ -161,6 +221,41 @@ static inline uint32x4_t chacha20_neon_blocks(chacha20_neon_regs_t r_original, if (--blocks == 0) { return r_original.d; } +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 2 + chacha20_neon_finish_block(r[1], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } +#endif +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 3 + chacha20_neon_finish_block(r[2], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } +#endif +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 4 + chacha20_neon_finish_block(r[3], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } +#endif +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 5 + chacha20_neon_finish_block(r[4], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } +#endif +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK >= 6 + chacha20_neon_finish_block(r[5], r_original, &output, &input); + r_original.d = chacha20_neon_inc_counter(r_original.d); + if (--blocks == 0) { + return r_original.d; + } +#endif } } @@ -358,7 +453,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, size--; } -#if defined(MBEDTLS_HAVE_NEON_INTRINSICS) +#if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0 /* Load state into NEON registers */ chacha20_neon_regs_t state; state.a = vld1q_u32(&ctx->state[0]); From f9c926881dad5086c9f0deb5c25b983b8d58be03 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 15 Oct 2024 23:37:38 +0100 Subject: [PATCH 09/17] Code size improvement Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 70 ++++++++------------ 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 28cd6318e22f..0ae5f7c7e9f8 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -115,47 +115,35 @@ typedef struct { static inline chacha20_neon_regs_t chacha20_neon_singlepass(chacha20_neon_regs_t r) { - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 - - // re-order b, c and d for the diagonal rounds - r.b = vextq_u32(r.b, r.b, 1); // r.b now holds positions 5,6,7,4 - r.c = vextq_u32(r.c, r.c, 2); // 10, 11, 8, 9 - r.d = vextq_u32(r.d, r.d, 3); // 15, 12, 13, 14 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 - - r.a = vaddq_u32(r.a, r.b); // r.a += b - r.d = veorq_u32(r.d, r.a); // r.d ^= a - r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 - - r.c = vaddq_u32(r.c, r.d); // r.c += d - r.b = veorq_u32(r.b, r.c); // r.b ^= c - r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 - - // restore element order in b, c, d - r.b = vextq_u32(r.b, r.b, 3); - r.c = vextq_u32(r.c, r.c, 2); - r.d = vextq_u32(r.d, r.d, 1); + for (unsigned i = 0; i < 2; i++) { + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_16_u32(r.d); // r.d <<<= 16 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_12_u32(r.b); // r.b <<<= 12 + + r.a = vaddq_u32(r.a, r.b); // r.a += b + r.d = veorq_u32(r.d, r.a); // r.d ^= a + r.d = chacha20_neon_vrotlq_8_u32(r.d); // r.d <<<= 8 + + r.c = vaddq_u32(r.c, r.d); // r.c += d + r.b = veorq_u32(r.b, r.c); // r.b ^= c + r.b = chacha20_neon_vrotlq_7_u32(r.b); // r.b <<<= 7 + + if (i == 0) { + // re-order b, c and d for the diagonal rounds + r.b = vextq_u32(r.b, r.b, 1); // r.b now holds positions 5,6,7,4 + r.c = vextq_u32(r.c, r.c, 2); // 10, 11, 8, 9 + r.d = vextq_u32(r.d, r.d, 3); // 15, 12, 13, 14 + } else { + // restore element order in b, c, d + r.b = vextq_u32(r.b, r.b, 3); + r.c = vextq_u32(r.c, r.c, 2); + r.d = vextq_u32(r.d, r.d, 1); + } + } return r; } From 50292e82aa5d660b2e4a45d1457416a30411a256 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 11 Oct 2024 18:02:24 +0100 Subject: [PATCH 10/17] add Changelog entry Signed-off-by: Dave Rodgman --- ChangeLog.d/chacha20-neon.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 ChangeLog.d/chacha20-neon.txt diff --git a/ChangeLog.d/chacha20-neon.txt b/ChangeLog.d/chacha20-neon.txt new file mode 100644 index 000000000000..946941bb65b3 --- /dev/null +++ b/ChangeLog.d/chacha20-neon.txt @@ -0,0 +1,3 @@ + * ChaCha20 size and performance: add a Neon implementation of ChaCha20 for + Thumb2 and 32 and 64-bit Arm, for Armv7 onwards. At default settings, + this improves performance by around 2x to 2.3x on Aarch64. From a8e9a884bd5582f9d5a9adf22472380b45864d2b Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 15 Oct 2024 23:23:32 +0100 Subject: [PATCH 11/17] Test all ChaCha20 Neon scalar and multiblock variations Signed-off-by: Dave Rodgman --- .../components-configuration-crypto.sh | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/scripts/components-configuration-crypto.sh b/tests/scripts/components-configuration-crypto.sh index 74ebb793d70e..82190f1130f8 100644 --- a/tests/scripts/components-configuration-crypto.sh +++ b/tests/scripts/components-configuration-crypto.sh @@ -2368,6 +2368,42 @@ END ./tf-psa-crypto/tests/test_suite_shax } + +support_test_chacha20_variations () { + case $(uname -m) in + aarch64) true;; + *) false;; + esac +} + +component_test_chacha20_neon_variations () { + msg "ChaCha20 Neon scalar and multiblock variations" + + # define minimal config sufficient to test ChaCha20 + cat > include/mbedtls/mbedtls_config.h << END + #define MBEDTLS_AES_C + #define MBEDTLS_CHACHA20_C + #define MBEDTLS_ENTROPY_C + #define MBEDTLS_CTR_DRBG_C + #define MBEDTLS_PSA_CRYPTO_C + #define MBEDTLS_PSA_CRYPTO_CONFIG + #define MBEDTLS_SELF_TEST +END + + cat > tf-psa-crypto/include/psa/crypto_config.h << END + #define PSA_WANT_ALG_SHA_256 1 +END + + make clean + for x in 0 1 2 3 4 5 6; do + msg "multiblock = $x" + make clean + make -C tests ../tf-psa-crypto/tests/test_suite_chacha20 CFLAGS="-DMBEDTLS_CHACHA20_NEON_MULTIBLOCK=$x" + ./tf-psa-crypto/tests/test_suite_chacha20 + done +} + + support_build_aes_aesce_armcc () { support_build_armcc } From cd36f2546ce4d0291a5e8785a4c4a8b33c501e04 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 16 Oct 2024 09:17:14 +0100 Subject: [PATCH 12/17] Fix changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/chacha20-neon.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.d/chacha20-neon.txt b/ChangeLog.d/chacha20-neon.txt index 946941bb65b3..220d58eb900e 100644 --- a/ChangeLog.d/chacha20-neon.txt +++ b/ChangeLog.d/chacha20-neon.txt @@ -1,3 +1,4 @@ +Features * ChaCha20 size and performance: add a Neon implementation of ChaCha20 for Thumb2 and 32 and 64-bit Arm, for Armv7 onwards. At default settings, this improves performance by around 2x to 2.3x on Aarch64. From c55b1593f5def33caa5d56701fc5e16c3e642de9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 16 Oct 2024 09:23:08 +0100 Subject: [PATCH 13/17] Fix ABI break Signed-off-by: Dave Rodgman --- .../drivers/builtin/include/mbedtls/chacha20.h | 6 +++--- tf-psa-crypto/drivers/builtin/src/chacha20.c | 13 ++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h index c7df5bd8a946..ab7195e1c2c3 100644 --- a/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h +++ b/tf-psa-crypto/drivers/builtin/include/mbedtls/chacha20.h @@ -34,9 +34,9 @@ extern "C" { #endif typedef struct mbedtls_chacha20_context { - uint32_t MBEDTLS_PRIVATE(state)[16]; /*! The state (before round operations). */ - uint8_t MBEDTLS_PRIVATE(keystream8)[64]; /*! Leftover keystream bytes. */ - size_t MBEDTLS_PRIVATE(keystream_bytes_remaining); /*! Number of not-used keystream bytes */ + uint32_t MBEDTLS_PRIVATE(state)[16]; /*! The state (before round operations). */ + uint8_t MBEDTLS_PRIVATE(keystream8)[64]; /*! Leftover keystream bytes. */ + size_t MBEDTLS_PRIVATE(keystream_bytes_used); /*! Number of keystream bytes already used. */ } mbedtls_chacha20_context; diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 0ae5f7c7e9f8..569491d1d208 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -418,7 +418,7 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx, } /* Initially, there's no keystream bytes available */ - ctx->keystream_bytes_remaining = 0U; + ctx->keystream_bytes_used = 0U; return 0; } @@ -431,12 +431,11 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, size_t offset = 0U; /* Use leftover keystream bytes, if available */ - while (size > 0U && ctx->keystream_bytes_remaining > 0U) { - output[offset] = input[offset] - ^ ctx->keystream8[CHACHA20_BLOCK_SIZE_BYTES - - ctx->keystream_bytes_remaining]; + while (size > 0U && ctx->keystream_bytes_used > 0U && + ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) { + output[offset] = input[offset] ^ ctx->keystream8[ctx->keystream_bytes_used]; - ctx->keystream_bytes_remaining--; + ctx->keystream_bytes_used = (ctx->keystream_bytes_used + 1) % CHACHA20_BLOCK_SIZE_BYTES; offset++; size--; } @@ -466,7 +465,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, mbedtls_xor_no_simd(output + offset, input + offset, ctx->keystream8, size); - ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size; + ctx->keystream_bytes_used = size; } /* Capture state */ From 512f2e509aaf0eff5a9510dcb236706d8851333a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 16 Oct 2024 10:24:44 +0100 Subject: [PATCH 14/17] Fix ABI break (revert to original behaviour) Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index 569491d1d208..a62b731182ea 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -366,6 +366,9 @@ static void chacha20_block(const uint32_t initial_state[16], void mbedtls_chacha20_init(mbedtls_chacha20_context *ctx) { mbedtls_platform_zeroize(ctx, sizeof(mbedtls_chacha20_context)); + + /* Initially, there's no keystream bytes available */ + ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES; } void mbedtls_chacha20_free(mbedtls_chacha20_context *ctx) @@ -418,7 +421,7 @@ int mbedtls_chacha20_starts(mbedtls_chacha20_context *ctx, } /* Initially, there's no keystream bytes available */ - ctx->keystream_bytes_used = 0U; + ctx->keystream_bytes_used = CHACHA20_BLOCK_SIZE_BYTES; return 0; } @@ -431,11 +434,10 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, size_t offset = 0U; /* Use leftover keystream bytes, if available */ - while (size > 0U && ctx->keystream_bytes_used > 0U && - ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) { + while (size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES) { output[offset] = input[offset] ^ ctx->keystream8[ctx->keystream_bytes_used]; - ctx->keystream_bytes_used = (ctx->keystream_bytes_used + 1) % CHACHA20_BLOCK_SIZE_BYTES; + ctx->keystream_bytes_used++; offset++; size--; } @@ -491,7 +493,7 @@ int mbedtls_chacha20_update(mbedtls_chacha20_context *ctx, mbedtls_xor(output + offset, input + offset, ctx->keystream8, size); - ctx->keystream_bytes_remaining = CHACHA20_BLOCK_SIZE_BYTES - size; + ctx->keystream_bytes_used = size; } #endif From 6f632fc99e61266e5bb2cedf7f7ca9d3696c6736 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 17 Oct 2024 09:16:13 +0100 Subject: [PATCH 15/17] Size: improve counter increment operation Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index a62b731182ea..a4a12add6f6e 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -104,9 +104,12 @@ static inline uint32x4_t chacha20_neon_vrotlq_7_u32(uint32x4_t v) // Increment the 32-bit element within v that corresponds to the ChaCha20 counter static inline uint32x4_t chacha20_neon_inc_counter(uint32x4_t v) { - const uint32_t inc_const_scalar[4] = { 1, 0, 0, 0 }; - const uint32x4_t inc_const = vld1q_u32(inc_const_scalar); - return vaddq_u32(v, inc_const); + if (MBEDTLS_IS_BIG_ENDIAN) { + v[3]++; + } else { + v[0]++; + } + return v; } typedef struct { From 405cbc7017617516f029532ed51fbcb79ebe70aa Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 23 Oct 2024 14:33:32 +0530 Subject: [PATCH 16/17] Add MBEDTLS_ARCH_IS_THUMB Signed-off-by: Dave Rodgman --- include/mbedtls/build_info.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/mbedtls/build_info.h b/include/mbedtls/build_info.h index 2b48e50341e2..4f2b22ce87a4 100644 --- a/include/mbedtls/build_info.h +++ b/include/mbedtls/build_info.h @@ -50,6 +50,11 @@ #define MBEDTLS_ARCH_IS_ARM32 #endif +#if !defined(MBEDTLS_ARCH_IS_THUMB) && \ + defined(_M_ARMT) || defined(__thumb__) || defined(__thumb2__) +#define MBEDTLS_ARCH_IS_THUMB +#endif + #if !defined(MBEDTLS_ARCH_IS_X64) && \ (defined(__amd64__) || defined(__x86_64__) || \ ((defined(_M_X64) || defined(_M_AMD64)) && !defined(_M_ARM64EC))) From a8c28e5c39c736cfebf6930673011ef40c6bf5d2 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 23 Oct 2024 14:33:48 +0530 Subject: [PATCH 17/17] Adjust defaults for Thumb to keep size down Signed-off-by: Dave Rodgman --- tf-psa-crypto/drivers/builtin/src/chacha20.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tf-psa-crypto/drivers/builtin/src/chacha20.c b/tf-psa-crypto/drivers/builtin/src/chacha20.c index a4a12add6f6e..7affe324a867 100644 --- a/tf-psa-crypto/drivers/builtin/src/chacha20.c +++ b/tf-psa-crypto/drivers/builtin/src/chacha20.c @@ -56,12 +56,21 @@ #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 0 #elif !defined(MBEDTLS_CHACHA20_NEON_MULTIBLOCK) // By default, select the best performing option that is smaller than the scalar implementation. +#if defined(MBEDTLS_ARCH_IS_THUMB) +// For Thumb, we need a smaller multiblock settting to be smaller than the scalar implementation + #if defined(MBEDTLS_COMPILER_IS_GCC) + #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 1 + #else + #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2 + #endif +#else // arm or aarch64 #if defined(MBEDTLS_COMPILER_IS_GCC) #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 2 #else #define MBEDTLS_CHACHA20_NEON_MULTIBLOCK 3 #endif #endif +#endif #if MBEDTLS_CHACHA20_NEON_MULTIBLOCK != 0 // Tested on all combinations of Armv7 arm/thumb2; Armv8 arm/thumb2/aarch64; Armv8 aarch64_be on