From a87eb39fe77d8125b6b8f2f4c00b384e8338fa74 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Thu, 18 Apr 2024 22:36:34 +0100
Subject: [PATCH 1/7] Align uint8_t array to keep UBSAN happy

---
 Source/astcenc_internal.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index 715028ac8..6995c1e18 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -326,10 +326,10 @@ struct partition_info
 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
 
 	/** @brief The partition of each texel in the block. */
-	uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
 
 	/** @brief The list of texels in each partition. */
-	uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
 };
 
 /**
@@ -367,19 +367,19 @@ struct decimation_info
 	 * @brief The number of weights that contribute to each texel.
 	 * Value is between 1 and 4.
 	 */
-	uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
 
 	/**
 	 * @brief The weight index of the N weights that are interpolated for each texel.
 	 * Stored transposed to improve vectorization.
 	 */
-	uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
 
 	/**
 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
 	 * Value is between 0 and 16, stored transposed to improve vectorization.
 	 */
-	uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
 
 	/**
 	 * @brief The bilinear contribution of the N weights that are interpolated for each texel.
@@ -388,13 +388,13 @@ struct decimation_info
 	ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
 
 	/** @brief The number of texels that each stored weight contributes to. */
-	uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
 
 	/**
 	 * @brief The list of texels that use a specific weight index.
 	 * Stored transposed to improve vectorization.
 	 */
-	uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
 
 	/**
 	 * @brief The bilinear contribution to the N texels that use each weight.
@@ -957,7 +957,7 @@ struct ASTCENC_ALIGNAS compression_working_buffers
 	 *
 	 * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
 	 */
-	uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
 
 	/** @brief Error of the best encoding combination for each block mode. */
 	ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
@@ -1111,7 +1111,7 @@ struct symbolic_compressed_block
 	 *
 	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
 	 */
-	uint8_t weights[BLOCK_MAX_WEIGHTS];
+	ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
 
 	/**
 	 * @brief Get the weight quantization used by this block mode.

From dc94e8c996b8627ec55d532858a184de85a6d57f Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 20:06:49 +0100
Subject: [PATCH 2/7] Avoid undefined shifts of sign bit in signed ints

---
 Source/astcenc_vecmathlib.h        |  3 +--
 Source/astcenc_vecmathlib_none_4.h | 15 +++++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index d48f1d73e..5c9462fa1 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -461,8 +461,7 @@ static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
 	p = p & vint4(0xFFFF);
 
 	p = lsr<6>(p);
-
-	p = p | lsl<10>(vint4(14) - lz);
+	p = p | lsl<10>((vint4(14) - lz));
 
 	vint4 r = select(p, fp16_one, is_one);
 	r = select(r, fp16_small, is_small);
diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h
index 1c95c2ff8..924a0e7ce 100644
--- a/Source/astcenc_vecmathlib_none_4.h
+++ b/Source/astcenc_vecmathlib_none_4.h
@@ -556,10 +556,16 @@ ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
  */
 template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
 {
-	return vint4(a.m[0] << s,
-	             a.m[1] << s,
-	             a.m[2] << s,
-	             a.m[3] << s);
+	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
+	unsigned int as0 = static_cast<unsigned int>(a.m[0]) << s;
+	unsigned int as1 = static_cast<unsigned int>(a.m[1]) << s;
+	unsigned int as2 = static_cast<unsigned int>(a.m[2]) << s;
+	unsigned int as3 = static_cast<unsigned int>(a.m[3]) << s;
+
+	return vint4(static_cast<int>(as0),
+	             static_cast<int>(as1),
+	             static_cast<int>(as2),
+	             static_cast<int>(as3));
 }
 
 /**
@@ -567,6 +573,7 @@ template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
  */
 template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
 {
+	// Cast to unsigned to avoid shift in/out of sign bit undefined behavior
 	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
 	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
 	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;

From 00206dac7f8dfb347316672be28c0285827167c6 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 21:15:05 +0100
Subject: [PATCH 3/7] Zero init SIMD overspill

Ensures that SIMD overspill is ZI for decompression, in order to
keep UBSAN happy. In reality the UB SIMD lanes are never actually
written back to memory, but this keeps UBSAN useful as a tool.
---
 Source/astcenc_entry.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/astcenc_entry.cpp b/Source/astcenc_entry.cpp
index 71efe9cec..5dc380160 100644
--- a/Source/astcenc_entry.cpp
+++ b/Source/astcenc_entry.cpp
@@ -1167,7 +1167,7 @@ astcenc_error astcenc_decompress_image(
 		return ASTCENC_ERR_OUT_OF_MEM;
 	}
 
-	image_block blk;
+	image_block blk {};
 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
 
 	// Decode mode inferred from the output data type

From 293ef40008ebcf58d36e391f30bf41b71210c11c Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 21:38:23 +0100
Subject: [PATCH 4/7] Add padding to arrays for UB unaligned load

---
 Source/astcenc_internal.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
index 6995c1e18..df6e07f9e 100644
--- a/Source/astcenc_internal.h
+++ b/Source/astcenc_internal.h
@@ -732,7 +732,11 @@ struct block_size_descriptor
  *
  * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
  * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
- * data is stored as direct UNORM data, HDR data is stored as LNS data.
+ * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
+ * elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
+ * 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
+ * two elements above the last real data element). The overspill values are never written to memory,
+ * and would be benign, but the padding avoids hitting undefined behavior.
  *
  * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
  * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
@@ -740,16 +744,16 @@ struct block_size_descriptor
 struct image_block
 {
 	/** @brief The input (compress) or output (decompress) data for the red color component. */
-	ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
 
 	/** @brief The input (compress) or output (decompress) data for the green color component. */
-	ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
 
 	/** @brief The input (compress) or output (decompress) data for the blue color component. */
-	ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
 
 	/** @brief The input (compress) or output (decompress) data for the alpha color component. */
-	ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS];
+	ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
 
 	/** @brief The number of texels in the block. */
 	uint8_t texel_count;

From 81c2d75d2a8976c52e6fa75b976072f63c339d6b Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 21:51:23 +0100
Subject: [PATCH 5/7] Revert whitespace change

---
 Source/astcenc_vecmathlib.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index 5c9462fa1..546aecda9 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -461,6 +461,7 @@ static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
 	p = p & vint4(0xFFFF);
 
 	p = lsr<6>(p);
+
 	p = p | lsl<10>((vint4(14) - lz));
 
 	vint4 r = select(p, fp16_one, is_one);

From b6740340a99b243abb4442b0e6fbec235c1762a1 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 21:51:53 +0100
Subject: [PATCH 6/7] Update copyright year

---
 Source/astcenc_vecmathlib_none_4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h
index 924a0e7ce..be7348eff 100644
--- a/Source/astcenc_vecmathlib_none_4.h
+++ b/Source/astcenc_vecmathlib_none_4.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2019-2023 Arm Limited
+// Copyright 2019-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy

From e17acf05263be5d804e718f42ca9cb422d3c3e1b Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Apr 2024 21:52:37 +0100
Subject: [PATCH 7/7] Remove formatting change

---
 Source/astcenc_vecmathlib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/astcenc_vecmathlib.h b/Source/astcenc_vecmathlib.h
index 546aecda9..d48f1d73e 100644
--- a/Source/astcenc_vecmathlib.h
+++ b/Source/astcenc_vecmathlib.h
@@ -462,7 +462,7 @@ static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
 
 	p = lsr<6>(p);
 
-	p = p | lsl<10>((vint4(14) - lz));
+	p = p | lsl<10>(vint4(14) - lz);
 
 	vint4 r = select(p, fp16_one, is_one);
 	r = select(r, fp16_small, is_small);