Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Arm SVE 8-wide (256b) implementation #480

Merged
merged 4 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 22 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ include(CTest)
# SIMD backend selection. Every backend is opt-in: each option below defaults
# to OFF and must be enabled explicitly at configure time.
option(ASTCENC_ISA_AVX2 "Enable astcenc builds for AVX2 SIMD" OFF)
option(ASTCENC_ISA_SSE41 "Enable astcenc builds for SSE4.1 SIMD" OFF)
option(ASTCENC_ISA_SSE2 "Enable astcenc builds for SSE2 SIMD" OFF)
option(ASTCENC_ISA_SVE_256 "Enable astcenc builds for 256-bit SVE SIMD" OFF)
option(ASTCENC_ISA_NEON "Enable astcenc builds for NEON SIMD" OFF)
option(ASTCENC_ISA_NONE "Enable astcenc builds for no SIMD" OFF)
option(ASTCENC_ISA_NATIVE "Enable astcenc builds for native SIMD" OFF)
Expand Down Expand Up @@ -86,7 +87,7 @@ endforeach()

# Count options which MUST be arm64
set(ASTCENC_ARM64_ISA_COUNT 0)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON})
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_SVE_256})
foreach(ASTCENC_CONFIG ${ASTCENC_CONFIGS})
if(${ASTCENC_CONFIG})
math(EXPR ASTCENC_ARM64_ISA_COUNT "${ASTCENC_ARM64_ISA_COUNT} + 1")
Expand Down Expand Up @@ -117,22 +118,28 @@ if("${ASTCENC_BLOCK_MAX_TEXELS}")
message(STATUS " Max block texels - ${ASTCENC_BLOCK_MAX_TEXELS}")
endif()

printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
printopt("NEON backend " ${ASTCENC_ISA_NEON})
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
message(STATUS "Arm backend options")
printopt("SVE 256b backend " ${ASTCENC_ISA_SVE_256})
printopt("NEON backend " ${ASTCENC_ISA_NEON})
message(STATUS "x86-64 backend options")
printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
message(STATUS "Agnostic backend options")
printopt("NONE backend " ${ASTCENC_ISA_NONE})
printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})
message(STATUS "Build options")
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
printopt("Universal bin " ${ASTCENC_UNIVERSAL_BUILD})
endif()
printopt("Invariance " ${ASTCENC_INVARIANCE})
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
printopt("ASAN " ${ASTCENC_ASAN})
printopt("UBSAN " ${ASTCENC_UBSAN})
printopt("Unit tests " ${ASTCENC_UNITTEST})
printopt("Invariance " ${ASTCENC_INVARIANCE})
printopt("Shared libs " ${ASTCENC_SHAREDLIB})
printopt("Decompressor " ${ASTCENC_DECOMPRESSOR})
message(STATUS "Developer options")
printopt("Diagnostics " ${ASTCENC_DIAGNOSTICS})
printopt("ASAN " ${ASTCENC_ASAN})
printopt("UBSAN " ${ASTCENC_UBSAN})
printopt("Unit tests " ${ASTCENC_UNITTEST})

# Subcomponents
add_subdirectory(Source)
Expand Down
8 changes: 5 additions & 3 deletions Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ else()
set(ASTCENC_CODEC enc)
endif()

set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")

Expand All @@ -38,7 +38,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})

if(${ASTCENC_ISA_SIMD} MATCHES "neon")
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
Expand Down
8 changes: 5 additions & 3 deletions Source/UnitTest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# under the License.
# ----------------------------------------------------------------------------

set(ASTCENC_ARTIFACTS native none neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
set(ASTCENC_ARTIFACTS native none sve_256 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")

Expand All @@ -26,7 +26,9 @@ foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})

if(${ASTCENC_ISA_SIMD} MATCHES "neon")
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
Expand Down
22 changes: 21 additions & 1 deletion Source/UnitTest/cmake_core.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2023 Arm Limited
# Copyright 2020-2024 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
Expand Down Expand Up @@ -72,6 +72,7 @@ if(${ASTCENC_ISA_SIMD} MATCHES "none")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
Expand All @@ -81,15 +82,32 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)

elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=8
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)

# Enable SVE
target_compile_options(${ASTCENC_TEST}
PRIVATE
-march=armv8-a+sve -msve-vector-bits=256)

elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
Expand All @@ -103,6 +121,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
Expand All @@ -116,6 +135,7 @@ elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
Expand Down
104 changes: 26 additions & 78 deletions Source/UnitTest/test_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ TEST(vfloat, ChangeSign)
vfloat4 a(-1.0f, 1.0f, -3.12f, 3.12f);
vfloat4 b(-1.0f, -1.0f, 3.12f, 3.12f);
vfloat4 r = change_sign(a, b);

EXPECT_EQ(r.lane<0>(), 1.0f);
EXPECT_EQ(r.lane<1>(), -1.0f);
EXPECT_EQ(r.lane<2>(), -3.12f);
Expand All @@ -205,6 +206,7 @@ TEST(vfloat, Atan)
{
vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f);
vfloat4 r = atan(a);

EXPECT_NEAR(r.lane<0>(), -0.149061f, 0.005f);
EXPECT_NEAR(r.lane<1>(), 0.000000f, 0.005f);
EXPECT_NEAR(r.lane<2>(), 0.733616f, 0.005f);
Expand All @@ -217,6 +219,7 @@ TEST(vfloat, Atan2)
vfloat4 a(-0.15f, 0.0f, 0.9f, 2.1f);
vfloat4 b(1.15f, -3.0f, -0.9f, 1.1f);
vfloat4 r = atan2(a, b);

EXPECT_NEAR(r.lane<0>(), -0.129816f, 0.005f);
EXPECT_NEAR(r.lane<1>(), 3.141592f, 0.005f);
EXPECT_NEAR(r.lane<2>(), 2.360342f, 0.005f);
Expand Down Expand Up @@ -909,31 +912,6 @@ TEST(vfloat4, select)
EXPECT_EQ(r2.lane<3>(), 4.0f);
}

/** @brief Test vfloat4 select_msb using a mask with only the top bit set. */
TEST(vfloat4, select_msb)
{
	// Lanes 0 and 2 carry just the most significant bit; lanes 1 and 3 are zero
	int top_bit = static_cast<int>(0x80000000);
	vint4 mask_bits(top_bit, 0, top_bit, 0);
	vmask4 mask(mask_bits.m);

	vfloat4 lhs(1.0f, 3.0f, 3.0f, 1.0f);
	vfloat4 rhs(4.0f, 2.0f, 2.0f, 4.0f);

	// Masked lanes are expected to take the second operand, the others the first
	vfloat4 forward = select_msb(lhs, rhs, mask);
	EXPECT_EQ(forward.lane<0>(), 4.0f);
	EXPECT_EQ(forward.lane<1>(), 3.0f);
	EXPECT_EQ(forward.lane<2>(), 2.0f);
	EXPECT_EQ(forward.lane<3>(), 1.0f);

	// Swapping the operands swaps which input each lane is taken from
	vfloat4 reverse = select_msb(rhs, lhs, mask);
	EXPECT_EQ(reverse.lane<0>(), 1.0f);
	EXPECT_EQ(reverse.lane<1>(), 2.0f);
	EXPECT_EQ(reverse.lane<2>(), 3.0f);
	EXPECT_EQ(reverse.lane<3>(), 4.0f);
}

/** @brief Test vfloat4 gatherf. */
TEST(vfloat4, gatherf)
{
Expand Down Expand Up @@ -1839,12 +1817,17 @@ TEST(vint4, store_lanes_masked_unaligned)
EXPECT_TRUE(all(result3v == expect3v));
}

/** @brief Test vint4 pack_low_bytes. */
TEST(vint4, pack_low_bytes)
/** @brief Test vint4 pack_and_store_low_bytes. */
TEST(vint4, pack_and_store_low_bytes)
{
vint4 a(1, 2, 3, 4);
vint4 r = pack_low_bytes(a);
EXPECT_EQ(r.lane<0>(), (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0));
uint8_t bytes[4] { 0 };
pack_and_store_low_bytes(a, bytes);

EXPECT_EQ(bytes[0], 1);
EXPECT_EQ(bytes[1], 2);
EXPECT_EQ(bytes[2], 3);
EXPECT_EQ(bytes[3], 4);
}

/** @brief Test vint4 select. */
Expand Down Expand Up @@ -2711,46 +2694,6 @@ TEST(vfloat8, select)
EXPECT_EQ(ra[7], 4.0f);
}

/** @brief Test vfloat8 select MSB only. */
TEST(vfloat8, select_msb)
{
	// Mask with only the most significant bit set in the even-numbered lanes
	int msb_set = static_cast<int>(0x80000000);
	vint8 msb = vint8_lit(msb_set, 0, msb_set, 0, msb_set, 0, msb_set, 0);
	vmask8 cond(msb.m);

	vfloat8 a = vfloat8_lit(1.0f, 3.0f, 3.0f, 1.0f, 1.0f, 3.0f, 3.0f, 1.0f);
	vfloat8 b = vfloat8_lit(4.0f, 2.0f, 2.0f, 4.0f, 4.0f, 2.0f, 2.0f, 4.0f);

	// Select in one direction. This must call select_msb(), matching the
	// vfloat4 test of the same name; calling select() here would leave
	// select_msb() untested for the 8-wide type.
	vfloat8 r1 = select_msb(a, b, cond);

	// Spill to an aligned array so each lane can be checked individually
	alignas(32) float ra[8];
	storea(r1, ra);

	EXPECT_EQ(ra[0], 4.0f);
	EXPECT_EQ(ra[1], 3.0f);
	EXPECT_EQ(ra[2], 2.0f);
	EXPECT_EQ(ra[3], 1.0f);
	EXPECT_EQ(ra[4], 4.0f);
	EXPECT_EQ(ra[5], 3.0f);
	EXPECT_EQ(ra[6], 2.0f);
	EXPECT_EQ(ra[7], 1.0f);

	// Select in the other
	vfloat8 r2 = select_msb(b, a, cond);

	storea(r2, ra);

	EXPECT_EQ(ra[0], 1.0f);
	EXPECT_EQ(ra[1], 2.0f);
	EXPECT_EQ(ra[2], 3.0f);
	EXPECT_EQ(ra[3], 4.0f);
	EXPECT_EQ(ra[4], 1.0f);
	EXPECT_EQ(ra[5], 2.0f);
	EXPECT_EQ(ra[6], 3.0f);
	EXPECT_EQ(ra[7], 4.0f);
}

/** @brief Test vfloat8 gatherf. */
TEST(vfloat8, gatherf)
{
Expand Down Expand Up @@ -3583,17 +3526,22 @@ TEST(vint8, store_lanes_masked_unaligned)
EXPECT_TRUE(all(result3v == expect3v));
}

/** @brief Test vint8 pack_low_bytes. */
TEST(vint8, pack_low_bytes)
/** @brief Test vint8 pack_and_store_low_bytes. */
TEST(vint8, pack_and_store_low_bytes)
{
vint8 a = vint8_lit(1, 2, 3, 4, 2, 3, 4, 5);
vint8 r = pack_low_bytes(a);

alignas(32) int ra[8];
store(r, ra);

EXPECT_EQ(ra[0], (4 << 24) | (3 << 16) | (2 << 8) | (1 << 0));
EXPECT_EQ(ra[1], (5 << 24) | (4 << 16) | (3 << 8) | (2 << 0));
uint8_t bytes[8] { 0 };

pack_and_store_low_bytes(a, bytes);

EXPECT_EQ(bytes[0], 1);
EXPECT_EQ(bytes[1], 2);
EXPECT_EQ(bytes[2], 3);
EXPECT_EQ(bytes[3], 4);
EXPECT_EQ(bytes[4], 2);
EXPECT_EQ(bytes[5], 3);
EXPECT_EQ(bytes[6], 4);
EXPECT_EQ(bytes[7], 5);
}

/** @brief Test vint8 select. */
Expand Down
6 changes: 2 additions & 4 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1050,8 +1050,7 @@ void compute_quantized_weights_for_decimation(

// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
pack_and_store_low_bytes(weight, quantized_weight_set + i);
}
}
else
Expand Down Expand Up @@ -1084,8 +1083,7 @@ void compute_quantized_weights_for_decimation(

// Invert the weight-scaling that was done initially
storea(ixl * rscalev + low_boundv, weight_set_out + i);
vint scn = pack_low_bytes(weight);
store_nbytes(scn, quantized_weight_set + i);
pack_and_store_low_bytes(weight, quantized_weight_set + i);
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions Source/astcenc_mathlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,14 @@
#endif
#endif

#ifndef ASTCENC_SVE
#define ASTCENC_SVE 0
#endif

// Force vector-sized SIMD alignment
#if ASTCENC_AVX
#if ASTCENC_AVX || ASTCENC_SVE == 8
#define ASTCENC_VECALIGN 32
#elif ASTCENC_SSE || ASTCENC_NEON
#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
Expand Down
Loading