
Commit 546f9dd

Authored by rygorous (Fabian Giesen) and solidpixel
Add gatherf_byte_inds for gathers using byte indices from memory (#511)
Adds a new function to wrap gathers using byte indices from memory, avoiding the byte-to-int conversion for ISAs that don't have native gathers. Adds a new build option ASTCENC_X86_GATHERS (default ON) to allow builds to disable use of native gathers on x86, as they are much slower than scalar fallbacks on some microarchitectures (AMD Zen, pre-Skylake Intel).

Co-authored-by: Fabian Giesen <[email protected]>
Co-authored-by: Pete Harris <[email protected]>
1 parent 521179c commit 546f9dd

13 files changed: +144 -36 lines changed
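For orientation before the per-file diffs: the new helper is declared once in `astcenc_vecmathlib.h` as `template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices)` and specialized per backend, with lane i of the result holding `base[indices[i]]`. The standalone sketch below is not library code (`Vec4` is a stand-in for the library's `vfloat4` type); it simply mirrors the scalar NONE backend to show the semantics and the call shape used at the updated call sites:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Stand-in for the library's 4-wide vfloat type; illustrative only.
using Vec4 = std::array<float, 4>;

// Reference semantics of gatherf_byte_inds: lane i of the result is
// base[indices[i]], with the indices read directly as bytes from memory.
template<typename T>
T gatherf_byte_inds(const float* base, const uint8_t* indices);

template<>
Vec4 gatherf_byte_inds<Vec4>(const float* base, const uint8_t* indices)
{
    return Vec4 { base[indices[0]], base[indices[1]],
                  base[indices[2]], base[indices[3]] };
}

int main()
{
    // Byte indices come straight from memory; no widening to 32-bit lanes.
    const float table[8] { 0.0f, 10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f };
    const uint8_t idx[4] { 7, 0, 3, 3 };

    Vec4 v = gatherf_byte_inds<Vec4>(table, idx);
    std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]); // prints: 70 0 30 30
    return 0;
}
```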

CMakeLists.txt (+2)

@@ -51,6 +51,7 @@ option(ASTCENC_UBSAN "Enable astcenc builds with undefined behavior sanitizer")
 option(ASTCENC_UNITTEST "Enable astcenc builds with unit tests")
 option(ASTCENC_INVARIANCE "Enable astcenc floating point invariance" ON)
 option(ASTCENC_CLI "Enable build of astcenc command line tools" ON)
+option(ASTCENC_X86_GATHERS "Enable use of native x86 gathers" ON)
 
 # Preflight for some macOS-specific build options
 if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")

@@ -127,6 +128,7 @@ message(STATUS "x86-64 backend options")
 printopt("AVX2 backend " ${ASTCENC_ISA_AVX2})
 printopt("SSE4.1 backend " ${ASTCENC_ISA_SSE41})
 printopt("SSE2 backend " ${ASTCENC_ISA_SSE2})
+printopt("Use native gathers " ${ASTCENC_X86_GATHERS})
 message(STATUS "Agnostic backend options")
 printopt("NONE backend " ${ASTCENC_ISA_NONE})
 printopt("NATIVE backend " ${ASTCENC_ISA_NATIVE})

Docs/Building.md (+11)

@@ -203,6 +203,17 @@ To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
 line when configuring. It is NOT recommended to use this for production; it is
 significantly slower than the vectorized SIMD builds.
 
+### No x86 gather instruction builds
+
+On many x86 microarchitectures the native AVX gather instructions are slower
+than simply performing manual scalar loads and combining the results. Gathers
+are enabled by default, but can be disabled by setting the CMake option
+`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.
+
+Note that we have seen mixed results when compiling the scalar fallback path,
+so we would recommend testing which option works best for the compiler and
+microarchitecture pairing that you are targeting.
+
 ### Test builds
 
 We support building unit tests. These use the `googletest` framework, which is
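The build note above reaches the C++ code as an ordinary 0/1 compile definition: cmake_core.cmake (further down) passes `ASTCENC_X86_GATHERS=$<BOOL:${ASTCENC_X86_GATHERS}>` for AVX2 builds, and astcenc_mathlib.h supplies a default when the ISA is selected purely by compiler flags. A minimal standalone sketch of consuming that definition, assuming it is passed on the compiler command line (e.g. `-DASTCENC_X86_GATHERS=0`); the fallback default exists only so the sketch compiles on its own:

```cpp
#include <cstdio>

// In the real build the value arrives from CMake; this default exists only so
// this standalone sketch still compiles when the definition is absent.
#ifndef ASTCENC_X86_GATHERS
    #define ASTCENC_X86_GATHERS 0
#endif

int main()
{
#if ASTCENC_X86_GATHERS != 0
    std::puts("Native x86 gather instructions enabled");
#else
    std::puts("Scalar-load fallback selected instead of native gathers");
#endif
    return 0;
}
```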

Source/astcenc_averages_and_directions.cpp (+9 -9)

@@ -778,12 +778,12 @@ void compute_error_squared_rgba(
     for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
     {
         vmask mask = lane_ids < vint(texel_count);
-        vint texel_idxs(texel_indexes + i);
+        const uint8_t* texel_idxs = texel_indexes + i;
 
-        vfloat data_r = gatherf(blk.data_r, texel_idxs);
-        vfloat data_g = gatherf(blk.data_g, texel_idxs);
-        vfloat data_b = gatherf(blk.data_b, texel_idxs);
-        vfloat data_a = gatherf(blk.data_a, texel_idxs);
+        vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+        vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+        vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
+        vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
 
         vfloat uncor_param = (data_r * l_uncor_bs0)
                            + (data_g * l_uncor_bs1)

@@ -892,11 +892,11 @@ void compute_error_squared_rgb(
     for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
     {
         vmask mask = lane_ids < vint(texel_count);
-        vint texel_idxs(texel_indexes + i);
+        const uint8_t* texel_idxs = texel_indexes + i;
 
-        vfloat data_r = gatherf(blk.data_r, texel_idxs);
-        vfloat data_g = gatherf(blk.data_g, texel_idxs);
-        vfloat data_b = gatherf(blk.data_b, texel_idxs);
+        vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
+        vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
+        vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
 
         vfloat uncor_param = (data_r * l_uncor_bs0)
                            + (data_g * l_uncor_bs1)

Source/astcenc_ideal_endpoints_and_weights.cpp (+19 -19)

@@ -41,16 +41,16 @@ static vfloat bilinear_infill_vla(
     unsigned int index
 ) {
     // Load the bilinear filter texel weight indexes in the decimated grid
-    vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-    vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
-    vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
-    vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+    const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+    const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
+    const uint8_t* weight_idx2 = di.texel_weights_tr[2] + index;
+    const uint8_t* weight_idx3 = di.texel_weights_tr[3] + index;
 
     // Load the bilinear filter weights from the decimated grid
-    vfloat weight_val0 = gatherf(weights, weight_idx0);
-    vfloat weight_val1 = gatherf(weights, weight_idx1);
-    vfloat weight_val2 = gatherf(weights, weight_idx2);
-    vfloat weight_val3 = gatherf(weights, weight_idx3);
+    vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+    vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
+    vfloat weight_val2 = gatherf_byte_inds<vfloat>(weights, weight_idx2);
+    vfloat weight_val3 = gatherf_byte_inds<vfloat>(weights, weight_idx3);
 
     // Load the weight contribution factors for each decimated weight
     vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);

@@ -81,12 +81,12 @@ static vfloat bilinear_infill_vla_2(
     unsigned int index
 ) {
     // Load the bilinear filter texel weight indexes in the decimated grid
-    vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
-    vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+    const uint8_t* weight_idx0 = di.texel_weights_tr[0] + index;
+    const uint8_t* weight_idx1 = di.texel_weights_tr[1] + index;
 
     // Load the bilinear filter weights from the decimated grid
-    vfloat weight_val0 = gatherf(weights, weight_idx0);
-    vfloat weight_val1 = gatherf(weights, weight_idx1);
+    vfloat weight_val0 = gatherf_byte_inds<vfloat>(weights, weight_idx0);
+    vfloat weight_val1 = gatherf_byte_inds<vfloat>(weights, weight_idx1);
 
     // Load the weight contribution factors for each decimated weight
     vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);

@@ -894,18 +894,18 @@ void compute_ideal_weights_for_decimation(
 
     for (unsigned int j = 0; j < max_texel_count; j++)
     {
-        vint texel(di.weight_texels_tr[j] + i);
+        const uint8_t* texel = di.weight_texels_tr[j] + i;
         vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
 
         if (!constant_wes)
         {
-            weight_error_scale = gatherf(ei.weight_error_scale, texel);
+            weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
         }
 
         vfloat contrib_weight = weight * weight_error_scale;
 
         weight_weight += contrib_weight;
-        initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+        initial_weight += gatherf_byte_inds<vfloat>(ei.weights, texel) * contrib_weight;
     }
 
     storea(initial_weight / weight_weight, dec_weight_ideal_value + i);

@@ -952,17 +952,17 @@ void compute_ideal_weights_for_decimation(
 
     for (unsigned int j = 0; j < max_texel_count; j++)
    {
-        vint texel(di.weight_texels_tr[j] + i);
+        const uint8_t* texel = di.weight_texels_tr[j] + i;
         vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
 
         if (!constant_wes)
         {
-            weight_error_scale = gatherf(ei.weight_error_scale, texel);
+            weight_error_scale = gatherf_byte_inds<vfloat>(ei.weight_error_scale, texel);
         }
 
         vfloat scale = weight_error_scale * contrib_weight;
-        vfloat old_weight = gatherf(infilled_weights, texel);
-        vfloat ideal_weight = gatherf(ei.weights, texel);
+        vfloat old_weight = gatherf_byte_inds<vfloat>(infilled_weights, texel);
+        vfloat ideal_weight = gatherf_byte_inds<vfloat>(ei.weights, texel);
 
         error_change0 += contrib_weight * scale;
         error_change1 += (old_weight - ideal_weight) * scale;

Source/astcenc_mathlib.h (+2)

@@ -58,8 +58,10 @@
 #ifndef ASTCENC_AVX
 #if defined(__AVX2__)
 #define ASTCENC_AVX 2
+#define ASTCENC_X86_GATHERS 1
 #elif defined(__AVX__)
 #define ASTCENC_AVX 1
+#define ASTCENC_X86_GATHERS 1
 #else
 #define ASTCENC_AVX 0
 #endif

Source/astcenc_pick_best_endpoint_format.cpp (+5 -5)

@@ -123,21 +123,21 @@ static void compute_error_squared_rgb_single_partition(
     vint lane_ids = vint::lane_id();
     for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
     {
-        vint tix(texel_indexes + i);
+        const uint8_t* tix = texel_indexes + i;
 
         vmask mask = lane_ids < vint(texel_count);
         lane_ids += vint(ASTCENC_SIMD_WIDTH);
 
         // Compute the error that arises from just ditching alpha
-        vfloat data_a = gatherf(blk.data_a, tix);
+        vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, tix);
         vfloat alpha_diff = data_a - default_a;
         alpha_diff = alpha_diff * alpha_diff;
 
         haccumulate(a_drop_errv, alpha_diff, mask);
 
-        vfloat data_r = gatherf(blk.data_r, tix);
-        vfloat data_g = gatherf(blk.data_g, tix);
-        vfloat data_b = gatherf(blk.data_b, tix);
+        vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, tix);
+        vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, tix);
+        vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, tix);
 
         // Compute uncorrelated error
         vfloat param = data_r * uncor_bs0

Source/astcenc_vecmathlib.h (+2)

@@ -77,6 +77,8 @@
     #define ASTCENC_NO_INLINE __attribute__ ((noinline))
 #endif
 
+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
 #if ASTCENC_AVX >= 2
     // If we have AVX2 expose 8-wide VLA.
     #include "astcenc_vecmathlib_sse_4.h"

Source/astcenc_vecmathlib_avx2_8.h (+27)

@@ -903,6 +903,33 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
     return vfloat8(_mm256_i32gather_ps(base, indices.m, 4));
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_X86_GATHERS == 0
+    // Perform manual gather using scalar loads in two separate dependency chains,
+    // then merge late. MSVC translates this 1:1, which is OK. Clang turns it
+    // into a bunch of memory-operand inserts on 128-bit halves then merges late,
+    // which performs significantly worse in tests.
+    __m256 m0 = _mm256_broadcast_ss(base + indices[0]);
+    __m256 m1 = _mm256_broadcast_ss(base + indices[1]);
+    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[2]), 1 << 2);
+    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[3]), 1 << 3);
+    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[4]), 1 << 4);
+    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[5]), 1 << 5);
+    m0 = _mm256_blend_ps(m0, _mm256_broadcast_ss(base + indices[6]), 1 << 6);
+    m1 = _mm256_blend_ps(m1, _mm256_broadcast_ss(base + indices[7]), 1 << 7);
+
+    return vfloat8(_mm256_blend_ps(m0, m1, 0xaa));
+#else
+    vint8 inds(indices);
+    return gatherf(base, inds);
+#endif
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */

Source/astcenc_vecmathlib_neon_4.h (+19)

@@ -828,6 +828,25 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 #endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+#if ASTCENC_SVE == 0
+    alignas(16) float vals[4];
+    vals[0] = base[indices[0]];
+    vals[1] = base[indices[1]];
+    vals[2] = base[indices[2]];
+    vals[3] = base[indices[3]];
+    return vfloat4(vals);
+#else
+    svint32_t offsets = svld1ub_s32(svptrue_pat_b32(SV_VL4), indices);
+    svfloat32_t data = svld1_gather_s32index_f32(svptrue_pat_b32(SV_VL4), base, offsets);
+    return vfloat4(svget_neonq_f32(data));
+#endif
+}
 /**
  * @brief Store a vector to an unaligned memory address.
  */

Source/astcenc_vecmathlib_none_4.h (+12)

@@ -943,6 +943,18 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
                    base[indices.m[3]]);
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+    return vfloat4(base[indices[0]],
+                   base[indices[1]],
+                   base[indices[2]],
+                   base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */

Source/astcenc_vecmathlib_sse_4.h (+18 -1)

@@ -900,7 +900,7 @@ ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond)
  */
 ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 {
-#if ASTCENC_AVX >= 2
+#if ASTCENC_AVX >= 2 && ASTCENC_X86_GATHERS != 0
     return vfloat4(_mm_i32gather_ps(base, indices.m, 4));
 #else
     alignas(16) int idx[4];

@@ -909,6 +909,23 @@ ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices)
 #endif
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat4 gatherf_byte_inds<vfloat4>(const float* base, const uint8_t* indices)
+{
+    // Experimentally, in this particular use case (byte indices in memory),
+    // using 4 separate scalar loads is appreciably faster than using gathers
+    // even if they're available, on every x86 uArch tried, so always do the
+    // separate loads even when ASTCENC_X86_GATHERS is enabled.
+    //
+    // Tested on:
+    // - Intel Skylake-X, Coffee Lake, Crestmont, Redwood Cove
+    // - AMD Zen 2, Zen 4
+    return vfloat4(base[indices[0]], base[indices[1]], base[indices[2]], base[indices[3]]);
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */
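The comment in the 4-wide specialization above weighs hardware gathers against plain scalar loads for byte indices. The standalone sketch below (not library code; requires AVX2, e.g. `g++ -mavx2`) shows the two strategies side by side: widening the bytes to 32-bit lanes and issuing `_mm_i32gather_ps`, versus the four scalar loads the commit settles on:

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen the four byte indices to 32-bit lanes and use the hardware gather.
static __m128 gather_via_hw(const float* base, const uint8_t* indices)
{
    int packed;
    std::memcpy(&packed, indices, sizeof(packed));          // 4 bytes -> int
    __m128i lanes = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(packed));
    return _mm_i32gather_ps(base, lanes, 4);
}

// Four plain scalar loads, the approach used by the 4-wide specialization.
static __m128 gather_via_scalar(const float* base, const uint8_t* indices)
{
    return _mm_setr_ps(base[indices[0]], base[indices[1]],
                       base[indices[2]], base[indices[3]]);
}

int main()
{
    const float table[8] { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
    const uint8_t idx[4] { 6, 2, 5, 1 };

    alignas(16) float a[4], b[4];
    _mm_store_ps(a, gather_via_hw(table, idx));
    _mm_store_ps(b, gather_via_scalar(table, idx));

    std::printf("hw gather:    %g %g %g %g\n", a[0], a[1], a[2], a[3]);
    std::printf("scalar loads: %g %g %g %g\n", b[0], b[1], b[2], b[3]);
    return 0;
}
```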

Source/astcenc_vecmathlib_sve_8.h (+10)

@@ -841,6 +841,16 @@ ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices)
     return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, indices.m));
 }
 
+/**
+ * @brief Load a vector of gathered results from an array using byte indices from memory
+ */
+template<>
+ASTCENC_SIMD_INLINE vfloat8 gatherf_byte_inds<vfloat8>(const float* base, const uint8_t* indices)
+{
+    svint32_t offsets = svld1ub_s32(svptrue_b32(), indices);
+    return vfloat8(svld1_gather_s32index_f32(svptrue_b32(), base, offsets));
+}
+
 /**
  * @brief Store a vector to an unaligned memory address.
  */

Source/cmake_core.cmake (+8 -2)

@@ -359,7 +359,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
             ASTCENC_SSE=20
             ASTCENC_AVX=0
             ASTCENC_POPCNT=0
-            ASTCENC_F16C=0)
+            ASTCENC_F16C=0
+            ASTCENC_X86_GATHERS=0)
 
         # Force SSE2 on AppleClang (normally SSE4.1 is the default)
         target_compile_options(${ASTCENC_TARGET_NAME}

@@ -377,7 +378,8 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
             ASTCENC_SSE=41
            ASTCENC_AVX=0
             ASTCENC_POPCNT=1
-            ASTCENC_F16C=0)
+            ASTCENC_F16C=0
+            ASTCENC_X86_GATHERS=0)
 
         if (${ASTCENC_VENEER_TYPE} GREATER 0)
             # Force SSE2 on AppleClang (normally SSE4.1 is the default)

@@ -395,12 +397,16 @@ macro(astcenc_set_properties ASTCENC_TARGET_NAME ASTCENC_VENEER_TYPE)
         endif()
 
     elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
+        # Gathers are quite slow on many x86 microarchitectures, to the point where
+        # it can be significantly faster to just avoid them and use scalar loads.
+
         target_compile_definitions(${ASTCENC_TARGET_NAME}
             PRIVATE
                 ASTCENC_NEON=0
                 ASTCENC_SVE=0
                 ASTCENC_SSE=41
                 ASTCENC_AVX=2
+                ASTCENC_X86_GATHERS=$<BOOL:${ASTCENC_X86_GATHERS}>
                 ASTCENC_POPCNT=1
                 ASTCENC_F16C=1)
