
Commit db32fa6

Add AVX2 support for uhd::convert
1 parent d6ba2df commit db32fa6

10 files changed: +1383 -21 lines changed

host/lib/convert/CMakeLists.txt

Lines changed: 61 additions & 20 deletions
@@ -9,29 +9,51 @@
 # This file included, use CMake directory variables
 ########################################################################
 include(CheckIncludeFileCXX)
+include(CheckCXXCompilerFlag)
 message(STATUS "")
 
 ########################################################################
-# Check for SSE2 SIMD headers
+# Check for SIMD support
 ########################################################################
+
+# Check for SSE2 support
+check_cxx_compiler_flag("-msse2" SSE2_SUPPORTED)
+if(SSE2_SUPPORTED)
+    message(STATUS "SSE2 is supported")
+endif(SSE2_SUPPORTED)
+
+# Check for SSSE3 support
+check_cxx_compiler_flag("-mssse3" SSE3_SUPPORTED)
+if(SSE3_SUPPORTED)
+    message(STATUS "SSSE3 is supported")
+    set(SSE2_SUPPORTED OFF)
+endif(SSE3_SUPPORTED)
+
+# Check for AVX2 support
+check_cxx_compiler_flag("-mavx2" AVX2_SUPPORTED)
+if(AVX2_SUPPORTED)
+    message(STATUS "AVX2 is supported")
+    # set(SSE3_SUPPORTED OFF)
+endif(AVX2_SUPPORTED)
+
+# Check for AVX512 support
+check_cxx_compiler_flag("-mavx512" AVX512_SUPPORTED)
+if(AVX512_SUPPORTED)
+    message(STATUS "AVX512 is supported")
+    set(AVX2_SUPPORTED OFF)
+endif(AVX512_SUPPORTED)
+
 if(CMAKE_COMPILER_IS_GNUCXX)
-    set(EMMINTRIN_FLAGS -msse2)
-    set(TMMINTRIN_FLAGS -mssse3)
+    set(SSE2_FLAGS -msse2)
+    set(SSE3_FLAGS -mssse3)
+    set(AVX2_FLAGS -mavx2)
+    set(AVX512_FLAGS -mavx512)
 elseif(MSVC)
-    set(EMMINTRIN_FLAGS /arch:SSE2)
+    set(SSE2_FLAGS /arch:SSE2)
 endif()
 
-set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
-CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
-unset(CMAKE_REQUIRED_FLAGS)
-
-if(ENABLE_SSSE3)
-    set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS})
-    CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H)
-    unset(CMAKE_REQUIRED_FLAGS)
-endif(ENABLE_SSSE3)
-
-if(HAVE_EMMINTRIN_H)
+if(SSE2_SUPPORTED)
     set(convert_with_sse2_sources
         ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp
@@ -45,22 +67,41 @@ if(HAVE_EMMINTRIN_H)
     )
     set_source_files_properties(
         ${convert_with_sse2_sources}
-        PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}"
+        PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}"
     )
     LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
-endif(HAVE_EMMINTRIN_H)
+endif(SSE2_SUPPORTED)
 
-if(HAVE_TMMINTRIN_H)
+if(SSE3_SUPPORTED)
     set(convert_with_ssse3_sources
         ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp
     )
     set_source_files_properties(
         ${convert_with_ssse3_sources}
-        PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}"
+        PROPERTIES COMPILE_FLAGS "${SSE3_FLAGS}"
    )
     LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources})
-endif(HAVE_TMMINTRIN_H)
+endif(SSE3_SUPPORTED)
+
+if(AVX2_SUPPORTED)
+    set(convert_with_avx2_sources
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp # AVX2 conversion is not as efficient as SSE2 for this case
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp
+    )
+    set_source_files_properties(
+        ${convert_with_avx2_sources}
+        PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS} ${SSE2_FLAGS}"
+    )
+    LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources})
+endif(AVX2_SUPPORTED)
 
 ########################################################################
 # Check for NEON SIMD headers
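
A note on these checks: check_cxx_compiler_flag only proves that the compiler accepts -mavx2, not that the machine running libuhd can execute AVX2 instructions. As a standalone sketch (not part of this commit, and not UHD's selection mechanism), a host-CPU guard using the GCC/Clang feature builtins would look like this:

// cpu_check.cpp - standalone sketch, not UHD code. Build: g++ cpu_check.cpp
#include <cstdio>

int main()
{
    __builtin_cpu_init(); // initialize CPU-feature detection state (GCC/Clang)
    if (__builtin_cpu_supports("avx2")) {
        std::printf("this CPU can execute AVX2 code\n");
        return 0;
    }
    std::printf("no AVX2 on this CPU\n");
    return 1;
}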

host/lib/convert/avx2_fc32_to_sc16.cpp

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
+//
+// Copyright 2024 Ettus Research, a National Instruments Brand
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+//
+
+#include "convert_common.hpp"
+#include <uhd/utils/byteswap.hpp>
+#include <immintrin.h>
+#include <algorithm>
+
+using namespace uhd::convert;
+
+DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
+{
+    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);
+
+    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
+
+// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
+// time
+#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                                  \
+    for (; i + 7 < nsamps; i += 8) {                                                 \
+        /* load from input */                                                        \
+        __m256 tmplo =                                                               \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));    \
+        __m256 tmphi =                                                               \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));    \
+                                                                                     \
+        /* convert and scale */                                                      \
+        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));           \
+        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));           \
+                                                                                     \
+        /* _mm256_packs_epi32 packs within 128-bit lanes, so interleave the lanes */ \
+        /* of the two registers first to keep the packed output sequential */       \
+        __m256i shuffled_lo = _mm256_permute2x128_si256(                             \
+            tmpilo, tmpihi, 0x20); /* lower 128 bits of tmpilo and tmpihi */         \
+        __m256i shuffled_hi = _mm256_permute2x128_si256(                             \
+            tmpilo, tmpihi, 0x31); /* upper 128 bits of tmpilo and tmpihi */         \
+                                                                                     \
+        /* now pack the shuffled data sequentially */                                \
+        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                 \
+                                                                                     \
+        /* swap 16-bit pairs */                                                      \
+        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                \
+        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                \
+                                                                                     \
+        /* store to output */                                                        \
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);           \
+    }
+
+    size_t i = 0;
+
+    // need to dispatch according to alignment for fastest conversion;
+    // note that _mm256_load_ps requires 32-byte (not 16-byte) alignment
+    const size_t misalign = size_t(input) & 0x1f;
+    if (misalign == 0) {
+        // the data is 32-byte aligned, so do the fast processing of the bulk of the
+        // samples
+        convert_fc32_1_to_item32_1_nswap_guts(_)
+    } else if (misalign % sizeof(fc32_t) == 0) {
+        // the first samples are 8-byte aligned - process them to align the remainder
+        // of the samples to 32 bytes
+        const size_t head = std::min((32 - misalign) / sizeof(fc32_t), nsamps);
+        xx_to_item32_sc16<uhd::htowx>(input, output, head, scale_factor);
+        i += head;
+        // do faster processing of the bulk of the samples now that we are 32-byte
+        // aligned
+        convert_fc32_1_to_item32_1_nswap_guts(_)
+    } else {
+        // we cannot reach 32-byte alignment, so do fast processing with the
+        // unaligned load
+        convert_fc32_1_to_item32_1_nswap_guts(u_)
+    }
+
+    // convert any remaining samples
+    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
+}
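
The lane permutes in the macro above are the heart of the AVX2 port: unlike the SSE2 _mm_packs_epi32, the 256-bit _mm256_packs_epi32 packs within each 128-bit lane, so packing the two converted registers directly would interleave groups of four samples. A standalone sketch (not part of the diff) with the values 0..15 makes the difference visible:

// pack_demo.cpp - standalone sketch, not UHD code. Build: g++ -mavx2 pack_demo.cpp
#include <immintrin.h>
#include <cstdio>

static void dump(const char* label, __m256i v)
{
    short s[16];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(s), v);
    std::printf("%s:", label);
    for (int i = 0; i < 16; i++)
        std::printf(" %d", s[i]);
    std::printf("\n");
}

int main()
{
    // stand-ins for the two registers of converted 32-bit samples
    __m256i lo = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i hi = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);

    // packing directly works per 128-bit lane: 0..3, 8..11, 4..7, 12..15
    dump("direct  ", _mm256_packs_epi32(lo, hi));

    // interleaving the lanes first keeps the output sequential: 0..15
    __m256i slo = _mm256_permute2x128_si256(lo, hi, 0x20); // low lanes of lo, hi
    __m256i shi = _mm256_permute2x128_si256(lo, hi, 0x31); // high lanes of lo, hi
    dump("permuted", _mm256_packs_epi32(slo, shi));
    return 0;
}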
+
+DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
+{
+    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);
+
+    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
+
+// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
+// time
+#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                    \
+    for (; i + 7 < nsamps; i += 8) {                                                   \
+        /* load from input */                                                          \
+        __m256 tmplo =                                                                 \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
+        __m256 tmphi =                                                                 \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
+                                                                                       \
+        /* convert and scale */                                                        \
+        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
+        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
+                                                                                       \
+        /* _mm256_packs_epi32 packs within 128-bit lanes, so interleave the lanes */   \
+        /* of the two registers first to keep the packed output sequential */          \
+        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
+            tmpilo, tmpihi, 0x20); /* lower 128 bits of tmpilo and tmpihi */           \
+        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
+            tmpilo, tmpihi, 0x31); /* upper 128 bits of tmpilo and tmpihi */           \
+                                                                                       \
+        /* now pack the shuffled data sequentially */                                  \
+        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
+                                                                                       \
+        /* byteswap each 16-bit element: (x << 8) | (x >> 8) */                        \
+        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \
+                                                                                       \
+        /* store to output */                                                          \
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
+    }
+
+    size_t i = 0;
+
+    // need to dispatch according to alignment for fastest conversion;
+    // note that _mm256_load_ps requires 32-byte (not 16-byte) alignment
+    const size_t misalign = size_t(input) & 0x1f;
+    if (misalign == 0) {
+        // the data is 32-byte aligned, so do the fast processing of the bulk of the
+        // samples
+        convert_fc32_1_to_item32_1_bswap_guts(_)
+    } else if (misalign % sizeof(fc32_t) == 0) {
+        // the first samples are 8-byte aligned - process them to align the remainder
+        // of the samples to 32 bytes
+        const size_t head = std::min((32 - misalign) / sizeof(fc32_t), nsamps);
+        xx_to_item32_sc16<uhd::htonx>(input, output, head, scale_factor);
+        i += head;
+        // do faster processing of the bulk of the samples now that we are 32-byte
+        // aligned
+        convert_fc32_1_to_item32_1_bswap_guts(_)
+    } else {
+        // we cannot reach 32-byte alignment, so do fast processing with the
+        // unaligned load
+        convert_fc32_1_to_item32_1_bswap_guts(u_)
+    }
+
+    // convert any remaining samples
+    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
+}
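
The big-endian path replaces the 16-bit shuffles with a shift/or pair: shifting each 16-bit element left and right by eight bits and OR-ing the results swaps the two bytes of every int16 in the register. A standalone sketch (not part of the diff) that shows the effect on recognizable byte patterns:

// bswap_demo.cpp - standalone sketch, not UHD code. Build: g++ -mavx2 bswap_demo.cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    int16_t in[16], out[16];
    for (int i = 0; i < 16; i++)
        in[i] = int16_t(0x0102 * (i + 1)); // recognizable byte patterns

    __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
    // same shift/or pair as the converter: byteswap every 16-bit element
    v = _mm256_or_si256(_mm256_srli_epi16(v, 8), _mm256_slli_epi16(v, 8));
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), v);

    for (int i = 0; i < 16; i++)
        std::printf("0x%04x -> 0x%04x\n",
            unsigned(uint16_t(in[i])), unsigned(uint16_t(out[i])));
    return 0;
}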
+
+DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD)
+{
+    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
+    sc16_t* output      = reinterpret_cast<sc16_t*>(outputs[0]);
+
+    const __m256 scalar = _mm256_set1_ps(float(scale_factor));
+
+// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
+// time
+#define convert_fc32_1_to_item32_1_guts(_al_)                                        \
+    for (; i + 7 < nsamps; i += 8) {                                                 \
+        /* load from input */                                                        \
+        __m256 tmplo =                                                               \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));    \
+        __m256 tmphi =                                                               \
+            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));    \
+                                                                                     \
+        /* convert and scale */                                                      \
+        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));           \
+        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));           \
+                                                                                     \
+        /* _mm256_packs_epi32 packs within 128-bit lanes, so interleave the lanes */ \
+        /* of the two registers first to keep the packed output sequential */       \
+        __m256i shuffled_lo = _mm256_permute2x128_si256(                             \
+            tmpilo, tmpihi, 0x20); /* lower 128 bits of tmpilo and tmpihi */         \
+        __m256i shuffled_hi = _mm256_permute2x128_si256(                             \
+            tmpilo, tmpihi, 0x31); /* upper 128 bits of tmpilo and tmpihi */         \
+                                                                                     \
+        /* now pack the shuffled data sequentially */                                \
+        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                 \
+                                                                                     \
+        /* store to output */                                                        \
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);           \
+    }
+
+    size_t i = 0;
+
+    // need to dispatch according to alignment for fastest conversion;
+    // note that _mm256_load_ps requires 32-byte (not 16-byte) alignment
+    const size_t misalign = size_t(input) & 0x1f;
+    if (misalign == 0) {
+        // the data is 32-byte aligned, so do the fast processing of the bulk of the
+        // samples
+        convert_fc32_1_to_item32_1_guts(_)
+    } else if (misalign % sizeof(fc32_t) == 0) {
+        // the first samples are 8-byte aligned - process them to align the remainder
+        // of the samples to 32 bytes
+        const size_t head = std::min((32 - misalign) / sizeof(fc32_t), nsamps);
+        xx_to_chdr_sc16(input, output, head, scale_factor);
+        i += head;
+        // do faster processing of the bulk of the samples now that we are 32-byte
+        // aligned
+        convert_fc32_1_to_item32_1_guts(_)
+    } else {
+        // we cannot reach 32-byte alignment, so do fast processing with the
+        // unaligned load
+        convert_fc32_1_to_item32_1_guts(u_)
+    }
+
+    // convert any remaining samples
+    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
+}
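
All three converters share the same dispatch: when the input pointer is at least 8-byte aligned, a short scalar head brings it to the 32-byte boundary that _mm256_load_ps requires, and the AVX2 loop takes over from there. A standalone sketch (not part of the diff) of the head arithmetic:

// head_demo.cpp - standalone sketch, not UHD code. Build: g++ head_demo.cpp
#include <cstddef>
#include <cstdio>

int main()
{
    // an fc32 sample is 8 bytes, so an 8-byte-aligned pointer reaches the next
    // 32-byte boundary after converting (32 - misalignment) / 8 scalar samples
    for (size_t misalign : {0x00, 0x08, 0x10, 0x18}) {
        const size_t head = misalign ? (32 - misalign) / 8 : 0;
        std::printf("misalignment 0x%02zx -> %zu scalar head sample(s)\n",
            misalign, head);
    }
    return 0;
}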
