Skip to content

host: Add AVX2 support for uhd::convert #789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 61 additions & 20 deletions host/lib/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,51 @@
# This file included, use CMake directory variables
########################################################################
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
message(STATUS "")

########################################################################
# Check for SSE2 SIMD headers
# Check for SIMD headers
########################################################################

# Check for SSE2 support
check_cxx_compiler_flag("-msse2" SSE2_SUPPORTED)
if(SSE2_SUPPORTED)
    message(STATUS "SSE2 is supported")
endif()

# Check for SSSE3 support (the ssse3_* sources use SSSE3 intrinsics, so test
# -mssse3, not -msse3). The SSSE3 converters (sc12 pack/unpack) have no SSE2
# or AVX2 equivalent, so they build independently of the other SIMD tiers and
# must NOT disable the SSE2 converter set.
check_cxx_compiler_flag("-mssse3" SSE3_SUPPORTED)
if(SSE3_SUPPORTED)
    message(STATUS "SSSE3 is supported")
endif()

# Check for AVX2 support. The AVX2 sources cover the same conversions as the
# SSE2 set, so disable the SSE2 build here to avoid registering duplicate
# converters at the same priority.
check_cxx_compiler_flag("-mavx2" AVX2_SUPPORTED)
if(AVX2_SUPPORTED)
    message(STATUS "AVX2 is supported")
    set(SSE2_SUPPORTED OFF)
endif()

# Check for AVX-512 support. Note: "-mavx512" is not a valid compiler flag;
# the base ISA flag is -mavx512f. No AVX-512 converter sources exist yet, so
# this check is informational only — do not disable AVX2 until they are added.
check_cxx_compiler_flag("-mavx512f" AVX512_SUPPORTED)
if(AVX512_SUPPORTED)
    message(STATUS "AVX512 is supported")
endif()

# Per-tier compile flags for the SIMD source files.
if(CMAKE_COMPILER_IS_GNUCXX)
    set(SSE2_FLAGS -msse2)
    # ssse3_* sources require SSSE3, not plain SSE3
    set(SSE3_FLAGS -mssse3)
    set(AVX2_FLAGS -mavx2)
    # "-mavx512" does not exist; the base AVX-512 ISA flag is -mavx512f
    set(AVX512_FLAGS -mavx512f)
elseif(MSVC)
    # /arch:SSE2 only exists for 32-bit MSVC; x64 targets imply SSE2
    set(SSE2_FLAGS /arch:SSE2)
    set(AVX2_FLAGS /arch:AVX2)
    set(AVX512_FLAGS /arch:AVX512)
endif()

set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)

if(ENABLE_SSSE3)
set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)
endif(ENABLE_SSSE3)

if(HAVE_EMMINTRIN_H)
if(SSE2_SUPPORTED)
set(convert_with_sse2_sources
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp
Expand All @@ -45,22 +67,41 @@ if(HAVE_EMMINTRIN_H)
)
set_source_files_properties(
${convert_with_sse2_sources}
PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
endif(HAVE_EMMINTRIN_H)
endif(SSE2_SUPPORTED)

if(HAVE_TMMINTRIN_H)
if(SSE3_SUPPORTED)
set(convert_with_ssse3_sources
${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp
)
set_source_files_properties(
${convert_with_ssse3_sources}
PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE3_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources})
endif(HAVE_TMMINTRIN_H)
endif(SSE3_SUPPORTED)

if(AVX2_SUPPORTED)
    set(convert_with_avx2_sources
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp
        # AVX2 conversion is not as efficient as SSE2 for this case
        ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp
    )
    # NOTE(review): sse2_sc8_to_fc64.cpp may also appear in the SSE2 source
    # list above — confirm it is not appended twice when both tiers build.
    # ${SSE2_FLAGS} is kept so that one SSE2 source gets its required flag
    # (on GCC, -mavx2 already implies SSE2 so it is redundant but harmless).
    set_source_files_properties(
        ${convert_with_avx2_sources}
        PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS} ${SSE2_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources})
endif()

########################################################################
# Check for NEON SIMD headers
Expand Down
193 changes: 193 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
//
// Copyright 2024 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values
    // at a time
#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                                    \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
        /* _mm256_packs_epi32 packs within each 128-bit lane, so interleave            \
         * the lanes first to keep the samples in sequential order */                  \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
        /* pack + swap 16-bit pairs */                                                 \
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_item32_sc16<uhd::htowx>(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_nswap_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_nswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
    // time
#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                      \
    for (; i + 7 < nsamps; i += 8) {                                                     \
        /* load from input */                                                            \
        __m256 tmplo =                                                                   \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));        \
        __m256 tmphi =                                                                   \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));        \
                                                                                         \
        /* convert and scale */                                                          \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));               \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));               \
                                                                                         \
        /* _mm256_packs_epi32 packs within each 128-bit lane, so interleave              \
         * the lanes first to keep the samples in sequential order */                    \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                                 \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */              \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                                 \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */              \
                                                                                         \
        /* Now pack the shuffled data sequentially */                                    \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                     \
                                                                                         \
        /* byteswap each 16-bit word for the big-endian wire format */                   \
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \
                                                                                         \
        /* store to output */                                                            \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);               \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_item32_sc16<uhd::htonx>(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_bswap_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_bswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output      = reinterpret_cast<sc16_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values
    // at a time
#define convert_fc32_1_to_item32_1_guts(_al_)                                          \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
        /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */     \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* Now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_chdr_sc16(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_guts(u_)
    }

    // convert any remaining samples
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}
Loading