Skip to content

host: Add AVX2 support for uhd::convert #789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 61 additions & 20 deletions host/lib/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,51 @@
# This file included, use CMake directory variables
########################################################################
include(CheckIncludeFileCXX)
include(CheckCXXCompilerFlag)
message(STATUS "")

########################################################################
# Check for SSE2 SIMD headers
# Check for SIMD headers
########################################################################

# Check for SSE2 support
check_cxx_compiler_flag("-msse2" SSE2_SUPPORTED)
if(SSE2_SUPPORTED)
    message(STATUS "SSE2 is supported")
endif()

# Check for SSSE3 support (the ssse3_* sources use SSSE3 intrinsics, so test
# -mssse3, not -msse3). The SSSE3 converters (sc12 pack/unpack) have no SSE2
# or AVX2 equivalent, so they build independently of the other SIMD tiers and
# must NOT disable the SSE2 converter set.
check_cxx_compiler_flag("-mssse3" SSE3_SUPPORTED)
if(SSE3_SUPPORTED)
    message(STATUS "SSSE3 is supported")
endif()

# Check for AVX2 support. The AVX2 sources cover the same conversions as the
# SSE2 set, so disable the SSE2 build here to avoid registering duplicate
# converters at the same priority.
check_cxx_compiler_flag("-mavx2" AVX2_SUPPORTED)
if(AVX2_SUPPORTED)
    message(STATUS "AVX2 is supported")
    set(SSE2_SUPPORTED OFF)
endif()

# Check for AVX-512 support. Note: "-mavx512" is not a valid compiler flag;
# the base ISA flag is -mavx512f. No AVX-512 converter sources exist yet, so
# this check is informational only — do not disable AVX2 until they are added.
check_cxx_compiler_flag("-mavx512f" AVX512_SUPPORTED)
if(AVX512_SUPPORTED)
    message(STATUS "AVX512 is supported")
endif()

# Per-tier compile flags for the SIMD source files.
if(CMAKE_COMPILER_IS_GNUCXX)
    set(SSE2_FLAGS -msse2)
    # ssse3_* sources require SSSE3, not plain SSE3
    set(SSE3_FLAGS -mssse3)
    set(AVX2_FLAGS -mavx2)
    # "-mavx512" does not exist; the base AVX-512 ISA flag is -mavx512f
    set(AVX512_FLAGS -mavx512f)
elseif(MSVC)
    # /arch:SSE2 only exists for 32-bit MSVC; x64 targets imply SSE2
    set(SSE2_FLAGS /arch:SSE2)
    set(AVX2_FLAGS /arch:AVX2)
    set(AVX512_FLAGS /arch:AVX512)
endif()

set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)

if(ENABLE_SSSE3)
set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS})
CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H)
unset(CMAKE_REQUIRED_FLAGS)
endif(ENABLE_SSSE3)

if(HAVE_EMMINTRIN_H)
if(SSE2_SUPPORTED)
set(convert_with_sse2_sources
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp
Expand All @@ -45,22 +67,41 @@ if(HAVE_EMMINTRIN_H)
)
set_source_files_properties(
${convert_with_sse2_sources}
PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources})
endif(HAVE_EMMINTRIN_H)
endif(SSE2_SUPPORTED)

if(HAVE_TMMINTRIN_H)
if(SSE3_SUPPORTED)
set(convert_with_ssse3_sources
${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp
)
set_source_files_properties(
${convert_with_ssse3_sources}
PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}"
PROPERTIES COMPILE_FLAGS "${SSE3_FLAGS}"
)
LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources})
endif(HAVE_TMMINTRIN_H)
endif(SSE3_SUPPORTED)

if(AVX2_SUPPORTED)
    set(convert_with_avx2_sources
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp
        # AVX2 conversion is not as efficient as SSE2 for this case
        ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp
    )
    # NOTE(review): sse2_sc8_to_fc64.cpp may also appear in the SSE2 source
    # list above — confirm it is not appended twice when both tiers build.
    # ${SSE2_FLAGS} is kept so that one SSE2 source gets its required flag
    # (on GCC, -mavx2 already implies SSE2 so it is redundant but harmless).
    set_source_files_properties(
        ${convert_with_avx2_sources}
        PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS} ${SSE2_FLAGS}"
    )
    LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources})
endif()

########################################################################
# Check for NEON SIMD headers
Expand Down
193 changes: 193 additions & 0 deletions host/lib/convert/avx2_fc32_to_sc16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
//
// Copyright 2024 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values
    // at a time
#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                                    \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
        /* _mm256_packs_epi32 packs within each 128-bit lane, so interleave            \
         * the lanes first to keep the samples in sequential order */                  \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
        /* pack + swap 16-bit pairs */                                                 \
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_item32_sc16<uhd::htowx>(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_nswap_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_nswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
    // time
#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                      \
    for (; i + 7 < nsamps; i += 8) {                                                     \
        /* load from input */                                                            \
        __m256 tmplo =                                                                   \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));        \
        __m256 tmphi =                                                                   \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));        \
                                                                                         \
        /* convert and scale */                                                          \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));               \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));               \
                                                                                         \
        /* _mm256_packs_epi32 packs within each 128-bit lane, so interleave              \
         * the lanes first to keep the samples in sequential order */                    \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                                 \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */              \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                                 \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */              \
                                                                                         \
        /* Now pack the shuffled data sequentially */                                    \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                     \
                                                                                         \
        /* byteswap each 16-bit word for the big-endian wire format */                   \
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \
                                                                                         \
        /* store to output */                                                            \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);               \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_item32_sc16<uhd::htonx>(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_bswap_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_bswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output      = reinterpret_cast<sc16_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

    // this macro converts values faster by using AVX2 intrinsics to convert 8 values
    // at a time
#define convert_fc32_1_to_item32_1_guts(_al_)                                          \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
        /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */     \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* Now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // _mm256_load_ps (the aligned load) requires 32-byte alignment — a 16-byte
    // dispatch is not sufficient and would fault on a 16-but-not-32-byte
    // aligned pointer. Peel off scalar samples (8 bytes each) until the input
    // pointer is 32-byte aligned, then run the fast aligned loop. Pointers that
    // are not even 8-byte aligned can never reach 32-byte alignment by
    // advancing whole samples, so they take the unaligned-load path.
    if ((size_t(input) & 0x7) == 0) {
        while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
            xx_to_chdr_sc16(input + i, output + i, 1, scale_factor);
            i++;
        }
        convert_fc32_1_to_item32_1_guts(_)
    } else {
        // do fast processing with the unaligned load
        convert_fc32_1_to_item32_1_guts(u_)
    }

    // convert any remaining samples
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}
Loading