//
// Copyright 2024 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

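    // broadcast the scale factor across all 8 float lanes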
    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a time
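// the _al_ parameter selects aligned (_) or unaligned (u_) 256-bit loads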
#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                                    \
    for (; i + 7 < nsamps; i += 8) {                                                   \
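        /* each iteration converts 8 complex samples (two 256-bit loads of 8 floats) */ \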
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
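        /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */     \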
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
        /* swap each 16-bit I/Q pair to match the scalar htowx output */               \
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));                  \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    // (the aligned 256-bit loads used above require 32-byte alignment)
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_nswap_guts(_) break;
        case 0x10:
            // the data is only 16-byte aligned - process two samples to align the
            // remainder of the samples to 32 bytes
            if (nsamps < 2) {
                break;
            }
            xx_to_item32_sc16<uhd::htowx>(input, output, 2, scale_factor);
            i += 2;
            // do faster processing of the bulk of the samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_nswap_guts(_) break;
        default:
            // we are not 16 or 32-byte aligned, so do fast processing with the
            // unaligned load
            convert_fc32_1_to_item32_1_nswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a time
#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                    \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
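        /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */     \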
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
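        /* byteswap each 16-bit element so the output matches the scalar htonx path */ \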
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \
                                                                                       \
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    // (the aligned 256-bit loads used above require 32-byte alignment)
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_bswap_guts(_) break;
        case 0x10:
            // the data is only 16-byte aligned - process two samples to align the
            // remainder of the data for fast conversion
            if (nsamps < 2) {
                break;
            }
            xx_to_item32_sc16<uhd::htonx>(input, output, 2, scale_factor);
            i += 2;
            // do faster processing of the remaining samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_bswap_guts(_) break;
        default:
            // we are not 16 or 32-byte aligned, so do fast processing with the
            // unaligned load
            convert_fc32_1_to_item32_1_bswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output      = reinterpret_cast<sc16_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a time
#define convert_fc32_1_to_item32_1_guts(_al_)                                          \
    for (; i + 7 < nsamps; i += 8) {                                                   \
        /* load from input */                                                          \
        __m256 tmplo =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));      \
        __m256 tmphi =                                                                 \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));      \
                                                                                       \
        /* convert and scale */                                                        \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));             \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));             \
                                                                                       \
        /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */     \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */            \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                               \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */            \
                                                                                       \
        /* now pack the shuffled data sequentially */                                  \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                   \
                                                                                       \
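        /* no byteswap needed here, matching the scalar xx_to_chdr_sc16 path */        \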
        /* store to output */                                                          \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);             \
    }

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion
    // (the aligned 256-bit loads used above require 32-byte alignment)
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_guts(_) break;
        case 0x10:
            // the data is only 16-byte aligned - process two samples to align the
            // remainder of the samples to 32 bytes
            if (nsamps < 2) {
                break;
            }
            xx_to_chdr_sc16(input, output, 2, scale_factor);
            i += 2;
            // do faster processing of the bulk of the samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_guts(_) break;
        default:
            // we are not 16 or 32-byte aligned, so do fast processing with the
            // unaligned load
            convert_fc32_1_to_item32_1_guts(u_)
    }

    // convert any remaining samples
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}