diff --git a/radio/USRP/CMakeLists.txt b/radio/USRP/CMakeLists.txt index c0e4f551fa..db6704b344 100644 --- a/radio/USRP/CMakeLists.txt +++ b/radio/USRP/CMakeLists.txt @@ -4,7 +4,7 @@ add_compile_options(-Wunused-parameter) #find_package(Boost REQUIRED) find_package(UHD REQUIRED UHD) -add_library(oai_usrpdevif MODULE usrp_lib.cpp) +add_library(oai_usrpdevif MODULE usrp_lib.cpp usrp_converters.cpp) #target_include_directories(oai_usrpdevif PRIVATE Boost::boost) target_link_libraries(oai_usrpdevif PRIVATE ${UHD_LIBRARIES}) target_include_directories(oai_usrpdevif PRIVATE ${UHD_INCLUDE_DIRS}) @@ -13,3 +13,7 @@ set_target_properties(oai_usrpdevif PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_ add_custom_command(TARGET oai_usrpdevif POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink liboai_usrpdevif.so liboai_device.so WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + +if (ENABLE_TESTS) + add_subdirectory(tests) +endif() diff --git a/radio/USRP/tests/CMakeLists.txt b/radio/USRP/tests/CMakeLists.txt new file mode 100644 index 0000000000..491424fbfe --- /dev/null +++ b/radio/USRP/tests/CMakeLists.txt @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LicenseRef-CSSL-1.0 + +add_executable(test_usrp_converters test_usrp_converters.cpp ../usrp_converters.cpp) +target_link_libraries(test_usrp_converters PRIVATE GTest::gtest benchmark::benchmark ${UHD_LIBRARIES}) +target_include_directories(test_usrp_converters PRIVATE ${UHD_INCLUDE_DIRS} .. ${CMAKE_SOURCE_DIR}) +add_dependencies(tests test_usrp_converters) +add_test(NAME test_usrp_converters + COMMAND ./test_usrp_converters) diff --git a/radio/USRP/tests/test_usrp_converters.cpp b/radio/USRP/tests/test_usrp_converters.cpp new file mode 100644 index 0000000000..282c76c5ab --- /dev/null +++ b/radio/USRP/tests/test_usrp_converters.cpp @@ -0,0 +1,386 @@ +/* + * SPDX-License-Identifier: LicenseRef-CSSL-1.0 + */ + +#include +#include +#include +#include +#include +#include +#include "usrp_converters.hpp" + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +constexpr bool HOST_IS_BIG_ENDIAN = true; +#else +constexpr bool HOST_IS_BIG_ENDIAN = false; +#endif + +// Helper to manually compute expected RX conversion +static int16_t compute_expected_rx(int16_t val, int shift, bool swap_bytes) +{ + if (swap_bytes) { + val = (val << 8) | ((val >> 8) & 0x00FF); + } + // Arithmetic right shift + return val >> shift; +} + +// Helper to manually compute expected TX conversion +static int16_t compute_expected_tx(int16_t val, int shift, bool swap_bytes) +{ + val = val << shift; + if (swap_bytes) { + val = (val << 8) | ((val >> 8) & 0x00FF); + } + return val; +} + +class UsrpConvertersTest : public ::testing::Test { + protected: + void SetUp() override + { + // Register converters with default shift of 4 + register_oai_converters(4); + } +}; + +TEST_F(UsrpConvertersTest, LookupConverters) +{ + // Test lookup of LE converters + uhd::convert::id_type rx_id_le; + rx_id_le.input_format = "sc16_item32_le"; + rx_id_le.num_inputs = 1; + rx_id_le.output_format = "sc16_oai"; + rx_id_le.num_outputs = 1; + + uhd::convert::function_type rx_fcn_le = uhd::convert::get_converter(rx_id_le); + ASSERT_TRUE(rx_fcn_le); + uhd::convert::converter::sptr rx_conv_le = rx_fcn_le(); + ASSERT_TRUE(rx_conv_le); + + uhd::convert::id_type tx_id_le; + tx_id_le.input_format = "sc16_oai"; + tx_id_le.num_inputs = 1; + tx_id_le.output_format = "sc16_item32_le"; + tx_id_le.num_outputs = 1; + + uhd::convert::function_type tx_fcn_le = uhd::convert::get_converter(tx_id_le); + ASSERT_TRUE(tx_fcn_le); + uhd::convert::converter::sptr tx_conv_le = tx_fcn_le(); + ASSERT_TRUE(tx_conv_le); + + // Test lookup of BE converters + uhd::convert::id_type rx_id_be; + rx_id_be.input_format = "sc16_item32_be"; + rx_id_be.num_inputs = 1; + rx_id_be.output_format = "sc16_oai"; + rx_id_be.num_outputs = 1; + + uhd::convert::function_type rx_fcn_be = uhd::convert::get_converter(rx_id_be); + ASSERT_TRUE(rx_fcn_be); + uhd::convert::converter::sptr rx_conv_be = rx_fcn_be(); + ASSERT_TRUE(rx_conv_be); + + uhd::convert::id_type tx_id_be; + tx_id_be.input_format = "sc16_oai"; + tx_id_be.num_inputs = 1; + tx_id_be.output_format = "sc16_item32_be"; + tx_id_be.num_outputs = 1; + + uhd::convert::function_type tx_fcn_be = uhd::convert::get_converter(tx_id_be); + ASSERT_TRUE(tx_fcn_be); + uhd::convert::converter::sptr tx_conv_be = tx_fcn_be(); + ASSERT_TRUE(tx_conv_be); +} + +void run_rx_test(const std::string& input_format, int shift, bool swap_bytes, int offset_in, int offset_out, size_t num_samples) +{ + uhd::convert::id_type rx_id; + rx_id.input_format = input_format; + rx_id.num_inputs = 1; + rx_id.output_format = "sc16_oai"; + rx_id.num_outputs = 1; + + uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)(); + ASSERT_TRUE(rx_conv); + + // Allocate 64-byte aligned buffers + alignas(64) int16_t raw_in[2048]; + alignas(64) int16_t raw_out[2048]; + + // Initialize input with test pattern + for (int i = 0; i < 2048; ++i) { + raw_in[i] = static_cast(i * 3 + 7); + } + std::memset(raw_out, 0, sizeof(raw_out)); + + int16_t* in_ptr = &raw_in[offset_in]; + int16_t* out_ptr = &raw_out[offset_out]; + + // Check alignment + bool is_in_aligned = (((uintptr_t)in_ptr) & 0x3F) == 0; + bool is_out_aligned = (((uintptr_t)out_ptr) & 0x3F) == 0; + + uhd::ref_vector in_vec(in_ptr); + uhd::ref_vector out_vec(out_ptr); + + rx_conv->conv(in_vec, out_vec, num_samples); + + // Verify outputs + size_t total_ints = num_samples * 2; + for (size_t i = 0; i < total_ints; ++i) { + int16_t expected = compute_expected_rx(in_ptr[i], shift, swap_bytes); + EXPECT_EQ(out_ptr[i], expected) << "Mismatch at index " << i << " with num_samples=" << num_samples + << ", in_aligned=" << is_in_aligned << ", out_aligned=" << is_out_aligned; + } +} + +void run_tx_test(const std::string& output_format, int shift, bool swap_bytes, int offset_in, int offset_out, size_t num_samples) +{ + uhd::convert::id_type tx_id; + tx_id.input_format = "sc16_oai"; + tx_id.num_inputs = 1; + tx_id.output_format = output_format; + tx_id.num_outputs = 1; + + uhd::convert::converter::sptr tx_conv = uhd::convert::get_converter(tx_id)(); + ASSERT_TRUE(tx_conv); + + // Allocate 64-byte aligned buffers + alignas(64) int16_t raw_in[2048]; + alignas(64) int16_t raw_out[2048]; + + // Initialize input with test pattern + for (int i = 0; i < 2048; ++i) { + raw_in[i] = static_cast(i * 5 - 11); + } + std::memset(raw_out, 0, sizeof(raw_out)); + + int16_t* in_ptr = &raw_in[offset_in]; + int16_t* out_ptr = &raw_out[offset_out]; + + // Check alignment + bool is_in_aligned = (((uintptr_t)in_ptr) & 0x3F) == 0; + bool is_out_aligned = (((uintptr_t)out_ptr) & 0x3F) == 0; + + uhd::ref_vector in_vec(in_ptr); + uhd::ref_vector out_vec(out_ptr); + + tx_conv->conv(in_vec, out_vec, num_samples); + + // Verify outputs + size_t total_ints = num_samples * 2; + for (size_t i = 0; i < total_ints; ++i) { + int16_t expected = compute_expected_tx(in_ptr[i], shift, swap_bytes); + EXPECT_EQ(out_ptr[i], expected) << "Mismatch at index " << i << " with num_samples=" << num_samples + << ", in_aligned=" << is_in_aligned << ", out_aligned=" << is_out_aligned; + } +} + +TEST_F(UsrpConvertersTest, RxConverterLE_Shift4) +{ + constexpr bool swap_bytes = HOST_IS_BIG_ENDIAN; + + // Test combinations of alignments and sizes + std::vector offsets = {0, 1, 2, 4, 8, 15, 16, 32}; + std::vector sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256}; + + for (int off_in : offsets) { + for (int off_out : offsets) { + for (size_t num_s : sample_counts) { + run_rx_test("sc16_item32_le", 4, swap_bytes, off_in, off_out, num_s); + } + } + } +} + +TEST_F(UsrpConvertersTest, RxConverterBE_Shift4) +{ + constexpr bool swap_bytes = !HOST_IS_BIG_ENDIAN; + + // Test combinations of alignments and sizes + std::vector offsets = {0, 1, 2, 4, 8, 15, 16, 32}; + std::vector sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256}; + + for (int off_in : offsets) { + for (int off_out : offsets) { + for (size_t num_s : sample_counts) { + run_rx_test("sc16_item32_be", 4, swap_bytes, off_in, off_out, num_s); + } + } + } +} + +TEST_F(UsrpConvertersTest, TxConverterLE) +{ + constexpr bool swap_bytes = HOST_IS_BIG_ENDIAN; + + // Test combinations of alignments and sizes + std::vector offsets = {0, 1, 2, 4, 8, 15, 16, 32}; + std::vector sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256}; + + for (int off_in : offsets) { + for (int off_out : offsets) { + for (size_t num_s : sample_counts) { + run_tx_test("sc16_item32_le", 4, swap_bytes, off_in, off_out, num_s); + } + } + } +} + +TEST_F(UsrpConvertersTest, TxConverterBE) +{ + constexpr bool swap_bytes = !HOST_IS_BIG_ENDIAN; + + // Test combinations of alignments and sizes + std::vector offsets = {0, 1, 2, 4, 8, 15, 16, 32}; + std::vector sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256}; + + for (int off_in : offsets) { + for (int off_out : offsets) { + for (size_t num_s : sample_counts) { + run_tx_test("sc16_item32_be", 4, swap_bytes, off_in, off_out, num_s); + } + } + } +} + +TEST_F(UsrpConvertersTest, RxConverterShift2) +{ + // Register with shift 2 + register_oai_converters(2); + + constexpr bool swap_bytes_le = HOST_IS_BIG_ENDIAN; + constexpr bool swap_bytes_be = !HOST_IS_BIG_ENDIAN; + + std::vector offsets = {0, 1, 2, 8}; + std::vector sample_counts = {1, 7, 8, 31, 32, 64}; + + for (int off_in : offsets) { + for (int off_out : offsets) { + for (size_t num_s : sample_counts) { + run_rx_test("sc16_item32_le", 2, swap_bytes_le, off_in, off_out, num_s); + run_rx_test("sc16_item32_be", 2, swap_bytes_be, off_in, off_out, num_s); + } + } + } +} + +#include + +static void BM_RxConverterLE_Shift4_Aligned(benchmark::State& state) { + uhd::convert::id_type rx_id; + rx_id.input_format = "sc16_item32_le"; + rx_id.num_inputs = 1; + rx_id.output_format = "sc16_oai"; + rx_id.num_outputs = 1; + + uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)(); + + alignas(64) int16_t raw_in[4096]; + alignas(64) int16_t raw_out[4096]; + + for (int i = 0; i < 4096; ++i) { + raw_in[i] = static_cast(i); + } + + const void* in_vec_ptr = raw_in; + void* out_vec_ptr = raw_out; + uhd::ref_vector in_vec(in_vec_ptr); + uhd::ref_vector out_vec(out_vec_ptr); + + size_t num_samples = state.range(0); + + for (auto _ : state) { + rx_conv->conv(in_vec, out_vec, num_samples); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_samples); +} +BENCHMARK(BM_RxConverterLE_Shift4_Aligned)->RangeMultiplier(2)->Range(8, 2048); + +static void BM_RxConverterLE_Shift4_Unaligned(benchmark::State& state) { + uhd::convert::id_type rx_id; + rx_id.input_format = "sc16_item32_le"; + rx_id.num_inputs = 1; + rx_id.output_format = "sc16_oai"; + rx_id.num_outputs = 1; + + uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)(); + + alignas(64) int16_t raw_in[4096]; + alignas(64) int16_t raw_out[4096]; + + for (int i = 0; i < 4096; ++i) { + raw_in[i] = static_cast(i); + } + + const void* in_vec_ptr = raw_in + 1; // 2-byte aligned + void* out_vec_ptr = raw_out + 1; + uhd::ref_vector in_vec(in_vec_ptr); + uhd::ref_vector out_vec(out_vec_ptr); + + size_t num_samples = state.range(0); + + for (auto _ : state) { + rx_conv->conv(in_vec, out_vec, num_samples); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_samples); +} +BENCHMARK(BM_RxConverterLE_Shift4_Unaligned)->RangeMultiplier(2)->Range(8, 2048); + +static void BM_TxConverterLE_Aligned(benchmark::State& state) { + uhd::convert::id_type tx_id; + tx_id.input_format = "sc16_oai"; + tx_id.num_inputs = 1; + tx_id.output_format = "sc16_item32_le"; + tx_id.num_outputs = 1; + + uhd::convert::converter::sptr tx_conv = uhd::convert::get_converter(tx_id)(); + + alignas(64) int16_t raw_in[4096]; + alignas(64) int16_t raw_out[4096]; + + for (int i = 0; i < 4096; ++i) { + raw_in[i] = static_cast(i); + } + + const void* in_vec_ptr = raw_in; + void* out_vec_ptr = raw_out; + uhd::ref_vector in_vec(in_vec_ptr); + uhd::ref_vector out_vec(out_vec_ptr); + + size_t num_samples = state.range(0); + + for (auto _ : state) { + tx_conv->conv(in_vec, out_vec, num_samples); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_samples); +} +BENCHMARK(BM_TxConverterLE_Aligned)->RangeMultiplier(2)->Range(8, 2048); + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + int gtest_ret = RUN_ALL_TESTS(); + + bool run_benchmarks = false; + for (int i = 1; i < argc; ++i) { + if (std::strcmp(argv[i], "--benchmark") == 0 || std::strstr(argv[i], "--benchmark_") != nullptr) { + run_benchmarks = true; + break; + } + } + + if (run_benchmarks) { + // Re-register converters with shift 4 for benchmark + register_oai_converters(4); + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + } + + return gtest_ret; +} diff --git a/radio/USRP/usrp_converters.cpp b/radio/USRP/usrp_converters.cpp new file mode 100644 index 0000000000..4452ec8a4f --- /dev/null +++ b/radio/USRP/usrp_converters.cpp @@ -0,0 +1,417 @@ +/* + * SPDX-License-Identifier: LicenseRef-CSSL-1.0 + */ + +#include +#include +#include +#include "usrp_converters.hpp" + +#if defined(__ARM_NEON) +#include +#endif + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +constexpr bool HOST_IS_BIG_ENDIAN = true; +#else +constexpr bool HOST_IS_BIG_ENDIAN = false; +#endif + +const std::string CPU_FORMAT_OAI = "sc16_oai"; + +namespace uhd { +namespace convert { +converter::~converter(void) +{ + // NOP +} +} // namespace convert +} // namespace uhd + +template +class sc16_oai_rx_converter : public uhd::convert::converter { + public: + virtual ~sc16_oai_rx_converter(void) override + { + } + + void set_scalar(const double) override + { + // No-op + } + + void operator()(const input_type& in, const output_type& out, const size_t num) override + { + for (size_t chan = 0; chan < in.size(); ++chan) { + const int16_t* src = reinterpret_cast(in[chan]); + int16_t* dest = reinterpret_cast(out[chan]); + + size_t total_ints = num * 2; + size_t j = 0; + +#if defined(__ARM_NEON) + // Process 32 elements (16 samples) at a time + for (; j + 31 < total_ints; j += 32) { + int16x8_t v0 = vld1q_s16(&src[j + 0]); + int16x8_t v1 = vld1q_s16(&src[j + 8]); + int16x8_t v2 = vld1q_s16(&src[j + 16]); + int16x8_t v3 = vld1q_s16(&src[j + 24]); + + if (SwapBytes) { + v0 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v0))); + v1 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v1))); + v2 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v2))); + v3 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v3))); + } + + v0 = vshrq_n_s16(v0, Shift); + v1 = vshrq_n_s16(v1, Shift); + v2 = vshrq_n_s16(v2, Shift); + v3 = vshrq_n_s16(v3, Shift); + + vst1q_s16(&dest[j + 0], v0); + vst1q_s16(&dest[j + 8], v1); + vst1q_s16(&dest[j + 16], v2); + vst1q_s16(&dest[j + 24], v3); + } + + // Process remaining elements (8 at a time) + for (; j + 7 < total_ints; j += 8) { + int16x8_t v = vld1q_s16(&src[j]); + if (SwapBytes) { + v = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v))); + } + v = vshrq_n_s16(v, Shift); + vst1q_s16(&dest[j], v); + } +#elif defined(__AVX512F__) && defined(__AVX512BW__) + bool src_aligned = ((((uintptr_t)src) & 0x3F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x3F) == 0); + simde__m512i mask_ff = simde_mm512_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 31 < total_ints; j += 32) { + simde__m512i v_in = simde_mm512_load_si512(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_in, 8), mask_ff); + simde__m512i high = simde_mm512_slli_epi16(v_in, 8); + v_in = simde_mm512_or_si512(low, high); + } + simde__m512i v_out = simde_mm512_srai_epi16(v_in, Shift); + simde_mm512_store_si512(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 31 < total_ints; j += 32) { + simde__m512i v_in = simde_mm512_loadu_si512(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_in, 8), mask_ff); + simde__m512i high = simde_mm512_slli_epi16(v_in, 8); + v_in = simde_mm512_or_si512(low, high); + } + simde__m512i v_out = simde_mm512_srai_epi16(v_in, Shift); + simde_mm512_storeu_si512(reinterpret_cast(&dest[j]), v_out); + } + } +#elif defined(__AVX2__) + bool src_aligned = ((((uintptr_t)src) & 0x1F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x1F) == 0); + simde__m256i mask_ff = simde_mm256_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 15 < total_ints; j += 16) { + simde__m256i v_in = simde_mm256_load_si256(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_in, 8), mask_ff); + simde__m256i high = simde_mm256_slli_epi16(v_in, 8); + v_in = simde_mm256_or_si256(low, high); + } + simde__m256i v_out = simde_mm256_srai_epi16(v_in, Shift); + simde_mm256_store_si256(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 15 < total_ints; j += 16) { + simde__m256i v_in = simde_mm256_loadu_si256(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_in, 8), mask_ff); + simde__m256i high = simde_mm256_slli_epi16(v_in, 8); + v_in = simde_mm256_or_si256(low, high); + } + simde__m256i v_out = simde_mm256_srai_epi16(v_in, Shift); + simde_mm256_storeu_si256(reinterpret_cast(&dest[j]), v_out); + } + } +#elif defined(__SSE2__) + bool src_aligned = ((((uintptr_t)src) & 0x0F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x0F) == 0); + simde__m128i mask_ff = simde_mm_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 7 < total_ints; j += 8) { + simde__m128i v_in = simde_mm_load_si128(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_in, 8), mask_ff); + simde__m128i high = simde_mm_slli_epi16(v_in, 8); + v_in = simde_mm_or_si128(low, high); + } + simde__m128i v_out = simde_mm_srai_epi16(v_in, Shift); + simde_mm_store_si128(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 7 < total_ints; j += 8) { + simde__m128i v_in = simde_mm_loadu_si128(reinterpret_cast(&src[j])); + if (SwapBytes) { + simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_in, 8), mask_ff); + simde__m128i high = simde_mm_slli_epi16(v_in, 8); + v_in = simde_mm_or_si128(low, high); + } + simde__m128i v_out = simde_mm_srai_epi16(v_in, Shift); + simde_mm_storeu_si128(reinterpret_cast(&dest[j]), v_out); + } + } +#endif + for (; j < total_ints; ++j) { + int16_t val = src[j]; + if (SwapBytes) { + val = (val << 8) | ((val >> 8) & 0x00FF); + } + dest[j] = val >> Shift; + } + } + } +}; + +template +class sc16_oai_tx_converter : public uhd::convert::converter { + public: + virtual ~sc16_oai_tx_converter(void) override + { + } + + void set_scalar(const double) override + { + // No-op + } + + void operator()(const input_type& in, const output_type& out, const size_t num) override + { + for (size_t chan = 0; chan < in.size(); ++chan) { + const int16_t* src = reinterpret_cast(in[chan]); + int16_t* dest = reinterpret_cast(out[chan]); + + size_t total_ints = num * 2; + size_t j = 0; + +#if defined(__ARM_NEON) + // Process 32 elements (16 samples) at a time + for (; j + 31 < total_ints; j += 32) { + int16x8_t v0 = vld1q_s16(&src[j + 0]); + int16x8_t v1 = vld1q_s16(&src[j + 8]); + int16x8_t v2 = vld1q_s16(&src[j + 16]); + int16x8_t v3 = vld1q_s16(&src[j + 24]); + + v0 = vshlq_n_s16(v0, Shift); + v1 = vshlq_n_s16(v1, Shift); + v2 = vshlq_n_s16(v2, Shift); + v3 = vshlq_n_s16(v3, Shift); + + if (SwapBytes) { + v0 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v0))); + v1 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v1))); + v2 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v2))); + v3 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v3))); + } + + vst1q_s16(&dest[j + 0], v0); + vst1q_s16(&dest[j + 8], v1); + vst1q_s16(&dest[j + 16], v2); + vst1q_s16(&dest[j + 24], v3); + } + + // Process remaining elements (8 at a time) + for (; j + 7 < total_ints; j += 8) { + int16x8_t v = vld1q_s16(&src[j]); + v = vshlq_n_s16(v, Shift); + if (SwapBytes) { + v = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v))); + } + vst1q_s16(&dest[j], v); + } +#elif defined(__AVX512F__) && defined(__AVX512BW__) + bool src_aligned = ((((uintptr_t)src) & 0x3F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x3F) == 0); + simde__m512i mask_ff = simde_mm512_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 31 < total_ints; j += 32) { + simde__m512i v_in = simde_mm512_load_si512(reinterpret_cast(&src[j])); + simde__m512i v_out = simde_mm512_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_out, 8), mask_ff); + simde__m512i high = simde_mm512_slli_epi16(v_out, 8); + v_out = simde_mm512_or_si512(low, high); + } + simde_mm512_store_si512(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 31 < total_ints; j += 32) { + simde__m512i v_in = simde_mm512_loadu_si512(reinterpret_cast(&src[j])); + simde__m512i v_out = simde_mm512_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_out, 8), mask_ff); + simde__m512i high = simde_mm512_slli_epi16(v_out, 8); + v_out = simde_mm512_or_si512(low, high); + } + simde_mm512_storeu_si512(reinterpret_cast(&dest[j]), v_out); + } + } +#elif defined(__AVX2__) + bool src_aligned = ((((uintptr_t)src) & 0x1F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x1F) == 0); + simde__m256i mask_ff = simde_mm256_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 15 < total_ints; j += 16) { + simde__m256i v_in = simde_mm256_load_si256(reinterpret_cast(&src[j])); + simde__m256i v_out = simde_mm256_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_out, 8), mask_ff); + simde__m256i high = simde_mm256_slli_epi16(v_out, 8); + v_out = simde_mm256_or_si256(low, high); + } + simde_mm256_store_si256(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 15 < total_ints; j += 16) { + simde__m256i v_in = simde_mm256_loadu_si256(reinterpret_cast(&src[j])); + simde__m256i v_out = simde_mm256_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_out, 8), mask_ff); + simde__m256i high = simde_mm256_slli_epi16(v_out, 8); + v_out = simde_mm256_or_si256(low, high); + } + simde_mm256_storeu_si256(reinterpret_cast(&dest[j]), v_out); + } + } +#elif defined(__SSE2__) + bool src_aligned = ((((uintptr_t)src) & 0x0F) == 0); + bool dest_aligned = ((((uintptr_t)dest) & 0x0F) == 0); + simde__m128i mask_ff = simde_mm_set1_epi16(0x00FF); + + if (src_aligned && dest_aligned) { + for (; j + 7 < total_ints; j += 8) { + simde__m128i v_in = simde_mm_load_si128(reinterpret_cast(&src[j])); + simde__m128i v_out = simde_mm_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_out, 8), mask_ff); + simde__m128i high = simde_mm_slli_epi16(v_out, 8); + v_out = simde_mm_or_si128(low, high); + } + simde_mm_store_si128(reinterpret_cast(&dest[j]), v_out); + } + } else { + for (; j + 7 < total_ints; j += 8) { + simde__m128i v_in = simde_mm_loadu_si128(reinterpret_cast(&src[j])); + simde__m128i v_out = simde_mm_slli_epi16(v_in, Shift); + if (SwapBytes) { + simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_out, 8), mask_ff); + simde__m128i high = simde_mm_slli_epi16(v_out, 8); + v_out = simde_mm_or_si128(low, high); + } + simde_mm_storeu_si128(reinterpret_cast(&dest[j]), v_out); + } + } +#endif + for (; j < total_ints; ++j) { + int16_t val = src[j] << Shift; + if (SwapBytes) { + val = (val << 8) | ((val >> 8) & 0x00FF); + } + dest[j] = val; + } + } + } +}; + +void register_oai_converters(int rxshift) +{ + uhd::convert::register_bytes_per_item(CPU_FORMAT_OAI, sizeof(int16_t) * 2); + + constexpr bool swap_le = (false != HOST_IS_BIG_ENDIAN); + constexpr bool swap_be = (true != HOST_IS_BIG_ENDIAN); + + for (size_t num_chans = 1; num_chans <= 4; ++num_chans) { + // 1. LE wire format + { + uhd::convert::id_type rx_id; + rx_id.input_format = "sc16_item32_le"; + rx_id.num_inputs = num_chans; + rx_id.output_format = CPU_FORMAT_OAI; + rx_id.num_outputs = num_chans; + + switch (rxshift) { + case 2: + uhd::convert::register_converter( + rx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<2, swap_le>()); }, + 100); + break; + case 4: + uhd::convert::register_converter( + rx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<4, swap_le>()); }, + 100); + break; + default: + break; + } + + uhd::convert::id_type tx_id; + tx_id.input_format = CPU_FORMAT_OAI; + tx_id.num_inputs = num_chans; + tx_id.output_format = rx_id.input_format; + tx_id.num_outputs = num_chans; + + uhd::convert::register_converter( + tx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_tx_converter<4, swap_le>()); }, + 100); + } + + // 2. BE wire format + { + uhd::convert::id_type rx_id; + rx_id.input_format = "sc16_item32_be"; + rx_id.num_inputs = num_chans; + rx_id.output_format = CPU_FORMAT_OAI; + rx_id.num_outputs = num_chans; + + switch (rxshift) { + case 2: + uhd::convert::register_converter( + rx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<2, swap_be>()); }, + 100); + break; + case 4: + uhd::convert::register_converter( + rx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<4, swap_be>()); }, + 100); + break; + default: + break; + } + + uhd::convert::id_type tx_id; + tx_id.input_format = CPU_FORMAT_OAI; + tx_id.num_inputs = num_chans; + tx_id.output_format = rx_id.input_format; + tx_id.num_outputs = num_chans; + + uhd::convert::register_converter( + tx_id, + []() { return uhd::convert::converter::sptr(new sc16_oai_tx_converter<4, swap_be>()); }, + 100); + } + } +} diff --git a/radio/USRP/usrp_converters.hpp b/radio/USRP/usrp_converters.hpp new file mode 100644 index 0000000000..144387af24 --- /dev/null +++ b/radio/USRP/usrp_converters.hpp @@ -0,0 +1,10 @@ +/* + * SPDX-License-Identifier: LicenseRef-CSSL-1.0 + */ + +#ifndef USRP_CONVERTERS_HPP +#define USRP_CONVERTERS_HPP + +void register_oai_converters(int rxshift); + +#endif // USRP_CONVERTERS_HPP diff --git a/radio/USRP/usrp_lib.cpp b/radio/USRP/usrp_lib.cpp index 5b8b34c9ed..f37955616c 100644 --- a/radio/USRP/usrp_lib.cpp +++ b/radio/USRP/usrp_lib.cpp @@ -14,6 +14,7 @@ #else #include #endif +#include #include #include #include @@ -35,6 +36,8 @@ #include #include "openair1/PHY/sse_intrin.h" +#include "usrp_converters.hpp" + /** @addtogroup _USRP_PHY_RF_INTERFACE_ * @{ @@ -415,7 +418,6 @@ static int trx_usrp_write(openair0_device_t *device, int ret=0; usrp_state_t *s = (usrp_state_t *)device->priv; timestamp -= device->openair0_cfg->command_line_sample_advance + device->openair0_cfg->tx_sample_advance; - int nsamps2; // aligned to upper 32 or 16 byte boundary radio_tx_burst_flag_t flags_burst = (radio_tx_burst_flag_t) (flags & 0xf); radio_tx_gpio_flag_t flags_gpio = (radio_tx_gpio_flag_t) ((flags >> 4) & 0x1fff); @@ -458,22 +460,6 @@ static int trx_usrp_write(openair0_device_t *device, } if (usrp_tx_thread == 0) { - nsamps2 = (nsamps+7)>>3; - simde__m256i buff_tx[cc < 2 ? 2 : cc][nsamps2]; - - // bring TX data into 16 MSBs, assuming it is on the 12 LSB after OAI computation - const int shift = 4; - for (int i = 0; i < cc; i++) { - for (int j = 0; j < nsamps2; j++) { - if ((((uintptr_t)buff[i]) & 0x1F) == 0) { - buff_tx[i][j] = simde_mm256_slli_epi16(((simde__m256i *)buff[i])[j], shift); - } else { - simde__m256i tmp = simde_mm256_loadu_si256(((simde__m256i *)buff[i]) + j); - buff_tx[i][j] = simde_mm256_slli_epi16(tmp, shift); - } - } - } - s->tx_md.has_time_spec = true; s->tx_md.start_of_burst = (s->tx_count == 0) ? true : first_packet_state; s->tx_md.end_of_burst = last_packet_state; @@ -494,11 +480,11 @@ static int trx_usrp_write(openair0_device_t *device, std::vector buff_ptrs; for (int i = 0; i < cc; i++) - buff_ptrs.push_back(&(((int16_t *)buff_tx[i])[0])); + buff_ptrs.push_back(buff[i]); ret = (int)s->tx_stream->send(buff_ptrs, nsamps, s->tx_md); } else { - ret = (int)s->tx_stream->send(&(((int16_t *)buff_tx[0])[0]), nsamps, s->tx_md); + ret = (int)s->tx_stream->send(buff[0], nsamps, s->tx_md); } if (ret != nsamps) { @@ -553,7 +539,6 @@ void *trx_usrp_write_thread(void * arg) openair0_write_package_t *write_package = write_thread->write_package; usrp_state_t *s; - int nsamps2; // aligned to upper 32 or 16 byte boundary int start; openair0_timestamp_t timestamp; void **buff; @@ -588,23 +573,6 @@ void *trx_usrp_write_thread(void * arg) LOG_W(HW,"count write = %d, start = %d, end = %d\n", write_thread->count_write, write_thread->start, write_thread->end); }*/ - nsamps2 = (nsamps+7)>>3; - simde__m256i buff_tx[cc < 2 ? 2 : cc][nsamps2]; - // bring TX data into 16 MSBs, assuming it is on the 12 LSB after OAI computation - const int shift = 4; - for (int i = 0; i < cc; i++) { - for (int j = 0; j < nsamps2; j++) { - if ((((uintptr_t) buff[i])&0x1F)==0) { - buff_tx[i][j] = simde_mm256_slli_epi16(((simde__m256i *)buff[i])[j], shift); - } - else - { - simde__m256i tmp = simde_mm256_loadu_si256(((simde__m256i *)buff[i]) + j); - buff_tx[i][j] = simde_mm256_slli_epi16(tmp, shift); - } - } - } - s->tx_md.has_time_spec = true; s->tx_md.start_of_burst = (s->tx_count==0) ? true : first_packet; s->tx_md.end_of_burst = last_packet; @@ -624,15 +592,15 @@ void *trx_usrp_write_thread(void * arg) std::vector buff_ptrs; for (int i=0; itx_stream->send(buff_ptrs, nsamps, s->tx_md); } else { - ret = (int)s->tx_stream->send(&(((int16_t *)buff_tx[0])[0]), nsamps, s->tx_md); + ret = (int)s->tx_stream->send(buff[0], nsamps, s->tx_md); } - T(T_USRP_TX_ANT0, T_INT(timestamp), T_BUFFER(buff_tx[0], nsamps*4)); + T(T_USRP_TX_ANT0, T_INT(timestamp), T_BUFFER(buff[0], nsamps*4)); if (ret != nsamps) LOG_E(HW,"[xmit] tx samples %d != %d\n",ret,nsamps); VCD_SIGNAL_DUMPER_DUMP_VARIABLE_BY_NAME( VCD_SIGNAL_DUMPER_VARIABLES_USRP_SEND_RETURN, ret ); @@ -693,23 +661,7 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes { usrp_state_t *s = (usrp_state_t *)device->priv; int samples_received=0; - int nsamps2; // aligned to upper 32 or 16 byte boundary - nsamps2 = (nsamps+7)>>3; - simde__m256i buff_tmp[cc < 2 ? 2 : cc][nsamps2]; static int read_count = 0; - int rxshift; - switch (device->type) { - case USRP_B200_DEV: - rxshift=4; - break; - case USRP_X300_DEV: - case USRP_N300_DEV: - case USRP_X400_DEV: - rxshift=2; - break; - default: - AssertFatal(1==0,"Shouldn't be here\n"); - } samples_received=0; while (samples_received != nsamps) { @@ -718,12 +670,12 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes // receive multiple channels (e.g. RF A and RF B) std::vector buff_ptrs; - for (int i=0; irx_stream->recv(buff_ptrs, nsamps-samples_received, s->rx_md); } else { // receive a single channel (e.g. from connector RF A) - samples_received += s->rx_stream->recv((void*)((int32_t*)buff_tmp[0]+samples_received), + samples_received += s->rx_stream->recv((void*)((int32_t*)buff[0]+samples_received), nsamps-samples_received, s->rx_md); } if ((s->wait_for_first_pps == 0) && (s->rx_md.error_code!=uhd::rx_metadata_t::ERROR_CODE_NONE)) @@ -735,22 +687,6 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes } if (samples_received == nsamps) s->wait_for_first_pps=0; - // bring RX data into 12 LSBs for softmodem RX - for (int i=0; iusrp->get_clock_source(0).c_str()); LOG_I(HW,"Actual time source %s...\n",s->usrp->get_time_source(0).c_str()); + // register custom OAI converters + int rxshift; + switch (device->type) { + case USRP_B200_DEV: + rxshift = 4; + break; + case USRP_X300_DEV: + case USRP_N300_DEV: + case USRP_X400_DEV: + rxshift = 2; + break; + default: + AssertFatal(1 == 0, "Shouldn't be here\n"); + } + register_oai_converters(rxshift); + // create tx & rx streamer - uhd::stream_args_t stream_args_rx("sc16", "sc16"); + uhd::stream_args_t stream_args_rx("sc16_oai", "sc16"); for (int i = 0; irx_stream->get_max_num_samps()); - uhd::stream_args_t stream_args_tx("sc16", "sc16"); + uhd::stream_args_t stream_args_tx("sc16_oai", "sc16"); for (int i = 0; i