diff --git a/radio/USRP/CMakeLists.txt b/radio/USRP/CMakeLists.txt
index c0e4f551fa..db6704b344 100644
--- a/radio/USRP/CMakeLists.txt
+++ b/radio/USRP/CMakeLists.txt
@@ -4,7 +4,7 @@ add_compile_options(-Wunused-parameter)
 #find_package(Boost REQUIRED)
 find_package(UHD REQUIRED UHD)
 
-add_library(oai_usrpdevif MODULE usrp_lib.cpp)
+add_library(oai_usrpdevif MODULE usrp_lib.cpp usrp_converters.cpp)
 #target_include_directories(oai_usrpdevif PRIVATE Boost::boost)
 target_link_libraries(oai_usrpdevif PRIVATE ${UHD_LIBRARIES})
 target_include_directories(oai_usrpdevif PRIVATE ${UHD_INCLUDE_DIRS})
@@ -13,3 +13,7 @@ set_target_properties(oai_usrpdevif PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_
 add_custom_command(TARGET oai_usrpdevif POST_BUILD
                    COMMAND ${CMAKE_COMMAND} -E create_symlink liboai_usrpdevif.so liboai_device.so
                    WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+
+if (ENABLE_TESTS)
+  add_subdirectory(tests)
+endif()
diff --git a/radio/USRP/tests/CMakeLists.txt b/radio/USRP/tests/CMakeLists.txt
new file mode 100644
index 0000000000..491424fbfe
--- /dev/null
+++ b/radio/USRP/tests/CMakeLists.txt
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: LicenseRef-CSSL-1.0
+
+add_executable(test_usrp_converters test_usrp_converters.cpp ../usrp_converters.cpp)
+target_link_libraries(test_usrp_converters PRIVATE GTest::gtest benchmark::benchmark ${UHD_LIBRARIES})
+target_include_directories(test_usrp_converters PRIVATE ${UHD_INCLUDE_DIRS} .. ${CMAKE_SOURCE_DIR})
+add_dependencies(tests test_usrp_converters)
+add_test(NAME test_usrp_converters
+  COMMAND ./test_usrp_converters)
diff --git a/radio/USRP/tests/test_usrp_converters.cpp b/radio/USRP/tests/test_usrp_converters.cpp
new file mode 100644
index 0000000000..282c76c5ab
--- /dev/null
+++ b/radio/USRP/tests/test_usrp_converters.cpp
@@ -0,0 +1,386 @@
+/*
+ * SPDX-License-Identifier: LicenseRef-CSSL-1.0
+ */
+
+#include <gtest/gtest.h>
+#include <uhd/convert.hpp>
+#include <vector>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include "usrp_converters.hpp"
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+constexpr bool HOST_IS_BIG_ENDIAN = true;
+#else
+constexpr bool HOST_IS_BIG_ENDIAN = false;
+#endif
+
+// Helper to manually compute expected RX conversion
+static int16_t compute_expected_rx(int16_t val, int shift, bool swap_bytes)
+{
+  if (swap_bytes) {
+    val = (val << 8) | ((val >> 8) & 0x00FF);
+  }
+  // Arithmetic right shift
+  return val >> shift;
+}
+
+// Helper to manually compute expected TX conversion
+static int16_t compute_expected_tx(int16_t val, int shift, bool swap_bytes)
+{
+  val = val << shift;
+  if (swap_bytes) {
+    val = (val << 8) | ((val >> 8) & 0x00FF);
+  }
+  return val;
+}
+
+class UsrpConvertersTest : public ::testing::Test {
+ protected:
+  void SetUp() override
+  {
+    // Register converters with default shift of 4
+    register_oai_converters(4);
+  }
+};
+
+TEST_F(UsrpConvertersTest, LookupConverters)
+{
+  // Test lookup of LE converters
+  uhd::convert::id_type rx_id_le;
+  rx_id_le.input_format = "sc16_item32_le";
+  rx_id_le.num_inputs = 1;
+  rx_id_le.output_format = "sc16_oai";
+  rx_id_le.num_outputs = 1;
+
+  uhd::convert::function_type rx_fcn_le = uhd::convert::get_converter(rx_id_le);
+  ASSERT_TRUE(rx_fcn_le);
+  uhd::convert::converter::sptr rx_conv_le = rx_fcn_le();
+  ASSERT_TRUE(rx_conv_le);
+
+  uhd::convert::id_type tx_id_le;
+  tx_id_le.input_format = "sc16_oai";
+  tx_id_le.num_inputs = 1;
+  tx_id_le.output_format = "sc16_item32_le";
+  tx_id_le.num_outputs = 1;
+
+  uhd::convert::function_type tx_fcn_le = uhd::convert::get_converter(tx_id_le);
+  ASSERT_TRUE(tx_fcn_le);
+  uhd::convert::converter::sptr tx_conv_le = tx_fcn_le();
+  ASSERT_TRUE(tx_conv_le);
+
+  // Test lookup of BE converters
+  uhd::convert::id_type rx_id_be;
+  rx_id_be.input_format = "sc16_item32_be";
+  rx_id_be.num_inputs = 1;
+  rx_id_be.output_format = "sc16_oai";
+  rx_id_be.num_outputs = 1;
+
+  uhd::convert::function_type rx_fcn_be = uhd::convert::get_converter(rx_id_be);
+  ASSERT_TRUE(rx_fcn_be);
+  uhd::convert::converter::sptr rx_conv_be = rx_fcn_be();
+  ASSERT_TRUE(rx_conv_be);
+
+  uhd::convert::id_type tx_id_be;
+  tx_id_be.input_format = "sc16_oai";
+  tx_id_be.num_inputs = 1;
+  tx_id_be.output_format = "sc16_item32_be";
+  tx_id_be.num_outputs = 1;
+
+  uhd::convert::function_type tx_fcn_be = uhd::convert::get_converter(tx_id_be);
+  ASSERT_TRUE(tx_fcn_be);
+  uhd::convert::converter::sptr tx_conv_be = tx_fcn_be();
+  ASSERT_TRUE(tx_conv_be);
+}
+
+void run_rx_test(const std::string& input_format, int shift, bool swap_bytes, int offset_in, int offset_out, size_t num_samples)
+{
+  uhd::convert::id_type rx_id;
+  rx_id.input_format = input_format;
+  rx_id.num_inputs = 1;
+  rx_id.output_format = "sc16_oai";
+  rx_id.num_outputs = 1;
+
+  uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)();
+  ASSERT_TRUE(rx_conv);
+
+  // Allocate 64-byte aligned buffers
+  alignas(64) int16_t raw_in[2048];
+  alignas(64) int16_t raw_out[2048];
+
+  // Initialize input with test pattern
+  for (int i = 0; i < 2048; ++i) {
+    raw_in[i] = static_cast<int16_t>(i * 3 + 7);
+  }
+  std::memset(raw_out, 0, sizeof(raw_out));
+
+  int16_t* in_ptr = &raw_in[offset_in];
+  int16_t* out_ptr = &raw_out[offset_out];
+
+  // Check alignment
+  bool is_in_aligned = (((uintptr_t)in_ptr) & 0x3F) == 0;
+  bool is_out_aligned = (((uintptr_t)out_ptr) & 0x3F) == 0;
+
+  uhd::ref_vector<const void*> in_vec(in_ptr);
+  uhd::ref_vector<void*> out_vec(out_ptr);
+
+  rx_conv->conv(in_vec, out_vec, num_samples);
+
+  // Verify outputs
+  size_t total_ints = num_samples * 2;
+  for (size_t i = 0; i < total_ints; ++i) {
+    int16_t expected = compute_expected_rx(in_ptr[i], shift, swap_bytes);
+    EXPECT_EQ(out_ptr[i], expected) << "Mismatch at index " << i << " with num_samples=" << num_samples
+                                    << ", in_aligned=" << is_in_aligned << ", out_aligned=" << is_out_aligned;
+  }
+}
+
+void run_tx_test(const std::string& output_format, int shift, bool swap_bytes, int offset_in, int offset_out, size_t num_samples)
+{
+  uhd::convert::id_type tx_id;
+  tx_id.input_format = "sc16_oai";
+  tx_id.num_inputs = 1;
+  tx_id.output_format = output_format;
+  tx_id.num_outputs = 1;
+
+  uhd::convert::converter::sptr tx_conv = uhd::convert::get_converter(tx_id)();
+  ASSERT_TRUE(tx_conv);
+
+  // Allocate 64-byte aligned buffers
+  alignas(64) int16_t raw_in[2048];
+  alignas(64) int16_t raw_out[2048];
+
+  // Initialize input with test pattern
+  for (int i = 0; i < 2048; ++i) {
+    raw_in[i] = static_cast<int16_t>(i * 5 - 11);
+  }
+  std::memset(raw_out, 0, sizeof(raw_out));
+
+  int16_t* in_ptr = &raw_in[offset_in];
+  int16_t* out_ptr = &raw_out[offset_out];
+
+  // Check alignment
+  bool is_in_aligned = (((uintptr_t)in_ptr) & 0x3F) == 0;
+  bool is_out_aligned = (((uintptr_t)out_ptr) & 0x3F) == 0;
+
+  uhd::ref_vector<const void*> in_vec(in_ptr);
+  uhd::ref_vector<void*> out_vec(out_ptr);
+
+  tx_conv->conv(in_vec, out_vec, num_samples);
+
+  // Verify outputs
+  size_t total_ints = num_samples * 2;
+  for (size_t i = 0; i < total_ints; ++i) {
+    int16_t expected = compute_expected_tx(in_ptr[i], shift, swap_bytes);
+    EXPECT_EQ(out_ptr[i], expected) << "Mismatch at index " << i << " with num_samples=" << num_samples
+                                    << ", in_aligned=" << is_in_aligned << ", out_aligned=" << is_out_aligned;
+  }
+}
+
+TEST_F(UsrpConvertersTest, RxConverterLE_Shift4)
+{
+  constexpr bool swap_bytes = HOST_IS_BIG_ENDIAN;
+
+  // Test combinations of alignments and sizes
+  std::vector<int> offsets = {0, 1, 2, 4, 8, 15, 16, 32};
+  std::vector<size_t> sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256};
+
+  for (int off_in : offsets) {
+    for (int off_out : offsets) {
+      for (size_t num_s : sample_counts) {
+        run_rx_test("sc16_item32_le", 4, swap_bytes, off_in, off_out, num_s);
+      }
+    }
+  }
+}
+
+TEST_F(UsrpConvertersTest, RxConverterBE_Shift4)
+{
+  constexpr bool swap_bytes = !HOST_IS_BIG_ENDIAN;
+
+  // Test combinations of alignments and sizes
+  std::vector<int> offsets = {0, 1, 2, 4, 8, 15, 16, 32};
+  std::vector<size_t> sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256};
+
+  for (int off_in : offsets) {
+    for (int off_out : offsets) {
+      for (size_t num_s : sample_counts) {
+        run_rx_test("sc16_item32_be", 4, swap_bytes, off_in, off_out, num_s);
+      }
+    }
+  }
+}
+
+TEST_F(UsrpConvertersTest, TxConverterLE)
+{
+  constexpr bool swap_bytes = HOST_IS_BIG_ENDIAN;
+
+  // Test combinations of alignments and sizes
+  std::vector<int> offsets = {0, 1, 2, 4, 8, 15, 16, 32};
+  std::vector<size_t> sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256};
+
+  for (int off_in : offsets) {
+    for (int off_out : offsets) {
+      for (size_t num_s : sample_counts) {
+        run_tx_test("sc16_item32_le", 4, swap_bytes, off_in, off_out, num_s);
+      }
+    }
+  }
+}
+
+TEST_F(UsrpConvertersTest, TxConverterBE)
+{
+  constexpr bool swap_bytes = !HOST_IS_BIG_ENDIAN;
+
+  // Test combinations of alignments and sizes
+  std::vector<int> offsets = {0, 1, 2, 4, 8, 15, 16, 32};
+  std::vector<size_t> sample_counts = {1, 3, 7, 8, 15, 16, 31, 32, 63, 64, 128, 256};
+
+  for (int off_in : offsets) {
+    for (int off_out : offsets) {
+      for (size_t num_s : sample_counts) {
+        run_tx_test("sc16_item32_be", 4, swap_bytes, off_in, off_out, num_s);
+      }
+    }
+  }
+}
+
+TEST_F(UsrpConvertersTest, RxConverterShift2)
+{
+  // Register with shift 2
+  register_oai_converters(2);
+
+  constexpr bool swap_bytes_le = HOST_IS_BIG_ENDIAN;
+  constexpr bool swap_bytes_be = !HOST_IS_BIG_ENDIAN;
+
+  std::vector<int> offsets = {0, 1, 2, 8};
+  std::vector<size_t> sample_counts = {1, 7, 8, 31, 32, 64};
+
+  for (int off_in : offsets) {
+    for (int off_out : offsets) {
+      for (size_t num_s : sample_counts) {
+        run_rx_test("sc16_item32_le", 2, swap_bytes_le, off_in, off_out, num_s);
+        run_rx_test("sc16_item32_be", 2, swap_bytes_be, off_in, off_out, num_s);
+      }
+    }
+  }
+}
+
+#include <benchmark/benchmark.h>
+
+static void BM_RxConverterLE_Shift4_Aligned(benchmark::State& state) {
+  uhd::convert::id_type rx_id;
+  rx_id.input_format = "sc16_item32_le";
+  rx_id.num_inputs = 1;
+  rx_id.output_format = "sc16_oai";
+  rx_id.num_outputs = 1;
+
+  uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)();
+
+  alignas(64) int16_t raw_in[4096];
+  alignas(64) int16_t raw_out[4096];
+
+  for (int i = 0; i < 4096; ++i) {
+    raw_in[i] = static_cast<int16_t>(i);
+  }
+
+  const void* in_vec_ptr = raw_in;
+  void* out_vec_ptr = raw_out;
+  uhd::ref_vector<const void*> in_vec(in_vec_ptr);
+  uhd::ref_vector<void*> out_vec(out_vec_ptr);
+
+  size_t num_samples = state.range(0);
+
+  for (auto _ : state) {
+    rx_conv->conv(in_vec, out_vec, num_samples);
+    benchmark::ClobberMemory();
+  }
+  state.SetItemsProcessed(state.iterations() * num_samples);
+}
+BENCHMARK(BM_RxConverterLE_Shift4_Aligned)->RangeMultiplier(2)->Range(8, 2048);
+
+static void BM_RxConverterLE_Shift4_Unaligned(benchmark::State& state) {
+  uhd::convert::id_type rx_id;
+  rx_id.input_format = "sc16_item32_le";
+  rx_id.num_inputs = 1;
+  rx_id.output_format = "sc16_oai";
+  rx_id.num_outputs = 1;
+
+  uhd::convert::converter::sptr rx_conv = uhd::convert::get_converter(rx_id)();
+
+  alignas(64) int16_t raw_in[4096];
+  alignas(64) int16_t raw_out[4096];
+
+  for (int i = 0; i < 4096; ++i) {
+    raw_in[i] = static_cast<int16_t>(i);
+  }
+
+  const void* in_vec_ptr = raw_in + 1; // 2-byte aligned
+  void* out_vec_ptr = raw_out + 1;
+  uhd::ref_vector<const void*> in_vec(in_vec_ptr);
+  uhd::ref_vector<void*> out_vec(out_vec_ptr);
+
+  size_t num_samples = state.range(0);
+
+  for (auto _ : state) {
+    rx_conv->conv(in_vec, out_vec, num_samples);
+    benchmark::ClobberMemory();
+  }
+  state.SetItemsProcessed(state.iterations() * num_samples);
+}
+BENCHMARK(BM_RxConverterLE_Shift4_Unaligned)->RangeMultiplier(2)->Range(8, 2048);
+
+static void BM_TxConverterLE_Aligned(benchmark::State& state) {
+  uhd::convert::id_type tx_id;
+  tx_id.input_format = "sc16_oai";
+  tx_id.num_inputs = 1;
+  tx_id.output_format = "sc16_item32_le";
+  tx_id.num_outputs = 1;
+
+  uhd::convert::converter::sptr tx_conv = uhd::convert::get_converter(tx_id)();
+
+  alignas(64) int16_t raw_in[4096];
+  alignas(64) int16_t raw_out[4096];
+
+  for (int i = 0; i < 4096; ++i) {
+    raw_in[i] = static_cast<int16_t>(i);
+  }
+
+  const void* in_vec_ptr = raw_in;
+  void* out_vec_ptr = raw_out;
+  uhd::ref_vector<const void*> in_vec(in_vec_ptr);
+  uhd::ref_vector<void*> out_vec(out_vec_ptr);
+
+  size_t num_samples = state.range(0);
+
+  for (auto _ : state) {
+    tx_conv->conv(in_vec, out_vec, num_samples);
+    benchmark::ClobberMemory();
+  }
+  state.SetItemsProcessed(state.iterations() * num_samples);
+}
+BENCHMARK(BM_TxConverterLE_Aligned)->RangeMultiplier(2)->Range(8, 2048);
+
+int main(int argc, char** argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  int gtest_ret = RUN_ALL_TESTS();
+
+  bool run_benchmarks = false;
+  for (int i = 1; i < argc; ++i) {
+    if (std::strcmp(argv[i], "--benchmark") == 0 || std::strstr(argv[i], "--benchmark_") != nullptr) {
+      run_benchmarks = true;
+      break;
+    }
+  }
+
+  if (run_benchmarks) {
+    // Re-register converters with shift 4 for benchmark
+    register_oai_converters(4);
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+  }
+
+  return gtest_ret;
+}
diff --git a/radio/USRP/usrp_converters.cpp b/radio/USRP/usrp_converters.cpp
new file mode 100644
index 0000000000..4452ec8a4f
--- /dev/null
+++ b/radio/USRP/usrp_converters.cpp
@@ -0,0 +1,417 @@
+/*
+ * SPDX-License-Identifier: LicenseRef-CSSL-1.0
+ */
+
+#include <uhd/convert.hpp>
+#include <string>
+#include <simde/x86/avx512.h>
+#include "usrp_converters.hpp"
+
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+constexpr bool HOST_IS_BIG_ENDIAN = true;
+#else
+constexpr bool HOST_IS_BIG_ENDIAN = false;
+#endif
+
+const std::string CPU_FORMAT_OAI = "sc16_oai";
+
+namespace uhd {
+namespace convert {
+converter::~converter(void)
+{
+  // NOP
+}
+} // namespace convert
+} // namespace uhd
+
+template <int Shift, bool SwapBytes>
+class sc16_oai_rx_converter : public uhd::convert::converter {
+ public:
+  virtual ~sc16_oai_rx_converter(void) override
+  {
+  }
+
+  void set_scalar(const double) override
+  {
+    // No-op
+  }
+
+  void operator()(const input_type& in, const output_type& out, const size_t num) override
+  {
+    for (size_t chan = 0; chan < in.size(); ++chan) {
+      const int16_t* src = reinterpret_cast<const int16_t*>(in[chan]);
+      int16_t* dest = reinterpret_cast<int16_t*>(out[chan]);
+
+      size_t total_ints = num * 2;
+      size_t j = 0;
+
+#if defined(__ARM_NEON)
+      // Process 32 elements (16 samples) at a time
+      for (; j + 31 < total_ints; j += 32) {
+        int16x8_t v0 = vld1q_s16(&src[j + 0]);
+        int16x8_t v1 = vld1q_s16(&src[j + 8]);
+        int16x8_t v2 = vld1q_s16(&src[j + 16]);
+        int16x8_t v3 = vld1q_s16(&src[j + 24]);
+
+        if (SwapBytes) {
+          v0 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v0)));
+          v1 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v1)));
+          v2 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v2)));
+          v3 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v3)));
+        }
+
+        v0 = vshrq_n_s16(v0, Shift);
+        v1 = vshrq_n_s16(v1, Shift);
+        v2 = vshrq_n_s16(v2, Shift);
+        v3 = vshrq_n_s16(v3, Shift);
+
+        vst1q_s16(&dest[j + 0], v0);
+        vst1q_s16(&dest[j + 8], v1);
+        vst1q_s16(&dest[j + 16], v2);
+        vst1q_s16(&dest[j + 24], v3);
+      }
+
+      // Process remaining elements (8 at a time)
+      for (; j + 7 < total_ints; j += 8) {
+        int16x8_t v = vld1q_s16(&src[j]);
+        if (SwapBytes) {
+          v = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v)));
+        }
+        v = vshrq_n_s16(v, Shift);
+        vst1q_s16(&dest[j], v);
+      }
+#elif defined(__AVX512F__) && defined(__AVX512BW__)
+      bool src_aligned = ((((uintptr_t)src) & 0x3F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x3F) == 0);
+      simde__m512i mask_ff = simde_mm512_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 31 < total_ints; j += 32) {
+          simde__m512i v_in = simde_mm512_load_si512(reinterpret_cast<const simde__m512i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_in, 8), mask_ff);
+            simde__m512i high = simde_mm512_slli_epi16(v_in, 8);
+            v_in = simde_mm512_or_si512(low, high);
+          }
+          simde__m512i v_out = simde_mm512_srai_epi16(v_in, Shift);
+          simde_mm512_store_si512(reinterpret_cast<simde__m512i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 31 < total_ints; j += 32) {
+          simde__m512i v_in = simde_mm512_loadu_si512(reinterpret_cast<const simde__m512i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_in, 8), mask_ff);
+            simde__m512i high = simde_mm512_slli_epi16(v_in, 8);
+            v_in = simde_mm512_or_si512(low, high);
+          }
+          simde__m512i v_out = simde_mm512_srai_epi16(v_in, Shift);
+          simde_mm512_storeu_si512(reinterpret_cast<simde__m512i*>(&dest[j]), v_out);
+        }
+      }
+#elif defined(__AVX2__)
+      bool src_aligned = ((((uintptr_t)src) & 0x1F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x1F) == 0);
+      simde__m256i mask_ff = simde_mm256_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 15 < total_ints; j += 16) {
+          simde__m256i v_in = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_in, 8), mask_ff);
+            simde__m256i high = simde_mm256_slli_epi16(v_in, 8);
+            v_in = simde_mm256_or_si256(low, high);
+          }
+          simde__m256i v_out = simde_mm256_srai_epi16(v_in, Shift);
+          simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 15 < total_ints; j += 16) {
+          simde__m256i v_in = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_in, 8), mask_ff);
+            simde__m256i high = simde_mm256_slli_epi16(v_in, 8);
+            v_in = simde_mm256_or_si256(low, high);
+          }
+          simde__m256i v_out = simde_mm256_srai_epi16(v_in, Shift);
+          simde_mm256_storeu_si256(reinterpret_cast<simde__m256i*>(&dest[j]), v_out);
+        }
+      }
+#elif defined(__SSE2__)
+      bool src_aligned = ((((uintptr_t)src) & 0x0F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x0F) == 0);
+      simde__m128i mask_ff = simde_mm_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 7 < total_ints; j += 8) {
+          simde__m128i v_in = simde_mm_load_si128(reinterpret_cast<const simde__m128i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_in, 8), mask_ff);
+            simde__m128i high = simde_mm_slli_epi16(v_in, 8);
+            v_in = simde_mm_or_si128(low, high);
+          }
+          simde__m128i v_out = simde_mm_srai_epi16(v_in, Shift);
+          simde_mm_store_si128(reinterpret_cast<simde__m128i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 7 < total_ints; j += 8) {
+          simde__m128i v_in = simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(&src[j]));
+          if (SwapBytes) {
+            simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_in, 8), mask_ff);
+            simde__m128i high = simde_mm_slli_epi16(v_in, 8);
+            v_in = simde_mm_or_si128(low, high);
+          }
+          simde__m128i v_out = simde_mm_srai_epi16(v_in, Shift);
+          simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(&dest[j]), v_out);
+        }
+      }
+#endif
+      for (; j < total_ints; ++j) {
+        int16_t val = src[j];
+        if (SwapBytes) {
+          val = (val << 8) | ((val >> 8) & 0x00FF);
+        }
+        dest[j] = val >> Shift;
+      }
+    }
+  }
+};
+
+template <int Shift, bool SwapBytes>
+class sc16_oai_tx_converter : public uhd::convert::converter {
+ public:
+  virtual ~sc16_oai_tx_converter(void) override
+  {
+  }
+
+  void set_scalar(const double) override
+  {
+    // No-op
+  }
+
+  void operator()(const input_type& in, const output_type& out, const size_t num) override
+  {
+    for (size_t chan = 0; chan < in.size(); ++chan) {
+      const int16_t* src = reinterpret_cast<const int16_t*>(in[chan]);
+      int16_t* dest = reinterpret_cast<int16_t*>(out[chan]);
+
+      size_t total_ints = num * 2;
+      size_t j = 0;
+
+#if defined(__ARM_NEON)
+      // Process 32 elements (16 samples) at a time
+      for (; j + 31 < total_ints; j += 32) {
+        int16x8_t v0 = vld1q_s16(&src[j + 0]);
+        int16x8_t v1 = vld1q_s16(&src[j + 8]);
+        int16x8_t v2 = vld1q_s16(&src[j + 16]);
+        int16x8_t v3 = vld1q_s16(&src[j + 24]);
+
+        v0 = vshlq_n_s16(v0, Shift);
+        v1 = vshlq_n_s16(v1, Shift);
+        v2 = vshlq_n_s16(v2, Shift);
+        v3 = vshlq_n_s16(v3, Shift);
+
+        if (SwapBytes) {
+          v0 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v0)));
+          v1 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v1)));
+          v2 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v2)));
+          v3 = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v3)));
+        }
+
+        vst1q_s16(&dest[j + 0], v0);
+        vst1q_s16(&dest[j + 8], v1);
+        vst1q_s16(&dest[j + 16], v2);
+        vst1q_s16(&dest[j + 24], v3);
+      }
+
+      // Process remaining elements (8 at a time)
+      for (; j + 7 < total_ints; j += 8) {
+        int16x8_t v = vld1q_s16(&src[j]);
+        v = vshlq_n_s16(v, Shift);
+        if (SwapBytes) {
+          v = vreinterpretq_s16_u8(vrev16q_u8(vreinterpretq_u8_s16(v)));
+        }
+        vst1q_s16(&dest[j], v);
+      }
+#elif defined(__AVX512F__) && defined(__AVX512BW__)
+      bool src_aligned = ((((uintptr_t)src) & 0x3F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x3F) == 0);
+      simde__m512i mask_ff = simde_mm512_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 31 < total_ints; j += 32) {
+          simde__m512i v_in = simde_mm512_load_si512(reinterpret_cast<const simde__m512i*>(&src[j]));
+          simde__m512i v_out = simde_mm512_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_out, 8), mask_ff);
+            simde__m512i high = simde_mm512_slli_epi16(v_out, 8);
+            v_out = simde_mm512_or_si512(low, high);
+          }
+          simde_mm512_store_si512(reinterpret_cast<simde__m512i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 31 < total_ints; j += 32) {
+          simde__m512i v_in = simde_mm512_loadu_si512(reinterpret_cast<const simde__m512i*>(&src[j]));
+          simde__m512i v_out = simde_mm512_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m512i low = simde_mm512_and_si512(simde_mm512_srli_epi16(v_out, 8), mask_ff);
+            simde__m512i high = simde_mm512_slli_epi16(v_out, 8);
+            v_out = simde_mm512_or_si512(low, high);
+          }
+          simde_mm512_storeu_si512(reinterpret_cast<simde__m512i*>(&dest[j]), v_out);
+        }
+      }
+#elif defined(__AVX2__)
+      bool src_aligned = ((((uintptr_t)src) & 0x1F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x1F) == 0);
+      simde__m256i mask_ff = simde_mm256_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 15 < total_ints; j += 16) {
+          simde__m256i v_in = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(&src[j]));
+          simde__m256i v_out = simde_mm256_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_out, 8), mask_ff);
+            simde__m256i high = simde_mm256_slli_epi16(v_out, 8);
+            v_out = simde_mm256_or_si256(low, high);
+          }
+          simde_mm256_store_si256(reinterpret_cast<simde__m256i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 15 < total_ints; j += 16) {
+          simde__m256i v_in = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(&src[j]));
+          simde__m256i v_out = simde_mm256_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m256i low = simde_mm256_and_si256(simde_mm256_srli_epi16(v_out, 8), mask_ff);
+            simde__m256i high = simde_mm256_slli_epi16(v_out, 8);
+            v_out = simde_mm256_or_si256(low, high);
+          }
+          simde_mm256_storeu_si256(reinterpret_cast<simde__m256i*>(&dest[j]), v_out);
+        }
+      }
+#elif defined(__SSE2__)
+      bool src_aligned = ((((uintptr_t)src) & 0x0F) == 0);
+      bool dest_aligned = ((((uintptr_t)dest) & 0x0F) == 0);
+      simde__m128i mask_ff = simde_mm_set1_epi16(0x00FF);
+
+      if (src_aligned && dest_aligned) {
+        for (; j + 7 < total_ints; j += 8) {
+          simde__m128i v_in = simde_mm_load_si128(reinterpret_cast<const simde__m128i*>(&src[j]));
+          simde__m128i v_out = simde_mm_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_out, 8), mask_ff);
+            simde__m128i high = simde_mm_slli_epi16(v_out, 8);
+            v_out = simde_mm_or_si128(low, high);
+          }
+          simde_mm_store_si128(reinterpret_cast<simde__m128i*>(&dest[j]), v_out);
+        }
+      } else {
+        for (; j + 7 < total_ints; j += 8) {
+          simde__m128i v_in = simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(&src[j]));
+          simde__m128i v_out = simde_mm_slli_epi16(v_in, Shift);
+          if (SwapBytes) {
+            simde__m128i low = simde_mm_and_si128(simde_mm_srli_epi16(v_out, 8), mask_ff);
+            simde__m128i high = simde_mm_slli_epi16(v_out, 8);
+            v_out = simde_mm_or_si128(low, high);
+          }
+          simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(&dest[j]), v_out);
+        }
+      }
+#endif
+      for (; j < total_ints; ++j) {
+        int16_t val = src[j] << Shift;
+        if (SwapBytes) {
+          val = (val << 8) | ((val >> 8) & 0x00FF);
+        }
+        dest[j] = val;
+      }
+    }
+  }
+};
+
+void register_oai_converters(int rxshift)
+{
+  uhd::convert::register_bytes_per_item(CPU_FORMAT_OAI, sizeof(int16_t) * 2);
+
+  constexpr bool swap_le = (false != HOST_IS_BIG_ENDIAN);
+  constexpr bool swap_be = (true != HOST_IS_BIG_ENDIAN);
+
+  for (size_t num_chans = 1; num_chans <= 4; ++num_chans) {
+    // 1. LE wire format
+    {
+      uhd::convert::id_type rx_id;
+      rx_id.input_format = "sc16_item32_le";
+      rx_id.num_inputs = num_chans;
+      rx_id.output_format = CPU_FORMAT_OAI;
+      rx_id.num_outputs = num_chans;
+
+      switch (rxshift) {
+        case 2:
+          uhd::convert::register_converter(
+              rx_id,
+              []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<2, swap_le>()); },
+              100);
+          break;
+        case 4:
+          uhd::convert::register_converter(
+              rx_id,
+              []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<4, swap_le>()); },
+              100);
+          break;
+        default:
+          break;
+      }
+
+      uhd::convert::id_type tx_id;
+      tx_id.input_format = CPU_FORMAT_OAI;
+      tx_id.num_inputs = num_chans;
+      tx_id.output_format = rx_id.input_format;
+      tx_id.num_outputs = num_chans;
+
+      uhd::convert::register_converter(
+          tx_id,
+          []() { return uhd::convert::converter::sptr(new sc16_oai_tx_converter<4, swap_le>()); },
+          100);
+    }
+
+    // 2. BE wire format
+    {
+      uhd::convert::id_type rx_id;
+      rx_id.input_format = "sc16_item32_be";
+      rx_id.num_inputs = num_chans;
+      rx_id.output_format = CPU_FORMAT_OAI;
+      rx_id.num_outputs = num_chans;
+
+      switch (rxshift) {
+        case 2:
+          uhd::convert::register_converter(
+              rx_id,
+              []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<2, swap_be>()); },
+              100);
+          break;
+        case 4:
+          uhd::convert::register_converter(
+              rx_id,
+              []() { return uhd::convert::converter::sptr(new sc16_oai_rx_converter<4, swap_be>()); },
+              100);
+          break;
+        default:
+          break;
+      }
+
+      uhd::convert::id_type tx_id;
+      tx_id.input_format = CPU_FORMAT_OAI;
+      tx_id.num_inputs = num_chans;
+      tx_id.output_format = rx_id.input_format;
+      tx_id.num_outputs = num_chans;
+
+      uhd::convert::register_converter(
+          tx_id,
+          []() { return uhd::convert::converter::sptr(new sc16_oai_tx_converter<4, swap_be>()); },
+          100);
+    }
+  }
+}
diff --git a/radio/USRP/usrp_converters.hpp b/radio/USRP/usrp_converters.hpp
new file mode 100644
index 0000000000..144387af24
--- /dev/null
+++ b/radio/USRP/usrp_converters.hpp
@@ -0,0 +1,10 @@
+/*
+ * SPDX-License-Identifier: LicenseRef-CSSL-1.0
+ */
+
+#ifndef USRP_CONVERTERS_HPP
+#define USRP_CONVERTERS_HPP
+
+void register_oai_converters(int rxshift);
+
+#endif // USRP_CONVERTERS_HPP
diff --git a/radio/USRP/usrp_lib.cpp b/radio/USRP/usrp_lib.cpp
index 5b8b34c9ed..f37955616c 100644
--- a/radio/USRP/usrp_lib.cpp
+++ b/radio/USRP/usrp_lib.cpp
@@ -14,6 +14,7 @@
 #else
   #include <uhd/utils/thread.hpp>
 #endif
+#include <uhd/convert.hpp>
 #include <uhd/usrp/multi_usrp.hpp>
 #include <uhd/version.hpp>
 #include <boost/lexical_cast.hpp>
@@ -35,6 +36,8 @@
 #include <sys/resource.h>
 
 #include "openair1/PHY/sse_intrin.h"
+#include "usrp_converters.hpp"
+
 
 /** @addtogroup _USRP_PHY_RF_INTERFACE_
  * @{
@@ -415,7 +418,6 @@ static int trx_usrp_write(openair0_device_t *device,
   int ret=0;
   usrp_state_t *s = (usrp_state_t *)device->priv;
   timestamp -= device->openair0_cfg->command_line_sample_advance + device->openair0_cfg->tx_sample_advance;
-  int nsamps2;  // aligned to upper 32 or 16 byte boundary
 
   radio_tx_burst_flag_t flags_burst = (radio_tx_burst_flag_t) (flags & 0xf);
   radio_tx_gpio_flag_t flags_gpio = (radio_tx_gpio_flag_t) ((flags >> 4) & 0x1fff);
@@ -458,22 +460,6 @@ static int trx_usrp_write(openair0_device_t *device,
     }
 
     if (usrp_tx_thread == 0) {
-      nsamps2 = (nsamps+7)>>3;
-      simde__m256i buff_tx[cc < 2 ? 2 : cc][nsamps2];
-
-      // bring TX data into 16 MSBs, assuming it is on the 12 LSB  after OAI computation
-      const int shift = 4;
-      for (int i = 0; i < cc; i++) {
-        for (int j = 0; j < nsamps2; j++) {
-          if ((((uintptr_t)buff[i]) & 0x1F) == 0) {
-            buff_tx[i][j] = simde_mm256_slli_epi16(((simde__m256i *)buff[i])[j], shift);
-          } else {
-            simde__m256i tmp = simde_mm256_loadu_si256(((simde__m256i *)buff[i]) + j);
-            buff_tx[i][j] = simde_mm256_slli_epi16(tmp, shift);
-          }
-        }
-      }
-
       s->tx_md.has_time_spec = true;
       s->tx_md.start_of_burst = (s->tx_count == 0) ? true : first_packet_state;
       s->tx_md.end_of_burst = last_packet_state;
@@ -494,11 +480,11 @@ static int trx_usrp_write(openair0_device_t *device,
         std::vector<void *> buff_ptrs;
 
         for (int i = 0; i < cc; i++)
-          buff_ptrs.push_back(&(((int16_t *)buff_tx[i])[0]));
+          buff_ptrs.push_back(buff[i]);
 
         ret = (int)s->tx_stream->send(buff_ptrs, nsamps, s->tx_md);
       } else {
-        ret = (int)s->tx_stream->send(&(((int16_t *)buff_tx[0])[0]), nsamps, s->tx_md);
+        ret = (int)s->tx_stream->send(buff[0], nsamps, s->tx_md);
       }
 
       if (ret != nsamps) {
@@ -553,7 +539,6 @@ void *trx_usrp_write_thread(void * arg)
   openair0_write_package_t *write_package = write_thread->write_package;
 
   usrp_state_t *s;
-  int nsamps2;  // aligned to upper 32 or 16 byte boundary
   int start;
   openair0_timestamp_t timestamp;
   void        **buff;
@@ -588,23 +573,6 @@ void *trx_usrp_write_thread(void * arg)
       LOG_W(HW,"count write = %d, start = %d, end = %d\n", write_thread->count_write, write_thread->start, write_thread->end);
     }*/
 
-        nsamps2 = (nsamps+7)>>3;
-        simde__m256i buff_tx[cc < 2 ? 2 : cc][nsamps2];
-        // bring TX data into 16 MSBs, assuming it is on the 12 LSB  after OAI computation
-        const int shift = 4;
-        for (int i = 0; i < cc; i++) {
-          for (int j = 0; j < nsamps2; j++) {
-            if ((((uintptr_t) buff[i])&0x1F)==0) {
-              buff_tx[i][j] = simde_mm256_slli_epi16(((simde__m256i *)buff[i])[j], shift);
-            }
-            else
-            {
-              simde__m256i tmp = simde_mm256_loadu_si256(((simde__m256i *)buff[i]) + j);
-              buff_tx[i][j] = simde_mm256_slli_epi16(tmp, shift);
-            }
-          }
-        }
-
     s->tx_md.has_time_spec  = true;
     s->tx_md.start_of_burst = (s->tx_count==0) ? true : first_packet;
     s->tx_md.end_of_burst   = last_packet;
@@ -624,15 +592,15 @@ void *trx_usrp_write_thread(void * arg)
       std::vector<void *> buff_ptrs;
 
       for (int i=0; i<cc; i++)
-        buff_ptrs.push_back(&(((int16_t *)buff_tx[i])[0]));
+        buff_ptrs.push_back(buff[i]);
 
       ret = (int)s->tx_stream->send(buff_ptrs, nsamps, s->tx_md);
     }
     else {
-      ret = (int)s->tx_stream->send(&(((int16_t *)buff_tx[0])[0]), nsamps, s->tx_md);
+      ret = (int)s->tx_stream->send(buff[0], nsamps, s->tx_md);
     }
 
-    T(T_USRP_TX_ANT0, T_INT(timestamp), T_BUFFER(buff_tx[0], nsamps*4));
+    T(T_USRP_TX_ANT0, T_INT(timestamp), T_BUFFER(buff[0], nsamps*4));
 
     if (ret != nsamps) LOG_E(HW,"[xmit] tx samples %d != %d\n",ret,nsamps);
     VCD_SIGNAL_DUMPER_DUMP_VARIABLE_BY_NAME( VCD_SIGNAL_DUMPER_VARIABLES_USRP_SEND_RETURN, ret );
@@ -693,23 +661,7 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes
 {
   usrp_state_t *s = (usrp_state_t *)device->priv;
   int samples_received=0;
-  int nsamps2; // aligned to upper 32 or 16 byte boundary
-  nsamps2 = (nsamps+7)>>3;
-  simde__m256i buff_tmp[cc < 2 ? 2 : cc][nsamps2];
   static int read_count = 0;
-  int rxshift;
-  switch (device->type) {
-     case USRP_B200_DEV:
-        rxshift=4;
-        break;
-     case USRP_X300_DEV:
-     case USRP_N300_DEV:
-     case USRP_X400_DEV:
-        rxshift=2;
-        break;
-     default:
-       AssertFatal(1==0,"Shouldn't be here\n");
-  }
 
   samples_received=0;
   while (samples_received != nsamps) {
@@ -718,12 +670,12 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes
       // receive multiple channels (e.g. RF A and RF B)
       std::vector<void *> buff_ptrs;
 
-      for (int i=0; i<cc; i++) buff_ptrs.push_back(buff_tmp[i]+samples_received);
+      for (int i=0; i<cc; i++) buff_ptrs.push_back((void*)((int32_t*)buff[i]+samples_received));
       samples_received += s->rx_stream->recv(buff_ptrs, nsamps-samples_received, s->rx_md);
     } else {
       // receive a single channel (e.g. from connector RF A)
 
-      samples_received += s->rx_stream->recv((void*)((int32_t*)buff_tmp[0]+samples_received),
+      samples_received += s->rx_stream->recv((void*)((int32_t*)buff[0]+samples_received),
                                              nsamps-samples_received, s->rx_md);
     }
     if  ((s->wait_for_first_pps == 0) && (s->rx_md.error_code!=uhd::rx_metadata_t::ERROR_CODE_NONE))
@@ -735,22 +687,6 @@ static int trx_usrp_read(openair0_device_t *device, openair0_timestamp_t *ptimes
   }
   if (samples_received == nsamps) s->wait_for_first_pps=0;
 
-  // bring RX data into 12 LSBs for softmodem RX
-  for (int i=0; i<cc; i++) {
-    for (int j = 0; j < nsamps2; j++) {
-      // bring RX data into 12 LSBs for softmodem RX,
-      // this keeps the significant bits of B210 and may loose better ADC results,
-      // but it makes free bits in MSB for signal processing on int16
-      if ((((uintptr_t) buff[i])&0x1F)==0) {
-        ((simde__m256i *)buff[i])[j] = simde_mm256_srai_epi16(buff_tmp[i][j], rxshift);
-      } else {
-        // FK: in some cases the buffer might not be 32 byte aligned, so we cannot use avx2
-        simde__m256i tmp = simde_mm256_srai_epi16(buff_tmp[i][j], rxshift);
-        simde_mm256_storeu_si256(((simde__m256i *)buff[i]) + j, tmp);
-      }
-    }
-  }
-
   if (samples_received < nsamps) {
     LOG_E(HW,"[recv] received %d samples out of %d\n",samples_received,nsamps);
   }
@@ -1458,8 +1394,24 @@ extern "C" {
   LOG_I(HW,"Actual clock source %s...\n",s->usrp->get_clock_source(0).c_str());
   LOG_I(HW,"Actual time source %s...\n",s->usrp->get_time_source(0).c_str());
 
+  // register custom OAI converters
+  int rxshift;
+  switch (device->type) {
+    case USRP_B200_DEV:
+      rxshift = 4;
+      break;
+    case USRP_X300_DEV:
+    case USRP_N300_DEV:
+    case USRP_X400_DEV:
+      rxshift = 2;
+      break;
+    default:
+      AssertFatal(1 == 0, "Shouldn't be here\n");
+  }
+  register_oai_converters(rxshift);
+
   // create tx & rx streamer
-  uhd::stream_args_t stream_args_rx("sc16", "sc16");
+  uhd::stream_args_t stream_args_rx("sc16_oai", "sc16");
   for (int i = 0; i<openair0_cfg[0].rx_num_channels; i++) {
     LOG_I(HW,"setting rx channel %d\n",i+choffset);
     stream_args_rx.channels.push_back(i+choffset);
@@ -1478,7 +1430,7 @@ extern "C" {
   LOG_I(HW,"rx_max_num_samps %zu\n",
         s->rx_stream->get_max_num_samps());
 
-  uhd::stream_args_t stream_args_tx("sc16", "sc16");
+  uhd::stream_args_t stream_args_tx("sc16_oai", "sc16");
 
   for (int i = 0; i<openair0_cfg[0].tx_num_channels; i++)
     stream_args_tx.channels.push_back(i+choffset);