From 9f4b19c0a85bd6747c5ed1f0d9b073b5f5e82b17 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Thu, 2 Oct 2025 07:38:11 -0700 Subject: [PATCH 01/14] libfabric: add SYNAPSEAI support --- meson.build | 32 +++++++++++++++++++ src/plugins/libfabric/libfabric_backend.cpp | 24 ++++++++++---- src/plugins/libfabric/meson.build | 6 ++++ src/utils/libfabric/libfabric_rail.cpp | 24 +++++++------- src/utils/libfabric/libfabric_rail.h | 5 +++ .../libfabric/libfabric_rail_manager.cpp | 4 +-- src/utils/libfabric/libfabric_topology.cpp | 30 +++++++++++++++-- src/utils/libfabric/libfabric_topology.h | 2 ++ src/utils/libfabric/meson.build | 6 ++++ 9 files changed, 111 insertions(+), 22 deletions(-) diff --git a/meson.build b/meson.build index 8a7be09c91..6eebe4e921 100644 --- a/meson.build +++ b/meson.build @@ -104,6 +104,38 @@ else warning('CUDA not found. UCX backend will be built without CUDA support, and some plugins will be disabled.') endif +# SynapseAI (Habana Gaudi) dependency detection +# Try to find both libSynapse and hl-thunk libraries +synapse_lib = cpp.find_library('Synapse', + dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'], + required: false) +hlthunk_lib = cpp.find_library('hl-thunk', + dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'], + required: false) + +# SynapseAI support requires both libraries +synapseai_dep = dependency('', required: false) # Initialize as not found +if synapse_lib.found() and hlthunk_lib.found() + synapseai_dep = declare_dependency(dependencies: [synapse_lib, hlthunk_lib]) +elif hlthunk_lib.found() + # Fallback to just hl-thunk if libSynapse not available + synapseai_dep = hlthunk_lib +endif + +if synapseai_dep.found() + # Create proper dependency with include paths (including DRM path for habanalabs headers) + synapseai_dep = declare_dependency( + dependencies: synapseai_dep, + include_directories: [ + include_directories('/usr/include/habanalabs'), + include_directories('/usr/include/drm') + ] + ) + message('Found 
SynapseAI support for Habana Gaudi devices') +else + warning('SynapseAI not found. Habana Gaudi device support will be disabled.') +endif + # DOCA doca_gpunetio_dep = dependency('doca-gpunetio', required : false) diff --git a/src/plugins/libfabric/libfabric_backend.cpp b/src/plugins/libfabric/libfabric_backend.cpp index 1f9ff20e30..e650a45927 100644 --- a/src/plugins/libfabric/libfabric_backend.cpp +++ b/src/plugins/libfabric/libfabric_backend.cpp @@ -721,7 +721,7 @@ nixl_mem_list_t nixlLibfabricEngine::getSupportedMems() const { nixl_mem_list_t mems; mems.push_back(DRAM_SEG); -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) || defined(HAVE_SYNAPSEAI) mems.push_back(VRAM_SEG); #endif return mems; @@ -737,9 +737,10 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem, priv->length_ = mem.len; priv->gpu_device_id_ = mem.devId; // Store GPU device ID -#ifdef HAVE_CUDA - // Handle CUDA memory registration with GPU Direct RDMA support if (nixl_mem == VRAM_SEG) { +#ifdef HAVE_CUDA + // Handle CUDA memory registration with GPU Direct RDMA support + // For multi-GPU support, skip CUDA address workaround if (cuda_addr_wa_) { bool need_restart; @@ -763,19 +764,28 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem, } NIXL_DEBUG << "Set CUDA device context to GPU " << mem.devId; } - } #endif +#ifdef HAVE_SYNAPSEAI + // Handle SynapseAI memory registration + NIXL_DEBUG << "Registering SynapseAI device memory for device " << mem.devId; + // SynapseAI-specific setup would go here if needed +#endif + } + // Initialize vectors to accommodate all possible rails (for indexing consistency) priv->rail_mr_list_.resize(rail_manager.getNumDataRails(), nullptr); priv->rail_key_list_.resize(rail_manager.getNumDataRails(), 0); -#ifdef HAVE_CUDA - // Set CUDA context before libfabric operations for VRAM if (nixl_mem == VRAM_SEG) { +#ifdef HAVE_CUDA + // Set CUDA context before libfabric operations for VRAM vramApplyCtx(); - } #endif +#ifdef HAVE_SYNAPSEAI + // SynapseAI context 
application would go here if needed +#endif + } // Use Rail Manager for centralized memory registration with GPU Direct RDMA support NIXL_TRACE << "Registering memory: addr=" << (void *)mem.addr << " len=" << mem.len diff --git a/src/plugins/libfabric/meson.build b/src/plugins/libfabric/meson.build index c48d13806b..8f66f066e9 100644 --- a/src/plugins/libfabric/meson.build +++ b/src/plugins/libfabric/meson.build @@ -33,6 +33,12 @@ if cuda_dep.found() compile_flags += ['-DHAVE_CUDA'] endif +# Add SynapseAI support if available (dependency is globally defined) +if synapseai_dep.found() + libfabric_plugin_deps += [synapseai_dep] + compile_flags += ['-DHAVE_SYNAPSEAI'] +endif + # Build as static or shared library based on configuration if 'LIBFABRIC' in static_plugins libfabric_backend_lib = static_library( diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index 97b3ebf2aa..7b3d15bf67 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -512,17 +512,19 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, } // Disable shared memory transfers for EFA provider to fix same-agent transfers - bool optval = false; - ret = fi_setopt(&endpoint->fid, - FI_OPT_ENDPOINT, - FI_OPT_SHARED_MEMORY_PERMITTED, - &optval, - sizeof(optval)); - if (ret && ret != -FI_ENOSYS) { - NIXL_WARN << "fi_setopt FI_OPT_SHARED_MEMORY_PERMITTED failed for rail " << rail_id - << ": " << fi_strerror(-ret) << " - continuing anyway"; - } else if (ret == 0) { - NIXL_DEBUG << "Successfully disabled shared memory transfers for rail " << rail_id; + if (provider_name.find("efa") == 0) { + bool optval = false; + ret = fi_setopt(&endpoint->fid, + FI_OPT_ENDPOINT, + FI_OPT_SHARED_MEMORY_PERMITTED, + &optval, + sizeof(optval)); + if (ret && ret != -FI_ENOSYS) { + NIXL_WARN << "fi_setopt FI_OPT_SHARED_MEMORY_PERMITTED failed for rail " << rail_id + << ": " << fi_strerror(-ret) << " - continuing anyway"; + } else 
if (ret == 0) { + NIXL_DEBUG << "Successfully disabled shared memory transfers for rail " << rail_id; + } } // Enable endpoint for this rail diff --git a/src/utils/libfabric/libfabric_rail.h b/src/utils/libfabric/libfabric_rail.h index 7e5fdfdd9b..7fe7f56906 100644 --- a/src/utils/libfabric/libfabric_rail.h +++ b/src/utils/libfabric/libfabric_rail.h @@ -30,6 +30,11 @@ #include "backend/backend_aux.h" #include "libfabric/libfabric_common.h" +#ifdef HAVE_SYNAPSEAI +#include +#include +#endif + // Forward declarations class nixlLibfabricConnection; diff --git a/src/utils/libfabric/libfabric_rail_manager.cpp b/src/utils/libfabric/libfabric_rail_manager.cpp index bb26e4b80f..dcf17c15c8 100644 --- a/src/utils/libfabric/libfabric_rail_manager.cpp +++ b/src/utils/libfabric/libfabric_rail_manager.cpp @@ -314,7 +314,7 @@ nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr, nixl_mem_t mem_type, int gpu_id) const { if (mem_type == VRAM_SEG) { -#ifdef HAVE_CUDA +#if defined(HAVE_CUDA) || defined(HAVE_SYNAPSEAI) if (gpu_id < 0) { NIXL_ERROR << "Invalid GPU ID " << gpu_id << " for VRAM memory " << mem_addr; return {}; // Return empty vector to indicate failure @@ -354,7 +354,7 @@ nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr, << gpu_rails.size() << " rails total"; return gpu_rails; #else - NIXL_ERROR << "VRAM memory type not supported without CUDA"; + NIXL_ERROR << "VRAM memory type not supported without CUDA/SYNAPSEAI"; return {}; #endif } diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 19c13c0865..6fd66167f9 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -309,14 +309,25 @@ nixlLibfabricTopology::discoverGpusWithHwloc() { NIXL_TRACE << "Found NVIDIA GPU " << num_gpus << ": " << pcie_addr << " (vendor=0x" << std::hex << vendor_id << ", device=0x" << device_id << ", class=0x" << class_id << std::dec << ")"; + num_gpus++; + } else if 
(isIntelHpu(pci_obj)) { + std::string pcie_addr = getPcieAddressFromHwlocObj(pci_obj); + // Get device and vendor info + uint16_t vendor_id = pci_obj->attr->pcidev.vendor_id; + uint16_t device_id = pci_obj->attr->pcidev.device_id; + uint16_t class_id = pci_obj->attr->pcidev.class_id; + NIXL_TRACE << "Found Intel Habana GPU " << num_gpus << ": " << pcie_addr << " (vendor=0x" + << std::hex << vendor_id << ", device=0x" << device_id << ", class=0x" + << class_id << std::dec << ")"; num_gpus++; } } - NIXL_TRACE << "Discovered " << num_gpus << " NVIDIA GPUs via hwloc"; + NIXL_TRACE << "Discovered " << num_gpus << " GPUs via hwloc"; // If we found more than 8 GPUs on P5en, investigate further + // FIXME: add Habana related messages if (num_gpus > 8) { NIXL_WARN << "Found " << num_gpus << " NVIDIA GPUs, but P5en should have 8. Investigating..."; @@ -481,7 +492,7 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { // Step 2: Discover GPUs hwloc_obj_t pci_obj = nullptr; while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { - if (isNvidiaGpu(pci_obj)) { + if (isNvidiaGpu(pci_obj) || isIntelHpu(pci_obj)) { GpuInfo gpu; gpu.hwloc_node = pci_obj; gpu.domain_id = pci_obj->attr->pcidev.domain; @@ -570,6 +581,21 @@ nixlLibfabricTopology::getPcieAddressFromHwlocObj(hwloc_obj_t obj) const { return std::string(pcie_addr); } +bool +nixlLibfabricTopology::isIntelHpu(hwloc_obj_t obj) const { + if (!obj || obj->type != HWLOC_OBJ_PCI_DEVICE) { + return false; + } + // Intel Habana vendor ID is 0x1da3 + if (obj->attr->pcidev.vendor_id != 0x1da3) { + return false; + } + // Gaudi devices use class 0x1200 (Processing Accelerators) + // Accept this class specifically for Habana devices + uint16_t class_id = obj->attr->pcidev.class_id; + return (class_id == 0x1200); +} + bool nixlLibfabricTopology::isNvidiaGpu(hwloc_obj_t obj) const { if (!obj || obj->type != HWLOC_OBJ_PCI_DEVICE) { diff --git a/src/utils/libfabric/libfabric_topology.h 
b/src/utils/libfabric/libfabric_topology.h index f85bc74e9a..93ec863c7c 100644 --- a/src/utils/libfabric/libfabric_topology.h +++ b/src/utils/libfabric/libfabric_topology.h @@ -117,6 +117,8 @@ class nixlLibfabricTopology { std::string getPcieAddressFromHwlocObj(hwloc_obj_t obj) const; bool + isIntelHpu(hwloc_obj_t obj) const; + bool isNvidiaGpu(hwloc_obj_t obj) const; bool isEfaDevice(hwloc_obj_t obj) const; diff --git a/src/utils/libfabric/meson.build b/src/utils/libfabric/meson.build index 39fa98bca3..18e62293ca 100644 --- a/src/utils/libfabric/meson.build +++ b/src/utils/libfabric/meson.build @@ -49,6 +49,12 @@ if cuda_dep.found() libfabric_utils_cpp_args += ['-DHAVE_CUDA'] endif +# Add SynapseAI support if available (dependency is globally defined) +if synapseai_dep.found() + libfabric_utils_deps += [synapseai_dep] + libfabric_utils_cpp_args += ['-DHAVE_SYNAPSEAI'] +endif + # Create static library libfabric_utils_lib = static_library( 'nixl_libfabric_utils', From 14149d11b5392fd4dff6b7dd602d985e0af3e3c4 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Thu, 2 Oct 2025 14:39:24 -0700 Subject: [PATCH 02/14] libfabric: Genericize discovery. Add verbs support. 
--- src/utils/libfabric/libfabric_common.cpp | 2 + src/utils/libfabric/libfabric_rail.cpp | 2 +- .../libfabric/libfabric_rail_manager.cpp | 42 ++--- src/utils/libfabric/libfabric_rail_manager.h | 18 +- src/utils/libfabric/libfabric_topology.cpp | 161 ++++++++++++------ src/utils/libfabric/libfabric_topology.h | 26 +-- .../libfabric/libfabric_topology_test.cpp | 4 +- 7 files changed, 156 insertions(+), 99 deletions(-) diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index ec8f85cc14..39017b2de9 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ b/src/utils/libfabric/libfabric_common.cpp @@ -87,6 +87,8 @@ getAvailableNetworkDevices() { if (provider_device_map.find("efa") != provider_device_map.end()) { return {"efa", provider_device_map["efa"]}; + } else if (provider_device_map.find("verbs") != provider_device_map.end()) { + return {"verbs", provider_device_map["verbs"]}; } else if (provider_device_map.find("sockets") != provider_device_map.end()) { return {"sockets", {provider_device_map["sockets"][0]}}; } diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index 7b3d15bf67..bf15c69c6b 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -421,7 +421,7 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ALLOCATED; hints->domain_attr->mr_key_size = 0; // Let provider decide } else { - // EFA and other providers support advanced memory registration + // EFA, verbs and other providers support advanced memory registration hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; hints->domain_attr->mr_key_size = 2; diff --git a/src/utils/libfabric/libfabric_rail_manager.cpp b/src/utils/libfabric/libfabric_rail_manager.cpp index dcf17c15c8..e2b0287520 100644 --- a/src/utils/libfabric/libfabric_rail_manager.cpp +++ 
b/src/utils/libfabric/libfabric_rail_manager.cpp @@ -73,14 +73,14 @@ nixlLibfabricRailManager::~nixlLibfabricRailManager() { } nixl_status_t -nixlLibfabricRailManager::createDataRails(const std::vector &efa_devices, +nixlLibfabricRailManager::createDataRails(const std::vector &fabric_devices, const std::string &provider_name) { - num_data_rails_ = efa_devices.size(); + num_data_rails_ = fabric_devices.size(); // Pre-allocate to ensure contiguous memory allocation data_rails_.reserve(num_data_rails_); - // Build EFA device to rail index mapping for O(1) lookup - efa_device_to_rail_map.reserve(num_data_rails_); + // Build fabric device to rail index mapping for O(1) lookup + device_to_rail_map.reserve(num_data_rails_); try { data_rails_.clear(); @@ -88,12 +88,12 @@ nixlLibfabricRailManager::createDataRails(const std::vector &efa_de for (size_t i = 0; i < num_data_rails_; ++i) { data_rails_.emplace_back(std::make_unique( - efa_devices[i], provider_name, static_cast(i))); + fabric_devices[i], provider_name, static_cast(i))); - // Initialize EFA device mapping - efa_device_to_rail_map[efa_devices[i]] = i; + // Initialize fabric device mapping + device_to_rail_map[fabric_devices[i]] = i; - NIXL_DEBUG << "Created data rail " << i << " (device: " << efa_devices[i] + NIXL_DEBUG << "Created data rail " << i << " (device: " << fabric_devices[i] << ", provider: " << provider_name << ")"; } } @@ -105,7 +105,7 @@ nixlLibfabricRailManager::createDataRails(const std::vector &efa_de } nixl_status_t -nixlLibfabricRailManager::createControlRails(const std::vector &efa_devices, +nixlLibfabricRailManager::createControlRails(const std::vector &fabric_devices, const std::string &provider_name, size_t num_control_rails) { // Pre-allocate to ensure contiguous memory allocation @@ -118,8 +118,8 @@ nixlLibfabricRailManager::createControlRails(const std::vector &efa for (size_t i = 0; i < num_control_rails_; ++i) { control_rails_.emplace_back(std::make_unique( - efa_devices[i], provider_name, 
static_cast(i))); - NIXL_DEBUG << "Created control rail " << i << " (device: " << efa_devices[i] + fabric_devices[i], provider_name, static_cast(i))); + NIXL_DEBUG << "Created control rail " << i << " (device: " << fabric_devices[i] << ", provider: " << provider_name << ")"; } } @@ -319,34 +319,34 @@ nixlLibfabricRailManager::selectRailsForMemory(void *mem_addr, NIXL_ERROR << "Invalid GPU ID " << gpu_id << " for VRAM memory " << mem_addr; return {}; // Return empty vector to indicate failure } - std::vector gpu_efa_devices = topology->getEfaDevicesForGpu(gpu_id); - if (gpu_efa_devices.empty()) { - NIXL_ERROR << "No EFA devices found for GPU " << gpu_id; + std::vector gpu_nics = topology->getNicsForGpu(gpu_id); + if (gpu_nics.empty()) { + NIXL_ERROR << "No NICs found for GPU " << gpu_id; return {}; // Return empty vector to indicate failure } std::vector gpu_rails; - for (const std::string &efa_device : gpu_efa_devices) { - auto it = efa_device_to_rail_map.find(efa_device); - if (it != efa_device_to_rail_map.end()) { + for (const std::string &device_name : gpu_nics) { + auto it = device_to_rail_map.find(device_name); + if (it != device_to_rail_map.end()) { // Bounds check: ensure rail index is valid if (it->second < data_rails_.size()) { gpu_rails.push_back(it->second); NIXL_DEBUG << "VRAM memory " << mem_addr << " on GPU " << gpu_id - << " mapped to rail " << it->second << " (EFA device: " << efa_device + << " mapped to rail " << it->second << " (fabric device: " << device_name << ")"; } else { - NIXL_WARN << "EFA device " << efa_device << " maps to rail " << it->second + NIXL_WARN << "Fabric device " << device_name << " maps to rail " << it->second << " but only " << data_rails_.size() << " rails available"; } } else { - NIXL_WARN << "EFA device " << efa_device << " not found in rail mapping for GPU " + NIXL_WARN << "Fabric device " << device_name << " not found in rail mapping for GPU " << gpu_id; } } if (gpu_rails.empty()) { NIXL_ERROR << "No valid rail mapping 
found for GPU " << gpu_id << " (checked " - << gpu_efa_devices.size() << " EFA devices)"; + << gpu_nics.size() << " NICs)"; return {}; } diff --git a/src/utils/libfabric/libfabric_rail_manager.h b/src/utils/libfabric/libfabric_rail_manager.h index 5e93645ccc..25fe21d46a 100644 --- a/src/utils/libfabric/libfabric_rail_manager.h +++ b/src/utils/libfabric/libfabric_rail_manager.h @@ -48,22 +48,22 @@ class nixlLibfabricRailManager { ~nixlLibfabricRailManager(); // Rail management - /** Create data rails for high-bandwidth transfers (one per EFA device) - * @param efa_devices List of EFA device names to create rails on - * @param provider_name Provider name ("efa" or "efa-direct") + /** Create data rails for high-bandwidth transfers (one per fabric device) + * @param fabric_devices List of fabric device names to create rails on + * @param provider_name Provider name (e.g., "efa", "verbs", "sockets") * @return NIXL_SUCCESS on success, error code on failure */ nixl_status_t - createDataRails(const std::vector &efa_devices, const std::string &provider_name); + createDataRails(const std::vector &fabric_devices, const std::string &provider_name); /** Create control rails for connection management and notifications - * @param efa_devices List of EFA device names - * @param provider_name Provider name ("efa" or "efa-direct") + * @param fabric_devices List of fabric device names + * @param provider_name Provider name (e.g., "efa", "verbs", "sockets") * @param num_control_rails Number of control rails to create * @return NIXL_SUCCESS on success, error code on failure */ nixl_status_t - createControlRails(const std::vector &efa_devices, + createControlRails(const std::vector &fabric_devices, const std::string &provider_name, size_t num_control_rails); @@ -302,8 +302,8 @@ class nixlLibfabricRailManager { std::unique_ptr topology; - // EFA device to rail mapping - std::unordered_map efa_device_to_rail_map; + // Fabric device to rail mapping + std::unordered_map device_to_rail_map; 
// Active Rail Tracking System std::unordered_set active_rails_; diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 6fd66167f9..61cf121130 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -65,18 +65,18 @@ nixlLibfabricTopology::discoverTopology() { NIXL_ERROR << "Failed to initialize hwloc topology"; return status; } - // Discover EFA devices using libfabric - status = discoverEfaDevices(); + // Discover fabric devices using libfabric + status = discoverDevices(); if (status != NIXL_SUCCESS) { return status; } - // For EFA devices, build PCIe to Libfabric device mapping and full topology - if (provider_name == "efa") { + // For RDMA providers (EFA, verbs, etc.), build PCIe to Libfabric device mapping and full topology + if (isRdmaProvider()) { // Build PCIe to Libfabric device mapping status = buildPcieToLibfabricMapping(); if (status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to build PCIe to Libfabric mapping - this is required for EFA " - "topology discovery"; + NIXL_ERROR << "Failed to build PCIe to Libfabric mapping - this is required for " + << provider_name << " topology discovery"; return status; } // Discover hardware topology using hwloc @@ -85,10 +85,10 @@ nixlLibfabricTopology::discoverTopology() { NIXL_ERROR << "Failed to discover hwloc topology"; return status; } - // Build GPU to EFA mapping based on PCIe topology - status = buildGpuToEfaMapping(); + // Build GPU to NIC mapping based on PCIe topology + status = buildGpuToNicMapping(); if (status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to build GPU to EFA mapping"; + NIXL_ERROR << "Failed to build GPU to NIC mapping for " << provider_name; return status; } } else { @@ -108,8 +108,16 @@ nixlLibfabricTopology::discoverTopology() { return NIXL_SUCCESS; } +bool +nixlLibfabricTopology::isRdmaProvider() const { + return (provider_name == "efa" || + provider_name == "verbs" || + provider_name 
== "psm2" || + provider_name == "cxi"); +} + nixl_status_t -nixlLibfabricTopology::discoverEfaDevices() { +nixlLibfabricTopology::discoverDevices() { // Use the utility function from libfabric_common auto network_device = LibfabricUtils::getAvailableNetworkDevices(); provider_name = network_device.first; @@ -117,30 +125,34 @@ nixlLibfabricTopology::discoverEfaDevices() { num_devices = all_devices.size(); - // Set device type based on discovered provider + // Log discovered provider and device count if (provider_name == "efa") { - NIXL_INFO << "Discovered " << num_devices << " EFA-Direct devices"; + NIXL_INFO << "Discovered " << num_devices << " EFA devices"; + } else if (provider_name == "verbs") { + NIXL_INFO << "Discovered " << num_devices << " verbs devices (RDMA)"; } else if (provider_name == "sockets") { NIXL_INFO << "Discovered " << num_devices << " socket devices (TCP fallback)"; } else if (provider_name == "none" || all_devices.empty()) { NIXL_WARN << "No network devices found"; return NIXL_ERR_BACKEND; + } else { + NIXL_INFO << "Discovered " << num_devices << " " << provider_name << " devices"; } for (size_t i = 0; i < all_devices.size(); ++i) { - NIXL_TRACE << "Network device " << i << ": " << all_devices[i] + NIXL_TRACE << "Device " << i << ": " << all_devices[i] << " (provider: " << provider_name << ")"; } return NIXL_SUCCESS; } std::vector -nixlLibfabricTopology::getEfaDevicesForGpu(int gpu_id) const { - auto it = gpu_to_efa_devices.find(gpu_id); - if (it != gpu_to_efa_devices.end()) { +nixlLibfabricTopology::getNicsForGpu(int gpu_id) const { + auto it = gpu_to_nics.find(gpu_id); + if (it != gpu_to_nics.end()) { return it->second; } - NIXL_WARN << "No EFA devices found for GPU " << gpu_id << ", returning all devices"; + NIXL_WARN << "No NICs found for GPU " << gpu_id << ", returning all devices"; return all_devices; } @@ -150,23 +162,24 @@ nixlLibfabricTopology::isValidGpuId(int gpu_id) const { } bool -nixlLibfabricTopology::isValidDevice(const 
std::string &efa_device) const { - return std::find(all_devices.begin(), all_devices.end(), efa_device) != all_devices.end(); +nixlLibfabricTopology::isValidDevice(const std::string &device_name) const { + return std::find(all_devices.begin(), all_devices.end(), device_name) != all_devices.end(); } void nixlLibfabricTopology::printTopologyInfo() const { NIXL_TRACE << "=== Libfabric Topology Information ==="; NIXL_TRACE << "Topology discovered: " << (topology_discovered ? "Yes" : "No"); + NIXL_TRACE << "Provider: " << provider_name; NIXL_TRACE << "Number of GPUs: " << num_gpus; NIXL_TRACE << "Number of NUMA nodes: " << num_numa_nodes; - NIXL_TRACE << "Number of EFA devices: " << num_devices; - NIXL_TRACE << "EFA devices: "; + NIXL_TRACE << "Number of devices: " << num_devices; + NIXL_TRACE << "Available devices: "; for (size_t i = 0; i < all_devices.size(); ++i) { NIXL_TRACE << " [" << i << "] " << all_devices[i]; } - NIXL_TRACE << "GPU → EFA mapping:"; - for (const auto &pair : gpu_to_efa_devices) { + NIXL_TRACE << "GPU → NIC mapping:"; + for (const auto &pair : gpu_to_nics) { std::stringstream ss; ss << " GPU " << pair.first << " → ["; for (size_t i = 0; i < pair.second.size(); ++i) { @@ -176,7 +189,7 @@ nixlLibfabricTopology::printTopologyInfo() const { ss << "]"; NIXL_TRACE << ss.str(); } - NIXL_TRACE << "Host memory (DRAM) will use all available EFA devices for maximum bandwidth"; + NIXL_TRACE << "Host memory (DRAM) will use all available devices for maximum bandwidth"; NIXL_TRACE << "====================================="; } @@ -184,9 +197,10 @@ std::string nixlLibfabricTopology::getTopologyString() const { std::stringstream ss; ss << "Libfabric Topology: "; + ss << "Provider=" << provider_name << ", "; ss << "GPUs=" << num_gpus << ", "; ss << "NUMA=" << num_numa_nodes << ", "; - ss << "EFA=" << num_devices << ", "; + ss << "Devices=" << num_devices << ", "; ss << "Discovered=" << (topology_discovered ? 
"Yes" : "No"); return ss.str(); } @@ -215,7 +229,7 @@ nixlLibfabricTopology::initHwlocTopology() { return NIXL_ERR_BACKEND; } - // Enable I/O device discovery - this is the key to seeing EFA devices! + // Enable I/O device discovery - this is the key to seeing PCIe NICs! #if (HWLOC_API_VERSION >= 0x00020000) enum hwloc_type_filter_e filter = HWLOC_TYPE_FILTER_KEEP_ALL; ret = hwloc_topology_set_io_types_filter(hwloc_topology, filter); @@ -272,15 +286,15 @@ nixlLibfabricTopology::discoverHwlocTopology() { NIXL_ERROR << "hwloc topology not initialized"; return NIXL_ERR_BACKEND; } - // Discover GPUs and EFA devices using hwloc + // Discover GPUs and fabric devices using hwloc nixl_status_t status = discoverGpusWithHwloc(); if (status != NIXL_SUCCESS) { NIXL_ERROR << "Failed to discover GPUs with hwloc"; return status; } - status = discoverEfaDevicesWithHwloc(); + status = discoverDevicesWithHwloc(); if (status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to discover EFA devices with hwloc"; + NIXL_ERROR << "Failed to discover devices with hwloc"; return status; } // Discover NUMA topology @@ -351,24 +365,47 @@ nixlLibfabricTopology::discoverGpusWithHwloc() { } nixl_status_t -nixlLibfabricTopology::discoverEfaDevicesWithHwloc() { - // EFA devices are already discovered via libfabric +nixlLibfabricTopology::discoverDevicesWithHwloc() { + // Fabric devices are already discovered via libfabric // This method validates the hwloc discovery matches libfabric discovery - int hwloc_efa_count = 0; - hwloc_obj_t pci_obj = nullptr; - while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { - if (isEfaDevice(pci_obj)) { - hwloc_efa_count++; - NIXL_TRACE << "Found EFA device via hwloc: " << getPcieAddressFromHwlocObj(pci_obj); + // Only validate for providers with specific hwloc checks + if (provider_name == "efa") { + int hwloc_device_count = 0; + hwloc_obj_t pci_obj = nullptr; + while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { + 
if (isEfaDevice(pci_obj)) { + hwloc_device_count++; + NIXL_TRACE << "Found EFA device via hwloc: " << getPcieAddressFromHwlocObj(pci_obj); + } } - } - NIXL_TRACE << "hwloc found " << hwloc_efa_count << " EFA devices, libfabric found " - << num_devices; + NIXL_TRACE << "hwloc found " << hwloc_device_count << " EFA devices, libfabric found " + << num_devices; - if (hwloc_efa_count != num_devices) { - NIXL_WARN << "Mismatch between hwloc (" << hwloc_efa_count << ") and libfabric (" - << num_devices << ") EFA device counts"; + if (hwloc_device_count != num_devices) { + NIXL_WARN << "Mismatch between hwloc (" << hwloc_device_count << ") and libfabric (" + << num_devices << ") EFA device counts"; + } + } else if (provider_name == "verbs") { + int hwloc_device_count = 0; + hwloc_obj_t pci_obj = nullptr; + while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { + if (isMellanoxNic(pci_obj)) { + hwloc_device_count++; + NIXL_TRACE << "Found Mellanox NIC via hwloc: " << getPcieAddressFromHwlocObj(pci_obj); + } + } + + NIXL_TRACE << "hwloc found " << hwloc_device_count << " Mellanox NICs, libfabric found " + << num_devices; + + if (hwloc_device_count != num_devices) { + NIXL_WARN << "Mismatch between hwloc (" << hwloc_device_count << ") and libfabric (" + << num_devices << ") Mellanox NIC counts"; + } + } else { + // For other providers (sockets, psm2, etc.), skip hwloc validation + NIXL_TRACE << "Skipping hwloc device validation for provider: " << provider_name; } return NIXL_SUCCESS; @@ -379,7 +416,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { pcie_to_libfabric_map.clear(); libfabric_to_pcie_map.clear(); - // Get EFA device info with PCIe addresses from libfabric + // Get fabric device info with PCIe addresses from libfabric struct fi_info *hints, *info; hints = fi_allocinfo(); @@ -433,16 +470,16 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { } nixl_status_t -nixlLibfabricTopology::buildGpuToEfaMapping() { - 
gpu_to_efa_devices.clear(); - // Implement NIXL's topology-aware GPU-EFA grouping algorithm +nixlLibfabricTopology::buildGpuToNicMapping() { + gpu_to_nics.clear(); + // Implement NIXL's topology-aware GPU-NIC grouping algorithm nixl_status_t status = buildTopologyAwareGrouping(); if (status != NIXL_SUCCESS) { NIXL_WARN << "Topology-aware grouping failed, using fallback to use all available devices"; return buildFallbackMapping(); } - NIXL_TRACE << "Built GPU→EFA mapping for " << gpu_to_efa_devices.size() + NIXL_TRACE << "Built GPU→NIC mapping for " << gpu_to_nics.size() << " GPUs using topology-aware algorithm"; return NIXL_SUCCESS; @@ -516,13 +553,13 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { if (status != NIXL_SUCCESS) { return status; } - // Step 4: Convert groups to GPU→EFA mapping + // Step 4: Convert groups to GPU→NIC mapping for (size_t group_idx = 0; group_idx < nic_groups.size(); ++group_idx) { const auto &group = nic_groups[group_idx]; if (group.has_gpu) { - std::vector gpu_efa_devices; + std::vector gpu_nics; for (const auto &nic : group.nics) { - gpu_efa_devices.push_back(nic.libfabric_name); + gpu_nics.push_back(nic.libfabric_name); } // Find GPU index in our discovered GPUs list int gpu_index = -1; @@ -538,13 +575,13 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { } if (gpu_index >= 0) { - gpu_to_efa_devices[gpu_index] = gpu_efa_devices; + gpu_to_nics[gpu_index] = gpu_nics; NIXL_TRACE << "GPU " << gpu_index << " (" << std::hex << group.closest_gpu.domain_id << ":" << static_cast(group.closest_gpu.bus_id) << ":" << static_cast(group.closest_gpu.device_id) << "." 
<< static_cast(group.closest_gpu.function_id) << std::dec << ") → " - << gpu_efa_devices.size() << " EFA devices"; + << gpu_nics.size() << " NICs"; } } } @@ -554,10 +591,10 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { nixl_status_t nixlLibfabricTopology::buildFallbackMapping() { // Fallback: if specific mapping failed, use simple approach - gpu_to_efa_devices.clear(); + gpu_to_nics.clear(); // Give all devices to all GPUs (not optimal but functional) for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { - gpu_to_efa_devices[gpu_id] = all_devices; + gpu_to_nics[gpu_id] = all_devices; } return NIXL_SUCCESS; } @@ -624,6 +661,20 @@ nixlLibfabricTopology::isEfaDevice(hwloc_obj_t obj) const { (obj->attr->pcidev.device_id & 0xfff0) == 0xefa0; } +bool +nixlLibfabricTopology::isMellanoxNic(hwloc_obj_t obj) const { + if (!obj || obj->type != HWLOC_OBJ_PCI_DEVICE) { + return false; + } + + // Mellanox/NVIDIA vendor ID is 0x15b3 + // Class 0x0200 is Network controller (Ethernet) + // Class 0x0207 is InfiniBand controller + uint16_t class_id = obj->attr->pcidev.class_id; + return obj->attr->pcidev.vendor_id == 0x15b3 && + (class_id == 0x0200 || class_id == 0x0207); +} + nixl_status_t nixlLibfabricTopology::groupNicsWithGpus(const std::vector &discovered_nics, const std::vector &discovered_gpus, diff --git a/src/utils/libfabric/libfabric_topology.h b/src/utils/libfabric/libfabric_topology.h index 93ec863c7c..d87134c2e3 100644 --- a/src/utils/libfabric/libfabric_topology.h +++ b/src/utils/libfabric/libfabric_topology.h @@ -24,16 +24,16 @@ #include /** - * @brief Topology discovery and management for AWS instances with EFA devices + * @brief Topology discovery and management for libfabric devices * - * Automatically discovers system topology using hwloc and maps GPUs to EFA devices - * based on PCIe proximity for optimal performance. Falls back to TCP/sockets - * when EFA devices are not available. 
+ * Automatically discovers system topology using hwloc and maps GPUs to NICs + * based on PCIe proximity for optimal performance. Supports EFA, verbs, and other + * RDMA providers. Falls back to TCP/sockets when RDMA devices are not available. */ class nixlLibfabricTopology { private: - // GPU to EFA device mapping: GPU 0→[efa0,efa1], GPU 1→[efa2,efa3], etc. - std::map> gpu_to_efa_devices; + // GPU to NIC mapping for RDMA providers: GPU 0→[rdmap0s6-rdm,rdmap1s6-rdm], GPU 1→[rdmap2s6-rdm,rdmap3s6-rdm], etc. + std::map> gpu_to_nics; // All available network devices discovered on this system std::vector all_devices; @@ -58,9 +58,11 @@ class nixlLibfabricTopology { // Helper methods nixl_status_t - discoverEfaDevices(); + discoverDevices(); nixl_status_t discoverTopology(); + bool + isRdmaProvider() const; // hwloc-based discovery methods nixl_status_t @@ -72,9 +74,9 @@ class nixlLibfabricTopology { nixl_status_t discoverGpusWithHwloc(); nixl_status_t - discoverEfaDevicesWithHwloc(); + discoverDevicesWithHwloc(); nixl_status_t - buildGpuToEfaMapping(); + buildGpuToNicMapping(); void cleanupHwlocTopology(); @@ -122,6 +124,8 @@ class nixlLibfabricTopology { isNvidiaGpu(hwloc_obj_t obj) const; bool isEfaDevice(hwloc_obj_t obj) const; + bool + isMellanoxNic(hwloc_obj_t obj) const; public: nixlLibfabricTopology(); // Automatically discovers topology @@ -129,7 +133,7 @@ class nixlLibfabricTopology { // GPU-based queries (main interface) std::vector - getEfaDevicesForGpu(int gpu_id) const; + getNicsForGpu(int gpu_id) const; // System information int @@ -156,7 +160,7 @@ class nixlLibfabricTopology { bool isValidGpuId(int gpu_id) const; bool - isValidDevice(const std::string &efa_device) const; + isValidDevice(const std::string &device_name) const; // Debug/info void diff --git a/test/unit/utils/libfabric/libfabric_topology_test.cpp b/test/unit/utils/libfabric/libfabric_topology_test.cpp index 1352cf588e..33ae616703 100644 --- 
a/test/unit/utils/libfabric/libfabric_topology_test.cpp +++ b/test/unit/utils/libfabric/libfabric_topology_test.cpp @@ -44,14 +44,14 @@ main() { NIXL_INFO << "3. Testing GPU-specific queries (detected " << num_gpus << " GPUs)..."; int test_gpus = std::min(num_gpus, 3); // Test up to 3 GPUs or all available for (int gpu_id = 0; gpu_id < test_gpus; ++gpu_id) { - auto gpu_devices = topology.getEfaDevicesForGpu(gpu_id); + auto gpu_devices = topology.getNicsForGpu(gpu_id); std::string device_list; for (const auto &device : gpu_devices) { if (!device_list.empty()) device_list += " "; device_list += device; } NIXL_INFO << " GPU " << gpu_id << " mapped to " << gpu_devices.size() - << " EFA devices: " << device_list; + << " devices: " << device_list; } } else { NIXL_INFO << "3. Skipping GPU-specific tests (no GPUs detected)"; From 2a4391f5790fb73fcae2146918d1a15550b8f640 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Fri, 3 Oct 2025 13:30:10 -0700 Subject: [PATCH 03/14] libfabric: better hw detection, FI_HMEM support - Enable smart auto-detection of device types - Dual-path memory registration (GDR, FI_HMEM) - Device type as custom param (priority: envvar, backend param, auto-detect) --- src/plugins/libfabric/libfabric_backend.cpp | 56 ++++++++- src/plugins/libfabric/libfabric_backend.h | 3 + src/utils/libfabric/libfabric_common.h | 10 ++ src/utils/libfabric/libfabric_rail.cpp | 107 ++++++++++++++---- src/utils/libfabric/libfabric_rail.h | 12 +- .../libfabric/libfabric_rail_manager.cpp | 21 +++- src/utils/libfabric/libfabric_rail_manager.h | 11 ++ src/utils/libfabric/libfabric_topology.cpp | 22 ++-- src/utils/libfabric/libfabric_topology.h | 14 ++- 9 files changed, 224 insertions(+), 32 deletions(-) diff --git a/src/plugins/libfabric/libfabric_backend.cpp b/src/plugins/libfabric/libfabric_backend.cpp index e650a45927..f0ea8bde49 100644 --- a/src/plugins/libfabric/libfabric_backend.cpp +++ b/src/plugins/libfabric/libfabric_backend.cpp @@ -263,6 +263,21 @@ 
nixlLibfabricEngine::nixlLibfabricEngine(const nixlBackendInitParams *init_param NIXL_DEBUG << "Using default striping threshold: " << striping_threshold_ << " bytes"; } + // Parse default HMEM interface parameter + // Auto-detect from topology if not specified + std::string hmem_iface_str; + if (getInitParam("default_hmem_iface", hmem_iface_str) == NIXL_SUCCESS) { + default_hmem_iface_ = hmem_iface_str; + NIXL_DEBUG << "Using custom default HMEM interface from backend params: " << default_hmem_iface_; + } else { + // Auto-detect device type from topology + // Note: topology discovery happens in rail_manager constructor + // For now, leave empty to use GDR fallback by default + // SynapseAI will be auto-detected per-registration via /dev/accel check + default_hmem_iface_ = ""; + NIXL_DEBUG << "No default HMEM interface specified, will auto-detect per-registration"; + } + // Initialize Rail Manager which will discover the topology and create all rails. try { NIXL_DEBUG << "Rail Manager created with " << rail_manager.getNumDataRails() @@ -787,14 +802,53 @@ nixlLibfabricEngine::registerMem(const nixlBlobDesc &mem, #endif } + // Determine HMEM interface hint based on priority: + // 1. Environment variables (highest priority) + // 2. Per-registration hints via metaInfo blob + // 3. Backend-wide defaults from custom params + // 4. 
Auto-detection (fallback - empty string) + std::string hmem_hint; + + // Priority 1: Check environment variables + const char* env_hmem = getenv("HMEM_IFACE"); + if (env_hmem && env_hmem[0] != '\0') { + hmem_hint = env_hmem; + NIXL_DEBUG << "Using HMEM interface from environment variable: " << hmem_hint; + } + // Priority 2: Check per-registration hint from metaInfo + else if (!mem.metaInfo.empty()) { + hmem_hint = std::string(mem.metaInfo.begin(), mem.metaInfo.end()); + NIXL_DEBUG << "Using HMEM interface from metaInfo hint: " << hmem_hint; + } + // Priority 3: Use backend-wide default + else if (!default_hmem_iface_.empty()) { + hmem_hint = default_hmem_iface_; + NIXL_DEBUG << "Using HMEM interface from backend default: " << hmem_hint; + } + // Priority 4: Auto-detect from system topology + else { + // Auto-detect device type based on topology discovery + // Intel HPU requires FI_HMEM_SYNAPSEAI (no GDR support exists) + // NVIDIA GPU can use GDR fallback (empty hint) + if (nixl_mem == VRAM_SEG && rail_manager.getNumIntelHpus() > 0) { + hmem_hint = "SYNAPSEAI"; + NIXL_DEBUG << "Auto-detected Intel HPU system, using HMEM interface: SYNAPSEAI"; + } else { + // Leave empty for GDR fallback (CUDA) or DRAM + NIXL_DEBUG << "Auto-detection: using GDR fallback (empty hint)"; + } + } + // Use Rail Manager for centralized memory registration with GPU Direct RDMA support NIXL_TRACE << "Registering memory: addr=" << (void *)mem.addr << " len=" << mem.len - << " mem_type=" << nixl_mem << " devId=" << mem.devId; + << " mem_type=" << nixl_mem << " devId=" << mem.devId + << " hmem_hint=" << (hmem_hint.empty() ? 
"auto" : hmem_hint); nixl_status_t status = rail_manager.registerMemory((void *)mem.addr, mem.len, nixl_mem, mem.devId, + hmem_hint, priv->rail_mr_list_, priv->rail_key_list_, priv->selected_rails_); diff --git a/src/plugins/libfabric/libfabric_backend.h b/src/plugins/libfabric/libfabric_backend.h index 9d97d4df20..fcee2544c4 100644 --- a/src/plugins/libfabric/libfabric_backend.h +++ b/src/plugins/libfabric/libfabric_backend.h @@ -184,6 +184,9 @@ class nixlLibfabricEngine : public nixlBackendEngine { mutable size_t total_transfer_size_; + // HMEM interface management + std::string default_hmem_iface_; // Backend-wide default HMEM interface from custom params (default: "" = auto-detect per registration) + // Map of agent name to connection info // > mutable std::unordered_map> connections_; diff --git a/src/utils/libfabric/libfabric_common.h b/src/utils/libfabric/libfabric_common.h index 4c149b445a..25347d83f2 100644 --- a/src/utils/libfabric/libfabric_common.h +++ b/src/utils/libfabric/libfabric_common.h @@ -31,6 +31,16 @@ #include #include #include +#include + +// FI_HMEM constants compatibility for older libfabric versions (< 1.14.0) +#ifndef FI_HMEM_CUDA +#define FI_HMEM_CUDA ((fi_hmem_iface)1) +#endif + +#ifndef FI_HMEM_SYNAPSEAI +#define FI_HMEM_SYNAPSEAI ((fi_hmem_iface)8) +#endif // Libfabric configuration constants #define NIXL_LIBFABRIC_DEFAULT_CONTROL_RAILS 1 #define NIXL_LIBFABRIC_CQ_SREAD_TIMEOUT_SEC 1 diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index bf15c69c6b..7235232036 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -21,6 +21,7 @@ #include "serdes/serdes.h" #include "libfabric_common.h" +#include #include #include #include @@ -1256,6 +1257,8 @@ nixlLibfabricRail::postRead(void *local_buffer, nixl_status_t nixlLibfabricRail::registerMemory(void *buffer, size_t length, + const std::string &hmem_hint, + int device_id, struct fid_mr **mr_out, uint64_t *key_out) const { if (!buffer || !mr_out || !key_out) { @@ -1279,30 
+1282,96 @@ nixlLibfabricRail::registerMemory(void *buffer, } struct fid_mr *mr; + int ret; - // For TCP providers, use a unique key to avoid conflicts - // TCP provider assigns key 0 by default, but we need unique keys for multiple registrations - uint64_t requested_key = 0; - if (provider_name == "tcp" || provider_name == "sockets") { - // Generate a unique key based on buffer address to avoid collisions - // Use the lower bits of the buffer address as a simple unique identifier - requested_key = reinterpret_cast(buffer) & 0xFFFFFFFF; + // Determine registration method based on hint: + // - Empty hint: Use GDR method (fi_mr_reg) - Default path + // - With hint: Use FI_HMEM method (fi_mr_regattr) - Required for SynapseAI, optional for CUDA + + std::string hint_lower = hmem_hint; + std::transform(hint_lower.begin(), hint_lower.end(), hint_lower.begin(), ::tolower); - NIXL_DEBUG << "TCP provider: using requested key " << requested_key << " for buffer " - << buffer << " on rail " << rail_id; + // Validate hint and check if explicit FI_HMEM registration is requested + bool use_hmem = false; + if (!hint_lower.empty()) { + if (hint_lower == "cuda" || hint_lower == "synapseai") { + use_hmem = true; + } else { + NIXL_WARN << "Unknown HMEM hint '" << hmem_hint << "' on rail " << rail_id + << ", falling back to GDR method. 
Valid hints: CUDA, SYNAPSEAI"; + } } - NIXL_TRACE << "Memory Registration: rail=" << rail_id << " provider=" << provider_name - << " buffer=" << buffer << " length=" << length << " access_flags=0x" << std::hex - << provider_access_flags << std::dec << " requested_key=" << requested_key; + if (use_hmem) { + // === FI_HMEM Path === + NIXL_DEBUG << "Using FI_HMEM registration method on rail " << rail_id + << " (hint=" << hmem_hint << ", device_id=" << device_id << ")"; + + // Use fi_mr_regattr for HMEM device memory registration + struct fi_mr_attr mr_attr = {}; + struct iovec iov = {}; + + iov.iov_base = buffer; + iov.iov_len = length; + + mr_attr.mr_iov = &iov; + mr_attr.iov_count = 1; + mr_attr.access = provider_access_flags; + + // Map hint to FI_HMEM interface and set device ID + if (hint_lower == "cuda") { + mr_attr.iface = FI_HMEM_CUDA; + mr_attr.device.cuda = device_id; // Critical for multi-GPU + NIXL_DEBUG << "Using CUDA HMEM interface for memory registration on rail " << rail_id + << " device_id=" << device_id; + } else if (hint_lower == "synapseai") { + mr_attr.iface = FI_HMEM_SYNAPSEAI; + mr_attr.device.synapseai = static_cast(device_id); // Critical for multi-device + NIXL_DEBUG << "Using SynapseAI HMEM interface for memory registration on rail " << rail_id + << " device_id=" << device_id; + } - int ret = - fi_mr_reg(domain, buffer, length, provider_access_flags, 0, requested_key, 0, &mr, NULL); - if (ret) { - NIXL_ERROR << "fi_mr_reg failed on rail " << rail_id << ": " << fi_strerror(-ret) - << " (buffer=" << buffer << ", length=" << length - << ", requested_key=" << requested_key << ")"; - return NIXL_ERR_BACKEND; + NIXL_TRACE << "HMEM Registration: rail=" << rail_id << " provider=" << provider_name + << " buffer=" << buffer << " length=" << length << " iface=" << mr_attr.iface + << " device_id=" << device_id + << " access_flags=0x" << std::hex << provider_access_flags << std::dec; + + ret = fi_mr_regattr(domain, &mr_attr, 0, &mr); + if (ret) { + 
NIXL_ERROR << "fi_mr_regattr (HMEM) failed on rail " << rail_id << ": " << fi_strerror(-ret) + << " (buffer=" << buffer << ", length=" << length + << ", hint=" << hmem_hint << ", iface=" << mr_attr.iface + << ", device_id=" << device_id << ")"; + return NIXL_ERR_BACKEND; + } + } else { + // === GDR Path (Default) === + // Uses standard fi_mr_reg() which relies on GPU Direct RDMA kernel modules + // (nvidia-peermem) to enable direct NIC-to-GPU memory access. + + NIXL_DEBUG << "Using GDR registration method on rail " << rail_id + << " (standard fi_mr_reg, relies on nvidia-peermem kernel module)"; + + // For TCP providers, use a unique key to avoid conflicts + uint64_t requested_key = 0; + if (provider_name == "tcp" || provider_name == "sockets") { + // Generate a unique key based on buffer address to avoid collisions + requested_key = reinterpret_cast(buffer) & 0xFFFFFFFF; + NIXL_DEBUG << "TCP provider: using requested key " << requested_key << " for buffer " + << buffer << " on rail " << rail_id; + } + + NIXL_TRACE << "GDR Memory Registration: rail=" << rail_id << " provider=" << provider_name + << " buffer=" << buffer << " length=" << length << " access_flags=0x" << std::hex + << provider_access_flags << std::dec << " requested_key=" << requested_key; + + ret = fi_mr_reg(domain, buffer, length, provider_access_flags, 0, requested_key, 0, &mr, NULL); + if (ret) { + NIXL_ERROR << "fi_mr_reg failed on rail " << rail_id << ": " << fi_strerror(-ret) + << " (buffer=" << buffer << ", length=" << length + << ", requested_key=" << requested_key << ")"; + return NIXL_ERR_BACKEND; + } } *mr_out = mr; diff --git a/src/utils/libfabric/libfabric_rail.h b/src/utils/libfabric/libfabric_rail.h index 7fe7f56906..daba0137f6 100644 --- a/src/utils/libfabric/libfabric_rail.h +++ b/src/utils/libfabric/libfabric_rail.h @@ -279,9 +279,17 @@ class nixlLibfabricRail { isProperlyInitialized() const; // Memory registration methods - /** Register memory buffer with libfabric */ + /** Register 
memory buffer with libfabric with HMEM support + * @param buffer Memory buffer to register + * @param length Buffer length in bytes + * @param hmem_hint HMEM interface hint ("cuda", "synapseai", or empty for auto-detection) + * @param device_id Device ID for GPU memory (used when hmem_hint is specified, -1 for host memory) + * @param mr_out Output memory registration handle + * @param key_out Output remote access key + * @return NIXL_SUCCESS on success, error code on failure + */ nixl_status_t - registerMemory(void *buffer, size_t length, struct fid_mr **mr_out, uint64_t *key_out) const; + registerMemory(void *buffer, size_t length, const std::string &hmem_hint, int device_id, struct fid_mr **mr_out, uint64_t *key_out) const; /** Deregister memory from libfabric */ nixl_status_t diff --git a/src/utils/libfabric/libfabric_rail_manager.cpp b/src/utils/libfabric/libfabric_rail_manager.cpp index e2b0287520..e2d639c6f5 100644 --- a/src/utils/libfabric/libfabric_rail_manager.cpp +++ b/src/utils/libfabric/libfabric_rail_manager.cpp @@ -381,6 +381,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer, size_t length, nixl_mem_t mem_type, int gpu_id, + const std::string &hmem_hint, std::vector &mr_list_out, std::vector &key_list_out, std::vector &selected_rails_out) { @@ -401,7 +402,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer, key_list_out.resize(data_rails_.size(), 0); selected_rails_out = selected_rails; // Return which rails were selected - // Register memory on each selected rail + // Register memory on each selected rail with HMEM hint for (size_t i = 0; i < selected_rails.size(); ++i) { size_t rail_idx = selected_rails[i]; if (rail_idx >= data_rails_.size()) { @@ -419,7 +420,7 @@ nixlLibfabricRailManager::registerMemory(void *buffer, struct fid_mr *mr; uint64_t key; - nixl_status_t status = data_rails_[rail_idx]->registerMemory(buffer, length, &mr, &key); + nixl_status_t status = data_rails_[rail_idx]->registerMemory(buffer, length, hmem_hint, 
gpu_id, &mr, &key); if (status != NIXL_SUCCESS) { NIXL_ERROR << "Failed to register memory on rail " << rail_idx; // Cleanup already registered MRs @@ -892,3 +893,19 @@ nixlLibfabricRailManager::getActiveRailCount() const { std::lock_guard lock(active_rails_mutex_); return active_rails_.size(); } + +int +nixlLibfabricRailManager::getNumNvidiaGpus() const { + if (topology) { + return topology->getNumNvidiaGpus(); + } + return 0; +} + +int +nixlLibfabricRailManager::getNumIntelHpus() const { + if (topology) { + return topology->getNumIntelHpus(); + } + return 0; +} diff --git a/src/utils/libfabric/libfabric_rail_manager.h b/src/utils/libfabric/libfabric_rail_manager.h index 25fe21d46a..fe68e962d0 100644 --- a/src/utils/libfabric/libfabric_rail_manager.h +++ b/src/utils/libfabric/libfabric_rail_manager.h @@ -110,6 +110,7 @@ class nixlLibfabricRailManager { * @param length Buffer size in bytes * @param mem_type Memory type (DRAM_SEG or VRAM_SEG) * @param gpu_id GPU device ID (used for VRAM_SEG, ignored for DRAM_SEG) + * @param hmem_hint HMEM interface hint ("cuda", "synapseai", or empty for auto-detection) * @param mr_list_out Memory registration handles, indexed by rail ID * @param key_list_out Remote access keys, indexed by rail ID * @param selected_rails_out List of rail IDs where memory was registered @@ -120,6 +121,7 @@ class nixlLibfabricRailManager { size_t length, nixl_mem_t mem_type, int gpu_id, + const std::string &hmem_hint, std::vector &mr_list_out, std::vector &key_list_out, std::vector &selected_rails_out); @@ -244,6 +246,15 @@ class nixlLibfabricRailManager { size_t getActiveRailCount() const; + // Topology Information APIs + /** Get number of NVIDIA GPUs in the system */ + int + getNumNvidiaGpus() const; + + /** Get number of Intel HPUs in the system */ + int + getNumIntelHpus() const; + // Memory Descriptor APIs /** Get memory descriptor for specified rail and MR */ struct fid_mr * diff --git a/src/utils/libfabric/libfabric_topology.cpp 
b/src/utils/libfabric/libfabric_topology.cpp index 61cf121130..656fb20da6 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -34,6 +34,8 @@ nixlLibfabricTopology::nixlLibfabricTopology() : num_gpus(0), + num_nvidia_gpus(0), + num_intel_hpus(0), num_numa_nodes(0), num_devices(0), topology_discovered(false), @@ -98,6 +100,8 @@ nixlLibfabricTopology::discoverTopology() { // Set basic values without hwloc discovery num_gpus = 0; // TCP doesn't need GPU topology + num_nvidia_gpus = 0; + num_intel_hpus = 0; num_numa_nodes = 1; // Simple fallback // For TCP/sockets devices, no GPU-mapping required. @@ -171,7 +175,8 @@ nixlLibfabricTopology::printTopologyInfo() const { NIXL_TRACE << "=== Libfabric Topology Information ==="; NIXL_TRACE << "Topology discovered: " << (topology_discovered ? "Yes" : "No"); NIXL_TRACE << "Provider: " << provider_name; - NIXL_TRACE << "Number of GPUs: " << num_gpus; + NIXL_TRACE << "Number of GPUs: " << num_gpus << " (" << num_nvidia_gpus << " NVIDIA, " + << num_intel_hpus << " Intel HPU)"; NIXL_TRACE << "Number of NUMA nodes: " << num_numa_nodes; NIXL_TRACE << "Number of devices: " << num_devices; NIXL_TRACE << "Available devices: "; @@ -309,7 +314,8 @@ nixlLibfabricTopology::discoverHwlocTopology() { nixl_status_t nixlLibfabricTopology::discoverGpusWithHwloc() { - num_gpus = 0; + num_nvidia_gpus = 0; + num_intel_hpus = 0; // Find all PCI devices and log detailed information hwloc_obj_t pci_obj = nullptr; while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { @@ -320,10 +326,10 @@ nixlLibfabricTopology::discoverGpusWithHwloc() { uint16_t device_id = pci_obj->attr->pcidev.device_id; uint16_t class_id = pci_obj->attr->pcidev.class_id; - NIXL_TRACE << "Found NVIDIA GPU " << num_gpus << ": " << pcie_addr << " (vendor=0x" + NIXL_TRACE << "Found NVIDIA GPU " << num_nvidia_gpus << ": " << pcie_addr << " (vendor=0x" << std::hex << vendor_id << ", device=0x" << device_id << 
", class=0x" << class_id << std::dec << ")"; - num_gpus++; + num_nvidia_gpus++; } else if (isIntelHpu(pci_obj)) { std::string pcie_addr = getPcieAddressFromHwlocObj(pci_obj); // Get device and vendor info @@ -331,14 +337,16 @@ nixlLibfabricTopology::discoverGpusWithHwloc() { uint16_t device_id = pci_obj->attr->pcidev.device_id; uint16_t class_id = pci_obj->attr->pcidev.class_id; - NIXL_TRACE << "Found Intel Habana GPU " << num_gpus << ": " << pcie_addr << " (vendor=0x" + NIXL_TRACE << "Found Intel HPU " << num_intel_hpus << ": " << pcie_addr << " (vendor=0x" << std::hex << vendor_id << ", device=0x" << device_id << ", class=0x" << class_id << std::dec << ")"; - num_gpus++; + num_intel_hpus++; } } - NIXL_TRACE << "Discovered " << num_gpus << " GPUs via hwloc"; + num_gpus = num_nvidia_gpus + num_intel_hpus; + NIXL_TRACE << "Discovered " << num_gpus << " GPUs via hwloc (" << num_nvidia_gpus + << " NVIDIA, " << num_intel_hpus << " Intel HPU)"; // If we found more than 8 GPUs on P5en, investigate further // FIXME: add Habana related messages diff --git a/src/utils/libfabric/libfabric_topology.h b/src/utils/libfabric/libfabric_topology.h index d87134c2e3..68120def23 100644 --- a/src/utils/libfabric/libfabric_topology.h +++ b/src/utils/libfabric/libfabric_topology.h @@ -42,7 +42,9 @@ class nixlLibfabricTopology { std::string provider_name; // System information - int num_gpus; + int num_gpus; // Total GPUs (NVIDIA + Intel HPU) + int num_nvidia_gpus; // NVIDIA GPU count + int num_intel_hpus; // Intel Habana HPU count int num_numa_nodes; int num_devices; @@ -141,6 +143,16 @@ class nixlLibfabricTopology { return num_gpus; } + int + getNumNvidiaGpus() const { + return num_nvidia_gpus; + } + + int + getNumIntelHpus() const { + return num_intel_hpus; + } + const std::vector & getAllDevices() const { return all_devices; From d674c073d92e3444537c8ee9491471846a341e52 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Sun, 5 Oct 2025 15:23:18 -0700 Subject: [PATCH 04/14] libfabric: 
Add clear provider specific config --- src/plugins/libfabric/README.md | 1 + src/utils/libfabric/libfabric_common.cpp | 122 +++++++++++++++++++-- src/utils/libfabric/libfabric_common.h | 13 +++ src/utils/libfabric/libfabric_rail.cpp | 27 +++-- src/utils/libfabric/libfabric_topology.cpp | 6 +- 5 files changed, 149 insertions(+), 20 deletions(-) diff --git a/src/plugins/libfabric/README.md b/src/plugins/libfabric/README.md index 2846bfedb2..70dc8e13bf 100644 --- a/src/plugins/libfabric/README.md +++ b/src/plugins/libfabric/README.md @@ -21,6 +21,7 @@ EFA Specific **Topology-Aware Optimization**: Hardware-aware GPU-to-EFA and NUMA - **Libfabric** - Many system will have installed libfabric already. If not, custom libfabric installation is available via https://ofiwg.github.io/libfabric/ - Minimum required version: v2.3.0rc2 - For EFA enabled AWS instances, it is recommanded to install through AWS EFA installer: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html - Minimum required version: 1.43.2 + - **Note:** HMEM support for some GPU implementations (SynapseAI, etc.) requires libfabric v1.16.x or newer - **hwloc** - hwloc is used to understand the underlying architecture to optimize application performance. 
Suggested version: 2.10.0 or newer diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index 39017b2de9..8eeb228f71 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ b/src/utils/libfabric/libfabric_common.cpp @@ -29,6 +29,93 @@ namespace LibfabricUtils { +// Provider-specific configurations +static const ProviderConfig PROVIDER_CONFIGS[] = { + { + "efa", + FI_MSG | FI_RMA | FI_LOCAL_COMM | FI_REMOTE_COMM, + FI_CONTEXT | FI_CONTEXT2, + 0, // let provider choose + FI_RM_UNSPEC, + FI_THREAD_SAFE + }, + { + "verbs", // Matches both "verbs" and "verbs;ofi_rxm" + FI_MSG | FI_RMA | FI_READ | FI_REMOTE_READ, + 0, // no mode flags required + FI_MR_LOCAL | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY, + FI_RM_ENABLED, + FI_THREAD_SAFE + }, + { + "tcp", + FI_MSG | FI_RMA | FI_LOCAL_COMM | FI_REMOTE_COMM, + FI_CONTEXT | FI_CONTEXT2, + 0, // basic MR mode, overridden in rail.cpp + FI_RM_UNSPEC, + FI_THREAD_UNSPEC + }, + { + "sockets", + FI_MSG | FI_RMA | FI_LOCAL_COMM | FI_REMOTE_COMM, + 0, + 0, // let provider choose + FI_RM_UNSPEC, + FI_THREAD_UNSPEC // default threading + } +}; + +static const size_t NUM_PROVIDER_CONFIGS = sizeof(PROVIDER_CONFIGS) / sizeof(PROVIDER_CONFIGS[0]); + +void +configureHintsForProvider(struct fi_info* hints, const std::string& provider_name) { + const ProviderConfig* config = nullptr; + + // Find matching config + // Match order: 1) exact match, 2) prefix match for composite providers (e.g., "verbs;ofi_rxm") + for (size_t i = 0; i < NUM_PROVIDER_CONFIGS; ++i) { + const std::string& config_name = PROVIDER_CONFIGS[i].name; + + // Exact match + if (provider_name == config_name) { + config = &PROVIDER_CONFIGS[i]; + break; + } + + // Composite provider match (e.g., "verbs;ofi_rxm" matches "verbs") + // Check if provider_name starts with config_name followed by ";" + if (provider_name.rfind(config_name + ";", 0) == 0) { + config = &PROVIDER_CONFIGS[i]; + break; + } + } + + if (!config) { + // 
Default configuration + NIXL_DEBUG << "No specific config for provider '" << provider_name << "', using defaults"; + hints->caps = FI_MSG | FI_RMA | FI_LOCAL_COMM | FI_REMOTE_COMM; + hints->mode = 0; + hints->ep_attr->type = FI_EP_RDM; + return; + } + + // Apply provider-specific configuration + hints->caps = config->caps; + hints->mode = config->mode; + hints->ep_attr->type = FI_EP_RDM; + + if (config->resource_mgmt != FI_RM_UNSPEC) { + hints->domain_attr->resource_mgmt = config->resource_mgmt; + } + + if (config->mr_mode != 0) { + hints->domain_attr->mr_mode = config->mr_mode; + } + + if (config->threading != FI_THREAD_UNSPEC) { + hints->domain_attr->threading = config->threading; + } +} std::pair> getAvailableNetworkDevices() { @@ -43,16 +130,24 @@ getAvailableNetworkDevices() { return {"none", {}}; } - hints->caps = 0; - hints->caps = FI_MSG | FI_RMA; // Basic messaging and RMA + // Check if FI_PROVIDER environment variable is set + const char* env_provider = getenv("FI_PROVIDER"); + std::string provider = env_provider && env_provider[0] != '\0' ? 
env_provider : ""; - hints->caps |= FI_LOCAL_COMM | FI_REMOTE_COMM; - hints->mode = FI_CONTEXT | FI_CONTEXT2; - hints->ep_attr->type = FI_EP_RDM; + if (!provider.empty()) { + hints->fabric_attr->prov_name = strdup(env_provider); + NIXL_INFO << "Using provider from FI_PROVIDER environment: " << env_provider; + // Configure hints based on provider + configureHintsForProvider(hints, provider); + } else { + // Auto-detect: start with default configuration + configureHintsForProvider(hints, ""); + } - int ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info); + // Use FI_VERSION(1, 16) where HMEM support for some GPUs was added + int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, &info); if (ret) { - NIXL_ERROR << "fi_getinfo failed " << fi_strerror(-ret); + NIXL_ERROR << "fi_getinfo failed: " << fi_strerror(-ret); fi_freeinfo(hints); return {"none", {}}; } @@ -85,9 +180,22 @@ getAvailableNetworkDevices() { } } + // Provider selection priority: + // 1. EFA (AWS Elastic Fabric Adapter) + // 2. verbs;ofi_rxm (explicit verbs with RXM) + // 3. verbs (plain verbs) + // 4. sockets (TCP fallback) + if (provider_device_map.find("efa") != provider_device_map.end()) { return {"efa", provider_device_map["efa"]}; + } else if (provider_device_map.find("verbs;ofi_rxm") != provider_device_map.end()) { + // Explicit verbs with RXM + NIXL_INFO << "Using verbs with RXM for RDM endpoint support"; + return {"verbs;ofi_rxm", provider_device_map["verbs;ofi_rxm"]}; } else if (provider_device_map.find("verbs") != provider_device_map.end()) { + // Plain verbs - might not support RDM, but try it + NIXL_WARN << "Using plain verbs provider - may not support RDM endpoints. 
" + << "Consider setting FI_PROVIDER=verbs;ofi_rxm for RDM support"; return {"verbs", provider_device_map["verbs"]}; } else if (provider_device_map.find("sockets") != provider_device_map.end()) { return {"sockets", {provider_device_map["sockets"][0]}}; diff --git a/src/utils/libfabric/libfabric_common.h b/src/utils/libfabric/libfabric_common.h index 25347d83f2..1f1b572149 100644 --- a/src/utils/libfabric/libfabric_common.h +++ b/src/utils/libfabric/libfabric_common.h @@ -152,6 +152,16 @@ struct BinaryNotification { } }; +// Provider configuration structure +struct ProviderConfig { + std::string name; + uint64_t caps; + uint64_t mode; + uint64_t mr_mode; + fi_resource_mgmt resource_mgmt; + fi_threading threading; +}; + // Global XFER_ID management namespace LibfabricUtils { // Get next unique XFER_ID @@ -173,6 +183,9 @@ getAvailableNetworkDevices(); // String utilities std::string hexdump(const void *data); +// Provider configuration helper +void +configureHintsForProvider(struct fi_info* hints, const std::string& provider_name); } // namespace LibfabricUtils #endif // NIXL_SRC_UTILS_LIBFABRIC_LIBFABRIC_COMMON_H diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index 7235232036..21978585f6 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -411,27 +411,30 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, NIXL_ERROR << "fi_allocinfo failed for rail " << rail_id; throw std::runtime_error("Failed to allocate fi_info for rail " + std::to_string(rail_id)); } - hints->caps = 0; - hints->caps = FI_MSG | FI_RMA; - hints->caps |= FI_LOCAL_COMM | FI_REMOTE_COMM; - hints->mode = FI_CONTEXT | FI_CONTEXT2; - hints->ep_attr->type = FI_EP_RDM; - // Configure memory registration mode based on provider capabilities + + // Configure hints based on provider + LibfabricUtils::configureHintsForProvider(hints, provider); + + // Override mr_mode for TCP/sockets (they don't support 
advanced MR features) if (provider == "tcp" || provider == "sockets") { - // TCP provider doesn't support FI_MR_PROV_KEY or FI_MR_VIRT_ADDR, use basic mode hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ALLOCATED; hints->domain_attr->mr_key_size = 0; // Let provider decide } else { - // EFA, verbs and other providers support advanced memory registration - hints->domain_attr->mr_mode = - FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + // Add HMEM support for other providers (EFA, verbs) + if (hints->domain_attr->mr_mode != 0) { + hints->domain_attr->mr_mode |= FI_MR_HMEM; + } else { + hints->domain_attr->mr_mode = + FI_MR_LOCAL | FI_MR_HMEM | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + } hints->domain_attr->mr_key_size = 2; } + hints->domain_attr->name = strdup(device_name.c_str()); - hints->domain_attr->threading = FI_THREAD_SAFE; try { // Get fabric info for this specific device - int ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info); + // Use FI_VERSION(1, 16) for HMEM support for some GPU implementations + int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for rail " << rail_id << ": " << fi_strerror(-ret); throw std::runtime_error("fi_getinfo failed for rail " + std::to_string(rail_id)); diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 656fb20da6..4072e33dbf 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -114,8 +114,10 @@ nixlLibfabricTopology::discoverTopology() { bool nixlLibfabricTopology::isRdmaProvider() const { + // Check for exact match or composite provider (e.g., "verbs;ofi_rxm") return (provider_name == "efa" || provider_name == "verbs" || + provider_name.rfind("verbs;", 0) == 0 || // verbs;ofi_rxm, verbs;* provider_name == "psm2" || provider_name == "cxi"); } @@ -436,8 +438,10 @@ 
nixlLibfabricTopology::buildPcieToLibfabricMapping() { // Configure hints for the discovered provider // This ensures consistency between device discovery and PCIe mapping hints->fabric_attr->prov_name = strdup(provider_name.c_str()); + LibfabricUtils::configureHintsForProvider(hints, provider_name); - int ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info); + // Use FI_VERSION(1, 16) for consistency with other fi_getinfo calls + int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for PCIe mapping with provider " << provider_name << ": " << fi_strerror(-ret); From b7c4b4033b01923eb3f2766e1d527974f46b14be Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Mon, 6 Oct 2025 10:47:59 -0700 Subject: [PATCH 05/14] libfabric: Add SynapseAI dmabuf support --- src/utils/libfabric/libfabric_common.cpp | 8 +- src/utils/libfabric/libfabric_common.h | 9 - src/utils/libfabric/libfabric_rail.cpp | 222 ++++++++++++++++++--- src/utils/libfabric/libfabric_rail.h | 21 ++ src/utils/libfabric/libfabric_topology.cpp | 4 +- 5 files changed, 221 insertions(+), 43 deletions(-) diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index 8eeb228f71..6ed15d50a8 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ b/src/utils/libfabric/libfabric_common.cpp @@ -41,9 +41,9 @@ static const ProviderConfig PROVIDER_CONFIGS[] = { }, { "verbs", // Matches both "verbs" and "verbs;ofi_rxm" - FI_MSG | FI_RMA | FI_READ | FI_REMOTE_READ, + FI_MSG | FI_RMA | FI_READ | FI_WRITE | FI_RECV | FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE | FI_MULTI_RECV | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_HMEM, 0, // no mode flags required - FI_MR_LOCAL | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY, + FI_MR_LOCAL | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_HMEM, FI_RM_ENABLED, FI_THREAD_SAFE }, @@ -144,8 +144,8 @@ getAvailableNetworkDevices() { configureHintsForProvider(hints, ""); } - 
// Use FI_VERSION(1, 16) where HMEM support for some GPUs was added - int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, &info); + // Use FI_VERSION(1, 18) for DMABUF and HMEM support + int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed: " << fi_strerror(-ret); fi_freeinfo(hints); diff --git a/src/utils/libfabric/libfabric_common.h b/src/utils/libfabric/libfabric_common.h index 1f1b572149..1e82753a02 100644 --- a/src/utils/libfabric/libfabric_common.h +++ b/src/utils/libfabric/libfabric_common.h @@ -33,15 +33,6 @@ #include #include -// FI_HMEM constants compatibility for older libfabric versions (< 1.14.0) -#ifndef FI_HMEM_CUDA -#define FI_HMEM_CUDA ((fi_hmem_iface)1) -#endif - -#ifndef FI_HMEM_SYNAPSEAI -#define FI_HMEM_SYNAPSEAI ((fi_hmem_iface)8) -#endif - // Libfabric configuration constants #define NIXL_LIBFABRIC_DEFAULT_CONTROL_RAILS 1 #define NIXL_LIBFABRIC_CQ_SREAD_TIMEOUT_SEC 1 diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index 21978585f6..2a165d7f9d 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -26,6 +26,18 @@ #include #include +#ifdef HAVE_SYNAPSEAI +#include +#include +#include + +// Static SynapseAI library handles +void* nixlLibfabricRail::synapseai_handle_ = nullptr; +void* nixlLibfabricRail::hlthunk_handle_ = nullptr; +std::mutex nixlLibfabricRail::synapseai_init_mutex_; +nixlLibfabricRail::SynapseAIOps nixlLibfabricRail::synapseai_ops_ = {}; +#endif + // RequestPool Base Class Implementation RequestPool::RequestPool(size_t pool_size, size_t rail_id) @@ -433,8 +445,8 @@ nixlLibfabricRail::nixlLibfabricRail(const std::string &device, hints->domain_attr->name = strdup(device_name.c_str()); try { // Get fabric info for this specific device - // Use FI_VERSION(1, 16) for HMEM support for some GPU implementations - int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, 
&info); + // Use FI_VERSION(1, 18) for DMABUF and HMEM support + int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for rail " << rail_id << ": " << fi_strerror(-ret); throw std::runtime_error("fi_getinfo failed for rail " + std::to_string(rail_id)); @@ -1257,6 +1269,28 @@ nixlLibfabricRail::postRead(void *local_buffer, // Memory Registration Methods +uint64_t +nixlLibfabricRail::getMemoryRegistrationAccessFlags() const { + // Start with base flags needed for RDMA operations + uint64_t access_flags = FI_REMOTE_READ | FI_REMOTE_WRITE | FI_SEND | FI_RECV; + + // TCP/sockets providers need additional basic flags + if (provider_name == "tcp" || provider_name == "sockets") { + access_flags |= FI_READ | FI_WRITE; + } + + // Query provider capabilities and add conditionally + if (info && info->domain_attr) { + if (info->caps & FI_READ) access_flags |= FI_READ; + if (info->caps & FI_WRITE) access_flags |= FI_WRITE; + if (info->caps & FI_RMA) { + access_flags |= FI_READ | FI_WRITE; + } + } + + return access_flags; +} + nixl_status_t nixlLibfabricRail::registerMemory(void *buffer, size_t length, @@ -1273,16 +1307,8 @@ nixlLibfabricRail::registerMemory(void *buffer, return NIXL_ERR_BACKEND; } - // Determine access flags based on provider capabilities - uint64_t provider_access_flags; - if (provider_name == "tcp" || provider_name == "sockets") { - // TCP provider has more limited memory registration capabilities - // Use basic flags that are commonly supported - provider_access_flags = FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE; - } else { - // EFA and other providers use standard remote access flags - provider_access_flags = FI_REMOTE_WRITE | FI_REMOTE_READ; - } + // Get access flags based on provider capabilities + uint64_t provider_access_flags = getMemoryRegistrationAccessFlags(); struct fid_mr *mr; int ret; @@ -1327,25 +1353,34 @@ nixlLibfabricRail::registerMemory(void *buffer, mr_attr.device.cuda = 
device_id; // Critical for multi-GPU NIXL_DEBUG << "Using CUDA HMEM interface for memory registration on rail " << rail_id << " device_id=" << device_id; + + NIXL_TRACE << "HMEM Registration: rail=" << rail_id << " provider=" << provider_name + << " buffer=" << buffer << " length=" << length << " iface=" << mr_attr.iface + << " device_id=" << device_id + << " access_flags=0x" << std::hex << provider_access_flags << std::dec; + + ret = fi_mr_regattr(domain, &mr_attr, 0, &mr); + if (ret) { + NIXL_ERROR << "fi_mr_regattr (HMEM) failed on rail " << rail_id << ": " << fi_strerror(-ret) + << " (buffer=" << buffer << ", length=" << length + << ", hint=" << hmem_hint << ", iface=" << mr_attr.iface + << ", device_id=" << device_id << ")"; + return NIXL_ERR_BACKEND; + } } else if (hint_lower == "synapseai") { - mr_attr.iface = FI_HMEM_SYNAPSEAI; - mr_attr.device.synapseai = static_cast(device_id); // Critical for multi-device - NIXL_DEBUG << "Using SynapseAI HMEM interface for memory registration on rail " << rail_id +#ifdef HAVE_SYNAPSEAI + // Use DMABUF path for SynapseAI + NIXL_DEBUG << "Using SynapseAI DMABUF registration on rail " << rail_id << " device_id=" << device_id; - } - - NIXL_TRACE << "HMEM Registration: rail=" << rail_id << " provider=" << provider_name - << " buffer=" << buffer << " length=" << length << " iface=" << mr_attr.iface - << " device_id=" << device_id - << " access_flags=0x" << std::hex << provider_access_flags << std::dec; - ret = fi_mr_regattr(domain, &mr_attr, 0, &mr); - if (ret) { - NIXL_ERROR << "fi_mr_regattr (HMEM) failed on rail " << rail_id << ": " << fi_strerror(-ret) - << " (buffer=" << buffer << ", length=" << length - << ", hint=" << hmem_hint << ", iface=" << mr_attr.iface - << ", device_id=" << device_id << ")"; - return NIXL_ERR_BACKEND; + nixl_status_t status = registerSynapseAIMemoryDmabuf(buffer, length, device_id, provider_access_flags, &mr); + if (status != NIXL_SUCCESS) { + return status; + } +#else + NIXL_ERROR << "SynapseAI 
support not enabled (HAVE_SYNAPSEAI not defined)"; + return NIXL_ERR_NOT_SUPPORTED; +#endif } } else { // === GDR Path (Default) === @@ -1388,6 +1423,137 @@ nixlLibfabricRail::registerMemory(void *buffer, return NIXL_SUCCESS; } +#ifdef HAVE_SYNAPSEAI +nixl_status_t +nixlLibfabricRail::registerSynapseAIMemoryDmabuf(void *buffer, size_t length, int device_id, uint64_t provider_access_flags, struct fid_mr **mr_out) const { + synDeviceId syn_device_id = static_cast(device_id); + synDeviceInfoV2 device_info; + + // Thread-safe initialization of static handles + std::lock_guard lock(synapseai_init_mutex_); + + // Load SynapseAI library functions (shared across instances) + if (!synapseai_handle_) { + synapseai_handle_ = dlopen("libSynapse.so", RTLD_NOW); + if (!synapseai_handle_) { + NIXL_ERROR << "Failed to dlopen libSynapse.so: " << dlerror(); + return NIXL_ERR_BACKEND; + } + + synapseai_ops_.synDeviceGetInfoV2 = + (synStatus (*)(const synDeviceId, synDeviceInfoV2 *))dlsym(synapseai_handle_, "synDeviceGetInfoV2"); + if (!synapseai_ops_.synDeviceGetInfoV2) { + NIXL_ERROR << "Failed to find synDeviceGetInfoV2: " << dlerror(); + return NIXL_ERR_BACKEND; + } + } + + if (!hlthunk_handle_) { + hlthunk_handle_ = dlopen("libhl-thunk.so", RTLD_NOW); + if (!hlthunk_handle_) { + NIXL_ERROR << "Failed to dlopen libhl-thunk.so: " << dlerror(); + return NIXL_ERR_BACKEND; + } + + synapseai_ops_.hlthunk_device_mapped_memory_export_dmabuf_fd = + (int (*)(int, uint64_t, uint64_t, uint64_t, uint32_t))dlsym(hlthunk_handle_, "hlthunk_device_mapped_memory_export_dmabuf_fd"); + if (!synapseai_ops_.hlthunk_device_mapped_memory_export_dmabuf_fd) { + NIXL_ERROR << "Failed to find hlthunk_device_mapped_memory_export_dmabuf_fd: " << dlerror(); + return NIXL_ERR_BACKEND; + } + } + + // Get device info + if (synapseai_ops_.synDeviceGetInfoV2(syn_device_id, &device_info) != synSuccess) { + NIXL_ERROR << "SynapseAI device " << device_id << " not available"; + return NIXL_ERR_BACKEND; + } + + 
NIXL_DEBUG << "Using SynapseAI device ID: " << device_id << " on rail " << rail_id; + + // Calculate aligned buffer size + const size_t ACCEL_PAGE_SIZE = 4096; + size_t modi_memlen = length; + + // Validate memory is within device range + uint64_t hbm_base = device_info.globalHbmBaseAddress; + uint64_t hbm_size = device_info.dramSize; + uint64_t buffer_addr = reinterpret_cast(buffer); + + if (buffer_addr < hbm_base || buffer_addr >= (hbm_base + hbm_size)) { + NIXL_ERROR << "Memory address 0x" << std::hex << buffer_addr + << " is not within HPU device memory range [0x" << hbm_base + << " - 0x" << (hbm_base + hbm_size) << "]" << std::dec; + return NIXL_ERR_INVALID_PARAM; + } + + // Align device offset to page size + uint64_t device_offset = buffer_addr - hbm_base; + uint64_t modi_mem_addr = buffer_addr; + if (buffer_addr % ACCEL_PAGE_SIZE) { + modi_mem_addr = (buffer_addr / ACCEL_PAGE_SIZE) * ACCEL_PAGE_SIZE; + device_offset -= buffer_addr - modi_mem_addr; + modi_memlen += ACCEL_PAGE_SIZE; + } + modi_memlen = (modi_memlen + ACCEL_PAGE_SIZE - 1) & ~(ACCEL_PAGE_SIZE - 1); + + NIXL_DEBUG << "Exporting dmabuf: fd=" << device_info.fd + << " base=0x" << std::hex << hbm_base + << " size=" << std::dec << modi_memlen + << " buffer=0x" << std::hex << buffer_addr + << " aligned=0x" << modi_mem_addr + << " offset=0x" << device_offset << std::dec; + + // Export dmabuf fd + int dmabuf_fd = synapseai_ops_.hlthunk_device_mapped_memory_export_dmabuf_fd( + device_info.fd, + hbm_base, + modi_memlen, + device_offset, + (O_RDWR | O_CLOEXEC) + ); + + if (dmabuf_fd < 0) { + NIXL_ERROR << "hlthunk_device_mapped_memory_export_dmabuf_fd failed: " << strerror(-dmabuf_fd); + return NIXL_ERR_BACKEND; + } + + NIXL_DEBUG << "Got dmabuf_fd: " << dmabuf_fd << " for device memory on rail " << rail_id; + + // Set up dmabuf structure + struct fi_mr_dmabuf dmabuf = {}; + dmabuf.fd = dmabuf_fd; + dmabuf.offset = 0; + dmabuf.len = modi_memlen; + dmabuf.base_addr = reinterpret_cast(modi_mem_addr); + + // 
Set up memory registration attributes + struct fi_mr_attr mr_attr = {}; + mr_attr.dmabuf = &dmabuf; + mr_attr.iov_count = 1; + mr_attr.access = provider_access_flags; + mr_attr.iface = FI_HMEM_SYNAPSEAI; + mr_attr.device.synapseai = static_cast(device_id); + + NIXL_DEBUG << "Registering SynapseAI memory with dmabuf fd: " << dmabuf_fd << " on rail " << rail_id; + + // Register memory with dmabuf + int ret = fi_mr_regattr(domain, &mr_attr, FI_MR_DMABUF, mr_out); + + // Cleanup fd after registration + close(dmabuf_fd); + + if (ret) { + NIXL_ERROR << "fi_mr_regattr (DMABUF) failed on rail " << rail_id << ": " << fi_strerror(-ret); + *mr_out = nullptr; + return NIXL_ERR_BACKEND; + } + + NIXL_INFO << "Successfully registered SynapseAI memory via dmabuf on rail " << rail_id; + return NIXL_SUCCESS; +} +#endif + nixl_status_t nixlLibfabricRail::deregisterMemory(struct fid_mr *mr) const { if (!mr) { diff --git a/src/utils/libfabric/libfabric_rail.h b/src/utils/libfabric/libfabric_rail.h index daba0137f6..d8c7d4439f 100644 --- a/src/utils/libfabric/libfabric_rail.h +++ b/src/utils/libfabric/libfabric_rail.h @@ -383,6 +383,23 @@ class nixlLibfabricRail { nixlLibfabricReq * findRequestFromContext(void *context) const; +#ifdef HAVE_SYNAPSEAI + // SynapseAI DMABUF registration helper + nixl_status_t + registerSynapseAIMemoryDmabuf(void *buffer, size_t length, int device_id, uint64_t provider_access_flags, struct fid_mr **mr_out) const; + + // Static SynapseAI library handles (shared across all rails) + static void *synapseai_handle_; + static void *hlthunk_handle_; + static std::mutex synapseai_init_mutex_; + + struct SynapseAIOps { + synStatus (*synDeviceGetInfoV2)(const synDeviceId deviceId, synDeviceInfoV2 *pDeviceInfo); + int (*hlthunk_device_mapped_memory_export_dmabuf_fd)(int fd, uint64_t addr, uint64_t size, uint64_t offset, uint32_t flags); + }; + static SynapseAIOps synapseai_ops_; +#endif + private: // Core libfabric resources struct fi_info *info; // from 
rail_infos[rail_id] @@ -417,6 +434,10 @@ class nixlLibfabricRail { processRecvCompletion(struct fi_cq_data_entry *comp) const; nixl_status_t processRemoteWriteCompletion(struct fi_cq_data_entry *comp) const; + + // Memory registration helper + uint64_t + getMemoryRegistrationAccessFlags() const; }; diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 4072e33dbf..e73eb2dc84 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -440,8 +440,8 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { hints->fabric_attr->prov_name = strdup(provider_name.c_str()); LibfabricUtils::configureHintsForProvider(hints, provider_name); - // Use FI_VERSION(1, 16) for consistency with other fi_getinfo calls - int ret = fi_getinfo(FI_VERSION(1, 16), NULL, NULL, 0, hints, &info); + // Use FI_VERSION(1, 18) for DMABUF and HMEM support + int ret = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info); if (ret) { NIXL_ERROR << "fi_getinfo failed for PCIe mapping with provider " << provider_name << ": " << fi_strerror(-ret); From d1f2b04de8260b85c171977598e4276bc9b37f9a Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Tue, 7 Oct 2025 09:35:02 -0700 Subject: [PATCH 06/14] libfabric: bonding dev support in topo detection --- src/utils/libfabric/libfabric_common.cpp | 2 + src/utils/libfabric/libfabric_topology.cpp | 112 +++++++++++++++++---- 2 files changed, 93 insertions(+), 21 deletions(-) diff --git a/src/utils/libfabric/libfabric_common.cpp b/src/utils/libfabric/libfabric_common.cpp index 6ed15d50a8..f17e0eee97 100644 --- a/src/utils/libfabric/libfabric_common.cpp +++ b/src/utils/libfabric/libfabric_common.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index e73eb2dc84..7857a156b4 100644 --- a/src/utils/libfabric/libfabric_topology.cpp 
+++ b/src/utils/libfabric/libfabric_topology.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -449,31 +450,81 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { return NIXL_ERR_BACKEND; } + int device_count = 0; + int mapped_count = 0; for (struct fi_info *cur = info; cur; cur = cur->next) { - if (cur->domain_attr && cur->domain_attr->name && cur->nic && cur->nic->bus_attr) { - std::string libfabric_name = cur->domain_attr->name; - // Extract PCIe address from bus_attr if available - if (cur->nic->bus_attr->bus_type == FI_BUS_PCI && - cur->nic->bus_attr->attr.pci.domain_id != FI_ADDR_UNSPEC) { - char pcie_addr[32]; - snprintf(pcie_addr, - sizeof(pcie_addr), - "%x:%02x:%02x.%x", - cur->nic->bus_attr->attr.pci.domain_id, - cur->nic->bus_attr->attr.pci.bus_id, - cur->nic->bus_attr->attr.pci.device_id, - cur->nic->bus_attr->attr.pci.function_id); - - std::string pcie_address = pcie_addr; - pcie_to_libfabric_map[pcie_address] = libfabric_name; - libfabric_to_pcie_map[libfabric_name] = pcie_address; - - NIXL_TRACE << "Mapped PCIe " << pcie_address << " → Libfabric " << libfabric_name - << " (provider: " << provider_name << ")"; + device_count++; + if (!cur->domain_attr || !cur->domain_attr->name) { + NIXL_DEBUG << "Device " << device_count << ": missing domain_attr or name"; + continue; + } + + std::string libfabric_name = cur->domain_attr->name; + NIXL_DEBUG << "Processing device: " << libfabric_name; + + if (!cur->nic) { + NIXL_DEBUG << " Device " << libfabric_name << ": nic is NULL"; + continue; + } + + if (!cur->nic->bus_attr) { + NIXL_DEBUG << " Device " << libfabric_name << ": bus_attr is NULL (likely bonded device)"; + continue; + } + + NIXL_DEBUG << " Device " << libfabric_name << ": bus_type=" << cur->nic->bus_attr->bus_type; + + if (cur->nic->bus_attr->bus_type != FI_BUS_PCI) { + NIXL_DEBUG << " Device " << libfabric_name << ": not a PCI device, trying sysfs fallback"; + + // Fallback: Try to get PCIe address from sysfs for 
bonded/virtual devices + std::string sysfs_path = "/sys/class/infiniband/" + libfabric_name + "/device"; + char resolved_path[PATH_MAX]; + if (realpath(sysfs_path.c_str(), resolved_path)) { + // Parse PCIe address from path like: /sys/devices/pci0000:6d/0000:6d:02.0/0000:6e:00.0 + std::string path_str(resolved_path); + size_t last_slash = path_str.rfind('/'); + if (last_slash != std::string::npos) { + std::string pcie_addr = path_str.substr(last_slash + 1); + // Verify format: domain:bus:device.function (e.g., 0000:6e:00.0) + if (pcie_addr.length() >= 7 && pcie_addr.find(':') != std::string::npos) { + pcie_to_libfabric_map[pcie_addr] = libfabric_name; + libfabric_to_pcie_map[libfabric_name] = pcie_addr; + mapped_count++; + NIXL_DEBUG << " Successfully mapped PCIe " << pcie_addr << " → " << libfabric_name << " (via sysfs)"; + continue; + } + } } + NIXL_DEBUG << " Device " << libfabric_name << ": sysfs fallback failed"; + continue; } + + if (cur->nic->bus_attr->attr.pci.domain_id == FI_ADDR_UNSPEC) { + NIXL_DEBUG << " Device " << libfabric_name << ": PCIe domain_id is FI_ADDR_UNSPEC"; + continue; + } + + // Extract PCIe address from bus_attr if available + char pcie_addr[32]; + snprintf(pcie_addr, + sizeof(pcie_addr), + "%x:%02x:%02x.%x", + cur->nic->bus_attr->attr.pci.domain_id, + cur->nic->bus_attr->attr.pci.bus_id, + cur->nic->bus_attr->attr.pci.device_id, + cur->nic->bus_attr->attr.pci.function_id); + + std::string pcie_address = pcie_addr; + pcie_to_libfabric_map[pcie_address] = libfabric_name; + libfabric_to_pcie_map[libfabric_name] = pcie_address; + mapped_count++; + + NIXL_DEBUG << " Successfully mapped PCIe " << pcie_address << " → Libfabric " << libfabric_name; } + NIXL_DEBUG << "PCIe mapping: processed " << device_count << " devices, successfully mapped " << mapped_count; + fi_freeinfo(info); fi_freeinfo(hints); NIXL_TRACE << "Built PCIe to Libfabric mapping for " << pcie_to_libfabric_map.size() @@ -502,11 +553,16 @@ 
nixlLibfabricTopology::buildTopologyAwareGrouping() { // Step 1: Build NIC info structures by correlating libfabric with hwloc std::vector discovered_nics; std::vector discovered_gpus; + + NIXL_DEBUG << "Starting NIC discovery: pcie_to_libfabric_map has " << pcie_to_libfabric_map.size() << " entries"; + // Discover NICs by correlating libfabric devices with hwloc objects for (const auto &pair : pcie_to_libfabric_map) { const std::string &pcie_addr = pair.first; const std::string &libfabric_name = pair.second; + NIXL_DEBUG << "Processing NIC: libfabric_name=" << libfabric_name << ", pcie_addr=" << pcie_addr; + // Parse PCIe address uint16_t domain_id; uint8_t bus_id, device_id, function_id; @@ -520,6 +576,9 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { continue; } + NIXL_DEBUG << "Parsed PCIe address: domain=" << domain_id << ", bus=" << (int)bus_id + << ", device=" << (int)device_id << ", function=" << (int)function_id; + // Find corresponding hwloc object hwloc_obj_t hwloc_node = hwloc_get_pcidev_by_busid(hwloc_topology, domain_id, bus_id, device_id, function_id); @@ -533,15 +592,23 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { nic.device_id = device_id; nic.function_id = function_id; discovered_nics.push_back(nic); - NIXL_TRACE << "Correlated NIC: " << pcie_addr << " → " << libfabric_name; + NIXL_DEBUG << "Successfully correlated NIC: " << pcie_addr << " → " << libfabric_name; } else { NIXL_WARN << "Could not find hwloc object for PCIe address: " << pcie_addr; } } + + NIXL_DEBUG << "NIC discovery complete: found " << discovered_nics.size() << " NICs"; + // Step 2: Discover GPUs + NIXL_DEBUG << "Starting GPU discovery"; hwloc_obj_t pci_obj = nullptr; + int pci_device_count = 0; + int gpu_count = 0; while ((pci_obj = hwloc_get_next_pcidev(hwloc_topology, pci_obj)) != nullptr) { + pci_device_count++; if (isNvidiaGpu(pci_obj) || isIntelHpu(pci_obj)) { + gpu_count++; GpuInfo gpu; gpu.hwloc_node = pci_obj; gpu.domain_id = 
pci_obj->attr->pcidev.domain; @@ -549,8 +616,11 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { gpu.device_id = pci_obj->attr->pcidev.dev; gpu.function_id = pci_obj->attr->pcidev.func; discovered_gpus.push_back(gpu); + NIXL_DEBUG << "Found GPU at " << std::hex << gpu.domain_id << ":" + << (int)gpu.bus_id << ":" << (int)gpu.device_id << "." << (int)gpu.function_id << std::dec; } } + NIXL_DEBUG << "GPU discovery complete: scanned " << pci_device_count << " PCI devices, found " << discovered_gpus.size() << " GPUs"; NIXL_TRACE << "Discovered " << discovered_nics.size() << " NICs and " << discovered_gpus.size() << " GPUs for grouping"; From 74dca53af4048d30a9227f3fe73eba6e4da51908 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Tue, 7 Oct 2025 15:12:14 -0700 Subject: [PATCH 07/14] libfabric: assign bond dev to all GPUs v closest --- src/utils/libfabric/libfabric_topology.cpp | 48 ++++++++++++++++++---- src/utils/libfabric/libfabric_topology.h | 3 +- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 7857a156b4..06adda0d42 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -488,7 +488,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { std::string pcie_addr = path_str.substr(last_slash + 1); // Verify format: domain:bus:device.function (e.g., 0000:6e:00.0) if (pcie_addr.length() >= 7 && pcie_addr.find(':') != std::string::npos) { - pcie_to_libfabric_map[pcie_addr] = libfabric_name; + pcie_to_libfabric_map[pcie_addr].push_back(libfabric_name); libfabric_to_pcie_map[libfabric_name] = pcie_addr; mapped_count++; NIXL_DEBUG << " Successfully mapped PCIe " << pcie_addr << " → " << libfabric_name << " (via sysfs)"; @@ -516,7 +516,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { cur->nic->bus_attr->attr.pci.function_id); std::string pcie_address = pcie_addr; - 
pcie_to_libfabric_map[pcie_address] = libfabric_name; + pcie_to_libfabric_map[pcie_address].push_back(libfabric_name); libfabric_to_pcie_map[libfabric_name] = pcie_address; mapped_count++; @@ -554,14 +554,18 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { std::vector discovered_nics; std::vector discovered_gpus; - NIXL_DEBUG << "Starting NIC discovery: pcie_to_libfabric_map has " << pcie_to_libfabric_map.size() << " entries"; + NIXL_DEBUG << "Starting NIC discovery: pcie_to_libfabric_map has " << pcie_to_libfabric_map.size() << " PCIe addresses"; // Discover NICs by correlating libfabric devices with hwloc objects for (const auto &pair : pcie_to_libfabric_map) { const std::string &pcie_addr = pair.first; - const std::string &libfabric_name = pair.second; + const std::vector &libfabric_devices = pair.second; - NIXL_DEBUG << "Processing NIC: libfabric_name=" << libfabric_name << ", pcie_addr=" << pcie_addr; + NIXL_DEBUG << "Processing PCIe address " << pcie_addr << " with " << libfabric_devices.size() << " libfabric device(s)"; + + // Process all libfabric devices that share this PCIe address + for (const std::string &libfabric_name : libfabric_devices) { + NIXL_DEBUG << " Processing device: " << libfabric_name; // Parse PCIe address uint16_t domain_id; @@ -592,11 +596,12 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { nic.device_id = device_id; nic.function_id = function_id; discovered_nics.push_back(nic); - NIXL_DEBUG << "Successfully correlated NIC: " << pcie_addr << " → " << libfabric_name; + NIXL_DEBUG << " Successfully correlated NIC: " << pcie_addr << " → " << libfabric_name; } else { - NIXL_WARN << "Could not find hwloc object for PCIe address: " << pcie_addr; + NIXL_WARN << " Could not find hwloc object for PCIe address: " << pcie_addr; } - } + } // end for each libfabric device + } // end for each PCIe address NIXL_DEBUG << "NIC discovery complete: found " << discovered_nics.size() << " NICs"; @@ -667,6 +672,33 @@ 
nixlLibfabricTopology::buildTopologyAwareGrouping() { } } } + + // Step 5: Handle bonded devices - if all NICs share the same PCIe address, + // assign them to all GPUs instead of just the closest one + if (!discovered_nics.empty() && pcie_to_libfabric_map.size() == 1) { + // All NICs share a single PCIe address - this is a bonded device + const std::string &bonded_pcie_addr = pcie_to_libfabric_map.begin()->first; + const std::vector &bonded_devices = pcie_to_libfabric_map.begin()->second; + + // Deduplicate device names - libfabric may report the same bonded device multiple times + std::vector unique_devices; + std::set seen; + for (const auto &dev : bonded_devices) { + if (seen.insert(dev).second) { + unique_devices.push_back(dev); + } + } + + NIXL_INFO << "Detected bonded device at PCIe " << bonded_pcie_addr + << " with " << bonded_devices.size() << " instances (" << unique_devices.size() << " unique)"; + NIXL_INFO << "Assigning bonded device to all " << discovered_gpus.size() << " GPUs (bond driver handles load balancing)"; + + // Assign unique bonded device instances to all GPUs + for (size_t gpu_idx = 0; gpu_idx < discovered_gpus.size(); ++gpu_idx) { + gpu_to_nics[static_cast(gpu_idx)] = unique_devices; + } + } + return NIXL_SUCCESS; } diff --git a/src/utils/libfabric/libfabric_topology.h b/src/utils/libfabric/libfabric_topology.h index 68120def23..eafc356b58 100644 --- a/src/utils/libfabric/libfabric_topology.h +++ b/src/utils/libfabric/libfabric_topology.h @@ -55,7 +55,8 @@ class nixlLibfabricTopology { hwloc_topology_t hwloc_topology; // PCIe to Libfabric device mapping - std::map pcie_to_libfabric_map; + // One PCIe address can have multiple libfabric devices (bonded case) + std::map> pcie_to_libfabric_map; std::map libfabric_to_pcie_map; // Helper methods From 3917898934673e8f503f66c61004d80c901a4ac7 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Wed, 8 Oct 2025 06:43:27 -0700 Subject: [PATCH 08/14] libfabric: deduplicate virtual device list Use the 
unique device list to resolve double free Signed-off-by: Jerome Anand Signed-off-by: Tushar Gohad --- src/utils/libfabric/libfabric_topology.cpp | 39 ++++++++++++++-------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/utils/libfabric/libfabric_topology.cpp b/src/utils/libfabric/libfabric_topology.cpp index 06adda0d42..6eb6cad136 100644 --- a/src/utils/libfabric/libfabric_topology.cpp +++ b/src/utils/libfabric/libfabric_topology.cpp @@ -468,7 +468,7 @@ nixlLibfabricTopology::buildPcieToLibfabricMapping() { } if (!cur->nic->bus_attr) { - NIXL_DEBUG << " Device " << libfabric_name << ": bus_attr is NULL (likely bonded device)"; + NIXL_DEBUG << " Device " << libfabric_name << ": bus_attr is NULL (likely virtual device, bonded NIC, etc.)"; continue; } @@ -563,8 +563,21 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { NIXL_DEBUG << "Processing PCIe address " << pcie_addr << " with " << libfabric_devices.size() << " libfabric device(s)"; - // Process all libfabric devices that share this PCIe address - for (const std::string &libfabric_name : libfabric_devices) { + // Deduplicate device names (libfabric may return the same device multiple times) + std::set seen; + std::vector unique_devices; + for (const auto &dev : libfabric_devices) { + if (seen.insert(dev).second) { + unique_devices.push_back(dev); + } + } + + if (unique_devices.size() < libfabric_devices.size()) { + NIXL_DEBUG << " Deduplicated " << libfabric_devices.size() << " → " << unique_devices.size() << " devices"; + } + + // Process all unique libfabric devices that share this PCIe address + for (const std::string &libfabric_name : unique_devices) { NIXL_DEBUG << " Processing device: " << libfabric_name; // Parse PCIe address @@ -673,27 +686,27 @@ nixlLibfabricTopology::buildTopologyAwareGrouping() { } } - // Step 5: Handle bonded devices - if all NICs share the same PCIe address, + // Step 5: Handle virtual devices - if all NICs share the same PCIe address, // assign them to all 
GPUs instead of just the closest one if (!discovered_nics.empty() && pcie_to_libfabric_map.size() == 1) { - // All NICs share a single PCIe address - this is a bonded device - const std::string &bonded_pcie_addr = pcie_to_libfabric_map.begin()->first; - const std::vector &bonded_devices = pcie_to_libfabric_map.begin()->second; + // All NICs share a single PCIe address - this is a virtual device + const std::string &vdev_pcie_addr = pcie_to_libfabric_map.begin()->first; + const std::vector &vdev_devices = pcie_to_libfabric_map.begin()->second; - // Deduplicate device names - libfabric may report the same bonded device multiple times + // Deduplicate device names - libfabric may report the same virtual device multiple times std::vector unique_devices; std::set seen; - for (const auto &dev : bonded_devices) { + for (const auto &dev : vdev_devices) { if (seen.insert(dev).second) { unique_devices.push_back(dev); } } - NIXL_INFO << "Detected bonded device at PCIe " << bonded_pcie_addr - << " with " << bonded_devices.size() << " instances (" << unique_devices.size() << " unique)"; - NIXL_INFO << "Assigning bonded device to all " << discovered_gpus.size() << " GPUs (bond driver handles load balancing)"; + NIXL_INFO << "Detected virtual device at PCIe " << vdev_pcie_addr + << " with " << vdev_devices.size() << " instances (" << unique_devices.size() << " unique)"; + NIXL_INFO << "Assigning virtual device to all " << discovered_gpus.size() << " GPUs (if bond, lower layer handles load balancing)"; - // Assign unique bonded device instances to all GPUs + // Assign unique virtual device instances to all GPUs for (size_t gpu_idx = 0; gpu_idx < discovered_gpus.size(); ++gpu_idx) { gpu_to_nics[static_cast(gpu_idx)] = unique_devices; } From 3e28f4338a470ca02e271c738568482825d01f19 Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Tue, 7 Oct 2025 18:06:32 -0700 Subject: [PATCH 09/14] libfabric: Use desc-specific target offset This fixes a bug in multi-descriptor transfers where 
descriptors point to different offsets within the same registered memory region. Without this fix, RDMA reads always target offset 0. Should extract each descriptor's specific target address instead. Also impacted: Block-based transfers (Iteration N would read blocks from iteration 0, etc), Partial buffer updates, etc. Signed-off-by: Tushar Gohad --- src/plugins/libfabric/libfabric_backend.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/plugins/libfabric/libfabric_backend.cpp b/src/plugins/libfabric/libfabric_backend.cpp index f0ea8bde49..8bbd3264e5 100644 --- a/src/plugins/libfabric/libfabric_backend.cpp +++ b/src/plugins/libfabric/libfabric_backend.cpp @@ -1081,7 +1081,8 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation, int gpu_id = local[desc_idx].devId; NIXL_DEBUG << "Processing descriptor " << desc_idx << " GPU " << gpu_id - << " addr: " << transfer_addr << " size: " << transfer_size; + << " local_addr: " << transfer_addr << " size: " << transfer_size + << " remote_addr: " << (void *)remote[desc_idx].addr; NIXL_DEBUG << "DEBUG: remote_agent='" << remote_agent << "' localAgent='" << localAgent << "'"; @@ -1117,11 +1118,14 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation, } // Prepare and submit transfer for remote agents + // Use descriptor's specific target address + uint64_t remote_target_addr = remote[desc_idx].addr; + nixl_status_t status = rail_manager.prepareAndSubmitTransfer( op_type, transfer_addr, transfer_size, - remote_md->remote_buf_addr_, + remote_target_addr, local_md->selected_rails_, local_md->rail_mr_list_, remote_md->rail_remote_key_list_, From 22a4dfb22e09965d2bcb62828965fb45e64fee1d Mon Sep 17 00:00:00 2001 From: Tushar Gohad Date: Wed, 8 Oct 2025 23:16:43 -0700 Subject: [PATCH 10/14] libfabric: Unit tests for libfabric backend - Create 16 descriptors at different 64KB offsets in a 1MB buffer - Attempt RDMA transfers --- .../libfabric_backend_integration_test.cpp | 299 
++++++++++++++++++ test/unit/plugins/libfabric/meson.build | 69 ++++ test/unit/plugins/meson.build | 7 + 3 files changed, 375 insertions(+) create mode 100644 test/unit/plugins/libfabric/libfabric_backend_integration_test.cpp create mode 100644 test/unit/plugins/libfabric/meson.build diff --git a/test/unit/plugins/libfabric/libfabric_backend_integration_test.cpp b/test/unit/plugins/libfabric/libfabric_backend_integration_test.cpp new file mode 100644 index 0000000000..0a4fc38d56 --- /dev/null +++ b/test/unit/plugins/libfabric/libfabric_backend_integration_test.cpp @@ -0,0 +1,299 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025 Amazon.com, Inc. and affiliates. + * SPDX-License-Identifier: Apache-2.0 + * + * Integration test for libfabric backend descriptor offset handling + * Tests the actual backend with multiple descriptors pointing to different offsets + * within the same registered memory region. 
+ */ + +#include +#include +#include +#include +#include + +#include "libfabric_backend.h" +#include "common/nixl_log.h" + +using namespace std; + +nixlLibfabricEngine * +createEngine(std::string name, bool p_thread) { + nixlBackendInitParams init; + nixl_b_params_t custom_params; + + init.enableProgTh = p_thread; + init.pthrDelay = 100; + init.localAgent = name; + init.customParams = &custom_params; + init.type = "LIBFABRIC"; + + auto engine = new nixlLibfabricEngine(&init); + assert(!engine->getInitErr()); + if (engine->getInitErr()) { + std::cout << "Failed to initialize libfabric engine" << std::endl; + exit(1); + } + + return engine; +} + +void +releaseEngine(nixlLibfabricEngine *engine) { + delete engine; +} + +void +allocateAndRegister(nixlLibfabricEngine *engine, + int dev_id, + nixl_mem_t mem_type, + void *&addr, + size_t len, + nixlBackendMD *&md) { + nixlBlobDesc desc; + + // Allocate buffer + addr = calloc(1, len); + assert(addr != nullptr); + + desc.addr = (uintptr_t)addr; + desc.len = len; + desc.devId = dev_id; + + int ret = engine->registerMem(desc, mem_type, md); + assert(ret == NIXL_SUCCESS); +} + +void +deallocateAndDeregister(nixlLibfabricEngine *engine, + int dev_id, + nixl_mem_t mem_type, + void *&addr, + nixlBackendMD *&md) { + engine->deregisterMem(md); + free(addr); +} + +void +loadRemote(nixlLibfabricEngine *engine, + int dev_id, + std::string agent, + nixl_mem_t mem_type, + void *addr, + size_t len, + nixlBackendMD *&lmd, + nixlBackendMD *&rmd) { + nixlBlobDesc info; + info.addr = (uintptr_t)addr; + info.len = len; + info.devId = dev_id; + engine->getPublicData(lmd, info.metaInfo); + + assert(info.metaInfo.size() > 0); + + int ret = engine->loadRemoteMD(info, mem_type, agent, rmd); + assert(NIXL_SUCCESS == ret); +} + +void +populateDescs(nixl_meta_dlist_t &descs, int dev_id, void *addr, int desc_cnt, size_t desc_size, + nixlBackendMD *&md) { + for (int i = 0; i < desc_cnt; i++) { + nixlMetaDesc req; + req.addr = (uintptr_t)(((char *)addr) 
+ i * desc_size); // Different offset per descriptor + req.len = desc_size; + req.devId = dev_id; + req.metadataP = md; + descs.addDesc(req); + } +} + +void +performTransfer(nixlLibfabricEngine *engine1, + nixlLibfabricEngine *engine2, + nixl_meta_dlist_t &req_src_descs, + nixl_meta_dlist_t &req_dst_descs, + void *addr1, + void *addr2, + size_t total_len, + nixl_xfer_op_t op) { + + std::string remote_agent("Agent2"); + if (engine1 == engine2) + remote_agent = "Agent1"; + + std::cout << "\t" << (op == NIXL_READ ? "READ" : "WRITE") << " from " << addr1 << " to " + << addr2 << " (" << total_len << " bytes, " << req_src_descs.descCount() + << " descriptors)\n"; + + nixl_opt_b_args_t opt_args; + opt_args.hasNotif = false; + + // Prepare and post transfer + nixlBackendReqH *handle = nullptr; + nixl_status_t ret = engine1->prepXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); + assert(ret == NIXL_SUCCESS); + + ret = engine1->postXfer(op, req_src_descs, req_dst_descs, remote_agent, handle, &opt_args); + assert(ret == NIXL_SUCCESS || ret == NIXL_IN_PROG); + + if (ret == NIXL_SUCCESS) { + cout << "\t\tTransfer completed immediately\n"; + } else { + cout << "\t\tWaiting for transfer completion...\n"; + while (ret == NIXL_IN_PROG) { + ret = engine1->checkXfer(handle); + // checkXfer() already progresses rails when progress thread is disabled + assert(ret == NIXL_SUCCESS || ret == NIXL_IN_PROG); + } + } + + engine1->releaseReqH(handle); + cout << "\t\tTransfer complete\n"; +} + +void +test_multi_descriptor_offsets(bool p_thread) { + std::cout << "\n\n"; + std::cout << "****************************************************\n"; + std::cout << " Multi-descriptor offset test (Integration)\n"; + std::cout << " P-Thread=" << (p_thread ? 
"ON" : "OFF") << "\n"; + std::cout << "****************************************************\n"; + std::cout << "\n"; + + std::string agent1("Agent1"); + std::string agent2("Agent2"); + + // Create engines + nixlLibfabricEngine *engine1 = createEngine(agent1, p_thread); + nixlLibfabricEngine *engine2 = createEngine(agent2, p_thread); + + // Test parameters + const size_t TOTAL_SIZE = 1024 * 1024; // 1MB total + const size_t DESC_SIZE = 64 * 1024; // 64KB per descriptor + const int DESC_COUNT = TOTAL_SIZE / DESC_SIZE; // 16 descriptors + + std::cout << "Test configuration:\n"; + std::cout << " Total buffer size: " << TOTAL_SIZE << " bytes\n"; + std::cout << " Descriptor size: " << DESC_SIZE << " bytes\n"; + std::cout << " Descriptor count: " << DESC_COUNT << "\n\n"; + + // Allocate and register buffers + void *send_buf = nullptr; + void *recv_buf = nullptr; + nixlBackendMD *send_md = nullptr; + nixlBackendMD *recv_md = nullptr; + + allocateAndRegister(engine1, 0, DRAM_SEG, send_buf, TOTAL_SIZE, send_md); + allocateAndRegister(engine2, 0, DRAM_SEG, recv_buf, TOTAL_SIZE, recv_md); + + // Fill send buffer with unique pattern for each descriptor's region + for (int i = 0; i < DESC_COUNT; i++) { + size_t offset = i * DESC_SIZE; + uint8_t pattern = static_cast(i); + for (size_t j = 0; j < DESC_SIZE; j++) { + ((uint8_t *)send_buf)[offset + j] = pattern; + } + } + + // Zero receive buffer + memset(recv_buf, 0, TOTAL_SIZE); + + // Exchange connection info + std::string conn1, conn2; + engine1->getConnInfo(conn1); + engine2->getConnInfo(conn2); + + engine1->loadRemoteConnInfo(agent2, conn2); + engine2->loadRemoteConnInfo(agent1, conn1); + + std::cout << "Establishing connections...\n"; + engine1->connect(agent2); + engine2->connect(agent1); + + // Wait for async connection establishment to complete + // The CM thread handles connection progress + sleep(2); + std::cout << "Connections established\n\n"; + + // Load remote metadata + nixlBackendMD *recv_rmd = nullptr; + + 
loadRemote(engine1, 0, agent2, DRAM_SEG, recv_buf, TOTAL_SIZE, recv_md, recv_rmd); + + // Create descriptor lists with different offsets + nixl_meta_dlist_t src_descs(DRAM_SEG); + nixl_meta_dlist_t dst_descs(DRAM_SEG); + + populateDescs(src_descs, 0, send_buf, DESC_COUNT, DESC_SIZE, send_md); + populateDescs(dst_descs, 0, recv_buf, DESC_COUNT, DESC_SIZE, recv_rmd); + + std::cout << "Created " << src_descs.descCount() << " source descriptors\n"; + std::cout << "Created " << dst_descs.descCount() << " destination descriptors\n\n"; + + // Perform transfer + performTransfer(engine1, engine2, src_descs, dst_descs, send_buf, recv_buf, TOTAL_SIZE, NIXL_WRITE); + + // Verify data correctness for each descriptor's region + std::cout << "\nData verification:\n"; + bool all_correct = true; + + for (int i = 0; i < DESC_COUNT; i++) { + size_t offset = i * DESC_SIZE; + uint8_t expected_pattern = static_cast(i); + bool desc_correct = true; + + for (size_t j = 0; j < DESC_SIZE; j++) { + if (((uint8_t *)recv_buf)[offset + j] != expected_pattern) { + std::cerr << " ERROR: Descriptor " << i << " at offset " << offset + j + << " has wrong data: expected " << (int)expected_pattern << ", got " + << (int)((uint8_t *)recv_buf)[offset + j] << "\n"; + desc_correct = false; + all_correct = false; + break; // Only report first mismatch per descriptor + } + } + + if (desc_correct) { + std::cout << " Descriptor " << i << " (offset " << offset << "): OK (pattern " + << (int)expected_pattern << ")\n"; + } + } + + if (all_correct) { + std::cout << "\n✓ ALL DESCRIPTORS VERIFIED SUCCESSFULLY\n"; + std::cout << " Each descriptor transferred data from its correct offset\n"; + } else { + std::cerr << "\n✗ DATA CORRUPTION DETECTED\n"; + std::cerr << " Some descriptors received data from wrong offsets\n"; + std::cerr << " This indicates the descriptor offset bug is present!\n"; + exit(1); + } + + // Cleanup + engine1->disconnect(agent2); + engine2->disconnect(agent1); + + deallocateAndDeregister(engine1, 
0, DRAM_SEG, send_buf, send_md); + deallocateAndDeregister(engine2, 0, DRAM_SEG, recv_buf, recv_md); + + releaseEngine(engine1); + releaseEngine(engine2); + + std::cout << "\nTest completed successfully!\n"; +} + +int +main(int argc, char **argv) { + bool p_thread = false; + + if (argc > 1 && std::string(argv[1]) == "--pthread") { + p_thread = true; + } + + test_multi_descriptor_offsets(p_thread); + + return 0; +} diff --git a/test/unit/plugins/libfabric/meson.build b/test/unit/plugins/libfabric/meson.build new file mode 100644 index 0000000000..0abcefe4bb --- /dev/null +++ b/test/unit/plugins/libfabric/meson.build @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 Intel Corporation. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# Libfabric plugin integration tests + +libfabric_backend_dep = declare_dependency( + link_with: libfabric_backend_lib, + include_directories: [nixl_inc_dirs, '../../../../src/plugins/libfabric'] +) + +# Collect compile flags +compile_flags = [] +additional_deps = [] + +if cuda_dep.found() + additional_deps += [cuda_dep] + compile_flags += ['-DHAVE_CUDA'] +endif + +if synapseai_dep.found() + additional_deps += [synapseai_dep] + compile_flags += ['-DHAVE_SYNAPSEAI'] +endif + +libfabric_test_deps = [ + nixl_dep, + nixl_infra, + nixl_common_deps, + libfabric_utils_dep, + libfabric_backend_dep, + libfabric_dep, + thread_dep, +] + additional_deps + +# Integration test: Multi-descriptor offset handling with actual backend +test_backend_integration = executable( + 'test_libfabric_backend_integration', + 'libfabric_backend_integration_test.cpp', + dependencies: libfabric_test_deps, + cpp_args: compile_flags, + include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../../../src/plugins/libfabric'], + install: false, +) + +test('libfabric_backend_integration', + test_backend_integration, + suite: ['unit', 'plugins', 'libfabric', 'integration'], + timeout: 120, +) + +# Future integration tests can be added here following the same pattern: +# - Multi-GPU transfers +# - Different memory types (DRAM, VRAM) +# - Notification flow +# - Connection management diff --git a/test/unit/plugins/meson.build b/test/unit/plugins/meson.build index af5aa09378..0f30c29c3a 100644 --- a/test/unit/plugins/meson.build +++ b/test/unit/plugins/meson.build @@ -1,4 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 Amazon.com, Inc. and affiliates. +# SPDX-FileCopyrightText: Copyright (c) 2025 Intel Corporation. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -47,3 +49,8 @@ hf3fs_lib_found = cc.find_library(hf3fs_lib_file, dirs: [hf3fs_lib_path], requir if hf3fs_lib_found.found() subdir('hf3fs') endif + +# Libfabric plugin tests +if libfabric_dep.found() + subdir('libfabric') +endif From 5b6c0379bf9e906771d89544db4a9132c02f76fa Mon Sep 17 00:00:00 2001 From: Jeeja Kp Date: Thu, 16 Oct 2025 12:53:14 +0300 Subject: [PATCH 11/14] Fix synapse dependency in meson Signed-off-by: Jeeja Kp --- meson.build | 30 ++++++++++++++++++++++------ meson_options.txt | 2 ++ src/utils/libfabric/libfabric_rail.h | 4 ++-- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/meson.build b/meson.build index 6eebe4e921..767217d15b 100644 --- a/meson.build +++ b/meson.build @@ -105,13 +105,31 @@ else endif # SynapseAI (Habana Gaudi) dependency detection -# Try to find both libSynapse and hl-thunk libraries -synapse_lib = cpp.find_library('Synapse', - dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'], +synapse_inc_path = get_option('synapsepath_inc') +synapse_lib_path = get_option('synapsepath_lib') + +if synapse_lib_path == '' + #use default path + # Try to find both libSynapse and hl-thunk libraries + synapse_lib = cpp.find_library('Synapse', + dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'], required: false) -hlthunk_lib = cpp.find_library('hl-thunk', + hlthunk_lib = cpp.find_library('hl-thunk', dirs: ['/usr/lib/habanalabs', '/usr/local/lib/habanalabs'], required: false) +else + synapse_lib = cpp.find_library('Synapse', + dirs: [synapse_lib_path], + required: false) + hlthunk_lib = cpp.find_library('hl-thunk', + dirs: [synapse_lib_path], + required: false) +endif + +if synapse_inc_path == '' + #use default path + synapse_inc_path = '/usr/include/habanalabs' +endif # SynapseAI support requires both libraries synapseai_dep = dependency('', required: false) # Initialize as not found @@ -127,8 +145,8 @@ if 
synapseai_dep.found() synapseai_dep = declare_dependency( dependencies: synapseai_dep, include_directories: [ - include_directories('/usr/include/habanalabs'), - include_directories('/usr/include/drm') + include_directories('/usr/include/drm'), + include_directories(synapse_inc_path) ] ) message('Found SynapseAI support for Habana Gaudi devices') diff --git a/meson_options.txt b/meson_options.txt index a316184f8d..2ef181b048 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -24,6 +24,8 @@ option('gds_path', type: 'string', value: '/usr/local/cuda/', description: 'Path option('cudapath_inc', type: 'string', value: '', description: 'Include path for CUDA') option('cudapath_lib', type: 'string', value: '', description: 'Library path for CUDA') option('cudapath_stub', type: 'string', value: '', description: 'Extra Stub path for CUDA') +option('synapsepath_inc', type: 'string', value: '', description: 'Include path for GAUDI') +option('synapsepath_lib', type: 'string', value: '', description: 'Library path for GAUDI') option('static_plugins', type: 'string', value: '', description: 'Plugins to be built-in, comma-separated') option('build_docs', type: 'boolean', value: false, description: 'Build Doxygen documentation') option('log_level', type: 'combo', choices: ['trace', 'debug', 'info', 'warning', 'error', 'fatal', 'auto'], value: 'auto', description: 'Log Level (auto: auto-detect based on build type: trace for debug builds, info for release builds)') diff --git a/src/utils/libfabric/libfabric_rail.h b/src/utils/libfabric/libfabric_rail.h index d8c7d4439f..db23bbab85 100644 --- a/src/utils/libfabric/libfabric_rail.h +++ b/src/utils/libfabric/libfabric_rail.h @@ -31,8 +31,8 @@ #include "libfabric/libfabric_common.h" #ifdef HAVE_SYNAPSEAI -#include -#include +#include +#include #endif // Forward declarations From b4ad7ee718a2ab72346db95644f80229e2e690e8 Mon Sep 17 00:00:00 2001 From: Jeeja Kp Date: Mon, 27 Oct 2025 09:28:28 +0200 Subject: [PATCH 12/14] Add 
support to nixl_test to verfiy libfabric backend - add libfabric backend to nixl test - add libfabric Gaudi->Gaudi transfer test Signed-off-by: Jeeja Kp --- meson_options.txt | 4 +- test/meson.build | 5 + test/nixl/meson.build | 19 ++- test/nixl/nixl_test.cpp | 213 ++++++++++++++++++++++----------- test/utils/meson.build | 34 ++++++ test/utils/synapseai_utils.cpp | 110 +++++++++++++++++ test/utils/synapseai_utils.h | 34 ++++++ 7 files changed, 346 insertions(+), 73 deletions(-) create mode 100644 test/utils/meson.build create mode 100644 test/utils/synapseai_utils.cpp create mode 100644 test/utils/synapseai_utils.h diff --git a/meson_options.txt b/meson_options.txt index 2ef181b048..9e4a5b4e6a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -24,8 +24,8 @@ option('gds_path', type: 'string', value: '/usr/local/cuda/', description: 'Path option('cudapath_inc', type: 'string', value: '', description: 'Include path for CUDA') option('cudapath_lib', type: 'string', value: '', description: 'Library path for CUDA') option('cudapath_stub', type: 'string', value: '', description: 'Extra Stub path for CUDA') -option('synapsepath_inc', type: 'string', value: '', description: 'Include path for GAUDI') -option('synapsepath_lib', type: 'string', value: '', description: 'Library path for GAUDI') +option('synapsepath_inc', type: 'string', value: '', description: 'Include path for Intel Gaudi/ HPU') +option('synapsepath_lib', type: 'string', value: '', description: 'Library path for Intel Gaudi/ HPU') option('static_plugins', type: 'string', value: '', description: 'Plugins to be built-in, comma-separated') option('build_docs', type: 'boolean', value: false, description: 'Build Doxygen documentation') option('log_level', type: 'combo', choices: ['trace', 'debug', 'info', 'warning', 'error', 'fatal', 'auto'], value: 'auto', description: 'Log Level (auto: auto-detect based on build type: trace for debug builds, info for release builds)') diff --git a/test/meson.build 
b/test/meson.build index aa9fd3acb6..0b0ec96f9d 100644 --- a/test/meson.build +++ b/test/meson.build @@ -13,6 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +subdir('utils') + +# Re-export dependency to project root +test_synapseai_utils_dep = test_synapseai_utils_dep + subdir('nixl') subdir('unit') subdir('gtest') diff --git a/test/nixl/meson.build b/test/nixl/meson.build index bc6a9c9ffa..f8ac290c2f 100644 --- a/test/nixl/meson.build +++ b/test/nixl/meson.build @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + desc_example = executable('desc_example', 'desc_example.cpp', dependencies: [nixl_dep, nixl_infra], @@ -27,10 +28,20 @@ agent_example = executable('agent_example', link_with: [serdes_lib], install: true) -nixl_test_app = executable('nixl_test', 'nixl_test.cpp', - dependencies: [nixl_dep, nixl_infra, stream_interface, thread_dep], - include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../src/utils/serdes'], - link_with: [serdes_lib], install: true) +if synapseai_dep.found() + compile_flags += ['-DHAVE_SYNAPSEAI'] + nixl_test_app = executable('nixl_test', 'nixl_test.cpp', + cpp_args: compile_flags, + dependencies: [nixl_dep, nixl_infra, stream_interface, thread_dep, test_synapseai_utils_dep], + include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../src/utils/serdes'], + link_with: [serdes_lib], install: true) +else + nixl_test_app = executable('nixl_test', 'nixl_test.cpp', + dependencies: [nixl_dep, nixl_infra, stream_interface, thread_dep], + include_directories: [nixl_inc_dirs, utils_inc_dirs, '../../src/utils/serdes'], + link_with: [serdes_lib], install: true) +endif + plugin_test = executable('test_plugin', 'test_plugin.cpp', diff --git a/test/nixl/nixl_test.cpp b/test/nixl/nixl_test.cpp index 03095c2cf3..8a8ed01364 100644 --- a/test/nixl/nixl_test.cpp +++ b/test/nixl/nixl_test.cpp @@ -27,7 +27,9 @@ #include 
"serdes/serdes.h" #include #include - +#ifdef HAVE_SYNAPSEAI +#include "synapseai_utils.h" +#endif #define NUM_TRANSFERS 2 #define NUM_THREADS 4 #define SIZE 1024 @@ -48,30 +50,39 @@ struct SharedNotificationState { static const std::string target("target"); static const std::string initiator("initiator"); -static std::vector> initMem(nixlAgent &agent, - nixl_reg_dlist_t &dram, - nixl_opt_args_t *extra_params, - uint8_t val) { +static std::vector> +initMem(nixlAgent &agent, nixl_reg_dlist_t &mem_dlist, nixl_opt_args_t *extra_params, uint8_t val) { std::vector> addrs; for (int i = 0; i < NUM_TRANSFERS; i++) { auto addr = std::make_unique(SIZE); - std::fill_n(addr.get(), SIZE, val); - std::cout << "Allocating : " << (void *)addr.get() << ", " - << "Setting to 0x" << std::hex << (unsigned)val << std::dec << std::endl; - dram.addDesc(nixlBlobDesc((uintptr_t)(addr.get()), SIZE, 0, "")); +#ifdef HAVE_SYNAPSEAI + auto device_buffer = Synapseaiutils::allocate_synapse_memory(SIZE, addr.get()); + + std::cout << "Allocating : " << addr.get() << ", " << "Setting to 0x" << std::hex + << (unsigned)val << std::dec << std::endl; + mem_dlist.addDesc(nixlBlobDesc( + (uintptr_t)(device_buffer), SIZE, Synapseaiutils::get_device_handle(), "")); +#else + mem_dlist.addDesc(nixlBlobDesc((uintptr_t)(addr.get()), SIZE, 0, "")); +#endif addrs.push_back(std::move(addr)); } - agent.registerMem(dram, extra_params); + agent.registerMem(mem_dlist, extra_params); return addrs; } -static void targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int thread_id) { - nixl_reg_dlist_t dram_for_ucx(DRAM_SEG); - auto addrs = initMem(agent, dram_for_ucx, extra_params, 0); +static void +targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int thread_id, std::string backend) { +#ifdef HAVE_SYNAPSEAI + nixl_reg_dlist_t mem_dlist(VRAM_SEG); +#else + nixl_reg_dlist_t mem_dlist(DRAM_SEG); +#endif + auto addrs = initMem(agent, mem_dlist, extra_params, 0); nixl_blob_t tgt_metadata; 
agent.getLocalMD(tgt_metadata); @@ -79,47 +90,68 @@ static void targetThread(nixlAgent &agent, nixl_opt_args_t *extra_params, int th std::cout << "Thread " << thread_id << " Start Control Path metadata exchanges\n"; std::cout << "Thread " << thread_id << " Desc List from Target to Initiator\n"; - dram_for_ucx.print(); + mem_dlist.print(); /** Only send desc list */ nixlSerDes serdes; - assert(dram_for_ucx.trim().serialize(&serdes) == NIXL_SUCCESS); + assert(mem_dlist.trim().serialize(&serdes) == NIXL_SUCCESS); std::cout << "Thread " << thread_id << " Wait for initiator and then send xfer descs\n"; std::string message = serdes.exportStr(); - while (agent.genNotif(initiator, message, extra_params) != NIXL_SUCCESS); - std::cout << "Thread " << thread_id << " End Control Path metadata exchanges\n"; + + while (agent.genNotif(initiator, message, extra_params) != NIXL_SUCCESS) + ; std::cout << "Thread " << thread_id << " Start Data Path Exchanges\n"; std::cout << "Thread " << thread_id << " Waiting to receive Data from Initiator\n"; bool rc = false; for (int n_tries = 0; !rc && n_tries < 100; n_tries++) { - //Only works with progress thread now, as backend is protected + // Only works with progress thread now, as backend is protected /** Sanity Check */ +#ifdef HAVE_SYNAPSEAI + for (int i = 0; i < mem_dlist.descCount(); ++i) { + nixlBlobDesc desc = mem_dlist[i]; + Synapseaiutils::copy_from_device_buffer((uint64_t)desc.addr, addrs[i].get(), desc.len); + } +#endif rc = std::all_of(addrs.begin(), addrs.end(), [](auto &addr) { - return std::all_of(addr.get(), addr.get() + SIZE, [](int x) { - return x == MEM_VAL; - }); + return std::all_of(addr.get(), addr.get() + SIZE, [](int x) { return x == MEM_VAL; }); }); - if (!rc) - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + if (!rc) std::this_thread::sleep_for(std::chrono::milliseconds(10)); } if (!rc) - std::cerr << "Thread " << thread_id << " UCX Transfer failed, buffers are different\n"; + std::cerr << "Thread " 
<< thread_id << " " << backend + << " Transfer failed, buffers are different\n"; else - std::cout << "Thread " << thread_id << " Transfer completed and Buffers match with Initiator\n" - << "Thread " << thread_id << " UCX Transfer Success!!!\n"; + std::cout << "Thread " << thread_id + << " Transfer completed and Buffers match with Initiator\n" + << "Thread " << thread_id << " " << backend << " Transfer Success!!!\n"; std::cout << "Thread " << thread_id << " Cleanup..\n"; - agent.deregisterMem(dram_for_ucx, extra_params); + agent.deregisterMem(mem_dlist, extra_params); +#ifdef HAVE_SYNAPSEAI + for (int i = 0; i < mem_dlist.descCount(); ++i) { + nixlBlobDesc desc = mem_dlist[i]; + Synapseaiutils::free_synapse_memory((uint64_t)desc.addr); + } +#endif } -static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, - const std::string &target_ip, int target_port, int thread_id, - SharedNotificationState &shared_state) { - nixl_reg_dlist_t dram_for_ucx(DRAM_SEG); - auto addrs = initMem(agent, dram_for_ucx, extra_params, MEM_VAL); +static void +initiatorThread(nixlAgent &agent, + nixl_opt_args_t *extra_params, + const std::string &target_ip, + int target_port, + int thread_id, + SharedNotificationState &shared_state, + std::string backend) { +#ifdef HAVE_SYNAPSEAI + nixl_reg_dlist_t mem_dlist(VRAM_SEG); +#else + nixl_reg_dlist_t mem_dlist(DRAM_SEG); +#endif + auto addrs = initMem(agent, mem_dlist, extra_params, MEM_VAL); std::cout << "Thread " << thread_id << " Start Control Path metadata exchanges\n"; std::cout << "Thread " << thread_id << " Exchange metadata with Target\n"; @@ -163,13 +195,15 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, } std::cout << "Thread " << thread_id << " Verify Deserialized Target's Desc List at Initiator\n"; - nixl_xfer_dlist_t dram_target_ucx(&remote_serdes); - nixl_xfer_dlist_t dram_initiator_ucx = dram_for_ucx.trim(); - dram_target_ucx.print(); + + nixl_xfer_dlist_t 
xfer_target_dlist(&remote_serdes); + nixl_xfer_dlist_t xfer_initiator_dlist = mem_dlist.trim(); + xfer_target_dlist.print(); std::cout << "Thread " << thread_id << " End Control Path metadata exchanges\n"; std::cout << "Thread " << thread_id << " Start Data Path Exchanges\n\n"; - std::cout << "Thread " << thread_id << " Create transfer request with UCX backend\n"; + std::cout << "Thread " << thread_id << " Create transfer request with " << backend + << " backend\n"; // Need to do this in a loop with NIXL_ERR_NOT_FOUND // UCX AM with desc list is faster than listener thread can recv/load MD with sockets @@ -177,8 +211,8 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, nixlXferReqH *treq; nixl_status_t ret = NIXL_SUCCESS; do { - ret = agent.createXferReq(NIXL_WRITE, dram_initiator_ucx, dram_target_ucx, - target, treq, extra_params); + ret = agent.createXferReq( + NIXL_WRITE, xfer_initiator_dlist, xfer_target_dlist, target, treq, extra_params); } while (ret == NIXL_ERR_NOT_FOUND); if (ret != NIXL_SUCCESS) { @@ -186,7 +220,7 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, exit(-1); } - std::cout << "Thread " << thread_id << " Post the request with UCX backend\n"; + std::cout << "Thread " << thread_id << " Post the request with " << backend << " backend\n"; ret = agent.postXferReq(treq); std::cout << "Thread " << thread_id << " Initiator posted Data Path transfer\n"; std::cout << "Thread " << thread_id << " Waiting for completion\n"; @@ -195,88 +229,133 @@ static void initiatorThread(nixlAgent &agent, nixl_opt_args_t *extra_params, ret = agent.getXferStatus(treq); assert(ret >= 0); } - std::cout << "Thread " << thread_id << " Completed Sending Data using UCX backend\n"; + std::cout << "Thread " << thread_id << " Completed Sending Data using " << backend + << " backend\n"; agent.releaseXferReq(treq); agent.invalidateLocalMD(&md_extra_params); std::cout << "Thread " << thread_id << " Cleanup..\n"; - 
agent.deregisterMem(dram_for_ucx, extra_params); + agent.deregisterMem(mem_dlist, extra_params); +#ifdef HAVE_SYNAPSEAI + for (int i = 0; i < mem_dlist.descCount(); ++i) { + nixlBlobDesc desc = mem_dlist[i]; + Synapseaiutils::free_synapse_memory((uint64_t)desc.addr); + } +#endif } -static void runTarget(const std::string &ip, int port, nixl_thread_sync_t sync_mode) { +static void +runTarget(const std::string &ip, int port, nixl_thread_sync_t sync_mode, std::string backend) { nixlAgentConfig cfg(true, true, port, sync_mode, 1, 0, 100000, false); + std::cout << "Starting Agent for target\n"; nixlAgent agent(target, cfg); nixl_b_params_t params = { - { "num_workers", "4" }, + {"num_workers", "4"}, }; - nixlBackendH *ucx; - agent.createBackend("UCX", params, ucx); + nixlBackendH *nixl_backend; + agent.createBackend(backend, params, nixl_backend); + +#ifdef HAVE_SYNAPSEAI + Synapseaiutils::init_synapse_device(); +#endif nixl_opt_args_t extra_params; - extra_params.backends.push_back(ucx); + extra_params.backends.push_back(nixl_backend); std::vector threads; for (int i = 0; i < NUM_THREADS; i++) - threads.emplace_back(targetThread, std::ref(agent), &extra_params, i); + threads.emplace_back(targetThread, std::ref(agent), &extra_params, i, backend); for (auto &thread : threads) thread.join(); +#ifdef HAVE_SYNAPSEAI + Synapseaiutils::deinit_synapse_device(); +#endif } -static void runInitiator(const std::string &target_ip, int target_port, nixl_thread_sync_t sync_mode) { +static void +runInitiator(const std::string &target_ip, + int target_port, + nixl_thread_sync_t sync_mode, + std::string backend) { nixlAgentConfig cfg(true, true, 0, sync_mode, 1, 0, 100000, false); std::cout << "Starting Agent for initiator\n"; nixlAgent agent(initiator, cfg); nixl_b_params_t params = { - { "num_workers", "4" }, + {"num_workers", "4"}, }; - nixlBackendH *ucx; - agent.createBackend("UCX", params, ucx); + nixlBackendH *nixl_backend; + agent.createBackend(backend, params, nixl_backend); + 
+#ifdef HAVE_SYNAPSEAI + Synapseaiutils::init_synapse_device(); +#endif nixl_opt_args_t extra_params; - extra_params.backends.push_back(ucx); + extra_params.backends.push_back(nixl_backend); SharedNotificationState shared_state; std::vector threads; for (int i = 0; i < NUM_THREADS; i++) - threads.emplace_back(initiatorThread, std::ref(agent), &extra_params, - target_ip, target_port, i, std::ref(shared_state)); + threads.emplace_back(initiatorThread, + std::ref(agent), + &extra_params, + target_ip, + target_port, + i, + std::ref(shared_state), + backend); for (auto &thread : threads) thread.join(); + +#ifdef HAVE_SYNAPSEAI + Synapseaiutils::deinit_synapse_device(); +#endif } -int main(int argc, char *argv[]) { +int +main(int argc, char *argv[]) { /** Argument Parsing */ if (argc < 4) { - std::cout <<"Enter the required arguments\n" << std::endl; - std::cout <<" " <<" " - << std::endl; + std::cout << "Enter the required arguments\n" << std::endl; + std::cout << " " << " " << std::endl; exit(-1); } std::string role = std::string(argv[1]); - const char *target_ip = argv[2]; - int target_port = std::stoi(argv[3]); + const char *target_ip = argv[2]; + int target_port = std::stoi(argv[3]); std::transform(role.begin(), role.end(), role.begin(), ::tolower); if (!role.compare(initiator) && !role.compare(target)) { - std::cerr << "Invalid role. Use 'initiator' or 'target'." 
- << "Currently "<< role < +#include +#include "synapseai_utils.h" + +static bool device_initialized = false; +static std::mutex mtx; +static synDeviceId deviceHandle; +static synStreamHandle stream; + +namespace Synapseaiutils { +static void +check(int ret, const char *msg) { + if (ret) { + fprintf(stderr, "%s: %s(%d)\n", msg, "failed", -ret); + exit(1); + } +} + +int +init_synapse_device() { + std::lock_guard lock(mtx); + auto env = std::getenv("HLS_MODULE_ID"); + int module_id = 0; + if (env != nullptr) { + module_id = std::stoi(env); + } + if (device_initialized) return 0; + check(synInitialize(), "synInitialize"); + check(synDeviceAcquireByModuleId(&deviceHandle, module_id), "synDeviceAcquire"); + device_initialized = true; + check(synStreamCreateGeneric(&stream, deviceHandle, 0), "synStreamCreateGeneric"); + return 0; +} + +synDeviceId +get_device_handle() { + return deviceHandle; +} + +uint64_t +allocate_synapse_memory(size_t len, void *host_buffer) { + uint64_t device_buffer; + std::lock_guard lock(mtx); + if (!device_initialized) { + fprintf(stderr, "%s\n", "device nor initialized"); + exit(1); + } + + check(synDeviceMalloc(deviceHandle, len, 0x0, 0, &device_buffer), "synDeviceMalloc"); + check(synHostMap(deviceHandle, len, host_buffer), "synHostMap"); + check(synMemCopyAsync(stream, (uint64_t)host_buffer, len, device_buffer, HOST_TO_DRAM), + "synMemCopyAsync"); + check(synStreamSynchronize(stream), "synStreamSynchronize"); + check(synHostUnmap(deviceHandle, host_buffer), "synHostUnmap"); + return device_buffer; +} + +void +free_synapse_memory(uint64_t ptr) { + std::lock_guard lock(mtx); + if (!device_initialized) fprintf(stderr, "%s\n", "device nor initialized"); + // cleanup Synapse resources + check(synDeviceFree(deviceHandle, ptr, 0), "synDeviceFree"); +} + +void +deinit_synapse_device() { + std::lock_guard lock(mtx); + if (!device_initialized) { + fprintf(stderr, "%s\n", "device nor initialized"); + exit(1); + } + check(synStreamDestroy(stream), 
"synStreamDestroy"); + check(synDeviceRelease(deviceHandle), "synDeviceRelease"); + check(synDestroy(), "synDestroy"); + device_initialized = false; +} + +void +copy_from_device_buffer(uint64_t device_buffer, void *host_buffer, size_t len) { + std::lock_guard lock(mtx); + if (!device_initialized) { + fprintf(stderr, "%s\n", "device nor initialized"); + exit(1); + } + check(synHostMap(deviceHandle, len, host_buffer), "synHostMap"); + check(synMemCopyAsync(stream, device_buffer, len, (uint64_t)host_buffer, DRAM_TO_HOST), + "synMemCopyAsync"); + check(synStreamSynchronize(stream), "synStreamSynchronize"); + check(synHostUnmap(deviceHandle, host_buffer), "synHostUnmap"); +} +} // namespace Synapseaiutils diff --git a/test/utils/synapseai_utils.h b/test/utils/synapseai_utils.h new file mode 100644 index 0000000000..d9cffa6616 --- /dev/null +++ b/test/utils/synapseai_utils.h @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright 2025 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "synapse_api.h" + +namespace Synapseaiutils { +int +init_synapse_device(); +synDeviceId +get_device_handle(); +uint64_t +allocate_synapse_memory(size_t len, void *host_buffer); +void +free_synapse_memory(uint64_t ptr); +void +deinit_synapse_device(); +void +copy_from_device_buffer(uint64_t device_buffer, void *host_buffer, size_t len); +} // namespace Synapseaiutils From 352045d2de502853adec730b06d0d3daa7af90a4 Mon Sep 17 00:00:00 2001 From: Jeeja KP Date: Tue, 2 Dec 2025 17:17:03 +0200 Subject: [PATCH 13/14] Add nixl install script ported from ofi branch, commit f70b5d4e8f2b42b3fab576f0c91c4ebf0430f8f1 Author: Chendi Xue - changed the branch name from v0.6.0_OFI to libfabric Signed-off-by: Jeeja KP --- install_nixl.py | 360 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 install_nixl.py diff --git a/install_nixl.py b/install_nixl.py new file mode 100644 index 0000000000..1c17638ac5 --- /dev/null +++ b/install_nixl.py @@ -0,0 +1,360 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import glob + +# install_prerequisites.py +import os +import shutil +import subprocess +import sys + +# --- Configuration --- +WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache") +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) +UCX_DIR = os.path.join("/tmp", "ucx_source") +LIBFABRIC_DIR = os.path.join("/tmp", "libfabric_source") +NIXL_DIR = os.path.join("/tmp", "nixl_source") +UCX_INSTALL_DIR = os.path.join("/tmp", "ucx_install") +LIBFABRIC_INSTALL_DIR = os.path.join("/tmp", "libfabric_install") + +# --- Repository and Version Configuration --- +UCX_REPO_URL = "https://github.com/openucx/ucx.git" +UCX_BRANCH = "v1.19.x" +LIBFABRIC_REPO_URL = "https://github.com/ofiwg/libfabric.git" +LIBFABRIC_REF = "v1.21.0" # Using a recent stable tag +NIXL_REPO_URL = "https://github.com/intel-staging/nixl.git" +NIXL_BRANCH = "libfabric" + + +# --- Helper Functions --- +def run_command(command, cwd=".", env=None): + """Helper function to run a shell command and check for errors.""" + print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True) + subprocess.check_call(command, cwd=cwd, env=env) + + +def is_pip_package_installed(package_name): + """Checks if a package is installed via pip without raising an exception.""" + result = subprocess.run( + [sys.executable, "-m", "pip", "show", package_name], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + return result.returncode == 0 + + +def find_nixl_wheel_in_cache(cache_dir): + """Finds a nixl wheel file in the specified cache directory.""" + # The repaired wheel will have a 'manylinux' tag, but this glob still works. 
+ search_pattern = os.path.join(cache_dir, "nixl-*.whl") + wheels = glob.glob(search_pattern) + if wheels: + # Sort to get the most recent/highest version if multiple exist + wheels.sort() + return wheels[-1] + return None + + +def install_system_dependencies(): + """Installs required system packages using apt-get if run as root.""" + if os.geteuid() != 0: + print("\n---", flush=True) + print( + "WARNING: Not running as root. Skipping system dependency installation.", + flush=True, + ) + print( + "Please ensure the following packages are installed on your system:", + flush=True, + ) + print( + " patchelf build-essential git cmake ninja-build autotools-dev automake meson libtool libtool-bin", + flush=True, + ) + print("---\n", flush=True) + return + + print("--- Running as root. Installing system dependencies... ---", flush=True) + apt_packages = [ + "patchelf", # <-- Add patchelf here + "build-essential", + "git", + "cmake", + "ninja-build", + "autotools-dev", + "automake", + "meson", + "libtool", + "libtool-bin", + "libhwloc-dev", + "zip", + ] + run_command(["apt-get", "update"]) + run_command(["apt-get", "install", "-y"] + apt_packages) + print("--- System dependencies installed successfully. ---\n", flush=True) + + +def build_and_install_prerequisites(args): + """Builds UCX and NIXL from source, creating a self-contained wheel.""" + + # ... (initial checks and setup are unchanged) ... + if not args.force_reinstall and is_pip_package_installed("nixl"): + print("--> NIXL is already installed. 
 Nothing to do.", flush=True) + return + + cached_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME) + if not args.force_reinstall and cached_wheel: + print( + f"\n--> Found self-contained wheel: {os.path.basename(cached_wheel)}.", + flush=True, + ) + print("--> Installing from cache, skipping all source builds.", flush=True) + install_command = [sys.executable, "-m", "pip", "install", cached_wheel] + run_command(install_command) + print("\n--- Installation from cache complete. ---", flush=True) + return + + print( + "\n--> No installed package or cached wheel found. Starting full build process...", + flush=True, + ) + print("\n--> Installing auditwheel...", flush=True) + run_command([sys.executable, "-m", "pip", "install", "auditwheel"]) + install_system_dependencies() + ucx_install_path = os.path.abspath(UCX_INSTALL_DIR) + print(f"--> Using wheel cache directory: {WHEELS_CACHE_HOME}", flush=True) + os.makedirs(WHEELS_CACHE_HOME, exist_ok=True) + + # -- Step 1: Build UCX from source -- + print("\n[1/4] Configuring and building UCX from source...", flush=True) + if not os.path.exists(UCX_DIR): + run_command(["git", "clone", UCX_REPO_URL, UCX_DIR]) + ucx_source_path = os.path.abspath(UCX_DIR) + run_command(["git", "checkout", UCX_BRANCH], cwd=ucx_source_path) + run_command(["./autogen.sh"], cwd=ucx_source_path) + configure_command = [ + "./configure", + f"--prefix={ucx_install_path}", + "--enable-shared", + "--disable-static", + "--disable-doxygen-doc", + "--enable-optimizations", + "--enable-cma", + "--enable-devel-headers", + "--with-verbs", + "--enable-mt", + ] + run_command(configure_command, cwd=ucx_source_path) + run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path) + run_command(["make", "install"], cwd=ucx_source_path) + print("--- UCX build and install complete ---", flush=True) + + # -- Step 2: Build Libfabric from source -- + print( + f"\n[2/4] Configuring and building Libfabric (ref: {LIBFABRIC_REF}) from source...", + flush=True, + ) 
+ if not os.path.exists(LIBFABRIC_DIR): + run_command(["git", "clone", LIBFABRIC_REPO_URL, LIBFABRIC_DIR]) + run_command(["git", "checkout", LIBFABRIC_REF], cwd=LIBFABRIC_DIR) + run_command(["./autogen.sh"], cwd=LIBFABRIC_DIR) + configure_command_lf = [ + "./configure", + f"--prefix={LIBFABRIC_INSTALL_DIR}", + "--enable-verbs", + "--enable-shm", + "--enable-sockets", + "--enable-tcp", + "--with-synapseai=/usr/include/habanalabs", # As requested + ] + run_command(configure_command_lf, cwd=LIBFABRIC_DIR) + run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=LIBFABRIC_DIR) + run_command(["make", "install"], cwd=LIBFABRIC_DIR) + print("--- Libfabric build and install complete ---", flush=True) + + # -- Step 3: Build NIXL wheel from source -- + print( + f"\n[3/4] Building NIXL (branch: {NIXL_BRANCH}) wheel from source...", + flush=True, + ) + if not os.path.exists(NIXL_DIR): + run_command(["git", "clone", "--branch", NIXL_BRANCH, NIXL_REPO_URL, NIXL_DIR]) + + build_env = os.environ.copy() + # Configure environment to find both UCX and Libfabric + ucx_install_path = os.path.abspath(UCX_INSTALL_DIR) + lf_install_path = os.path.abspath(LIBFABRIC_INSTALL_DIR) + + ucx_pkg_path = os.path.join(ucx_install_path, "lib", "pkgconfig") + lf_pkg_path = os.path.join(lf_install_path, "lib", "pkgconfig") + build_env["PKG_CONFIG_PATH"] = f"{ucx_pkg_path}:{lf_pkg_path}".strip(":") + + ucx_lib_path = os.path.join(ucx_install_path, "lib") + ucx_plugin_path = os.path.join(ucx_lib_path, "ucx") + lf_lib_path = os.path.join(lf_install_path, "lib") + build_env[ + "LD_LIBRARY_PATH" + ] = f"{ucx_lib_path}:{ucx_plugin_path}:{lf_lib_path}".strip(":") + + print(f"--> Using PKG_CONFIG_PATH: {build_env['PKG_CONFIG_PATH']}", flush=True) + print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}", flush=True) + + temp_wheel_dir = os.path.join(ROOT_DIR, "temp_wheelhouse") + # Define the build command for nixl wheel with specific meson arguments + wheel_build_cmd = [ + sys.executable, + 
"-m", + "pip", + "wheel", + ".", + "--no-deps", + f"--wheel-dir={temp_wheel_dir}", + # Pass meson arguments via pip's config-settings + "--config-settings=setup-args=-Ddisable_gds_backend=true", + f"--config-settings=setup-args=-Dlibfabric_path={lf_install_path}", + f"--config-settings=setup-args=-Ducx_path={ucx_install_path}", + ] + + run_command(wheel_build_cmd, cwd=os.path.abspath(NIXL_DIR), env=build_env) + + # -- Step 4: Repair wheel, then replace libfabric -- + # auditwheel may bundle an incompatible libfabric, so we need to replace it + print( + "\n[4/4] Repairing wheel with auditwheel and correcting libfabric...", + flush=True, + ) + unrepaired_wheel = find_nixl_wheel_in_cache(temp_wheel_dir) + if not unrepaired_wheel: + raise RuntimeError("Failed to find the NIXL wheel after building it.") + + # First, run auditwheel to bundle all other dependencies + run_command( + [ + sys.executable, + "-m", + "auditwheel", + "repair", + "--exclude", + "libplugin_UCX.so", + unrepaired_wheel, + f"--wheel-dir={WHEELS_CACHE_HOME}", + ], + env=build_env, + ) + + repaired_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME) + if not repaired_wheel: + raise RuntimeError("Failed to find repaired wheel from auditwheel.") + + # Now, unpack the repaired wheel to perform surgery on it + wheel_unpack_dir = os.path.join(temp_wheel_dir, "wheel_unpack") + if os.path.exists(wheel_unpack_dir): + shutil.rmtree(wheel_unpack_dir) + os.makedirs(wheel_unpack_dir) + run_command(["unzip", "-q", repaired_wheel, "-d", wheel_unpack_dir]) + + # Find the main NIXL extension file to inspect its dependencies + nixl_extension_search = glob.glob(os.path.join(wheel_unpack_dir, "nixl", "*.so")) + if not nixl_extension_search: + raise RuntimeError("Could not find main NIXL .so extension file.") + # nixl_extension_file = nixl_extension_search[0] + + # Find the .libs directory + libs_dir_search = glob.glob(os.path.join(wheel_unpack_dir, "*.libs")) + if not libs_dir_search: + raise RuntimeError("Could not find 
.libs directory in unpacked wheel.") + libs_dir = libs_dir_search[0] + + # Find the incorrect libfabric that auditwheel bundled + incorrect_lib_basename = None + for lib in os.listdir(libs_dir): + if "libfabric" in lib: + incorrect_lib_basename = lib + break + + # Only perform replacement if we found a library to replace + if incorrect_lib_basename: + incorrect_lib_path = os.path.join(libs_dir, incorrect_lib_basename) + print( + f"--> Found and deleting incorrect bundled library: {incorrect_lib_basename}", + flush=True, + ) + os.remove(incorrect_lib_path) + + # Find the correct, pre-built libfabric library + lf_lib_path = os.path.join(lf_install_path, "lib") + libfabric_so_files = glob.glob(os.path.join(lf_lib_path, "libfabric.so.1.*")) + if not libfabric_so_files: + raise RuntimeError(f"Could not find libfabric.so.1.* in {lf_lib_path}") + correct_libfabric_src = max(libfabric_so_files, key=len) + correct_libfabric_basename = os.path.basename(correct_libfabric_src) + + # Copy it into the wheel's .libs directory + print( + f"--> Copying correct library '{correct_libfabric_basename}' into wheel", + flush=True, + ) + shutil.copy2(correct_libfabric_src, os.path.join(libs_dir, incorrect_lib_path)) + + # Use patchelf to update the dependency link in the main NIXL extension + # print(f"--> Patching NIXL extension to link against '{correct_libfabric_basename}'", flush=True) + # run_command(['patchelf', '--replace-needed', incorrect_lib_basename, correct_libfabric_basename, nixl_extension_file]) + else: + print( + "--> Warning: Did not find a bundled libfabric to remove. 
It might have been excluded.", + flush=True, + ) + + # Repack the corrected wheel, overwriting the one from auditwheel + print( + f"--> Repacking corrected wheel to '{os.path.basename(repaired_wheel)}'", + flush=True, + ) + run_command(["zip", "-r", repaired_wheel, "."], cwd=wheel_unpack_dir) + + # --- Cleanup --- + shutil.rmtree(temp_wheel_dir) + + # --- Final Installation --- + newly_built_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME) + if not newly_built_wheel: + raise RuntimeError("Failed to find the repaired NIXL wheel.") + + print( + f"--> Successfully built self-contained wheel: {os.path.basename(newly_built_wheel)}. Now installing...", + flush=True, + ) + install_command = [sys.executable, "-m", "pip", "install", newly_built_wheel] + if args.force_reinstall: + install_command.insert(-1, "--force-reinstall") + + run_command(install_command) + print("--- NIXL installation complete ---", flush=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Build and install UCX and NIXL dependencies." 
+ ) + parser.add_argument( + "--force-reinstall", + action="store_true", + help="Force rebuild and reinstall of UCX and NIXL even if they are already installed.", + ) + args = parser.parse_args() + build_and_install_prerequisites(args) From b073533ce00094ecbaafc462100a7728cf0d5150 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 8 Jan 2026 11:24:51 +0200 Subject: [PATCH 14/14] optimize progressthread, cm thread and burst CQ and egain handling --- src/plugins/libfabric/libfabric_backend.cpp | 77 ++++++++++----- src/utils/libfabric/libfabric_rail.cpp | 97 ++++++++++--------- .../libfabric/libfabric_rail_manager.cpp | 6 +- src/utils/libfabric/libfabric_rail_manager.h | 2 +- 4 files changed, 107 insertions(+), 75 deletions(-) diff --git a/src/plugins/libfabric/libfabric_backend.cpp b/src/plugins/libfabric/libfabric_backend.cpp index 8bbd3264e5..76ddcf7fbb 100644 --- a/src/plugins/libfabric/libfabric_backend.cpp +++ b/src/plugins/libfabric/libfabric_backend.cpp @@ -1339,20 +1339,42 @@ nixlLibfabricEngine::cmThread() { NIXL_DEBUG << "ConnectionManagement thread started successfully"; NIXL_DEBUG << "Initial receives already posted in main thread, entering progress loop"; - // Main progress loop - continuously process completions on all rails - while (!cm_thread_stop_.load()) { + NIXL_DEBUG << "CM: Thread started"; + + // Adaptive backoff state (per-thread) + static thread_local int backoff_us = 50; // start at 50 µs + static thread_local const int backoff_us_max = 2000; // cap at 2 ms + + // Prefer blocking progress if supported (verbs with FI_WAIT_FD) + const bool blocking_supported = (rail_manager.getNumControlRails() > 0) && + rail_manager.getControlRail(0).blocking_cq_sread_supported; + + while (!cm_thread_stop_.load(std::memory_order_relaxed)) { + nixl_status_t status; + if (blocking_supported) { + // With blocking control CQ progress, rely on rail_manager to block up to its timeout + status = rail_manager.progressAllControlRails(true); // blocking=true inside rail 
path + } else { + // Non-blocking path: progress and adaptively back off on idle + status = rail_manager.progressAllControlRails(false); + } - nixl_status_t status = rail_manager.progressAllControlRails(); if (status == NIXL_SUCCESS) { - NIXL_DEBUG << "Processed completions on control rails"; - } else if (status != NIXL_IN_PROG && status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to process completions on control rails"; - return NIXL_ERR_BACKEND; + // Work was done reset backoff + backoff_us = 50; + // Optionally continue immediately to drain more completions + continue; } - // Sleep briefly to avoid spinning too aggressively when blocking cq read is not used - if (!rail_manager.getControlRail(0).blocking_cq_sread_supported) { - std::this_thread::sleep_for(std::chrono::nanoseconds(10)); + if (status == NIXL_IN_PROG) { + // No completions available sleep adaptively + std::this_thread::sleep_for(std::chrono::microseconds(backoff_us)); + backoff_us = std::min(backoff_us * 2, backoff_us_max); + continue; } + + // Unexpected error log and exit + NIXL_ERROR << "CM: Failed to process completions on control rails, status=" << status; + return NIXL_ERR_BACKEND; } NIXL_DEBUG << "ConnectionManagement thread exiting cleanly"; return NIXL_SUCCESS; @@ -1366,24 +1388,33 @@ nixlLibfabricEngine::cmThread() { nixl_status_t nixlLibfabricEngine::progressThread() { NIXL_DEBUG << "Progress thread started successfully for data rails only"; - // Main progress loop - continuously process completions only on data rails - while (!progress_thread_stop_.load()) { - // Process completions only on data rails (non-blocking) - bool any_completions = false; - nixl_status_t status = rail_manager.progressActiveDataRails(); + + // Adaptive backoff layered over configured delay + static thread_local int backoff_us = static_cast(progress_thread_delay_.count()); + static thread_local const int backoff_us_min = 50; // floor at 50 µs + static thread_local const int backoff_us_max = 5000; // cap at 5 ms + 
if (backoff_us <= 0) backoff_us = backoff_us_min; + + while (!progress_thread_stop_.load(std::memory_order_relaxed)) { + nixl_status_t status = rail_manager.progressActiveDataRails(); // non-blocking if (status == NIXL_SUCCESS) { - any_completions = true; - NIXL_DEBUG << "Processed completions on data rails"; - } else if (status != NIXL_IN_PROG && status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to process completions on data rails"; - // Don't return error, continue for robustness + // Completions processed reset backoff and continue draining + backoff_us = std::max(backoff_us_min, static_cast(progress_thread_delay_.count())); + continue; } - if (!any_completions) { - std::this_thread::sleep_for(progress_thread_delay_); + if (status == NIXL_IN_PROG) { + // Idle sleep adaptively, increasing up to cap + std::this_thread::sleep_for(std::chrono::microseconds(backoff_us)); + backoff_us = std::min(backoff_us * 2, backoff_us_max); + continue; } + // Error log and keep going for robustness (do not kill the PT) + NIXL_ERROR << "PT: Failed to process completions on data rails, status=" << status; + std::this_thread::sleep_for(std::chrono::microseconds(backoff_us_min)); } - NIXL_DEBUG << "Progress thread exiting cleanly"; + NIXL_DEBUG << "PT: Thread exiting"; return NIXL_SUCCESS; + } void diff --git a/src/utils/libfabric/libfabric_rail.cpp b/src/utils/libfabric/libfabric_rail.cpp index 2a165d7f9d..cb71a1cd76 100644 --- a/src/utils/libfabric/libfabric_rail.cpp +++ b/src/utils/libfabric/libfabric_rail.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef HAVE_SYNAPSEAI #include @@ -700,8 +701,9 @@ nixlLibfabricRail::setXferIdCallback(std::function callback) { nixl_status_t nixlLibfabricRail::progressCompletionQueue(bool use_blocking) const { // Completion processing - struct fi_cq_data_entry completion; - memset(&completion, 0, sizeof(completion)); + // Batch read to amortize lock and syscall overhead + struct fi_cq_data_entry entries[32]; + memset(entries, 0, 
sizeof(entries)); int ret; @@ -711,10 +713,10 @@ nixlLibfabricRail::progressCompletionQueue(bool use_blocking) const { if (use_blocking && blocking_cq_sread_supported) { // Blocking read using fi_cq_sread (used by CM thread) - ret = fi_cq_sread(cq, &completion, 1, nullptr, NIXL_LIBFABRIC_CQ_SREAD_TIMEOUT_SEC); + ret = fi_cq_sread(cq, entries, 1, nullptr, NIXL_LIBFABRIC_CQ_SREAD_TIMEOUT_SEC); } else { // Non-blocking read (used by progress thread or fallback) - ret = fi_cq_read(cq, &completion, 1); + ret = fi_cq_read(cq, entries, 32); } if (ret < 0 && ret != -FI_EAGAIN) { @@ -738,24 +740,25 @@ nixlLibfabricRail::progressCompletionQueue(bool use_blocking) const { } // CQ lock released here - completion is now local data - if (ret == -FI_EAGAIN) { + if (ret == -FI_EAGAIN || ret == 0) { return NIXL_IN_PROG; // No completions available } - if (ret == 1) { - NIXL_TRACE << "Completion received on rail " << rail_id << " flags: " << std::hex - << completion.flags << " data: " << completion.data - << " context: " << completion.op_context << std::dec; - // Process completion using local data. Callbacks have their own thread safety - nixl_status_t status = processCompletionQueueEntry(&completion); - if (status != NIXL_SUCCESS) { - NIXL_ERROR << "Failed to process completion on rail " << rail_id; - return status; + if (ret > 0) { + bool ok = true; + for (int i = 0; i < ret; ++i) { + NIXL_TRACE << "Completion received on rail " << rail_id << " flags=" << std::hex + << entries[i].flags << " data=" << entries[i].data + << " context=" << entries[i].op_context << std::dec; + nixl_status_t status = processCompletionQueueEntry(&entries[i]); + if (status != NIXL_SUCCESS) { + NIXL_ERROR << "Failed to process completion on rail " << rail_id; + ok = false; + break; + } } - - NIXL_DEBUG << "Completion processed on rail " << rail_id; - return NIXL_SUCCESS; + return ok ? 
NIXL_SUCCESS : NIXL_ERR_BACKEND; } return NIXL_ERR_BACKEND; // Unexpected case @@ -1077,7 +1080,7 @@ nixlLibfabricRail::postSend(uint64_t immediate_data, if (ret == -FI_EAGAIN) { // Resource temporarily unavailable - retry indefinitely for all providers - attempt++; + ++attempt; // Log every N attempts to avoid log spam if (attempt % NIXL_LIBFABRIC_LOG_INTERVAL_ATTEMPTS == 0) { @@ -1088,17 +1091,17 @@ nixlLibfabricRail::postSend(uint64_t immediate_data, << ", retrying (attempt " << attempt << ")"; } - // Exponential backoff with cap to avoid overwhelming the system - int delay_us = std::min(NIXL_LIBFABRIC_BASE_RETRY_DELAY_US * (1 + attempt / 10), - NIXL_LIBFABRIC_MAX_RETRY_DELAY_US); - - // Progress completion queue to drain pending completions before retry - nixl_status_t progress_status = progressCompletionQueue(false); - if (progress_status == NIXL_SUCCESS) { - NIXL_TRACE << "Progressed completions on rail " << rail_id << " before retry"; + // Progress CQ a few times before backing off + if (attempt <= 8) { + (void)progressCompletionQueue(false); + } else { + int delay_us = std::min(1000 * (attempt / 10 + 1), 100000); // 1ms..100ms + if (blocking_cq_sread_supported) + (void)progressCompletionQueue(true); + else + std::this_thread::sleep_for(std::chrono::microseconds(delay_us)); } - usleep(delay_us); continue; } else { // Other error - don't retry, fail immediately @@ -1157,7 +1160,7 @@ nixlLibfabricRail::postWrite(const void *local_buffer, if (ret == -FI_EAGAIN) { // Resource temporarily unavailable - retry indefinitely for all providers - attempt++; + ++attempt; // Log every N attempts to avoid log spam if (attempt % NIXL_LIBFABRIC_LOG_INTERVAL_ATTEMPTS == 0) { @@ -1168,17 +1171,16 @@ nixlLibfabricRail::postWrite(const void *local_buffer, << ", retrying (attempt " << attempt << ")"; } - // Exponential backoff with cap to avoid overwhelming the system - int delay_us = std::min(NIXL_LIBFABRIC_BASE_RETRY_DELAY_US * (1 + attempt / 10), - 
NIXL_LIBFABRIC_MAX_RETRY_DELAY_US); - - // Progress completion queue to drain pending completions before retry - nixl_status_t progress_status = progressCompletionQueue(false); - if (progress_status == NIXL_SUCCESS) { - NIXL_TRACE << "Progressed completions on rail " << rail_id << " before retry"; + // Progress CQ a few times before backing off + if (attempt <= 8) { + (void)progressCompletionQueue(false); + } else { + int delay_us = std::min(1000 * (attempt / 10 + 1), 100000); // 1ms..100ms + if (blocking_cq_sread_supported) + (void)progressCompletionQueue(true); + else + std::this_thread::sleep_for(std::chrono::microseconds(delay_us)); } - - usleep(delay_us); continue; } else { // Other error - don't retry, fail immediately @@ -1245,17 +1247,16 @@ nixlLibfabricRail::postRead(void *local_buffer, << ", retrying (attempt " << attempt << ")"; } - // Exponential backoff with cap to avoid overwhelming the system - int delay_us = std::min(NIXL_LIBFABRIC_BASE_RETRY_DELAY_US * (1 + attempt / 10), - NIXL_LIBFABRIC_MAX_RETRY_DELAY_US); - - // Progress completion queue to drain pending completions before retry - nixl_status_t progress_status = progressCompletionQueue(false); - if (progress_status == NIXL_SUCCESS) { - NIXL_TRACE << "Progressed completions on rail " << rail_id << " before retry"; + // Progress CQ a few times before backing off + if (attempt <= 8) { + (void)progressCompletionQueue(false); + } else { + int delay_us = std::min(1000 * (attempt / 10 + 1), 100000); // 1ms..100ms + if (blocking_cq_sread_supported) + (void)progressCompletionQueue(true); + else + std::this_thread::sleep_for(std::chrono::microseconds(delay_us)); } - - usleep(delay_us); continue; } else { // Other error - don't retry, fail immediately diff --git a/src/utils/libfabric/libfabric_rail_manager.cpp b/src/utils/libfabric/libfabric_rail_manager.cpp index e2d639c6f5..49762b0afc 100644 --- a/src/utils/libfabric/libfabric_rail_manager.cpp +++ b/src/utils/libfabric/libfabric_rail_manager.cpp @@ 
-658,11 +658,11 @@ nixlLibfabricRailManager::progressActiveDataRails() { } nixl_status_t -nixlLibfabricRailManager::progressAllControlRails() { +nixlLibfabricRailManager::progressAllControlRails(bool blocking) { bool any_completions = false; for (size_t rail_id = 0; rail_id < num_control_rails_; ++rail_id) { - nixl_status_t status = - control_rails_[rail_id]->progressCompletionQueue(true); // Blocking for control rails + nixl_status_t status = control_rails_[rail_id]->progressCompletionQueue( + blocking); // Blocking for control rails if (status == NIXL_SUCCESS) { any_completions = true; NIXL_DEBUG << "Processed completion on control rail " << rail_id; diff --git a/src/utils/libfabric/libfabric_rail_manager.h b/src/utils/libfabric/libfabric_rail_manager.h index fe68e962d0..6983571cab 100644 --- a/src/utils/libfabric/libfabric_rail_manager.h +++ b/src/utils/libfabric/libfabric_rail_manager.h @@ -222,7 +222,7 @@ class nixlLibfabricRailManager { * @return NIXL_SUCCESS if completions processed, NIXL_IN_PROG if none, error on failure */ nixl_status_t - progressAllControlRails(); + progressAllControlRails(bool blocking); /** Validate that all rails are properly initialized * @return NIXL_SUCCESS if all rails initialized, error code otherwise */