From b26dd9d551d9a732cb44f95001285d358a457a82 Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Tue, 6 May 2025 15:35:09 -0500
Subject: [PATCH 1/7] cmake update for cuTENSOR

---
 CMakeLists.txt           |  30 ++++++++-
 cmake/FindcuTENSOR.cmake | 135 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 cmake/FindcuTENSOR.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b39e8db6..e6931ccc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,16 +29,34 @@ include(get_cmaize)
 set(project_inc_dir "${CMAKE_CURRENT_LIST_DIR}/include/${PROJECT_NAME}")
 set(project_src_dir "${CMAKE_CURRENT_LIST_DIR}/src/${PROJECT_NAME}")
 
+# Documentation
 include(nwx_cxx_api_docs)
 nwx_cxx_api_docs("${project_inc_dir}" "${project_src_dir}")
 
+## Extensions ##
+set(SOURCE_EXTS "cpp")
+set(INCLUDE_EXTS "hpp")
+
 ### Options ###
 cmaize_option_list(
     BUILD_TESTING OFF "Should we build the tests?"
     BUILD_PYBIND11_PYBINDINGS ON "Should we build Python3 bindings?"
     ENABLE_SIGMA OFF "Should we enable Sigma for uncertainty tracking?"
+    ENABLE_CUTENSOR OFF "Should we enable cuTENSOR?"
 )
 
+if("${ENABLE_CUTENSOR}") 
+    if("${ENABLE_SIGMA}")
+        set(MSG "Sigma is not compatible with cuTENSOR. Turning Sigma OFF.")
+        message(WARNING ${MSG})
+        set(ENABLE_SIGMA OFF)
+    endif()
+    enable_language(CUDA)
+    set(SOURCE_EXTS ${SOURCE_EXTS} cu)
+    set(INCLUDE_EXTS ${INCLUDE_EXTS} cuh hu)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+endif()
+
 ### Dependendencies ###
 include(get_utilities)
 
@@ -64,15 +82,25 @@ cmaize_find_or_build_optional_dependency(
     CMAKE_ARGS BUILD_TESTING=OFF
                ENABLE_EIGEN_SUPPORT=ON
 )
+set(DEPENDENCIES utilities parallelzone Boost::boost eigen sigma)
+
+if("${ENABLE_CUTENSOR}")
+    include(cmake/FindcuTENSOR.cmake)
+    list(APPEND DEPENDENCIES cuTENSOR::cuTENSOR)
+endif()
 
 cmaize_add_library(
     ${PROJECT_NAME}
     SOURCE_DIR "${project_src_dir}"
     INCLUDE_DIRS "${project_inc_dir}"
-    DEPENDS utilities parallelzone Boost::boost eigen sigma
+    DEPENDS "${DEPENDENCIES}"
 )
 target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
 
+if("${ENABLE_CUTENSOR}")
+    target_compile_definitions("${PROJECT_NAME}" PUBLIC ENABLE_CUTENSOR)
+endif()
+
 include(nwx_pybind11)
 nwx_add_pybind11_module(
     ${PROJECT_NAME}
diff --git a/cmake/FindcuTENSOR.cmake b/cmake/FindcuTENSOR.cmake
new file mode 100644
index 00000000..17b14b48
--- /dev/null
+++ b/cmake/FindcuTENSOR.cmake
@@ -0,0 +1,135 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+#[=======================================================================[.rst:
+FindcuTENSOR
+--------
+
+Find cuTENSOR
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+This module defines the following :prop_tgt:`IMPORTED` target(s):
+
+``cuTENSOR::cuTENSOR``
+  The cuTENSOR library, if found.
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+This module will set the following variables in your project:
+
+``cuTENSOR_FOUND``
+  True if cuTENSOR is found.
+``cuTENSOR_INCLUDE_DIRS``
+  The include directories needed to use cuTENSOR.
+``cuTENSOR_LIBRARIES``
+  The libraries needed to usecuTENSOR.
+``cuTENSOR_VERSION_STRING``
+  The version of the cuTENSOR library found. [OPTIONAL]
+
+#]=======================================================================]
+
+# Prefer using a Config module if it exists for this project
+set(cuTENSOR_NO_CONFIG FALSE)
+if(NOT cuTENSOR_NO_CONFIG)
+  find_package(cuTENSOR CONFIG QUIET HINTS ${cutensor_DIR})
+  if(cuTENSOR_FOUND)
+    find_package_handle_standard_args(cuTENSOR DEFAULT_MSG cuTENSOR_CONFIG)
+    return()
+  endif()
+endif()
+
+find_path(cuTENSOR_INCLUDE_DIR NAMES cutensor.h )
+
+set(cuTENSOR_IS_HEADER_ONLY FALSE)
+if(NOT cuTENSOR_LIBRARY AND NOT cuTENSOR_IS_HEADER_ONLY)
+  find_library(cuTENSOR_LIBRARY_RELEASE NAMES libcutensor.so NAMES_PER_DIR )
+  find_library(cuTENSOR_LIBRARY_DEBUG   NAMES libcutensor.sod   NAMES_PER_DIR )
+
+  include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
+  select_library_configurations(cuTENSOR)
+  unset(cuTENSOR_FOUND) #incorrectly set by select_library_configurations
+endif()
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+
+if(cuTENSOR_IS_HEADER_ONLY)
+  find_package_handle_standard_args(cuTENSOR
+                                    REQUIRED_VARS cuTENSOR_INCLUDE_DIR
+                                    VERSION_VAR )
+else()
+  find_package_handle_standard_args(cuTENSOR
+                                    REQUIRED_VARS cuTENSOR_LIBRARY cuTENSOR_INCLUDE_DIR
+                                    VERSION_VAR )
+endif()
+
+if(NOT cuTENSOR_FOUND)
+  set(CUTENSOR_FILENAME libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive)
+
+  message(STATUS "cuTENSOR not found. Downloading library. By continuing this download you accept to the license terms of cuTENSOR")
+
+  CPMAddPackage(
+    NAME cutensor
+    VERSION ${CUTENSOR_VERSION}
+    URL https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive.tar.xz
+    # Eigen's CMakelists are not intended for library use
+    DOWNLOAD_ONLY YES 
+  )
+      
+  set(cuTENSOR_LIBRARY ${cutensor_SOURCE_DIR}/lib/${CUDAToolkit_VERSION_MAJOR}/libcutensor.so)
+  set(cuTENSOR_INCLUDE_DIR ${cutensor_SOURCE_DIR}/include) 
+
+
+  set(cuTENSOR_FOUND TRUE)
+endif()
+
+if(cuTENSOR_FOUND)
+  set(cuTENSOR_INCLUDE_DIRS ${cuTENSOR_INCLUDE_DIR})
+
+  if(NOT cuTENSOR_LIBRARIES)
+    set(cuTENSOR_LIBRARIES ${cuTENSOR_LIBRARY})
+  endif()
+
+  if(NOT TARGET cuTENSOR::cuTENSOR)
+    add_library(cuTENSOR::cuTENSOR UNKNOWN IMPORTED)
+    set_target_properties(cuTENSOR::cuTENSOR PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${cuTENSOR_INCLUDE_DIRS}")
+
+    if(cuTENSOR_LIBRARY_RELEASE)
+      set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY
+        IMPORTED_CONFIGURATIONS RELEASE)
+      set_target_properties(cuTENSOR::cuTENSOR PROPERTIES
+        IMPORTED_LOCATION_RELEASE "${cuTENSOR_LIBRARY_RELEASE}")
+    endif()
+
+    if(cuTENSOR_LIBRARY_DEBUG)
+      set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY
+        IMPORTED_CONFIGURATIONS DEBUG)
+      set_target_properties(cuTENSOR::cuTENSOR PROPERTIES
+        IMPORTED_LOCATION_DEBUG "${cuTENSOR_LIBRARY_DEBUG}")
+    endif()
+
+    if(NOT cuTENSOR_LIBRARY_RELEASE AND NOT cuTENSOR_LIBRARY_DEBUG)
+      set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY
+        IMPORTED_LOCATION "${cuTENSOR_LIBRARY}")
+    endif()
+  endif()
+endif()
+
+unset(cuTENSOR_NO_CONFIG)
+unset(cuTENSOR_IS_HEADER_ONLY)
\ No newline at end of file

From 9914c1c3399266615c19746d27f0f87235c7eecb Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Tue, 6 May 2025 15:36:25 -0500
Subject: [PATCH 2/7] don't add license to FindcuTENSOR.cmake

---
 .github/.licenserc.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index 7cae0725..af8f72e3 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -22,6 +22,7 @@ header:
     - docs/Makefile
     - LICENSE
     - cmake/config.hpp.in
+    - cmake/FindcuTENSOR.cmake
     - docs/requirements.txt
     - docs/source/bibliography/*.bib
     - version.txt

From b67321dbbbb55fb7aec56c1e6a4208038ed1b8d1 Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Wed, 7 May 2025 12:17:21 -0500
Subject: [PATCH 3/7] add extensions to cmaize_add_library

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6931ccc..15940ecb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,7 +92,9 @@ endif()
 cmaize_add_library(
     ${PROJECT_NAME}
     SOURCE_DIR "${project_src_dir}"
+    SOURCE_EXTS "${SOURCE_EXTS}"
     INCLUDE_DIRS "${project_inc_dir}"
+    INCLUDE_EXTS "${INCLUDE_EXTS}"
     DEPENDS "${DEPENDENCIES}"
 )
 target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")

From be74b22d4281fc25783cf32d67f80666ad4cefd2 Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Wed, 7 May 2025 12:39:02 -0500
Subject: [PATCH 4/7] stand-in CUDA files for debugging

---
 .../buffer/detail_/eigen_tensor.cpp           |  7 +++++
 .../buffer/detail_/eigen_tensor.cu            | 28 +++++++++++++++++++
 .../buffer/detail_/eigen_tensor.hu            | 24 ++++++++++++++++
 3 files changed, 59 insertions(+)
 create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.cu
 create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.hu

diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
index 0d66206f..bb8f0000 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
@@ -18,6 +18,10 @@
 #include "../contraction_planner.hpp"
 #include "eigen_tensor.hpp"
 
+#ifdef ENABLE_CUTENSOR
+#include "eigen_tensor.hu"
+#endif
+
 namespace tensorwrapper::buffer::detail_ {
 
 #define TPARAMS template<typename FloatType, unsigned int Rank>
@@ -94,6 +98,9 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels,
                                            const_shape_reference result_shape,
                                            const_pimpl_reference lhs,
                                            const_pimpl_reference rhs) {
+#ifdef ENABLE_CUTENSOR
+    hello();
+#endif
     ContractionPlanner plan(olabels, llabels, rlabels);
 
     auto lt = lhs.clone();
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
new file mode 100644
index 00000000..da9c7980
--- /dev/null
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2025 NWChemEx-Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef ENABLE_CUTENSOR
+#include "eigen_tensor.hu"
+#include <cuda_runtime.h>
+#include <cutensor.h>
+#include <iostream>
+
+namespace tensorwrapper::buffer::detail_ {
+
+void hello() { std::cout << "Hello" << std::endl; }
+
+} // namespace tensorwrapper::buffer::detail_
+
+#endif
\ No newline at end of file
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu b/src/tensorwrapper/buffer/detail_/eigen_tensor.hu
new file mode 100644
index 00000000..45e84977
--- /dev/null
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.hu
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2025 NWChemEx-Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifdef ENABLE_CUTENSOR
+
+namespace tensorwrapper::buffer::detail_ {
+
+void hello();
+
+} // namespace tensorwrapper::buffer::detail_
+
+#endif
\ No newline at end of file

From 6c6495207acc24438a05d0dd65757ff1ae38444b Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Tue, 13 May 2025 16:06:22 -0500
Subject: [PATCH 5/7] cutensor contraction working

---
 .../{eigen_tensor.hu => cutensor_traits.cuh}  |  19 +-
 .../buffer/detail_/eigen_tensor.cpp           |  16 +-
 .../buffer/detail_/eigen_tensor.cu            | 252 +++++++++++++++++-
 .../buffer/detail_/eigen_tensor.cuh           |  47 ++++
 4 files changed, 324 insertions(+), 10 deletions(-)
 rename src/tensorwrapper/buffer/detail_/{eigen_tensor.hu => cutensor_traits.cuh} (57%)
 create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.cuh

diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu b/src/tensorwrapper/buffer/detail_/cutensor_traits.cuh
similarity index 57%
rename from src/tensorwrapper/buffer/detail_/eigen_tensor.hu
rename to src/tensorwrapper/buffer/detail_/cutensor_traits.cuh
index 45e84977..005039bd 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu
+++ b/src/tensorwrapper/buffer/detail_/cutensor_traits.cuh
@@ -13,11 +13,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
 #ifdef ENABLE_CUTENSOR
+#include <cuda_runtime.h>
+#include <cutensor.h>
 
 namespace tensorwrapper::buffer::detail_ {
 
-void hello();
+// Traits for cuTENSOR based on the floating point type
+template<typename FloatType>
+struct cutensor_traits {};
+
+template<>
+struct cutensor_traits<float> {
+    cutensorDataType_t cutensorDataType     = CUTENSOR_R_32F;
+    cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
+};
+
+template<>
+struct cutensor_traits<double> {
+    cutensorDataType_t cutensorDataType     = CUTENSOR_R_64F;
+    cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_64F;
+};
 
 } // namespace tensorwrapper::buffer::detail_
 
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
index bb8f0000..7c08a5ee 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp
@@ -19,7 +19,7 @@
 #include "eigen_tensor.hpp"
 
 #ifdef ENABLE_CUTENSOR
-#include "eigen_tensor.hu"
+#include "eigen_tensor.cuh"
 #endif
 
 namespace tensorwrapper::buffer::detail_ {
@@ -98,11 +98,18 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels,
                                            const_shape_reference result_shape,
                                            const_pimpl_reference lhs,
                                            const_pimpl_reference rhs) {
-#ifdef ENABLE_CUTENSOR
-    hello();
-#endif
     ContractionPlanner plan(olabels, llabels, rlabels);
 
+#ifdef ENABLE_CUTENSOR
+    // Prepare m_tensor_
+    m_tensor_ = allocate_from_shape_(result_shape.as_smooth(),
+                                     std::make_index_sequence<Rank>());
+    m_tensor_.setZero();
+
+    // Dispatch to cuTENSOR
+    cutensor_contraction<my_type>(olabels, llabels, rlabels, result_shape, lhs,
+                                  rhs, m_tensor_);
+#else
     auto lt = lhs.clone();
     auto rt = rhs.clone();
     lt->permute_assignment(plan.lhs_permutation(), llabels, lhs);
@@ -147,6 +154,7 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels,
     } else {
         m_tensor_ = tensor;
     }
+#endif
     mark_for_rehash_();
 }
 
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
index da9c7980..a60f5bc8 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
@@ -14,14 +14,256 @@
  * limitations under the License.
  */
 #ifdef ENABLE_CUTENSOR
-#include "eigen_tensor.hu"
-#include <cuda_runtime.h>
-#include <cutensor.h>
-#include <iostream>
+#include "cutensor_traits.cuh"
+#include "eigen_tensor.cuh"
+#include <unordered_map>
+#include <vector>
 
 namespace tensorwrapper::buffer::detail_ {
 
-void hello() { std::cout << "Hello" << std::endl; }
+// Handle cuda errors
+#define HANDLE_CUDA_ERROR(x)                                \
+    {                                                       \
+        const auto err = x;                                 \
+        if(err != cudaSuccess) {                            \
+            printf("Error: %s\n", cudaGetErrorString(err)); \
+            exit(-1);                                       \
+        }                                                   \
+    };
+
+// Handle cuTENSOR errors
+#define HANDLE_CUTENSOR_ERROR(x)                                \
+    {                                                           \
+        const auto err = x;                                     \
+        if(err != CUTENSOR_STATUS_SUCCESS) {                    \
+            printf("Error: %s\n", cutensorGetErrorString(err)); \
+            exit(-1);                                           \
+        }                                                       \
+    };
+
+// Some common typedefs
+using mode_vector_t  = std::vector<int>;
+using int64_vector_t = std::vector<int64_t>;
+
+// Convert a label into a vector of modes
+template<typename LabelType>
+mode_vector_t label_to_mode_vector(const LabelType& label) {
+    mode_vector_t mode;
+    for(const auto& i : label) { mode.push_back(i.data()[0]); }
+    return mode;
+}
+
+// Query extent information from an input
+template<typename InfoType>
+int64_vector_t get_extents(const InfoType& info) {
+    int64_vector_t extent;
+    for(std::size_t i = 0; i < info.rank(); ++i) {
+        extent.push_back((int64_t)info.extent(i));
+    }
+    return extent;
+}
+
+// Compute strides in row major
+int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) {
+    int64_vector_t strides;
+    for(std::size_t i = 0; i < N; ++i) {
+        int64_t product = 1;
+        for(std::size_t j = N - 1; j > i; --j) product *= extent[j];
+        strides.push_back(product);
+    }
+    return strides;
+}
+
+// Perform tensor contraction with cuTENSOR
+template<typename TensorType>
+void cutensor_contraction(
+  typename TensorType::label_type olabel,
+  typename TensorType::label_type llabel,
+  typename TensorType::label_type rlabel,
+  typename TensorType::const_shape_reference result_shape,
+  typename TensorType::const_pimpl_reference lhs,
+  typename TensorType::const_pimpl_reference rhs,
+  typename TensorType::eigen_reference result) {
+    using element_t    = typename TensorType::element_type;
+    using eigen_data_t = typename TensorType::eigen_data_type;
+
+    // GEMM alpha and beta (hardcoded for now)
+    element_t alpha = 1.0;
+    element_t beta  = 0.0;
+
+    // The modes of the tensors
+    mode_vector_t lhs_modes    = label_to_mode_vector(llabel);
+    mode_vector_t rhs_modes    = label_to_mode_vector(rlabel);
+    mode_vector_t output_modes = label_to_mode_vector(olabel);
+
+    // The extents of each tensor
+    int64_vector_t lhs_extents    = get_extents(lhs);
+    int64_vector_t rhs_extents    = get_extents(rhs);
+    int64_vector_t output_extents = get_extents(result_shape.as_smooth());
+
+    // The strides of each tensor
+    int64_vector_t lhs_strides = row_major_strides(lhs.rank(), lhs_extents);
+    int64_vector_t rhs_strides = row_major_strides(rhs.rank(), rhs_extents);
+    int64_vector_t output_strides =
+      row_major_strides(result_shape.rank(), output_extents);
+
+    // The size of each tensor
+    std::size_t lhs_size    = sizeof(element_t) * lhs.size();
+    std::size_t rhs_size    = sizeof(element_t) * rhs.size();
+    std::size_t output_size = sizeof(element_t) * result_shape.size();
+
+    // Allocate on device
+    void *lhs_d, *rhs_d, *output_d;
+    cudaMalloc((void**)&lhs_d, lhs_size);
+    cudaMalloc((void**)&rhs_d, rhs_size);
+    cudaMalloc((void**)&output_d, output_size);
+
+    // Copy to data to device
+    HANDLE_CUDA_ERROR(cudaMemcpy(lhs_d, lhs.get_immutable_data(), lhs_size,
+                                 cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(rhs_d, rhs.get_immutable_data(), rhs_size,
+                                 cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(
+      cudaMemcpy(output_d, result.data(), output_size, cudaMemcpyHostToDevice));
+
+    // Assert alignment
+    const uint32_t kAlignment =
+      128; // Alignment of the global-memory device pointers (bytes)
+    assert(uintptr_t(lhs_d) % kAlignment == 0);
+    assert(uintptr_t(rhs_d) % kAlignment == 0);
+    assert(uintptr_t(output_d) % kAlignment == 0);
+
+    // cuTENSOR traits
+    cutensor_traits<element_t> traits;
+
+    // cuTENSOR handle
+    cutensorHandle_t handle;
+    HANDLE_CUTENSOR_ERROR(cutensorCreate(&handle));
+
+    // Create Tensor Descriptors
+    cutensorTensorDescriptor_t descLHS;
+    HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
+      handle, &descLHS, lhs.rank(), lhs_extents.data(), lhs_strides.data(),
+      traits.cutensorDataType, kAlignment));
+
+    cutensorTensorDescriptor_t descRHS;
+    HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
+      handle, &descRHS, rhs.rank(), rhs_extents.data(), rhs_strides.data(),
+      traits.cutensorDataType, kAlignment));
+
+    cutensorTensorDescriptor_t descOutput;
+    HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
+      handle, &descOutput, result_shape.rank(), output_extents.data(),
+      output_strides.data(), traits.cutensorDataType, kAlignment));
+
+    // Create Contraction Descriptor
+    cutensorOperationDescriptor_t desc;
+    HANDLE_CUTENSOR_ERROR(cutensorCreateContraction(
+      handle, &desc,                                         // Base
+      descLHS, lhs_modes.data(), CUTENSOR_OP_IDENTITY,       // A
+      descRHS, rhs_modes.data(), CUTENSOR_OP_IDENTITY,       // B
+      descOutput, output_modes.data(), CUTENSOR_OP_IDENTITY, // C
+      descOutput, output_modes.data(), traits.descCompute    // Output
+      ));
+
+    // Optional (but recommended): ensure that the scalar type is correct.
+    cutensorDataType_t scalarType;
+    HANDLE_CUTENSOR_ERROR(cutensorOperationDescriptorGetAttribute(
+      handle, desc, CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
+      (void*)&scalarType, sizeof(scalarType)));
+    assert(scalarType == traits.cutensorDataType);
+
+    // Set the algorithm to use
+    const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;
+    cutensorPlanPreference_t planPref;
+    HANDLE_CUTENSOR_ERROR(cutensorCreatePlanPreference(handle, &planPref, algo,
+                                                       CUTENSOR_JIT_MODE_NONE));
+
+    // Query workspace estimate
+    uint64_t workspaceSizeEstimate = 0;
+    const cutensorWorksizePreference_t workspacePref =
+      CUTENSOR_WORKSPACE_DEFAULT;
+    HANDLE_CUTENSOR_ERROR(cutensorEstimateWorkspaceSize(
+      handle, desc, planPref, workspacePref, &workspaceSizeEstimate));
+
+    // Create Contraction Plan
+    cutensorPlan_t plan;
+    HANDLE_CUTENSOR_ERROR(
+      cutensorCreatePlan(handle, &plan, desc, planPref, workspaceSizeEstimate));
+
+    // Optional: Query information about the created plan
+    // query actually used workspace
+    uint64_t actualWorkspaceSize = 0;
+    HANDLE_CUTENSOR_ERROR(cutensorPlanGetAttribute(
+      handle, plan, CUTENSOR_PLAN_REQUIRED_WORKSPACE, &actualWorkspaceSize,
+      sizeof(actualWorkspaceSize)));
+    assert(actualWorkspaceSize <= workspaceSizeEstimate);
+
+    // At this point the user knows exactly how much memory is need by the
+    // operation and only the smaller actual workspace needs to be allocated
+    void* work = nullptr;
+    if(actualWorkspaceSize > 0) {
+        HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
+        assert(uintptr_t(work) % 128 ==
+               0); // workspace must be aligned to 128 byte-boundary
+    }
+
+    // Execute
+    cudaStream_t stream;
+    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
+    HANDLE_CUTENSOR_ERROR(
+      cutensorContract(handle, plan, (void*)&alpha, lhs_d, rhs_d, (void*)&beta,
+                       output_d, output_d, work, actualWorkspaceSize, stream));
+
+    // Copy Results from Device
+    HANDLE_CUDA_ERROR(
+      cudaMemcpy(result.data(), output_d, output_size, cudaMemcpyDeviceToHost));
+
+    // Free allocated memory
+    HANDLE_CUTENSOR_ERROR(cutensorDestroy(handle));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyPlan(plan));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyOperationDescriptor(desc));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descLHS));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descRHS));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descOutput));
+    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
+    if(lhs_d) cudaFree(lhs_d);
+    if(rhs_d) cudaFree(rhs_d);
+    if(output_d) cudaFree(output_d);
+    if(work) cudaFree(work);
+}
+
+#undef HANDLE_CUTENSOR_ERROR
+#undef HANDLE_CUDA_ERROR
+
+// Template instantiations
+#define FUNCTION_INSTANTIATE(TYPE, RANK)                         \
+    template void cutensor_contraction<EigenTensor<TYPE, RANK>>( \
+      typename EigenTensor<TYPE, RANK>::label_type,              \
+      typename EigenTensor<TYPE, RANK>::label_type,              \
+      typename EigenTensor<TYPE, RANK>::label_type,              \
+      typename EigenTensor<TYPE, RANK>::const_shape_reference,   \
+      typename EigenTensor<TYPE, RANK>::const_pimpl_reference,   \
+      typename EigenTensor<TYPE, RANK>::const_pimpl_reference,   \
+      typename EigenTensor<TYPE, RANK>::eigen_reference)
+
+#define DEFINE_CUTENSOR_CONTRACTION(TYPE) \
+    FUNCTION_INSTANTIATE(TYPE, 0);        \
+    FUNCTION_INSTANTIATE(TYPE, 1);        \
+    FUNCTION_INSTANTIATE(TYPE, 2);        \
+    FUNCTION_INSTANTIATE(TYPE, 3);        \
+    FUNCTION_INSTANTIATE(TYPE, 4);        \
+    FUNCTION_INSTANTIATE(TYPE, 5);        \
+    FUNCTION_INSTANTIATE(TYPE, 6);        \
+    FUNCTION_INSTANTIATE(TYPE, 7);        \
+    FUNCTION_INSTANTIATE(TYPE, 8);        \
+    FUNCTION_INSTANTIATE(TYPE, 9);        \
+    FUNCTION_INSTANTIATE(TYPE, 10)
+
+TW_APPLY_FLOATING_POINT_TYPES(DEFINE_CUTENSOR_CONTRACTION);
+
+#undef DEFINE_CUTENSOR_CONTRACTION
+#undef FUNCTION_INSTANTIATE
 
 } // namespace tensorwrapper::buffer::detail_
 
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh
new file mode 100644
index 00000000..67f843b0
--- /dev/null
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2025 NWChemEx-Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#ifdef ENABLE_CUTENSOR
+#include "eigen_tensor.hpp"
+
+namespace tensorwrapper::buffer::detail_ {
+
+/** @brief Performs a tensor contraction on GPU
+ *
+ *  @param[in] olabel The labels for the modes of the output.
+ *  @param[in] llabel The labels for the modes of the left hand tensor.
+ *  @param[in] rlabel The labels for the modes of the right hand tensor.
+ *  @param[in] result_shape The intended shape of the result.
+ *  @param[in] lhs The left hand tensor.
+ *  @param[in] rhs The right hand tensor.
+ *  @param[in, out] result The eigen tensor where the results are stored.
+ *
+ *  @throw std::bad_alloc if there is a problem allocating the copy of
+ *                        @p layout. Strong throw guarantee.
+ */
+template<typename TensorType>
+void cutensor_contraction(
+  typename TensorType::label_type olabel,
+  typename TensorType::label_type llabel,
+  typename TensorType::label_type rlabel,
+  typename TensorType::const_shape_reference result_shape,
+  typename TensorType::const_pimpl_reference lhs,
+  typename TensorType::const_pimpl_reference rhs,
+  typename TensorType::eigen_reference result);
+
+} // namespace tensorwrapper::buffer::detail_
+
+#endif
\ No newline at end of file

From b6db64b089b329dc64d07b81b4bb3efafd914af9 Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Wed, 14 May 2025 13:05:08 -0500
Subject: [PATCH 6/7] clean up

---
 .../buffer/detail_/eigen_tensor.cu            | 119 +++++++++---------
 .../buffer/detail_/eigen_tensor.cuh           |  15 ++-
 2 files changed, 64 insertions(+), 70 deletions(-)

diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
index a60f5bc8..987023ee 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
@@ -47,7 +47,7 @@ using int64_vector_t = std::vector<int64_t>;
 
 // Convert a label into a vector of modes
 template<typename LabelType>
-mode_vector_t label_to_mode_vector(const LabelType& label) {
+mode_vector_t label_to_modes(const LabelType& label) {
     mode_vector_t mode;
     for(const auto& i : label) { mode.push_back(i.data()[0]); }
     return mode;
@@ -64,7 +64,7 @@ int64_vector_t get_extents(const InfoType& info) {
 }
 
 // Compute strides in row major
-int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) {
+int64_vector_t get_strides(std::size_t N, const int64_vector_t& extent) {
     int64_vector_t strides;
     for(std::size_t i = 0; i < N; ++i) {
         int64_t product = 1;
@@ -76,14 +76,13 @@ int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) {
 
 // Perform tensor contraction with cuTENSOR
 template<typename TensorType>
-void cutensor_contraction(
-  typename TensorType::label_type olabel,
-  typename TensorType::label_type llabel,
-  typename TensorType::label_type rlabel,
-  typename TensorType::const_shape_reference result_shape,
-  typename TensorType::const_pimpl_reference lhs,
-  typename TensorType::const_pimpl_reference rhs,
-  typename TensorType::eigen_reference result) {
+void cutensor_contraction(typename TensorType::label_type c_label,
+                          typename TensorType::label_type a_label,
+                          typename TensorType::label_type b_label,
+                          typename TensorType::const_shape_reference c_shape,
+                          typename TensorType::const_pimpl_reference A,
+                          typename TensorType::const_pimpl_reference B,
+                          typename TensorType::eigen_reference C) {
     using element_t    = typename TensorType::element_type;
     using eigen_data_t = typename TensorType::eigen_data_type;
 
@@ -92,46 +91,45 @@ void cutensor_contraction(
     element_t beta  = 0.0;
 
     // The modes of the tensors
-    mode_vector_t lhs_modes    = label_to_mode_vector(llabel);
-    mode_vector_t rhs_modes    = label_to_mode_vector(rlabel);
-    mode_vector_t output_modes = label_to_mode_vector(olabel);
+    mode_vector_t a_modes = label_to_modes(a_label);
+    mode_vector_t b_modes = label_to_modes(b_label);
+    mode_vector_t c_modes = label_to_modes(c_label);
 
     // The extents of each tensor
-    int64_vector_t lhs_extents    = get_extents(lhs);
-    int64_vector_t rhs_extents    = get_extents(rhs);
-    int64_vector_t output_extents = get_extents(result_shape.as_smooth());
+    int64_vector_t a_extents = get_extents(A);
+    int64_vector_t b_extents = get_extents(B);
+    int64_vector_t c_extents = get_extents(c_shape.as_smooth());
 
     // The strides of each tensor
-    int64_vector_t lhs_strides = row_major_strides(lhs.rank(), lhs_extents);
-    int64_vector_t rhs_strides = row_major_strides(rhs.rank(), rhs_extents);
-    int64_vector_t output_strides =
-      row_major_strides(result_shape.rank(), output_extents);
+    int64_vector_t a_strides = get_strides(A.rank(), a_extents);
+    int64_vector_t b_strides = get_strides(B.rank(), b_extents);
+    int64_vector_t c_strides = get_strides(c_shape.rank(), c_extents);
 
     // The size of each tensor
-    std::size_t lhs_size    = sizeof(element_t) * lhs.size();
-    std::size_t rhs_size    = sizeof(element_t) * rhs.size();
-    std::size_t output_size = sizeof(element_t) * result_shape.size();
+    std::size_t a_size = sizeof(element_t) * A.size();
+    std::size_t b_size = sizeof(element_t) * B.size();
+    std::size_t c_size = sizeof(element_t) * c_shape.size();
 
     // Allocate on device
-    void *lhs_d, *rhs_d, *output_d;
-    cudaMalloc((void**)&lhs_d, lhs_size);
-    cudaMalloc((void**)&rhs_d, rhs_size);
-    cudaMalloc((void**)&output_d, output_size);
+    void *A_d, *B_d, *C_d;
+    cudaMalloc((void**)&A_d, a_size);
+    cudaMalloc((void**)&B_d, b_size);
+    cudaMalloc((void**)&C_d, c_size);
 
     // Copy to data to device
-    HANDLE_CUDA_ERROR(cudaMemcpy(lhs_d, lhs.get_immutable_data(), lhs_size,
-                                 cudaMemcpyHostToDevice));
-    HANDLE_CUDA_ERROR(cudaMemcpy(rhs_d, rhs.get_immutable_data(), rhs_size,
-                                 cudaMemcpyHostToDevice));
     HANDLE_CUDA_ERROR(
-      cudaMemcpy(output_d, result.data(), output_size, cudaMemcpyHostToDevice));
+      cudaMemcpy(A_d, A.get_immutable_data(), a_size, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(
+      cudaMemcpy(B_d, B.get_immutable_data(), b_size, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(
+      cudaMemcpy(C_d, C.data(), c_size, cudaMemcpyHostToDevice));
 
     // Assert alignment
     const uint32_t kAlignment =
       128; // Alignment of the global-memory device pointers (bytes)
-    assert(uintptr_t(lhs_d) % kAlignment == 0);
-    assert(uintptr_t(rhs_d) % kAlignment == 0);
-    assert(uintptr_t(output_d) % kAlignment == 0);
+    assert(uintptr_t(A_d) % kAlignment == 0);
+    assert(uintptr_t(B_d) % kAlignment == 0);
+    assert(uintptr_t(C_d) % kAlignment == 0);
 
     // cuTENSOR traits
     cutensor_traits<element_t> traits;
@@ -141,32 +139,32 @@ void cutensor_contraction(
     HANDLE_CUTENSOR_ERROR(cutensorCreate(&handle));
 
     // Create Tensor Descriptors
-    cutensorTensorDescriptor_t descLHS;
+    cutensorTensorDescriptor_t descA;
     HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
-      handle, &descLHS, lhs.rank(), lhs_extents.data(), lhs_strides.data(),
+      handle, &descA, A.rank(), a_extents.data(), a_strides.data(),
       traits.cutensorDataType, kAlignment));
 
-    cutensorTensorDescriptor_t descRHS;
+    cutensorTensorDescriptor_t descB;
     HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
-      handle, &descRHS, rhs.rank(), rhs_extents.data(), rhs_strides.data(),
+      handle, &descB, B.rank(), b_extents.data(), b_strides.data(),
       traits.cutensorDataType, kAlignment));
 
-    cutensorTensorDescriptor_t descOutput;
+    cutensorTensorDescriptor_t descC;
     HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
-      handle, &descOutput, result_shape.rank(), output_extents.data(),
-      output_strides.data(), traits.cutensorDataType, kAlignment));
+      handle, &descResult, c_shape.rank(), c_extents.data(), c_strides.data(),
+      traits.cutensorDataType, kAlignment));
 
     // Create Contraction Descriptor
     cutensorOperationDescriptor_t desc;
     HANDLE_CUTENSOR_ERROR(cutensorCreateContraction(
-      handle, &desc,                                         // Base
-      descLHS, lhs_modes.data(), CUTENSOR_OP_IDENTITY,       // A
-      descRHS, rhs_modes.data(), CUTENSOR_OP_IDENTITY,       // B
-      descOutput, output_modes.data(), CUTENSOR_OP_IDENTITY, // C
-      descOutput, output_modes.data(), traits.descCompute    // Output
+      handle, &desc,                               // Base
+      descA, a_modes.data(), CUTENSOR_OP_IDENTITY, // A
+      descB, b_modes.data(), CUTENSOR_OP_IDENTITY, // B
+      descC, c_modes.data(), CUTENSOR_OP_IDENTITY, // C
+      descC, c_modes.data(), traits.descCompute    // Result
       ));
 
-    // Optional (but recommended): ensure that the scalar type is correct.
+    // Ensure that the scalar type is correct.
     cutensorDataType_t scalarType;
     HANDLE_CUTENSOR_ERROR(cutensorOperationDescriptorGetAttribute(
       handle, desc, CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
@@ -191,16 +189,13 @@ void cutensor_contraction(
     HANDLE_CUTENSOR_ERROR(
       cutensorCreatePlan(handle, &plan, desc, planPref, workspaceSizeEstimate));
 
-    // Optional: Query information about the created plan
-    // query actually used workspace
+    // Determine workspace size and allocate
     uint64_t actualWorkspaceSize = 0;
     HANDLE_CUTENSOR_ERROR(cutensorPlanGetAttribute(
       handle, plan, CUTENSOR_PLAN_REQUIRED_WORKSPACE, &actualWorkspaceSize,
       sizeof(actualWorkspaceSize)));
     assert(actualWorkspaceSize <= workspaceSizeEstimate);
 
-    // At this point the user knows exactly how much memory is need by the
-    // operation and only the smaller actual workspace needs to be allocated
     void* work = nullptr;
     if(actualWorkspaceSize > 0) {
         HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
@@ -211,25 +206,25 @@ void cutensor_contraction(
     // Execute
     cudaStream_t stream;
     HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
-    HANDLE_CUTENSOR_ERROR(
-      cutensorContract(handle, plan, (void*)&alpha, lhs_d, rhs_d, (void*)&beta,
-                       output_d, output_d, work, actualWorkspaceSize, stream));
+    HANDLE_CUTENSOR_ERROR(cutensorContract(handle, plan, (void*)&alpha, A_d,
+                                           B_d, (void*)&beta, C_d, C_d, work,
+                                           actualWorkspaceSize, stream));
 
     // Copy Results from Device
     HANDLE_CUDA_ERROR(
-      cudaMemcpy(result.data(), output_d, output_size, cudaMemcpyDeviceToHost));
+      cudaMemcpy(C.data(), C_d, c_size, cudaMemcpyDeviceToHost));
 
     // Free allocated memory
     HANDLE_CUTENSOR_ERROR(cutensorDestroy(handle));
     HANDLE_CUTENSOR_ERROR(cutensorDestroyPlan(plan));
     HANDLE_CUTENSOR_ERROR(cutensorDestroyOperationDescriptor(desc));
-    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descLHS));
-    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descRHS));
-    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descOutput));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descA));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descB));
+    HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descC));
     HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
-    if(lhs_d) cudaFree(lhs_d);
-    if(rhs_d) cudaFree(rhs_d);
-    if(output_d) cudaFree(output_d);
+    if(A_d) cudaFree(A_d);
+    if(B_d) cudaFree(B_d);
+    if(C_d) cudaFree(C_d);
     if(work) cudaFree(work);
 }
 
diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh
index 67f843b0..6e026667 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh
@@ -33,14 +33,13 @@ namespace tensorwrapper::buffer::detail_ {
  *                        @p layout. Strong throw guarantee.
  */
 template<typename TensorType>
-void cutensor_contraction(
-  typename TensorType::label_type olabel,
-  typename TensorType::label_type llabel,
-  typename TensorType::label_type rlabel,
-  typename TensorType::const_shape_reference result_shape,
-  typename TensorType::const_pimpl_reference lhs,
-  typename TensorType::const_pimpl_reference rhs,
-  typename TensorType::eigen_reference result);
+void cutensor_contraction(typename TensorType::label_type c_label,
+                          typename TensorType::label_type a_label,
+                          typename TensorType::label_type b_label,
+                          typename TensorType::const_shape_reference c_shape,
+                          typename TensorType::const_pimpl_reference A,
+                          typename TensorType::const_pimpl_reference B,
+                          typename TensorType::eigen_reference C);
 
 } // namespace tensorwrapper::buffer::detail_
 

From 0a65905f28f62ba4b6cc5c568c1d260e0752dd1b Mon Sep 17 00:00:00 2001
From: "Jonathan M. Waldrop" <relik107@gmail.com>
Date: Wed, 14 May 2025 13:06:42 -0500
Subject: [PATCH 7/7] missed one

---
 src/tensorwrapper/buffer/detail_/eigen_tensor.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
index 987023ee..516abff1 100644
--- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
+++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu
@@ -151,7 +151,7 @@ void cutensor_contraction(typename TensorType::label_type c_label,
 
     cutensorTensorDescriptor_t descC;
     HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor(
-      handle, &descResult, c_shape.rank(), c_extents.data(), c_strides.data(),
+      handle, &descC, c_shape.rank(), c_extents.data(), c_strides.data(),
       traits.cutensorDataType, kAlignment));
 
     // Create Contraction Descriptor