From b26dd9d551d9a732cb44f95001285d358a457a82 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Tue, 6 May 2025 15:35:09 -0500 Subject: [PATCH 1/7] cmake update for cuTENSOR --- CMakeLists.txt | 30 ++++++++- cmake/FindcuTENSOR.cmake | 135 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 cmake/FindcuTENSOR.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index b39e8db6..e6931ccc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,16 +29,34 @@ include(get_cmaize) set(project_inc_dir "${CMAKE_CURRENT_LIST_DIR}/include/${PROJECT_NAME}") set(project_src_dir "${CMAKE_CURRENT_LIST_DIR}/src/${PROJECT_NAME}") +# Documentation include(nwx_cxx_api_docs) nwx_cxx_api_docs("${project_inc_dir}" "${project_src_dir}") +## Extensions ## +set(SOURCE_EXTS "cpp") +set(INCLUDE_EXTS "hpp") + ### Options ### cmaize_option_list( BUILD_TESTING OFF "Should we build the tests?" BUILD_PYBIND11_PYBINDINGS ON "Should we build Python3 bindings?" ENABLE_SIGMA OFF "Should we enable Sigma for uncertainty tracking?" + ENABLE_CUTENSOR OFF "Should we enable cuTENSOR?" ) +if("${ENABLE_CUTENSOR}") + if("${ENABLE_SIGMA}") + set(MSG "Sigma is not compatible with cuTENSOR. Turning Sigma OFF.") + message(WARNING ${MSG}) + set(ENABLE_SIGMA OFF) + endif() + enable_language(CUDA) + set(SOURCE_EXTS ${SOURCE_EXTS} cu) + set(INCLUDE_EXTS ${INCLUDE_EXTS} cuh hu) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +endif() + ### Dependendencies ### include(get_utilities) @@ -64,15 +82,25 @@ cmaize_find_or_build_optional_dependency( CMAKE_ARGS BUILD_TESTING=OFF ENABLE_EIGEN_SUPPORT=ON ) +set(DEPENDENCIES utilities parallelzone Boost::boost eigen sigma) + +if("${ENABLE_CUTENSOR}") + include(cmake/FindcuTENSOR.cmake) + list(APPEND DEPENDENCIES cuTENSOR::cuTENSOR) +endif() cmaize_add_library( ${PROJECT_NAME} SOURCE_DIR "${project_src_dir}" INCLUDE_DIRS "${project_inc_dir}" - DEPENDS utilities parallelzone Boost::boost eigen sigma + DEPENDS "${DEPENDENCIES}" ) target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}") +if("${ENABLE_CUTENSOR}") + target_compile_definitions("${PROJECT_NAME}" PUBLIC ENABLE_CUTENSOR) +endif() + include(nwx_pybind11) nwx_add_pybind11_module( ${PROJECT_NAME} diff --git a/cmake/FindcuTENSOR.cmake b/cmake/FindcuTENSOR.cmake new file mode 100644 index 00000000..17b14b48 --- /dev/null +++ b/cmake/FindcuTENSOR.cmake @@ -0,0 +1,135 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +#[=======================================================================[.rst: +FindcuTENSOR +-------- + +Find cuTENSOR + +Imported targets +^^^^^^^^^^^^^^^^ + +This module defines the following :prop_tgt:`IMPORTED` target(s): + +``cuTENSOR::cuTENSOR`` + The cuTENSOR library, if found. + +Result variables +^^^^^^^^^^^^^^^^ + +This module will set the following variables in your project: + +``cuTENSOR_FOUND`` + True if cuTENSOR is found. +``cuTENSOR_INCLUDE_DIRS`` + The include directories needed to use cuTENSOR. +``cuTENSOR_LIBRARIES`` + The libraries needed to usecuTENSOR. +``cuTENSOR_VERSION_STRING`` + The version of the cuTENSOR library found. [OPTIONAL] + +#]=======================================================================] + +# Prefer using a Config module if it exists for this project +set(cuTENSOR_NO_CONFIG FALSE) +if(NOT cuTENSOR_NO_CONFIG) + find_package(cuTENSOR CONFIG QUIET HINTS ${cutensor_DIR}) + if(cuTENSOR_FOUND) + find_package_handle_standard_args(cuTENSOR DEFAULT_MSG cuTENSOR_CONFIG) + return() + endif() +endif() + +find_path(cuTENSOR_INCLUDE_DIR NAMES cutensor.h ) + +set(cuTENSOR_IS_HEADER_ONLY FALSE) +if(NOT cuTENSOR_LIBRARY AND NOT cuTENSOR_IS_HEADER_ONLY) + find_library(cuTENSOR_LIBRARY_RELEASE NAMES libcutensor.so NAMES_PER_DIR ) + find_library(cuTENSOR_LIBRARY_DEBUG NAMES libcutensor.sod NAMES_PER_DIR ) + + include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake) + select_library_configurations(cuTENSOR) + unset(cuTENSOR_FOUND) #incorrectly set by select_library_configurations +endif() + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) + +if(cuTENSOR_IS_HEADER_ONLY) + find_package_handle_standard_args(cuTENSOR + REQUIRED_VARS cuTENSOR_INCLUDE_DIR + VERSION_VAR ) +else() + find_package_handle_standard_args(cuTENSOR + REQUIRED_VARS cuTENSOR_LIBRARY cuTENSOR_INCLUDE_DIR + VERSION_VAR ) +endif() + +if(NOT cuTENSOR_FOUND) + set(CUTENSOR_FILENAME libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive) + + message(STATUS "cuTENSOR not found. Downloading library. By continuing this download you accept to the license terms of cuTENSOR") + + CPMAddPackage( + NAME cutensor + VERSION ${CUTENSOR_VERSION} + URL https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive.tar.xz + # Eigen's CMakelists are not intended for library use + DOWNLOAD_ONLY YES + ) + + set(cuTENSOR_LIBRARY ${cutensor_SOURCE_DIR}/lib/${CUDAToolkit_VERSION_MAJOR}/libcutensor.so) + set(cuTENSOR_INCLUDE_DIR ${cutensor_SOURCE_DIR}/include) + + + set(cuTENSOR_FOUND TRUE) +endif() + +if(cuTENSOR_FOUND) + set(cuTENSOR_INCLUDE_DIRS ${cuTENSOR_INCLUDE_DIR}) + + if(NOT cuTENSOR_LIBRARIES) + set(cuTENSOR_LIBRARIES ${cuTENSOR_LIBRARY}) + endif() + + if(NOT TARGET cuTENSOR::cuTENSOR) + add_library(cuTENSOR::cuTENSOR UNKNOWN IMPORTED) + set_target_properties(cuTENSOR::cuTENSOR PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${cuTENSOR_INCLUDE_DIRS}") + + if(cuTENSOR_LIBRARY_RELEASE) + set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(cuTENSOR::cuTENSOR PROPERTIES + IMPORTED_LOCATION_RELEASE "${cuTENSOR_LIBRARY_RELEASE}") + endif() + + if(cuTENSOR_LIBRARY_DEBUG) + set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(cuTENSOR::cuTENSOR PROPERTIES + IMPORTED_LOCATION_DEBUG "${cuTENSOR_LIBRARY_DEBUG}") + endif() + + if(NOT cuTENSOR_LIBRARY_RELEASE AND NOT cuTENSOR_LIBRARY_DEBUG) + set_property(TARGET cuTENSOR::cuTENSOR APPEND PROPERTY + IMPORTED_LOCATION "${cuTENSOR_LIBRARY}") + endif() + endif() +endif() + +unset(cuTENSOR_NO_CONFIG) +unset(cuTENSOR_IS_HEADER_ONLY) \ No newline at end of file From 9914c1c3399266615c19746d27f0f87235c7eecb Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Tue, 6 May 2025 15:36:25 -0500 Subject: [PATCH 2/7] don't add license to FindcuTENSOR.cmake --- .github/.licenserc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml index 7cae0725..af8f72e3 100644 --- a/.github/.licenserc.yaml +++ b/.github/.licenserc.yaml @@ -22,6 +22,7 @@ header: - docs/Makefile - LICENSE - cmake/config.hpp.in + - cmake/FindcuTENSOR.cmake - docs/requirements.txt - docs/source/bibliography/*.bib - version.txt From b67321dbbbb55fb7aec56c1e6a4208038ed1b8d1 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Wed, 7 May 2025 12:17:21 -0500 Subject: [PATCH 3/7] add extensions to cmaize_add_library --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e6931ccc..15940ecb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,7 +92,9 @@ endif() cmaize_add_library( ${PROJECT_NAME} SOURCE_DIR "${project_src_dir}" + SOURCE_EXTS "${SOURCE_EXTS}" INCLUDE_DIRS "${project_inc_dir}" + INCLUDE_EXTS "${INCLUDE_EXTS}" DEPENDS "${DEPENDENCIES}" ) target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}") From be74b22d4281fc25783cf32d67f80666ad4cefd2 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Wed, 7 May 2025 12:39:02 -0500 Subject: [PATCH 4/7] stand-in CUDA files for debugging --- .../buffer/detail_/eigen_tensor.cpp | 7 +++++ .../buffer/detail_/eigen_tensor.cu | 28 +++++++++++++++++++ .../buffer/detail_/eigen_tensor.hu | 24 ++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.cu create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.hu diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp index 0d66206f..bb8f0000 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp @@ -18,6 +18,10 @@ #include "../contraction_planner.hpp" #include "eigen_tensor.hpp" +#ifdef ENABLE_CUTENSOR +#include "eigen_tensor.hu" +#endif + namespace tensorwrapper::buffer::detail_ { #define TPARAMS template @@ -94,6 +98,9 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels, const_shape_reference result_shape, const_pimpl_reference lhs, const_pimpl_reference rhs) { +#ifdef ENABLE_CUTENSOR + hello(); +#endif ContractionPlanner plan(olabels, llabels, rlabels); auto lt = lhs.clone(); diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu new file mode 100644 index 00000000..da9c7980 --- /dev/null +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu @@ -0,0 +1,28 @@ +/* + * Copyright 2025 NWChemEx-Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifdef ENABLE_CUTENSOR +#include "eigen_tensor.hu" +#include +#include +#include + +namespace tensorwrapper::buffer::detail_ { + +void hello() { std::cout << "Hello" << std::endl; } + +} // namespace tensorwrapper::buffer::detail_ + +#endif \ No newline at end of file diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu b/src/tensorwrapper/buffer/detail_/eigen_tensor.hu new file mode 100644 index 00000000..45e84977 --- /dev/null +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.hu @@ -0,0 +1,24 @@ +/* + * Copyright 2025 NWChemEx-Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifdef ENABLE_CUTENSOR + +namespace tensorwrapper::buffer::detail_ { + +void hello(); + +} // namespace tensorwrapper::buffer::detail_ + +#endif \ No newline at end of file From 6c6495207acc24438a05d0dd65757ff1ae38444b Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Tue, 13 May 2025 16:06:22 -0500 Subject: [PATCH 5/7] cutensor contraction working --- .../{eigen_tensor.hu => cutensor_traits.cuh} | 19 +- .../buffer/detail_/eigen_tensor.cpp | 16 +- .../buffer/detail_/eigen_tensor.cu | 252 +++++++++++++++++- .../buffer/detail_/eigen_tensor.cuh | 47 ++++ 4 files changed, 324 insertions(+), 10 deletions(-) rename src/tensorwrapper/buffer/detail_/{eigen_tensor.hu => cutensor_traits.cuh} (57%) create mode 100644 src/tensorwrapper/buffer/detail_/eigen_tensor.cuh diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu b/src/tensorwrapper/buffer/detail_/cutensor_traits.cuh similarity index 57% rename from src/tensorwrapper/buffer/detail_/eigen_tensor.hu rename to src/tensorwrapper/buffer/detail_/cutensor_traits.cuh index 45e84977..005039bd 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.hu +++ b/src/tensorwrapper/buffer/detail_/cutensor_traits.cuh @@ -13,11 +13,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #ifdef ENABLE_CUTENSOR +#include +#include namespace tensorwrapper::buffer::detail_ { -void hello(); +// Traits for cuTENSOR based on the floating point type +template +struct cutensor_traits {}; + +template<> +struct cutensor_traits { + cutensorDataType_t cutensorDataType = CUTENSOR_R_32F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F; +}; + +template<> +struct cutensor_traits { + cutensorDataType_t cutensorDataType = CUTENSOR_R_64F; + cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_64F; +}; } // namespace tensorwrapper::buffer::detail_ diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp index bb8f0000..7c08a5ee 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cpp @@ -19,7 +19,7 @@ #include "eigen_tensor.hpp" #ifdef ENABLE_CUTENSOR -#include "eigen_tensor.hu" +#include "eigen_tensor.cuh" #endif namespace tensorwrapper::buffer::detail_ { @@ -98,11 +98,18 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels, const_shape_reference result_shape, const_pimpl_reference lhs, const_pimpl_reference rhs) { -#ifdef ENABLE_CUTENSOR - hello(); -#endif ContractionPlanner plan(olabels, llabels, rlabels); +#ifdef ENABLE_CUTENSOR + // Prepare m_tensor_ + m_tensor_ = allocate_from_shape_(result_shape.as_smooth(), + std::make_index_sequence()); + m_tensor_.setZero(); + + // Dispatch to cuTENSOR + cutensor_contraction(olabels, llabels, rlabels, result_shape, lhs, + rhs, m_tensor_); +#else auto lt = lhs.clone(); auto rt = rhs.clone(); lt->permute_assignment(plan.lhs_permutation(), llabels, lhs); @@ -147,6 +154,7 @@ void EIGEN_TENSOR::contraction_assignment_(label_type olabels, } else { m_tensor_ = tensor; } +#endif mark_for_rehash_(); } diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu index da9c7980..a60f5bc8 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu @@ -14,14 +14,256 @@ * limitations under the License. */ #ifdef ENABLE_CUTENSOR -#include "eigen_tensor.hu" -#include -#include -#include +#include "cutensor_traits.cuh" +#include "eigen_tensor.cuh" +#include +#include namespace tensorwrapper::buffer::detail_ { -void hello() { std::cout << "Hello" << std::endl; } +// Handle cuda errors +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if(err != cudaSuccess) { \ + printf("Error: %s\n", cudaGetErrorString(err)); \ + exit(-1); \ + } \ + }; + +// Handle cuTENSOR errors +#define HANDLE_CUTENSOR_ERROR(x) \ + { \ + const auto err = x; \ + if(err != CUTENSOR_STATUS_SUCCESS) { \ + printf("Error: %s\n", cutensorGetErrorString(err)); \ + exit(-1); \ + } \ + }; + +// Some common typedefs +using mode_vector_t = std::vector; +using int64_vector_t = std::vector; + +// Convert a label into a vector of modes +template +mode_vector_t label_to_mode_vector(const LabelType& label) { + mode_vector_t mode; + for(const auto& i : label) { mode.push_back(i.data()[0]); } + return mode; +} + +// Query extent information from an input +template +int64_vector_t get_extents(const InfoType& info) { + int64_vector_t extent; + for(std::size_t i = 0; i < info.rank(); ++i) { + extent.push_back((int64_t)info.extent(i)); + } + return extent; +} + +// Compute strides in row major +int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) { + int64_vector_t strides; + for(std::size_t i = 0; i < N; ++i) { + int64_t product = 1; + for(std::size_t j = N - 1; j > i; --j) product *= extent[j]; + strides.push_back(product); + } + return strides; +} + +// Perform tensor contraction with cuTENSOR +template +void cutensor_contraction( + typename TensorType::label_type olabel, + typename TensorType::label_type llabel, + typename TensorType::label_type rlabel, + typename TensorType::const_shape_reference result_shape, + typename TensorType::const_pimpl_reference lhs, + typename TensorType::const_pimpl_reference rhs, + typename TensorType::eigen_reference result) { + using element_t = typename TensorType::element_type; + using eigen_data_t = typename TensorType::eigen_data_type; + + // GEMM alpha and beta (hardcoded for now) + element_t alpha = 1.0; + element_t beta = 0.0; + + // The modes of the tensors + mode_vector_t lhs_modes = label_to_mode_vector(llabel); + mode_vector_t rhs_modes = label_to_mode_vector(rlabel); + mode_vector_t output_modes = label_to_mode_vector(olabel); + + // The extents of each tensor + int64_vector_t lhs_extents = get_extents(lhs); + int64_vector_t rhs_extents = get_extents(rhs); + int64_vector_t output_extents = get_extents(result_shape.as_smooth()); + + // The strides of each tensor + int64_vector_t lhs_strides = row_major_strides(lhs.rank(), lhs_extents); + int64_vector_t rhs_strides = row_major_strides(rhs.rank(), rhs_extents); + int64_vector_t output_strides = + row_major_strides(result_shape.rank(), output_extents); + + // The size of each tensor + std::size_t lhs_size = sizeof(element_t) * lhs.size(); + std::size_t rhs_size = sizeof(element_t) * rhs.size(); + std::size_t output_size = sizeof(element_t) * result_shape.size(); + + // Allocate on device + void *lhs_d, *rhs_d, *output_d; + cudaMalloc((void**)&lhs_d, lhs_size); + cudaMalloc((void**)&rhs_d, rhs_size); + cudaMalloc((void**)&output_d, output_size); + + // Copy to data to device + HANDLE_CUDA_ERROR(cudaMemcpy(lhs_d, lhs.get_immutable_data(), lhs_size, + cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(rhs_d, rhs.get_immutable_data(), rhs_size, + cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR( + cudaMemcpy(output_d, result.data(), output_size, cudaMemcpyHostToDevice)); + + // Assert alignment + const uint32_t kAlignment = + 128; // Alignment of the global-memory device pointers (bytes) + assert(uintptr_t(lhs_d) % kAlignment == 0); + assert(uintptr_t(rhs_d) % kAlignment == 0); + assert(uintptr_t(output_d) % kAlignment == 0); + + // cuTENSOR traits + cutensor_traits traits; + + // cuTENSOR handle + cutensorHandle_t handle; + HANDLE_CUTENSOR_ERROR(cutensorCreate(&handle)); + + // Create Tensor Descriptors + cutensorTensorDescriptor_t descLHS; + HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( + handle, &descLHS, lhs.rank(), lhs_extents.data(), lhs_strides.data(), + traits.cutensorDataType, kAlignment)); + + cutensorTensorDescriptor_t descRHS; + HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( + handle, &descRHS, rhs.rank(), rhs_extents.data(), rhs_strides.data(), + traits.cutensorDataType, kAlignment)); + + cutensorTensorDescriptor_t descOutput; + HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( + handle, &descOutput, result_shape.rank(), output_extents.data(), + output_strides.data(), traits.cutensorDataType, kAlignment)); + + // Create Contraction Descriptor + cutensorOperationDescriptor_t desc; + HANDLE_CUTENSOR_ERROR(cutensorCreateContraction( + handle, &desc, // Base + descLHS, lhs_modes.data(), CUTENSOR_OP_IDENTITY, // A + descRHS, rhs_modes.data(), CUTENSOR_OP_IDENTITY, // B + descOutput, output_modes.data(), CUTENSOR_OP_IDENTITY, // C + descOutput, output_modes.data(), traits.descCompute // Output + )); + + // Optional (but recommended): ensure that the scalar type is correct. + cutensorDataType_t scalarType; + HANDLE_CUTENSOR_ERROR(cutensorOperationDescriptorGetAttribute( + handle, desc, CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, + (void*)&scalarType, sizeof(scalarType))); + assert(scalarType == traits.cutensorDataType); + + // Set the algorithm to use + const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT; + cutensorPlanPreference_t planPref; + HANDLE_CUTENSOR_ERROR(cutensorCreatePlanPreference(handle, &planPref, algo, + CUTENSOR_JIT_MODE_NONE)); + + // Query workspace estimate + uint64_t workspaceSizeEstimate = 0; + const cutensorWorksizePreference_t workspacePref = + CUTENSOR_WORKSPACE_DEFAULT; + HANDLE_CUTENSOR_ERROR(cutensorEstimateWorkspaceSize( + handle, desc, planPref, workspacePref, &workspaceSizeEstimate)); + + // Create Contraction Plan + cutensorPlan_t plan; + HANDLE_CUTENSOR_ERROR( + cutensorCreatePlan(handle, &plan, desc, planPref, workspaceSizeEstimate)); + + // Optional: Query information about the created plan + // query actually used workspace + uint64_t actualWorkspaceSize = 0; + HANDLE_CUTENSOR_ERROR(cutensorPlanGetAttribute( + handle, plan, CUTENSOR_PLAN_REQUIRED_WORKSPACE, &actualWorkspaceSize, + sizeof(actualWorkspaceSize))); + assert(actualWorkspaceSize <= workspaceSizeEstimate); + + // At this point the user knows exactly how much memory is need by the + // operation and only the smaller actual workspace needs to be allocated + void* work = nullptr; + if(actualWorkspaceSize > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize)); + assert(uintptr_t(work) % 128 == + 0); // workspace must be aligned to 128 byte-boundary + } + + // Execute + cudaStream_t stream; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); + HANDLE_CUTENSOR_ERROR( + cutensorContract(handle, plan, (void*)&alpha, lhs_d, rhs_d, (void*)&beta, + output_d, output_d, work, actualWorkspaceSize, stream)); + + // Copy Results from Device + HANDLE_CUDA_ERROR( + cudaMemcpy(result.data(), output_d, output_size, cudaMemcpyDeviceToHost)); + + // Free allocated memory + HANDLE_CUTENSOR_ERROR(cutensorDestroy(handle)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyPlan(plan)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyOperationDescriptor(desc)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descLHS)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descRHS)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descOutput)); + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); + if(lhs_d) cudaFree(lhs_d); + if(rhs_d) cudaFree(rhs_d); + if(output_d) cudaFree(output_d); + if(work) cudaFree(work); +} + +#undef HANDLE_CUTENSOR_ERROR +#undef HANDLE_CUDA_ERROR + +// Template instantiations +#define FUNCTION_INSTANTIATE(TYPE, RANK) \ + template void cutensor_contraction>( \ + typename EigenTensor::label_type, \ + typename EigenTensor::label_type, \ + typename EigenTensor::label_type, \ + typename EigenTensor::const_shape_reference, \ + typename EigenTensor::const_pimpl_reference, \ + typename EigenTensor::const_pimpl_reference, \ + typename EigenTensor::eigen_reference) + +#define DEFINE_CUTENSOR_CONTRACTION(TYPE) \ + FUNCTION_INSTANTIATE(TYPE, 0); \ + FUNCTION_INSTANTIATE(TYPE, 1); \ + FUNCTION_INSTANTIATE(TYPE, 2); \ + FUNCTION_INSTANTIATE(TYPE, 3); \ + FUNCTION_INSTANTIATE(TYPE, 4); \ + FUNCTION_INSTANTIATE(TYPE, 5); \ + FUNCTION_INSTANTIATE(TYPE, 6); \ + FUNCTION_INSTANTIATE(TYPE, 7); \ + FUNCTION_INSTANTIATE(TYPE, 8); \ + FUNCTION_INSTANTIATE(TYPE, 9); \ + FUNCTION_INSTANTIATE(TYPE, 10) + +TW_APPLY_FLOATING_POINT_TYPES(DEFINE_CUTENSOR_CONTRACTION); + +#undef DEFINE_CUTENSOR_CONTRACTION +#undef FUNCTION_INSTANTIATE } // namespace tensorwrapper::buffer::detail_ diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh new file mode 100644 index 00000000..67f843b0 --- /dev/null +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh @@ -0,0 +1,47 @@ +/* + * Copyright 2025 NWChemEx-Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#ifdef ENABLE_CUTENSOR +#include "eigen_tensor.hpp" + +namespace tensorwrapper::buffer::detail_ { + +/** @brief Performs a tensor contraction on GPU + * + * @param[in] olabel The labels for the modes of the output. + * @param[in] llabel The labels for the modes of the left hand tensor. + * @param[in] rlabel The labels for the modes of the right hand tensor. + * @param[in] result_shape The intended shape of the result. + * @param[in] lhs The left hand tensor. + * @param[in] rhs The right hand tensor. + * @param[in, out] result The eigen tensor where the results are stored. + * + * @throw std::bad_alloc if there is a problem allocating the copy of + * @p layout. Strong throw guarantee. + */ +template +void cutensor_contraction( + typename TensorType::label_type olabel, + typename TensorType::label_type llabel, + typename TensorType::label_type rlabel, + typename TensorType::const_shape_reference result_shape, + typename TensorType::const_pimpl_reference lhs, + typename TensorType::const_pimpl_reference rhs, + typename TensorType::eigen_reference result); + +} // namespace tensorwrapper::buffer::detail_ + +#endif \ No newline at end of file From b6db64b089b329dc64d07b81b4bb3efafd914af9 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Wed, 14 May 2025 13:05:08 -0500 Subject: [PATCH 6/7] clean up --- .../buffer/detail_/eigen_tensor.cu | 119 +++++++++--------- .../buffer/detail_/eigen_tensor.cuh | 15 ++- 2 files changed, 64 insertions(+), 70 deletions(-) diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu index a60f5bc8..987023ee 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu @@ -47,7 +47,7 @@ using int64_vector_t = std::vector; // Convert a label into a vector of modes template -mode_vector_t label_to_mode_vector(const LabelType& label) { +mode_vector_t label_to_modes(const LabelType& label) { mode_vector_t mode; for(const auto& i : label) { mode.push_back(i.data()[0]); } return mode; @@ -64,7 +64,7 @@ int64_vector_t get_extents(const InfoType& info) { } // Compute strides in row major -int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) { +int64_vector_t get_strides(std::size_t N, const int64_vector_t& extent) { int64_vector_t strides; for(std::size_t i = 0; i < N; ++i) { int64_t product = 1; @@ -76,14 +76,13 @@ int64_vector_t row_major_strides(std::size_t N, const int64_vector_t& extent) { // Perform tensor contraction with cuTENSOR template -void cutensor_contraction( - typename TensorType::label_type olabel, - typename TensorType::label_type llabel, - typename TensorType::label_type rlabel, - typename TensorType::const_shape_reference result_shape, - typename TensorType::const_pimpl_reference lhs, - typename TensorType::const_pimpl_reference rhs, - typename TensorType::eigen_reference result) { +void cutensor_contraction(typename TensorType::label_type c_label, + typename TensorType::label_type a_label, + typename TensorType::label_type b_label, + typename TensorType::const_shape_reference c_shape, + typename TensorType::const_pimpl_reference A, + typename TensorType::const_pimpl_reference B, + typename TensorType::eigen_reference C) { using element_t = typename TensorType::element_type; using eigen_data_t = typename TensorType::eigen_data_type; @@ -92,46 +91,45 @@ void cutensor_contraction( element_t beta = 0.0; // The modes of the tensors - mode_vector_t lhs_modes = label_to_mode_vector(llabel); - mode_vector_t rhs_modes = label_to_mode_vector(rlabel); - mode_vector_t output_modes = label_to_mode_vector(olabel); + mode_vector_t a_modes = label_to_modes(a_label); + mode_vector_t b_modes = label_to_modes(b_label); + mode_vector_t c_modes = label_to_modes(c_label); // The extents of each tensor - int64_vector_t lhs_extents = get_extents(lhs); - int64_vector_t rhs_extents = get_extents(rhs); - int64_vector_t output_extents = get_extents(result_shape.as_smooth()); + int64_vector_t a_extents = get_extents(A); + int64_vector_t b_extents = get_extents(B); + int64_vector_t c_extents = get_extents(c_shape.as_smooth()); // The strides of each tensor - int64_vector_t lhs_strides = row_major_strides(lhs.rank(), lhs_extents); - int64_vector_t rhs_strides = row_major_strides(rhs.rank(), rhs_extents); - int64_vector_t output_strides = - row_major_strides(result_shape.rank(), output_extents); + int64_vector_t a_strides = get_strides(A.rank(), a_extents); + int64_vector_t b_strides = get_strides(B.rank(), b_extents); + int64_vector_t c_strides = get_strides(c_shape.rank(), c_extents); // The size of each tensor - std::size_t lhs_size = sizeof(element_t) * lhs.size(); - std::size_t rhs_size = sizeof(element_t) * rhs.size(); - std::size_t output_size = sizeof(element_t) * result_shape.size(); + std::size_t a_size = sizeof(element_t) * A.size(); + std::size_t b_size = sizeof(element_t) * B.size(); + std::size_t c_size = sizeof(element_t) * c_shape.size(); // Allocate on device - void *lhs_d, *rhs_d, *output_d; - cudaMalloc((void**)&lhs_d, lhs_size); - cudaMalloc((void**)&rhs_d, rhs_size); - cudaMalloc((void**)&output_d, output_size); + void *A_d, *B_d, *C_d; + cudaMalloc((void**)&A_d, a_size); + cudaMalloc((void**)&B_d, b_size); + cudaMalloc((void**)&C_d, c_size); // Copy to data to device - HANDLE_CUDA_ERROR(cudaMemcpy(lhs_d, lhs.get_immutable_data(), lhs_size, - cudaMemcpyHostToDevice)); - HANDLE_CUDA_ERROR(cudaMemcpy(rhs_d, rhs.get_immutable_data(), rhs_size, - cudaMemcpyHostToDevice)); HANDLE_CUDA_ERROR( - cudaMemcpy(output_d, result.data(), output_size, cudaMemcpyHostToDevice)); + cudaMemcpy(A_d, A.get_immutable_data(), a_size, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR( + cudaMemcpy(B_d, B.get_immutable_data(), b_size, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR( + cudaMemcpy(C_d, C.data(), c_size, cudaMemcpyHostToDevice)); // Assert alignment const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes) - assert(uintptr_t(lhs_d) % kAlignment == 0); - assert(uintptr_t(rhs_d) % kAlignment == 0); - assert(uintptr_t(output_d) % kAlignment == 0); + assert(uintptr_t(A_d) % kAlignment == 0); + assert(uintptr_t(B_d) % kAlignment == 0); + assert(uintptr_t(C_d) % kAlignment == 0); // cuTENSOR traits cutensor_traits traits; @@ -141,32 +139,32 @@ void cutensor_contraction( HANDLE_CUTENSOR_ERROR(cutensorCreate(&handle)); // Create Tensor Descriptors - cutensorTensorDescriptor_t descLHS; + cutensorTensorDescriptor_t descA; HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( - handle, &descLHS, lhs.rank(), lhs_extents.data(), lhs_strides.data(), + handle, &descA, A.rank(), a_extents.data(), a_strides.data(), traits.cutensorDataType, kAlignment)); - cutensorTensorDescriptor_t descRHS; + cutensorTensorDescriptor_t descB; HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( - handle, &descRHS, rhs.rank(), rhs_extents.data(), rhs_strides.data(), + handle, &descB, B.rank(), b_extents.data(), b_strides.data(), traits.cutensorDataType, kAlignment)); - cutensorTensorDescriptor_t descOutput; + cutensorTensorDescriptor_t descC; HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( - handle, &descOutput, result_shape.rank(), output_extents.data(), - output_strides.data(), traits.cutensorDataType, kAlignment)); + handle, &descResult, c_shape.rank(), c_extents.data(), c_strides.data(), + traits.cutensorDataType, kAlignment)); // Create Contraction Descriptor cutensorOperationDescriptor_t desc; HANDLE_CUTENSOR_ERROR(cutensorCreateContraction( - handle, &desc, // Base - descLHS, lhs_modes.data(), CUTENSOR_OP_IDENTITY, // A - descRHS, rhs_modes.data(), CUTENSOR_OP_IDENTITY, // B - descOutput, output_modes.data(), CUTENSOR_OP_IDENTITY, // C - descOutput, output_modes.data(), traits.descCompute // Output + handle, &desc, // Base + descA, a_modes.data(), CUTENSOR_OP_IDENTITY, // A + descB, b_modes.data(), CUTENSOR_OP_IDENTITY, // B + descC, c_modes.data(), CUTENSOR_OP_IDENTITY, // C + descC, c_modes.data(), traits.descCompute // Result )); - // Optional (but recommended): ensure that the scalar type is correct. + // Ensure that the scalar type is correct. cutensorDataType_t scalarType; HANDLE_CUTENSOR_ERROR(cutensorOperationDescriptorGetAttribute( handle, desc, CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE, @@ -191,16 +189,13 @@ void cutensor_contraction( HANDLE_CUTENSOR_ERROR( cutensorCreatePlan(handle, &plan, desc, planPref, workspaceSizeEstimate)); - // Optional: Query information about the created plan - // query actually used workspace + // Determine workspace size and allocate uint64_t actualWorkspaceSize = 0; HANDLE_CUTENSOR_ERROR(cutensorPlanGetAttribute( handle, plan, CUTENSOR_PLAN_REQUIRED_WORKSPACE, &actualWorkspaceSize, sizeof(actualWorkspaceSize))); assert(actualWorkspaceSize <= workspaceSizeEstimate); - // At this point the user knows exactly how much memory is need by the - // operation and only the smaller actual workspace needs to be allocated void* work = nullptr; if(actualWorkspaceSize > 0) { HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize)); @@ -211,25 +206,25 @@ void cutensor_contraction( // Execute cudaStream_t stream; HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - HANDLE_CUTENSOR_ERROR( - cutensorContract(handle, plan, (void*)&alpha, lhs_d, rhs_d, (void*)&beta, - output_d, output_d, work, actualWorkspaceSize, stream)); + HANDLE_CUTENSOR_ERROR(cutensorContract(handle, plan, (void*)&alpha, A_d, + B_d, (void*)&beta, C_d, C_d, work, + actualWorkspaceSize, stream)); // Copy Results from Device HANDLE_CUDA_ERROR( - cudaMemcpy(result.data(), output_d, output_size, cudaMemcpyDeviceToHost)); + cudaMemcpy(C.data(), C_d, c_size, cudaMemcpyDeviceToHost)); // Free allocated memory HANDLE_CUTENSOR_ERROR(cutensorDestroy(handle)); HANDLE_CUTENSOR_ERROR(cutensorDestroyPlan(plan)); HANDLE_CUTENSOR_ERROR(cutensorDestroyOperationDescriptor(desc)); - HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descLHS)); - HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descRHS)); - HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descOutput)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descA)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descB)); + HANDLE_CUTENSOR_ERROR(cutensorDestroyTensorDescriptor(descC)); HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - if(lhs_d) cudaFree(lhs_d); - if(rhs_d) cudaFree(rhs_d); - if(output_d) cudaFree(output_d); + if(A_d) cudaFree(A_d); + if(B_d) cudaFree(B_d); + if(C_d) cudaFree(C_d); if(work) cudaFree(work); } diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh index 67f843b0..6e026667 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cuh @@ -33,14 +33,13 @@ namespace tensorwrapper::buffer::detail_ { * @p layout. Strong throw guarantee. */ template -void cutensor_contraction( - typename TensorType::label_type olabel, - typename TensorType::label_type llabel, - typename TensorType::label_type rlabel, - typename TensorType::const_shape_reference result_shape, - typename TensorType::const_pimpl_reference lhs, - typename TensorType::const_pimpl_reference rhs, - typename TensorType::eigen_reference result); +void cutensor_contraction(typename TensorType::label_type c_label, + typename TensorType::label_type a_label, + typename TensorType::label_type b_label, + typename TensorType::const_shape_reference c_shape, + typename TensorType::const_pimpl_reference A, + typename TensorType::const_pimpl_reference B, + typename TensorType::eigen_reference C); } // namespace tensorwrapper::buffer::detail_ From 0a65905f28f62ba4b6cc5c568c1d260e0752dd1b Mon Sep 17 00:00:00 2001 From: "Jonathan M. Waldrop" Date: Wed, 14 May 2025 13:06:42 -0500 Subject: [PATCH 7/7] missed one --- src/tensorwrapper/buffer/detail_/eigen_tensor.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu index 987023ee..516abff1 100644 --- a/src/tensorwrapper/buffer/detail_/eigen_tensor.cu +++ b/src/tensorwrapper/buffer/detail_/eigen_tensor.cu @@ -151,7 +151,7 @@ void cutensor_contraction(typename TensorType::label_type c_label, cutensorTensorDescriptor_t descC; HANDLE_CUTENSOR_ERROR(cutensorCreateTensorDescriptor( - handle, &descResult, c_shape.rank(), c_extents.data(), c_strides.data(), + handle, &descC, c_shape.rank(), c_extents.data(), c_strides.data(), traits.cutensorDataType, kAlignment)); // Create Contraction Descriptor