
chore(gpu): stf experiment #2252


Draft: wants to merge 3 commits into main
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/build.rs
@@ -61,6 +61,7 @@ fn main() {
}
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-lib=cuda");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
println!("cargo:rustc-link-lib=stdc++");

27 changes: 26 additions & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -88,11 +88,34 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# Fetch CPM.cmake directly from GitHub if not already present
include(FetchContent)
FetchContent_Declare(
CPM
GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
GIT_TAG v0.38.5 # replace with the desired version or main for latest
)
FetchContent_MakeAvailable(CPM)

include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
cpmaddpackage(
NAME
CCCL
GITHUB_REPOSITORY
"nvidia/cccl"
GIT_TAG
"main"
# The following is required to make the `CCCL::cudax` target available:
OPTIONS
"CCCL_ENABLE_UNSTABLE ON")

# In production, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
--use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")

set(INCLUDE_DIR include)

@@ -101,6 +124,8 @@ enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)

# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)

2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,5 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX cuda)
target_include_directories(tfhe_cuda_backend PRIVATE .)
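
For context, the CCCL::cudax target linked above is what provides the <cuda/experimental/stf.cuh> header used by the two bootstrap files below. A minimal sketch of the CUDASTF context/token/task model they adopt, with a hypothetical placeholder_kernel and helper name standing in for real work (not part of this PR):

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

__global__ void placeholder_kernel() {} // hypothetical stand-in for real work

void stf_sketch(cudaStream_t stream) {
  // Context backed by an existing CUDA stream; tasks are submitted eagerly.
  cudastf::context ctx(stream);

  // A logical token carries no data; it only expresses ordering constraints.
  auto token = ctx.logical_token();

  // The first task declares a write access on the token.
  ctx.task(token.write()).set_symbol("producer")->*[&](cudaStream_t s) {
    placeholder_kernel<<<1, 1, 0, s>>>();
  };

  // The second task reads the token, so it is ordered after the producer.
  ctx.task(token.read()).set_symbol("consumer")->*[&](cudaStream_t s) {
    placeholder_kernel<<<1, 1, 0, s>>>();
  };

  // Completes the context and waits for the submitted tasks.
  ctx.finalize();
}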
@@ -16,8 +16,11 @@
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <cuda/experimental/stf.cuh>
#include <vector>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
@@ -384,25 +387,49 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {

// Use a CUDA graph backend when the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto lwe_chunk_size = buffer->lwe_chunk_size;

auto buffer_token = ctx.logical_token();

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write())
.set_symbol("compute_keybundle")
->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
bootstrapping_key, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
level_count, lwe_offset);
};

// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
.set_symbol("accumulate")
->*
[&](cudaStream_t stf_stream) {
execute_cg_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, lwe_array_out,
lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, num_many_lut, lut_stride);
};
}
ctx.finalize();
}

// Verify if the grid size satisfies the cooperative group constraints
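In the loop above, the ordering between tasks follows from the declared token accesses: compute_keybundle writes buffer_token, accumulate reads key_token and takes buffer_token as read-write, so each accumulate runs after the keybundle of the same chunk, and the next chunk's keybundle (another write on buffer_token) is ordered after the previous accumulate. A condensed, self-contained sketch of that dependency pattern, with hypothetical fake_keybundle and fake_accumulate kernels standing in for the real calls:

#include <cstdio>
#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

// Hypothetical stand-ins for the keybundle and external-product steps.
__global__ void fake_keybundle(int chunk) { printf("keybundle %d\n", chunk); }
__global__ void fake_accumulate(int chunk) { printf("accumulate %d\n", chunk); }

int main() {
  cudastf::context ctx;
  auto buffer_token = ctx.logical_token(); // shared by all iterations

  for (int chunk = 0; chunk < 3; chunk++) {
    auto key_token = ctx.logical_token(); // fresh token per iteration

    // write on buffer_token: ordered after the previous iteration's accumulate
    ctx.task(key_token.write(), buffer_token.write())
            .set_symbol("compute_keybundle")
        ->*[&](cudaStream_t s) { fake_keybundle<<<1, 1, 0, s>>>(chunk); };

    // read on key_token plus rw on buffer_token: ordered after this chunk's keybundle
    ctx.task(key_token.read(), buffer_token.rw())
            .set_symbol("accumulate")
        ->*[&](cudaStream_t s) { fake_accumulate<<<1, 1, 0, s>>>(chunk); };
  }

  ctx.finalize();
  return 0;
}
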
@@ -16,8 +16,11 @@
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
#include <cuda/experimental/stf.cuh>
#include <vector>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -404,23 +407,44 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);

// Use a CUDA graph backend when the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto lwe_chunk_size = buffer->lwe_chunk_size;
auto buffer_token = ctx.logical_token();
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write())
.set_symbol("compute_keybundle")
->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
bootstrapping_key, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
level_count, lwe_offset);
};

// Accumulate
execute_tbc_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
.set_symbol("accumulate")
->*
[&](cudaStream_t stf_stream) {
execute_tbc_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
};
}
ctx.finalize();
}

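Both host functions select their backend from the USE_CUDA_GRAPH environment variable: by default tasks run through the stream-backed context, while a non-zero value switches to a graph context in which the same task bodies are recorded into a CUDA graph and launched at ctx.finalize(). A minimal sketch of that toggle in isolation, with a hypothetical placeholder_work kernel (not part of this PR):

#include <cstdlib>
#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

__global__ void placeholder_work() {} // hypothetical stand-in for a bootstrap step

void run_with_optional_graph(cudaStream_t stream) {
  // Same toggle as in the two host functions above.
  const char *use_graph_env = getenv("USE_CUDA_GRAPH");

  cudastf::context ctx(stream);
  if (use_graph_env && atoi(use_graph_env) != 0) {
    // Re-assign the generic context to a graph backend: task submission now
    // records work into a CUDA graph instead of launching it immediately.
    ctx = cudastf::graph_ctx(stream);
  }

  auto token = ctx.logical_token();
  ctx.task(token.write()).set_symbol("work")->*[&](cudaStream_t s) {
    placeholder_work<<<1, 1, 0, s>>>();
  };

  // Stream backend: waits for outstanding tasks.
  // Graph backend: instantiates the recorded graph and launches it.
  ctx.finalize();
}

Running with USE_CUDA_GRAPH=1 exercises the graph path; leaving the variable unset keeps the existing stream behaviour.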