
chore(gpu): stf experiment #2252


Draft: wants to merge 3 commits into main
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/build.rs
@@ -61,6 +61,7 @@ fn main() {
}
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-lib=cuda");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
println!("cargo:rustc-link-lib=stdc++");

27 changes: 26 additions & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -88,11 +88,34 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# Fetch CPM.cmake directly from GitHub if not already present
include(FetchContent)
FetchContent_Declare(
CPM
GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
GIT_TAG v0.38.5 # replace with the desired version or main for latest
)
FetchContent_MakeAvailable(CPM)

include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
cpmaddpackage(
NAME
CCCL
GITHUB_REPOSITORY
"nvidia/cccl"
GIT_TAG
"main"
# The following is required to make the `CCCL::cudax` target available:
OPTIONS
"CCCL_ENABLE_UNSTABLE ON")

# In production, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
--use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")

set(INCLUDE_DIR include)

@@ -101,6 +124,8 @@ enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)

# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)

2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,5 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX cuda)
target_include_directories(tfhe_cuda_backend PRIVATE .)
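
For context, the CCCL::cudax target linked above is what provides the <cuda/experimental/stf.cuh> header used by the two bootstrap files below. A minimal sketch of the CUDASTF context/token/task model they adopt, with a hypothetical placeholder_kernel and helper name standing in for real work (not part of this PR):

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

__global__ void placeholder_kernel() {} // hypothetical stand-in for real work

void stf_sketch(cudaStream_t stream) {
  // Context backed by an existing CUDA stream; tasks are submitted eagerly.
  cudastf::context ctx(stream);

  // A logical token carries no data; it only expresses ordering constraints.
  auto token = ctx.logical_token();

  // The first task declares a write access on the token.
  ctx.task(token.write()).set_symbol("producer")->*[&](cudaStream_t s) {
    placeholder_kernel<<<1, 1, 0, s>>>();
  };

  // The second task reads the token, so it is ordered after the producer.
  ctx.task(token.read()).set_symbol("consumer")->*[&](cudaStream_t s) {
    placeholder_kernel<<<1, 1, 0, s>>>();
  };

  // Completes the context and waits for the submitted tasks.
  ctx.finalize();
}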
@@ -16,8 +16,11 @@
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <cuda/experimental/stf.cuh>
#include <vector>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
@@ -384,25 +387,49 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {

// Use a CUDA graph backend when the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto lwe_chunk_size = buffer->lwe_chunk_size;

auto buffer_token = ctx.logical_token();

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write())
.set_symbol("compute_keybundle")
->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
bootstrapping_key, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
level_count, lwe_offset);
};

// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
.set_symbol("accumulate")
->*
[&](cudaStream_t stf_stream) {
execute_cg_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, lwe_array_out,
lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, num_many_lut, lut_stride);
};
}
ctx.finalize();
}

// Verify if the grid size satisfies the cooperative group constraints
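In the loop above, the ordering between tasks follows from the declared token accesses: compute_keybundle writes buffer_token, accumulate reads key_token and takes buffer_token as read-write, so each accumulate runs after the keybundle of the same chunk, and the next chunk's keybundle (another write on buffer_token) is ordered after the previous accumulate. A condensed, self-contained sketch of that dependency pattern, with hypothetical fake_keybundle and fake_accumulate kernels standing in for the real calls:

#include <cstdio>
#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

// Hypothetical stand-ins for the keybundle and external-product steps.
__global__ void fake_keybundle(int chunk) { printf("keybundle %d\n", chunk); }
__global__ void fake_accumulate(int chunk) { printf("accumulate %d\n", chunk); }

int main() {
  cudastf::context ctx;
  auto buffer_token = ctx.logical_token(); // shared by all iterations

  for (int chunk = 0; chunk < 3; chunk++) {
    auto key_token = ctx.logical_token(); // fresh token per iteration

    // write on buffer_token: ordered after the previous iteration's accumulate
    ctx.task(key_token.write(), buffer_token.write())
            .set_symbol("compute_keybundle")
        ->*[&](cudaStream_t s) { fake_keybundle<<<1, 1, 0, s>>>(chunk); };

    // read on key_token plus rw on buffer_token: ordered after this chunk's keybundle
    ctx.task(key_token.read(), buffer_token.rw())
            .set_symbol("accumulate")
        ->*[&](cudaStream_t s) { fake_accumulate<<<1, 1, 0, s>>>(chunk); };
  }

  ctx.finalize();
  return 0;
}
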
@@ -16,8 +16,11 @@
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
#include <cuda/experimental/stf.cuh>
#include <vector>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -404,23 +407,44 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);

// Use a CUDA graph backend when the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto lwe_chunk_size = buffer->lwe_chunk_size;
auto buffer_token = ctx.logical_token();
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write())
.set_symbol("compute_keybundle")
->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes,
bootstrapping_key, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
level_count, lwe_offset);
};

// Accumulate
execute_tbc_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write())
.set_symbol("accumulate")
->*
[&](cudaStream_t stf_stream) {
execute_tbc_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
};
}
ctx.finalize();
}

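Both host functions select their backend from the USE_CUDA_GRAPH environment variable: by default tasks run through the stream-backed context, while a non-zero value switches to a graph context in which the same task bodies are recorded into a CUDA graph and launched at ctx.finalize(). A minimal sketch of that toggle in isolation, with a hypothetical placeholder_work kernel (not part of this PR):

#include <cstdlib>
#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

__global__ void placeholder_work() {} // hypothetical stand-in for a bootstrap step

void run_with_optional_graph(cudaStream_t stream) {
  // Same toggle as in the two host functions above.
  const char *use_graph_env = getenv("USE_CUDA_GRAPH");

  cudastf::context ctx(stream);
  if (use_graph_env && atoi(use_graph_env) != 0) {
    // Re-assign the generic context to a graph backend: task submission now
    // records work into a CUDA graph instead of launching it immediately.
    ctx = cudastf::graph_ctx(stream);
  }

  auto token = ctx.logical_token();
  ctx.task(token.write()).set_symbol("work")->*[&](cudaStream_t s) {
    placeholder_work<<<1, 1, 0, s>>>();
  };

  // Stream backend: waits for outstanding tasks.
  // Graph backend: instantiates the recorded graph and launches it.
  ctx.finalize();
}

Running with USE_CUDA_GRAPH=1 exercises the graph path; leaving the variable unset keeps the existing stream behaviour.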