fix(gpu): add indexes to modulus switch noise reduction

guillermo-oyarzun · guillermo-oyarzun · commit 250f065d3472 · 2025-05-14T10:30:20.000+02:00
diff --git a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus);
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);
 
 void cuda_glwe_sample_extract_128(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
   __uint128_t *global_accumulator;
   double *global_join_buffer;
   __uint128_t *temp_lwe_array_in;
+  uint64_t *trivial_indexes;
 
   PBS_VARIANT pbs_variant;
   bool uses_noise_reduction;
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
     cuda_set_device(gpu_index);
     this->pbs_variant = pbs_variant;
     this->uses_noise_reduction = allocate_ms_array;
-    this->temp_lwe_array_in =
-        (__uint128_t *)cuda_malloc_with_size_tracking_async(
-            (lwe_dimension + 1) * input_lwe_ciphertext_count *
-                sizeof(__uint128_t),
-            stream, gpu_index, size_tracker, allocate_ms_array);
+    if (allocate_ms_array) {
+      this->temp_lwe_array_in =
+          (__uint128_t *)cuda_malloc_with_size_tracking_async(
+              (lwe_dimension + 1) * input_lwe_ciphertext_count *
+                  sizeof(__uint128_t),
+              stream, gpu_index, size_tracker, allocate_ms_array);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_ms_array);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;
+
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
     size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                      input_lwe_ciphertext_count *
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
       cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                          gpu_memory_allocated);
 
-    if (uses_noise_reduction)
+    if (uses_noise_reduction) {
       cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
                                          gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
   }
 };
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus) {
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus) {
   host_improve_noise_modulus_switch<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t const *>(lwe_array_in),
+      static_cast<uint64_t const *>(lwe_array_indexes),
       static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
       num_zeros, input_variance, r_sigma, bound, log_modulus);
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
 
 // Each thread processes two elements of the lwe array
 template <typename Torus>
-__global__ void
-improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
-                             const Torus *zeros, int lwe_size, int num_zeros,
-                             double input_variance, double r_sigma,
-                             double bound, uint32_t log_modulus) {
+__global__ void improve_noise_modulus_switch(
+    Torus *array_out, const Torus *array_in, const uint64_t *indexes,
+    const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
+    double r_sigma, double bound, uint32_t log_modulus) {
 
   // First we will assume size is less than the number of threads per block
   // I should switch this to dynamic shared memory
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   // This probably are not needed cause we are setting the values
   sum_mask_errors[threadIdx.x] = 0.f;
   sum_squared_mask_errors[threadIdx.x] = 0.f;
+  auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
+  auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
+  Torus input_element1 = this_block_lwe_in[threadIdx.x];
 
-  Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
-
-  Torus input_element2 =
-      threadIdx.x + blockDim.x < lwe_size
-          ? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
-          : 0;
+  Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
+                             ? this_block_lwe_in[threadIdx.x + blockDim.x]
+                             : 0;
 
   // Base noise is only handled by thread 0
   double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   __syncthreads();
 
   if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
+    this_block_lwe_out[threadIdx.x] = input_element1;
 
   if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        input_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
 
   __syncthreads();
   // If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
     // Assumption we always have at least 512 elements
     // If we find a useful zero encryption we replace the lwe by lwe + zero
     if (found)
-      array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
+      this_block_lwe_out[threadIdx.x] = zero_element1;
 
     if (found && (threadIdx.x + blockDim.x) < lwe_size)
-      array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-          zero_element2;
+      this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
 
     __syncthreads();
     // If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
 template <typename Torus>
 __host__ void host_improve_noise_modulus_switch(
     cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
-    Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
-    uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
-    const double r_sigma, const double bound, uint32_t log_modulus) {
+    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
+    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
+    const double input_variance, const double r_sigma, const double bound,
+    uint32_t log_modulus) {
 
   if (lwe_size < 512) {
     PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
   int num_threads = 512, num_blocks = num_lwes;
 
   improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
-      bound, log_modulus);
+      array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+      r_sigma, bound, log_modulus);
   check_cuda_error(cudaGetLastError());
 }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -194,7 +194,8 @@ void execute_pbs_async(
             lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
 
         void *zeros = nullptr;
-        if (ms_noise_reduction_key != nullptr)
+        if (ms_noise_reduction_key != nullptr &&
+            ms_noise_reduction_key->ptr != nullptr)
           zeros = ms_noise_reduction_key->ptr[i];
         cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
             streams[i], gpu_indexes[i], current_lwe_array_out,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
       (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
 
   // If the parameters contain noise reduction key, then apply it
-  if (ms_noise_reduction_key != nullptr) {
+  if (ms_noise_reduction_key != nullptr &&
+      ms_noise_reduction_key->ptr != nullptr) {
     if (ms_noise_reduction_key->num_zeros != 0) {
       uint32_t log_modulus = log2(polynomial_size) + 1;
       host_improve_noise_modulus_switch<uint64_t>(
           static_cast<cudaStream_t>(stream), gpu_index,
           buffer->temp_lwe_array_in,
           static_cast<uint64_t const *>(lwe_array_in),
+          static_cast<uint64_t const *>(lwe_input_indexes),
           static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
           num_samples, ms_noise_reduction_key->num_zeros,
           ms_noise_reduction_key->ms_input_variance,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
         static_cast<cudaStream_t>(stream), gpu_index,
         static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
         static_cast<__uint128_t const *>(lwe_array_in),
+        static_cast<uint64_t const *>(buffer->trivial_indexes),
         static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
         lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
         ms_noise_reduction_key->ms_input_variance,
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -50,6 +50,7 @@ unsafe extern "C" {
         gpu_index: u32,
         lwe_array_out: *mut ffi::c_void,
         lwe_array_in: *const ffi::c_void,
+        lwe_array_indexes: *const ffi::c_void,
         encrypted_zeros: *const ffi::c_void,
         lwe_size: u32,
         num_lwes: u32,
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/modulus_switch_noise_reduction.rs b/tfhe/src/core_crypto/gpu/algorithms/test/modulus_switch_noise_reduction.rs
@@ -1,10 +1,11 @@
 use super::super::test::TestResources;
 use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
-use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::gpu::{CudaStreams, CudaVec};
 use crate::core_crypto::prelude::*;
 
 use crate::core_crypto::gpu::GpuIndex;
+use itertools::Itertools;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::cell::RefCell;
 use tfhe_cuda_backend::bindings::{
@@ -147,6 +148,10 @@ fn check_noise_improve_modulus_switch_noise(
 
     let gpu_index = 0;
     let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+    let num_blocks = 1;
+    let lwe_indexes: Vec<u64> = (0..num_blocks).map(|x| x as u64).collect();
+    let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &streams, 0) };
+    unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0) };
 
     let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
         &encryptions_of_zero,
@@ -186,6 +191,7 @@ fn check_noise_improve_modulus_switch_noise(
                             streams.gpu_indexes[0].get(),
                             d_ct.0.d_vec.as_mut_c_ptr(0),
                             d_ct_in.0.d_vec.as_c_ptr(0),
+                            d_input_indexes.as_c_ptr(0),
                             d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
                             lwe_dimension.to_lwe_size().0 as u32,
                             d_ct.lwe_ciphertext_count().0 as u32,
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
@@ -606,40 +606,6 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
     );
 }
 
-/// # Safety
-///
-/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
-/// required
-#[allow(clippy::too_many_arguments)]
-pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
-    streams: &CudaStreams,
-    lwe_array_out: &mut CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
-    encrypted_zeros: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
-    num_zeros: u32,
-    input_variance: f64,
-    r_sigma_factor: f64,
-    bound: f64,
-    log_modulus: u32,
-) {
-    cuda_improve_noise_modulus_switch_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        lwe_array_out.as_mut_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
-        encrypted_zeros.as_c_ptr(0),
-        lwe_dimension.to_lwe_size().0 as u32,
-        num_samples,
-        num_zeros,
-        input_variance,
-        r_sigma_factor,
-        bound,
-        log_modulus,
-    );
-}
-
 /// Addition of a vector of LWE ciphertexts
 ///
 /// # Safety