Skip to content

Commit 250f065

Browse files
fix(gpu): add indexes to modulus switch noise reduction
1 parent 406425d commit 250f065

File tree

10 files changed

+69
-71
lines changed

10 files changed

+69
-71
lines changed

backends/tfhe-cuda-backend/cuda/include/ciphertext.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
2828

2929
void cuda_improve_noise_modulus_switch_64(
3030
void *stream, uint32_t gpu_index, void *lwe_array_out,
31-
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
32-
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
33-
double r_sigma, double bound, uint32_t log_modulus);
31+
void const *lwe_array_in, void const *lwe_array_indexes,
32+
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
33+
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
34+
uint32_t log_modulus);
3435

3536
void cuda_glwe_sample_extract_128(
3637
void *stream, uint32_t gpu_index, void *lwe_array_out,

backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
248248
__uint128_t *global_accumulator;
249249
double *global_join_buffer;
250250
__uint128_t *temp_lwe_array_in;
251+
uint64_t *trivial_indexes;
251252

252253
PBS_VARIANT pbs_variant;
253254
bool uses_noise_reduction;
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
263264
cuda_set_device(gpu_index);
264265
this->pbs_variant = pbs_variant;
265266
this->uses_noise_reduction = allocate_ms_array;
266-
this->temp_lwe_array_in =
267-
(__uint128_t *)cuda_malloc_with_size_tracking_async(
268-
(lwe_dimension + 1) * input_lwe_ciphertext_count *
269-
sizeof(__uint128_t),
270-
stream, gpu_index, size_tracker, allocate_ms_array);
267+
if (allocate_ms_array) {
268+
this->temp_lwe_array_in =
269+
(__uint128_t *)cuda_malloc_with_size_tracking_async(
270+
(lwe_dimension + 1) * input_lwe_ciphertext_count *
271+
sizeof(__uint128_t),
272+
stream, gpu_index, size_tracker, allocate_ms_array);
273+
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
274+
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
275+
size_tracker, allocate_ms_array);
276+
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
277+
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
278+
h_trivial_indexes[i] = i;
279+
280+
cuda_memcpy_with_size_tracking_async_to_gpu(
281+
trivial_indexes, h_trivial_indexes,
282+
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
283+
allocate_gpu_memory);
284+
285+
cuda_synchronize_stream(stream, gpu_index);
286+
delete[] h_trivial_indexes;
287+
}
271288
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
272289
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
273290
input_lwe_ciphertext_count *
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
404421
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
405422
gpu_memory_allocated);
406423

407-
if (uses_noise_reduction)
424+
if (uses_noise_reduction) {
408425
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
409426
gpu_memory_allocated);
427+
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
428+
gpu_memory_allocated);
429+
}
410430
}
411431
};
412432

backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
8686

8787
void cuda_improve_noise_modulus_switch_64(
8888
void *stream, uint32_t gpu_index, void *lwe_array_out,
89-
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
90-
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
91-
double r_sigma, double bound, uint32_t log_modulus) {
89+
void const *lwe_array_in, void const *lwe_array_indexes,
90+
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
91+
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
92+
uint32_t log_modulus) {
9293
host_improve_noise_modulus_switch<uint64_t>(
9394
static_cast<cudaStream_t>(stream), gpu_index,
9495
static_cast<uint64_t *>(lwe_array_out),
9596
static_cast<uint64_t const *>(lwe_array_in),
97+
static_cast<uint64_t const *>(lwe_array_indexes),
9698
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
9799
num_zeros, input_variance, r_sigma, bound, log_modulus);
98100
}

backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
178178

179179
// Each thread processes two elements of the lwe array
180180
template <typename Torus>
181-
__global__ void
182-
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
183-
const Torus *zeros, int lwe_size, int num_zeros,
184-
double input_variance, double r_sigma,
185-
double bound, uint32_t log_modulus) {
181+
__global__ void improve_noise_modulus_switch(
182+
Torus *array_out, const Torus *array_in, const uint64_t *indexes,
183+
const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
184+
double r_sigma, double bound, uint32_t log_modulus) {
186185

187186
// First we will assume size is less than the number of threads per block
188187
// I should switch this to dynamic shared memory
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
198197
// This probably are not needed cause we are setting the values
199198
sum_mask_errors[threadIdx.x] = 0.f;
200199
sum_squared_mask_errors[threadIdx.x] = 0.f;
200+
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
201+
auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
202+
Torus input_element1 = this_block_lwe_in[threadIdx.x];
201203

202-
Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
203-
204-
Torus input_element2 =
205-
threadIdx.x + blockDim.x < lwe_size
206-
? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
207-
: 0;
204+
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
205+
? this_block_lwe_in[threadIdx.x + blockDim.x]
206+
: 0;
208207

209208
// Base noise is only handled by thread 0
210209
double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
218217
__syncthreads();
219218

220219
if (found)
221-
array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
220+
this_block_lwe_out[threadIdx.x] = input_element1;
222221

223222
if (found && (threadIdx.x + blockDim.x) < lwe_size)
224-
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
225-
input_element2;
223+
this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
226224

227225
__syncthreads();
228226
// If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
253251
// Assumption we always have at least 512 elements
254252
// If we find a useful zero encryption we replace the lwe by lwe + zero
255253
if (found)
256-
array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
254+
this_block_lwe_out[threadIdx.x] = zero_element1;
257255

258256
if (found && (threadIdx.x + blockDim.x) < lwe_size)
259-
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
260-
zero_element2;
257+
this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
261258

262259
__syncthreads();
263260
// If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
270267
template <typename Torus>
271268
__host__ void host_improve_noise_modulus_switch(
272269
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
273-
Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
274-
uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
275-
const double r_sigma, const double bound, uint32_t log_modulus) {
270+
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
271+
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
272+
const double input_variance, const double r_sigma, const double bound,
273+
uint32_t log_modulus) {
276274

277275
if (lwe_size < 512) {
278276
PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
289287
int num_threads = 512, num_blocks = num_lwes;
290288

291289
improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
292-
array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
293-
bound, log_modulus);
290+
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
291+
r_sigma, bound, log_modulus);
294292
check_cuda_error(cudaGetLastError());
295293
}
296294

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,8 @@ void execute_pbs_async(
194194
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
195195

196196
void *zeros = nullptr;
197-
if (ms_noise_reduction_key != nullptr)
197+
if (ms_noise_reduction_key != nullptr &&
198+
ms_noise_reduction_key->ptr != nullptr)
198199
zeros = ms_noise_reduction_key->ptr[i];
199200
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
200201
streams[i], gpu_indexes[i], current_lwe_array_out,

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
660660
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
661661

662662
// If the parameters contain noise reduction key, then apply it
663-
if (ms_noise_reduction_key != nullptr) {
663+
if (ms_noise_reduction_key != nullptr &&
664+
ms_noise_reduction_key->ptr != nullptr) {
664665
if (ms_noise_reduction_key->num_zeros != 0) {
665666
uint32_t log_modulus = log2(polynomial_size) + 1;
666667
host_improve_noise_modulus_switch<uint64_t>(
667668
static_cast<cudaStream_t>(stream), gpu_index,
668669
buffer->temp_lwe_array_in,
669670
static_cast<uint64_t const *>(lwe_array_in),
671+
static_cast<uint64_t const *>(lwe_input_indexes),
670672
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
671673
num_samples, ms_noise_reduction_key->num_zeros,
672674
ms_noise_reduction_key->ms_input_variance,

backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
256256
static_cast<cudaStream_t>(stream), gpu_index,
257257
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
258258
static_cast<__uint128_t const *>(lwe_array_in),
259+
static_cast<uint64_t const *>(buffer->trivial_indexes),
259260
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
260261
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
261262
ms_noise_reduction_key->ms_input_variance,

backends/tfhe-cuda-backend/src/bindings.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ unsafe extern "C" {
5050
gpu_index: u32,
5151
lwe_array_out: *mut ffi::c_void,
5252
lwe_array_in: *const ffi::c_void,
53+
lwe_array_indexes: *const ffi::c_void,
5354
encrypted_zeros: *const ffi::c_void,
5455
lwe_size: u32,
5556
num_lwes: u32,

tfhe/src/core_crypto/gpu/algorithms/test/modulus_switch_noise_reduction.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
use super::super::test::TestResources;
22
use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
33
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
4-
use crate::core_crypto::gpu::CudaStreams;
4+
use crate::core_crypto::gpu::{CudaStreams, CudaVec};
55
use crate::core_crypto::prelude::*;
66

77
use crate::core_crypto::gpu::GpuIndex;
8+
use itertools::Itertools;
89
use rayon::iter::{IntoParallelIterator, ParallelIterator};
910
use std::cell::RefCell;
1011
use tfhe_cuda_backend::bindings::{
@@ -147,6 +148,10 @@ fn check_noise_improve_modulus_switch_noise(
147148

148149
let gpu_index = 0;
149150
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
151+
let num_blocks = 1;
152+
let lwe_indexes: Vec<u64> = (0..num_blocks).map(|x| x as u64).collect();
153+
let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &streams, 0) };
154+
unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0) };
150155

151156
let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
152157
&encryptions_of_zero,
@@ -186,6 +191,7 @@ fn check_noise_improve_modulus_switch_noise(
186191
streams.gpu_indexes[0].get(),
187192
d_ct.0.d_vec.as_mut_c_ptr(0),
188193
d_ct_in.0.d_vec.as_c_ptr(0),
194+
d_input_indexes.as_c_ptr(0),
189195
d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
190196
lwe_dimension.to_lwe_size().0 as u32,
191197
d_ct.lwe_ciphertext_count().0 as u32,

tfhe/src/core_crypto/gpu/mod.rs

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -606,40 +606,6 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
606606
);
607607
}
608608

609-
/// # Safety
610-
///
611-
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
612-
/// required
613-
#[allow(clippy::too_many_arguments)]
614-
pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
615-
streams: &CudaStreams,
616-
lwe_array_out: &mut CudaVec<T>,
617-
lwe_array_in: &CudaVec<T>,
618-
encrypted_zeros: &CudaVec<T>,
619-
lwe_dimension: LweDimension,
620-
num_samples: u32,
621-
num_zeros: u32,
622-
input_variance: f64,
623-
r_sigma_factor: f64,
624-
bound: f64,
625-
log_modulus: u32,
626-
) {
627-
cuda_improve_noise_modulus_switch_64(
628-
streams.ptr[0],
629-
streams.gpu_indexes[0].get(),
630-
lwe_array_out.as_mut_c_ptr(0),
631-
lwe_array_in.as_c_ptr(0),
632-
encrypted_zeros.as_c_ptr(0),
633-
lwe_dimension.to_lwe_size().0 as u32,
634-
num_samples,
635-
num_zeros,
636-
input_variance,
637-
r_sigma_factor,
638-
bound,
639-
log_modulus,
640-
);
641-
}
642-
643609
/// Addition of a vector of LWE ciphertexts
644610
///
645611
/// # Safety

0 commit comments

Comments
 (0)