Skip to content

Commit 1f9f770

Browse files
Copilot and pmudry authored
GPU performance optimizations for CUDA and OptiX real-time path tracing and firefly HDRI artifacts (#26)
* Initial plan * Implement CUDA and OptiX performance optimizations - Precompute inverse ray direction for BVH traversal (eliminates 3 reciprocal ops per AABB test) - Add __launch_bounds__(256) to render kernel for better register allocation - GPU-side converged pixel counting via warp-shuffle reduction (replaces host-side D2H copy) - Dedicated compute stream for CUDA render kernel (enables overlap with display pipeline) - OptiX: dedicated CUDA stream for launches (replaces cudaDeviceSynchronize) - OptiX: GPU-side gamma correction + pinned memory async D2H pipeline - OptiX: proper resource cleanup for new stream/buffer resources Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Add GPU performance techniques documentation Create website/docs/how-it-works/gpu-performance.md documenting: - Precomputed inverse ray direction for BVH traversal - __launch_bounds__ on render kernel - GPU-side converged pixel counting (warp-shuffle) - Dedicated CUDA streams for render and display - OptiX dedicated render stream - OptiX GPU-side gamma correction with pinned memory - Summary comparison table of all techniques Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Add GPU performance techniques docs page (force-add past gitignore) Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Address code review: fix memory leak, extract stream helper, fix doc formatting - Move d_converged_count from local static to file-scope s_d_converged_count with cleanup - Add getOptiXStream() helper to reduce stream fallback pattern duplication - Fix digit grouping in documentation (65,536 instead of 65 536) Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Fixed incoming bugs with shared buffers and init status * Fixing HDRI firefly problem --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> Co-authored-by: Pierre-André Mudry 
<pmudry@gmail.com>
1 parent adc8301 commit 1f9f770

File tree

10 files changed

+571
-60
lines changed

10 files changed

+571
-60
lines changed

src/rayon/gpu_renderers/cuda_raytracer.cuh

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -155,45 +155,47 @@ __device__ inline f3 sample_aperture_disk(const f3 &cam_u, const f3 &cam_v, cura
155155
//==============================================================================
156156

157157
/**
158-
* @brief Ray-AABB intersection test using slab method
158+
* @brief Ray-AABB intersection test using slab method with precomputed inverse direction.
159+
*
160+
* The inverse ray direction (inv_dir) must be precomputed once per ray and passed in.
161+
* This avoids redundant reciprocal computations during BVH traversal where the same ray
162+
* is tested against dozens of AABBs — a significant saving in BVH-heavy scenes.
163+
*
159164
* @param r Ray to test
165+
* @param inv_dir Precomputed 1.0f / ray.dir (computed once per ray, reused per AABB test)
160166
* @param box_min AABB minimum corner
161167
* @param box_max AABB maximum corner
162168
* @param t_min Minimum ray parameter
163169
* @param t_max Maximum ray parameter
164170
* @return true if ray intersects AABB in range [t_min, t_max]
165171
*/
166-
__device__ inline bool hit_aabb(const ray_simple &r, const f3 &box_min, const f3 &box_max, float t_min, float t_max)
172+
__device__ __forceinline__ bool hit_aabb(const ray_simple &r, const f3 &inv_dir, const f3 &box_min, const f3 &box_max,
173+
float t_min, float t_max)
167174
{
168-
// Compute inverse ray direction once
169-
float inv_dir_x = 1.0f / r.dir.x;
170-
float inv_dir_y = 1.0f / r.dir.y;
171-
float inv_dir_z = 1.0f / r.dir.z;
172-
173175
// X slab
174-
float t0_x = (box_min.x - r.orig.x) * inv_dir_x;
175-
float t1_x = (box_max.x - r.orig.x) * inv_dir_x;
176-
if (inv_dir_x < 0.0f)
176+
float t0_x = (box_min.x - r.orig.x) * inv_dir.x;
177+
float t1_x = (box_max.x - r.orig.x) * inv_dir.x;
178+
if (inv_dir.x < 0.0f)
177179
{
178180
float temp = t0_x;
179181
t0_x = t1_x;
180182
t1_x = temp;
181183
}
182184

183185
// Y slab
184-
float t0_y = (box_min.y - r.orig.y) * inv_dir_y;
185-
float t1_y = (box_max.y - r.orig.y) * inv_dir_y;
186-
if (inv_dir_y < 0.0f)
186+
float t0_y = (box_min.y - r.orig.y) * inv_dir.y;
187+
float t1_y = (box_max.y - r.orig.y) * inv_dir.y;
188+
if (inv_dir.y < 0.0f)
187189
{
188190
float temp = t0_y;
189191
t0_y = t1_y;
190192
t1_y = temp;
191193
}
192194

193195
// Z slab
194-
float t0_z = (box_min.z - r.orig.z) * inv_dir_z;
195-
float t1_z = (box_max.z - r.orig.z) * inv_dir_z;
196-
if (inv_dir_z < 0.0f)
196+
float t0_z = (box_min.z - r.orig.z) * inv_dir.z;
197+
float t1_z = (box_max.z - r.orig.z) * inv_dir.z;
198+
if (inv_dir.z < 0.0f)
197199
{
198200
float temp = t0_z;
199201
t0_z = t1_z;
@@ -533,6 +535,10 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
533535
// Use BVH if available, otherwise linear scan
534536
if (scene.use_bvh && scene.bvh_root_idx >= 0)
535537
{
538+
// Precompute inverse ray direction once per ray for all AABB tests in this traversal.
539+
// This avoids 3 reciprocal operations per BVH node — significant for deep BVH trees.
540+
const f3 inv_dir(1.0f / r.dir.x, 1.0f / r.dir.y, 1.0f / r.dir.z);
541+
536542
// Stack-based BVH traversal (iterative to avoid recursion)
537543
int stack[32];
538544
int stack_ptr = 0;
@@ -543,8 +549,8 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
543549
int node_idx = stack[--stack_ptr];
544550
const CudaScene::BVHNode &node = scene.bvh_nodes[node_idx];
545551

546-
// Test ray against node's AABB
547-
if (!hit_aabb(r, node.bounds_min, node.bounds_max, t_min, closest_so_far))
552+
// Test ray against node's AABB using precomputed inverse direction
553+
if (!hit_aabb(r, inv_dir, node.bounds_min, node.bounds_max, t_min, closest_so_far))
548554
continue;
549555

550556
if (node.is_leaf)

src/rayon/gpu_renderers/optix/optix_programs.cu

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,17 @@ extern "C" __global__ void __raygen__rg()
545545
}
546546
}
547547

548+
// Firefly rejection: clamp per-sample luminance to prevent single HDR texels
549+
// (e.g., sun disk in outdoor environment maps) from causing permanent white dots.
550+
// Uses a luminance-preserving scale so hue is maintained.
551+
constexpr float FIREFLY_CLAMP = 20.0f;
552+
float sample_lum = 0.2126f * color.x + 0.7152f * color.y + 0.0722f * color.z;
553+
if (sample_lum > FIREFLY_CLAMP)
554+
{
555+
float scale = FIREFLY_CLAMP / sample_lum;
556+
color = color * scale;
557+
}
558+
548559
accumulated = accumulated + color;
549560
}
550561

src/rayon/gpu_renderers/optix/optix_renderer.cu

Lines changed: 112 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,24 @@ struct OptixState
9090
cudaArray_t hdr_cuda_array = nullptr;
9191
cudaTextureObject_t hdr_tex_obj = 0;
9292

93+
// Dedicated CUDA stream for OptiX launches — avoids blocking the default stream
94+
// and enables stream-specific synchronization instead of cudaDeviceSynchronize().
95+
cudaStream_t render_stream = nullptr;
96+
97+
// Pinned host memory + device display buffer for async gamma correction + D2H pipeline
98+
unsigned char *pinned_display = nullptr;
99+
size_t pinned_display_size = 0;
100+
unsigned char *d_display = nullptr;
101+
size_t d_display_size = 0;
102+
93103
bool initialized = false;
94104
};
95105

96106
static OptixState g_state;
97107

108+
// Helper: return the dedicated render stream, or the default stream (0) if not initialized.
109+
static inline cudaStream_t getOptiXStream() { return g_state.render_stream ? g_state.render_stream : 0; }
110+
98111
// Load PTX from file
99112
static std::string loadPTXFromFile(const char *filename)
100113
{
@@ -262,6 +275,12 @@ static void initializeOptiX()
262275
g_state.sbt.missRecordStrideInBytes = sizeof(MissRecord);
263276
g_state.sbt.missRecordCount = 1;
264277

278+
// Create a dedicated CUDA stream for OptiX launches — enables stream-specific
279+
// synchronization instead of cudaDeviceSynchronize(), and allows overlap with
280+
// display/gamma correction work.
281+
if (g_state.render_stream == nullptr)
282+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_state.render_stream, cudaStreamNonBlocking));
283+
265284
g_state.initialized = true;
266285
printf("OptiX renderer initialized successfully\n");
267286
}
@@ -665,8 +684,9 @@ extern "C" void optixRendererResetAccum(int width, int height)
665684
g_state.accum_height = height;
666685
}
667686

668-
// Zero the buffer on device — no host round-trip needed
669-
CUDA_CHECK(cudaMemset(g_state.d_accum_buffer, 0, (size_t)width * height * sizeof(float4)));
687+
// Zero the buffer on the render stream so the memset is ordered before the next optixLaunch.
688+
// cudaMemset on the default stream (0) races with optixLaunch on the non-blocking render stream.
689+
CUDA_CHECK(cudaMemsetAsync(g_state.d_accum_buffer, 0, (size_t)width * height * sizeof(float4), getOptiXStream()));
670690

671691
// Allocate persistent launch params buffer (once)
672692
if (g_state.d_launch_params == 0)
@@ -721,13 +741,18 @@ extern "C" unsigned long long optixRendererLaunch(int width, int height, int num
721741
launch_params.hdr_env_tex = g_state.hdr_tex_obj;
722742
launch_params.use_hdr_env = (g_state.hdr_tex_obj != 0);
723743

724-
// Single memcpy to persistent device buffer — no malloc/free per batch
725-
CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(g_state.d_launch_params), &launch_params, sizeof(OptixLaunchParams),
726-
cudaMemcpyHostToDevice));
744+
// Single memcpy to persistent device buffer — no malloc/free per batch.
745+
// Use the dedicated stream for async param upload + launch.
746+
cudaStream_t stream = getOptiXStream();
747+
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void *>(g_state.d_launch_params), &launch_params,
748+
sizeof(OptixLaunchParams), cudaMemcpyHostToDevice, stream));
727749

728-
OPTIX_CHECK(optixLaunch(g_state.pipeline, 0, g_state.d_launch_params, sizeof(OptixLaunchParams), &g_state.sbt,
750+
OPTIX_CHECK(optixLaunch(g_state.pipeline, stream, g_state.d_launch_params, sizeof(OptixLaunchParams), &g_state.sbt,
729751
width, height, 1));
730-
CUDA_CHECK(cudaDeviceSynchronize());
752+
753+
// Stream-specific sync instead of cudaDeviceSynchronize() — only waits for
754+
// this stream, allowing other work (display pipeline) to proceed.
755+
CUDA_CHECK(cudaStreamSynchronize(stream));
731756

732757
return (unsigned long long)width * height * samples_to_add;
733758
}
@@ -766,6 +791,78 @@ extern "C" void optixRendererSetGolfDimples(int count, float radius, float depth
766791
g_state.golf_dimple_depth = depth;
767792
}
768793

794+
//==============================================================================
795+
// GPU-side gamma correction for OptiX — mirrors the CUDA renderer's pipeline.
796+
// Converts float4 accum buffer directly to uint8 display image on the GPU,
797+
// then async-copies via pinned memory. Avoids the expensive float4 D2H transfer
798+
// + host-side conversion that the original optixRendererDownloadAccum() used.
799+
//==============================================================================
800+
__global__ void optixGammaCorrectKernel(const float4 *__restrict__ accum_buffer, unsigned char *display_image,
801+
int width, int height, int num_samples, int channels, float gamma)
802+
{
803+
int x = blockIdx.x * blockDim.x + threadIdx.x;
804+
int y = blockIdx.y * blockDim.y + threadIdx.y;
805+
if (x >= width || y >= height)
806+
return;
807+
808+
int pixel_idx = y * width + x;
809+
float4 acc = accum_buffer[pixel_idx];
810+
811+
float inv_samples = 1.0f / (float)num_samples;
812+
float inv_gamma = 1.0f / gamma;
813+
814+
float r = fminf(powf(fmaxf(acc.x * inv_samples, 0.0f), inv_gamma), 0.999f);
815+
float g = fminf(powf(fmaxf(acc.y * inv_samples, 0.0f), inv_gamma), 0.999f);
816+
float b = fminf(powf(fmaxf(acc.z * inv_samples, 0.0f), inv_gamma), 0.999f);
817+
818+
int image_idx = pixel_idx * channels;
819+
display_image[image_idx + 0] = (unsigned char)(256.0f * r);
820+
display_image[image_idx + 1] = (unsigned char)(256.0f * g);
821+
display_image[image_idx + 2] = (unsigned char)(256.0f * b);
822+
if (channels == 4)
823+
display_image[image_idx + 3] = 255;
824+
}
825+
826+
extern "C" void optixRendererConvertAccumToDisplay(unsigned char *display_image, int width, int height,
827+
int channels, int num_samples, float gamma)
828+
{
829+
if (!g_state.d_accum_buffer || !display_image || num_samples <= 0)
830+
return;
831+
832+
size_t display_size = (size_t)width * height * channels * sizeof(unsigned char);
833+
834+
// Allocate/resize device display buffer (persistent across calls)
835+
if (g_state.d_display == nullptr || g_state.d_display_size != display_size)
836+
{
837+
if (g_state.d_display != nullptr)
838+
cudaFree(g_state.d_display);
839+
cudaMalloc(&g_state.d_display, display_size);
840+
g_state.d_display_size = display_size;
841+
}
842+
843+
// Allocate/resize pinned host staging buffer for async D2H copy
844+
if (g_state.pinned_display == nullptr || g_state.pinned_display_size != display_size)
845+
{
846+
if (g_state.pinned_display != nullptr)
847+
cudaFreeHost(g_state.pinned_display);
848+
cudaMallocHost(&g_state.pinned_display, display_size);
849+
g_state.pinned_display_size = display_size;
850+
}
851+
852+
dim3 threads(32, 8);
853+
dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);
854+
855+
cudaStream_t stream = getOptiXStream();
856+
857+
optixGammaCorrectKernel<<<blocks, threads, 0, stream>>>(
858+
g_state.d_accum_buffer, g_state.d_display, width, height, num_samples, channels, gamma);
859+
860+
// Async D2H copy via pinned memory, then single stream sync
861+
cudaMemcpyAsync(g_state.pinned_display, g_state.d_display, display_size, cudaMemcpyDeviceToHost, stream);
862+
cudaStreamSynchronize(stream);
863+
memcpy(display_image, g_state.pinned_display, display_size);
864+
}
865+
769866
extern "C" void optixRendererClearHdrEnv()
770867
{
771868
if (g_state.hdr_tex_obj != 0)
@@ -861,6 +958,14 @@ extern "C" void optixRendererCleanup()
861958
if (g_state.d_gas_output)
862959
CUDA_CHECK(cudaFree(reinterpret_cast<void *>(g_state.d_gas_output)));
863960

961+
// Clean up GPU display pipeline resources
962+
if (g_state.render_stream)
963+
CUDA_CHECK(cudaStreamDestroy(g_state.render_stream));
964+
if (g_state.d_display)
965+
CUDA_CHECK(cudaFree(g_state.d_display));
966+
if (g_state.pinned_display)
967+
CUDA_CHECK(cudaFreeHost(g_state.pinned_display));
968+
864969
if (g_state.pipeline)
865970
OPTIX_CHECK(optixPipelineDestroy(g_state.pipeline));
866971
if (g_state.raygen_pg)

src/rayon/gpu_renderers/renderer_cuda_device.cu

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,19 @@ static cudaStream_t s_display_stream = nullptr;
4141
static unsigned char *s_pinned_display = nullptr;
4242
static size_t s_pinned_display_size = 0;
4343

44+
// Compute stream: render kernel runs here so it can overlap with the display pipeline.
45+
// Non-blocking to avoid implicit synchronization with default stream or display stream.
46+
static cudaStream_t s_compute_stream = nullptr;
47+
48+
// Persistent device counter for GPU-side converged pixel counting (freed in cleanup)
49+
static int *s_d_converged_count = nullptr;
50+
4451
extern "C" void initCudaStreams()
4552
{
4653
if (s_display_stream == nullptr)
4754
cudaStreamCreateWithFlags(&s_display_stream, cudaStreamNonBlocking);
55+
if (s_compute_stream == nullptr)
56+
cudaStreamCreateWithFlags(&s_compute_stream, cudaStreamNonBlocking);
4857
}
4958

5059
extern "C" void cleanupCudaStreams()
@@ -54,12 +63,22 @@ extern "C" void cleanupCudaStreams()
5463
cudaStreamDestroy(s_display_stream);
5564
s_display_stream = nullptr;
5665
}
66+
if (s_compute_stream != nullptr)
67+
{
68+
cudaStreamDestroy(s_compute_stream);
69+
s_compute_stream = nullptr;
70+
}
5771
if (s_pinned_display != nullptr)
5872
{
5973
cudaFreeHost(s_pinned_display);
6074
s_pinned_display = nullptr;
6175
s_pinned_display_size = 0;
6276
}
77+
if (s_d_converged_count != nullptr)
78+
{
79+
cudaFree(s_d_converged_count);
80+
s_d_converged_count = nullptr;
81+
}
6382
}
6483

6584
//==================== HOST INTERFACE FUNCTIONS ====================
@@ -148,7 +167,10 @@ extern "C" void resetDeviceAccumBuffer(void *d_accum_buffer, int num_pixels)
148167
{
149168
if (d_accum_buffer != nullptr)
150169
{
151-
cudaMemset(d_accum_buffer, 0, (size_t)num_pixels * sizeof(float4));
170+
// Use the compute stream so the memset is ordered before the next render kernel.
171+
// cudaMemset on the default stream (0) races with kernels on non-blocking streams.
172+
cudaStream_t stream = s_compute_stream ? s_compute_stream : 0;
173+
cudaMemsetAsync(d_accum_buffer, 0, (size_t)num_pixels * sizeof(float4), stream);
152174
}
153175
}
154176

@@ -381,7 +403,11 @@ extern "C" unsigned long long renderPixelsCUDAAccumulative(
381403
}
382404
}
383405

384-
renderAccKernel<<<blocks, threads>>>(
406+
// Launch render kernel on the compute stream (if available) to enable overlap
407+
// with the display conversion pipeline on s_display_stream.
408+
cudaStream_t render_stream = s_compute_stream ? s_compute_stream : 0;
409+
410+
renderAccKernel<<<blocks, threads, 0, render_stream>>>(
385411
d_accum, scene, width, height, samples_to_add, total_samples_so_far, max_depth, (float)cam_center_x,
386412
(float)cam_center_y, (float)cam_center_z, (float)pixel00_x, (float)pixel00_y, (float)pixel00_z, (float)delta_u_x,
387413
(float)delta_u_y, (float)delta_u_z, (float)delta_v_x, (float)delta_v_y, (float)delta_v_z, d_ray_count,
@@ -394,7 +420,9 @@ extern "C" unsigned long long renderPixelsCUDAAccumulative(
394420
printf("❌ Kernel launch error: %s\n", cudaGetErrorString(kernel_err));
395421
}
396422

397-
cudaError_t sync_err = cudaDeviceSynchronize();
423+
// Stream-specific sync instead of cudaDeviceSynchronize() — only waits for
424+
// this stream to finish, allowing other streams to continue running.
425+
cudaError_t sync_err = cudaStreamSynchronize(render_stream);
398426
if (sync_err != cudaSuccess)
399427
{
400428
printf("❌ Kernel execution error: %s\n", cudaGetErrorString(sync_err));
@@ -512,7 +540,9 @@ extern "C" void resetAdaptiveBuffer(void *d_pixel_sample_counts, int num_pixels)
512540
{
513541
if (d_pixel_sample_counts != nullptr)
514542
{
515-
cudaMemset(d_pixel_sample_counts, 0, (size_t)num_pixels * sizeof(int));
543+
// Same stream as the render kernel so the reset is guaranteed to complete first.
544+
cudaStream_t stream = s_compute_stream ? s_compute_stream : 0;
545+
cudaMemsetAsync(d_pixel_sample_counts, 0, (size_t)num_pixels * sizeof(int), stream);
516546
}
517547
}
518548

@@ -529,16 +559,20 @@ extern "C" int countConvergedPixels(void *d_pixel_sample_counts, int num_pixels)
529559
if (d_pixel_sample_counts == nullptr)
530560
return 0;
531561

532-
// Copy buffer to host and count negative values (converged pixels)
533-
std::vector<int> host_counts(num_pixels);
534-
cudaMemcpy(host_counts.data(), d_pixel_sample_counts, (size_t)num_pixels * sizeof(int), cudaMemcpyDeviceToHost);
562+
// GPU-side reduction: count negative values (converged pixels) using warp-shuffle.
563+
// Avoids expensive full-buffer D2H transfer that the old host-side loop required.
564+
if (s_d_converged_count == nullptr)
565+
cudaMalloc(&s_d_converged_count, sizeof(int));
566+
567+
cudaMemset(s_d_converged_count, 0, sizeof(int));
568+
569+
int threads_per_block = 256;
570+
int blocks = (num_pixels + threads_per_block - 1) / threads_per_block;
571+
countConvergedKernel<<<blocks, threads_per_block>>>(
572+
static_cast<const int *>(d_pixel_sample_counts), num_pixels, s_d_converged_count);
535573

536574
int converged = 0;
537-
for (int i = 0; i < num_pixels; ++i)
538-
{
539-
if (host_counts[i] < 0)
540-
++converged;
541-
}
575+
cudaMemcpy(&converged, s_d_converged_count, sizeof(int), cudaMemcpyDeviceToHost);
542576
return converged;
543577
}
544578

0 commit comments

Comments
 (0)