Skip to content

Commit 1f9f770

Browse files
Copilot and pmudry authored
GPU performance optimizations for CUDA and OptiX real-time path tracing and firefly HDRI artifacts (#26)
* Initial plan * Implement CUDA and OptiX performance optimizations - Precompute inverse ray direction for BVH traversal (eliminates 3 reciprocal ops per AABB test) - Add __launch_bounds__(256) to render kernel for better register allocation - GPU-side converged pixel counting via warp-shuffle reduction (replaces host-side D2H copy) - Dedicated compute stream for CUDA render kernel (enables overlap with display pipeline) - OptiX: dedicated CUDA stream for launches (replaces cudaDeviceSynchronize) - OptiX: GPU-side gamma correction + pinned memory async D2H pipeline - OptiX: proper resource cleanup for new stream/buffer resources Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Add GPU performance techniques documentation Create website/docs/how-it-works/gpu-performance.md documenting: - Precomputed inverse ray direction for BVH traversal - __launch_bounds__ on render kernel - GPU-side converged pixel counting (warp-shuffle) - Dedicated CUDA streams for render and display - OptiX dedicated render stream - OptiX GPU-side gamma correction with pinned memory - Summary comparison table of all techniques Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Add GPU performance techniques docs page (force-add past gitignore) Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Address code review: fix memory leak, extract stream helper, fix doc formatting - Move d_converged_count from local static to file-scope s_d_converged_count with cleanup - Add getOptiXStream() helper to reduce stream fallback pattern duplication - Fix digit grouping in documentation (65,536 instead of 65 536) Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> * Fixed incoming bugs with shared buffers and init status * Fixing HDRI firefly problem --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: pmudry <4624112+pmudry@users.noreply.github.com> Co-authored-by: Pierre-André Mudry 
<pmudry@gmail.com>
1 parent adc8301 commit 1f9f770

File tree

10 files changed

+571
-60
lines changed

10 files changed

+571
-60
lines changed

src/rayon/gpu_renderers/cuda_raytracer.cuh

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -155,45 +155,47 @@ __device__ inline f3 sample_aperture_disk(const f3 &cam_u, const f3 &cam_v, cura
155155
//==============================================================================
156156

157157
/**
158-
* @brief Ray-AABB intersection test using slab method
158+
* @brief Ray-AABB intersection test using slab method with precomputed inverse direction.
159+
*
160+
* The inverse ray direction (inv_dir) must be precomputed once per ray and passed in.
161+
* This avoids redundant reciprocal computations during BVH traversal where the same ray
162+
* is tested against dozens of AABBs — a significant saving in BVH-heavy scenes.
163+
*
159164
* @param r Ray to test
165+
* @param inv_dir Precomputed 1.0f / ray.dir (computed once per ray, reused per AABB test)
160166
* @param box_min AABB minimum corner
161167
* @param box_max AABB maximum corner
162168
* @param t_min Minimum ray parameter
163169
* @param t_max Maximum ray parameter
164170
* @return true if ray intersects AABB in range [t_min, t_max]
165171
*/
166-
__device__ inline bool hit_aabb(const ray_simple &r, const f3 &box_min, const f3 &box_max, float t_min, float t_max)
172+
__device__ __forceinline__ bool hit_aabb(const ray_simple &r, const f3 &inv_dir, const f3 &box_min, const f3 &box_max,
173+
float t_min, float t_max)
167174
{
168-
// Compute inverse ray direction once
169-
float inv_dir_x = 1.0f / r.dir.x;
170-
float inv_dir_y = 1.0f / r.dir.y;
171-
float inv_dir_z = 1.0f / r.dir.z;
172-
173175
// X slab
174-
float t0_x = (box_min.x - r.orig.x) * inv_dir_x;
175-
float t1_x = (box_max.x - r.orig.x) * inv_dir_x;
176-
if (inv_dir_x < 0.0f)
176+
float t0_x = (box_min.x - r.orig.x) * inv_dir.x;
177+
float t1_x = (box_max.x - r.orig.x) * inv_dir.x;
178+
if (inv_dir.x < 0.0f)
177179
{
178180
float temp = t0_x;
179181
t0_x = t1_x;
180182
t1_x = temp;
181183
}
182184

183185
// Y slab
184-
float t0_y = (box_min.y - r.orig.y) * inv_dir_y;
185-
float t1_y = (box_max.y - r.orig.y) * inv_dir_y;
186-
if (inv_dir_y < 0.0f)
186+
float t0_y = (box_min.y - r.orig.y) * inv_dir.y;
187+
float t1_y = (box_max.y - r.orig.y) * inv_dir.y;
188+
if (inv_dir.y < 0.0f)
187189
{
188190
float temp = t0_y;
189191
t0_y = t1_y;
190192
t1_y = temp;
191193
}
192194

193195
// Z slab
194-
float t0_z = (box_min.z - r.orig.z) * inv_dir_z;
195-
float t1_z = (box_max.z - r.orig.z) * inv_dir_z;
196-
if (inv_dir_z < 0.0f)
196+
float t0_z = (box_min.z - r.orig.z) * inv_dir.z;
197+
float t1_z = (box_max.z - r.orig.z) * inv_dir.z;
198+
if (inv_dir.z < 0.0f)
197199
{
198200
float temp = t0_z;
199201
t0_z = t1_z;
@@ -533,6 +535,10 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
533535
// Use BVH if available, otherwise linear scan
534536
if (scene.use_bvh && scene.bvh_root_idx >= 0)
535537
{
538+
// Precompute inverse ray direction once per ray for all AABB tests in this traversal.
539+
// This avoids 3 reciprocal operations per BVH node — significant for deep BVH trees.
540+
const f3 inv_dir(1.0f / r.dir.x, 1.0f / r.dir.y, 1.0f / r.dir.z);
541+
536542
// Stack-based BVH traversal (iterative to avoid recursion)
537543
int stack[32];
538544
int stack_ptr = 0;
@@ -543,8 +549,8 @@ __device__ inline bool hit_scene(const CudaScene::Scene &scene, const ray_simple
543549
int node_idx = stack[--stack_ptr];
544550
const CudaScene::BVHNode &node = scene.bvh_nodes[node_idx];
545551

546-
// Test ray against node's AABB
547-
if (!hit_aabb(r, node.bounds_min, node.bounds_max, t_min, closest_so_far))
552+
// Test ray against node's AABB using precomputed inverse direction
553+
if (!hit_aabb(r, inv_dir, node.bounds_min, node.bounds_max, t_min, closest_so_far))
548554
continue;
549555

550556
if (node.is_leaf)

src/rayon/gpu_renderers/optix/optix_programs.cu

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,17 @@ extern "C" __global__ void __raygen__rg()
545545
}
546546
}
547547

548+
// Firefly rejection: clamp per-sample luminance to prevent single HDR texels
549+
// (e.g., sun disk in outdoor environment maps) from causing permanent white dots.
550+
// Uses a luminance-preserving scale so hue is maintained.
551+
constexpr float FIREFLY_CLAMP = 20.0f;
552+
float sample_lum = 0.2126f * color.x + 0.7152f * color.y + 0.0722f * color.z;
553+
if (sample_lum > FIREFLY_CLAMP)
554+
{
555+
float scale = FIREFLY_CLAMP / sample_lum;
556+
color = color * scale;
557+
}
558+
548559
accumulated = accumulated + color;
549560
}
550561

src/rayon/gpu_renderers/optix/optix_renderer.cu

Lines changed: 112 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,24 @@ struct OptixState
9090
cudaArray_t hdr_cuda_array = nullptr;
9191
cudaTextureObject_t hdr_tex_obj = 0;
9292

93+
// Dedicated CUDA stream for OptiX launches — avoids blocking the default stream
94+
// and enables stream-specific synchronization instead of cudaDeviceSynchronize().
95+
cudaStream_t render_stream = nullptr;
96+
97+
// Pinned host memory + device display buffer for async gamma correction + D2H pipeline
98+
unsigned char *pinned_display = nullptr;
99+
size_t pinned_display_size = 0;
100+
unsigned char *d_display = nullptr;
101+
size_t d_display_size = 0;
102+
93103
bool initialized = false;
94104
};
95105

96106
static OptixState g_state;
97107

108+
// Helper: return the dedicated render stream, or the default stream (0) if not initialized.
109+
static inline cudaStream_t getOptiXStream() { return g_state.render_stream ? g_state.render_stream : 0; }
110+
98111
// Load PTX from file
99112
static std::string loadPTXFromFile(const char *filename)
100113
{
@@ -262,6 +275,12 @@ static void initializeOptiX()
262275
g_state.sbt.missRecordStrideInBytes = sizeof(MissRecord);
263276
g_state.sbt.missRecordCount = 1;
264277

278+
// Create a dedicated CUDA stream for OptiX launches — enables stream-specific
279+
// synchronization instead of cudaDeviceSynchronize(), and allows overlap with
280+
// display/gamma correction work.
281+
if (g_state.render_stream == nullptr)
282+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_state.render_stream, cudaStreamNonBlocking));
283+
265284
g_state.initialized = true;
266285
printf("OptiX renderer initialized successfully\n");
267286
}
@@ -665,8 +684,9 @@ extern "C" void optixRendererResetAccum(int width, int height)
665684
g_state.accum_height = height;
666685
}
667686

668-
// Zero the buffer on device — no host round-trip needed
669-
CUDA_CHECK(cudaMemset(g_state.d_accum_buffer, 0, (size_t)width * height * sizeof(float4)));
687+
// Zero the buffer on the render stream so the memset is ordered before the next optixLaunch.
688+
// cudaMemset on the default stream (0) races with optixLaunch on the non-blocking render stream.
689+
CUDA_CHECK(cudaMemsetAsync(g_state.d_accum_buffer, 0, (size_t)width * height * sizeof(float4), getOptiXStream()));
670690

671691
// Allocate persistent launch params buffer (once)
672692
if (g_state.d_launch_params == 0)
@@ -721,13 +741,18 @@ extern "C" unsigned long long optixRendererLaunch(int width, int height, int num
721741
launch_params.hdr_env_tex = g_state.hdr_tex_obj;
722742
launch_params.use_hdr_env = (g_state.hdr_tex_obj != 0);
723743

724-
// Single memcpy to persistent device buffer — no malloc/free per batch
725-
CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(g_state.d_launch_params), &launch_params, sizeof(OptixLaunchParams),
726-
cudaMemcpyHostToDevice));
744+
// Single memcpy to persistent device buffer — no malloc/free per batch.
745+
// Use the dedicated stream for async param upload + launch.
746+
cudaStream_t stream = getOptiXStream();
747+
CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void *>(g_state.d_launch_params), &launch_params,
748+
sizeof(OptixLaunchParams), cudaMemcpyHostToDevice, stream));
727749

728-
OPTIX_CHECK(optixLaunch(g_state.pipeline, 0, g_state.d_launch_params, sizeof(OptixLaunchParams), &g_state.sbt,
750+
OPTIX_CHECK(optixLaunch(g_state.pipeline, stream, g_state.d_launch_params, sizeof(OptixLaunchParams), &g_state.sbt,
729751
width, height, 1));
730-
CUDA_CHECK(cudaDeviceSynchronize());
752+
753+
// Stream-specific sync instead of cudaDeviceSynchronize() — only waits for
754+
// this stream, allowing other work (display pipeline) to proceed.
755+
CUDA_CHECK(cudaStreamSynchronize(stream));
731756

732757
return (unsigned long long)width * height * samples_to_add;
733758
}
@@ -766,6 +791,78 @@ extern "C" void optixRendererSetGolfDimples(int count, float radius, float depth
766791
g_state.golf_dimple_depth = depth;
767792
}
768793

794+
//==============================================================================
795+
// GPU-side gamma correction for OptiX — mirrors the CUDA renderer's pipeline.
796+
// Converts float4 accum buffer directly to uint8 display image on the GPU,
797+
// then async-copies via pinned memory. Avoids the expensive float4 D2H transfer
798+
// + host-side conversion that the original optixRendererDownloadAccum() used.
799+
//==============================================================================
800+
__global__ void optixGammaCorrectKernel(const float4 *__restrict__ accum_buffer, unsigned char *display_image,
801+
int width, int height, int num_samples, int channels, float gamma)
802+
{
803+
int x = blockIdx.x * blockDim.x + threadIdx.x;
804+
int y = blockIdx.y * blockDim.y + threadIdx.y;
805+
if (x >= width || y >= height)
806+
return;
807+
808+
int pixel_idx = y * width + x;
809+
float4 acc = accum_buffer[pixel_idx];
810+
811+
float inv_samples = 1.0f / (float)num_samples;
812+
float inv_gamma = 1.0f / gamma;
813+
814+
float r = fminf(powf(fmaxf(acc.x * inv_samples, 0.0f), inv_gamma), 0.999f);
815+
float g = fminf(powf(fmaxf(acc.y * inv_samples, 0.0f), inv_gamma), 0.999f);
816+
float b = fminf(powf(fmaxf(acc.z * inv_samples, 0.0f), inv_gamma), 0.999f);
817+
818+
int image_idx = pixel_idx * channels;
819+
display_image[image_idx + 0] = (unsigned char)(256.0f * r);
820+
display_image[image_idx + 1] = (unsigned char)(256.0f * g);
821+
display_image[image_idx + 2] = (unsigned char)(256.0f * b);
822+
if (channels == 4)
823+
display_image[image_idx + 3] = 255;
824+
}
825+
826+
extern "C" void optixRendererConvertAccumToDisplay(unsigned char *display_image, int width, int height,
827+
int channels, int num_samples, float gamma)
828+
{
829+
if (!g_state.d_accum_buffer || !display_image || num_samples <= 0)
830+
return;
831+
832+
size_t display_size = (size_t)width * height * channels * sizeof(unsigned char);
833+
834+
// Allocate/resize device display buffer (persistent across calls)
835+
if (g_state.d_display == nullptr || g_state.d_display_size != display_size)
836+
{
837+
if (g_state.d_display != nullptr)
838+
cudaFree(g_state.d_display);
839+
cudaMalloc(&g_state.d_display, display_size);
840+
g_state.d_display_size = display_size;
841+
}
842+
843+
// Allocate/resize pinned host staging buffer for async D2H copy
844+
if (g_state.pinned_display == nullptr || g_state.pinned_display_size != display_size)
845+
{
846+
if (g_state.pinned_display != nullptr)
847+
cudaFreeHost(g_state.pinned_display);
848+
cudaMallocHost(&g_state.pinned_display, display_size);
849+
g_state.pinned_display_size = display_size;
850+
}
851+
852+
dim3 threads(32, 8);
853+
dim3 blocks((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);
854+
855+
cudaStream_t stream = getOptiXStream();
856+
857+
optixGammaCorrectKernel<<<blocks, threads, 0, stream>>>(
858+
g_state.d_accum_buffer, g_state.d_display, width, height, num_samples, channels, gamma);
859+
860+
// Async D2H copy via pinned memory, then single stream sync
861+
cudaMemcpyAsync(g_state.pinned_display, g_state.d_display, display_size, cudaMemcpyDeviceToHost, stream);
862+
cudaStreamSynchronize(stream);
863+
memcpy(display_image, g_state.pinned_display, display_size);
864+
}
865+
769866
extern "C" void optixRendererClearHdrEnv()
770867
{
771868
if (g_state.hdr_tex_obj != 0)
@@ -861,6 +958,14 @@ extern "C" void optixRendererCleanup()
861958
if (g_state.d_gas_output)
862959
CUDA_CHECK(cudaFree(reinterpret_cast<void *>(g_state.d_gas_output)));
863960

961+
// Clean up GPU display pipeline resources
962+
if (g_state.render_stream)
963+
CUDA_CHECK(cudaStreamDestroy(g_state.render_stream));
964+
if (g_state.d_display)
965+
CUDA_CHECK(cudaFree(g_state.d_display));
966+
if (g_state.pinned_display)
967+
CUDA_CHECK(cudaFreeHost(g_state.pinned_display));
968+
864969
if (g_state.pipeline)
865970
OPTIX_CHECK(optixPipelineDestroy(g_state.pipeline));
866971
if (g_state.raygen_pg)

src/rayon/gpu_renderers/renderer_cuda_device.cu

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,19 @@ static cudaStream_t s_display_stream = nullptr;
4141
static unsigned char *s_pinned_display = nullptr;
4242
static size_t s_pinned_display_size = 0;
4343

44+
// Compute stream: render kernel runs here so it can overlap with the display pipeline.
45+
// Non-blocking to avoid implicit synchronization with default stream or display stream.
46+
static cudaStream_t s_compute_stream = nullptr;
47+
48+
// Persistent device counter for GPU-side converged pixel counting (freed in cleanup)
49+
static int *s_d_converged_count = nullptr;
50+
4451
extern "C" void initCudaStreams()
4552
{
4653
if (s_display_stream == nullptr)
4754
cudaStreamCreateWithFlags(&s_display_stream, cudaStreamNonBlocking);
55+
if (s_compute_stream == nullptr)
56+
cudaStreamCreateWithFlags(&s_compute_stream, cudaStreamNonBlocking);
4857
}
4958

5059
extern "C" void cleanupCudaStreams()
@@ -54,12 +63,22 @@ extern "C" void cleanupCudaStreams()
5463
cudaStreamDestroy(s_display_stream);
5564
s_display_stream = nullptr;
5665
}
66+
if (s_compute_stream != nullptr)
67+
{
68+
cudaStreamDestroy(s_compute_stream);
69+
s_compute_stream = nullptr;
70+
}
5771
if (s_pinned_display != nullptr)
5872
{
5973
cudaFreeHost(s_pinned_display);
6074
s_pinned_display = nullptr;
6175
s_pinned_display_size = 0;
6276
}
77+
if (s_d_converged_count != nullptr)
78+
{
79+
cudaFree(s_d_converged_count);
80+
s_d_converged_count = nullptr;
81+
}
6382
}
6483

6584
//==================== HOST INTERFACE FUNCTIONS ====================
@@ -148,7 +167,10 @@ extern "C" void resetDeviceAccumBuffer(void *d_accum_buffer, int num_pixels)
148167
{
149168
if (d_accum_buffer != nullptr)
150169
{
151-
cudaMemset(d_accum_buffer, 0, (size_t)num_pixels * sizeof(float4));
170+
// Use the compute stream so the memset is ordered before the next render kernel.
171+
// cudaMemset on the default stream (0) races with kernels on non-blocking streams.
172+
cudaStream_t stream = s_compute_stream ? s_compute_stream : 0;
173+
cudaMemsetAsync(d_accum_buffer, 0, (size_t)num_pixels * sizeof(float4), stream);
152174
}
153175
}
154176

@@ -381,7 +403,11 @@ extern "C" unsigned long long renderPixelsCUDAAccumulative(
381403
}
382404
}
383405

384-
renderAccKernel<<<blocks, threads>>>(
406+
// Launch render kernel on the compute stream (if available) to enable overlap
407+
// with the display conversion pipeline on s_display_stream.
408+
cudaStream_t render_stream = s_compute_stream ? s_compute_stream : 0;
409+
410+
renderAccKernel<<<blocks, threads, 0, render_stream>>>(
385411
d_accum, scene, width, height, samples_to_add, total_samples_so_far, max_depth, (float)cam_center_x,
386412
(float)cam_center_y, (float)cam_center_z, (float)pixel00_x, (float)pixel00_y, (float)pixel00_z, (float)delta_u_x,
387413
(float)delta_u_y, (float)delta_u_z, (float)delta_v_x, (float)delta_v_y, (float)delta_v_z, d_ray_count,
@@ -394,7 +420,9 @@ extern "C" unsigned long long renderPixelsCUDAAccumulative(
394420
printf("❌ Kernel launch error: %s\n", cudaGetErrorString(kernel_err));
395421
}
396422

397-
cudaError_t sync_err = cudaDeviceSynchronize();
423+
// Stream-specific sync instead of cudaDeviceSynchronize() — only waits for
424+
// this stream to finish, allowing other streams to continue running.
425+
cudaError_t sync_err = cudaStreamSynchronize(render_stream);
398426
if (sync_err != cudaSuccess)
399427
{
400428
printf("❌ Kernel execution error: %s\n", cudaGetErrorString(sync_err));
@@ -512,7 +540,9 @@ extern "C" void resetAdaptiveBuffer(void *d_pixel_sample_counts, int num_pixels)
512540
{
513541
if (d_pixel_sample_counts != nullptr)
514542
{
515-
cudaMemset(d_pixel_sample_counts, 0, (size_t)num_pixels * sizeof(int));
543+
// Same stream as the render kernel so the reset is guaranteed to complete first.
544+
cudaStream_t stream = s_compute_stream ? s_compute_stream : 0;
545+
cudaMemsetAsync(d_pixel_sample_counts, 0, (size_t)num_pixels * sizeof(int), stream);
516546
}
517547
}
518548

@@ -529,16 +559,20 @@ extern "C" int countConvergedPixels(void *d_pixel_sample_counts, int num_pixels)
529559
if (d_pixel_sample_counts == nullptr)
530560
return 0;
531561

532-
// Copy buffer to host and count negative values (converged pixels)
533-
std::vector<int> host_counts(num_pixels);
534-
cudaMemcpy(host_counts.data(), d_pixel_sample_counts, (size_t)num_pixels * sizeof(int), cudaMemcpyDeviceToHost);
562+
// GPU-side reduction: count negative values (converged pixels) using warp-shuffle.
563+
// Avoids expensive full-buffer D2H transfer that the old host-side loop required.
564+
if (s_d_converged_count == nullptr)
565+
cudaMalloc(&s_d_converged_count, sizeof(int));
566+
567+
cudaMemset(s_d_converged_count, 0, sizeof(int));
568+
569+
int threads_per_block = 256;
570+
int blocks = (num_pixels + threads_per_block - 1) / threads_per_block;
571+
countConvergedKernel<<<blocks, threads_per_block>>>(
572+
static_cast<const int *>(d_pixel_sample_counts), num_pixels, s_d_converged_count);
535573

536574
int converged = 0;
537-
for (int i = 0; i < num_pixels; ++i)
538-
{
539-
if (host_counts[i] < 0)
540-
++converged;
541-
}
575+
cudaMemcpy(&converged, s_d_converged_count, sizeof(int), cudaMemcpyDeviceToHost);
542576
return converged;
543577
}
544578

0 commit comments

Comments
 (0)