|
| 1 | +/* |
| 2 | + * Copyright (C) 2024, Graz University of Technology |
| 3 | + * This code is licensed under the MIT license (see LICENSE.txt in this folder for details) |
| 4 | + */ |
| 5 | + |
| 6 | + #pragma once |
| 7 | + |
| 8 | + #include "../auxiliary.h" |
| 9 | + |
| 10 | + #include <cooperative_groups.h> |
| 11 | + namespace cg = cooperative_groups; |
| 12 | + |
| 13 | +__device__ __inline__ uint64_t constructSortKey(uint32_t tile_id, float depth) |
| 14 | +{ |
| 15 | + uint64_t key = tile_id; |
| 16 | + key <<= 32; |
| 17 | + key |= *((uint32_t*)&depth); |
| 18 | + return key; |
| 19 | +} |
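|  | + |
|  | +// Note on the key layout (added commentary): the tile ID occupies the upper 32 bits and the raw |
|  | +// IEEE-754 bits of the depth occupy the lower 32 bits, so sorting the keys as plain uint64_t values |
|  | +// orders primitives first by tile and then front-to-back within each tile. This relies on the depth |
|  | +// being non-negative, which the kernel below ensures by clamping intersection depths to zero. |
|  | +// A hypothetical decode helper (not used anywhere in this file, shown only as a sketch): |
|  | +//   __device__ uint32_t tileFromKey(uint64_t key)  { return (uint32_t)(key >> 32); } |
|  | +//   __device__ float    depthFromKey(uint64_t key) { return __uint_as_float((uint32_t)key); } |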
| 20 | + |
| 21 | +// Given a ray and a Gaussian primitive, compute the intersection depth. |
| 22 | +__device__ __inline__ bool getIntersectPoint( |
| 23 | + const int W, const int H, |
| 24 | + const float fx, const float fy, |
| 25 | + const float2 scale, |
| 26 | + const glm::vec2 pixel_center, |
| 27 | + const float* view2gaussian, |
| 28 | + float& depth |
| 29 | +){ |
| 30 | + |
| 31 | + // First compute the two homogeneous planes, see Eq. (8) |
| 32 | + float3 Tu = {view2gaussian[0], view2gaussian[1], view2gaussian[2]}; |
| 33 | + float3 Tv = {view2gaussian[3], view2gaussian[4], view2gaussian[5]}; |
| 34 | + float3 Tw = {view2gaussian[6], view2gaussian[7], view2gaussian[8]}; |
| 35 | + float3 k = {-Tu.x + pixel_center.x * Tw.x, -Tu.y + pixel_center.x * Tw.y, -Tu.z + pixel_center.x * Tw.z}; |
| 36 | + float3 l = {-Tv.x + pixel_center.y * Tw.x, -Tv.y + pixel_center.y * Tw.y, -Tv.z + pixel_center.y * Tw.z}; |
| 37 | + // cross product of two planes is a line (i.e., homogeneous point), See Eq. (10) |
| 38 | + float3 p = crossProduct(k, l); |
| 39 | + |
| 40 | + if (p.z == 0.0f) return false; // there is no intersection |
| 41 | + // TODO: no intersection if distance < scale |
| 42 | + |
| 43 | + // dehomogenize: homogeneous intersection point to a 2D point on the splat |
| 44 | + float2 s = {p.x / p.z, p.y / p.z}; |
| 45 | + // squared distance on the splat (Mahalanobis distance in the canonical splat space); not used for the depth below |
| 46 | + float rho3d = (s.x * s.x + s.y * s.y); |
| 47 | + |
| 48 | + depth = (s.x * Tw.x + s.y * Tw.y) + Tw.z; // splat depth |
| 49 | + return true; |
| 50 | +} |
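|  | + |
|  | +// Sketch of the math in getIntersectPoint (added commentary; the equation numbers refer to the |
|  | +// accompanying paper, not to this file): the three 3-vectors Tu, Tv, Tw stored in view2gaussian |
|  | +// turn the pixel center (u, v) into two homogeneous planes k and l; their cross product p is the |
|  | +// homogeneous intersection point of the pixel ray with the splat, s = (p.x/p.z, p.y/p.z) is that |
|  | +// point in the splat's local frame, and the dot product with Tw recovers the view-space depth. |
|  | +// Note that W, H, fx, fy and scale are currently unused here and appear to be kept only for |
|  | +// interface compatibility. |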
| 51 | + |
| 52 | + |
| 53 | +template<bool TILE_BASED_CULLING = false, bool LOAD_BALANCING = true> |
| 54 | +__global__ void duplicateWithKeys_extended( |
| 55 | + int P, |
| 56 | + int W, int H, |
| 57 | + const float focal_x, const float focal_y, |
| 58 | + const float2* __restrict__ points_xy, |
| 59 | + const float* __restrict__ depths, |
| 60 | + const float2* __restrict__ scales, |
| 61 | + const float* __restrict__ view2gaussians, |
| 62 | + const uint32_t* __restrict__ offsets, |
| 63 | + const int* __restrict__ radii, |
| 64 | + const float2* __restrict__ rects, |
| 65 | + uint64_t* __restrict__ gaussian_keys_unsorted, |
| 66 | + uint32_t* __restrict__ gaussian_values_unsorted, |
| 67 | + dim3 grid) |
| 68 | +{ |
| 69 | + auto block = cg::this_thread_block(); |
| 70 | + auto warp = cg::tiled_partition<WARP_SIZE>(block); |
| 71 | + |
| 72 | + // Since the projection of the quadratic surface onto the image is non-convex, |
| 73 | + // there is no closed-form solution for the pixel with the maximum weight, |
| 74 | + // so neither the max-contribution position nor tile-based culling is evaluated. |
| 75 | + constexpr bool EVAL_MAX_CONTRIB_POS = false; |
| 76 | + constexpr bool PER_TILE_DEPTH = true; |
| 77 | + |
| 78 | +#define RETURN_OR_INACTIVE() if constexpr(LOAD_BALANCING) { active = false; } else { return; } |
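|  | + // With load balancing, a thread with no work of its own is only marked inactive so that it can |
|  | + // later help process the remaining tiles of other threads in its warp; without load balancing |
|  | + // it simply returns. |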
| 79 | + uint32_t idx = cg::this_grid().thread_rank(); |
| 80 | + bool active = true; |
| 81 | + if (idx >= P) { |
| 82 | + RETURN_OR_INACTIVE(); |
| 83 | + idx = P - 1; |
| 84 | + } |
| 85 | + |
| 86 | + const int radius = radii[idx]; |
| 87 | + if (radius <= 0) { |
| 88 | + RETURN_OR_INACTIVE(); |
| 89 | + } |
| 90 | + |
| 91 | + // If every thread in the warp is inactive (index out of range or zero projected radius), |
| 92 | + // there is nothing left for this warp to process, so the whole warp can return. |
| 93 | + if constexpr(LOAD_BALANCING) |
| 94 | + if (__ballot_sync(WARP_MASK, active) == 0) |
| 95 | + return; |
| 96 | + |
| 97 | + // Find this Gaussian's offset in buffer for writing keys/values. |
| 98 | + uint32_t off_init = (idx == 0) ? 0 : offsets[idx - 1]; |
| 99 | + |
| 100 | + const int offset_to_init = offsets[idx]; |
| 101 | + const float global_depth_init = depths[idx]; |
| 102 | + |
| 103 | + const float2 xy_init = points_xy[idx]; |
| 104 | + const float2 rect_dims_init = rects[idx]; |
| 105 | + |
| 106 | + __shared__ float2 s_xy[BLOCK_SIZE]; |
| 107 | + __shared__ float2 s_rect_dims[BLOCK_SIZE]; |
| 108 | + __shared__ float s_radius[BLOCK_SIZE]; |
| 109 | + s_xy[block.thread_rank()] = xy_init; |
| 110 | + s_rect_dims[block.thread_rank()] = rect_dims_init; |
| 111 | + s_radius[block.thread_rank()] = radius; |
| 112 | + |
| 113 | + uint2 rect_min_init, rect_max_init; |
| 114 | +#if FAST_INFERENCE |
| 115 | + if (radius > MAX_BILLBOARD_SIZE) |
| 116 | + getRectOld(xy_init, radius, rect_min_init, rect_max_init, grid); |
| 117 | + else |
| 118 | + getRect(xy_init, rect_dims_init, rect_min_init, rect_max_init, grid); |
| 119 | +#else |
| 120 | + getRectOld(xy_init, radius, rect_min_init, rect_max_init, grid); |
| 121 | +#endif |
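|  | + // With FAST_INFERENCE, large splats (radius above MAX_BILLBOARD_SIZE) presumably fall back to the |
|  | + // conservative square bound of getRectOld, while smaller ones use the tighter per-axis rectangle |
|  | + // from getRect; the exact semantics of these helpers live in auxiliary.h. |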
| 122 | + |
| 123 | + __shared__ float s_view2gaussians[BLOCK_SIZE * 9]; |
| 124 | + __shared__ float2 s_scales[BLOCK_SIZE]; |
| 125 | + |
| 126 | + if constexpr (PER_TILE_DEPTH) |
| 127 | + { |
| 128 | + s_scales[block.thread_rank()] = scales[idx]; |
| 129 | + for (int ii = 0; ii < 9; ii++) |
| 130 | + s_view2gaussians[9 * block.thread_rank() + ii] = view2gaussians[idx * 9 + ii]; |
| 131 | + } |
| 132 | + |
| 133 | + constexpr uint32_t SEQUENTIAL_TILE_THRESH = 32U; // all tiles above this threshold will be computed cooperatively |
| 134 | + const uint32_t rect_width_init = (rect_max_init.x - rect_min_init.x); |
| 135 | + const uint32_t tile_count_init = (rect_max_init.y - rect_min_init.y) * rect_width_init; |
| 136 | + |
| 137 | + // Generate no key/value pair for invisible Gaussians |
| 138 | + if (tile_count_init == 0) { |
| 139 | + RETURN_OR_INACTIVE(); |
| 140 | + } |
| 141 | + auto tile_function = [&](const int W, const int H, |
| 142 | + const float fx, const float fy, |
| 143 | + float2 xy, |
| 144 | + int x, int y,// tile ID |
| 145 | + const float2 scale, |
| 146 | + const float* view2gaussian, |
| 147 | + const float global_depth, |
| 148 | + float& depth) |
| 149 | + { |
| 150 | + const glm::vec2 tile_min(x * BLOCK_X, y * BLOCK_Y); |
| 151 | + const glm::vec2 tile_max((x + 1) * BLOCK_X - 1, (y + 1) * BLOCK_Y - 1); // tile bounds in pixel coordinates |
| 152 | + |
| 153 | + glm::vec2 max_pos; // unused: the max-contribution position is not evaluated (EVAL_MAX_CONTRIB_POS == false) |
| 154 | + if constexpr (PER_TILE_DEPTH) |
| 155 | + { |
| 156 | + glm::vec2 target_pos = {max(min(xy.x, tile_max.x), tile_min.x), max(min(xy.y, tile_max.y), tile_min.y)}; |
| 157 | + |
| 158 | + // Or select the tile's center pixel as the target_pos. |
| 159 | + // const glm::vec2 tile_center = (tile_min + tile_max) * 0.5f; |
| 160 | + // glm::vec2 target_pos = tile_center; |
| 161 | + |
| 162 | + bool intersect = getIntersectPoint( |
| 163 | + W, H, fx, fy, scale, target_pos, view2gaussian, depth); // Compute the intersection point of the quadratic surface. |
| 164 | + if (intersect) |
| 165 | + depth = max(0.0f, depth); |
| 166 | + else // If there is no intersection, sort by the Gaussian centroid. |
| 167 | + depth = global_depth; |
| 168 | + } |
| 169 | + else |
| 170 | + { |
| 171 | + depth = global_depth; |
| 172 | + } |
| 173 | + |
| 174 | + // Since the quadratic surface is non-convex, tile-based culling is not performed. |
| 175 | + // return (!TILE_BASED_CULLING) || max_opac_factor <= opacity_factor_threshold; |
| 176 | + return true; |
| 177 | + }; |
| 178 | + |
| 179 | + if (active) |
| 180 | + { |
| 181 | + const float2 scale_init = { |
| 182 | + s_scales[block.thread_rank()].x, |
| 183 | + s_scales[block.thread_rank()].y}; |
| 184 | + |
| 185 | + float view2gaussian_init[9]; |
| 186 | + for (int ii = 0; ii < 9; ii++) |
| 187 | + view2gaussian_init[ii] = s_view2gaussians[9 * block.thread_rank() + ii]; |
| 188 | + |
| 189 | + for (uint32_t tile_idx = 0; tile_idx < tile_count_init && (!LOAD_BALANCING || tile_idx < SEQUENTIAL_TILE_THRESH); tile_idx++) |
| 190 | + { |
| 191 | + const int y = (tile_idx / rect_width_init) + rect_min_init.y; |
| 192 | + const int x = (tile_idx % rect_width_init) + rect_min_init.x; |
| 193 | + |
| 194 | + float depth; |
| 195 | + bool write_tile = tile_function( |
| 196 | + W, H, focal_x, focal_y, |
| 197 | + xy_init, x, y, scale_init, view2gaussian_init, global_depth_init, depth); |
| 198 | + if (write_tile) |
| 199 | + { |
| 200 | + if (off_init < offset_to_init) |
| 201 | + { |
| 202 | + const uint32_t tile_id = y * grid.x + x; |
| 203 | + gaussian_values_unsorted[off_init] = idx; |
| 204 | + gaussian_keys_unsorted[off_init] = constructSortKey(tile_id, depth); |
| 205 | + } |
| 206 | + else |
| 207 | + { |
| 208 | +#ifdef DUPLICATE_OPT_DEBUG |
| 209 | + printf("Error (sequential): Too little memory reserved in preprocess: off=%d off_to=%d idx=%d\n", off_init, offset_to_init, idx); |
| 210 | +#endif |
| 211 | + } |
| 212 | + off_init++; |
| 213 | + } |
| 214 | + } |
| 215 | + } |
| 216 | + |
| 217 | +#undef RETURN_OR_INACTIVE |
| 218 | + |
| 219 | + if (!LOAD_BALANCING) // everything below is the cooperative pass for other threads' remaining tiles; without load balancing we are already done |
| 220 | + return; |
| 221 | + |
| 222 | + const uint32_t idx_init = idx; // Current thread idx. |
| 223 | + const uint32_t lane_idx = cg::this_thread_block().thread_rank() % WARP_SIZE; |
| 224 | + const uint32_t warp_idx = cg::this_thread_block().thread_rank() / WARP_SIZE; |
| 225 | + unsigned int lane_mask_allprev_excl = 0xFFFFFFFFU >> (WARP_SIZE - lane_idx); |
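|  | + // mask of all lanes with a lower index than this one; used below to turn a warp ballot into an |
|  | + // exclusive prefix count so that each writing lane gets a unique output offset |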
| 226 | + |
| 227 | + const int32_t compute_cooperatively = active && tile_count_init > SEQUENTIAL_TILE_THRESH; // this thread still has tiles left that the warp should finish cooperatively |
| 228 | + const uint32_t remaining_threads = __ballot_sync(WARP_MASK, compute_cooperatively); |
| 229 | + if (remaining_threads == 0) |
| 230 | + return; |
| 231 | + |
| 232 | + uint32_t n_remaining_threads = __popc(remaining_threads); // number of threads whose remaining tiles are processed cooperatively |
| 233 | + for (int n = 0; n < n_remaining_threads && n < WARP_SIZE; n++) |
| 234 | + { |
| 235 | + int i = __fns(remaining_threads, 0, n+1); // find lane index of next remaining thread |
| 236 | + |
| 237 | + uint32_t idx_coop = __shfl_sync(WARP_MASK, idx_init, i); |
| 238 | + uint32_t off_coop = __shfl_sync(WARP_MASK, off_init, i); |
| 239 | + |
| 240 | + const uint32_t offset_to = __shfl_sync(WARP_MASK, offset_to_init, i); |
| 241 | + const float global_depth = __shfl_sync(WARP_MASK, global_depth_init, i); |
| 242 | + |
| 243 | + const float2 xy = s_xy[warp.meta_group_rank() * WARP_SIZE + i]; |
| 244 | + const float2 rect_dims = s_rect_dims[warp.meta_group_rank() * WARP_SIZE + i]; |
| 245 | + const float rad = s_radius[warp.meta_group_rank() * WARP_SIZE + i]; |
| 246 | + const float2 scale = { |
| 247 | + s_scales[warp.meta_group_rank() * WARP_SIZE + i].x, |
| 248 | + s_scales[warp.meta_group_rank() * WARP_SIZE + i].y}; |
| 249 | + float view2gaussian[9]; |
| 250 | + for (int ii = 0; ii < 9; ii++) |
| 251 | + view2gaussian[ii] = s_view2gaussians[9 * (warp.meta_group_rank() * WARP_SIZE + i) + ii]; |
| 252 | + |
| 253 | + uint2 rect_min, rect_max; |
| 254 | +#if FAST_INFERENCE |
| 255 | + if (rad > MAX_BILLBOARD_SIZE) // use the radius of the Gaussian owned by lane i, not this thread's own |
| 256 | + getRectOld(xy, rad, rect_min, rect_max, grid); |
| 257 | + else |
| 258 | + getRect(xy, rect_dims, rect_min, rect_max, grid); |
| 259 | +#else |
| 260 | + getRectOld(xy, rad, rect_min, rect_max, grid); |
| 261 | +#endif |
| 262 | + |
| 263 | + const uint32_t rect_width = (rect_max.x - rect_min.x); |
| 264 | + const uint32_t tile_count = (rect_max.y - rect_min.y) * rect_width; |
| 265 | + const uint32_t remaining_tile_count = tile_count - SEQUENTIAL_TILE_THRESH; |
| 266 | + const int32_t n_iterations = (remaining_tile_count + WARP_SIZE - 1) / WARP_SIZE; |
| 267 | + for (int it = 0; it < n_iterations; it++) |
| 268 | + { |
| 269 | + int tile_idx = it * WARP_SIZE + lane_idx + SEQUENTIAL_TILE_THRESH; // skip the first SEQUENTIAL_TILE_THRESH tiles already handled sequentially above |
| 270 | + int active_curr_it = tile_idx < tile_count; |
| 271 | + |
| 272 | + int y = (tile_idx / rect_width) + rect_min.y; |
| 273 | + int x = (tile_idx % rect_width) + rect_min.x; |
| 274 | + |
| 275 | + float depth; |
| 276 | + bool write_tile = tile_function( |
| 277 | + W, H, focal_x, focal_y, |
| 278 | + xy, x, y, scale, view2gaussian, global_depth, depth |
| 279 | + ); |
| 280 | + |
| 281 | + const uint32_t write = active_curr_it && write_tile; |
| 282 | + |
| 283 | + uint32_t n_writes, write_offset; |
| 284 | + if constexpr (!TILE_BASED_CULLING) |
| 285 | + { |
| 286 | + n_writes = WARP_SIZE; |
| 287 | + write_offset = off_coop + lane_idx; |
| 288 | + } |
| 289 | + else |
| 290 | + { |
| 291 | + const uint32_t write_ballot = __ballot_sync(WARP_MASK, write); |
| 292 | + n_writes = __popc(write_ballot); |
| 293 | + |
| 294 | + const uint32_t write_offset_it = __popc(write_ballot & lane_mask_allprev_excl); |
| 295 | + write_offset = off_coop + write_offset_it; |
| 296 | + } |
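|  | + |
|  | + // With tile-based culling the warp ballot compacts the surviving writes: each lane's offset is |
|  | + // the number of lower lanes that also write, and off_coop later advances by the total write count. |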
| 297 | + |
| 298 | + if (write) |
| 299 | + { |
| 300 | + if (write_offset < offset_to) |
| 301 | + { |
| 302 | + const uint32_t tile_id = y * grid.x + x; |
| 303 | + gaussian_values_unsorted[write_offset] = idx_coop; |
| 304 | + gaussian_keys_unsorted[write_offset] = constructSortKey(tile_id, depth); |
| 305 | + } |
| 306 | + #ifdef DUPLICATE_OPT_DEBUG |
| 307 | + else |
| 308 | + { |
| 309 | + printf("Error (parallel): Too little memory reserved in preprocess: off=%d off_to=%d idx=%d tile_count=%d it=%d | x=%d y=%d rect=(%d %d - %d %d)\n", |
| 310 | + write_offset, offset_to, idx_coop, tile_count, it, x, y, rect_min.x, rect_min.y, rect_max.x, rect_max.y); |
| 311 | + } |
| 312 | + #endif |
| 313 | + } |
| 314 | + off_coop += n_writes; |
| 315 | + } |
| 316 | + |
| 317 | + __syncwarp(); |
| 318 | + } |
| 319 | + } |