Merge pull request #1 from TarzanZhao/sandeep/compute_local_2d

TarzanZhao · web-flow · commit d898cb446538 · 2024-04-12T14:45:35.000-04:00
Refactor the render kernels' CUDA launch grid from 2D to 1D
diff --git a/cuda_rasterizer/auxiliary.h b/cuda_rasterizer/auxiliary.h
@@ -18,6 +18,7 @@
 
 #define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
 #define NUM_WARPS (BLOCK_SIZE/32)
+// Ceiling division, intended for a non-negative `a` and a positive `b`.
+// Every use of an argument is parenthesized so that expression arguments
+// (e.g. `x - y`, `n << 1`) expand with the intended precedence.
+// `b` is evaluated more than once, so do not pass expressions with side effects.
+#define cdiv(a, b) (((a) + (b) - 1) / (b))
 
 // Spherical harmonics coefficients
 __device__ const float SH_C0 = 0.28209479177387814f;
diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu
@@ -408,7 +408,7 @@ renderCUDA(
 	const float* __restrict__ colors,
 	const float* __restrict__ final_Ts,
 	const uint32_t* __restrict__ n_contrib,
-	const bool* __restrict__ compute_locally,
+	const int* __restrict__ compute_locally_1D_2D_map,
 	const float* __restrict__ dL_dpixels,
 	float3* __restrict__ dL_dmean2D,
 	float4* __restrict__ dL_dconic2D,
@@ -423,10 +423,8 @@ renderCUDA(
 	// auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
 
 	// method 2: this seems to be faster than others, in set of experiments: fix_com_loc_flc_1/2/3
-	const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
-	auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
-	if (!compute_locally[block_id])
-		return;
+	const int block_id_1d = block.group_index().x;
+    const int block_id = compute_locally_1D_2D_map[block_id_1d];
 
 	// method 3
 	// __shared__ bool compute_locally_this_tile;
@@ -443,7 +441,11 @@ renderCUDA(
 	// if (!compute_locally_this_tile)
 	// 	return;
 
-	const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
+    const uint2 tile_grid = { cdiv(W, BLOCK_X), cdiv(H, BLOCK_Y) }; 
+    const int block_id_x = block_id % tile_grid.x;
+    const int block_id_y = block_id / tile_grid.x;
+
+    const uint2 pix_min = { block_id_x * BLOCK_X, block_id_y * BLOCK_Y };
 	const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
 	const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
 	const uint32_t pix_id = W * pix.y + pix.x;
@@ -665,7 +667,7 @@ void BACKWARD::render(
 	const float* colors,
 	const float* final_Ts,
 	const uint32_t* n_contrib,
-	const bool* compute_locally,
+	const int* compute_locally_1D_2D_map,
 	const float* dL_dpixels,
 	float3* dL_dmean2D,
 	float4* dL_dconic2D,
@@ -682,7 +684,7 @@ void BACKWARD::render(
 		colors,
 		final_Ts,
 		n_contrib,
-		compute_locally,
+		compute_locally_1D_2D_map,
 		dL_dpixels,
 		dL_dmean2D,
 		dL_dconic2D,
diff --git a/cuda_rasterizer/backward.h b/cuda_rasterizer/backward.h
@@ -31,7 +31,7 @@ namespace BACKWARD
 		const float* colors,
 		const float* final_Ts,
 		const uint32_t* n_contrib,
-		const bool* compute_locally,
+		const int* compute_locally_1D_2D_map,
 		const float* dL_dpixels,
 		float3* dL_dmean2D,
 		float4* dL_dconic2D,
diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
@@ -272,7 +272,7 @@ renderCUDA(
 	float* __restrict__ final_T,
 	uint32_t* __restrict__ n_contrib,
 	uint32_t* __restrict__ n_contrib2loss,
-	bool* __restrict__ compute_locally,
+    const int* __restrict__ compute_locally_1D_2D_map,
 	const float* __restrict__ bg_color,
 	float* __restrict__ out_color)
 {
@@ -284,10 +284,10 @@ renderCUDA(
 	// auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
 
 	// method 2: this seems to be faster than others, in set of experiments: fix_com_loc_flc_1/2/3
-	uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
-	auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
-	if (!compute_locally[block_id])
-		return;
+	const int block_id_1d = block.group_index().x;
+    const int block_id = compute_locally_1D_2D_map[block_id_1d];
+
+    //method2.1
 
 	// method 3
 	// __shared__ bool compute_locally_this_tile;
@@ -304,12 +304,15 @@ renderCUDA(
 	// if (!compute_locally_this_tile)
 	// 	return;
 
+    const uint2 tile_grid = { cdiv(W, BLOCK_X), cdiv(H, BLOCK_Y) }; 
+    const int block_id_x = block_id % tile_grid.x;
+    const int block_id_y = block_id / tile_grid.x;
 
-	uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
-	uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
-	uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
-	uint32_t pix_id = W * pix.y + pix.x;
-	float2 pixf = { (float)pix.x, (float)pix.y };
+    const uint2 pix_min = { block_id_x * BLOCK_X, block_id_y * BLOCK_Y };
+	const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
+	const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
+	const uint32_t pix_id = W * pix.y + pix.x;
+	const float2 pixf = { (float)pix.x, (float)pix.y };
 
 	// Check if this thread is associated with a valid pixel or outside.
 	bool inside = pix.x < W&& pix.y < H;
@@ -323,7 +326,7 @@ renderCUDA(
 	// method 3
 	// uint2 range = range_this_tile;
 
-	const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
+	const int rounds = cdiv(range.y - range.x, BLOCK_SIZE);
 	int toDo = range.y - range.x;
 
 	// Allocate storage for batches of collectively fetched data.
@@ -423,7 +426,7 @@ void FORWARD::render(
 	float* final_T,
 	uint32_t* n_contrib,
 	uint32_t* n_contrib2loss,
-	bool* compute_locally,
+    const int* compute_locally_1D_2D_map,
 	const float* bg_color,
 	float* out_color)
 {
@@ -437,7 +440,7 @@ void FORWARD::render(
 		final_T,
 		n_contrib,
 		n_contrib2loss,
-		compute_locally,
+        compute_locally_1D_2D_map,
 		bg_color,
 		out_color);
 }
diff --git a/cuda_rasterizer/forward.h b/cuda_rasterizer/forward.h
@@ -59,7 +59,7 @@ namespace FORWARD
 		float* final_T,
 		uint32_t* n_contrib,
 		uint32_t* n_contrib2loss,
-		bool* compute_locally,
+        const int* compute_locally_1D_2D_map,
 		const float* bg_color,
 		float* out_color);
 }
diff --git a/cuda_rasterizer/rasterizer_impl.cu b/cuda_rasterizer/rasterizer_impl.cu
@@ -511,7 +511,48 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
 /////////////////////////////// Render ///////////////////////////////
 
 
+/**
+ * Collects the indices of all tiles whose `compute_locally` flag is set into
+ * the front of `compute_locally_1D_2D_map`.
+ *
+ * One thread handles one tile index. Slots are handed out with an atomic
+ * counter, so the order of the indices written to the map depends on thread
+ * scheduling and is not sorted.
+ *
+ * @param tile_num                  Number of entries in `compute_locally`.
+ * @param compute_locally           Per-tile flags; a true entry marks a tile to record.
+ * @param compute_locally_1D_2D_map Output list of flagged tile indices.
+ * @param grid                      Not read by this kernel.
+ * @param block_count               Running count of recorded tiles; the kernel only
+ *                                  increments it, so it must already hold the start value.
+ */
+__global__ void map2DcomputelocallyTo1D(
+    int tile_num,
+    const bool* compute_locally,
+    int* compute_locally_1D_2D_map,
+    dim3 grid,
+    int* block_count
+) {
+    const int tile_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads past the end of the flag array and tiles that are not flagged
+    // have nothing to record. The bounds test comes first so the flag array
+    // is never read out of range.
+    if (tile_idx >= tile_num || !compute_locally[tile_idx])
+        return;
+
+    // Reserve the next free slot of the map and store this tile's index there.
+    const int slot = atomicAdd(block_count, 1);
+    compute_locally_1D_2D_map[slot] = tile_idx;
+}
 
+/**
+ * Host wrapper around map2DcomputelocallyTo1D: fills `compute_locally_1D_2D_map`
+ * with the indices of the flagged tiles and returns a 1D launch grid with one
+ * block per flagged tile.
+ *
+ * @param tile_num                  Number of entries in `compute_locally`.
+ * @param compute_locally           Per-tile flags, dereferenced inside the kernel, so it
+ *                                  must be device-accessible.
+ * @param compute_locally_1D_2D_map Device output buffer; the kernel may write up to
+ *                                  `tile_num` ints into it.
+ * @param tile_grid                 Forwarded to the kernel, which does not read it.
+ * @param debug                     Forwarded to CHECK_CUDA.
+ * @return dim3(count, 1, 1), where `count` is the number of flagged tiles.
+ *         NOTE(review): `count` can be 0; a grid dimension of 0 is not a valid launch
+ *         configuration, so callers should skip the render launch in that case.
+ */
+dim3 map2DcomputelocallyTo1DGrid(
+    const int tile_num,
+    const bool* compute_locally,
+    int* compute_locally_1D_2D_map,
+    const dim3 tile_grid,
+    bool debug
+) {
+    // Nothing to compact. This also avoids launching the mapping kernel with
+    // zero blocks, which is not a valid launch configuration.
+    if (tile_num <= 0)
+        return dim3(0, 1, 1);
+
+    // Device-side counter of flagged tiles; it must be zero before the kernel
+    // starts because the kernel only ever increments it.
+    int* block_count_dev = nullptr;
+    CHECK_CUDA(cudaMalloc(&block_count_dev, sizeof(int)), debug);
+    CHECK_CUDA(cudaMemset(block_count_dev, 0, sizeof(int)), debug);
+
+    // Perform the mapping on the device side
+    map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>>(
+        tile_num,
+        compute_locally,
+        compute_locally_1D_2D_map,
+        tile_grid,
+        block_count_dev
+    );
+    // Mirrors the post-launch CHECK_CUDA(, debug) used after the other kernel
+    // launches in this file.
+    CHECK_CUDA(, debug)
+
+    // Blocking device-to-host copy of the final count.
+    int block_count = 0;
+    CHECK_CUDA(cudaMemcpy(&block_count, block_count_dev, sizeof(int), cudaMemcpyDeviceToHost), debug);
+    CHECK_CUDA(cudaFree(block_count_dev), debug);
+
+    return dim3(block_count, 1, 1);
+}
 
 int CudaRasterizer::Rasterizer::renderForward(
 	std::function<char* (size_t)> geometryBuffer,
@@ -542,7 +583,7 @@ int CudaRasterizer::Rasterizer::renderForward(
 	char* chunkptr = geometryBuffer(chunk_size);
 	GeometryState geomState = GeometryState::fromChunk(chunkptr, P, true); // do not allocate extra memory here if sep_rendering==True.
 
-	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+	dim3 tile_grid(cdiv(width, BLOCK_X), cdiv(height, BLOCK_Y), 1);
 	dim3 block(BLOCK_X, BLOCK_Y, 1);
 	int tile_num = tile_grid.x * tile_grid.y;
 
@@ -553,7 +594,7 @@ int CudaRasterizer::Rasterizer::renderForward(
 
 	timer.start("24 updateDistributedStatLocally.updateTileTouched");
 	// For sep_rendering==True case (here), we only compute tiles_touched in the renderForward.
-	updateTileTouched <<<(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
+	updateTileTouched <<<cdiv(P, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >>> (
 		P,
 		tile_grid,
 		radii,
@@ -580,7 +621,7 @@ int CudaRasterizer::Rasterizer::renderForward(
 	timer.start("40 duplicateWithKeys");
 	// For each instance to be rendered, produce adequate [ tile | depth ] key 
 	// and corresponding dublicated Gaussian indices to be sorted
-	duplicateWithKeys << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
+	duplicateWithKeys << <cdiv(P, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >> > (
 		P,
 		means2D,
 		depths,
@@ -610,18 +651,26 @@ int CudaRasterizer::Rasterizer::renderForward(
 	timer.start("60 identifyTileRanges");
 	// Identify start and end of per-tile workloads in sorted list
 	if (num_rendered > 0)
-		identifyTileRanges << <(num_rendered + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
+		identifyTileRanges << <cdiv(num_rendered, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >> > (
 			num_rendered,
 			binningState.point_list_keys,
 			imgState.ranges);
 	CHECK_CUDA(, debug)
 	timer.stop("60 identifyTileRanges");
 
-	// Let each tile blend its range of Gaussians independently in parallel
+	timer.start("61 map2DcomputelocallyTo1D");
+    int* compute_locally_1D_2D_map;
+    CHECK_CUDA(cudaMalloc(&compute_locally_1D_2D_map, tile_num * sizeof(int)), debug);
+
+    dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid(tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
+
+    timer.stop("61 map2DcomputelocallyTo1D");
+
+    // Let each tile blend its range of Gaussians independently in parallel
 	const float* feature_ptr = rgb;
 	timer.start("70 render");
 	CHECK_CUDA(FORWARD::render(//TODO: only deal with local tiles. do not even load other tiles.
-		tile_grid, block,
+		tile_grid_1d, block,
 		imgState.ranges,
 		binningState.point_list,
 		width, height,
@@ -631,7 +680,7 @@ int CudaRasterizer::Rasterizer::renderForward(
 		imgState.accum_alpha,
 		imgState.n_contrib,
 		imgState.n_contrib2loss,
-		compute_locally,
+        compute_locally_1D_2D_map,
 		background,
 		out_color), debug)
 	timer.stop("70 render");
@@ -754,6 +803,7 @@ int CudaRasterizer::Rasterizer::renderForward(
 	}
 
 	delete[] log_tmp;
+    CHECK_CUDA(cudaFree(compute_locally_1D_2D_map), debug);
 	return num_rendered;
 }
 
@@ -787,14 +837,23 @@ void CudaRasterizer::Rasterizer::renderBackward(
 
 	const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
 	const dim3 block(BLOCK_X, BLOCK_Y, 1);
+    const int tile_num = tile_grid.x * tile_grid.y;
+
+    timer.start("61 map2DcomputelocallyTo1D");
+    int* compute_locally_1D_2D_map;
+    CHECK_CUDA(cudaMalloc(&compute_locally_1D_2D_map, tile_num * sizeof(int)), debug);
+
+    dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid(tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
+
+    timer.stop("61 map2DcomputelocallyTo1D");
 
 	// Compute loss gradients w.r.t. 2D mean position, conic matrix,
 	// opacity and RGB of Gaussians from per-pixel loss gradients.
 	// If we were given precomputed colors and not SHs, use them.
 	const float* color_ptr = rgb;
 	timer.start("b10 render");
 	CHECK_CUDA(BACKWARD::render(
-		tile_grid,
+		tile_grid_1d,
 		block,
 		imgState.ranges,
 		binningState.point_list,
@@ -805,7 +864,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
 		color_ptr,
 		imgState.accum_alpha,
 		imgState.n_contrib,
-		compute_locally,
+		compute_locally_1D_2D_map,
 		dL_dpix,
 		(float3*)dL_dmean2D,
 		(float4*)dL_dconic,
@@ -821,4 +880,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
 	if (zhx_time && iteration % log_interval == 1) {
 		timer.printAllTimes(iteration, world_size, global_rank, log_folder, false);
 	}
+
+    // Free used memory
+    CHECK_CUDA(cudaFree(compute_locally_1D_2D_map), debug);
 }

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ namespace FORWARD`
`59`	`59`	`float* final_T,`
`60`	`60`	`uint32_t* n_contrib,`
`61`	`61`	`uint32_t* n_contrib2loss,`
`62`		`- bool* compute_locally,`
	`62`	`+ const int* compute_locally_1D_2D_map,`
`63`	`63`	`const float* bg_color,`
`64`	`64`	`float* out_color);`
`65`	`65`	`}`