Skip to content

Commit d898cb4

Browse files
authored
Merge pull request #1 from TarzanZhao/sandeep/compute_local_2d
Render function Cuda grid refactored from 2D to 1D
2 parents d802de4 + 3af2b6f commit d898cb4

File tree

6 files changed

+100
-32
lines changed

6 files changed

+100
-32
lines changed

cuda_rasterizer/auxiliary.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
2020
#define NUM_WARPS (BLOCK_SIZE/32)
21+
#define cdiv(a, b) ((a + b - 1) / b)
2122

2223
// Spherical harmonics coefficients
2324
__device__ const float SH_C0 = 0.28209479177387814f;

cuda_rasterizer/backward.cu

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ renderCUDA(
408408
const float* __restrict__ colors,
409409
const float* __restrict__ final_Ts,
410410
const uint32_t* __restrict__ n_contrib,
411-
const bool* __restrict__ compute_locally,
411+
const int* __restrict__ compute_locally_1D_2D_map,
412412
const float* __restrict__ dL_dpixels,
413413
float3* __restrict__ dL_dmean2D,
414414
float4* __restrict__ dL_dconic2D,
@@ -423,10 +423,8 @@ renderCUDA(
423423
// auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
424424

425425
// method 2: this seems to be faster than others, in set of experiments: fix_com_loc_flc_1/2/3
426-
const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
427-
auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
428-
if (!compute_locally[block_id])
429-
return;
426+
const int block_id_1d = block.group_index().x;
427+
const int block_id = compute_locally_1D_2D_map[block_id_1d];
430428

431429
// method 3
432430
// __shared__ bool compute_locally_this_tile;
@@ -443,7 +441,11 @@ renderCUDA(
443441
// if (!compute_locally_this_tile)
444442
// return;
445443

446-
const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
444+
const uint2 tile_grid = { cdiv(W, BLOCK_X), cdiv(H, BLOCK_Y) };
445+
const int block_id_x = block_id % tile_grid.x;
446+
const int block_id_y = block_id / tile_grid.x;
447+
448+
const uint2 pix_min = { block_id_x * BLOCK_X, block_id_y * BLOCK_Y };
447449
const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
448450
const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
449451
const uint32_t pix_id = W * pix.y + pix.x;
@@ -665,7 +667,7 @@ void BACKWARD::render(
665667
const float* colors,
666668
const float* final_Ts,
667669
const uint32_t* n_contrib,
668-
const bool* compute_locally,
670+
const int* compute_locally_1D_2D_map,
669671
const float* dL_dpixels,
670672
float3* dL_dmean2D,
671673
float4* dL_dconic2D,
@@ -682,7 +684,7 @@ void BACKWARD::render(
682684
colors,
683685
final_Ts,
684686
n_contrib,
685-
compute_locally,
687+
compute_locally_1D_2D_map,
686688
dL_dpixels,
687689
dL_dmean2D,
688690
dL_dconic2D,

cuda_rasterizer/backward.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ namespace BACKWARD
3131
const float* colors,
3232
const float* final_Ts,
3333
const uint32_t* n_contrib,
34-
const bool* compute_locally,
34+
const int* compute_locally_1D_2D_map,
3535
const float* dL_dpixels,
3636
float3* dL_dmean2D,
3737
float4* dL_dconic2D,

cuda_rasterizer/forward.cu

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ renderCUDA(
272272
float* __restrict__ final_T,
273273
uint32_t* __restrict__ n_contrib,
274274
uint32_t* __restrict__ n_contrib2loss,
275-
bool* __restrict__ compute_locally,
275+
const int* __restrict__ compute_locally_1D_2D_map,
276276
const float* __restrict__ bg_color,
277277
float* __restrict__ out_color)
278278
{
@@ -284,10 +284,10 @@ renderCUDA(
284284
// auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
285285

286286
// method 2: this seems to be faster than others, in set of experiments: fix_com_loc_flc_1/2/3
287-
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
288-
auto block_id = block.group_index().y * horizontal_blocks + block.group_index().x;
289-
if (!compute_locally[block_id])
290-
return;
287+
const int block_id_1d = block.group_index().x;
288+
const int block_id = compute_locally_1D_2D_map[block_id_1d];
289+
290+
//method2.1
291291

292292
// method 3
293293
// __shared__ bool compute_locally_this_tile;
@@ -304,12 +304,15 @@ renderCUDA(
304304
// if (!compute_locally_this_tile)
305305
// return;
306306

307+
const uint2 tile_grid = { cdiv(W, BLOCK_X), cdiv(H, BLOCK_Y) };
308+
const int block_id_x = block_id % tile_grid.x;
309+
const int block_id_y = block_id / tile_grid.x;
307310

308-
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
309-
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
310-
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
311-
uint32_t pix_id = W * pix.y + pix.x;
312-
float2 pixf = { (float)pix.x, (float)pix.y };
311+
const uint2 pix_min = { block_id_x * BLOCK_X, block_id_y * BLOCK_Y };
312+
const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
313+
const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
314+
const uint32_t pix_id = W * pix.y + pix.x;
315+
const float2 pixf = { (float)pix.x, (float)pix.y };
313316

314317
// Check if this thread is associated with a valid pixel or outside.
315318
bool inside = pix.x < W&& pix.y < H;
@@ -323,7 +326,7 @@ renderCUDA(
323326
// method 3
324327
// uint2 range = range_this_tile;
325328

326-
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
329+
const int rounds = cdiv(range.y - range.x, BLOCK_SIZE);
327330
int toDo = range.y - range.x;
328331

329332
// Allocate storage for batches of collectively fetched data.
@@ -423,7 +426,7 @@ void FORWARD::render(
423426
float* final_T,
424427
uint32_t* n_contrib,
425428
uint32_t* n_contrib2loss,
426-
bool* compute_locally,
429+
const int* compute_locally_1D_2D_map,
427430
const float* bg_color,
428431
float* out_color)
429432
{
@@ -437,7 +440,7 @@ void FORWARD::render(
437440
final_T,
438441
n_contrib,
439442
n_contrib2loss,
440-
compute_locally,
443+
compute_locally_1D_2D_map,
441444
bg_color,
442445
out_color);
443446
}

cuda_rasterizer/forward.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ namespace FORWARD
5959
float* final_T,
6060
uint32_t* n_contrib,
6161
uint32_t* n_contrib2loss,
62-
bool* compute_locally,
62+
const int* compute_locally_1D_2D_map,
6363
const float* bg_color,
6464
float* out_color);
6565
}

cuda_rasterizer/rasterizer_impl.cu

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,48 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
511511
/////////////////////////////// Render ///////////////////////////////
512512

513513

514+
__global__ void map2DcomputelocallyTo1D(
515+
int tile_num,
516+
const bool* compute_locally,
517+
int* compute_locally_1D_2D_map,
518+
dim3 grid,
519+
int* block_count
520+
) {
521+
int i = blockIdx.x * blockDim.x + threadIdx.x;
522+
if (i < tile_num) {
523+
if (compute_locally[i]) {
524+
int j = atomicAdd(block_count, 1);
525+
compute_locally_1D_2D_map[j] = i;
526+
}
527+
}
528+
}
514529

530+
dim3 map2DcomputelocallyTo1DGrid(
531+
const int tile_num,
532+
const bool* compute_locally,
533+
int* compute_locally_1D_2D_map,
534+
const dim3 tile_grid,
535+
bool debug
536+
) {
537+
int block_count = 0;
538+
int* block_count_dev;
539+
CHECK_CUDA(cudaMalloc(&block_count_dev, sizeof(int)), debug);
540+
CHECK_CUDA(cudaMemcpy(block_count_dev, &block_count, sizeof(int), cudaMemcpyHostToDevice), debug);
541+
542+
// Perform the mapping on the device side
543+
map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>>(
544+
tile_num,
545+
compute_locally,
546+
compute_locally_1D_2D_map,
547+
tile_grid,
548+
block_count_dev
549+
);
550+
551+
CHECK_CUDA(cudaMemcpy(&block_count, block_count_dev, sizeof(int), cudaMemcpyDeviceToHost), debug);
552+
CHECK_CUDA(cudaFree(block_count_dev), debug);
553+
554+
return dim3(block_count, 1, 1);
555+
}
515556

516557
int CudaRasterizer::Rasterizer::renderForward(
517558
std::function<char* (size_t)> geometryBuffer,
@@ -542,7 +583,7 @@ int CudaRasterizer::Rasterizer::renderForward(
542583
char* chunkptr = geometryBuffer(chunk_size);
543584
GeometryState geomState = GeometryState::fromChunk(chunkptr, P, true); // do not allocate extra memory here if sep_rendering==True.
544585

545-
dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
586+
dim3 tile_grid(cdiv(width, BLOCK_X), cdiv(height, BLOCK_Y), 1);
546587
dim3 block(BLOCK_X, BLOCK_Y, 1);
547588
int tile_num = tile_grid.x * tile_grid.y;
548589

@@ -553,7 +594,7 @@ int CudaRasterizer::Rasterizer::renderForward(
553594

554595
timer.start("24 updateDistributedStatLocally.updateTileTouched");
555596
// For sep_rendering==True case (here), we only compute tiles_touched in the renderForward.
556-
updateTileTouched <<<(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >>> (
597+
updateTileTouched <<<cdiv(P, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >>> (
557598
P,
558599
tile_grid,
559600
radii,
@@ -580,7 +621,7 @@ int CudaRasterizer::Rasterizer::renderForward(
580621
timer.start("40 duplicateWithKeys");
581622
// For each instance to be rendered, produce adequate [ tile | depth ] key
582623
// and corresponding dublicated Gaussian indices to be sorted
583-
duplicateWithKeys << <(P + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
624+
duplicateWithKeys << <cdiv(P, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >> > (
584625
P,
585626
means2D,
586627
depths,
@@ -610,18 +651,26 @@ int CudaRasterizer::Rasterizer::renderForward(
610651
timer.start("60 identifyTileRanges");
611652
// Identify start and end of per-tile workloads in sorted list
612653
if (num_rendered > 0)
613-
identifyTileRanges << <(num_rendered + ONE_DIM_BLOCK_SIZE - 1) / ONE_DIM_BLOCK_SIZE, ONE_DIM_BLOCK_SIZE >> > (
654+
identifyTileRanges << <cdiv(num_rendered, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE >> > (
614655
num_rendered,
615656
binningState.point_list_keys,
616657
imgState.ranges);
617658
CHECK_CUDA(, debug)
618659
timer.stop("60 identifyTileRanges");
619660

620-
// Let each tile blend its range of Gaussians independently in parallel
661+
timer.start("61 map2DcomputelocallyTo1D");
662+
int* compute_locally_1D_2D_map;
663+
CHECK_CUDA(cudaMalloc(&compute_locally_1D_2D_map, tile_num * sizeof(int)), debug);
664+
665+
dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid(tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
666+
667+
timer.stop("61 map2DcomputelocallyTo1D");
668+
669+
// Let each tile blend its range of Gaussians independently in parallel
621670
const float* feature_ptr = rgb;
622671
timer.start("70 render");
623672
CHECK_CUDA(FORWARD::render(//TODO: only deal with local tiles. do not even load other tiles.
624-
tile_grid, block,
673+
tile_grid_1d, block,
625674
imgState.ranges,
626675
binningState.point_list,
627676
width, height,
@@ -631,7 +680,7 @@ int CudaRasterizer::Rasterizer::renderForward(
631680
imgState.accum_alpha,
632681
imgState.n_contrib,
633682
imgState.n_contrib2loss,
634-
compute_locally,
683+
compute_locally_1D_2D_map,
635684
background,
636685
out_color), debug)
637686
timer.stop("70 render");
@@ -754,6 +803,7 @@ int CudaRasterizer::Rasterizer::renderForward(
754803
}
755804

756805
delete[] log_tmp;
806+
CHECK_CUDA(cudaFree(compute_locally_1D_2D_map), debug);
757807
return num_rendered;
758808
}
759809

@@ -787,14 +837,23 @@ void CudaRasterizer::Rasterizer::renderBackward(
787837

788838
const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
789839
const dim3 block(BLOCK_X, BLOCK_Y, 1);
840+
const int tile_num = tile_grid.x * tile_grid.y;
841+
842+
timer.start("61 map2DcomputelocallyTo1D");
843+
int* compute_locally_1D_2D_map;
844+
CHECK_CUDA(cudaMalloc(&compute_locally_1D_2D_map, tile_num * sizeof(int)), debug);
845+
846+
dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid(tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
847+
848+
timer.stop("61 map2DcomputelocallyTo1D");
790849

791850
// Compute loss gradients w.r.t. 2D mean position, conic matrix,
792851
// opacity and RGB of Gaussians from per-pixel loss gradients.
793852
// If we were given precomputed colors and not SHs, use them.
794853
const float* color_ptr = rgb;
795854
timer.start("b10 render");
796855
CHECK_CUDA(BACKWARD::render(
797-
tile_grid,
856+
tile_grid_1d,
798857
block,
799858
imgState.ranges,
800859
binningState.point_list,
@@ -805,7 +864,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
805864
color_ptr,
806865
imgState.accum_alpha,
807866
imgState.n_contrib,
808-
compute_locally,
867+
compute_locally_1D_2D_map,
809868
dL_dpix,
810869
(float3*)dL_dmean2D,
811870
(float4*)dL_dconic,
@@ -821,4 +880,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
821880
if (zhx_time && iteration % log_interval == 1) {
822881
timer.printAllTimes(iteration, world_size, global_rank, log_folder, false);
823882
}
883+
884+
// Free used memory
885+
CHECK_CUDA(cudaFree(compute_locally_1D_2D_map), debug);
824886
}

0 commit comments

Comments
 (0)