@@ -511,7 +511,48 @@ void CudaRasterizer::Rasterizer::getDistributionStrategy(
511511// ///////////////////////////// Render ///////////////////////////////
512512
513513
// Compacts the 2D compute_locally mask into a dense 1D list of tile indices.
// Each thread inspects one tile; tiles marked for local computation append
// their linear index to compute_locally_1D_2D_map through the shared atomic
// counter block_count. NOTE: the resulting ordering is nondeterministic
// (atomicAdd race between threads), so consumers must treat the map as an
// unordered set of tiles, not a sorted list.
__global__ void map2DcomputelocallyTo1D (
	int tile_num,
	const bool * compute_locally,
	int * compute_locally_1D_2D_map,
	dim3 grid,	// unused here; kept for signature compatibility with callers
	int * block_count
) {
	const int tile = blockIdx.x * blockDim.x + threadIdx.x;
	if (tile >= tile_num)
		return;
	if (!compute_locally[tile])
		return;
	const int slot = atomicAdd(block_count, 1);
	compute_locally_1D_2D_map[slot] = tile;
}
514529
// Builds a 1D launch grid covering only the tiles this process computes
// locally. Compacts the device-side boolean mask compute_locally[tile_num]
// into compute_locally_1D_2D_map (device buffer, capacity >= tile_num):
// entry j holds the linear 2D tile index of the j-th local tile, in
// nondeterministic order (atomic compaction on the device).
// Returns dim3(count, 1, 1), where count is the number of local tiles.
// NOTE(review): when no tile is local this returns gridDim.x == 0, which is
// an invalid launch configuration — callers should guard that case; confirm
// whether compute_locally can ever be all-false here.
dim3 map2DcomputelocallyTo1DGrid (
	const int tile_num,
	const bool * compute_locally,
	int * compute_locally_1D_2D_map,
	const dim3 tile_grid,
	bool debug
) {
	// Device-side counter of locally computed tiles.
	int * block_count_dev;
	CHECK_CUDA (cudaMalloc (&block_count_dev, sizeof (int )), debug);
	// Zero the counter directly on the device; cudaMemset avoids the
	// host-to-device round-trip of copying a host-side zero.
	CHECK_CUDA (cudaMemset (block_count_dev, 0, sizeof (int )), debug);

	// Perform the mapping on the device side
	map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>> (
		tile_num,
		compute_locally,
		compute_locally_1D_2D_map,
		tile_grid,
		block_count_dev
	);
	// Kernel launches return no status; launch-configuration errors only
	// surface via cudaGetLastError, so check immediately after the launch.
	CHECK_CUDA (cudaGetLastError (), debug);

	// The blocking device-to-host copy also synchronizes with the kernel.
	int block_count = 0;
	CHECK_CUDA (cudaMemcpy (&block_count, block_count_dev, sizeof (int ), cudaMemcpyDeviceToHost), debug);
	CHECK_CUDA (cudaFree (block_count_dev), debug);

	return dim3 (block_count, 1 , 1 );
}
515556
516557int CudaRasterizer::Rasterizer::renderForward (
517558 std::function<char * (size_t )> geometryBuffer,
@@ -542,7 +583,7 @@ int CudaRasterizer::Rasterizer::renderForward(
542583 char * chunkptr = geometryBuffer (chunk_size);
543584 GeometryState geomState = GeometryState::fromChunk (chunkptr, P, true ); // do not allocate extra memory here if sep_rendering==True.
544585
545- dim3 tile_grid ((width + BLOCK_X - 1 ) / BLOCK_X, (height + BLOCK_Y - 1 ) / BLOCK_Y , 1 );
586+ dim3 tile_grid (cdiv (width, BLOCK_X), cdiv (height, BLOCK_Y) , 1 );
546587 dim3 block (BLOCK_X, BLOCK_Y, 1 );
547588 int tile_num = tile_grid.x * tile_grid.y ;
548589
@@ -553,7 +594,7 @@ int CudaRasterizer::Rasterizer::renderForward(
553594
554595 timer.start (" 24 updateDistributedStatLocally.updateTileTouched" );
555596 // For sep_rendering==True case (here), we only compute tiles_touched in the renderForward.
556- updateTileTouched <<<(P + ONE_DIM_BLOCK_SIZE - 1 ) / ONE_DIM_BLOCK_SIZE , ONE_DIM_BLOCK_SIZE >>> (
597+ updateTileTouched <<<cdiv(P, ONE_DIM_BLOCK_SIZE) , ONE_DIM_BLOCK_SIZE >>> (
557598 P,
558599 tile_grid,
559600 radii,
@@ -580,7 +621,7 @@ int CudaRasterizer::Rasterizer::renderForward(
580621 timer.start (" 40 duplicateWithKeys" );
581622 // For each instance to be rendered, produce adequate [ tile | depth ] key
582623	// and corresponding duplicated Gaussian indices to be sorted
583- duplicateWithKeys << <(P + ONE_DIM_BLOCK_SIZE - 1 ) / ONE_DIM_BLOCK_SIZE , ONE_DIM_BLOCK_SIZE >> > (
624+ duplicateWithKeys << <cdiv (P, ONE_DIM_BLOCK_SIZE) , ONE_DIM_BLOCK_SIZE >> > (
584625 P,
585626 means2D,
586627 depths,
@@ -610,18 +651,26 @@ int CudaRasterizer::Rasterizer::renderForward(
610651 timer.start (" 60 identifyTileRanges" );
611652 // Identify start and end of per-tile workloads in sorted list
612653 if (num_rendered > 0 )
613- identifyTileRanges << <(num_rendered + ONE_DIM_BLOCK_SIZE - 1 ) / ONE_DIM_BLOCK_SIZE , ONE_DIM_BLOCK_SIZE >> > (
654+ identifyTileRanges << <cdiv (num_rendered, ONE_DIM_BLOCK_SIZE) , ONE_DIM_BLOCK_SIZE >> > (
614655 num_rendered,
615656 binningState.point_list_keys ,
616657 imgState.ranges );
617658 CHECK_CUDA (, debug)
618659 timer.stop (" 60 identifyTileRanges" );
619660
620- // Let each tile blend its range of Gaussians independently in parallel
661+ timer.start (" 61 map2DcomputelocallyTo1D" );
662+ int * compute_locally_1D_2D_map;
663+ CHECK_CUDA (cudaMalloc (&compute_locally_1D_2D_map, tile_num * sizeof (int )), debug);
664+
665+ dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid (tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
666+
667+ timer.stop (" 61 map2DcomputelocallyTo1D" );
668+
669+ // Let each tile blend its range of Gaussians independently in parallel
621670 const float * feature_ptr = rgb;
622671 timer.start (" 70 render" );
623672 CHECK_CUDA (FORWARD::render (// TODO: only deal with local tiles. do not even load other tiles.
624- tile_grid , block,
673+ tile_grid_1d , block,
625674 imgState.ranges ,
626675 binningState.point_list ,
627676 width, height,
@@ -631,7 +680,7 @@ int CudaRasterizer::Rasterizer::renderForward(
631680 imgState.accum_alpha ,
632681 imgState.n_contrib ,
633682 imgState.n_contrib2loss ,
634- compute_locally ,
683+ compute_locally_1D_2D_map ,
635684 background,
636685 out_color), debug)
637686 timer.stop (" 70 render" );
@@ -754,6 +803,7 @@ int CudaRasterizer::Rasterizer::renderForward(
754803 }
755804
756805 delete[] log_tmp;
806+ CHECK_CUDA (cudaFree (compute_locally_1D_2D_map), debug);
757807 return num_rendered;
758808}
759809
@@ -787,14 +837,23 @@ void CudaRasterizer::Rasterizer::renderBackward(
787837
788838 const dim3 tile_grid ((width + BLOCK_X - 1 ) / BLOCK_X, (height + BLOCK_Y - 1 ) / BLOCK_Y, 1 );
789839 const dim3 block (BLOCK_X, BLOCK_Y, 1 );
840+ const int tile_num = tile_grid.x * tile_grid.y ;
841+
842+ timer.start (" 61 map2DcomputelocallyTo1D" );
843+ int * compute_locally_1D_2D_map;
844+ CHECK_CUDA (cudaMalloc (&compute_locally_1D_2D_map, tile_num * sizeof (int )), debug);
845+
846+ dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid (tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
847+
848+ timer.stop (" 61 map2DcomputelocallyTo1D" );
790849
791850 // Compute loss gradients w.r.t. 2D mean position, conic matrix,
792851 // opacity and RGB of Gaussians from per-pixel loss gradients.
793852 // If we were given precomputed colors and not SHs, use them.
794853 const float * color_ptr = rgb;
795854 timer.start (" b10 render" );
796855 CHECK_CUDA (BACKWARD::render (
797- tile_grid ,
856+ tile_grid_1d ,
798857 block,
799858 imgState.ranges ,
800859 binningState.point_list ,
@@ -805,7 +864,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
805864 color_ptr,
806865 imgState.accum_alpha ,
807866 imgState.n_contrib ,
808- compute_locally ,
867+ compute_locally_1D_2D_map ,
809868 dL_dpix,
810869 (float3 *)dL_dmean2D,
811870 (float4 *)dL_dconic,
@@ -821,4 +880,7 @@ void CudaRasterizer::Rasterizer::renderBackward(
821880 if (zhx_time && iteration % log_interval == 1 ) {
822881 timer.printAllTimes (iteration, world_size, global_rank, log_folder, false );
823882 }
883+
884+ // Free used memory
885+ CHECK_CUDA (cudaFree (compute_locally_1D_2D_map), debug);
824886}
0 commit comments