@@ -516,17 +516,44 @@ __global__ void map2DcomputelocallyTo1D(
516516 const bool * compute_locally,
517517 int * compute_locally_1D_2D_map,
518518 dim3 grid,
519- int * count
519+ int * block_count
520520) {
521521 int i = blockIdx .x * blockDim .x + threadIdx .x ;
522522 if (i < tile_num) {
523523 if (compute_locally[i]) {
524- int j = atomicAdd (count , 1 );
524+ int j = atomicAdd (block_count , 1 );
525525 compute_locally_1D_2D_map[j] = i;
526526 }
527527 }
528528}
529529
/**
 * Host-side wrapper around the map2DcomputelocallyTo1D kernel.
 *
 * Zeroes a device-side counter, launches the kernel over `tile_num` indices so
 * that every index i with compute_locally[i] set is appended to
 * compute_locally_1D_2D_map, then copies the final counter value back to the host.
 *
 * @param tile_num                  Number of entries of compute_locally to scan.
 * @param compute_locally           Per-tile flags; dereferenced inside the kernel,
 *                                  so it must be a device-accessible pointer.
 * @param compute_locally_1D_2D_map Device buffer receiving the selected indices.
 *                                  Must have room for every flagged entry
 *                                  (at most tile_num ints).
 * @param tile_grid                 Forwarded unchanged to the kernel.
 * @param debug                     Forwarded to CHECK_CUDA.
 * @return dim3(count, 1, 1), where count is the number of flagged entries that
 *         were written to compute_locally_1D_2D_map.
 *
 * Note: the kernel assigns output slots with atomicAdd, so the order of the
 * indices inside the map is whatever order the atomic increments resolve in.
 */
dim3 map2DcomputelocallyTo1DGrid(
	const int tile_num,
	const bool* compute_locally,
	int* compute_locally_1D_2D_map,
	const dim3 tile_grid,
	bool debug
) {
	// A launch with zero blocks is an invalid configuration. With nothing to
	// scan nothing can be selected, so report an empty grid without touching
	// the device at all.
	if (tile_num <= 0) {
		return dim3(0, 1, 1);
	}

	int block_count = 0;
	int* block_count_dev = nullptr;
	CHECK_CUDA(cudaMalloc(&block_count_dev, sizeof(int)), debug);
	// Zero the counter directly on the device; an all-zero byte pattern is int 0.
	CHECK_CUDA(cudaMemset(block_count_dev, 0, sizeof(int)), debug);

	// Perform the mapping on the device side: one thread per tile index.
	map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>>(
		tile_num,
		compute_locally,
		compute_locally_1D_2D_map,
		tile_grid,
		block_count_dev
	);

	// Blocking copy issued on the same (default) stream as the kernel, so the
	// value read back is the counter after all threads have finished.
	CHECK_CUDA(cudaMemcpy(&block_count, block_count_dev, sizeof(int), cudaMemcpyDeviceToHost), debug);
	CHECK_CUDA(cudaFree(block_count_dev), debug);

	return dim3(block_count, 1, 1);
}
556+
530557int CudaRasterizer::Rasterizer::renderForward (
531558 std::function<char * (size_t )> geometryBuffer,
532559 std::function<char* (size_t )> binningBuffer,
@@ -632,24 +659,10 @@ int CudaRasterizer::Rasterizer::renderForward(
632659 timer.stop (" 60 identifyTileRanges" );
633660
634661 timer.start (" 61 map2DcomputelocallyTo1D" );
635- int count = 0 ;
636662 int * compute_locally_1D_2D_map;
637- int * dev_count;
638663 CHECK_CUDA (cudaMalloc (&compute_locally_1D_2D_map, tile_num * sizeof (int )), debug);
639- CHECK_CUDA (cudaMalloc (&dev_count, sizeof (int )), debug);
640- CHECK_CUDA (cudaMemcpy (dev_count, &count, sizeof (int ), cudaMemcpyHostToDevice), debug);
641664
642- // Perform the mapping on the device side
643- map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>> (
644- tile_num,
645- compute_locally,
646- compute_locally_1D_2D_map,
647- tile_grid,
648- dev_count
649- );
650-
651- CHECK_CUDA (cudaMemcpy (&count, dev_count, sizeof (int ), cudaMemcpyDeviceToHost), debug);
652- dim3 tile_grid_1d (count, 1 , 1 );
665+ dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid (tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
653666
654667 timer.stop (" 61 map2DcomputelocallyTo1D" );
655668
@@ -791,7 +804,6 @@ int CudaRasterizer::Rasterizer::renderForward(
791804
792805 delete[] log_tmp;
793806 CHECK_CUDA (cudaFree (compute_locally_1D_2D_map), debug);
794- CHECK_CUDA (cudaFree (dev_count), debug);
795807 return num_rendered;
796808}
797809
@@ -828,24 +840,10 @@ void CudaRasterizer::Rasterizer::renderBackward(
828840 const int tile_num = tile_grid.x * tile_grid.y ;
829841
830842 timer.start (" 61 map2DcomputelocallyTo1D" );
831- int count = 0 ;
832843 int * compute_locally_1D_2D_map;
833- int * dev_count;
834844 CHECK_CUDA (cudaMalloc (&compute_locally_1D_2D_map, tile_num * sizeof (int )), debug);
835- CHECK_CUDA (cudaMalloc (&dev_count, sizeof (int )), debug);
836- CHECK_CUDA (cudaMemcpy (dev_count, &count, sizeof (int ), cudaMemcpyHostToDevice), debug);
837-
838- // Perform the mapping on the device side
839- map2DcomputelocallyTo1D<<<cdiv(tile_num, ONE_DIM_BLOCK_SIZE), ONE_DIM_BLOCK_SIZE>>> (
840- tile_num,
841- compute_locally,
842- compute_locally_1D_2D_map,
843- tile_grid,
844- dev_count
845- );
846845
847- CHECK_CUDA (cudaMemcpy (&count, dev_count, sizeof (int ), cudaMemcpyDeviceToHost), debug);
848- dim3 tile_grid_1d (count, 1 , 1 );
846+ dim3 tile_grid_1d = map2DcomputelocallyTo1DGrid (tile_num, compute_locally, compute_locally_1D_2D_map, tile_grid, debug);
849847
850848 timer.stop (" 61 map2DcomputelocallyTo1D" );
851849
@@ -885,5 +883,4 @@ void CudaRasterizer::Rasterizer::renderBackward(
885883
886884 // Free used memory
887885 CHECK_CUDA (cudaFree (compute_locally_1D_2D_map), debug);
888- CHECK_CUDA (cudaFree (dev_count), debug);
889886}
0 commit comments