Skip to content

Commit

Permalink
Merge pull request #13792 from brian-kelley/FixLargeBlocks2
Browse files Browse the repository at this point in the history
Ifpack2: BlockTriDi fix for large blocks
  • Loading branch information
brian-kelley authored Feb 7, 2025
2 parents b8f0a3a + 0e724f4 commit 95b49d7
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 19 deletions.
34 changes: 33 additions & 1 deletion packages/ifpack2/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,38 @@ ASSERT_DEFINED (
${PACKAGE_NAME}_ENABLE_Galeri
)

IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)
# Correctness test with maximum block size (32)
# Use a small grid so that GPU memory requirement isn't too large
# Block TriDi
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlock
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
# Block TriDi with Schur line splitting
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlockSchur
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=1 --sublinesPerLineSchur=2"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
# Block Jacobi
TRIBITS_ADD_TEST(
BlockTriDiagonalSolver
NAME BlockTriDiLargeBlockJacobi
ARGS "--matrixType=Laplace3D --blockSize=32 --nx=20 --ny=20 --nz=20 --sublinesPerLine=-1"
COMM serial mpi
NUM_MPI_PROCS 1-4
STANDARD_PASS_OUTPUT
)
ENDIF()

IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)

set(blockSize 11)
Expand Down Expand Up @@ -87,4 +119,4 @@ IF(${PACKAGE_NAME}_ENABLE_Xpetra AND ${PACKAGE_NAME}_ENABLE_Galeri)
ENDWHILE()
ENDIF()
endforeach()
ENDIF()
ENDIF()
51 changes: 33 additions & 18 deletions packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2750,7 +2750,7 @@ namespace Ifpack2 {
*/
Kokkos::Experimental::local_deep_copy(member, view1, view2);
}
template<typename MatrixType>
template<typename MatrixType, int ScratchLevel>
struct ExtractAndFactorizeTridiags {
public:
using impl_type = BlockHelperDetails::ImplType<MatrixType>;
Expand Down Expand Up @@ -2785,6 +2785,8 @@ namespace Ifpack2 {
using internal_vector_type = typename impl_type::internal_vector_type;
static constexpr int vector_length = impl_type::vector_length;
static constexpr int internal_vector_length = impl_type::internal_vector_length;
static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");

/// team policy member type
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
Expand Down Expand Up @@ -2812,7 +2814,6 @@ namespace Ifpack2 {
// diagonal safety
const magnitude_type tiny;
const local_ordinal_type vector_loop_size;
const local_ordinal_type vector_length_value;

bool hasBlockCrsMatrix;

Expand Down Expand Up @@ -2873,8 +2874,7 @@ namespace Ifpack2 {
blocksize_square(blocksize*blocksize),
// diagonal weight to avoid zero pivots
tiny(tiny_),
vector_loop_size(vector_length/internal_vector_length),
vector_length_value(vector_length) {
vector_loop_size(vector_length/internal_vector_length) {
using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

Expand Down Expand Up @@ -3191,7 +3191,7 @@ namespace Ifpack2 {
const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
Expand Down Expand Up @@ -3294,7 +3294,7 @@ namespace Ifpack2 {
(void) npacks;

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
if (local_subpartidx == 0) {
Kokkos::parallel_for
(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
Expand Down Expand Up @@ -3334,9 +3334,6 @@ namespace Ifpack2 {
//const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
//const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);

// Compute S = D - C E

const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
Expand Down Expand Up @@ -3440,7 +3437,7 @@ namespace Ifpack2 {
const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);

internal_vector_scratch_type_3d_view
WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
Expand Down Expand Up @@ -3477,7 +3474,7 @@ namespace Ifpack2 {
const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
writeBTDValuesToFile(n_parts, scalar_values, "before.mm");

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3504,7 +3501,7 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3523,7 +3520,7 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ComputeETag>
policy(packindices_sub.extent(0), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
policy, *this);
execution_space().fence();
Expand All @@ -3544,7 +3541,6 @@ namespace Ifpack2 {
Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
policy, *this);
writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
Expand All @@ -3561,7 +3557,7 @@ namespace Ifpack2 {
IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
policy(packindices_schur.extent(0), team_size, vector_loop_size);
policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
policy, *this);
execution_space().fence();
Expand All @@ -3587,9 +3583,29 @@ namespace Ifpack2 {
const BlockHelperDetails::PartInterface<MatrixType> &interf,
BlockTridiags<MatrixType> &btdm,
const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
using impl_type = BlockHelperDetails::ImplType<MatrixType>;
using execution_space = typename impl_type::execution_space;
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, G, tiny);
function.run();

int blocksize = btdm.values.extent(1);
// Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
// For large block sizes, have to fall back to level 1 scratch.
int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
int max_scratch = team_policy_type::scratch_size_max(0);

if(scratch_required < max_scratch) {
// Can use level 0 scratch
ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
function.run();
}
else {
// Not enough level 0 scratch, so fall back to level 1
ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
function.run();
}
IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
}

Expand Down Expand Up @@ -3654,7 +3670,6 @@ namespace Ifpack2 {
packed_multivector(pmv) {}

// TODO:: modify this routine similar to the team level functions
// inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
KOKKOS_INLINE_FUNCTION
void
operator() (const local_ordinal_type &packidx) const {
Expand Down

0 comments on commit 95b49d7

Please sign in to comment.