diff --git a/include/dlaf/eigensolver/bt_reduction_to_band/impl.h b/include/dlaf/eigensolver/bt_reduction_to_band/impl.h index 1e2e977176..1c5cc71154 100644 --- a/include/dlaf/eigensolver/bt_reduction_to_band/impl.h +++ b/include/dlaf/eigensolver/bt_reduction_to_band/impl.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,7 @@ void BackTransformationReductionToBand::call( const SizeType b, MatrixRef& mat_c, Matrix& mat_v, Matrix& mat_taus) { using namespace bt_red_band; + using dlaf::factorization::internal::computeTFactor; auto hp = pika::execution::thread_priority::high; auto np = pika::execution::thread_priority::normal; @@ -162,10 +164,19 @@ void BackTransformationReductionToBand::call( dlaf::matrix::Distribution dist_t({mb, total_nr_reflector}, {mb, mb}); matrix::Panel panelT(dist_t); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = + to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist_v.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panelsWS(n_workspaces, dist_ws); + const SizeType nr_reflector_blocks = dist_t.nrTiles().cols(); for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) { - bool is_last = (k == nr_reflector_blocks - 1); + const bool is_last = (k == nr_reflector_blocks - 1); const SizeType nr_reflectors = dist_t.tileSize({0, k}).cols(); const GlobalElementIndex v_offset(k * mb + b, k * mb); @@ -174,6 +185,7 @@ void BackTransformationReductionToBand::call( const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors); const matrix::SubMatrixView mat_c_view(dist_c, c_offset); + auto& panelWS = panelsWS.nextResource(); auto& panelV = panelsV.nextResource(); auto& panelW = panelsW.nextResource(); auto& panelW2 = panelsW2.nextResource(); @@ -182,6 +194,7 @@ void BackTransformationReductionToBand::call( panelW.setRangeStart(v_offset); if (is_last) { + panelWS.setWidth(nr_reflectors); panelT.setHeight(nr_reflectors); panelW2.setHeight(nr_reflectors); panelW.setWidth(nr_reflectors); @@ -207,8 +220,8 @@ void BackTransformationReductionToBand::call( const LocalTileIndex taus_index{Coord::Row, k}; const LocalTileIndex t_index{Coord::Col, k}; - dlaf::factorization::internal::computeTFactor(panelV, mat_taus.read(taus_index), - panelT.readwrite(t_index)); + + computeTFactor(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS); // W = V T auto tile_t = panelT.read(t_index); @@ -233,6 +246,7 @@ void BackTransformationReductionToBand::call( panelW.reset(); panelW2.reset(); panelT.reset(); + panelWS.reset(); } } @@ -242,6 +256,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr Matrix& mat_taus) { namespace ex = pika::execution::experimental; using namespace bt_red_band; + using dlaf::factorization::internal::computeTFactor; auto hp = pika::execution::thread_priority::high; auto np = pika::execution::thread_priority::normal; @@ -277,6 +292,14 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr dist_v.sourceRankIndex()); matrix::Panel panelT(dist_t); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist_v.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, 
nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panelsWS(n_workspaces, dist_ws); + const SizeType nr_reflector_blocks = dist_t.nrTiles().cols(); for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) { @@ -289,6 +312,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors); const matrix::SubMatrixView mat_c_view(dist_c, c_offset); + auto& panelWS = panelsWS.nextResource(); auto& panelV = panelsV.nextResource(); auto& panelW = panelsW.nextResource(); auto& panelW2 = panelsW2.nextResource(); @@ -298,6 +322,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr panelT.setRange(GlobalTileIndex(Coord::Col, k), GlobalTileIndex(Coord::Col, k + 1)); if (is_last) { + panelWS.setWidth(nr_reflectors); panelT.setHeight(nr_reflectors); panelW2.setHeight(nr_reflectors); panelW.setWidth(nr_reflectors); @@ -329,8 +354,9 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr const GlobalTileIndex taus_index{Coord::Row, k}; const SizeType k_local = dist_t.template localTileFromGlobalTile(k); const LocalTileIndex t_index{Coord::Col, k_local}; - dlaf::factorization::internal::computeTFactor(panelV, mat_taus.read(taus_index), - panelT.readwrite(t_index), mpi_col_task_chain); + + computeTFactor(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS, + mpi_col_task_chain); // WH = V T for (const auto& idx : panel_view.iteratorLocal()) { @@ -349,9 +375,10 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr } if (mpi_col_task_chain.size() > 1) { - for (const auto& kj_panel : panelW2.iteratorLocal()) + for (const auto& kj_panel : panelW2.iteratorLocal()) { ex::start_detached(dlaf::comm::schedule_all_reduce_in_place( mpi_col_task_chain.exclusive(), MPI_SUM, panelW2.readwrite(kj_panel))); + } } broadcast(k_rank_col, panelV, mpi_row_task_chain); @@ -366,6 +393,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr panelW.reset(); panelW2.reset(); panelT.reset(); + panelWS.reset(); } } } diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h index 28593afcad..87b40f837e 100644 --- a/include/dlaf/eigensolver/reduction_to_band/impl.h +++ b/include/dlaf/eigensolver/reduction_to_band/impl.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -1019,6 +1020,14 @@ Matrix ReductionToBand::call(Matrix& mat_a, const common::RoundRobin> panels_w(n_workspaces, dist); common::RoundRobin> panels_x(n_workspaces, dist); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panels_ws(n_workspaces, dist_ws); + // Note: // Here dist_a is given with full panel size instead of dist with just the part actually needeed, // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is @@ -1044,10 +1053,13 @@ Matrix ReductionToBand::call(Matrix& mat_a, const // reflectors (i.e. 
at the end when last reflector size is 1) const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size); + Panel& ws = panels_ws.nextResource(); Panel& v = panels_v.nextResource(); v.setRangeStart(ij_offset); - if (isPanelIncomplete) + if (isPanelIncomplete) { + ws.setWidth(nrefls_tile); v.setWidth(nrefls_tile); + } // PANEL compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view); @@ -1063,8 +1075,8 @@ Matrix ReductionToBand::call(Matrix& mat_a, const // TODO used just by the column, maybe we can re-use a panel tile? // TODO probably the first one in any panel is ok? Matrix t({nrefls_tile, nrefls_tile}, dist.blockSize()); - - computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx)); + computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws); + ws.reset(); // PREPARATION FOR TRAILING MATRIX UPDATE const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size)); @@ -1206,6 +1218,14 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr common::RoundRobin> panels_xt( n_workspaces, dist); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = band_size; + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panels_ws(n_workspaces, dist_ws); + red2band::ComputePanelHelper compute_panel_helper(n_workspaces, dist); ex::unique_any_sender<> trigger_panel{ex::just()}; @@ -1229,10 +1249,12 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr auto& v = panels_v.nextResource(); auto& vt = panels_vt.nextResource(); + auto& ws = panels_ws.nextResource(); v.setRangeStart(at_offset); vt.setRangeStart(at_offset); + ws.setWidth(nrefls_tile); v.setWidth(nrefls_tile); vt.setHeight(nrefls_tile); @@ -1254,8 +1276,10 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr // deadlock due to tile shared between panel and trailing matrix red2band::local::setupReflectorPanelV(rank.row() == rank_v0.row(), panel_view, nrefls_tile, v, mat_a, !is_full_band); - computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), + + computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws, mpi_col_chain); + ws.reset(); } // PREPARATION FOR TRAILING MATRIX UPDATE diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 7337412e0e..86093dc532 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -33,26 +33,84 @@ namespace dlaf::factorization::internal { /// H0 H1 H2 ... HK-1 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, /// it has to be set correctly in the panel (0s as well). - +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// /// A Storage-Efficient WY Representation for Products of Householder Transformations. /// Schreiber, Robert & VanLoan, Charles. (1989) /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
/// -/// @pre taus contains a vector with k elements -/// @pre t contains a (k x k) tile +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see +/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k) +/// +/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part) +/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender t) { - QR_Tfactor::call(hh_panel, std::move(taus), std::move(t)); + matrix::ReadWriteTileSender t, + matrix::Panel& workspaces) { + QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), workspaces); } +/// Forms the triangular factor T of a block of reflectors H, which is defined as a product +/// of k := hh_panel.getWidth() elementary reflectors. +/// +/// hh_panel should have the following form +/// H0 0 0 ... 0 +/// . H1 0 ... 0 +/// . . H2 ... 0 +/// . . . ... 0 +/// . . . ... HK-1 +/// . . . ... . +/// H0 H1 H2 ... HK-1 +/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, +/// it has to be set correctly in the panel (0s as well). +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// +/// A Storage-Efficient WY Representation for Products of Householder Transformations. +/// Schreiber, Robert & VanLoan, Charles. (1989) +/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
+/// +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see +/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part) +/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, matrix::ReadWriteTileSender t, + matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain) { - QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), mpi_col_task_chain); + QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), workspaces, + mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index e4fef98c37..507bfc4d1c 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -27,62 +27,14 @@ struct QR {}; template struct QR_Tfactor { - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus tile of the taus vector, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) - static void call(matrix::Panel& panel_view, + static void call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender t); - - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . 
Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus tile of the taus vector, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// @param mpi_col_task_chain where internal communications are issued - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) + matrix::ReadWriteTileSender t, + matrix::Panel& workspaces); static void call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, matrix::ReadWriteTileSender t, + matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain); }; diff --git a/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h b/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h new file mode 100644 index 0000000000..3634ebc7bb --- /dev/null +++ b/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h @@ -0,0 +1,22 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2024, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. +// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include + +#include "dlaf/tune.h" + +namespace dlaf::factorization::internal { + +inline std::chrono::duration getTFactorBarrierBusyWait() noexcept { + return std::chrono::microseconds(getTuneParameters().tfactor_barrier_busy_wait_us); +} + +} diff --git a/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h b/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h new file mode 100644 index 0000000000..ec6e01182f --- /dev/null +++ b/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h @@ -0,0 +1,38 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2024, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. 
+// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include +#include +#include + +#include + +#include + +namespace dlaf::factorization::internal { + +template +std::size_t get_tfactor_num_workers() noexcept { + if constexpr (B == Backend::MC) { + // Note: precautionarily we leave at least 1 thread "free" to do other stuff (if possible) + const std::size_t available_workers = + pika::resource::get_thread_pool("default").get_os_thread_count(); + const std::size_t min_workers = 1; + const auto max_workers = std::max(min_workers, available_workers - 1); + + const std::size_t nworkers = getTuneParameters().tfactor_num_threads; + return std::clamp(nworkers, min_workers, max_workers); + } + else if constexpr (B == Backend::GPU) { + return std::max(1, getTuneParameters().tfactor_num_streams); + } +} +} diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index af3222f0af..fec3c3f830 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,12 +10,19 @@ #pragma once +#include +#include +#include +#include #include +#include #include +#include #include +#include #include #include #include @@ -24,12 +31,15 @@ #include #include #include +#include +#include #include #include #include #include #include #include +#include #include #ifdef DLAF_WITH_GPU @@ -41,66 +51,140 @@ namespace dlaf::factorization::internal { namespace tfactor_l { + +template +auto split_tfactor_work(const std::size_t nrtiles) { + const std::size_t min_workers = 1; + const std::size_t available_workers = get_tfactor_num_workers(); + const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(1)); + + struct { + std::size_t nworkers; + std::size_t batch_size; + } params; + + params.batch_size = util::ceilDiv(nrtiles, std::clamp(ideal_workers, min_workers, available_workers)); + DLAF_ASSERT_MODERATE(params.batch_size > 0, params.batch_size, nrtiles); + params.nworkers = std::max(1, util::ceilDiv(nrtiles, params.batch_size)); + + return params; +} + template struct Helpers {}; template struct Helpers { - static auto prepareT(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { - namespace ex = pika::execution::experimental; + static auto set0_and_return(matrix::ReadWriteTileSender tile_t) { namespace di = dlaf::internal; - return ex::when_all(std::move(taus), std::move(tile_t)) | + return std::move(tile_t) | di::transform(di::Policy(pika::execution::thread_priority::high), - [](const matrix::Tile& taus, - matrix::Tile&& tile_t) { + [](matrix::Tile&& tile_t) { tile::internal::set0(tile_t); - - const SizeType k = tile_t.size().cols(); - lapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), - tile_t.ld() + 1); - return std::move(tile_t); }); } - static auto stepGEMV(matrix::ReadOnlyTileSender tile_vi, - matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { - auto gemv_func = [](const matrix::Tile& tile_v, + static void loop_gemv(const matrix::Tile& tile_v, const matrix::Tile& taus, - matrix::Tile tile_t) noexcept { - const SizeType k = tile_t.size().cols(); + const matrix::Tile& tile_t) noexcept { + const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); - common::internal::SingleThreadedBlasScope 
single; - for (SizeType j = 0; j < k; ++j) { - // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) - // [j x 1] = [(n-j) x j]* . [(n-j) x 1] - const TileElementIndex t_start{0, j}; - const TileElementIndex va_start{0, 0}; - const TileElementIndex vb_start{0, j}; - const TileElementSize va_size{tile_v.size().rows(), j}; - const T tau = tile_t({j, j}); - - blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, - tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), - 1); - } - return tile_t; - }; + common::internal::SingleThreadedBlasScope single; + for (SizeType j = 0; j < k; ++j) { + // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) + // [j x 1] = [(n-j) x j]* . [(n-j) x 1] + const TileElementIndex t_start{0, j}; + const TileElementIndex va_start{0, 0}; + const TileElementIndex vb_start{0, j}; + const TileElementSize va_size{tile_v.size().rows(), j}; + const T tau = tile_t({j, j}); + blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, + tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); + } + } + + static matrix::ReadWriteTileSender step_gemv( + matrix::Panel& hh_panel, + matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - return ex::when_all(tile_vi, std::move(taus), std::move(tile_t)) | - di::transform(di::Policy(pika::execution::thread_priority::high), - std::move(gemv_func)); + using pika::execution::thread_priority; + + std::vector> hh_tiles = + selectRead(hh_panel, hh_panel.iteratorLocal()); + + const SizeType k = hh_panel.getWidth(); + + if (hh_tiles.size() == 0) + return tile_t; + + const auto workers_params = split_tfactor_work(hh_tiles.size()); + const std::size_t nworkers = workers_params.nworkers; + const std::size_t batch_size = workers_params.batch_size; + + const auto range_workspaces = common::iterate_range2d(LocalTileSize{to_SizeType(nworkers - 1), 1}); + DLAF_ASSERT(to_sizet(std::distance(range_workspaces.begin(), range_workspaces.end())) >= + nworkers - 1, + std::distance(range_workspaces.begin(), range_workspaces.end()), nworkers - 1); + + const auto hp_scheduler = di::getBackendScheduler(thread_priority::high); + return ex::when_all(ex::when_all_vector(std::move(hh_tiles)), std::move(taus), std::move(tile_t), + ex::when_all_vector(select(workspaces, range_workspaces))) | + ex::continues_on(hp_scheduler) | + ex::let_value([hp_scheduler, k, nworkers, batch_size](auto& hh_tiles, auto& taus, + auto& tile_t, auto& workspaces) { + return ex::just(std::make_unique>(nworkers)) | + ex::continues_on(hp_scheduler) | + ex::bulk( + nworkers, + [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id, + auto& barrier_ptr) mutable { + const auto barrier_busy_wait = getTFactorBarrierBusyWait(); + DLAF_ASSERT_HEAVY(k == taus.get().size().rows(), k, taus.get().size().rows()); + + const std::size_t begin = worker_id * batch_size; + const std::size_t end = + std::min(worker_id * batch_size + batch_size, hh_tiles.size()); + + const matrix::Tile ws_worker = + (worker_id == 0 ? 
tile_t : workspaces[worker_id - 1]) + .subTileReference({{0, 0}, tile_t.size()}); + + tile::internal::set0(ws_worker); + lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1, ws_worker.ptr(), + ws_worker.ld() + 1); + + // make it work on worker_id section of tiles + for (std::size_t index = begin; index < end; ++index) { + const matrix::Tile& tile_v = hh_tiles[index].get(); + loop_gemv(tile_v, taus.get(), ws_worker); + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // reduce ws_T in tile_t + if (worker_id == 0) { + for (std::size_t other_worker = 1; other_worker < nworkers; ++other_worker) { + matrix::Tile ws = + workspaces[other_worker - 1].subTileReference({{0, 0}, tile_t.size()}); + tile::internal::add(T(1), ws, tile_t); + } + } + }) | + // Note: ignore the barrier sent by the bulk and just return tile_t + ex::then([&tile_t](auto) mutable { return std::move(tile_t); }); + }); } - static void trmvLoop(const matrix::Tile& tile_t) { + static void loop_trmv(const matrix::Tile& tile_t) { common::internal::SingleThreadedBlasScope single; const SizeType k = tile_t.size().cols(); @@ -116,15 +200,8 @@ struct Helpers { } } - static auto stepTRMV(matrix::ReadWriteTileSender tile_t) noexcept { - namespace di = dlaf::internal; - - return std::move(tile_t) | - di::transform(di::Policy(pika::execution::thread_priority::high), trmvLoop); - } - - static auto stepCopyDiagAndTRMV(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { + static auto step_copy_diag_and_trmv(matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t) noexcept { auto tausdiag_trmvloop = [](const matrix::Tile& taus, matrix::Tile tile_t) { common::internal::SingleThreadedBlasScope single; @@ -132,7 +209,7 @@ struct Helpers { const SizeType k = tile_t.size().cols(); lapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1); - trmvLoop(tile_t); + loop_trmv(tile_t); }; namespace di = dlaf::internal; @@ -147,53 +224,132 @@ struct Helpers { #ifdef DLAF_WITH_GPU template struct Helpers { - static auto prepareT(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { + static auto set0_and_return(matrix::ReadWriteTileSender tile_t) { namespace di = dlaf::internal; - namespace ex = pika::execution::experimental; - - return ex::when_all(std::move(taus), std::move(tile_t)) | + return std::move(tile_t) | di::transform(di::Policy(pika::execution::thread_priority::high), - [](const matrix::Tile& taus, - matrix::Tile& tile_t, whip::stream_t stream) { + [](matrix::Tile& tile_t, whip::stream_t stream) { tile::internal::set0(tile_t, stream); - - const SizeType k = tile_t.size().cols(); - whip::memcpy_2d_async(tile_t.ptr(), to_sizet(tile_t.ld() + 1) * sizeof(T), - taus.ptr(), sizeof(T), sizeof(T), to_sizet(k), - whip::memcpy_host_to_device, stream); - return std::move(tile_t); }); } - - static auto stepGEMV(matrix::ReadOnlyTileSender tile_vi, - matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { - auto gemv_func = [](cublasHandle_t handle, const matrix::Tile& tile_v, + static void loop_gemv(cublasHandle_t handle, const matrix::Tile& tile_v, const matrix::Tile& taus, - matrix::Tile& tile_t) noexcept { - const SizeType m = tile_v.size().rows(); - const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); - DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols()); - - gpulapack::larft_gemv0(handle, m, k, 
tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(), - tile_t.ld()); + const matrix::Tile& tile_t) noexcept { + const SizeType m = tile_v.size().rows(); + const SizeType k = tile_t.size().cols(); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); + DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols()); - return std::move(tile_t); - }; + gpulapack::larft_gemv0(handle, m, k, tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(), + tile_t.ld()); + } + static matrix::ReadWriteTileSender step_gemv( + matrix::Panel& hh_panel, + matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - return ex::when_all(std::move(tile_vi), std::move(taus), std::move(tile_t)) | - di::transform( - di::Policy(pika::execution::thread_priority::high), std::move(gemv_func)); + using pika::execution::thread_priority; + + std::vector> hh_tiles = + selectRead(hh_panel, hh_panel.iteratorLocal()); + + const SizeType k = hh_panel.getWidth(); + + if (hh_tiles.size() == 0) + return tile_t; + + const auto workers_params = split_tfactor_work(hh_tiles.size()); + const std::size_t nworkers = workers_params.nworkers; + const std::size_t batch_size = workers_params.batch_size; + + const auto range_workspaces = common::iterate_range2d(LocalTileSize{to_SizeType(nworkers - 1), 1}); + DLAF_ASSERT(to_sizet(std::distance(range_workspaces.begin(), range_workspaces.end())) >= + nworkers - 1, + std::distance(range_workspaces.begin(), range_workspaces.end()), nworkers - 1); + + for (std::size_t worker_id = 0; worker_id < nworkers; ++worker_id) { + const std::size_t begin = worker_id * batch_size; + const std::size_t end = std::min(hh_tiles.size(), (worker_id + 1) * batch_size); + + DLAF_ASSERT(end >= begin, begin, end); + + if (end == begin) + continue; + + std::vector> input_tiles; + input_tiles.reserve(end - begin); + for (std::size_t sub = begin; sub < end; ++sub) + input_tiles.emplace_back(std::move(hh_tiles[sub])); + + matrix::ReadWriteTileSender workspace = + worker_id == 0 ? 
std::move(tile_t) + : workspaces.readwrite(LocalTileIndex{to_SizeType(worker_id - 1), 0}); + + workspace = + di::whenAllLift(ex::when_all_vector(std::move(input_tiles)), taus, std::move(workspace)) | + di::transform( + di::Policy(thread_priority::high), + [k](cublasHandle_t handle, auto&& hh_tiles, auto&& taus, + matrix::Tile& tile_t_full) { + DLAF_ASSERT_MODERATE(k == taus.size().rows(), k, taus.size().rows()); + + matrix::Tile tile_t = tile_t_full.subTileReference({{0, 0}, {k, k}}); + + // Note: + // prepare the diagonal of taus in t and reset the rest + whip::stream_t stream; + DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream)); + + whip::memset_2d_async(tile_t.ptr(), sizeof(T) * to_sizet(tile_t.ld()), 0, + sizeof(T) * to_sizet(k), to_sizet(k), stream); + gpulapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1, + stream); + + // Note: + // - call one gemv per tile + // - being on the same stream, they are already serialised on GPU + for (std::size_t index = 0; index < hh_tiles.size(); ++index) { + const matrix::Tile& tile_v = hh_tiles[index].get(); + Helpers::loop_gemv(handle, tile_v, taus, tile_t); + } + + return std::move(tile_t_full); + }); + + if (worker_id == 0) + tile_t = std::move(workspace); + else + ex::start_detached(std::move(workspace)); + } + + if (nworkers > 1 and k > 1) { + tile_t = + di::whenAllLift(std::move(tile_t), + ex::when_all_vector(selectRead(workspaces, range_workspaces))) | + di::transform( + di::Policy(thread_priority::high), + [nworkers](auto&& tile_t, auto&& workspaces, whip::stream_t stream) { + for (std::size_t index = 0; index < nworkers - 1; ++index) { + matrix::Tile ws = + workspaces[index].get().subTileReference({{0, 0}, tile_t.size()}); + DLAF_ASSERT(equal_size(ws, tile_t), ws, tile_t); + gpulapack::add(blas::Uplo::Upper, tile_t.size().rows() - 1, tile_t.size().cols() - 1, + T(1), ws.ptr({0, 1}), ws.ld(), tile_t.ptr({0, 1}), tile_t.ld(), stream); + } + return std::move(tile_t); + }); + } + + return tile_t; } - static void trmvLoop(cublasHandle_t handle, const matrix::Tile& tile_t) { + static void loop_trmv(cublasHandle_t handle, const matrix::Tile& tile_t) { const SizeType k = tile_t.size().cols(); // Update each column (in order) t = T . t @@ -209,16 +365,8 @@ struct Helpers { } } - static auto stepTRMV(matrix::ReadWriteTileSender tile_t) noexcept { - namespace di = dlaf::internal; - - return std::move(tile_t) | - di::transform( - di::Policy(pika::execution::thread_priority::high), trmvLoop); - } - - static auto stepCopyDiagAndTRMV(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { + static auto step_copy_diag_and_trmv(matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t) noexcept { // Update each column (in order) t = T . 
t // remember that T is upper triangular, so it is possible to use TRMV auto trmv_func = [](cublasHandle_t handle, const matrix::Tile& taus, @@ -227,10 +375,9 @@ struct Helpers { DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream)); const SizeType k = tile_t.size().cols(); - whip::memcpy_2d_async(tile_t.ptr(), to_sizet(tile_t.ld() + 1) * sizeof(T), taus.ptr(), sizeof(T), - sizeof(T), to_sizet(k), whip::memcpy_host_to_device, stream); + gpulapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1, stream); - trmvLoop(handle, tile_t); + loop_trmv(handle, tile_t); }; namespace ex = pika::execution::experimental; @@ -247,7 +394,8 @@ struct Helpers { template void QR_Tfactor::call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; using Helpers = tfactor_l::Helpers; @@ -256,8 +404,6 @@ void QR_Tfactor::call(matrix::Panel& if (hh_panel.getWidth() == 0) return; - tile_t = Helpers::prepareT(taus, std::move(tile_t)); - // Note: // T factor is an upper triangular square matrix, built column by column // with taus values on the diagonal @@ -268,32 +414,18 @@ void QR_Tfactor::call(matrix::Panel& // // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) // - // // The result is achieved in two main steps: - // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) - // 2) T(0:j, j) = T(0:j, 0:j) . t - - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . V(j:, j) - for (const auto& i_lc : hh_panel.iteratorLocal()) { - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - tile_t = Helpers::stepGEMV(hh_panel.read(i_lc), taus, std::move(tile_t)); - } + // 1) "GEMV" t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) "TRMV" T(0:j, j) = T(0:j, 0:j) . t - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::stepTRMV(std::move(tile_t))); + tile_t = Helpers::step_gemv(hh_panel, taus, std::move(tile_t), workspaces); + ex::start_detached(Helpers::step_copy_diag_and_trmv(taus, std::move(tile_t))); } template void QR_Tfactor::call( matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t, + matrix::ReadWriteTileSender tile_t, matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; @@ -303,8 +435,6 @@ void QR_Tfactor::call( if (hh_panel.getWidth() == 0) return; - tile_t = Helpers::prepareT(taus, std::move(tile_t)); - // Note: // T factor is an upper triangular square matrix, built column by column // with taus values on the diagonal @@ -315,31 +445,22 @@ void QR_Tfactor::call( // // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) // - // // The result is achieved in two main steps: - // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) - // 2) T(0:j, j) = T(0:j, 0:j) . t - - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - for (const auto& i_lc : hh_panel.iteratorLocal()) { - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - tile_t = Helpers::stepGEMV(hh_panel.read(i_lc), taus, std::move(tile_t)); - } + 1) "GEMV" t = -tau(j) . V(j:, 0:j)* . V(j:, j) + 2) "TRMV" T(0:j, j) = T(0:j, 0:j) . t + + // Note: + // reset is needed because not all ranks might have computations to do, but they will participate + // in the MPI reduction anyway. + tile_t = Helpers::set0_and_return(std::move(tile_t)); + tile_t = Helpers::step_gemv(hh_panel, taus, std::move(tile_t), workspaces); - // at this point each rank has its partial result for each column + // Note: at this point each rank has its partial result for each column // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) if (mpi_col_task_chain.size() > 1) tile_t = schedule_all_reduce_in_place(mpi_col_task_chain.exclusive(), MPI_SUM, std::move(tile_t)); - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::stepCopyDiagAndTRMV(taus, std::move(tile_t))); + ex::start_detached(Helpers::step_copy_diag_and_trmv(taus, std::move(tile_t))); } } diff --git a/include/dlaf/lapack/tile.h b/include/dlaf/lapack/tile.h index 7ef832763a..9e50b752b5 100644 --- a/include/dlaf/lapack/tile.h +++ b/include/dlaf/lapack/tile.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #endif diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h index 3508e31c56..1b1195acc5 100644 --- a/include/dlaf/tune.h +++ b/include/dlaf/tune.h @@ -55,6 +55,17 @@ namespace dlaf { /// Enable dump of tridiagonal solver input/output data to "tridiagonal.h5" file that will be /// created in the working folder (it should not exist before the execution). /// Set with environment variable DLAF_DEBUG_DUMP_TRIDIAG_SOLVER_DATA. +/// - tfactor_num_threads: +/// The maximum number of threads to use for computing the tfactor (used, for +/// instance, in red2band and its backtransformation). Set with --dlaf:tfactor-num-threads or env +/// variable DLAF_TFACTOR_NUM_THREADS. +/// - tfactor_num_streams: +/// The maximum number of streams to use for computing the tfactor (used, for +/// instance, in red2band and its backtransformation). Set with --dlaf:tfactor-num-streams or env +/// variable DLAF_TFACTOR_NUM_STREAMS. +/// - tfactor_barrier_busy_wait_us: +/// The duration in microseconds to busy-wait in barriers in the tfactor algorithm. +/// Set with --dlaf:tfactor-barrier-busy-wait-us or env variable DLAF_TFACTOR_BARRIER_BUSY_WAIT_US. /// - red2band_panel_nworkers: /// The maximum number of threads to use for computing the panel in the reduction to band algorithm. /// Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS.
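As a usage illustration of the three tfactor_* tuning knobs documented in the hunk above, here is a minimal hedged sketch that adjusts them programmatically through dlaf::getTuneParameters(); the helper name configure_tfactor_tuning and the chosen values are assumptions for illustration, while the struct fields and header come from this patch.

#include <dlaf/tune.h>

// Hypothetical helper: call after dlaf::initialize() and before running any solver.
void configure_tfactor_tuning() {
  auto& params = dlaf::getTuneParameters();
  params.tfactor_num_threads = 4;              // MC backend: at most 4 worker threads per T factor
  params.tfactor_num_streams = 2;              // GPU backend: at most 2 concurrent streams
  params.tfactor_barrier_busy_wait_us = 1000;  // busy-wait 1000 us in the tfactor worker barrier
}

The same values can also be set with the command-line options --dlaf:tfactor-num-threads, --dlaf:tfactor-num-streams and --dlaf:tfactor-barrier-busy-wait-us, or with the corresponding DLAF_* environment variables registered in src/init.cpp further down in this patch.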
@@ -108,6 +119,7 @@ struct TuneParameters { const auto default_pool_thread_count = pika::resource::get_thread_pool("default").get_os_thread_count(); + tfactor_num_threads = std::max(1, default_pool_thread_count / 2); red2band_panel_nworkers = std::max(1, default_pool_thread_count / 2); tridiag_rank1_nworkers = default_pool_thread_count; } @@ -118,6 +130,10 @@ struct TuneParameters { bool debug_dump_reduction_to_band_data = false; bool debug_dump_band_to_tridiagonal_data = false; bool debug_dump_tridiag_solver_data = false; + + std::size_t tfactor_num_threads = 1; + std::size_t tfactor_num_streams = 4; + std::size_t tfactor_barrier_busy_wait_us = 0; std::size_t red2band_panel_nworkers = 1; std::size_t red2band_barrier_busy_wait_us = 1000; std::size_t tridiag_rank1_nworkers = 1; diff --git a/src/init.cpp b/src/init.cpp index b21890fb2c..7dd3fa9fb5 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -281,6 +281,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu // These are added automatically by updateConfigurationValue. auto& param = getTuneParameters(); // clang-format off + updateConfigurationValue(vm, param.tfactor_num_threads, "TFACTOR_NUM_THREADS", "tfactor-num-threads"); + updateConfigurationValue(vm, param.tfactor_num_streams, "TFACTOR_NUM_STREAMS", "tfactor-num-streams"); + updateConfigurationValue(vm, param.tfactor_barrier_busy_wait_us, "TFACTOR_BARRIER_BUSY_WAIT_US", "tfactor-barrier-busy-wait-us"); updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers"); updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us"); updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band"); @@ -335,14 +338,17 @@ pika::program_options::options_description getOptionsDescription() { desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool."); // Tune parameters command line options - desc.add_options()( "dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); - desc.add_options()( "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); - desc.add_options()( "dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); - desc.add_options()( "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
(The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()( "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); - desc.add_options()( "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); - desc.add_options()( "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); - desc.add_options()( "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + desc.add_options()("dlaf:tfactor-num-threads", pika::program_options::value(), "The maximum number of threads to use for computing the tfactor."); + desc.add_options()("dlaf:tfactor-num-streams", pika::program_options::value(), "The maximum number of GPU streams to use for computing the tfactor."); + desc.add_options()("dlaf:tfactor-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the tfactor algorithm."); + desc.add_options()("dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()("dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); + desc.add_options()("dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); + desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
(The input matrix is distributed with a {nb x nb} block size.)"); + desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); + desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); + desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); + desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); // clang-format on return desc; diff --git a/src/tune.cpp b/src/tune.cpp index 8bd64d23ed..03da20c830 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -19,6 +19,9 @@ TuneParameters& getTuneParameters() { } std::ostream& operator<<(std::ostream& os, const TuneParameters& params) { + os << " tfactor_num_threads = " << params.tfactor_num_threads << std::endl; + os << " tfactor_num_streams = " << params.tfactor_num_streams << std::endl; + os << " tfactor_barrier_busy_wait_us = " << params.tfactor_barrier_busy_wait_us << std::endl; os << " red2band_panel_nworkers = " << params.red2band_panel_nworkers << std::endl; os << " red2band_barrier_busy_wait_us = " << params.red2band_barrier_busy_wait_us << std::endl; os << " tridiag_rank1_nworkers = " << params.tridiag_rank1_nworkers << std::endl; diff --git a/test/unit/factorization/test_compute_t_factor.cpp b/test/unit/factorization/test_compute_t_factor.cpp index 7cafc2fedd..542467e640 100644 --- a/test/unit/factorization/test_compute_t_factor.cpp +++ b/test/unit/factorization/test_compute_t_factor.cpp @@ -8,6 +8,7 @@ // SPDX-License-Identifier: BSD-3-Clause // +#include #include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #include // workaround for importing lapack.hh #include #include @@ -26,6 +28,7 @@ #include #include #include +#include #include #include @@ -260,6 +263,8 @@ std::vector void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, const SizeType nb, const GlobalElementIndex v_start) { + using dlaf::factorization::internal::computeTFactor; + ASSERT_LE(v_start.row() + k, m); ASSERT_LE(v_start.col() + k, nb); @@ -301,8 +306,16 @@ void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, c panel_v.setTile(i, splitTile(v.get().read(i), panel_view(i))); } - using dlaf::factorization::internal::computeTFactor; - computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx)); + auto workspaces = [k]() -> matrix::Panel { + const SizeType nworkspaces = to_SizeType( + std::max(0, factorization::internal::get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = k; + return matrix::Panel({{nworkspaces * nrefls_step, nrefls_step}, + {nrefls_step, nrefls_step}}); + }(); + + computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx), + workspaces); } // Note: @@ -329,6 +342,8 @@ void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, c template void testComputeTFactor(comm::CommunicatorGrid& grid, const SizeType m, const SizeType k, const SizeType mb, const SizeType nb, const GlobalElementIndex v_start) { + using 
dlaf::factorization::internal::computeTFactor; + ASSERT_LE(v_start.row() + k, m); ASSERT_LE(v_start.col() + k, nb); @@ -382,9 +397,16 @@ void testComputeTFactor(comm::CommunicatorGrid& grid, const SizeType m, const Si panel_v.setTile(i, splitTile(v.get().read(i), panel_view(i))); } - using dlaf::factorization::internal::computeTFactor; + auto workspaces = [k]() -> matrix::Panel { + const SizeType nworkspaces = to_SizeType( + std::max(0, factorization::internal::get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = k; + return matrix::Panel({{nworkspaces * nrefls_step, nrefls_step}, + {nrefls_step, nrefls_step}}); + }(); + computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx), - serial_comm); + workspaces, serial_comm); } // Note:
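For reference, a short sketch of the new calling convention exercised by the tests above. Note that angle-bracketed template arguments and include targets were dropped from this rendering of the patch, so the explicit <B>, <Coord::Col, T, D> arguments below are reconstructions rather than verbatim quotes, and panel_v, mat_taus, mat_t and t_idx stand for objects the caller already owns.

// Sketch: B = Backend, D = Device, T = element type of the surrounding algorithm.
using dlaf::factorization::internal::computeTFactor;
using dlaf::factorization::internal::get_tfactor_num_workers;

const SizeType k = panel_v.getWidth();
// get_tfactor_num_workers() returns at least 1: worker 0 accumulates directly into the T tile,
// every additional worker needs one (k x k) workspace tile that is reduced into T afterwards.
const SizeType nworkspaces = to_SizeType(get_tfactor_num_workers<B>() - 1);
matrix::Panel<Coord::Col, T, D> workspaces({{nworkspaces * k, k}, {k, k}});

// Local variant; the distributed overload additionally takes the column communicator pipeline.
computeTFactor<B>(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), mat_t.readwrite(t_idx), workspaces);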
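The batching logic of split_tfactor_work in t_factor_impl.h is easiest to see with concrete numbers. The self-contained snippet below mirrors its arithmetic only; the names ceil_div and TfactorWorkSplit are invented for the illustration, while the real code uses util::ceilDiv and the anonymous struct returned by split_tfactor_work.

#include <algorithm>
#include <cstddef>

constexpr std::size_t ceil_div(std::size_t num, std::size_t den) {
  return (num + den - 1) / den;
}

struct TfactorWorkSplit {
  std::size_t nworkers;
  std::size_t batch_size;
};

// Mirrors split_tfactor_work: ideally one worker per panel tile, clamped to the available workers;
// each worker then processes a contiguous batch of tiles, and the actual worker count is
// recomputed from the batch size.
constexpr TfactorWorkSplit split(std::size_t nrtiles, std::size_t available_workers) {
  const std::size_t min_workers = 1;
  const std::size_t ideal_workers = nrtiles;
  const std::size_t batch_size =
      ceil_div(nrtiles, std::clamp(ideal_workers, min_workers, available_workers));
  return {std::max<std::size_t>(1, ceil_div(nrtiles, batch_size)), batch_size};
}

// E.g. split(10, 4) == {4, 3}: workers 0-2 take 3 tiles each and worker 3 takes the last tile;
// split(2, 8) == {2, 1}: no more workers are spawned than there are tiles.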