Skip to content

Commit

Permalink
TFactor: bulkerify GEMV (both MC and GPU) (#1214)
Browse files Browse the repository at this point in the history
Co-authored-by: Mikael Simberg <[email protected]>
  • Loading branch information
albestro and msimberg authored Feb 12, 2025
1 parent b14ffba commit c5c5280
Show file tree
Hide file tree
Showing 12 changed files with 509 additions and 218 deletions.
40 changes: 34 additions & 6 deletions include/dlaf/eigensolver/bt_reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <dlaf/communication/kernels.h>
#include <dlaf/eigensolver/bt_reduction_to_band/api.h>
#include <dlaf/factorization/qr.h>
#include <dlaf/factorization/qr/internal/get_tfactor_num_workers.h>
#include <dlaf/matrix/copy.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/distribution.h>
Expand Down Expand Up @@ -134,6 +135,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
const SizeType b, MatrixRef<T, device>& mat_c, Matrix<const T, device>& mat_v,
Matrix<const T, Device::CPU>& mat_taus) {
using namespace bt_red_band;
using dlaf::factorization::internal::computeTFactor;

auto hp = pika::execution::thread_priority::high;
auto np = pika::execution::thread_priority::normal;
Expand Down Expand Up @@ -162,10 +164,19 @@ void BackTransformationReductionToBand<backend, device, T>::call(
dlaf::matrix::Distribution dist_t({mb, total_nr_reflector}, {mb, mb});
matrix::Panel<Coord::Row, T, device> panelT(dist_t);

const auto dist_ws = [&]() {
using dlaf::factorization::internal::get_tfactor_num_workers;
const SizeType nworkspaces =
to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<backend>() - 1));
const SizeType nrefls_step = dist_v.tile_size().cols();
return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
}();
common::RoundRobin<matrix::Panel<Coord::Col, T, device>> panelsWS(n_workspaces, dist_ws);

const SizeType nr_reflector_blocks = dist_t.nrTiles().cols();

for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) {
bool is_last = (k == nr_reflector_blocks - 1);
const bool is_last = (k == nr_reflector_blocks - 1);
const SizeType nr_reflectors = dist_t.tileSize({0, k}).cols();

const GlobalElementIndex v_offset(k * mb + b, k * mb);
Expand All @@ -174,6 +185,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors);
const matrix::SubMatrixView mat_c_view(dist_c, c_offset);

auto& panelWS = panelsWS.nextResource();
auto& panelV = panelsV.nextResource();
auto& panelW = panelsW.nextResource();
auto& panelW2 = panelsW2.nextResource();
Expand All @@ -182,6 +194,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
panelW.setRangeStart(v_offset);

if (is_last) {
panelWS.setWidth(nr_reflectors);
panelT.setHeight(nr_reflectors);
panelW2.setHeight(nr_reflectors);
panelW.setWidth(nr_reflectors);
Expand All @@ -207,8 +220,8 @@ void BackTransformationReductionToBand<backend, device, T>::call(

const LocalTileIndex taus_index{Coord::Row, k};
const LocalTileIndex t_index{Coord::Col, k};
dlaf::factorization::internal::computeTFactor<backend>(panelV, mat_taus.read(taus_index),
panelT.readwrite(t_index));

computeTFactor<backend>(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS);

// W = V T
auto tile_t = panelT.read(t_index);
Expand All @@ -233,6 +246,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
panelW.reset();
panelW2.reset();
panelT.reset();
panelWS.reset();
}
}

Expand All @@ -242,6 +256,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
Matrix<const T, Device::CPU>& mat_taus) {
namespace ex = pika::execution::experimental;
using namespace bt_red_band;
using dlaf::factorization::internal::computeTFactor;

auto hp = pika::execution::thread_priority::high;
auto np = pika::execution::thread_priority::normal;
Expand Down Expand Up @@ -277,6 +292,14 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
dist_v.sourceRankIndex());
matrix::Panel<Coord::Row, T, D> panelT(dist_t);

const auto dist_ws = [&]() {
using dlaf::factorization::internal::get_tfactor_num_workers;
const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
const SizeType nrefls_step = dist_v.tile_size().cols();
return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
}();
common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsWS(n_workspaces, dist_ws);

const SizeType nr_reflector_blocks = dist_t.nrTiles().cols();

for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) {
Expand All @@ -289,6 +312,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors);
const matrix::SubMatrixView mat_c_view(dist_c, c_offset);

auto& panelWS = panelsWS.nextResource();
auto& panelV = panelsV.nextResource();
auto& panelW = panelsW.nextResource();
auto& panelW2 = panelsW2.nextResource();
Expand All @@ -298,6 +322,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
panelT.setRange(GlobalTileIndex(Coord::Col, k), GlobalTileIndex(Coord::Col, k + 1));

if (is_last) {
panelWS.setWidth(nr_reflectors);
panelT.setHeight(nr_reflectors);
panelW2.setHeight(nr_reflectors);
panelW.setWidth(nr_reflectors);
Expand Down Expand Up @@ -329,8 +354,9 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
const GlobalTileIndex taus_index{Coord::Row, k};
const SizeType k_local = dist_t.template localTileFromGlobalTile<Coord::Col>(k);
const LocalTileIndex t_index{Coord::Col, k_local};
dlaf::factorization::internal::computeTFactor<B>(panelV, mat_taus.read(taus_index),
panelT.readwrite(t_index), mpi_col_task_chain);

computeTFactor<B>(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS,
mpi_col_task_chain);

// WH = V T
for (const auto& idx : panel_view.iteratorLocal()) {
Expand All @@ -349,9 +375,10 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
}

if (mpi_col_task_chain.size() > 1) {
for (const auto& kj_panel : panelW2.iteratorLocal())
for (const auto& kj_panel : panelW2.iteratorLocal()) {
ex::start_detached(dlaf::comm::schedule_all_reduce_in_place(
mpi_col_task_chain.exclusive(), MPI_SUM, panelW2.readwrite(kj_panel)));
}
}

broadcast(k_rank_col, panelV, mpi_row_task_chain);
Expand All @@ -366,6 +393,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
panelW.reset();
panelW2.reset();
panelT.reset();
panelWS.reset();
}
}
}
32 changes: 28 additions & 4 deletions include/dlaf/eigensolver/reduction_to_band/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
#include <dlaf/eigensolver/reduction_to_band/api.h>
#include <dlaf/factorization/qr.h>
#include <dlaf/factorization/qr/internal/get_tfactor_num_workers.h>
#include <dlaf/lapack/tile.h>
#include <dlaf/matrix/copy_tile.h>
#include <dlaf/matrix/distribution.h>
Expand Down Expand Up @@ -1019,6 +1020,14 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);

const auto dist_ws = [&]() {
using dlaf::factorization::internal::get_tfactor_num_workers;
const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
const SizeType nrefls_step = dist.tile_size().cols();
return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
}();
common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_ws(n_workspaces, dist_ws);

// Note:
// Here dist_a is given with full panel size instead of dist with just the part actually needeed,
// because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
Expand All @@ -1044,10 +1053,13 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
// reflectors (i.e. at the end when last reflector size is 1)
const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);

Panel<Coord::Col, T, D>& ws = panels_ws.nextResource();
Panel<Coord::Col, T, D>& v = panels_v.nextResource();
v.setRangeStart(ij_offset);
if (isPanelIncomplete)
if (isPanelIncomplete) {
ws.setWidth(nrefls_tile);
v.setWidth(nrefls_tile);
}

// PANEL
compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
Expand All @@ -1063,8 +1075,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
// TODO used just by the column, maybe we can re-use a panel tile?
// TODO probably the first one in any panel is ok?
Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());

computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws);
ws.reset();

// PREPARATION FOR TRAILING MATRIX UPDATE
const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
Expand Down Expand Up @@ -1206,6 +1218,14 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
n_workspaces, dist);

const auto dist_ws = [&]() {
using dlaf::factorization::internal::get_tfactor_num_workers;
const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
const SizeType nrefls_step = band_size;
return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
}();
common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_ws(n_workspaces, dist_ws);

red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);

ex::unique_any_sender<> trigger_panel{ex::just()};
Expand All @@ -1229,10 +1249,12 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr

auto& v = panels_v.nextResource();
auto& vt = panels_vt.nextResource();
auto& ws = panels_ws.nextResource();

v.setRangeStart(at_offset);
vt.setRangeStart(at_offset);

ws.setWidth(nrefls_tile);
v.setWidth(nrefls_tile);
vt.setHeight(nrefls_tile);

Expand All @@ -1254,8 +1276,10 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
// deadlock due to tile shared between panel and trailing matrix
red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
nrefls_tile, v, mat_a, !is_full_band);
computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),

computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws,
mpi_col_chain);
ws.reset();
}

// PREPARATION FOR TRAILING MATRIX UPDATE
Expand Down
70 changes: 64 additions & 6 deletions include/dlaf/factorization/qr.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,26 +33,84 @@ namespace dlaf::factorization::internal {
/// H0 H1 H2 ... HK-1
/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
/// it has to be set correctly in the panel (0s as well).

///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation.
///
/// A Storage-Efficient WY Representation for Products of Householder Transformations.
/// Schreiber, Robert & VanLoan, Charles. (1989)
/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
///
/// @pre taus contains a vector with k elements
/// @pre t contains a (k x k) tile
/// @param hh_panel where the elementary reflectors are stored
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k)
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols()
template <Backend backend, Device device, class T>
void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
matrix::ReadOnlyTileSender<T, Device::CPU> taus,
matrix::ReadWriteTileSender<T, device> t) {
QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t));
matrix::ReadWriteTileSender<T, device> t,
matrix::Panel<Coord::Col, T, device>& workspaces) {
QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), workspaces);
}

/// Forms the triangular factor T of a block of reflectors H, which is defined as a product
/// of k := hh_panel.getWidth() elementary reflectors.
///
/// hh_panel should have the following form
/// H0 0 0 ... 0
/// . H1 0 ... 0
/// . . H2 ... 0
/// . . . ... 0
/// . . . ... HK-1
/// . . . ... .
/// H0 H1 H2 ... HK-1
/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
/// it has to be set correctly in the panel (0s as well).
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation.
///
/// A Storage-Efficient WY Representation for Products of Householder Transformations.
/// Schreiber, Robert & VanLoan, Charles. (1989)
/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
///
/// @param hh_panel where the elementary reflectors are stored
/// @param taus array of taus, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k)
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols()
template <Backend backend, Device device, class T>
void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
matrix::ReadOnlyTileSender<T, Device::CPU> taus,
matrix::ReadWriteTileSender<T, device> t,
matrix::Panel<Coord::Col, T, device>& workspaces,
comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_task_chain) {
QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), mpi_col_task_chain);
QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), workspaces,
mpi_col_task_chain);
}

}
56 changes: 4 additions & 52 deletions include/dlaf/factorization/qr/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,62 +27,14 @@ struct QR {};

template <Backend backend, Device device, class T>
struct QR_Tfactor {
/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
/// elementary reflectors.
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
/// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
/// block of reflector, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation
///
/// @param k the number of elementary reflectors to use (from the beginning of the tile)
/// @param v where the elementary reflectors are stored
/// @param v_start tile in @p v where the column of reflectors starts
/// @param taus tile of the taus vector, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
///
/// @pre k <= t.get().size().rows && k <= t.get().size().cols()
/// @pre k >= 0
/// @pre v_start.isIn(v.nrTiles())
static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
matrix::ReadOnlyTileSender<T, Device::CPU> taus,
matrix::ReadWriteTileSender<T, device> t);

/// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
/// elementary reflectors.
///
/// It is similar to what xLARFT in LAPACK does.
/// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
/// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
/// block of reflector, such that
///
/// H = I - V . T . V*
///
/// where H = H1 . H2 . ... . Hk
///
/// in which Hi represents a single elementary reflector transformation
///
/// @param k the number of elementary reflectors to use (from the beginning of the tile)
/// @param v where the elementary reflectors are stored
/// @param v_start tile in @p v where the column of reflectors starts
/// @param taus tile of the taus vector, associated with the related elementary reflector
/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
/// TileElementSize(k, k)
/// @param mpi_col_task_chain where internal communications are issued
///
/// @pre k <= t.get().size().rows && k <= t.get().size().cols()
/// @pre k >= 0
/// @pre v_start.isIn(v.nrTiles())
matrix::ReadWriteTileSender<T, device> t,
matrix::Panel<Coord::Col, T, device>& workspaces);
static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
matrix::ReadOnlyTileSender<T, Device::CPU> taus,
matrix::ReadWriteTileSender<T, device> t,
matrix::Panel<Coord::Col, T, device>& workspaces,
comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_task_chain);
};

Expand Down
Loading

0 comments on commit c5c5280

Please sign in to comment.