TFactor: bulkerify GEMV (both MC and GPU) (#1214)

Co-authored-by: Mikael Simberg <[email protected]>
eth-cscs · Feb 12, 2025 · c5c5280 · c5c5280
1 parent b14ffba
commit c5c5280
Show file tree

Hide file tree

Showing 12 changed files with 509 additions and 218 deletions.
diff --git a/include/dlaf/eigensolver/bt_reduction_to_band/impl.h b/include/dlaf/eigensolver/bt_reduction_to_band/impl.h
@@ -30,6 +30,7 @@
 #include <dlaf/communication/kernels.h>
 #include <dlaf/eigensolver/bt_reduction_to_band/api.h>
 #include <dlaf/factorization/qr.h>
+#include <dlaf/factorization/qr/internal/get_tfactor_num_workers.h>
 #include <dlaf/matrix/copy.h>
 #include <dlaf/matrix/copy_tile.h>
 #include <dlaf/matrix/distribution.h>
@@ -134,6 +135,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
     const SizeType b, MatrixRef<T, device>& mat_c, Matrix<const T, device>& mat_v,
     Matrix<const T, Device::CPU>& mat_taus) {
   using namespace bt_red_band;
+  using dlaf::factorization::internal::computeTFactor;
 
   auto hp = pika::execution::thread_priority::high;
   auto np = pika::execution::thread_priority::normal;
@@ -162,10 +164,19 @@ void BackTransformationReductionToBand<backend, device, T>::call(
   dlaf::matrix::Distribution dist_t({mb, total_nr_reflector}, {mb, mb});
   matrix::Panel<Coord::Row, T, device> panelT(dist_t);
 
+  const auto dist_ws = [&]() {
+    using dlaf::factorization::internal::get_tfactor_num_workers;
+    const SizeType nworkspaces =
+        to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<backend>() - 1));
+    const SizeType nrefls_step = dist_v.tile_size().cols();
+    return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
+  }();
+  common::RoundRobin<matrix::Panel<Coord::Col, T, device>> panelsWS(n_workspaces, dist_ws);
+
   const SizeType nr_reflector_blocks = dist_t.nrTiles().cols();
 
   for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) {
-    bool is_last = (k == nr_reflector_blocks - 1);
+    const bool is_last = (k == nr_reflector_blocks - 1);
     const SizeType nr_reflectors = dist_t.tileSize({0, k}).cols();
 
     const GlobalElementIndex v_offset(k * mb + b, k * mb);
@@ -174,6 +185,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
     const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors);
     const matrix::SubMatrixView mat_c_view(dist_c, c_offset);
 
+    auto& panelWS = panelsWS.nextResource();
     auto& panelV = panelsV.nextResource();
     auto& panelW = panelsW.nextResource();
     auto& panelW2 = panelsW2.nextResource();
@@ -182,6 +194,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
     panelW.setRangeStart(v_offset);
 
     if (is_last) {
+      panelWS.setWidth(nr_reflectors);
       panelT.setHeight(nr_reflectors);
       panelW2.setHeight(nr_reflectors);
       panelW.setWidth(nr_reflectors);
@@ -207,8 +220,8 @@ void BackTransformationReductionToBand<backend, device, T>::call(
 
     const LocalTileIndex taus_index{Coord::Row, k};
     const LocalTileIndex t_index{Coord::Col, k};
-    dlaf::factorization::internal::computeTFactor<backend>(panelV, mat_taus.read(taus_index),
-                                                           panelT.readwrite(t_index));
+
+    computeTFactor<backend>(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS);
 
     // W = V T
     auto tile_t = panelT.read(t_index);
@@ -233,6 +246,7 @@ void BackTransformationReductionToBand<backend, device, T>::call(
     panelW.reset();
     panelW2.reset();
     panelT.reset();
+    panelWS.reset();
   }
 }
 
@@ -242,6 +256,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
                                                       Matrix<const T, Device::CPU>& mat_taus) {
   namespace ex = pika::execution::experimental;
   using namespace bt_red_band;
+  using dlaf::factorization::internal::computeTFactor;
 
   auto hp = pika::execution::thread_priority::high;
   auto np = pika::execution::thread_priority::normal;
@@ -277,6 +292,14 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
                                     dist_v.sourceRankIndex());
   matrix::Panel<Coord::Row, T, D> panelT(dist_t);
 
+  const auto dist_ws = [&]() {
+    using dlaf::factorization::internal::get_tfactor_num_workers;
+    const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
+    const SizeType nrefls_step = dist_v.tile_size().cols();
+    return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
+  }();
+  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panelsWS(n_workspaces, dist_ws);
+
   const SizeType nr_reflector_blocks = dist_t.nrTiles().cols();
 
   for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) {
@@ -289,6 +312,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
     const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors);
     const matrix::SubMatrixView mat_c_view(dist_c, c_offset);
 
+    auto& panelWS = panelsWS.nextResource();
     auto& panelV = panelsV.nextResource();
     auto& panelW = panelsW.nextResource();
     auto& panelW2 = panelsW2.nextResource();
@@ -298,6 +322,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
     panelT.setRange(GlobalTileIndex(Coord::Col, k), GlobalTileIndex(Coord::Col, k + 1));
 
     if (is_last) {
+      panelWS.setWidth(nr_reflectors);
       panelT.setHeight(nr_reflectors);
       panelW2.setHeight(nr_reflectors);
       panelW.setWidth(nr_reflectors);
@@ -329,8 +354,9 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
       const GlobalTileIndex taus_index{Coord::Row, k};
       const SizeType k_local = dist_t.template localTileFromGlobalTile<Coord::Col>(k);
       const LocalTileIndex t_index{Coord::Col, k_local};
-      dlaf::factorization::internal::computeTFactor<B>(panelV, mat_taus.read(taus_index),
-                                                       panelT.readwrite(t_index), mpi_col_task_chain);
+
+      computeTFactor<B>(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS,
+                        mpi_col_task_chain);
 
       // WH = V T
       for (const auto& idx : panel_view.iteratorLocal()) {
@@ -349,9 +375,10 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
     }
 
     if (mpi_col_task_chain.size() > 1) {
-      for (const auto& kj_panel : panelW2.iteratorLocal())
+      for (const auto& kj_panel : panelW2.iteratorLocal()) {
         ex::start_detached(dlaf::comm::schedule_all_reduce_in_place(
             mpi_col_task_chain.exclusive(), MPI_SUM, panelW2.readwrite(kj_panel)));
+      }
     }
 
     broadcast(k_rank_col, panelV, mpi_row_task_chain);
@@ -366,6 +393,7 @@ void BackTransformationReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
     panelW.reset();
     panelW2.reset();
     panelT.reset();
+    panelWS.reset();
   }
 }
 }
diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h
@@ -42,6 +42,7 @@
 #include <dlaf/eigensolver/internal/get_red2band_panel_nworkers.h>
 #include <dlaf/eigensolver/reduction_to_band/api.h>
 #include <dlaf/factorization/qr.h>
+#include <dlaf/factorization/qr/internal/get_tfactor_num_workers.h>
 #include <dlaf/lapack/tile.h>
 #include <dlaf/matrix/copy_tile.h>
 #include <dlaf/matrix/distribution.h>
@@ -1019,6 +1020,14 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
   common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
   common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
 
+  const auto dist_ws = [&]() {
+    using dlaf::factorization::internal::get_tfactor_num_workers;
+    const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
+    const SizeType nrefls_step = dist.tile_size().cols();
+    return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
+  }();
+  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_ws(n_workspaces, dist_ws);
+
   // Note:
   // Here dist_a is given with full panel size instead of dist with just the part actually needeed,
   // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
@@ -1044,10 +1053,13 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
     //        reflectors (i.e. at the end when last reflector size is 1)
     const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);
 
+    Panel<Coord::Col, T, D>& ws = panels_ws.nextResource();
     Panel<Coord::Col, T, D>& v = panels_v.nextResource();
     v.setRangeStart(ij_offset);
-    if (isPanelIncomplete)
+    if (isPanelIncomplete) {
+      ws.setWidth(nrefls_tile);
       v.setWidth(nrefls_tile);
+    }
 
     // PANEL
     compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
@@ -1063,8 +1075,8 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const
     // TODO used just by the column, maybe we can re-use a panel tile?
     // TODO probably the first one in any panel is ok?
     Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
-    computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
+    computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws);
+    ws.reset();
 
     // PREPARATION FOR TRAILING MATRIX UPDATE
     const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
@@ -1206,6 +1218,14 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
   common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
       n_workspaces, dist);
 
+  const auto dist_ws = [&]() {
+    using dlaf::factorization::internal::get_tfactor_num_workers;
+    const SizeType nworkspaces = to_SizeType(std::max<std::size_t>(0, get_tfactor_num_workers<B>() - 1));
+    const SizeType nrefls_step = band_size;
+    return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}};
+  }();
+  common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_ws(n_workspaces, dist_ws);
+
   red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
 
   ex::unique_any_sender<> trigger_panel{ex::just()};
@@ -1229,10 +1249,12 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
 
     auto& v = panels_v.nextResource();
     auto& vt = panels_vt.nextResource();
+    auto& ws = panels_ws.nextResource();
 
     v.setRangeStart(at_offset);
     vt.setRangeStart(at_offset);
 
+    ws.setWidth(nrefls_tile);
     v.setWidth(nrefls_tile);
     vt.setHeight(nrefls_tile);
 
@@ -1254,8 +1276,10 @@ Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& gr
       // deadlock due to tile shared between panel and trailing matrix
       red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
                                                      nrefls_tile, v, mat_a, !is_full_band);
-      computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
+
+      computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws,
                         mpi_col_chain);
+      ws.reset();
     }
 
     // PREPARATION FOR TRAILING MATRIX UPDATE

diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h
@@ -33,26 +33,84 @@ namespace dlaf::factorization::internal {
 /// H0 H1 H2 ... HK-1
 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
 ///       it has to be set correctly in the panel (0s as well).
-
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
+/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
 /// A Storage-Efficient WY Representation for Products of Householder Transformations.
 /// Schreiber, Robert & VanLoan, Charles. (1989)
 /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
 ///
-/// @pre taus contains a vector with k elements
-/// @pre t contains a (k x k) tile
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
+/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k)
+///
+/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
+/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     matrix::ReadOnlyTileSender<T, Device::CPU> taus,
-                    matrix::ReadWriteTileSender<T, device> t) {
-  QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t));
+                    matrix::ReadWriteTileSender<T, device> t,
+                    matrix::Panel<Coord::Col, T, device>& workspaces) {
+  QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), workspaces);
 }
 
+/// Forms the triangular factor T of a block of reflectors H, which is defined as a product
+/// of k := hh_panel.getWidth() elementary reflectors.
+///
+/// hh_panel should have the following form
+/// H0  0  0 ...    0
+///  . H1  0 ...    0
+///  .  . H2 ...    0
+///  .  .  . ...    0
+///  .  .  . ... HK-1
+///  .  .  . ...    .
+/// H0 H1 H2 ... HK-1
+/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1,
+///       it has to be set correctly in the panel (0s as well).
+///
+/// It is similar to what xLARFT in LAPACK does.
+/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values
+/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that
+///
+/// H = I - V . T . V*
+///
+/// where H = H1 . H2 . ... . Hk
+///
+/// in which Hi represents a single elementary reflector transformation.
+///
+/// A Storage-Efficient WY Representation for Products of Householder Transformations.
+/// Schreiber, Robert & VanLoan, Charles. (1989)
+/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005.
+///
+/// @param hh_panel where the elementary reflectors are stored
+/// @param taus array of taus, associated with the related elementary reflector
+/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
+/// TileElementSize(k, k)
+/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see
+/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k)
+/// @param mpi_col_task_chain where internal communications are issued
+///
+/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part)
+/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols()
 template <Backend backend, Device device, class T>
 void computeTFactor(matrix::Panel<Coord::Col, T, device>& hh_panel,
                     matrix::ReadOnlyTileSender<T, Device::CPU> taus,
                     matrix::ReadWriteTileSender<T, device> t,
+                    matrix::Panel<Coord::Col, T, device>& workspaces,
                     comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_task_chain) {
-  QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), mpi_col_task_chain);
+  QR_Tfactor<backend, device, T>::call(hh_panel, std::move(taus), std::move(t), workspaces,
+                                       mpi_col_task_chain);
 }
 
 }
diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h
@@ -27,62 +27,14 @@ struct QR {};
 
 template <Backend backend, Device device, class T>
 struct QR_Tfactor {
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus tile of the taus vector, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
-  static void call(matrix::Panel<Coord::Col, T, device>& panel_view,
+  static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
                    matrix::ReadOnlyTileSender<T, Device::CPU> taus,
-                   matrix::ReadWriteTileSender<T, device> t);
-
-  /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k
-  /// elementary reflectors.
-  ///
-  /// It is similar to what xLARFT in LAPACK does.
-  /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start,
-  /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H
-  /// block of reflector, such that
-  ///
-  /// H = I - V . T . V*
-  ///
-  /// where H = H1 . H2 . ... . Hk
-  ///
-  /// in which Hi represents a single elementary reflector transformation
-  ///
-  /// @param k the number of elementary reflectors to use (from the beginning of the tile)
-  /// @param v where the elementary reflectors are stored
-  /// @param v_start tile in @p v where the column of reflectors starts
-  /// @param taus tile of the taus vector, associated with the related elementary reflector
-  /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size
-  /// TileElementSize(k, k)
-  /// @param mpi_col_task_chain where internal communications are issued
-  ///
-  /// @pre k <= t.get().size().rows && k <= t.get().size().cols()
-  /// @pre k >= 0
-  /// @pre v_start.isIn(v.nrTiles())
+                   matrix::ReadWriteTileSender<T, device> t,
+                   matrix::Panel<Coord::Col, T, device>& workspaces);
   static void call(matrix::Panel<Coord::Col, T, device>& hh_panel,
                    matrix::ReadOnlyTileSender<T, Device::CPU> taus,
                    matrix::ReadWriteTileSender<T, device> t,
+                   matrix::Panel<Coord::Col, T, device>& workspaces,
                    comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_task_chain);
 };