diff --git a/include/dlaf/eigensolver/bt_reduction_to_band/impl.h b/include/dlaf/eigensolver/bt_reduction_to_band/impl.h index 1e2e977176..1c5cc71154 100644 --- a/include/dlaf/eigensolver/bt_reduction_to_band/impl.h +++ b/include/dlaf/eigensolver/bt_reduction_to_band/impl.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,7 @@ void BackTransformationReductionToBand::call( const SizeType b, MatrixRef& mat_c, Matrix& mat_v, Matrix& mat_taus) { using namespace bt_red_band; + using dlaf::factorization::internal::computeTFactor; auto hp = pika::execution::thread_priority::high; auto np = pika::execution::thread_priority::normal; @@ -162,10 +164,19 @@ void BackTransformationReductionToBand::call( dlaf::matrix::Distribution dist_t({mb, total_nr_reflector}, {mb, mb}); matrix::Panel panelT(dist_t); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = + to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist_v.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panelsWS(n_workspaces, dist_ws); + const SizeType nr_reflector_blocks = dist_t.nrTiles().cols(); for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) { - bool is_last = (k == nr_reflector_blocks - 1); + const bool is_last = (k == nr_reflector_blocks - 1); const SizeType nr_reflectors = dist_t.tileSize({0, k}).cols(); const GlobalElementIndex v_offset(k * mb + b, k * mb); @@ -174,6 +185,7 @@ void BackTransformationReductionToBand::call( const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors); const matrix::SubMatrixView mat_c_view(dist_c, c_offset); + auto& panelWS = panelsWS.nextResource(); auto& panelV = panelsV.nextResource(); auto& panelW = panelsW.nextResource(); auto& panelW2 = panelsW2.nextResource(); @@ -182,6 +194,7 @@ void BackTransformationReductionToBand::call( panelW.setRangeStart(v_offset); if (is_last) { + panelWS.setWidth(nr_reflectors); panelT.setHeight(nr_reflectors); panelW2.setHeight(nr_reflectors); panelW.setWidth(nr_reflectors); @@ -207,8 +220,8 @@ void BackTransformationReductionToBand::call( const LocalTileIndex taus_index{Coord::Row, k}; const LocalTileIndex t_index{Coord::Col, k}; - dlaf::factorization::internal::computeTFactor(panelV, mat_taus.read(taus_index), - panelT.readwrite(t_index)); + + computeTFactor(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS); // W = V T auto tile_t = panelT.read(t_index); @@ -233,6 +246,7 @@ void BackTransformationReductionToBand::call( panelW.reset(); panelW2.reset(); panelT.reset(); + panelWS.reset(); } } @@ -242,6 +256,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr Matrix& mat_taus) { namespace ex = pika::execution::experimental; using namespace bt_red_band; + using dlaf::factorization::internal::computeTFactor; auto hp = pika::execution::thread_priority::high; auto np = pika::execution::thread_priority::normal; @@ -277,6 +292,14 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr dist_v.sourceRankIndex()); matrix::Panel panelT(dist_t); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist_v.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, 
nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panelsWS(n_workspaces, dist_ws); + const SizeType nr_reflector_blocks = dist_t.nrTiles().cols(); for (SizeType k = nr_reflector_blocks - 1; k >= 0; --k) { @@ -289,6 +312,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr const matrix::SubPanelView panel_view(dist_v, v_offset, nr_reflectors); const matrix::SubMatrixView mat_c_view(dist_c, c_offset); + auto& panelWS = panelsWS.nextResource(); auto& panelV = panelsV.nextResource(); auto& panelW = panelsW.nextResource(); auto& panelW2 = panelsW2.nextResource(); @@ -298,6 +322,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr panelT.setRange(GlobalTileIndex(Coord::Col, k), GlobalTileIndex(Coord::Col, k + 1)); if (is_last) { + panelWS.setWidth(nr_reflectors); panelT.setHeight(nr_reflectors); panelW2.setHeight(nr_reflectors); panelW.setWidth(nr_reflectors); @@ -329,8 +354,9 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr const GlobalTileIndex taus_index{Coord::Row, k}; const SizeType k_local = dist_t.template localTileFromGlobalTile(k); const LocalTileIndex t_index{Coord::Col, k_local}; - dlaf::factorization::internal::computeTFactor(panelV, mat_taus.read(taus_index), - panelT.readwrite(t_index), mpi_col_task_chain); + + computeTFactor(panelV, mat_taus.read(taus_index), panelT.readwrite(t_index), panelWS, + mpi_col_task_chain); // WH = V T for (const auto& idx : panel_view.iteratorLocal()) { @@ -349,9 +375,10 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr } if (mpi_col_task_chain.size() > 1) { - for (const auto& kj_panel : panelW2.iteratorLocal()) + for (const auto& kj_panel : panelW2.iteratorLocal()) { ex::start_detached(dlaf::comm::schedule_all_reduce_in_place( mpi_col_task_chain.exclusive(), MPI_SUM, panelW2.readwrite(kj_panel))); + } } broadcast(k_rank_col, panelV, mpi_row_task_chain); @@ -366,6 +393,7 @@ void BackTransformationReductionToBand::call(comm::CommunicatorGrid& gr panelW.reset(); panelW2.reset(); panelT.reset(); + panelWS.reset(); } } } diff --git a/include/dlaf/eigensolver/reduction_to_band/impl.h b/include/dlaf/eigensolver/reduction_to_band/impl.h index 28593afcad..87b40f837e 100644 --- a/include/dlaf/eigensolver/reduction_to_band/impl.h +++ b/include/dlaf/eigensolver/reduction_to_band/impl.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -1019,6 +1020,14 @@ Matrix ReductionToBand::call(Matrix& mat_a, const common::RoundRobin> panels_w(n_workspaces, dist); common::RoundRobin> panels_x(n_workspaces, dist); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = dist.tile_size().cols(); + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panels_ws(n_workspaces, dist_ws); + // Note: // Here dist_a is given with full panel size instead of dist with just the part actually needeed, // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is @@ -1044,10 +1053,13 @@ Matrix ReductionToBand::call(Matrix& mat_a, const // reflectors (i.e. 
at the end when last reflector size is 1) const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size); + Panel& ws = panels_ws.nextResource(); Panel& v = panels_v.nextResource(); v.setRangeStart(ij_offset); - if (isPanelIncomplete) + if (isPanelIncomplete) { + ws.setWidth(nrefls_tile); v.setWidth(nrefls_tile); + } // PANEL compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view); @@ -1063,8 +1075,8 @@ Matrix ReductionToBand::call(Matrix& mat_a, const // TODO used just by the column, maybe we can re-use a panel tile? // TODO probably the first one in any panel is ok? Matrix t({nrefls_tile, nrefls_tile}, dist.blockSize()); - - computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx)); + computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws); + ws.reset(); // PREPARATION FOR TRAILING MATRIX UPDATE const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size)); @@ -1206,6 +1218,14 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr common::RoundRobin> panels_xt( n_workspaces, dist); + const auto dist_ws = [&]() { + using dlaf::factorization::internal::get_tfactor_num_workers; + const SizeType nworkspaces = to_SizeType(std::max(0, get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = band_size; + return matrix::Distribution{{nworkspaces * nrefls_step, nrefls_step}, {nrefls_step, nrefls_step}}; + }(); + common::RoundRobin> panels_ws(n_workspaces, dist_ws); + red2band::ComputePanelHelper compute_panel_helper(n_workspaces, dist); ex::unique_any_sender<> trigger_panel{ex::just()}; @@ -1229,10 +1249,12 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr auto& v = panels_v.nextResource(); auto& vt = panels_vt.nextResource(); + auto& ws = panels_ws.nextResource(); v.setRangeStart(at_offset); vt.setRangeStart(at_offset); + ws.setWidth(nrefls_tile); v.setWidth(nrefls_tile); vt.setHeight(nrefls_tile); @@ -1254,8 +1276,10 @@ Matrix ReductionToBand::call(comm::CommunicatorGrid& gr // deadlock due to tile shared between panel and trailing matrix red2band::local::setupReflectorPanelV(rank.row() == rank_v0.row(), panel_view, nrefls_tile, v, mat_a, !is_full_band); - computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), + + computeTFactor(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx), ws, mpi_col_chain); + ws.reset(); } // PREPARATION FOR TRAILING MATRIX UPDATE diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index 7337412e0e..86093dc532 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -33,26 +33,84 @@ namespace dlaf::factorization::internal { /// H0 H1 H2 ... HK-1 /// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, /// it has to be set correctly in the panel (0s as well). - +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// /// A Storage-Efficient WY Representation for Products of Householder Transformations. /// Schreiber, Robert & VanLoan, Charles. (1989) /// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
/// -/// @pre taus contains a vector with k elements -/// @pre t contains a (k x k) tile +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see +/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k) +/// +/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part) +/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender t) { - QR_Tfactor::call(hh_panel, std::move(taus), std::move(t)); + matrix::ReadWriteTileSender t, + matrix::Panel& workspaces) { + QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), workspaces); } +/// Forms the triangular factor T of a block of reflectors H, which is defined as a product +/// of k := hh_panel.getWidth() elementary reflectors. +/// +/// hh_panel should have the following form +/// H0 0 0 ... 0 +/// . H1 0 ... 0 +/// . . H2 ... 0 +/// . . . ... 0 +/// . . . ... HK-1 +/// . . . ... . +/// H0 H1 H2 ... HK-1 +/// Note: The first element of the HH reflectors is NOT implicitly assumed to be 1, +/// it has to be set correctly in the panel (0s as well). +/// +/// It is similar to what xLARFT in LAPACK does. +/// Given @p k elementary reflectors stored in the column of @p hh_panel together with related tau values +/// in @p taus, in @p t will be formed the triangular factor for the H block of reflectors, such that +/// +/// H = I - V . T . V* +/// +/// where H = H1 . H2 . ... . Hk +/// +/// in which Hi represents a single elementary reflector transformation. +/// +/// A Storage-Efficient WY Representation for Products of Householder Transformations. +/// Schreiber, Robert & VanLoan, Charles. (1989) +/// SIAM Journal on Scientific and Statistical Computing. 10. 10.1137/0910005. 
+/// +/// @param hh_panel where the elementary reflectors are stored +/// @param taus array of taus, associated with the related elementary reflector +/// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size +/// TileElementSize(k, k) +/// @param workspaces array of tiles used as workspace, with at least one tile per worker (see +/// get_tfactor_num_workers), each tile should be at least of size TileElementSize(k, k) +/// @param mpi_col_task_chain where internal communications are issued +/// +/// @pre reflectors in hh_panel are well formed (1s on the diagonal and 0s in the upper part) +/// @pre hh_panel.getWidth() <= t.get().size().rows && hh_panel.getWidth() <= t.get().size().cols() template void computeTFactor(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, matrix::ReadWriteTileSender t, + matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain) { - QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), mpi_col_task_chain); + QR_Tfactor::call(hh_panel, std::move(taus), std::move(t), workspaces, + mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index e4fef98c37..507bfc4d1c 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -27,62 +27,14 @@ struct QR {}; template struct QR_Tfactor { - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus tile of the taus vector, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) - static void call(matrix::Panel& panel_view, + static void call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender t); - - /// Forms the triangular factor T of a block of reflectors H, which is defined as a product of k - /// elementary reflectors. - /// - /// It is similar to what xLARFT in LAPACK does. - /// Given @p k elementary reflectors stored in the column of @p v starting at tile @p v_start, - /// together with related tau values in @p taus, in @p t will be formed the triangular factor for the H - /// block of reflector, such that - /// - /// H = I - V . T . V* - /// - /// where H = H1 . H2 . ... . 
Hk - /// - /// in which Hi represents a single elementary reflector transformation - /// - /// @param k the number of elementary reflectors to use (from the beginning of the tile) - /// @param v where the elementary reflectors are stored - /// @param v_start tile in @p v where the column of reflectors starts - /// @param taus tile of the taus vector, associated with the related elementary reflector - /// @param t tile where the resulting T factor will be stored in its top-left sub-matrix of size - /// TileElementSize(k, k) - /// @param mpi_col_task_chain where internal communications are issued - /// - /// @pre k <= t.get().size().rows && k <= t.get().size().cols() - /// @pre k >= 0 - /// @pre v_start.isIn(v.nrTiles()) + matrix::ReadWriteTileSender t, + matrix::Panel& workspaces); static void call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, matrix::ReadWriteTileSender t, + matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain); }; diff --git a/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h b/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h new file mode 100644 index 0000000000..3634ebc7bb --- /dev/null +++ b/include/dlaf/factorization/qr/internal/get_tfactor_barrier_busy_wait.h @@ -0,0 +1,22 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2024, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. +// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include + +#include "dlaf/tune.h" + +namespace dlaf::factorization::internal { + +inline std::chrono::duration getTFactorBarrierBusyWait() noexcept { + return std::chrono::microseconds(getTuneParameters().tfactor_barrier_busy_wait_us); +} + +} diff --git a/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h b/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h new file mode 100644 index 0000000000..ec6e01182f --- /dev/null +++ b/include/dlaf/factorization/qr/internal/get_tfactor_num_workers.h @@ -0,0 +1,38 @@ +// +// Distributed Linear Algebra with Future (DLAF) +// +// Copyright (c) 2018-2024, ETH Zurich +// All rights reserved. +// +// Please, refer to the LICENSE file in the root directory. 
+// SPDX-License-Identifier: BSD-3-Clause +// +#pragma once + +#include +#include +#include + +#include + +#include + +namespace dlaf::factorization::internal { + +template +std::size_t get_tfactor_num_workers() noexcept { + if constexpr (B == Backend::MC) { + // Note: precautionarily we leave at least 1 thread "free" to do other stuff (if possible) + const std::size_t available_workers = + pika::resource::get_thread_pool("default").get_os_thread_count(); + const std::size_t min_workers = 1; + const auto max_workers = std::max(min_workers, available_workers - 1); + + const std::size_t nworkers = getTuneParameters().tfactor_num_threads; + return std::clamp(nworkers, min_workers, max_workers); + } + else if constexpr (B == Backend::GPU) { + return std::max(1, getTuneParameters().tfactor_num_streams); + } +} +} diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index af3222f0af..fec3c3f830 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -10,12 +10,19 @@ #pragma once +#include +#include +#include +#include #include +#include #include +#include #include +#include #include #include #include @@ -24,12 +31,15 @@ #include #include #include +#include +#include #include #include #include #include #include #include +#include #include #ifdef DLAF_WITH_GPU @@ -41,66 +51,140 @@ namespace dlaf::factorization::internal { namespace tfactor_l { + +template +auto split_tfactor_work(const std::size_t nrtiles) { + const std::size_t min_workers = 1; + const std::size_t available_workers = get_tfactor_num_workers(); + const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(1)); + + struct { + std::size_t nworkers; + std::size_t batch_size; + } params; + + params.batch_size = util::ceilDiv(nrtiles, std::clamp(ideal_workers, min_workers, available_workers)); + DLAF_ASSERT_MODERATE(params.batch_size > 0, params.batch_size, nrtiles); + params.nworkers = std::max(1, util::ceilDiv(nrtiles, params.batch_size)); + + return params; +} + template struct Helpers {}; template struct Helpers { - static auto prepareT(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { - namespace ex = pika::execution::experimental; + static auto set0_and_return(matrix::ReadWriteTileSender tile_t) { namespace di = dlaf::internal; - return ex::when_all(std::move(taus), std::move(tile_t)) | + return std::move(tile_t) | di::transform(di::Policy(pika::execution::thread_priority::high), - [](const matrix::Tile& taus, - matrix::Tile&& tile_t) { + [](matrix::Tile&& tile_t) { tile::internal::set0(tile_t); - - const SizeType k = tile_t.size().cols(); - lapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), - tile_t.ld() + 1); - return std::move(tile_t); }); } - static auto stepGEMV(matrix::ReadOnlyTileSender tile_vi, - matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { - auto gemv_func = [](const matrix::Tile& tile_v, + static void loop_gemv(const matrix::Tile& tile_v, const matrix::Tile& taus, - matrix::Tile tile_t) noexcept { - const SizeType k = tile_t.size().cols(); + const matrix::Tile& tile_t) noexcept { + const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); - common::internal::SingleThreadedBlasScope 
single; - for (SizeType j = 0; j < k; ++j) { - // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) - // [j x 1] = [(n-j) x j]* . [(n-j) x 1] - const TileElementIndex t_start{0, j}; - const TileElementIndex va_start{0, 0}; - const TileElementIndex vb_start{0, j}; - const TileElementSize va_size{tile_v.size().rows(), j}; - const T tau = tile_t({j, j}); - - blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, - tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), - 1); - } - return tile_t; - }; + common::internal::SingleThreadedBlasScope single; + for (SizeType j = 0; j < k; ++j) { + // T(0:j, j) = -tau . V(j:, 0:j)* . V(j:, j) + // [j x 1] = [(n-j) x j]* . [(n-j) x 1] + const TileElementIndex t_start{0, j}; + const TileElementIndex va_start{0, 0}; + const TileElementIndex vb_start{0, j}; + const TileElementSize va_size{tile_v.size().rows(), j}; + const T tau = tile_t({j, j}); + blas::gemv(blas::Layout::ColMajor, blas::Op::ConjTrans, va_size.rows(), va_size.cols(), -tau, + tile_v.ptr(va_start), tile_v.ld(), tile_v.ptr(vb_start), 1, 1, tile_t.ptr(t_start), 1); + } + } + + static matrix::ReadWriteTileSender step_gemv( + matrix::Panel& hh_panel, + matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - return ex::when_all(tile_vi, std::move(taus), std::move(tile_t)) | - di::transform(di::Policy(pika::execution::thread_priority::high), - std::move(gemv_func)); + using pika::execution::thread_priority; + + std::vector> hh_tiles = + selectRead(hh_panel, hh_panel.iteratorLocal()); + + const SizeType k = hh_panel.getWidth(); + + if (hh_tiles.size() == 0) + return tile_t; + + const auto workers_params = split_tfactor_work(hh_tiles.size()); + const std::size_t nworkers = workers_params.nworkers; + const std::size_t batch_size = workers_params.batch_size; + + const auto range_workspaces = common::iterate_range2d(LocalTileSize{to_SizeType(nworkers - 1), 1}); + DLAF_ASSERT(to_sizet(std::distance(range_workspaces.begin(), range_workspaces.end())) >= + nworkers - 1, + std::distance(range_workspaces.begin(), range_workspaces.end()), nworkers - 1); + + const auto hp_scheduler = di::getBackendScheduler(thread_priority::high); + return ex::when_all(ex::when_all_vector(std::move(hh_tiles)), std::move(taus), std::move(tile_t), + ex::when_all_vector(select(workspaces, range_workspaces))) | + ex::continues_on(hp_scheduler) | + ex::let_value([hp_scheduler, k, nworkers, batch_size](auto& hh_tiles, auto& taus, + auto& tile_t, auto& workspaces) { + return ex::just(std::make_unique>(nworkers)) | + ex::continues_on(hp_scheduler) | + ex::bulk( + nworkers, + [=, &hh_tiles, &taus, &tile_t, &workspaces](const std::size_t worker_id, + auto& barrier_ptr) mutable { + const auto barrier_busy_wait = getTFactorBarrierBusyWait(); + DLAF_ASSERT_HEAVY(k == taus.get().size().rows(), k, taus.get().size().rows()); + + const std::size_t begin = worker_id * batch_size; + const std::size_t end = + std::min(worker_id * batch_size + batch_size, hh_tiles.size()); + + const matrix::Tile ws_worker = + (worker_id == 0 ? 
tile_t : workspaces[worker_id - 1]) + .subTileReference({{0, 0}, tile_t.size()}); + + tile::internal::set0(ws_worker); + lapack::lacpy(blas::Uplo::General, 1, k, taus.get().ptr(), 1, ws_worker.ptr(), + ws_worker.ld() + 1); + + // make it work on worker_id section of tiles + for (std::size_t index = begin; index < end; ++index) { + const matrix::Tile& tile_v = hh_tiles[index].get(); + loop_gemv(tile_v, taus.get(), ws_worker); + } + + barrier_ptr->arrive_and_wait(barrier_busy_wait); + + // reduce ws_T in tile_t + if (worker_id == 0) { + for (std::size_t other_worker = 1; other_worker < nworkers; ++other_worker) { + matrix::Tile ws = + workspaces[other_worker - 1].subTileReference({{0, 0}, tile_t.size()}); + tile::internal::add(T(1), ws, tile_t); + } + } + }) | + // Note: ignore the barrier sent by the bulk and just return tile_t + ex::then([&tile_t](auto) mutable { return std::move(tile_t); }); + }); } - static void trmvLoop(const matrix::Tile& tile_t) { + static void loop_trmv(const matrix::Tile& tile_t) { common::internal::SingleThreadedBlasScope single; const SizeType k = tile_t.size().cols(); @@ -116,15 +200,8 @@ struct Helpers { } } - static auto stepTRMV(matrix::ReadWriteTileSender tile_t) noexcept { - namespace di = dlaf::internal; - - return std::move(tile_t) | - di::transform(di::Policy(pika::execution::thread_priority::high), trmvLoop); - } - - static auto stepCopyDiagAndTRMV(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { + static auto step_copy_diag_and_trmv(matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t) noexcept { auto tausdiag_trmvloop = [](const matrix::Tile& taus, matrix::Tile tile_t) { common::internal::SingleThreadedBlasScope single; @@ -132,7 +209,7 @@ struct Helpers { const SizeType k = tile_t.size().cols(); lapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1); - trmvLoop(tile_t); + loop_trmv(tile_t); }; namespace di = dlaf::internal; @@ -147,53 +224,132 @@ struct Helpers { #ifdef DLAF_WITH_GPU template struct Helpers { - static auto prepareT(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { + static auto set0_and_return(matrix::ReadWriteTileSender tile_t) { namespace di = dlaf::internal; - namespace ex = pika::execution::experimental; - - return ex::when_all(std::move(taus), std::move(tile_t)) | + return std::move(tile_t) | di::transform(di::Policy(pika::execution::thread_priority::high), - [](const matrix::Tile& taus, - matrix::Tile& tile_t, whip::stream_t stream) { + [](matrix::Tile& tile_t, whip::stream_t stream) { tile::internal::set0(tile_t, stream); - - const SizeType k = tile_t.size().cols(); - whip::memcpy_2d_async(tile_t.ptr(), to_sizet(tile_t.ld() + 1) * sizeof(T), - taus.ptr(), sizeof(T), sizeof(T), to_sizet(k), - whip::memcpy_host_to_device, stream); - return std::move(tile_t); }); } - - static auto stepGEMV(matrix::ReadOnlyTileSender tile_vi, - matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { - auto gemv_func = [](cublasHandle_t handle, const matrix::Tile& tile_v, + static void loop_gemv(cublasHandle_t handle, const matrix::Tile& tile_v, const matrix::Tile& taus, - matrix::Tile& tile_t) noexcept { - const SizeType m = tile_v.size().rows(); - const SizeType k = tile_t.size().cols(); - DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); - DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); - DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols()); - - gpulapack::larft_gemv0(handle, m, k, 
tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(), - tile_t.ld()); + const matrix::Tile& tile_t) noexcept { + const SizeType m = tile_v.size().rows(); + const SizeType k = tile_t.size().cols(); + DLAF_ASSERT(tile_v.size().cols() == k, tile_v.size().cols(), k); + DLAF_ASSERT(taus.size().rows() == k, taus.size().rows(), k); + DLAF_ASSERT(taus.size().cols() == 1, taus.size().cols()); - return std::move(tile_t); - }; + gpulapack::larft_gemv0(handle, m, k, tile_v.ptr(), tile_v.ld(), taus.ptr(), tile_t.ptr(), + tile_t.ld()); + } + static matrix::ReadWriteTileSender step_gemv( + matrix::Panel& hh_panel, + matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; namespace di = dlaf::internal; - return ex::when_all(std::move(tile_vi), std::move(taus), std::move(tile_t)) | - di::transform( - di::Policy(pika::execution::thread_priority::high), std::move(gemv_func)); + using pika::execution::thread_priority; + + std::vector> hh_tiles = + selectRead(hh_panel, hh_panel.iteratorLocal()); + + const SizeType k = hh_panel.getWidth(); + + if (hh_tiles.size() == 0) + return tile_t; + + const auto workers_params = split_tfactor_work(hh_tiles.size()); + const std::size_t nworkers = workers_params.nworkers; + const std::size_t batch_size = workers_params.batch_size; + + const auto range_workspaces = common::iterate_range2d(LocalTileSize{to_SizeType(nworkers - 1), 1}); + DLAF_ASSERT(to_sizet(std::distance(range_workspaces.begin(), range_workspaces.end())) >= + nworkers - 1, + std::distance(range_workspaces.begin(), range_workspaces.end()), nworkers - 1); + + for (std::size_t worker_id = 0; worker_id < nworkers; ++worker_id) { + const std::size_t begin = worker_id * batch_size; + const std::size_t end = std::min(hh_tiles.size(), (worker_id + 1) * batch_size); + + DLAF_ASSERT(end >= begin, begin, end); + + if (end == begin) + continue; + + std::vector> input_tiles; + input_tiles.reserve(end - begin); + for (std::size_t sub = begin; sub < end; ++sub) + input_tiles.emplace_back(std::move(hh_tiles[sub])); + + matrix::ReadWriteTileSender workspace = + worker_id == 0 ? 
std::move(tile_t) + : workspaces.readwrite(LocalTileIndex{to_SizeType(worker_id - 1), 0}); + + workspace = + di::whenAllLift(ex::when_all_vector(std::move(input_tiles)), taus, std::move(workspace)) | + di::transform( + di::Policy(thread_priority::high), + [k](cublasHandle_t handle, auto&& hh_tiles, auto&& taus, + matrix::Tile& tile_t_full) { + DLAF_ASSERT_MODERATE(k == taus.size().rows(), k, taus.size().rows()); + + matrix::Tile tile_t = tile_t_full.subTileReference({{0, 0}, {k, k}}); + + // Note: + // prepare the diagonal of taus in t and reset the rest + whip::stream_t stream; + DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream)); + + whip::memset_2d_async(tile_t.ptr(), sizeof(T) * to_sizet(tile_t.ld()), 0, + sizeof(T) * to_sizet(k), to_sizet(k), stream); + gpulapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1, + stream); + + // Note: + // - call one gemv per tile + // - being on the same stream, they are already serialised on GPU + for (std::size_t index = 0; index < hh_tiles.size(); ++index) { + const matrix::Tile& tile_v = hh_tiles[index].get(); + Helpers::loop_gemv(handle, tile_v, taus, tile_t); + } + + return std::move(tile_t_full); + }); + + if (worker_id == 0) + tile_t = std::move(workspace); + else + ex::start_detached(std::move(workspace)); + } + + if (nworkers > 1 and k > 1) { + tile_t = + di::whenAllLift(std::move(tile_t), + ex::when_all_vector(selectRead(workspaces, range_workspaces))) | + di::transform( + di::Policy(thread_priority::high), + [nworkers](auto&& tile_t, auto&& workspaces, whip::stream_t stream) { + for (std::size_t index = 0; index < nworkers - 1; ++index) { + matrix::Tile ws = + workspaces[index].get().subTileReference({{0, 0}, tile_t.size()}); + DLAF_ASSERT(equal_size(ws, tile_t), ws, tile_t); + gpulapack::add(blas::Uplo::Upper, tile_t.size().rows() - 1, tile_t.size().cols() - 1, + T(1), ws.ptr({0, 1}), ws.ld(), tile_t.ptr({0, 1}), tile_t.ld(), stream); + } + return std::move(tile_t); + }); + } + + return tile_t; } - static void trmvLoop(cublasHandle_t handle, const matrix::Tile& tile_t) { + static void loop_trmv(cublasHandle_t handle, const matrix::Tile& tile_t) { const SizeType k = tile_t.size().cols(); // Update each column (in order) t = T . t @@ -209,16 +365,8 @@ struct Helpers { } } - static auto stepTRMV(matrix::ReadWriteTileSender tile_t) noexcept { - namespace di = dlaf::internal; - - return std::move(tile_t) | - di::transform( - di::Policy(pika::execution::thread_priority::high), trmvLoop); - } - - static auto stepCopyDiagAndTRMV(matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) noexcept { + static auto step_copy_diag_and_trmv(matrix::ReadOnlyTileSender taus, + matrix::ReadWriteTileSender tile_t) noexcept { // Update each column (in order) t = T . 
t // remember that T is upper triangular, so it is possible to use TRMV auto trmv_func = [](cublasHandle_t handle, const matrix::Tile& taus, @@ -227,10 +375,9 @@ struct Helpers { DLAF_GPUBLAS_CHECK_ERROR(cublasGetStream(handle, &stream)); const SizeType k = tile_t.size().cols(); - whip::memcpy_2d_async(tile_t.ptr(), to_sizet(tile_t.ld() + 1) * sizeof(T), taus.ptr(), sizeof(T), - sizeof(T), to_sizet(k), whip::memcpy_host_to_device, stream); + gpulapack::lacpy(blas::Uplo::General, 1, k, taus.ptr(), 1, tile_t.ptr(), tile_t.ld() + 1, stream); - trmvLoop(handle, tile_t); + loop_trmv(handle, tile_t); }; namespace ex = pika::execution::experimental; @@ -247,7 +394,8 @@ struct Helpers { template void QR_Tfactor::call(matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t) { + matrix::ReadWriteTileSender tile_t, + matrix::Panel& workspaces) { namespace ex = pika::execution::experimental; using Helpers = tfactor_l::Helpers; @@ -256,8 +404,6 @@ void QR_Tfactor::call(matrix::Panel& if (hh_panel.getWidth() == 0) return; - tile_t = Helpers::prepareT(taus, std::move(tile_t)); - // Note: // T factor is an upper triangular square matrix, built column by column // with taus values on the diagonal @@ -268,32 +414,18 @@ void QR_Tfactor::call(matrix::Panel& // // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) // - // // The result is achieved in two main steps: - // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) - // 2) T(0:j, j) = T(0:j, 0:j) . t - - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . V(j:, j) - for (const auto& i_lc : hh_panel.iteratorLocal()) { - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - tile_t = Helpers::stepGEMV(hh_panel.read(i_lc), taus, std::move(tile_t)); - } + // 1) "GEMV" t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) "TRMV" T(0:j, j) = T(0:j, 0:j) . t - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::stepTRMV(std::move(tile_t))); + tile_t = Helpers::step_gemv(hh_panel, taus, std::move(tile_t), workspaces); + ex::start_detached(Helpers::step_copy_diag_and_trmv(taus, std::move(tile_t))); } template void QR_Tfactor::call( matrix::Panel& hh_panel, matrix::ReadOnlyTileSender taus, - matrix::ReadWriteTileSender tile_t, + matrix::ReadWriteTileSender tile_t, matrix::Panel& workspaces, comm::CommunicatorPipeline& mpi_col_task_chain) { namespace ex = pika::execution::experimental; @@ -303,8 +435,6 @@ void QR_Tfactor::call( if (hh_panel.getWidth() == 0) return; - tile_t = Helpers::prepareT(taus, std::move(tile_t)); - // Note: // T factor is an upper triangular square matrix, built column by column // with taus values on the diagonal @@ -315,31 +445,22 @@ void QR_Tfactor::call( // // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) // - // // The result is achieved in two main steps: - // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) - // 2) T(0:j, j) = T(0:j, 0:j) . t - - // 1st step: compute the column partial result `t` - // First we compute the matrix vector multiplication for each column - // -tau(j) . V(j:, 0:j)* . 
V(j:, j) - for (const auto& i_lc : hh_panel.iteratorLocal()) { - // Note: - // Since we are writing always on the same t, the gemv are serialized - // A possible solution to this would be to have multiple places where to store partial - // results, and then locally reduce them just before the reduce over ranks - tile_t = Helpers::stepGEMV(hh_panel.read(i_lc), taus, std::move(tile_t)); - } + 1) "GEMV" t = -tau(j) . V(j:, 0:j)* . V(j:, j) + 2) "TRMV" T(0:j, j) = T(0:j, 0:j) . t + + // Note: + // reset is needed because not all ranks might have computations to do, but they will participate + // in the MPI reduction anyway. + tile_t = Helpers::set0_and_return(std::move(tile_t)); + tile_t = Helpers::step_gemv(hh_panel, taus, std::move(tile_t), workspaces); - // at this point each rank has its partial result for each column + // Note: at this point each rank has its partial result for each column // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) if (mpi_col_task_chain.size() > 1) tile_t = schedule_all_reduce_in_place(mpi_col_task_chain.exclusive(), MPI_SUM, std::move(tile_t)); - // 2nd step: compute the T factor, by performing the last step on each column - // each column depends on the previous part (all reflectors that comes before) - // so it is performed sequentially - ex::start_detached(Helpers::stepCopyDiagAndTRMV(taus, std::move(tile_t))); + ex::start_detached(Helpers::step_copy_diag_and_trmv(taus, std::move(tile_t))); } } diff --git a/include/dlaf/lapack/tile.h b/include/dlaf/lapack/tile.h index 7ef832763a..9e50b752b5 100644 --- a/include/dlaf/lapack/tile.h +++ b/include/dlaf/lapack/tile.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #endif diff --git a/include/dlaf/tune.h b/include/dlaf/tune.h index 3508e31c56..1b1195acc5 100644 --- a/include/dlaf/tune.h +++ b/include/dlaf/tune.h @@ -55,6 +55,17 @@ namespace dlaf { /// Enable dump of tridiagonal solver input/output data to "tridiagonal.h5" file that will be /// created in the working folder (it should not exist before the execution). /// Set with environment variable DLAF_DEBUG_DUMP_TRIDIAG_SOLVER_DATA. +/// - tfactor_num_threads: +/// The maximum number of threads to use for computing the tfactor (used, for +/// instance, in red2band and its backtransformation). Set with --dlaf:tfactor-num-threads or env +/// variable DLAF_TFACTOR_NUM_THREADS. +/// - tfactor_num_streams: +/// The maximum number of streams to use for computing the tfactor (used, for +/// instance, in red2band and its backtransformation). Set with --dlaf:tfactor-num-streams or env +/// variable DLAF_TFACTOR_NUM_STREAMS. +/// - tfactor_barrier_busy_wait_us: +/// The duration in microseconds to busy-wait in barriers in the tfactor algorithm. +/// Set with --dlaf:tfactor-barrier-busy-wait-us or env variable DLAF_TFACTOR_BARRIER_BUSY_WAIT_US. /// - red2band_panel_nworkers: /// The maximum number of threads to use for computing the panel in the reduction to band algorithm. /// Set with --dlaf:red2band-panel-nworkers or env variable DLAF_RED2BAND_PANEL_NWORKERS.
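As a usage illustration of the three tfactor_* tuning knobs documented in the hunk above, here is a minimal hedged sketch that adjusts them programmatically through dlaf::getTuneParameters(); the helper name configure_tfactor_tuning and the chosen values are assumptions for illustration, while the struct fields and header come from this patch.

#include <dlaf/tune.h>

// Hypothetical helper: call after dlaf::initialize() and before running any solver.
void configure_tfactor_tuning() {
  auto& params = dlaf::getTuneParameters();
  params.tfactor_num_threads = 4;              // MC backend: at most 4 worker threads per T factor
  params.tfactor_num_streams = 2;              // GPU backend: at most 2 concurrent streams
  params.tfactor_barrier_busy_wait_us = 1000;  // busy-wait 1000 us in the tfactor worker barrier
}

The same values can also be set with the command-line options --dlaf:tfactor-num-threads, --dlaf:tfactor-num-streams and --dlaf:tfactor-barrier-busy-wait-us, or with the corresponding DLAF_* environment variables registered in src/init.cpp further down in this patch.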
@@ -108,6 +119,7 @@ struct TuneParameters { const auto default_pool_thread_count = pika::resource::get_thread_pool("default").get_os_thread_count(); + tfactor_num_threads = std::max(1, default_pool_thread_count / 2); red2band_panel_nworkers = std::max(1, default_pool_thread_count / 2); tridiag_rank1_nworkers = default_pool_thread_count; } @@ -118,6 +130,10 @@ struct TuneParameters { bool debug_dump_reduction_to_band_data = false; bool debug_dump_band_to_tridiagonal_data = false; bool debug_dump_tridiag_solver_data = false; + + std::size_t tfactor_num_threads = 1; + std::size_t tfactor_num_streams = 4; + std::size_t tfactor_barrier_busy_wait_us = 0; std::size_t red2band_panel_nworkers = 1; std::size_t red2band_barrier_busy_wait_us = 1000; std::size_t tridiag_rank1_nworkers = 1; diff --git a/src/init.cpp b/src/init.cpp index b21890fb2c..7dd3fa9fb5 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -281,6 +281,9 @@ void updateConfiguration(const pika::program_options::variables_map& vm, configu // These are added automatically by updateConfigurationValue. auto& param = getTuneParameters(); // clang-format off + updateConfigurationValue(vm, param.tfactor_num_threads, "TFACTOR_NUM_THREADS", "tfactor-num-threads"); + updateConfigurationValue(vm, param.tfactor_num_streams, "TFACTOR_NUM_STREAMS", "tfactor-num-streams"); + updateConfigurationValue(vm, param.tfactor_barrier_busy_wait_us, "TFACTOR_BARRIER_BUSY_WAIT_US", "tfactor-barrier-busy-wait-us"); updateConfigurationValue(vm, param.red2band_panel_nworkers, "RED2BAND_PANEL_NWORKERS", "red2band-panel-nworkers"); updateConfigurationValue(vm, param.red2band_barrier_busy_wait_us, "RED2BAND_BARRIER_BUSY_WAIT_US", "red2band-barrier-busy-wait-us"); updateConfigurationValue(vm, param.eigensolver_min_band, "EIGENSOLVER_MIN_BAND", "eigensolver-min-band"); @@ -335,14 +338,17 @@ pika::program_options::options_description getOptionsDescription() { desc.add_options()("dlaf:no-mpi-pool", pika::program_options::bool_switch(), "Disable the MPI pool."); // Tune parameters command line options - desc.add_options()( "dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); - desc.add_options()( "dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); - desc.add_options()( "dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); - desc.add_options()( "dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
(The input matrix is distributed with a {nb x nb} block size.)"); - desc.add_options()( "dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); - desc.add_options()( "dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); - desc.add_options()( "dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); - desc.add_options()( "dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); + desc.add_options()("dlaf:tfactor-num-threads", pika::program_options::value(), "The maximum number of threads to use for computing the tfactor."); + desc.add_options()("dlaf:tfactor-num-streams", pika::program_options::value(), "The maximum number of GPU streams to use for computing the tfactor."); + desc.add_options()("dlaf:tfactor-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the tfactor algorithm."); + desc.add_options()("dlaf:red2band-panel-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing the panel in the reduction to band algorithm."); + desc.add_options()("dlaf:red2band-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers in the reduction to band algorithm."); + desc.add_options()("dlaf:eigensolver-min-band", pika::program_options::value(), "The minimum value to start looking for a divisor of the block size. When larger than the block size, the block size will be used instead."); + desc.add_options()("dlaf:band-to-tridiag-1d-block-size-base", pika::program_options::value(), "The 1D block size for band_to_tridiagonal is computed as 1d_block_size_base / nb * nb. 
(The input matrix is distributed with a {nb x nb} block size.)"); + desc.add_options()("dlaf:tridiag-rank1-nworkers", pika::program_options::value(), "The maximum number of threads to use for computing rank1 problem solution in tridiagonal solver algorithm."); + desc.add_options()("dlaf:tridiag-rank1-barrier-busy-wait-us", pika::program_options::value(), "The duration in microseconds to busy-wait in barriers when computing rank1 problem solution in the tridiagonal solver algorithm."); + desc.add_options()("dlaf:bt-band-to-tridiag-hh-apply-group-size", pika::program_options::value(), "The application of the HH reflector is splitted in smaller applications of group size reflectors."); + desc.add_options()("dlaf:communicator-grid-num-pipelines", pika::program_options::value(), "The default number of row, column, and full communicator pipelines to initialize in CommunicatorGrid."); // clang-format on return desc; diff --git a/src/tune.cpp b/src/tune.cpp index 8bd64d23ed..03da20c830 100644 --- a/src/tune.cpp +++ b/src/tune.cpp @@ -19,6 +19,9 @@ TuneParameters& getTuneParameters() { } std::ostream& operator<<(std::ostream& os, const TuneParameters& params) { + os << " tfactor_num_threads = " << params.tfactor_num_threads << std::endl; + os << " tfactor_num_streams = " << params.tfactor_num_streams << std::endl; + os << " tfactor_barrier_busy_wait_us = " << params.tfactor_barrier_busy_wait_us << std::endl; os << " red2band_panel_nworkers = " << params.red2band_panel_nworkers << std::endl; os << " red2band_barrier_busy_wait_us = " << params.red2band_barrier_busy_wait_us << std::endl; os << " tridiag_rank1_nworkers = " << params.tridiag_rank1_nworkers << std::endl; diff --git a/test/unit/factorization/test_compute_t_factor.cpp b/test/unit/factorization/test_compute_t_factor.cpp index 7cafc2fedd..542467e640 100644 --- a/test/unit/factorization/test_compute_t_factor.cpp +++ b/test/unit/factorization/test_compute_t_factor.cpp @@ -8,6 +8,7 @@ // SPDX-License-Identifier: BSD-3-Clause // +#include #include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #include // workaround for importing lapack.hh #include #include @@ -26,6 +28,7 @@ #include #include #include +#include #include #include @@ -260,6 +263,8 @@ std::vector void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, const SizeType nb, const GlobalElementIndex v_start) { + using dlaf::factorization::internal::computeTFactor; + ASSERT_LE(v_start.row() + k, m); ASSERT_LE(v_start.col() + k, nb); @@ -301,8 +306,16 @@ void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, c panel_v.setTile(i, splitTile(v.get().read(i), panel_view(i))); } - using dlaf::factorization::internal::computeTFactor; - computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx)); + auto workspaces = [k]() -> matrix::Panel { + const SizeType nworkspaces = to_SizeType( + std::max(0, factorization::internal::get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = k; + return matrix::Panel({{nworkspaces * nrefls_step, nrefls_step}, + {nrefls_step, nrefls_step}}); + }(); + + computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx), + workspaces); } // Note: @@ -329,6 +342,8 @@ void testComputeTFactor(const SizeType m, const SizeType k, const SizeType mb, c template void testComputeTFactor(comm::CommunicatorGrid& grid, const SizeType m, const SizeType k, const SizeType mb, const SizeType nb, const GlobalElementIndex v_start) { + using 
dlaf::factorization::internal::computeTFactor; + ASSERT_LE(v_start.row() + k, m); ASSERT_LE(v_start.col() + k, nb); @@ -382,9 +397,16 @@ void testComputeTFactor(comm::CommunicatorGrid& grid, const SizeType m, const Si panel_v.setTile(i, splitTile(v.get().read(i), panel_view(i))); } - using dlaf::factorization::internal::computeTFactor; + auto workspaces = [k]() -> matrix::Panel { + const SizeType nworkspaces = to_SizeType( + std::max(0, factorization::internal::get_tfactor_num_workers() - 1)); + const SizeType nrefls_step = k; + return matrix::Panel({{nworkspaces * nrefls_step, nrefls_step}, + {nrefls_step, nrefls_step}}); + }(); + computeTFactor(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), t_output.get().readwrite(t_idx), - serial_comm); + workspaces, serial_comm); } // Note:
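For reference, a short sketch of the new calling convention exercised by the tests above. Note that angle-bracketed template arguments and include targets were dropped from this rendering of the patch, so the explicit <B>, <Coord::Col, T, D> arguments below are reconstructions rather than verbatim quotes, and panel_v, mat_taus, mat_t and t_idx stand for objects the caller already owns.

// Sketch: B = Backend, D = Device, T = element type of the surrounding algorithm.
using dlaf::factorization::internal::computeTFactor;
using dlaf::factorization::internal::get_tfactor_num_workers;

const SizeType k = panel_v.getWidth();
// get_tfactor_num_workers() returns at least 1: worker 0 accumulates directly into the T tile,
// every additional worker needs one (k x k) workspace tile that is reduced into T afterwards.
const SizeType nworkspaces = to_SizeType(get_tfactor_num_workers<B>() - 1);
matrix::Panel<Coord::Col, T, D> workspaces({{nworkspaces * k, k}, {k, k}});

// Local variant; the distributed overload additionally takes the column communicator pipeline.
computeTFactor<B>(panel_v, mat_taus.read(GlobalTileIndex(0, 0)), mat_t.readwrite(t_idx), workspaces);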
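The batching logic of split_tfactor_work in t_factor_impl.h is easiest to see with concrete numbers. The self-contained snippet below mirrors its arithmetic only; the names ceil_div and TfactorWorkSplit are invented for the illustration, while the real code uses util::ceilDiv and the anonymous struct returned by split_tfactor_work.

#include <algorithm>
#include <cstddef>

constexpr std::size_t ceil_div(std::size_t num, std::size_t den) {
  return (num + den - 1) / den;
}

struct TfactorWorkSplit {
  std::size_t nworkers;
  std::size_t batch_size;
};

// Mirrors split_tfactor_work: ideally one worker per panel tile, clamped to the available workers;
// each worker then processes a contiguous batch of tiles, and the actual worker count is
// recomputed from the batch size.
constexpr TfactorWorkSplit split(std::size_t nrtiles, std::size_t available_workers) {
  const std::size_t min_workers = 1;
  const std::size_t ideal_workers = nrtiles;
  const std::size_t batch_size =
      ceil_div(nrtiles, std::clamp(ideal_workers, min_workers, available_workers));
  return {std::max<std::size_t>(1, ceil_div(nrtiles, batch_size)), batch_size};
}

// E.g. split(10, 4) == {4, 3}: workers 0-2 take 3 tiles each and worker 3 takes the last tile;
// split(2, 8) == {2, 1}: no more workers are spawned than there are tiles.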