diff --git a/benchmarks/small_inv.cpp b/benchmarks/small_inv.cpp index 23dfe5b30..3c43d11b1 100644 --- a/benchmarks/small_inv.cpp +++ b/benchmarks/small_inv.cpp @@ -12,7 +12,7 @@ static void inv(benchmark::State &state) { for (int i = 0; i < N; ++i) for (int j = 0; j < N; ++j) W(i, j) = (i > j ? 0.5 + i + 2.5 * j : i * 0.8 - j - 0.5); - while (state.KeepRunning()) { benchmark::DoNotOptimize(Wi = inverse(W)); } + while (state.KeepRunning()) { benchmark::DoNotOptimize(Wi = nda::linalg::inv(W)); } } BENCHMARK_TEMPLATE(inv, 1); diff --git a/c++/nda/_impl_basic_array_view_common.hpp b/c++/nda/_impl_basic_array_view_common.hpp index b2fabab0b..120ff1e5b 100644 --- a/c++/nda/_impl_basic_array_view_common.hpp +++ b/c++/nda/_impl_basic_array_view_common.hpp @@ -438,11 +438,13 @@ auto &operator=(R const &rhs) noexcept private: // Implementation of the assignment from an n-dimensional array type. template -void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept { +void assign_from_ndarray(RHS const &rhs) noexcept { #ifdef NDA_ENFORCE_BOUNDCHECK - if (this->shape() != rhs.shape()) - NDA_RUNTIME_ERROR << "Error in assign_from_ndarray: Size mismatch:" - << "\n LHS.shape() = " << this->shape() << "\n RHS.shape() = " << rhs.shape(); + if (this->shape() != rhs.shape()) { + std::cerr << "Error in assign_from_ndarray: Size mismatch:" + << "\n LHS.shape() = " << this->shape() << "\n RHS.shape() = " << rhs.shape() << std::endl; + std::terminate(); + } #endif // compile-time check if assignment is possible static_assert(std::is_assignable_v>, "Error in assign_from_ndarray: Incompatible value types"); @@ -453,6 +455,10 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept { // do both operands have the same stride order? static constexpr bool same_stride_order = get_layout_info.stride_order == get_layout_info.stride_order; + // compile-time check for device arrays to avoid runtime errors + static_assert(!(mem::on_device or mem::on_device) or (both_in_memory and same_stride_order and have_same_value_type_v), + "Error in assign_from_ndarray: Assignment to/from device arrays is not supported for the given types."); + // prefer optimized options if possible if constexpr (both_in_memory and same_stride_order) { if (rhs.empty()) return; @@ -470,7 +476,10 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept { auto [n_bl_dst, bl_size_dst, bl_str_dst] = *bl_layout_dst; auto [n_bl_src, bl_size_src, bl_str_src] = *bl_layout_src; // check that the total memory size is the same - if (n_bl_dst * bl_size_dst != n_bl_src * bl_size_src) NDA_RUNTIME_ERROR << "Error in assign_from_ndarray: Incompatible block sizes"; + if (n_bl_dst * bl_size_dst != n_bl_src * bl_size_src) { + std::cerr << "Error in assign_from_ndarray: Incompatible block sizes" << std::endl; + std::terminate(); + } // if either destination or source consists of a single block, we can chunk it up to make the layouts compatible if (n_bl_dst == 1 && n_bl_src > 1) { n_bl_dst = n_bl_src; @@ -494,7 +503,8 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept { } // otherwise fallback to elementwise assignment if constexpr (mem::on_device || mem::on_device) { - NDA_RUNTIME_ERROR << "Error in assign_from_ndarray: Fallback to elementwise assignment not implemented for arrays/views on the GPU"; + std::cerr << "Error in assign_from_ndarray: Elementwise assignment not implemented for arrays/views on the GPU" << std::endl; + std::terminate(); } nda::for_each(shape(), [this, &rhs](auto const &...args) { (*this)(args...) 
= rhs(args...); }); } @@ -502,7 +512,6 @@ void assign_from_ndarray(RHS const &rhs) { // FIXME noexcept { // Implementation to fill a view/array with a constant scalar value. template void fill_with_scalar(Scalar const &scalar) noexcept { - // we make a special implementation if the array is strided in 1d or contiguous if constexpr (mem::on_host) { if constexpr (has_layout_strided_1d) { const long L = size(); @@ -517,8 +526,8 @@ void fill_with_scalar(Scalar const &scalar) noexcept { } else { for (auto &x : *this) x = scalar; } - } else if constexpr (mem::on_device or mem::on_unified) { // on device - if constexpr (has_layout_strided_1d) { // possibly contiguous + } else if constexpr (mem::on_device or mem::on_unified) { + if constexpr (has_layout_strided_1d) { if constexpr (has_contiguous_layout) { mem::fill_n>(data(), size(), value_type(scalar)); } else { @@ -526,14 +535,14 @@ void fill_with_scalar(Scalar const &scalar) noexcept { mem::fill2D_n>(data(), stri, 1, size(), value_type(scalar)); } } else { - // check for 2D layout auto bl_layout = get_block_layout(*this); if (bl_layout) { auto [n_bl, bl_size, bl_str] = *bl_layout; mem::fill2D_n>(data(), bl_str, bl_size, n_bl, value_type(scalar)); } else { // MAM: implement recursive call to fill_with_scalar on (i,nda::ellipsis{}) - NDA_RUNTIME_ERROR << "fill_with_scalar: Not implemented yet for general layout. "; + std::cerr << "Error in fill_with_scalar: Only block strided arrays/views are supported on the GPU"; + std::terminate(); } } } diff --git a/c++/nda/arithmetic.hpp b/c++/nda/arithmetic.hpp index 5c771fe95..3b76167b7 100644 --- a/c++/nda/arithmetic.hpp +++ b/c++/nda/arithmetic.hpp @@ -12,8 +12,9 @@ #include "./concepts.hpp" #include "./declarations.hpp" +#include "./linalg/inv.hpp" #include "./linalg/matmul.hpp" -#include "./linalg/det_and_inverse.hpp" +#include "./linalg/matvecmul.hpp" #include "./macros.hpp" #include "./stdutil/complex.hpp" #include "./traits.hpp" @@ -420,10 +421,10 @@ namespace nda { static_assert(r_algebra != 'A', "Error in nda::operator*: Can not multiply a matrix by an array"); if constexpr (r_algebra == 'M') // matrix * matrix - return matmul(std::forward(l), std::forward(r)); + return linalg::matmul(std::forward(l), std::forward(r)); else // matrix * vector - return matvecmul(std::forward(l), std::forward(r)); + return linalg::matvecmul(std::forward(l), std::forward(r)); } } @@ -495,7 +496,7 @@ namespace nda { // two matrices: M / M if constexpr (l_algebra == 'M') { static_assert(r_algebra == 'M', "Error in nda::operator*: Can not divide a matrix by an array/vector"); - return std::forward(l) * inverse(matrix>{std::forward(r)}); + return std::forward(l) * linalg::inv(matrix>{std::forward(r)}); } } @@ -532,7 +533,7 @@ namespace nda { Array auto operator/(S &&s, A &&a) { // NOLINT (S&& is mandatory for proper concept Array <: typename to work) static constexpr char algebra = get_algebra; if constexpr (algebra == 'M') - return s * inverse(matrix>{std::forward(a)}); + return s * linalg::inv(matrix>{std::forward(a)}); else return expr<'/', std::decay_t, A>{s, std::forward(a)}; } diff --git a/c++/nda/basic_array.hpp b/c++/nda/basic_array.hpp index c36439235..ecb377972 100644 --- a/c++/nda/basic_array.hpp +++ b/c++/nda/basic_array.hpp @@ -14,14 +14,16 @@ #include "./basic_array_view.hpp" #include "./basic_functions.hpp" #include "./concepts.hpp" -#include "./exceptions.hpp" #include "./iterators.hpp" #include "./layout/for_each.hpp" #include "./layout/permutation.hpp" #include "./layout/range.hpp" +#include 
"./layout/slice_static.hpp" #include "./layout_transforms.hpp" #include "./macros.hpp" +#include "./matrix_functions.hpp" #include "./mem/address_space.hpp" +#include "./mem/fill.hpp" #include "./mem/memcpy.hpp" #include "./mem/policies.hpp" #include "./stdutil/array.hpp" @@ -31,17 +33,14 @@ #include #include #include +#include #include +#include #include #include #include #include -#ifdef NDA_ENFORCE_BOUNDCHECK -#include -#include -#endif // NDA_ENFORCE_BOUNDCHECK - namespace nda { /** diff --git a/c++/nda/basic_array_view.hpp b/c++/nda/basic_array_view.hpp index 412e3e9e0..73d7ddce7 100644 --- a/c++/nda/basic_array_view.hpp +++ b/c++/nda/basic_array_view.hpp @@ -14,18 +14,18 @@ #include "./clef.hpp" #include "./concepts.hpp" #include "./declarations.hpp" -#include "./exceptions.hpp" #include "./iterators.hpp" -#include "layout/slice_static.hpp" #include "./layout/for_each.hpp" #include "./layout/idx_map.hpp" #include "./layout/permutation.hpp" #include "./layout/range.hpp" +#include "./layout/slice_static.hpp" #include "./macros.hpp" +#include "./matrix_functions.hpp" #include "./mem/address_space.hpp" +#include "./mem/fill.hpp" #include "./mem/memcpy.hpp" #include "./mem/memset.hpp" -#include "./mem/fill.hpp" #include "./mem/policies.hpp" #include "./traits.hpp" @@ -34,16 +34,13 @@ #include #include #include +#include +#include #include #include #include #include -#ifdef NDA_ENFORCE_BOUNDCHECK -#include -#include -#endif // NDA_ENFORCE_BOUNDCHECK - namespace std { /** diff --git a/c++/nda/blas/dot.hpp b/c++/nda/blas/dot.hpp index 1357303a6..14fdee49c 100644 --- a/c++/nda/blas/dot.hpp +++ b/c++/nda/blas/dot.hpp @@ -5,7 +5,7 @@ /** * @file - * @brief Provides a generic interface to the BLAS `dot` routine. + * @brief Provides a generic interface to the BLAS `dot`, `dotu` and `dotc` routine. */ #pragma once @@ -13,15 +13,12 @@ #include "./interface/cxx_interface.hpp" #include "../concepts.hpp" #include "../macros.hpp" -#include "../mapped_functions.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" #ifndef NDA_HAVE_DEVICE #include "../device.hpp" -#endif - -#include +#endif // NDA_HAVE_DEVICE namespace nda::blas { @@ -31,44 +28,37 @@ namespace nda::blas { */ /** - * @brief Interface to the BLAS `dot` routine. + * @brief Interface to the BLAS `dot` and `dotu` routine. * * @details This function forms the dot product of two vectors. It calculates - * - \f$ \mathbf{x}^T \mathbf{y} \f$ in case that both \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ are vectors, - * - \f$ x \mathbf{y} \f$ in case that \f$ x \f$ is a scalar and \f$ \mathbf{y} \f$ is a vector, - * - \f$ \mathbf{x} y \f$ in case that \f$ \mathbf{x} \f$ is a vector and \f$ y \f$ is a scalar or - * - \f$ x y \f$ in case that both \f$ x \f$ and \f$ y \f$ are scalars. + * \f[ + * \mathbf{x}^T \mathbf{y} \; . + * \f] * - * @tparam X nda::MemoryVector or nda::Scalar type. - * @tparam Y nda::MemoryVector or nda::Scalar type. - * @param x Input vector/scalar. - * @param y Input vector/scalar. - * @return Vector/scalar result of the dot product. + * @note The first argument is never conjugated. Even for complex types. Use nda::blas::dotc for that. + * + * @tparam X nda::MemoryVector type. + * @tparam Y nda::MemoryVector type. + * @param x Input vector \f$ \mathbf{x} \f$. + * @param y Input vector \f$ \mathbf{y} \f$. + * @return Result of \f$ \mathbf{x}^T \mathbf{y} \f$. 
*/ - template - requires((Scalar or MemoryVector) and (Scalar or MemoryVector)) + template + requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) auto dot(X const &x, Y const &y) { - if constexpr (Scalar or Scalar) { - return x * y; - } else { - // compile-time checks - static_assert(have_same_value_type_v, "Error in nda::blas::dot: Incompatible value types"); - static_assert(mem::have_compatible_addr_space, "Error in nda::blas::dot: Incompatible memory address spaces"); - static_assert(is_blas_lapack_v>, "Error in nda::blas::dot: Value types incompatible with blas"); + // check the dimensions of the input/output arrays/views + EXPECTS(x.size() == y.size()); - // runtime check - EXPECTS(x.shape() == y.shape()); - - if constexpr (mem::have_device_compatible_addr_space) { + // perform actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) - return device::dot(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); + return device::dot(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); #else - compile_error_no_gpu(); - return get_value_t(0); + compile_error_no_gpu(); + return get_value_t(0); #endif - } else { - return f77::dot(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); - } + } else { + return f77::dot(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); } } @@ -76,115 +66,36 @@ namespace nda::blas { * @brief Interface to the BLAS `dotc` routine. * * @details This function forms the dot product of two vectors. It calculates - * - \f$ \mathbf{x}^H \mathbf{y} \f$ in case that both \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ are vectors, - * - \f$ \bar{x} \mathbf{y} \f$ in case that \f$ x \f$ is a scalar and \f$ \mathbf{y} \f$ is a vector, - * - \f$ \mathbf{x}^H y \f$ in case that \f$ \mathbf{x} \f$ is a vector and \f$ y \f$ is a scalar or - * - \f$ \bar{x} y \f$ in case that both \f$ x \f$ and \f$ y \f$ are scalars. + * \f[ + * \mathbf{x}^H \mathbf{y} \; . + * \f] + * + * If the value type of the input vectors is real, it calls nda::blas::dot and returns a real result. * - * @tparam X nda::MemoryVector or nda::Scalar type. - * @tparam Y nda::MemoryVector or nda::Scalar type. - * @param x Input vector/scalar. - * @param y Input vector/scalar. - * @return Vector/scalar result of the dot product. + * @tparam X nda::MemoryVector type. + * @tparam Y nda::MemoryVector type. + * @param x Input vector \f$ \mathbf{x} \f$. + * @param y Input vector \f$ \mathbf{y} \f$. + * @return Result of \f$ \mathbf{x}^H \mathbf{y} \f$. 
*/ - template - requires((Scalar or MemoryVector) and (Scalar or MemoryVector)) + template + requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) auto dotc(X const &x, Y const &y) { - if constexpr (Scalar or Scalar) { - return conj(x) * y; - } else { - // compile-time checks - static_assert(have_same_value_type_v, "Error in nda::blas::dotc: Incompatible value types"); - static_assert(mem::have_compatible_addr_space, "Error in nda::blas::dotc: Incompatible memory address spaces"); - static_assert(is_blas_lapack_v>, "Error in nda::blas::dotc: Value types incompatible with blas"); - - // runtime check - EXPECTS(x.shape() == y.shape()); + // check the dimensions of the input/output arrays/views + EXPECTS(x.size() == y.size()); - if constexpr (!is_complex_v>) { - return dot(x, y); - } else if constexpr (mem::have_device_compatible_addr_space) { + // perform actual library call + if constexpr (!is_complex_v>) { + return dot(x, y); + } else if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) - return device::dotc(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); + return device::dotc(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); #else - compile_error_no_gpu(); - return get_value_t(0); + compile_error_no_gpu(); + return get_value_t(0); #endif - } else { - return f77::dotc(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); - } - } - } - - namespace detail { - - // Implementation of the nda::dot_generic and nda::dotc_generic functions. - template - auto _dot_impl(X const &x, Y const &y) { - EXPECTS(x.shape() == y.shape()); - long N = x.shape()[0]; - - auto _conj = [](auto z) __attribute__((always_inline)) { - if constexpr (star and is_complex_v) { - return std::conj(z); - } else - return z; - }; - - if constexpr (has_layout_smallest_stride_is_one and has_layout_smallest_stride_is_one) { - if constexpr (is_regular_or_view_v and is_regular_or_view_v) { - auto *__restrict px = x.data(); - auto *__restrict py = y.data(); - auto res = _conj(px[0]) * py[0]; - for (size_t i = 1; i < N; ++i) { res += _conj(px[i]) * py[i]; } - return res; - } else { - auto res = _conj(x(_linear_index_t{0})) * y(_linear_index_t{0}); - for (long i = 1; i < N; ++i) { res += _conj(x(_linear_index_t{i})) * y(_linear_index_t{i}); } - return res; - } - } else { - auto res = _conj(x(0)) * y(0); - for (long i = 1; i < N; ++i) { res += _conj(x(i)) * y(i); } - return res; - } - } - - } // namespace detail - - /** - * @brief Generic implementation of nda::blas::dot for types not supported by BLAS/LAPACK. - * - * @tparam X Vector/Scalar type. - * @tparam Y Vector/Scalar type. - * @param x Input vector/scalar. - * @param y Input vector/scalar. - * @return Vector/scalar result of the dot product. - */ - template - auto dot_generic(X const &x, Y const &y) { - if constexpr (Scalar or Scalar) { - return x * y; - } else { - return detail::_dot_impl(x, y); - } - } - - /** - * @brief Generic implementation of nda::blas::dotc for types not supported by BLAS/LAPACK. - * - * @tparam X Vector/Scalar type. - * @tparam Y Vector/Scalar type. - * @param x Input vector/scalar. - * @param y Input vector/scalar. - * @return Vector/scalar result of the dot product. 
- */ - template - auto dotc_generic(X const &x, Y const &y) { - if constexpr (Scalar or Scalar) { - return conj(x) * y; } else { - return detail::_dot_impl(x, y); + return f77::dotc(x.size(), x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0]); } } diff --git a/c++/nda/blas/gemm.hpp b/c++/nda/blas/gemm.hpp index b4c5087d2..04002b8f9 100644 --- a/c++/nda/blas/gemm.hpp +++ b/c++/nda/blas/gemm.hpp @@ -20,7 +20,7 @@ #ifndef NDA_HAVE_DEVICE #include "../device.hpp" -#endif +#endif // NDA_HAVE_DEVICE #include #include @@ -32,103 +32,62 @@ namespace nda::blas { * @{ */ - /** - * @brief Generic nda::blas::gemm implementation for types not supported by BLAS/LAPACK. - * - * @tparam A Some matrix type. - * @tparam B Some matrix type. - * @tparam C Some matrix type. - * @param alpha Input scalar. - * @param a Input matrix of size m-by-k. - * @param b Input matrix of size k-by-n. - * @param beta Input scalar. - * @param c Input/Output matrix of size m-by-n. - */ - template - void gemm_generic(typename A::value_type alpha, A const &a, B const &b, typename A::value_type beta, - C &&c) { // NOLINT (temporary views are allowed here) - EXPECTS(a.extent(1) == b.extent(0)); - EXPECTS(a.extent(0) == c.extent(0)); - EXPECTS(b.extent(1) == c.extent(1)); - - if (beta == 0.0) { - c = 0 * alpha; - } else { - c *= beta; - } - - for (int i = 0; i < a.extent(0); ++i) { - for (int j = 0; j < b.extent(1); ++j) { - for (int k = 0; k < a.extent(1); ++k) c(i, j) += alpha * a(i, k) * b(k, j); - } - } - } - /** * @brief Interface to the BLAS `gemm` routine. * * @details This function performs one of the matrix-matrix operations * \f[ - * \mathbf{C} \leftarrow \alpha \mathrm{op}(\mathbf{A}) \mathrm{op}(\mathbf{B}) + \beta \mathbf{C} \;, + * \mathbf{C} \leftarrow \alpha \mathrm{op}_A(\mathbf{A}) \mathrm{op}_B(\mathbf{B}) + \beta \mathbf{C} \;, * \f] * where \f$ \mathrm{op}(\mathbf{X}) \f$ is one of + * - \f$ \mathrm{op}(\mathbf{X}) = \mathbf{X} \f$ or, + * - \f$ \mathrm{op}(\mathbf{X}) = \mathbf{X}^* \f$ (only if \f$ \mathbf{X} \f$ is in nda::C_layout). * - * - \f$ \mathrm{op}(\mathbf{X}) = \mathbf{X} \f$, - * - \f$ \mathrm{op}(\mathbf{X}) = \mathbf{X}^T \f$ or - * - \f$ \mathrm{op}(\mathbf{X}) = \mathbf{X}^H \f$. + * Here, \f$ \alpha \f$ and \f$ \beta \f$ are scalars, and \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ and \f$ \mathbf{C} + * \f$ are matrices of size \f$ m \times k \f$, \f$ k \times n \f$ and \f$ m \times n \f$, respectively. * - * Here, \f$ \alpha \f$ and \f$ \beta \f$ are scalars, and \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ are matrices with - * \f$ \mathrm{op}(\mathbf{A}) \f$ is an m-by-k matrix, \f$ \mathrm{op}(\mathbf{B}) \f$ is a k-by-n matrix and - * \f$ \mathrm{op}(\mathbf{C}) \f$ is an m-by-n matrix. + * @note If matrix \f$ \mathbf{C} \f$ is in nda::C_layout, we transpose both \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ + * and swap their order. * * @tparam A nda::Matrix type. * @tparam B nda::Matrix type. * @tparam C nda::MemoryMatrix type. - * @param alpha Input scalar. - * @param a Input matrix of size m-by-k. - * @param b Input matrix of size k-by-n. - * @param beta Input scalar. - * @param c Input/Output matrix of size m-by-n. + * @param alpha Input scalar \f$ \alpha \f$. + * @param a Input matrix \f$ \mathrm{op}_A(\mathbf{A}) \f$ of size \f$ m \times k \f$. + * @param b Input matrix \f$ \mathrm{op}_B(\mathbf{B}) \f$ of size \f$ k \times n \f$. + * @param beta Input scalar \f$ \beta \f$. + * @param c Input/Output matrix \f$ \mathbf{C} \f$ of size \f$ m \times n \f$. 
*/ template requires((MemoryMatrix or is_conj_array_expr) and (MemoryMatrix or is_conj_array_expr) - and have_same_value_type_v and is_blas_lapack_v>) + and have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) void gemm(get_value_t alpha, A const &a, B const &b, get_value_t beta, C &&c) { - // get underlying matrix in case it is given as a lazy expression - auto to_mat = [](Z const &z) -> auto & { - if constexpr (is_conj_array_expr) - return std::get<0>(z.a); - else - return z; - }; - auto &mat_a = to_mat(a); - auto &mat_b = to_mat(b); - - // compile-time checks - using mat_a_type = decltype(mat_a); - using mat_b_type = decltype(mat_b); - static_assert(mem::have_compatible_addr_space, "Error in nda::blas::gemm: Incompatible memory address spaces"); - - // runtime checks - EXPECTS(mat_a.extent(1) == mat_b.extent(0)); - EXPECTS(mat_a.extent(0) == c.extent(0)); - EXPECTS(mat_b.extent(1) == c.extent(1)); - EXPECTS(mat_a.indexmap().min_stride() == 1); - EXPECTS(mat_b.indexmap().min_stride() == 1); - EXPECTS(c.indexmap().min_stride() == 1); - - // c is in C order: compute the transpose of the product in Fortran order + // if C is in C-layout, compute the transpose of the product in Fortran order if constexpr (has_C_layout) { gemm(alpha, transpose(b), transpose(a), beta, transpose(std::forward(c))); - } else { // c is in Fortran order - static constexpr bool conj_A = is_conj_array_expr; - static constexpr bool conj_B = is_conj_array_expr; - char op_a = get_op>; - char op_b = get_op>; - auto [m, k] = mat_a.shape(); - auto n = mat_b.extent(1); - - if constexpr (mem::have_device_compatible_addr_space) { + } else { + // get underlying matrix in case it is given as a conjugate expression + auto &mat_a = get_array(a); + auto &mat_b = get_array(b); + + // check the dimensions of the input/output arrays/views + auto const [m, k] = mat_a.shape(); + auto const [l, n] = mat_b.shape(); + EXPECTS(k == l); + EXPECTS(m == c.extent(0)); + EXPECTS(n == c.extent(1)); + + // arrays/views must be BLAS compatible + EXPECTS(mat_a.indexmap().min_stride() == 1); + EXPECTS(mat_b.indexmap().min_stride() == 1); + EXPECTS(c.indexmap().min_stride() == 1); + + // check for conjugate lazy expressions and C-layouts + char op_a = get_op, has_C_layout>; + char op_b = get_op, has_C_layout>; + + // perform the actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::gemm(op_a, op_b, m, n, k, alpha, mat_a.data(), get_ld(mat_a), mat_b.data(), get_ld(mat_b), beta, c.data(), get_ld(c)); #else diff --git a/c++/nda/blas/gemm_batch.hpp b/c++/nda/blas/gemm_batch.hpp index e597eff8c..6fbe3acb7 100644 --- a/c++/nda/blas/gemm_batch.hpp +++ b/c++/nda/blas/gemm_batch.hpp @@ -21,7 +21,7 @@ #ifndef NDA_HAVE_DEVICE #include "../device.hpp" -#endif +#endif // NDA_HAVE_DEVICE #include #include @@ -36,108 +36,104 @@ namespace nda::blas { * @{ */ + namespace detail { + + // Get a vector of transpose matrices from a given vector of matrices. + auto get_transpose_vector(auto &&v) { + auto v_t = std::vector>{}; + v_t.reserve(v.size()); + std::transform(v.begin(), v.end(), std::back_inserter(v_t), [](auto &x) { return transpose(x); }); + return v_t; + } + + // Get a vector of pointers to the memory of matrices from a given vector of matrices. 
+ template + auto get_ptr_vector(auto &&v) { + EXPECTS(std::ranges::all_of(v, [&v](auto &A) { return is_vbatch or A.shape() == v[0].shape(); })); + EXPECTS(std::ranges::all_of(v, [](auto &A) { return get_array(A).indexmap().min_stride() == 1; })); + auto v_ptrs = nda::vector>(v.size()); + std::transform(v.begin(), v.end(), v_ptrs.begin(), [](auto &z) { return get_array(z).data(); }); + return v_ptrs; + } + + } // namespace detail + /** - * @brief Implements a batched version of nda::blas::gemm taking vectors of matrices as arguments. + * @brief Interface to MKL's/CUDA's `gemm_batch` and `gemm_vbatch` routines. * * @details This routine is a batched version of nda::blas::gemm, performing multiple `gemm` operations in a single - * call. Each `gemm` operation performs a matrix-matrix product with general matrices. + * call. Each `gemm` operation performs a matrix-matrix product. + * + * If `is_vbatch` is true, the matrices are allowed to have different sizes. Otherwise, they are required to have the + * same size. + * + * See also nda::blas::gemm for more details. * - * @tparam VBATCH Allow for variable sized matrices. + * @tparam is_vbatch Allow variable sized matrices. * @tparam A nda::Matrix type. * @tparam B nda::Matrix type. * @tparam C nda::MemoryMatrix type. - * @param alpha Input scalar. - * @param va std::vector of input matrices. - * @param vb std::vector of input matrices. - * @param beta Input scalar. - * @param vc std::vector of input/output matrices. + * @param alpha Input scalar \f$ \alpha \f$. + * @param va `std::vector` of input matrices. + * @param vb `std::vector` of input matrices. + * @param beta Input scalar \f$ \beta \f$. + * @param vc `std::vector` of input/output matrices. */ - template + template requires((MemoryMatrix or is_conj_array_expr) and (MemoryMatrix or is_conj_array_expr) - and have_same_value_type_v and is_blas_lapack_v>) + and have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) void gemm_batch(get_value_t alpha, std::vector const &va, std::vector const &vb, get_value_t beta, std::vector &vc) { - // check sizes + // check sizes of input vectors and return if they are empty EXPECTS(va.size() == vb.size() and va.size() == vc.size()); if (va.empty()) return; - int batch_count = va.size(); - - // get underlying matrix in case it is given as a lazy expression - auto to_mat = [](Z &z) -> auto & { - if constexpr (is_conj_array_expr) - return std::get<0>(z.a); - else - return z; - }; - auto &a0 = to_mat(va[0]); - auto &b0 = to_mat(vb[0]); - auto &c0 = vc[0]; - - // compile-time checks - using mat_a_type = decltype(a0); - using mat_b_type = decltype(b0); - static_assert(mem::have_compatible_addr_space, "Error in nda::blas::gemm_batch: Incompatible memory address spaces"); - - // c is in C order: compute the transpose of the product in Fortran order + auto const batch_count = va.size(); + + // if C is in C-layout, compute the transpose of the product in Fortran order if constexpr (has_C_layout) { - // transpose each matrix in the given vector - auto map_transpose = [](auto &v) { - auto vt = std::vector>{}; - vt.reserve(v.size()); - std::transform(v.begin(), v.end(), std::back_inserter(vt), [](auto &x) { return transpose(x); }); - return vt; - }; - auto vct = map_transpose(vc); - gemm_batch(alpha, map_transpose(vb), map_transpose(va), beta, vct); - return; - } else { // c is in Fortran order + auto vc_t = detail::get_transpose_vector(vc); + return gemm_batch(alpha, detail::get_transpose_vector(vb), detail::get_transpose_vector(va), beta,
vc_t); + } else { // for operations on the device, use unified memory for vector of ints or ptrs - auto constexpr vec_adr_spc = []() { return mem::on_host ? mem::Host : mem::Unified; }(); - - // convert the vector of matrices into the associated vector of pointers - auto get_ptrs = [&to_mat](V &v) { - EXPECTS(std::all_of(v.begin(), v.end(), - [&v, &to_mat](auto &z) { return (VBATCH or z.shape() == v[0].shape()) and to_mat(z).indexmap().min_stride() == 1; })); - using value_t = get_value_t; - using ptr_t = std::conditional_t, value_t const *, value_t *>; - auto v_ptrs = nda::vector>(v.size()); - std::transform(v.begin(), v.end(), v_ptrs.begin(), [&to_mat](auto &z) { return to_mat(z).data(); }); - return v_ptrs; - }; - auto a_ptrs = get_ptrs(va); - auto b_ptrs = get_ptrs(vb); - auto c_ptrs = get_ptrs(vc); - - // gather parameters for gemm call - static constexpr bool conj_A = is_conj_array_expr; - static constexpr bool conj_B = is_conj_array_expr; - char op_a = get_op>; - char op_b = get_op>; - - // matrices have different sizes - if constexpr (VBATCH) { - // create vectors of size 'batch_count + 1' as required by Magma - nda::vector> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1), + auto constexpr vec_addr_spc = []() { return mem::on_host ? mem::Host : mem::Unified; }(); + + // convert the vector of matrices to the corresponding vector of pointers + auto a_ptrs = detail::get_ptr_vector const *, is_vbatch, vec_addr_spc>(va); + auto b_ptrs = detail::get_ptr_vector const *, is_vbatch, vec_addr_spc>(vb); + auto c_ptrs = detail::get_ptr_vector *, is_vbatch, vec_addr_spc>(vc); + + // check for conjugate lazy expressions and C-layouts + char op_a = get_op, has_C_layout>; + char op_b = get_op, has_C_layout>; + + // either call gemm_vbatch or gemm_batch + if constexpr (is_vbatch) { + // create vectors to store shapes and leading dimensions of size 'batch_count + 1' as required by Magma + nda::vector> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1), vldb(batch_count + 1), vldc(batch_count + 1); for (auto i : range(batch_count)) { - auto &ai = to_mat(va[i]); - auto &bi = to_mat(vb[i]); - auto &ci = vc[i]; + auto &&mat_a = get_array(va[i]); + auto &&mat_b = get_array(vb[i]); + auto &&mat_c = get_array(vc[i]); - EXPECTS(ai.extent(1) == bi.extent(0)); - EXPECTS(ai.extent(0) == ci.extent(0)); - EXPECTS(bi.extent(1) == ci.extent(1)); + // check the dimensions of the input/output arrays/views + auto const [m, k] = mat_a.shape(); + auto const [l, n] = mat_b.shape(); + EXPECTS(k == l); + EXPECTS(m == mat_c.extent(0)); + EXPECTS(n == mat_c.extent(1)); - vm[i] = ai.extent(0); - vk[i] = ai.extent(1); - vn[i] = bi.extent(1); - - vlda[i] = get_ld(ai); - vldb[i] = get_ld(bi); - vldc[i] = get_ld(ci); + // store shapes and leading dimensions + vm[i] = m; + vk[i] = k; + vn[i] = n; + vlda[i] = get_ld(mat_a); + vldb[i] = get_ld(mat_b); + vldc[i] = get_ld(mat_c); } - if constexpr (mem::have_device_compatible_addr_space) { + // perform the actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta, c_ptrs.data(), vldc.data(), batch_count); @@ -149,23 +145,27 @@ namespace nda::blas { c_ptrs.data(), vldc.data(), batch_count); } } else { - // all matrices have the same size - EXPECTS(a0.extent(1) == b0.extent(0)); - EXPECTS(a0.extent(0) == c0.extent(0)); - 
EXPECTS(b0.extent(1) == c0.extent(1)); + auto &&mat_a = get_array(va[0]); + auto &&mat_b = get_array(vb[0]); + auto &&mat_c = get_array(vc[0]); - auto [m, k] = a0.shape(); - auto n = b0.extent(1); + // check the dimensions of the input/output arrays/views + auto const [m, k] = mat_a.shape(); + auto const [l, n] = mat_b.shape(); + EXPECTS(k == l); + EXPECTS(m == mat_c.extent(0)); + EXPECTS(n == mat_c.extent(1)); - if constexpr (mem::have_device_compatible_addr_space) { + // perform the actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) - device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0), - batch_count); + device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(mat_a), b_ptrs.data(), get_ld(mat_b), beta, c_ptrs.data(), + get_ld(mat_c), batch_count); #else compile_error_no_gpu(); #endif } else { - f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0), + f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(mat_a), b_ptrs.data(), get_ld(mat_b), beta, c_ptrs.data(), get_ld(mat_c), batch_count); } } @@ -173,24 +173,26 @@ namespace nda::blas { } /** - * @brief Wrapper of nda::blas::gemm_batch that allows variable sized matrices. + * @brief Interface to MKL's/Magma's `gemm_vbatch` routine. + * + * @details It simply calls nda::blas::gemm_batch with `is_vbatch` set to true. * * @tparam A nda::Matrix type. * @tparam B nda::Matrix type. * @tparam C nda::MemoryMatrix type. - * @param alpha Input scalar. - * @param va std::vector of input matrices. - * @param vb std::vector of input matrices. - * @param beta Input scalar. - * @param vc std::vector of input/output matrices. + * @param alpha Input scalar \f$ \alpha \f$. + * @param va `std::vector` of input matrices. + * @param vb `std::vector` of input matrices. + * @param beta Input scalar \f$ \beta \f$. + * @param vc `std::vector` of input/output matrices. */ template void gemm_vbatch(get_value_t alpha, std::vector const &va, std::vector const &vb, get_value_t beta, std::vector &vc) { - gemm_batch(alpha, va, vb, beta, vc); + gemm_batch(alpha, va, vb, beta, vc); } /** - * @brief Implements a strided batched version of nda::blas::gemm taking 3-dimensional arrays as arguments. + * @brief Interface to MKL's/CUDA's `gemm_batch_strided` routine. * * @details This function is similar to nda::blas::gemm_batch except that it takes 3-dimensional arrays as arguments * instead of vectors of matrices. The first dimension of the arrays indexes the matrices to be multiplied. @@ -198,69 +200,62 @@ namespace nda::blas { * @tparam A nda::ArrayOfRank<3> type. * @tparam B nda::ArrayOfRank<3> type. * @tparam C nda::ArrayOfRank<3> type. - * @param alpha Input scalar. + * @param alpha Input scalar \f$ \alpha \f$. * @param a 3-dimensional input array. * @param b 3-dimensional input array. - * @param beta Input scalar. + * @param beta Input scalar \f$ \beta \f$. * @param c 3-dimensional input/output array. 
*/ template A, ArrayOfRank<3> B, MemoryArrayOfRank<3> C> - requires((MemoryArrayOfRank or (is_conj_array_expr)) and (MemoryArrayOfRank or (is_conj_array_expr)) - and have_same_value_type_v and is_blas_lapack_v>) + requires((MemoryArrayOfRank or is_conj_array_expr) and (MemoryArrayOfRank or is_conj_array_expr) + and have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) void gemm_batch_strided(get_value_t alpha, A const &a, B const &b, get_value_t beta, C &&c) { - // check number of matrices + // check sizes of input arrays (number of matrices) and return if they are empty EXPECTS(a.shape()[0] == b.shape()[0] and a.shape()[0] == c.shape()[0]); + if (a.size() == 0) return; + auto const batch_count = a.shape()[0]; - // get underlying array in case it is given as a lazy expression - auto to_arr = [](Z &z) -> auto & { - if constexpr (is_conj_array_expr) - return std::get<0>(z.a); - else - return z; - }; - auto arr_a = to_arr(a); - auto arr_b = to_arr(b); - - // compile-time check - using arr_a_type = decltype(arr_a); - using arr_b_type = decltype(arr_b); - static_assert(mem::have_compatible_addr_space, - "Error in nda::blas::gemm_batch_strided: Incompatible memory address spaces"); - - // runtime checks - auto _ = nda::range::all; - auto a0 = arr_a(0, _, _); - auto b0 = arr_b(0, _, _); - auto c0 = c(0, _, _); - EXPECTS(a0.extent(1) == b0.extent(0)); - EXPECTS(a0.extent(0) == c0.extent(0)); - EXPECTS(b0.extent(1) == c0.extent(1)); - EXPECTS(arr_a.indexmap().min_stride() == 1); - EXPECTS(arr_b.indexmap().min_stride() == 1); - EXPECTS(c.indexmap().min_stride() == 1); - - // c is in C order: compute the transpose of the product in Fortran order + // if C is in C-layout, compute the transpose of the product in Fortran order if constexpr (has_C_layout) { gemm_batch_strided(alpha, transposed_view<1, 2>(b), transposed_view<1, 2>(a), beta, transposed_view<1, 2>(std::forward(c))); return; - } else { // c is in Fortran order - static constexpr bool conj_A = is_conj_array_expr; - static constexpr bool conj_B = is_conj_array_expr; - char op_a = get_op>; - char op_b = get_op>; - auto [m, k] = a0.shape(); - auto n = b0.extent(1); - - if constexpr (mem::have_device_compatible_addr_space) { + } else { + // get underlying array in case it is given as a conjugate expression + auto arr_a = get_array(a); + auto arr_b = get_array(b); + + // get views of the first matrix in the batch + auto a0 = arr_a(0, nda::range::all, nda::range::all); + auto b0 = arr_b(0, nda::range::all, nda::range::all); + auto c0 = c(0, nda::range::all, nda::range::all); + + // check the dimensions of the input/output arrays/views + auto const [m, k] = a0.shape(); + auto const [l, n] = b0.shape(); + EXPECTS(k == l); + EXPECTS(m == c0.extent(0)); + EXPECTS(n == c0.extent(1)); + + // arrays/views must be BLAS compatible + EXPECTS(arr_a.indexmap().min_stride() == 1); + EXPECTS(arr_b.indexmap().min_stride() == 1); + EXPECTS(c.indexmap().min_stride() == 1); + + // check for conjugate lazy expressions and C-layouts + char op_a = get_op, has_C_layout>; + char op_b = get_op, has_C_layout>; + + // perform the actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0), - arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0)); + arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], batch_count); #else 
compile_error_no_gpu(); #endif } else { f77::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0), - arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0)); + arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], batch_count); } } } diff --git a/c++/nda/blas/gemv.hpp b/c++/nda/blas/gemv.hpp index 50234d4eb..9694e241d 100644 --- a/c++/nda/blas/gemv.hpp +++ b/c++/nda/blas/gemv.hpp @@ -19,9 +19,10 @@ #ifndef NDA_HAVE_DEVICE #include "../device.hpp" -#endif +#endif // NDA_HAVE_DEVICE #include +#include namespace nda::blas { @@ -30,85 +31,48 @@ namespace nda::blas { * @{ */ - /** - * @brief Generic nda::blas::gemv implementation for types not supported by BLAS/LAPACK. - * - * @tparam A Some matrix type. - * @tparam X Some vector type. - * @tparam Y Some vector type. - * @param alpha Input scalar. - * @param a Input matrix of size m-by-n. - * @param x Input vector of size n. - * @param beta Input scalar. - * @param y Input/Output vector of size m. - */ - template - void gemv_generic(get_value_t alpha, A const &a, X const &x, get_value_t beta, Y &&y) { // NOLINT (temporary views are allowed here) - EXPECTS(a.extent(1) == x.extent(0)); - EXPECTS(a.extent(0) == y.extent(0)); - - if (beta == 0.0) { - y = 0 * alpha; - } else { - y *= beta; - } - - for (int i = 0; i < a.extent(0); ++i) { - for (int k = 0; k < a.extent(1); ++k) y(i) += alpha * a(i, k) * x(k); - } - } - /** * @brief Interface to the BLAS `gemv` routine. * * @details This function performs one of the matrix-vector operations * * - \f$ \mathbf{y} \leftarrow \alpha \mathbf{A} \mathbf{x} + \beta \mathbf{y} \f$, - * - \f$ \mathbf{y} \leftarrow \alpha \mathbf{A}^T \mathbf{x} + \beta \mathbf{y} \f$, - * - \f$ \mathbf{y} \leftarrow \alpha \mathbf{A}^H \mathbf{x} + \beta \mathbf{y} \f$, + * - \f$ \mathbf{y} \leftarrow \alpha \mathbf{A}^* \mathbf{x} + \beta \mathbf{y} \f$ (only if \f$ \mathbf{A} \f$ is + * in nda::C_layout), * - * where \f$ \alpha \f$ and \f$ \beta \f$ are scalars, \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ are vectors and - * \f$ \mathbf{A} \f$ is an m-by-n matrix. + * where \f$ \alpha \f$ and \f$ \beta \f$ are scalars, \f$ \mathbf{A} \f$ is an \f$ m \times n \f$ matrix and \f$ + * \mathbf{x} \f$ and \f$ \mathbf{y} \f$ are vectors of sizes \f$ n \f$ and \f$ m \f$, respectively. * * @tparam A nda::Matrix type. * @tparam X nda::MemoryVector type. * @tparam Y nda::MemoryVector type. - * @param alpha Input scalar. - * @param a Input matrix of size m-by-n. - * @param x Input vector of size n. - * @param beta Input scalar. - * @param y Input/Output vector of size m. + * @param alpha Input scalar \f$ \alpha \f$. + * @param a Input matrix \f$ \mathbf{A} \f$ of size \f$ m \times n \f$. + * @param x Input vector \f$ \mathbf{x} \f$ of size \f$ n \f$. + * @param beta Input scalar \f$ \beta \f$. + * @param y Input/Output vector \f$ \mathbf{y} \f$ of size \f$ m \f$. 
*/ template - requires((MemoryMatrix or is_conj_array_expr) and have_same_value_type_v and is_blas_lapack_v>) + requires((MemoryMatrix or is_conj_array_expr) + and have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) void gemv(get_value_t alpha, A const &a, X const &x, get_value_t beta, Y &&y) { // NOLINT (temporary views are allowed here) - // get underlying matrix in case it is given as a lazy expression - auto to_mat = [](Z const &z) -> decltype(auto) { - if constexpr (is_conj_array_expr) - return std::get<0>(z.a); - else - return z; - }; - auto &mat = to_mat(a); + // get the underlying matrix in case it is given as a conjugate expression + auto &mat = get_array(a); - // compile-time checks - using mat_type = decltype(mat); - static_assert(mem::have_compatible_addr_space); + // check the dimensions of the input/output arrays/views + auto [m, n] = mat.shape(); + EXPECTS(m == y.size()); + EXPECTS(n == x.size()); - // runtime checks - EXPECTS(mat.extent(1) == x.extent(0)); - EXPECTS(mat.extent(0) == y.extent(0)); + // arrays/views must be BLAS compatible EXPECTS(mat.indexmap().min_stride() == 1); - EXPECTS(x.indexmap().min_stride() == 1); - EXPECTS(y.indexmap().min_stride() == 1); - // gather parameters for gemv call - static constexpr bool conj_A = is_conj_array_expr; - char op_a = get_op>; - auto [m, n] = mat.shape(); - if constexpr (has_C_layout) std::swap(m, n); + // check for conjugate lazy expressions and C-layouts + char op_a = get_op, has_C_layout>; + if constexpr (has_C_layout) std::swap(m, n); - if constexpr (mem::have_device_compatible_addr_space) { + // perform actual library call + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::gemv(op_a, m, n, alpha, mat.data(), get_ld(mat), x.data(), x.indexmap().strides()[0], beta, y.data(), y.indexmap().strides()[0]); #else diff --git a/c++/nda/blas/ger.hpp b/c++/nda/blas/ger.hpp index ebba78505..568069884 100644 --- a/c++/nda/blas/ger.hpp +++ b/c++/nda/blas/ger.hpp @@ -5,27 +5,23 @@ /** * @file - * @brief Provides a generic interface to the BLAS `ger` routine and an outer product routine. + * @brief Provides a generic interface to the BLAS `ger`, `geru` and `gerc` routine. */ #pragma once #include "./interface/cxx_interface.hpp" #include "./tools.hpp" -#include "../basic_functions.hpp" #include "../concepts.hpp" -#include "../exceptions.hpp" #include "../layout_transforms.hpp" +#include "../layout/policies.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" -#include "../stdutil/array.hpp" #include "../traits.hpp" #ifndef NDA_HAVE_DEVICE #include "../device.hpp" -#endif - -#include +#endif // NDA_HAVE_DEVICE namespace nda::blas { @@ -35,38 +31,42 @@ namespace nda::blas { */ /** - * @brief Interface to the BLAS `ger` routine. + * @brief Interface to the BLAS `ger` and `geru` routine. * * @details This function performs the rank 1 operation * \f[ - * \mathbf{M} \leftarrow \alpha \mathbf{x} \mathbf{y}^H + \mathbf{M} ;, + * \mathbf{M} \leftarrow \alpha \mathbf{x} \mathbf{y}^T + \mathbf{M} \; , * \f] - * where \f$ \alpha \f$ is a scalar, \f$ \mathbf{x} \f$ is an m element vector, \f$ \mathbf{y} \f$ is an n element - * vector and \f$ \mathbf{M} \f$ is an m-by-n matrix. + * where \f$ \alpha \f$ is a scalar, \f$ \mathbf{x} \f$ is an \f$ m \f$ element vector, \f$ \mathbf{y} \f$ is an \f$ n + * \f$ element vector and \f$ \mathbf{M} \f$ is an \f$ m \times n \f$ matrix. + * + * @note The vector \f$ \mathbf{y} \f$ is never conjugated. Even for complex types. 
Use nda::blas::gerc for that. * * @tparam X nda::MemoryVector type. * @tparam Y nda::MemoryVector type. * @tparam M nda::MemoryMatrix type. - * @param alpha Input scalar. - * @param x Input left vector (column vector) of size m. - * @param y Input right vector (row vector) of size n. - * @param m Input/Output matrix of size m-by-n to which the outer product is added. + * @param alpha Input scalar \f$ \alpha \f$. + * @param x Input vector \f$ \mathbf{x} \f$ of size \f$ m \f$. + * @param y Input vector \f$ \mathbf{y} \f$ of size \f$ n \f$. + * @param m Input/Output matrix \f$ \mathbf{M} \f$ of size \f$ m \times n \f$ to which the outer product is added. */ template requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) void ger(get_value_t alpha, X const &x, Y const &y, M &&m) { // NOLINT (temporary views are allowed here) - EXPECTS(m.extent(0) == x.extent(0)); - EXPECTS(m.extent(1) == y.extent(0)); - - // must be lapack compatible - EXPECTS(m.indexmap().min_stride() == 1); - - // if in C, we need to call fortran with transposed matrix - if (has_C_layout) { + // for C-layout arrays/views, call ger with the transpose and swap x and y + if constexpr (has_C_layout) { ger(alpha, y, x, transpose(m)); return; } + // check the dimensions of the input/output arrays/views + EXPECTS(m.extent(0) == x.size()); + EXPECTS(m.extent(1) == y.size()); + + // arrays/views must be BLAS compatible + EXPECTS(m.indexmap().min_stride() == 1); + + // perform actual library call if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::ger(m.extent(0), m.extent(1), alpha, x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0], m.data(), get_ld(m)); @@ -79,39 +79,50 @@ namespace nda::blas { } /** - * @brief Calculate the outer product of two contiguous arrays/views/scalars. - * - * @details For general multidimensional arrays/views, it calculates their tensor outer product, i.e. - * ``` - * c(i,j,k,...,u,v,w,...) = a(i,j,k,...) * b(u,v,w,...) - * ``` - * If one of the arguments is a scalar, it multiplies each element of the other argument by the scalar which returns a - * lazy nda::expr object. - * - * If both arguments are scalars, it returns their products. + * @brief Interface to the BLAS `gerc` routine. + * + * @details This function performs the rank 1 operation + * \f[ + * \mathbf{M} \leftarrow \alpha \mathbf{x} \mathbf{y}^H + \mathbf{M} \; , + * \f] + * where \f$ \alpha \f$ is a scalar, \f$ \mathbf{x} \f$ is an \f$ m \f$ element vector, \f$ \mathbf{y} \f$ is an \f$ n + * \f$ element vector and \f$ \mathbf{M} \f$ is an \f$ m \times n \f$ matrix. + * + * If the value type of the input vectors/matrix is real, it calls nda::blas::ger. + * + * @note \f$ \mathbf{M} \f$ has to be in nda::F_layout. * - * @tparam A nda::ArrayOrScalar type. - * @tparam B nda::ArrayOrScalar type. - * @param a Input array/scalar. - * @param b Input array/scalar. - * @return (Lazy) Outer product. + * @tparam X nda::MemoryVector type. + * @tparam Y nda::MemoryVector type. + * @tparam M nda::MemoryMatrix type. + * @param alpha Input scalar \f$ \alpha \f$. + * @param x Input vector \f$ \mathbf{x} \f$ of size \f$ m \f$. + * @param y Input vector \f$ \mathbf{y} \f$ of size \f$ n \f$. + * @param m Input/Output matrix \f$ \mathbf{M} \f$ of size \f$ m \times n \f$ to which the outer product is added. 
*/ - template - auto outer_product(A const &a, B const &b) { - if constexpr (Scalar or Scalar) { - return a * b; - } else { - if (not a.is_contiguous()) NDA_RUNTIME_ERROR << "Error in nda::blas::outer_product: First argument has non-contiguous layout"; - if (not b.is_contiguous()) NDA_RUNTIME_ERROR << "Error in nda::blas::outer_product: Second argument has non-contiguous layout"; + template + requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) + void gerc(get_value_t alpha, X const &x, Y const &y, M &&m) { // NOLINT (temporary views are allowed here) + static_assert(has_F_layout, "Error in nda::blas::gerc: M must be in Fortran layout"); - // use BLAS ger to calculate the outer product - auto res = zeros, mem::common_addr_space>(stdutil::join(a.shape(), b.shape())); - auto a_vec = reshape(a, std::array{a.size()}); - auto b_vec = reshape(b, std::array{b.size()}); - auto mat = reshape(res, std::array{a.size(), b.size()}); - ger(1.0, a_vec, b_vec, mat); + // check the dimensions of the input/output arrays/views + EXPECTS(m.extent(0) == x.size()); + EXPECTS(m.extent(1) == y.size()); - return res; + // arrays/views must be BLAS compatible + EXPECTS(m.indexmap().min_stride() == 1); + + // perform actual library call + if constexpr (!is_complex_v>) { + return ger(alpha, x, y, m); + } else if constexpr (mem::have_device_compatible_addr_space) { +#if defined(NDA_HAVE_DEVICE) + device::gerc(m.extent(0), m.extent(1), alpha, x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0], m.data(), get_ld(m)); +#else + compile_error_no_gpu(); +#endif + } else { + f77::gerc(m.extent(0), m.extent(1), alpha, x.data(), x.indexmap().strides()[0], y.data(), y.indexmap().strides()[0], m.data(), get_ld(m)); } } diff --git a/c++/nda/blas/interface/cublas_interface.cpp b/c++/nda/blas/interface/cublas_interface.cpp index 6882f4895..a2f748051 100644 --- a/c++/nda/blas/interface/cublas_interface.cpp +++ b/c++/nda/blas/interface/cublas_interface.cpp @@ -173,6 +173,10 @@ namespace nda::blas::device { void ger(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA) { CUBLAS_CHECK(cublasZgeru, M, N, cucplx(&alpha), cucplx(x), incx, cucplx(Y), incy, cucplx(A), LDA); } + void gerc(int M, int N, std::complex alpha, const std::complex *x, int incx, const std::complex *Y, int incy, + std::complex *A, int LDA) { + CUBLAS_CHECK(cublasZgerc, M, N, cucplx(&alpha), cucplx(x), incx, cucplx(Y), incy, cucplx(A), LDA); + } void scal(int M, double alpha, double *x, int incx) { CUBLAS_CHECK(cublasDscal, M, &alpha, x, incx); } void scal(int M, dcomplex alpha, dcomplex *x, int incx) { CUBLAS_CHECK(cublasZscal, M, cucplx(&alpha), cucplx(x), incx); } diff --git a/c++/nda/blas/interface/cublas_interface.hpp b/c++/nda/blas/interface/cublas_interface.hpp index 8f4b5ccbb..bc014a0b5 100644 --- a/c++/nda/blas/interface/cublas_interface.hpp +++ b/c++/nda/blas/interface/cublas_interface.hpp @@ -67,6 +67,7 @@ namespace nda::blas::device { void ger(int M, int N, double alpha, const double *x, int incx, const double *Y, int incy, double *A, int LDA); void ger(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA); + void gerc(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA); void scal(int M, double alpha, double *x, int incx); void scal(int M, dcomplex alpha, dcomplex *x, int incx); diff --git a/c++/nda/blas/interface/cxx_interface.cpp 
b/c++/nda/blas/interface/cxx_interface.cpp index 514825d51..d1fc66456 100644 --- a/c++/nda/blas/interface/cxx_interface.cpp +++ b/c++/nda/blas/interface/cxx_interface.cpp @@ -26,6 +26,10 @@ namespace nda::blas { #ifdef NDA_USE_MKL_RT static int const mkl_interface_layer = mkl_set_interface_layer(MKL_INTERFACE_LP64 + MKL_INTERFACE_GNU); #endif + inline auto *mklcplx(nda::scomplex *c) { return reinterpret_cast(c); } // NOLINT + inline auto *mklcplx(nda::scomplex const *c) { return reinterpret_cast(c); } // NOLINT + inline auto *mklcplx(nda::scomplex **c) { return reinterpret_cast(c); } // NOLINT + inline auto *mklcplx(nda::scomplex const **c) { return reinterpret_cast(c); } // NOLINT inline auto *mklcplx(nda::dcomplex *c) { return reinterpret_cast(c); } // NOLINT inline auto *mklcplx(nda::dcomplex const *c) { return reinterpret_cast(c); } // NOLINT inline auto *mklcplx(nda::dcomplex **c) { return reinterpret_cast(c); } // NOLINT @@ -35,8 +39,13 @@ #endif namespace { + // single-precision complex struct which is returned by BLAS functions + struct nda_complex_float { + float real; + float imag; + }; - // complex struct which is returned by BLAS functions + // double-precision complex struct which is returned by BLAS functions struct nda_complex_double { double real; double imag; @@ -45,31 +54,66 @@ } // namespace // manually define dot routines since cblas_f77.h uses "_sub" to wrap the Fortran routines +#define F77_sdot F77_GLOBAL(sdot, SDOT) +#define F77_cdotu F77_GLOBAL(cdotu, CDOTU) +#define F77_cdotc F77_GLOBAL(cdotc, CDOTC) #define F77_ddot F77_GLOBAL(ddot, DDOT) #define F77_zdotu F77_GLOBAL(zdotu, ZDOTU) #define F77_zdotc F77_GLOBAL(zdotc, ZDOTC) extern "C" { +float F77_sdot(FINT, const float *, FINT, const float *, FINT); +nda_complex_float F77_cdotu(FINT, const float *, FINT, const float *, FINT); +nda_complex_float F77_cdotc(FINT, const float *, FINT, const float *, FINT); + double F77_ddot(FINT, const double *, FINT, const double *, FINT); nda_complex_double F77_zdotu(FINT, const double *, FINT, const double *, FINT); nda_complex_double F77_zdotc(FINT, const double *, FINT, const double *, FINT); } namespace nda::blas::f77 { - + inline auto *blacplx(scomplex *c) { return reinterpret_cast(c); } // NOLINT + inline auto *blacplx(scomplex const *c) { return reinterpret_cast(c); } // NOLINT + inline auto **blacplx(scomplex **c) { return reinterpret_cast(c); } // NOLINT + inline auto **blacplx(scomplex const **c) { return reinterpret_cast(c); } // NOLINT inline auto *blacplx(dcomplex *c) { return reinterpret_cast(c); } // NOLINT inline auto *blacplx(dcomplex const *c) { return reinterpret_cast(c); } // NOLINT inline auto **blacplx(dcomplex **c) { return reinterpret_cast(c); } // NOLINT inline auto **blacplx(dcomplex const **c) { return reinterpret_cast(c); } // NOLINT + void axpy(int N, float alpha, const float *x, int incx, float *Y, int incy) { F77_saxpy(&N, &alpha, x, &incx, Y, &incy); } + void axpy(int N, scomplex alpha, const scomplex *x, int incx, scomplex *Y, int incy) { + F77_caxpy(&N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy); + } void axpy(int N, double alpha, const double *x, int incx, double *Y, int incy) { F77_daxpy(&N, &alpha, x, &incx, Y, &incy); } void axpy(int N, dcomplex alpha, const dcomplex *x, int incx, dcomplex *Y, int incy) { F77_zaxpy(&N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy); } // No Const In Wrapping!
+ void copy(int N, const float *x, int incx, float *Y, int incy) { F77_scopy(&N, x, &incx, Y, &incy); } + void copy(int N, const scomplex *x, int incx, scomplex *Y, int incy) { F77_ccopy(&N, blacplx(x), &incx, blacplx(Y), &incy); } void copy(int N, const double *x, int incx, double *Y, int incy) { F77_dcopy(&N, x, &incx, Y, &incy); } void copy(int N, const dcomplex *x, int incx, dcomplex *Y, int incy) { F77_zcopy(&N, blacplx(x), &incx, blacplx(Y), &incy); } + float dot(int M, const float *x, int incx, const float *Y, int incy) { return F77_sdot(&M, x, &incx, Y, &incy); } + scomplex dot(int M, const scomplex *x, int incx, const scomplex *Y, int incy) { +#ifdef NDA_USE_MKL + MKL_Complex8 result; + cblas_cdotu_sub(M, mklcplx(x), incx, mklcplx(Y), incy, &result); +#else + auto result = F77_cdotu(&M, blacplx(x), &incx, blacplx(Y), &incy); +#endif + return scomplex{result.real, result.imag}; + } + scomplex dotc(int M, const scomplex *x, int incx, const scomplex *Y, int incy) { +#ifdef NDA_USE_MKL + MKL_Complex8 result; + cblas_cdotc_sub(M, mklcplx(x), incx, mklcplx(Y), incy, &result); +#else + auto result = F77_cdotc(&M, blacplx(x), &incx, blacplx(Y), &incy); +#endif + return scomplex{result.real, result.imag}; + } double dot(int M, const double *x, int incx, const double *Y, int incy) { return F77_ddot(&M, x, &incx, Y, &incy); } dcomplex dot(int M, const dcomplex *x, int incx, const dcomplex *Y, int incy) { #ifdef NDA_USE_MKL @@ -90,6 +134,13 @@ namespace nda::blas::f77 { return dcomplex{result.real, result.imag}; } + void gemm(char op_a, char op_b, int M, int N, int K, float alpha, const float *A, int LDA, const float *B, int LDB, float beta, float *C, int LDC) { + F77_sgemm(&op_a, &op_b, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, C, &LDC); + } + void gemm(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex *A, int LDA, const scomplex *B, int LDB, scomplex beta, + scomplex *C, int LDC) { + F77_cgemm(&op_a, &op_b, &M, &N, &K, blacplx(&alpha), blacplx(A), &LDA, blacplx(B), &LDB, blacplx(&beta), blacplx(C), &LDC); + } void gemm(char op_a, char op_b, int M, int N, int K, double alpha, const double *A, int LDA, const double *B, int LDB, double beta, double *C, int LDC) { F77_dgemm(&op_a, &op_b, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, C, &LDC); @@ -99,6 +150,25 @@ namespace nda::blas::f77 { F77_zgemm(&op_a, &op_b, &M, &N, &K, blacplx(&alpha), blacplx(A), &LDA, blacplx(B), &LDB, blacplx(&beta), blacplx(C), &LDC); } + void gemm_batch(char op_a, char op_b, int M, int N, int K, float alpha, const float **A, int LDA, const float **B, int LDB, float beta, float **C, + int LDC, int batch_count) { +#ifdef NDA_USE_MKL + const int group_count = 1; + sgemm_batch(&op_a, &op_b, &M, &N, &K, &alpha, A, &LDA, B, &LDB, &beta, C, &LDC, &group_count, &batch_count); +#else // Fallback to loop + for (int i = 0; i < batch_count; ++i) gemm(op_a, op_b, M, N, K, alpha, A[i], LDA, B[i], LDB, beta, C[i], LDC); +#endif + } + void gemm_batch(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex **A, int LDA, const scomplex **B, int LDB, scomplex beta, + scomplex **C, int LDC, int batch_count) { +#ifdef NDA_USE_MKL + const int group_count = 1; + cgemm_batch(&op_a, &op_b, &M, &N, &K, mklcplx(&alpha), mklcplx(A), &LDA, mklcplx(B), &LDB, mklcplx(&beta), mklcplx(C), &LDC, &group_count, + &batch_count); +#else + for (int i = 0; i < batch_count; ++i) gemm(op_a, op_b, M, N, K, alpha, A[i], LDA, B[i], LDB, beta, C[i], LDC); +#endif + } void gemm_batch(char op_a, char op_b, int M, int N, 
int K, double alpha, const double **A, int LDA, const double **B, int LDB, double beta, double **C, int LDC, int batch_count) { #ifdef NDA_USE_MKL @@ -119,6 +189,29 @@ namespace nda::blas::f77 { #endif } + void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, float alpha, const float **A, int *LDA, const float **B, int *LDB, float beta, + float **C, int *LDC, int batch_count) { +#ifdef NDA_USE_MKL + nda::vector group_size(batch_count, 1); + nda::vector ops_a(batch_count, op_a), ops_b(batch_count, op_b); + nda::vector alphas(batch_count, alpha), betas(batch_count, beta); + sgemm_batch(ops_a.data(), ops_b.data(), M, N, K, alphas.data(), A, LDA, B, LDB, betas.data(), C, LDC, &batch_count, group_size.data()); +#else + for (int i = 0; i < batch_count; ++i) gemm(op_a, op_b, M[i], N[i], K[i], alpha, A[i], LDA[i], B[i], LDB[i], beta, C[i], LDC[i]); +#endif + } + void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, scomplex alpha, const scomplex **A, int *LDA, const scomplex **B, int *LDB, + scomplex beta, scomplex **C, int *LDC, int batch_count) { +#ifdef NDA_USE_MKL + nda::vector group_size(batch_count, 1); + nda::vector ops_a(batch_count, op_a), ops_b(batch_count, op_b); + nda::vector alphas(batch_count, alpha), betas(batch_count, beta); + cgemm_batch(ops_a.data(), ops_b.data(), M, N, K, mklcplx(alphas.data()), mklcplx(A), LDA, mklcplx(B), LDB, mklcplx(betas.data()), mklcplx(C), LDC, + &batch_count, group_size.data()); +#else + for (int i = 0; i < batch_count; ++i) gemm(op_a, op_b, M[i], N[i], K[i], alpha, A[i], LDA[i], B[i], LDB[i], beta, C[i], LDC[i]); +#endif + } void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, double alpha, const double **A, int *LDA, const double **B, int *LDB, double beta, double **C, int *LDC, int batch_count) { #ifdef NDA_USE_MKL @@ -143,6 +236,27 @@ namespace nda::blas::f77 { #endif } + void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, float alpha, const float *A, int LDA, int strideA, const float *B, int LDB, + int strideB, float beta, float *C, int LDC, int strideC, int batch_count) { +#if defined(NDA_USE_MKL) && INTEL_MKL_VERSION >= 20200002 + sgemm_batch_strided(&op_a, &op_b, &M, &N, &K, &alpha, A, &LDA, &strideA, B, &LDB, &strideB, &beta, C, &LDC, &strideC, &batch_count); +#else + for (int i = 0; i < batch_count; ++i) + gemm(op_a, op_b, M, N, K, alpha, A + static_cast(i * strideA), LDA, B + static_cast(i * strideB), LDB, beta, + C + static_cast(i * strideC), LDC); +#endif + } + void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex *A, int LDA, int strideA, const scomplex *B, + int LDB, int strideB, scomplex beta, scomplex *C, int LDC, int strideC, int batch_count) { +#if defined(NDA_USE_MKL) && INTEL_MKL_VERSION >= 20200002 + cgemm_batch_strided(&op_a, &op_b, &M, &N, &K, mklcplx(&alpha), mklcplx(A), &LDA, &strideA, mklcplx(B), &LDB, &strideB, mklcplx(&beta), mklcplx(C), + &LDC, &strideC, &batch_count); +#else + for (int i = 0; i < batch_count; ++i) + gemm(op_a, op_b, M, N, K, alpha, A + static_cast(i * strideA), LDA, B + static_cast(i * strideB), LDB, beta, + C + static_cast(i * strideC), LDC); +#endif + } void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, double alpha, const double *A, int LDA, int strideA, const double *B, int LDB, int strideB, double beta, double *C, int LDC, int strideC, int batch_count) { #if defined(NDA_USE_MKL) && INTEL_MKL_VERSION >= 20200002 @@ -165,6 +279,12 @@ namespace nda::blas::f77 { #endif } + void gemv(char op, int M, int N, 
float alpha, const float *A, int LDA, const float *x, int incx, float beta, float *Y, int incy) { + F77_sgemv(&op, &M, &N, &alpha, A, &LDA, x, &incx, &beta, Y, &incy); + } + void gemv(char op, int M, int N, scomplex alpha, const scomplex *A, int LDA, const scomplex *x, int incx, scomplex beta, scomplex *Y, int incy) { + F77_cgemv(&op, &M, &N, blacplx(&alpha), blacplx(A), &LDA, blacplx(x), &incx, blacplx(&beta), blacplx(Y), &incy); + } void gemv(char op, int M, int N, double alpha, const double *A, int LDA, const double *x, int incx, double beta, double *Y, int incy) { F77_dgemv(&op, &M, &N, &alpha, A, &LDA, x, &incx, &beta, Y, &incy); } @@ -172,16 +292,34 @@ namespace nda::blas::f77 { F77_zgemv(&op, &M, &N, blacplx(&alpha), blacplx(A), &LDA, blacplx(x), &incx, blacplx(&beta), blacplx(Y), &incy); } + void ger(int M, int N, float alpha, const float *x, int incx, const float *Y, int incy, float *A, int LDA) { + F77_sger(&M, &N, &alpha, x, &incx, Y, &incy, A, &LDA); + } + void ger(int M, int N, scomplex alpha, const scomplex *x, int incx, const scomplex *Y, int incy, scomplex *A, int LDA) { + F77_cgeru(&M, &N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy, blacplx(A), &LDA); + } + void gerc(int M, int N, scomplex alpha, const scomplex *x, int incx, const scomplex *Y, int incy, scomplex *A, int LDA) { + F77_cgerc(&M, &N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy, blacplx(A), &LDA); + } void ger(int M, int N, double alpha, const double *x, int incx, const double *Y, int incy, double *A, int LDA) { F77_dger(&M, &N, &alpha, x, &incx, Y, &incy, A, &LDA); } void ger(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA) { F77_zgeru(&M, &N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy, blacplx(A), &LDA); } + void gerc(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA) { + F77_zgerc(&M, &N, blacplx(&alpha), blacplx(x), &incx, blacplx(Y), &incy, blacplx(A), &LDA); + } + void scal(int M, float alpha, float *x, int incx) { F77_sscal(&M, &alpha, x, &incx); } + void scal(int M, scomplex alpha, scomplex *x, int incx) { F77_cscal(&M, blacplx(&alpha), blacplx(x), &incx); } void scal(int M, double alpha, double *x, int incx) { F77_dscal(&M, &alpha, x, &incx); } void scal(int M, dcomplex alpha, dcomplex *x, int incx) { F77_zscal(&M, blacplx(&alpha), blacplx(x), &incx); } + void swap(int N, float *x, int incx, float *Y, int incy) { F77_sswap(&N, x, &incx, Y, &incy); } // NOLINT (this is a BLAS swap) + void swap(int N, scomplex *x, int incx, scomplex *Y, int incy) { // NOLINT (this is a BLAS swap) + F77_cswap(&N, blacplx(x), &incx, blacplx(Y), &incy); + } void swap(int N, double *x, int incx, double *Y, int incy) { F77_dswap(&N, x, &incx, Y, &incy); } // NOLINT (this is a BLAS swap) void swap(int N, dcomplex *x, int incx, dcomplex *Y, int incy) { // NOLINT (this is a BLAS swap) F77_zswap(&N, blacplx(x), &incx, blacplx(Y), &incy); diff --git a/c++/nda/blas/interface/cxx_interface.hpp b/c++/nda/blas/interface/cxx_interface.hpp index b2b963eeb..8dd5d8de5 100644 --- a/c++/nda/blas/interface/cxx_interface.hpp +++ b/c++/nda/blas/interface/cxx_interface.hpp @@ -14,49 +14,81 @@ #if defined(NDA_HAVE_CUDA) #include "./cublas_interface.hpp" -#endif +#endif // NDA_HAVE_CUDA namespace nda::blas::f77 { + void axpy(int N, float alpha, const float *x, int incx, float *Y, int incy); + void axpy(int N, scomplex alpha, const scomplex *x, int incx, scomplex *Y, int incy); void axpy(int N,
double alpha, const double *x, int incx, double *Y, int incy); void axpy(int N, dcomplex alpha, const dcomplex *x, int incx, dcomplex *Y, int incy); + void copy(int N, const float *x, int incx, float *Y, int incy); + void copy(int N, const scomplex *x, int incx, scomplex *Y, int incy); void copy(int N, const double *x, int incx, double *Y, int incy); void copy(int N, const dcomplex *x, int incx, dcomplex *Y, int incy); + float dot(int M, const float *x, int incx, const float *Y, int incy); + scomplex dot(int M, const scomplex *x, int incx, const scomplex *Y, int incy); + scomplex dotc(int M, const scomplex *x, int incx, const scomplex *Y, int incy); double dot(int M, const double *x, int incx, const double *Y, int incy); dcomplex dot(int M, const dcomplex *x, int incx, const dcomplex *Y, int incy); dcomplex dotc(int M, const dcomplex *x, int incx, const dcomplex *Y, int incy); + void gemm(char op_a, char op_b, int M, int N, int K, float alpha, const float *A, int LDA, const float *B, int LDB, float beta, float *C, int LDC); + void gemm(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex *A, int LDA, const scomplex *B, int LDB, scomplex beta, + scomplex *C, int LDC); void gemm(char op_a, char op_b, int M, int N, int K, double alpha, const double *A, int LDA, const double *B, int LDB, double beta, double *C, int LDC); void gemm(char op_a, char op_b, int M, int N, int K, dcomplex alpha, const dcomplex *A, int LDA, const dcomplex *B, int LDB, dcomplex beta, dcomplex *C, int LDC); + void gemm_batch(char op_a, char op_b, int M, int N, int K, float alpha, const float **A, int LDA, const float **B, int LDB, float beta, float **C, + int LDC, int batch_count); + void gemm_batch(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex **A, int LDA, const scomplex **B, int LDB, scomplex beta, + scomplex **C, int LDC, int batch_count); void gemm_batch(char op_a, char op_b, int M, int N, int K, double alpha, const double **A, int LDA, const double **B, int LDB, double beta, double **C, int LDC, int batch_count); void gemm_batch(char op_a, char op_b, int M, int N, int K, dcomplex alpha, const dcomplex **A, int LDA, const dcomplex **B, int LDB, dcomplex beta, dcomplex **C, int LDC, int batch_count); + void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, float alpha, const float **A, int *LDA, const float **B, int *LDB, float beta, + float **C, int *LDC, int batch_count); + void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, scomplex alpha, const scomplex **A, int *LDA, const scomplex **B, int *LDB, + scomplex beta, scomplex **C, int *LDC, int batch_count); void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, double alpha, const double **A, int *LDA, const double **B, int *LDB, double beta, double **C, int *LDC, int batch_count); void gemm_vbatch(char op_a, char op_b, int *M, int *N, int *K, dcomplex alpha, const dcomplex **A, int *LDA, const dcomplex **B, int *LDB, dcomplex beta, dcomplex **C, int *LDC, int batch_count); + void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, float alpha, const float *A, int LDA, int strideA, const float *B, int LDB, + int strideB, float beta, float *C, int LDC, int strideC, int batch_count); + void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, scomplex alpha, const scomplex *A, int LDA, int strideA, const scomplex *B, + int LDB, int srideB, scomplex beta, scomplex *C, int LDC, int strideC, int batch_count); void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, 
double alpha, const double *A, int LDA, int strideA, const double *B, int LDB, int strideB, double beta, double *C, int LDC, int strideC, int batch_count); void gemm_batch_strided(char op_a, char op_b, int M, int N, int K, dcomplex alpha, const dcomplex *A, int LDA, int strideA, const dcomplex *B, int LDB, int srideB, dcomplex beta, dcomplex *C, int LDC, int strideC, int batch_count); + void gemv(char op, int M, int N, float alpha, const float *A, int LDA, const float *x, int incx, float beta, float *Y, int incy); + void gemv(char op, int M, int N, scomplex alpha, const scomplex *A, int LDA, const scomplex *x, int incx, scomplex beta, scomplex *Y, int incy); void gemv(char op, int M, int N, double alpha, const double *A, int LDA, const double *x, int incx, double beta, double *Y, int incy); void gemv(char op, int M, int N, dcomplex alpha, const dcomplex *A, int LDA, const dcomplex *x, int incx, dcomplex beta, dcomplex *Y, int incy); + void ger(int M, int N, float alpha, const float *x, int incx, const float *Y, int incy, float *A, int LDA); + void ger(int M, int N, scomplex alpha, const scomplex *x, int incx, const scomplex *Y, int incy, scomplex *A, int LDA); + void gerc(int M, int N, scomplex alpha, const scomplex *x, int incx, const scomplex *Y, int incy, scomplex *A, int LDA); void ger(int M, int N, double alpha, const double *x, int incx, const double *Y, int incy, double *A, int LDA); void ger(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA); + void gerc(int M, int N, dcomplex alpha, const dcomplex *x, int incx, const dcomplex *Y, int incy, dcomplex *A, int LDA); + void scal(int M, float alpha, float *x, int incx); + void scal(int M, scomplex alpha, scomplex *x, int incx); void scal(int M, double alpha, double *x, int incx); void scal(int M, dcomplex alpha, dcomplex *x, int incx); + void swap(int N, float *x, int incx, float *Y, int incy); // NOLINT (this is a BLAS swap) + void swap(int N, scomplex *x, int incx, scomplex *Y, int incy); // NOLINT (this is a BLAS swap) void swap(int N, double *x, int incx, double *Y, int incy); // NOLINT (this is a BLAS swap) void swap(int N, dcomplex *x, int incx, dcomplex *Y, int incy); // NOLINT (this is a BLAS swap) diff --git a/c++/nda/blas/scal.hpp b/c++/nda/blas/scal.hpp index 84cc6a802..dea883be6 100644 --- a/c++/nda/blas/scal.hpp +++ b/c++/nda/blas/scal.hpp @@ -13,39 +13,37 @@ #include "./interface/cxx_interface.hpp" #include "./tools.hpp" #include "../concepts.hpp" -#include "../device.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" +#ifndef NDA_HAVE_DEVICE +#include "../device.hpp" +#endif // NDA_HAVE_DEVICE + namespace nda::blas { /** * @ingroup linalg_blas * @brief Interface to the BLAS `scal` routine. * - * @details Scales a vector by a constant. This function calculates - * \f[ - * \mathbf{x} \leftarrow \alpha \mathbf{x} ;, - * \f] + * @details Scales a vector by a constant. This function calculates \f$ \mathbf{x} \leftarrow \alpha \mathbf{x} \f$, * where \f$ \alpha \f$ is a scalar constant and \f$ \mathbf{x} \f$ is a vector. * - * @tparam X nda::MemoryVector or a conjugate array expression. - * @param alpha Input scalar. - * @param x Input/Output vector to be scaled. + * @tparam X nda::MemoryVector type. + * @param alpha Input scalar \f$ \alpha \f$. + * @param x Input/Output vector \f$ \mathbf{x} \f$ to be scaled. 
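[Editor's note, not part of the patch] A hedged one-liner showing the documented behaviour of nda::blas::scal whose implementation follows right below; the nda::vector spelling and the umbrella header are assumptions based on the rest of the library:

#include <nda/nda.hpp> // assumed umbrella header

int main() {
  auto x = nda::vector<double>{1.0, 2.0, 3.0};
  nda::blas::scal(0.5, x); // x <- 0.5 * x, i.e. {0.5, 1.0, 1.5}
  return 0;
}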
*/ - template - requires(MemoryVector or is_conj_array_expr) + template + requires(is_blas_lapack_v>) void scal(get_value_t alpha, X &&x) { // NOLINT (temporary views are allowed here) - static_assert(is_blas_lapack_v>, "Error in nda::blas::scal: Value type of vector is incompatible with blas"); - - if constexpr (mem::on_host) { - f77::scal(x.size(), alpha, x.data(), x.indexmap().strides()[0]); - } else { + if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) device::scal(x.size(), alpha, x.data(), x.indexmap().strides()[0]); #else compile_error_no_gpu(); #endif + } else { + f77::scal(x.size(), alpha, x.data(), x.indexmap().strides()[0]); } } diff --git a/c++/nda/blas/tools.hpp b/c++/nda/blas/tools.hpp index c7f841900..9fce8eef5 100644 --- a/c++/nda/blas/tools.hpp +++ b/c++/nda/blas/tools.hpp @@ -22,7 +22,13 @@ namespace nda { /** - * @ingroup linalg_blas + * @ingroup linalg_blas_utils + * @brief Alias for `std::complex` type. + */ + using scomplex = std::complex; + + /** + * @ingroup linalg_blas_utils * @brief Alias for `std::complex` type. */ using dcomplex = std::complex; @@ -32,7 +38,7 @@ namespace nda { namespace nda::blas { /** - * @addtogroup linalg_blas + * @addtogroup linalg_blas_utils * @{ */ @@ -49,7 +55,25 @@ namespace nda::blas { requires(!std::is_same_v>) static constexpr bool is_conj_array_expr = is_conj_array_expr>; - /// Constexpr variable that is true if the given nda::Array type has a Fortran memory layout. + /** + * @brief Get the underlying array of a conjugate lazy expression or return the array itself in case it is an + * nda::MemoryArray. + * + * @tparam A nda::Array type. + * @param a Conjugate expression or array/view. + * @return nda::MemoryArray object. + */ + template + requires(MemoryArray or is_conj_array_expr) + MemoryArray decltype(auto) get_array(A &&a) { + if constexpr (is_conj_array_expr) { + return std::get<0>(std::forward(a).a); + } else { + return std::forward(a); + } + } + + /// Constexpr variable that is true if the given nda::Array type has nda::F_layout. template requires(MemoryArray or is_conj_array_expr) static constexpr bool has_F_layout = []() { @@ -59,7 +83,7 @@ namespace nda::blas { return std::remove_cvref_t::is_stride_order_Fortran(); }(); - /// Constexpr variable that is true if the given nda::Array type has a C memory layout. + /// Constexpr variable that is true if the given nda::Array type has nda::C_layout. template requires(MemoryArray or is_conj_array_expr) static constexpr bool has_C_layout = []() { @@ -78,7 +102,7 @@ namespace nda::blas { */ template const char get_op = []() { - static_assert(!(conj and not transpose), "Error in nda::blas::get_op: Cannot use conjugate operation alone in blas operations"); + static_assert(!(conj and not transpose), "Error in nda::blas::get_op: Cannot use conjugate operation alone in BLAS operations"); if constexpr (conj and transpose) return 'C'; else if constexpr (transpose) @@ -88,27 +112,43 @@ namespace nda::blas { }(); /** - * @brief Get the leading dimension in LAPACK jargon of an nda::MemoryMatrix. + * @brief Get the leading dimension of an nda::MemoryArray with rank 1 or 2 for BLAS/LAPACK calls. + * + * @details The leading dimension is the stride between two consecutive columns (rows) of a matrix in Fortran (C) + * layout. For 1-dimensional arrays, we simply return the size of the array. * - * @tparam A nda::MemoryMatrix type. - * @param a nda::MemoryMatrix object. - * @return Leading dimension. + * @tparam A nda::MemoryArray type. 
+ * @param a nda::MemoryArray object. + * @return Leading dimension for BLAS/LAPACK calls. */ - template + template + requires(get_rank == 1 or get_rank == 2) int get_ld(A const &a) { - return a.indexmap().strides()[has_F_layout ? 1 : 0]; + if constexpr (get_rank == 1) { + return a.size(); + } else { + return a.indexmap().strides()[has_F_layout ? 1 : 0]; + } } /** - * @brief Get the number of columns in LAPACK jargon of an nda::MemoryMatrix. + * @brief Get the number of columns of an nda::MemoryArray with rank 1 or 2 for BLAS/LAPACK calls. + * + * @details The number of columns corresponds to the extent of the second (first) dimension of a matrix in Fortran + * (C) layout. For 1-dimensional arrays, we return 1. * - * @tparam A nda::MemoryMatrix type. - * @param a nda::MemoryMatrix object. - * @return Number of columns. + * @tparam A nda::MemoryArray type. + * @param a nda::MemoryArray object. + * @return Number of columns for BLAS/LAPACK calls. */ - template + template + requires(get_rank == 1 or get_rank == 2) int get_ncols(A const &a) { - return a.shape()[has_F_layout ? 1 : 0]; + if constexpr (get_rank == 1) { + return 1; + } else { + return a.shape()[has_F_layout ? 1 : 0]; + } } /** @} */ diff --git a/c++/nda/lapack.hpp b/c++/nda/lapack.hpp index e53f4ada0..dd4005ccc 100644 --- a/c++/nda/lapack.hpp +++ b/c++/nda/lapack.hpp @@ -18,5 +18,9 @@ #include "./lapack/getri.hpp" #include "./lapack/getrs.hpp" #include "./lapack/gtsv.hpp" +#include "./lapack/heev.hpp" +#include "./lapack/hegv.hpp" #include "./lapack/orgqr.hpp" +#include "./lapack/syev.hpp" +#include "./lapack/sygv.hpp" #include "./lapack/ungqr.hpp" diff --git a/c++/nda/lapack/gelss.hpp b/c++/nda/lapack/gelss.hpp index c1e62ddcc..4a9de4889 100644 --- a/c++/nda/lapack/gelss.hpp +++ b/c++/nda/lapack/gelss.hpp @@ -12,9 +12,9 @@ #include "./interface/cxx_interface.hpp" #include "../basic_array.hpp" +#include "../basic_functions.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" @@ -22,6 +22,7 @@ #include #include #include +#include namespace nda::lapack { @@ -31,68 +32,69 @@ namespace nda::lapack { * * @details Computes the minimum norm solution to a complex linear least squares problem: * \f[ - * \min_x | \mathbf{b} - \mathbf{A x} |_2 + * \min_{\mathbf{x}} | \mathbf{b} - \mathbf{A x} |_2 * \f] - * using the singular value decomposition (SVD) of \f$ \mathbf{A} \f$. \f$ \mathbf{A} \f$ is an m-by-n matrix which - * may be rank-deficient. + * using the singular value decomposition (SVD) of \f$ \mathbf{A} \f$. \f$ \mathbf{A} \f$ is an \f$ m \times n \f$ + * matrix which may be rank-deficient. * * Several right hand side vectors \f$ \mathbf{b} \f$ and solution vectors \f$ \mathbf{x} \f$ can be handled in a - * single call; they are stored as the columns of the m-by-nrhs right hand side matrix \f$ \mathbf{B} \f$ and the - * n-by-nrhs solution matrix \f$ \mathbf{X} \f$. + * single call; they are stored as the columns of the \f$ m \times n_{\mathrm{rhs}} \f$ right hand side matrix \f$ + * \mathbf{B} \f$ and the \f$ n \times n_{\mathrm{rhs}} \f$ solution matrix \f$ \mathbf{X} \f$. * * The effective rank of \f$ \mathbf{A} \f$ is determined by treating as zero those singular values which are less - * than `rcond` times the largest singular value. + * than \f$ r_{\mathrm{cond}} \f$ times the largest singular value. * * @tparam A nda::MemoryMatrix type. * @tparam B nda::MemoryArray type. * @tparam S nda::MemoryVector type. 
- * @param a Input/output matrix. On entry, the m-by-n matrix \f$ \mathbf{A} \f$. On exit, the first `min(m,n)` rows of - * \f$ \mathbf{A} \f$ are overwritten with its right singular vectors, stored rowwise. - * @param b Input/output array. On entry, the m-by-nrhs right hand side matrix \f$ \mathbf{B} \f$. On exit, - * \f$ \mathbf{B} \f$ is overwritten by the n-by-nrhs solution matrix \f$ \mathbf{X} \f$. If `m >= n` and `RANK == n`, - * the residual sum-of-squares for the solution in the i-th column is given by the sum of squares of the modulus of - * elements `n+1:m` in that column. - * @param s Output vector. The singular values of \f$ \mathbf{A} \f$ in decreasing order. The condition number of A in - * the 2-norm is `s(1)/s(min(m,n))`. - * @param rcond It is used to determine the effective rank of \f$ \mathbf{A} \f$. Singular values `s(i) <= rcond * - * s(1)` are treated as zero. If `rcond < 0`, machine precision is used instead. - * @param rank Output variable of the effective rank of \f$ \mathbf{A} \f$, i.e., the number of singular values which - * are greater than `rcond * s(1)`. + * @param a Input/output matrix. On entry, the \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$. On exit, the first \f$ + * \min(m,n) \f$ rows of \f$ \mathbf{A} \f$ are overwritten with its right singular vectors, stored rowwise. + * @param b Input/output array. On entry, the \f$ m \times n_{\mathrm{rhs}} \f$ right hand side matrix \f$ \mathbf{B} + * \f$. On exit, \f$ \mathbf{B} \f$ is overwritten by the \f$ n \times n_{\mathrm{rhs}} \f$ solution matrix \f$ + * \mathbf{X} \f$. If \f$ m \geq n \f$ and if the effective rank is equal \f$ n \f$, the residual sum-of-squares for + * the solution in the ith column is given by the sum of squares of the modulus of elements \f$ n + 1 \f$ + * to \f$ m \f$ in that column. + * @param s Output vector. The singular values of \f$ \mathbf{A} \f$ in decreasing order. The condition number of \f$ + * \mathbf{A} \f$ in the 2-norm is \f$ s_1 / s_{min(m,n)} \f$. + * @param rcond It is used to determine the effective rank of \f$ \mathbf{A} \f$. Singular values \f$ s_i \leq + * r_{\mathrm{cond}} s_1 \f$ are treated as zero. If \f$ r_{\mathrm{cond}} < 0 \f$, machine precision is used instead. + * @param rank Output variable. The effective rank of \f$ \mathbf{A} \f$, i.e. the number of singular values which + * are greater than \f$ r_{\mathrm{cond}} s_1 \f$. * @return Integer return code. 
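[Editor's note, not part of the patch] A hedged sketch of a gelss call matching the documentation above; the container spellings (nda::matrix with nda::F_layout, nda::vector) and the umbrella header are assumptions from the rest of the library, and the data is a placeholder:

#include <nda/nda.hpp> // assumed umbrella header

int main() {
  // overdetermined 3x2 system in Fortran layout, as required by gelss
  nda::matrix<double, nda::F_layout> a(3, 2);
  a(0, 0) = 1; a(0, 1) = 1;
  a(1, 0) = 1; a(1, 1) = 2;
  a(2, 0) = 1; a(2, 1) = 3;
  nda::vector<double> b = {6, 0, 0};

  nda::vector<double> s; // singular values, resized inside gelss
  int rank = 0;
  int info = nda::lapack::gelss(a, b, s, 1e-12, rank);
  // on success, the first 2 entries of b hold the least-squares solution
  return info;
}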
*/ template - requires(have_same_value_type_v and mem::on_host and is_blas_lapack_v>) - int gelss(A &&a, B &&b, S &&s, double rcond, int &rank) { // NOLINT (temporary views are allowed here) - static_assert(has_F_layout and has_F_layout, "Error in nda::lapack::gelss: C order not supported"); - static_assert(MemoryVector or MemoryMatrix, "Error in nda::lapack::gelss: B must be a vector or a matrix"); + requires(have_same_value_type_v and mem::have_host_compatible_addr_space and is_blas_lapack_v>) + int gelss(A &&a, B &&b, S &&s, get_fp_t rcond, int &rank) { // NOLINT (temporary views are allowed here) + static_assert(std::is_same_v, float> || std::is_same_v, double>, + "Error in nda::lapack::gelss: Singular value array must have elements of type float or double"); + static_assert(has_F_layout and has_F_layout, "Error in nda::lapack::gelss: Matrices/arrays must have Fortran layout"); + static_assert(get_rank == 1 || get_rank == 2, "Error in nda::lapack::gelss: Right hand side must have rank 1 or 2"); - auto dm = std::min(a.extent(0), a.extent(1)); - if (s.size() < dm) s.resize(dm); + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + auto const k = std::min(m, n); + resize_or_check_if_view(s, {k}); + EXPECTS(b.extent(0) == m); - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(b.indexmap().min_stride() == 1); EXPECTS(s.indexmap().min_stride() == 1); - // first call to get the optimal bufferSize + // first call to get the optimal buffer size using value_type = get_value_t; - value_type bufferSize_T{}; - auto rwork = array(5 * dm); + using fp_type = get_fp_t; + value_type tmp_lwork{}; + auto rwork = array(5 * k); int info = 0; - int nrhs = 1, ldb = b.size(); // defaults for B MemoryVector - if constexpr (MemoryMatrix) { - nrhs = b.extent(1); - ldb = get_ld(b); - } - f77::gelss(a.extent(0), a.extent(1), nrhs, a.data(), get_ld(a), b.data(), ldb, s.data(), rcond, rank, &bufferSize_T, -1, rwork.data(), info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + int nrhs = (get_rank == 2 ? b.extent(1) : 1); + f77::gelss(m, n, nrhs, a.data(), get_ld(a), b.data(), get_ld(b), s.data(), rcond, rank, &tmp_lwork, -1, rwork.data(), info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); // allocate work buffer and perform actual library call - array work(bufferSize); - f77::gelss(a.extent(0), a.extent(1), nrhs, a.data(), get_ld(a), b.data(), ldb, s.data(), rcond, rank, work.data(), bufferSize, rwork.data(), - info); + array work(lwork); + f77::gelss(m, n, nrhs, a.data(), get_ld(a), b.data(), get_ld(b), s.data(), rcond, rank, work.data(), lwork, rwork.data(), info); - if (info) NDA_RUNTIME_ERROR << "Error in nda::lapack::gelss: info = " << info; return info; } diff --git a/c++/nda/lapack/gelss_worker.hpp b/c++/nda/lapack/gelss_worker.hpp index 9e522bf81..f827e6fdb 100644 --- a/c++/nda/lapack/gelss_worker.hpp +++ b/c++/nda/lapack/gelss_worker.hpp @@ -3,10 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // See LICENSE in the root of this distribution for details. +/** + * @file + * @brief Provides a class that can solve multiple linear least squares problems for a given matrix \f$ \mathbf{A} \f$. 
+ */ + #pragma once #include "./gesvd.hpp" #include "../algorithms.hpp" +#include "../arithmetic.hpp" #include "../basic_array.hpp" #include "../declarations.hpp" #include "../exceptions.hpp" @@ -15,6 +21,7 @@ #include "../linalg.hpp" #include "../mapped_functions.hpp" #include "../matrix_functions.hpp" +#include "nda/traits.hpp" #include @@ -75,18 +82,6 @@ namespace nda::lapack { */ template class gelss_worker { - // Number of rows (M) and columns (N) of the Matrix A. - long M_, N_; - - // Pseudo inverse of A, i.e. A^{+} = V * \Sigma^{+} * U^H. - matrix A_plus_; - - // U_N^H defining the error of the least squares problem. - matrix U_N_H_; - - // Array containing the singular values. - array s_; - public: /** * @brief Get the number of variables of the given problem, i.e. the size of the vector \f$ \mathbf{x} \f$. @@ -121,7 +116,7 @@ namespace nda::lapack { gesvd(A_work, s_, U, V_H); // calculate the pseudo inverse A^{+} = V * \Sigma^{+} * U^H - matrix S_plus(N_, M_); + matrix, F_layout> S_plus(N_, M_); S_plus = 0.; for (long i : range(s_.size())) S_plus(i, i) = 1.0 / s_(i); A_plus_ = dagger(V_H) * S_plus * dagger(U); @@ -168,9 +163,22 @@ namespace nda::lapack { auto operator()(vector_const_view b, std::optional /*inner_matrix_dim*/ = {}) const { using std::sqrt; double err = 0.0; - if (M_ != N_) { err = norm(U_N_H_ * b) / sqrt(b.size()); } + if (M_ != N_) { err = nda::linalg::norm(U_N_H_ * b) / sqrt(b.size()); } return std::pair, double>{A_plus_ * b, err}; } + + private: + // Number of rows (M) and columns (N) of the Matrix A. + long M_, N_; + + // Pseudo inverse of A, i.e. A^{+} = V * \Sigma^{+} * U^H. + matrix A_plus_; + + // U_N^H defining the error of the least squares problem. + matrix U_N_H_; + + // Array containing the singular values. + array, 1> s_; }; /** @@ -194,17 +202,7 @@ namespace nda::lapack { * * See `triqs::mesh::tail_fitter` for more information. */ - struct gelss_worker_hermitian { - private: - // Complex double type. - using dcomplex = std::complex; - - // Worker for the original least squares problem. - gelss_worker lss_; - - // Worker for the extended least squares problem. - gelss_worker lss_herm_; - + class gelss_worker_hermitian { public: /** * @brief Get the number of variables of the given problem. @@ -222,7 +220,7 @@ namespace nda::lapack { * @brief Construct a new worker object for a given matrix \f$ \mathbf{A} \f$. * @param A %Matrix \f$ \mathbf{A} \f$ used in the least squares problem. */ - gelss_worker_hermitian(matrix_const_view A) : lss_(A), lss_herm_(vstack(A, conj(A))) {} + gelss_worker_hermitian(matrix_const_view> A) : lss_(A), lss_herm_(vstack(A, conj(A))) {} /** * @brief Solve the least squares problem for a given right hand side matrix \f$ \mathbf{B} \f$. @@ -232,10 +230,10 @@ namespace nda::lapack { * * @param B Right hand side matrix. * @param inner_matrix_dim Inner matrix dimension \f$ d \f$. - * @return A `std::pair, double>` containing the solution matrix \f$ \mathbf{X} \f$ and the error + * @return A `std::pair>, double>` containing the solution matrix \f$ \mathbf{X} \f$ and the error * \f$ \epsilon \f$. 
*/ - auto operator()(matrix_const_view B, std::optional inner_matrix_dim = {}) const { + auto operator()(matrix_const_view> B, std::optional inner_matrix_dim = {}) const { if (not inner_matrix_dim.has_value()) NDA_RUNTIME_ERROR << "Error in nda::lapack::gelss_worker_hermitian: Inner matrix dimension required for hermitian least square fitting"; long d = *inner_matrix_dim; @@ -252,10 +250,11 @@ namespace nda::lapack { long N = shape[1] / (d * d); // reshape, transpose and take the complex conjugate - array arr_dag = conj(permuted_indices_view(reshape(C, std::array{shape[0], N, d, d}))); + array, 4> arr_dag = + conj(permuted_indices_view(reshape(C, std::array{shape[0], N, d, d}))); // return the result in a new matrix - return matrix{reshape(std::move(arr_dag), shape)}; + return matrix>{reshape(std::move(arr_dag), shape)}; }; // solve the extended system vstack(A, A*) * X = vstack(B, B_dag) @@ -263,8 +262,15 @@ namespace nda::lapack { auto [x, err] = lss_herm_(vstack(B, B_dag)); // resymmetrize the results to cure small hermiticity violations - return std::pair, double>{0.5 * (x + inner_adjoint(x)), err}; + return std::pair>, double>{0.5 * (x + inner_adjoint(x)), err}; } + + private: + // Worker for the original least squares problem. + gelss_worker> lss_; + + // Worker for the extended least squares problem. + gelss_worker> lss_herm_; }; /** @} */ diff --git a/c++/nda/lapack/geqp3.hpp b/c++/nda/lapack/geqp3.hpp index 733f24d13..d535cff3c 100644 --- a/c++/nda/lapack/geqp3.hpp +++ b/c++/nda/lapack/geqp3.hpp @@ -12,9 +12,9 @@ #include "./interface/cxx_interface.hpp" #include "../basic_array.hpp" +#include "../basic_functions.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" @@ -36,50 +36,65 @@ namespace nda::lapack { * \f] * using Level 3 BLAS. * + * The matrix \f$ \mathbf{Q} \f$ is represented as a product of elementary reflectors + * \f[ + * \mathbf{Q} = \mathbf{H}(1) \mathbf{H}(2) \ldots \mathbf{H}(k) \; , + * \f] + * where \f$ k = \min(m,n) \f$. + * + * Each \f$ \mathbf{H}(i) \f$ has the form + * \f[ + * \mathbf{H}(i) = \mathbf{I} - \tau_i * \mathbf{v}_i \mathbf{v}_i^H + * \f] + * where \f$ \tau_i \f$ is a real/complex scalar, and \f$ \mathbf{v}_i \f$ is a real/complex vector with + * - elements \f$ 1 \f$ to \f$ i - 1 \f$ equal to 0, + * - element \f$ i \f$ equal to 1 and + * - elements \f$ i + 1 \f$ to \f$ m \f$ stored on exit in the elements \f$ i + 1 \f$ to \f$ m \f$ in the column \f$ i + * \f$ of \f$ \mathbf{A} \f$. + * * @tparam A nda::MemoryMatrix type. * @tparam JPVT nda::MemoryVector type. * @tparam TAU nda::MemoryVector type. - * @param a Input/output matrix. On entry, the m-by-n matrix \f$ \mathbf{A} \f$. On exit, the upper triangle of the - * array contains the `min(m,n)`-by-n upper trapezoidal matrix \f$ \mathbf{R} \f$; the elements below the diagonal, - * together with the array `tau`, represent the unitary matrix \f$ \mathbf{Q} \f$ as a product of `min(m,n)` - * elementary reflectors. - * @param jpvt Input/output vector. On entry, if `jpvt(j) != 0`, the j-th column of \f$ \mathbf{A} \f$ is permuted to - * the front of \f$ \mathbf{A P} \f$ (a leading column); if `jpvt(j) == 0`, the j-th column of \f$ \mathbf{A} \f$ is a - * free column. On exit, if `jpvt(j) == k`, then the j-th column of \f$ \mathbf{A P} \f$ was the the k-th column of - * \f$ \mathbf{A} \f$. - * @param tau Output vector. The scalar factors of the elementary reflectors. 
+ * @param a Input/output matrix. On entry, the \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$. On exit, the upper + * triangle of the array contains the \f$ \min(m,n) \times n \f$ upper trapezoidal matrix \f$ \mathbf{R} \f$; the + * elements below the diagonal, together with the array \f$ \mathbf{\tau} \f$, represent the unitary matrix \f$ + * \mathbf{Q} \f$ as a product of \f$ \min(m,n) \f$ elementary reflectors. + * @param jpvt Input/output vector. On entry, if the jth element is \f$ \neq 0 \f$, the jth + * column of \f$ \mathbf{A} \f$ is permuted to the front of \f$ \mathbf{A P} \f$ (a leading column); if the jth + * element is equal 0, the jth column of \f$ \mathbf{A} \f$ is a free column. On exit, if the + * jth element is equal \f$ k \f$, then the jth column of \f$ \mathbf{A P} \f$ was the the + * kth column of \f$ \mathbf{A} \f$. + * @param tau Output vector. The scalar factors \f$ \tau_i \f$ of the elementary reflectors \f$ \mathbf{H}(i) \f$. * @return Integer return code from the LAPACK call. */ template - requires(mem::on_host and is_blas_lapack_v> and have_same_value_type_v - and mem::have_compatible_addr_space) + requires(mem::have_host_compatible_addr_space and have_same_value_type_v and is_blas_lapack_v>) int geqp3(A &&a, JPVT &&jpvt, TAU &&tau) { // NOLINT (temporary views are allowed here) - static_assert(has_F_layout, "Error in nda::lapack::geqp3: C order not supported"); static_assert(std::is_same_v, int>, "Error in nda::lapack::geqp3: Pivoting array must have elements of type int"); - static_assert(mem::have_host_compatible_addr_space, "Error in nda::lapack::geqp3: Only CPU is supported"); + static_assert(has_F_layout, "Error in nda::lapack::geqp3: A must have Fortran layout"); - auto [m, n] = a.shape(); - EXPECTS(tau.size() >= std::min(m, n)); + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + EXPECTS(jpvt.size() == n); + resize_or_check_if_view(tau, {std::min(m, n)}); - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(jpvt.indexmap().min_stride() == 1); EXPECTS(tau.indexmap().min_stride() == 1); - // first call to get the optimal buffersize + // first call to get the optimal buffer size using value_type = get_value_t; - value_type bufferSize_T{}; + value_type tmp_lwork{}; int info = 0; - array rwork(2 * n); - lapack::f77::geqp3(m, n, a.data(), get_ld(a), jpvt.data(), tau.data(), &bufferSize_T, -1, rwork.data(), info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + array, 1> rwork(2 * n); + lapack::f77::geqp3(m, n, a.data(), get_ld(a), jpvt.data(), tau.data(), &tmp_lwork, -1, rwork.data(), info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); // allocate work buffer and perform actual library call - nda::array work(bufferSize); - lapack::f77::geqp3(m, n, a.data(), get_ld(a), jpvt.data(), tau.data(), work.data(), bufferSize, rwork.data(), info); - jpvt -= 1; // Shift to 0-based indexing + array work(lwork); + lapack::f77::geqp3(m, n, a.data(), get_ld(a), jpvt.data(), tau.data(), work.data(), lwork, rwork.data(), info); - if (info) NDA_RUNTIME_ERROR << "Error in nda::lapack::geqp3: info = " << info; return info; } diff --git a/c++/nda/lapack/gesvd.hpp b/c++/nda/lapack/gesvd.hpp index 3f703f249..7ffe27c27 100644 --- a/c++/nda/lapack/gesvd.hpp +++ b/c++/nda/lapack/gesvd.hpp @@ -11,9 +11,10 @@ #pragma once #include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" +#include 
"../basic_functions.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" #include "../layout/policies.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" @@ -27,6 +28,7 @@ #include #include #include +#include #include namespace nda::lapack { @@ -35,16 +37,16 @@ namespace nda::lapack { * @ingroup linalg_lapack * @brief Interface to the LAPACK `gesvd` routine. * - * @details Computes the singular value decomposition (SVD) of a complex m-by-n matrix \f$ \mathbf{A} \f$, optionally - * computing the left and/or right singular vectors. The SVD is written + * @details Computes the singular value decomposition (SVD) of an \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$. The + * SVD is written as * \f[ - * \mathbf{A} = \mathbf{U} \mathbf{S} \mathbf{V}^H + * \mathbf{A} = \mathbf{U} \mathbf{S} \mathbf{V}^H \; , * \f] - * where \f$ \mathbf{S} \f$ is an m-by-n matrix which is zero except for its `min(m,n)` diagonal elements, - * \f$ \mathbf{U} \f$ is an m-by-m unitary matrix, and \f$ \mathbf{V} \f$ is an n-by-n unitary matrix. The diagonal - * elements of \f$ \mathbf{S} \f$ are the singular values of \f$ \mathbf{A} \f$; they are real and non-negative, and - * are returned in descending order. The first `min(m,n)` columns of \f$ \mathbf{U} \f$ and \f$ \mathbf{V} \f$ are the - * left and right singular vectors of \f$ \mathbf{A} \f$. + * where \f$ \mathbf{S} \f$ is an \f$ m \times n \f$ matrix which is zero except for its \f$ \min(m,n) \f$ diagonal + * elements, \f$ \mathbf{U} \f$ is an \f$ m \times m \f$ unitary matrix, and \f$ \mathbf{V} \f$ is an \f$ n \times n + * \f$ unitary matrix. The diagonal elements of \f$ \mathbf{S} \f$ are the singular values of \f$ \mathbf{A} \f$; they + * are real and non-negative, and are returned in descending order. The first \f$ min(m,n) \f$ columns of \f$ + * \mathbf{U} \f$ and \f$ \mathbf{V} \f$ are the left and right singular vectors of \f$ \mathbf{A} \f$. * * Note that the routine returns \f$ \mathbf{V}^H \f$, not \f$ \mathbf{V} \f$. * @@ -52,28 +54,43 @@ namespace nda::lapack { * @tparam S nda::MemoryVector type. * @tparam U nda::MemoryMatrix type. * @tparam VT nda::MemoryMatrix type. - * @param a Input/output matrix. On entry, the m-by-n matrix \f$ \mathbf{A} \f$. On exit, the contents of + * @param a Input/output matrix. On entry, the \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$. On exit, the contents of * \f$ \mathbf{A} \f$ are destroyed. - * @param s Output vector. The singular values of \f$ \mathbf{A} \f$, sorted so that `s(i) >= s(i+1)`. - * @param u Output matrix. It contains the m-by-m unitary matrix \f$ \mathbf{U} \f$. - * @param vt Output matrix. It contains contains the n-by-n unitary matrix \f$ \mathbf{V}^H \f$. + * @param s Output vector. The singular values of \f$ \mathbf{A} \f$, sorted so that \f$ s_i \geq s_{i+1} \f$. + * @param u Output matrix. It contains the \f$ m \times m \f$ unitary matrix \f$ \mathbf{U} \f$. + * @param vt Output matrix. It contains the \f$ n \times n \f$ unitary matrix \f$ \mathbf{V}^H \f$. * @return Integer return code from the LAPACK call. 
*/ template - requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) + requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v> + and have_same_value_type_v, S>) int gesvd(A &&a, S &&s, U &&u, VT &&vt) { // NOLINT (temporary views are allowed here) - static_assert(has_F_layout and has_F_layout and has_F_layout, "Error in nda::lapack::gesvd: C order not supported"); + static_assert(has_C_layout == has_C_layout and has_C_layout == has_C_layout, + "Error in nda::lapack::gesvd: Matrix layouts have to be the same"); - auto dm = std::min(a.extent(0), a.extent(1)); - if (s.size() < dm) s.resize(dm); + // check the dimensions of the output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + auto const k = std::min(m, n); + resize_or_check_if_view(s, {k}); + resize_or_check_if_view(u, {m, m}); + resize_or_check_if_view(vt, {n, n}); - // must be lapack compatible + // cusolverDn?gesvd only supports matrices with m >= n + if constexpr (mem::have_device_compatible_addr_space) { + if constexpr (has_C_layout) { + EXPECTS(n >= m); + } else { + EXPECTS(m >= n); + } + } + + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(s.indexmap().min_stride() == 1); EXPECTS(u.indexmap().min_stride() == 1); EXPECTS(vt.indexmap().min_stride() == 1); - // call host/device implementation depending on input type + // call host/device implementation depending on address space of input arrays/views auto gesvd_call = [](Ts &&...args) { if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) @@ -86,21 +103,26 @@ namespace nda::lapack { } }; - // first call to get the optimal buffersize + // first call to get the optimal buffer size using value_type = get_value_t; - value_type bufferSize_T{}; - auto rwork = array>>(5 * dm); + value_type tmp_lwork{}; + auto rwork = array, 1, C_layout, heap>>(5 * k); int info = 0; - gesvd_call('A', 'A', a.extent(0), a.extent(1), a.data(), get_ld(a), s.data(), u.data(), get_ld(u), vt.data(), get_ld(vt), &bufferSize_T, -1, - rwork.data(), info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + if constexpr (has_C_layout) { + gesvd_call('A', 'A', n, m, a.data(), get_ld(a), s.data(), vt.data(), get_ld(vt), u.data(), get_ld(u), &tmp_lwork, -1, rwork.data(), info); + } else { + gesvd_call('A', 'A', m, n, a.data(), get_ld(a), s.data(), u.data(), get_ld(u), vt.data(), get_ld(vt), &tmp_lwork, -1, rwork.data(), info); + } + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); // allocate work buffer and perform actual library call - nda::array>> work(bufferSize); - gesvd_call('A', 'A', a.extent(0), a.extent(1), a.data(), get_ld(a), s.data(), u.data(), get_ld(u), vt.data(), get_ld(vt), work.data(), bufferSize, - rwork.data(), info); + array>> work(lwork); + if constexpr (has_C_layout) { + gesvd_call('A', 'A', n, m, a.data(), get_ld(a), s.data(), vt.data(), get_ld(vt), u.data(), get_ld(u), work.data(), lwork, rwork.data(), info); + } else { + gesvd_call('A', 'A', m, n, a.data(), get_ld(a), s.data(), u.data(), get_ld(u), vt.data(), get_ld(vt), work.data(), lwork, rwork.data(), info); + } - if (info) NDA_RUNTIME_ERROR << "Error in nda::lapack::gesvd: info = " << info; return info; } diff --git a/c++/nda/lapack/getrf.hpp b/c++/nda/lapack/getrf.hpp index ca61aa9c5..48324c7f3 100644 --- a/c++/nda/lapack/getrf.hpp +++ b/c++/nda/lapack/getrf.hpp @@ -11,11 +11,17 @@ #pragma once #include "./interface/cxx_interface.hpp" +#include 
"../basic_functions.hpp" #include "../concepts.hpp" +#include "../layout_transforms.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" +#ifndef NDA_HAVE_DEVICE +#include "../device.hpp" +#endif // NDA_HAVE_DEVICE + #include #include @@ -25,36 +31,39 @@ namespace nda::lapack { * @ingroup linalg_lapack * @brief Interface to the LAPACK `getrf` routine. * - * @details Computes an LU factorization of a general m-by-n matrix \f$ \mathbf{A} \f$ using partial pivoting with row - * interchanges. + * @details Computes an LU factorization of a general \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$ using partial + * pivoting with row interchanges. * * The factorization has the form * \f[ * \mathbf{A} = \mathbf{P L U} * \f] * where \f$ \mathbf{P} \f$ is a permutation matrix, \f$ \mathbf{L} \f$ is lower triangular with unit diagonal - * elements (lower trapezoidal if `m > n`), and \f$ \mathbf{U} \f$ is upper triangular (upper trapezoidal if `m < n`). + * elements (lower trapezoidal if \f$ m > n \f$), and \f$ \mathbf{U} \f$ is upper triangular (upper trapezoidal if \f$ + * m < n \f$). * * This is the right-looking Level 3 BLAS version of the algorithm. * * @tparam A nda::MemoryMatrix type. * @tparam IPIV nda::MemoryVector type. - * @param a Input/output matrix. On entry, the m-by-n matrix to be factored. On exit, the factors \f$ \mathbf{L} \f$ - * and \f$ \mathbf{U} \f$ from the factorization \f$ \mathbf{A} = \mathbf{P L U} \f$; the unit diagonal elements of - * \f$ \mathbf{L} \f$ are not stored. - * @param ipiv Output vector. The pivot indices from `getrf`, i.e. for `1 <= i <= n`, row i of the matrix was - * interchanged with row `ipiv(i)`. + * @param a Input/output matrix. On entry, the \f$ m \times n \f$ matrix to be factored. On exit, the factors \f$ + * \mathbf{L} \f$ and \f$ \mathbf{U} \f$ from the factorization \f$ \mathbf{A} = \mathbf{P L U} \f$; the unit diagonal + * elements of \f$ \mathbf{L} \f$ are not stored. + * @param ipiv Output vector. The pivot indices, i.e. for \f$ 1 \leq i \leq \min(m,n) \f$, row \f$ i \f$ of the matrix + * was interchanged with row `ipiv(i-1)`. * @return Integer return code from the LAPACK call. */ template - requires(mem::have_compatible_addr_space and is_blas_lapack_v>) + requires(mem::have_compatible_addr_space and is_blas_lapack_v> and std::is_same_v, int>) int getrf(A &&a, IPIV &&ipiv) { // NOLINT (temporary views are allowed here) - static_assert(std::is_same_v, int>, "Error in nda::lapack::getri: Pivoting array must have elements of type int"); + // for C-layout arrays/views, call getrf with the transpose + if constexpr (has_C_layout) return getrf(transpose(a), ipiv); - auto dm = std::min(a.extent(0), a.extent(1)); - if (ipiv.size() < dm) ipiv.resize(dm); // ipiv needs to be a regular array? 
+ // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + resize_or_check_if_view(ipiv, {std::min(m, n)}); - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(ipiv.indexmap().min_stride() == 1); @@ -64,15 +73,16 @@ namespace nda::lapack { #endif #endif + // perform actual library call int info = 0; if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) - device::getrf(a.extent(0), a.extent(1), a.data(), get_ld(a), ipiv.data(), info); + device::getrf(m, n, a.data(), get_ld(a), ipiv.data(), info); #else compile_error_no_gpu(); #endif } else { - f77::getrf(a.extent(0), a.extent(1), a.data(), get_ld(a), ipiv.data(), info); + f77::getrf(m, n, a.data(), get_ld(a), ipiv.data(), info); } return info; } diff --git a/c++/nda/lapack/getri.hpp b/c++/nda/lapack/getri.hpp index d224a664e..53d9f600f 100644 --- a/c++/nda/lapack/getri.hpp +++ b/c++/nda/lapack/getri.hpp @@ -11,14 +11,13 @@ #pragma once #include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" -#include #include #include #include @@ -29,7 +28,8 @@ namespace nda::lapack { * @ingroup linalg_lapack * @brief Interface to the LAPACK `getri` routine. * - * @details Computes the inverse of a matrix using the LU factorization computed by `getrf`. + * @details Computes the inverse of an \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ matrix using the LU factorization + * computed by nda::lapack::getrf. * * This method inverts \f$ \mathbf{U} \f$ and then computes \f$ \mathrm{inv}(\mathbf{A}) \f$ by solving the system * \f$ \mathrm{inv}(\mathbf{A}) L = \mathrm{inv}(\mathbf{U}) \f$ for \f$ \mathrm{inv}(\mathbf{A}) \f$. @@ -37,48 +37,40 @@ namespace nda::lapack { * @tparam A nda::MemoryMatrix type. * @tparam IPIV nda::MemoryVector type. * @param a Input/output matrix. On entry, the factors \f$ \mathbf{L} \f$ and \f$ \mathbf{U} \f$ from the - * factorization \f$ \mathbf{A} = \mathbf{P L U} \f$ as computed by `getrf`. On exit, if `INFO == 0`, the inverse of - * the original matrix \f$ \mathbf{A} \f$. - * @param ipiv Input vector. The pivot indices from `getrf`, i.e. for `1 <= i <= N`, row i of the matrix was - * interchanged with row `ipiv(i)`. + * factorization \f$ \mathbf{A} = \mathbf{P L U} \f$ as computed by nda::lapack::getrf. On exit, if `INFO == 0`, the + * inverse of the original matrix \f$ \mathbf{A} \f$. + * @param ipiv Input vector. The pivot indices from nda::lapack::getrf, i.e. for \f$ 1 \leq i \leq n \f$, row i of the + * matrix was interchanged with row `ipiv(i)`. * @return Integer return code from the LAPACK call. 
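[Editor's note, not part of the patch] A hedged sketch that chains getrf and getri to invert a small matrix in place, as described above; container spellings and the umbrella header are assumptions from the rest of the library:

#include <nda/nda.hpp> // assumed umbrella header

int main() {
  nda::matrix<double, nda::F_layout> a(2, 2);
  a(0, 0) = 4; a(0, 1) = 3;
  a(1, 0) = 6; a(1, 1) = 3;

  nda::array<int, 1> ipiv; // pivot indices, resized inside getrf
  int info = nda::lapack::getrf(a, ipiv);            // a now holds the LU factors
  if (info == 0) info = nda::lapack::getri(a, ipiv); // a now holds the inverse of the original matrix
  return info;
}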
*/ template - requires(mem::have_compatible_addr_space and is_blas_lapack_v>) + requires(mem::have_host_compatible_addr_space and is_blas_lapack_v> and std::is_same_v, int>) int getri(A &&a, IPIV const &ipiv) { // NOLINT (temporary views are allowed here) - static_assert(std::is_same_v, int>, "Error in nda::lapack::getri: Pivoting array must have elements of type int"); - auto dm = std::min(a.extent(0), a.extent(1)); + // check the dimensions of the input/output arrays/views + auto const [m, n] = a.shape(); + EXPECTS(m == n); + EXPECTS(ipiv.size() == n); - if (ipiv.size() < dm) - NDA_RUNTIME_ERROR << "Error in nda::lapack::getri: Pivot index array size " << ipiv.size() << " smaller than required size " << dm; - - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(ipiv.indexmap().min_stride() == 1); - int info = 0; - if constexpr (mem::have_device_compatible_addr_space) { -#if defined(NDA_HAVE_DEVICE) - device::getri(a.extent(0), a.data(), get_ld(a), ipiv.data(), NULL, 0, info); -#else - compile_error_no_gpu(); -#endif - } else { - // first call to get the optimal buffersize - using value_type = get_value_t; - value_type bufferSize_T{}; - f77::getri(a.extent(0), a.data(), get_ld(a), ipiv.data(), &bufferSize_T, -1, info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + // first call to get the optimal buffer size + using value_type = get_value_t; + int info = 0; + value_type tmp_lwork{}; + f77::getri(n, a.data(), get_ld(a), ipiv.data(), &tmp_lwork, -1, info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); - // allocate work buffer and perform actual library call - array work(bufferSize); + // allocate work buffer and perform actual library call + array work(lwork); #if defined(__has_feature) #if __has_feature(memory_sanitizer) - work = 0; + work = 0; #endif #endif - f77::getri(a.extent(0), a.data(), get_ld(a), ipiv.data(), work.data(), bufferSize, info); - } + f77::getri(n, a.data(), get_ld(a), ipiv.data(), work.data(), lwork, info); + return info; } diff --git a/c++/nda/lapack/getrs.hpp b/c++/nda/lapack/getrs.hpp index 8a5dfdb11..56171a2e6 100644 --- a/c++/nda/lapack/getrs.hpp +++ b/c++/nda/lapack/getrs.hpp @@ -12,6 +12,7 @@ #include "./interface/cxx_interface.hpp" #include "../concepts.hpp" +#include "../declarations.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" @@ -20,7 +21,6 @@ #include "../device.hpp" #endif // NDA_HAVE_DEVICE -#include #include namespace nda::lapack { @@ -30,50 +30,59 @@ namespace nda::lapack { * @brief Interface to the LAPACK `getrs` routine. * * @details Solves a system of linear equations + * - \f$ \mathbf{A X} = \mathbf{B} \f$ or + * - \f$ \mathbf{A}^* \mathbf{X} = \mathbf{B} \f$. * - * - \f$ \mathbf{A X} = \mathbf{B} \f$, - * - \f$ \mathbf{A}^T \mathbf{X} = \mathbf{B} \f$ or - * - \f$ \mathbf{A}^H \mathbf{X} = \mathbf{B} \f$ + * with a general \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ using the LU factorization computed by + * nda::lapack::getrf. * - * with a general n-by-n matrix \f$ \mathbf{A} \f$ using the LU factorization computed by `getrf`. - * - * @tparam A nda::MemoryMatrix type. - * @tparam B nda::MemoryMatrix type. + * @tparam A nda::Matrix type. + * @tparam B nda::MemoryArray type. * @tparam IPIV nda::MemoryVector type. * @param a Input matrix. The factors \f$ \mathbf{L} \f$ and \f$ \mathbf{U} \f$ from the factorization \f$ \mathbf{A} - * = \mathbf{P L U} \f$ as computed by `getrf`. 
+ * = \mathbf{P L U} \f$ as computed by nda::lapack::getrf. * @param b Input/output matrix. On entry, the right hand side matrix \f$ \mathbf{B} \f$. On exit, the solution matrix * \f$ \mathbf{X} \f$. - * @param ipiv Input vector. The pivot indices from `getrf`, i.e. for `1 <= i <= n`, row i of the matrix was - * interchanged with row `ipiv(i)`. + * @param ipiv Input vector. The pivot indices from nda::lapack::getrf, i.e. for \f$ 1 \leq i \leq n \f$, row i of the + * matrix was interchanged with row `ipiv(i)`. * @return Integer return code from the LAPACK call. */ - template - requires(have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) + template + requires((MemoryMatrix or is_conj_array_expr) + and have_same_value_type_v and mem::have_compatible_addr_space and is_blas_lapack_v>) int getrs(A const &a, B &&b, IPIV const &ipiv) { // NOLINT (temporary views are allowed here) static_assert(std::is_same_v, int>, "Error in nda::lapack::getrs: Pivoting array must have elements of type int"); - EXPECTS(ipiv.size() >= std::min(a.extent(0), a.extent(1))); + static_assert(get_rank == 1 || get_rank == 2, "Error in nda::lapack::getrs: Right hand side must have rank 1 or 2"); + static_assert(has_F_layout, "Error in nda::lapack::getrs: B must have Fortran layout"); + + // get underlying matrix in case it is given as a lazy conjugate expression + auto &a_mat = get_array(a); - // must be lapack compatible - EXPECTS(a.indexmap().min_stride() == 1); + // check the dimensions of the input/output arrays/views + EXPECTS(a_mat.shape()[0] == a_mat.shape()[1]); + EXPECTS(b.extent(0) == a_mat.shape()[0]); + EXPECTS(ipiv.size() == a_mat.shape()[0]); + + // arrays/views must be LAPACK compatible + EXPECTS(a_mat.indexmap().min_stride() == 1); EXPECTS(b.indexmap().min_stride() == 1); EXPECTS(ipiv.indexmap().min_stride() == 1); - // check for lazy expressions - static constexpr bool conj_A = is_conj_array_expr; - char op_a = get_op>; + // check for conjugate lazy expressions and C-layouts + char op_a = get_op, has_C_layout>; // perform actual library call int info = 0; if constexpr (mem::have_device_compatible_addr_space) { #if defined(NDA_HAVE_DEVICE) - device::getrs(op_a, get_ncols(a), get_ncols(b), a.data(), get_ld(a), ipiv.data(), b.data(), get_ld(b), info); + device::getrs(op_a, get_ncols(a_mat), get_ncols(b), a_mat.data(), get_ld(a_mat), ipiv.data(), b.data(), get_ld(b), info); #else compile_error_no_gpu(); #endif } else { - f77::getrs(op_a, get_ncols(a), get_ncols(b), a.data(), get_ld(a), ipiv.data(), b.data(), get_ld(b), info); + f77::getrs(op_a, get_ncols(a_mat), get_ncols(b), a_mat.data(), get_ld(a_mat), ipiv.data(), b.data(), get_ld(b), info); } + return info; } diff --git a/c++/nda/lapack/gtsv.hpp b/c++/nda/lapack/gtsv.hpp index ed995cf9f..354c4583f 100644 --- a/c++/nda/lapack/gtsv.hpp +++ b/c++/nda/lapack/gtsv.hpp @@ -12,6 +12,7 @@ #include "./interface/cxx_interface.hpp" #include "../concepts.hpp" +#include "../declarations.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" #include "../traits.hpp" @@ -26,41 +27,43 @@ namespace nda::lapack { * \f[ * \mathbf{A} \mathbf{X} = \mathbf{B}, * \f] - * where \f$ \mathbf{A} \f$ is an n-by-n tridiagonal matrix, by Gaussian elimination with partial pivoting. + * where \f$ \mathbf{A} \f$ is an \f$ n \times n \f$ tridiagonal matrix, by Gaussian elimination with partial + * pivoting. 
* - * Note that the equation \f$ \mathbf{A}^T \mathbf{X} = \mathbf{B} \f$ may be solved by interchanging the order of the + * Note that the equation \f$ \mathbf{A}^H \mathbf{X} = \mathbf{B} \f$ may be solved by interchanging the order of the * arguments containing the subdiagonal elements. * * @tparam DL nda::MemoryVector type. * @tparam D nda::MemoryVector type. * @tparam DU nda::MemoryVector type. * @tparam B nda::MemoryArray type. - * @param dl Input/Output vector. On entry, it must contain the (n-1) subdiagonal elements of \f$ \mathbf{A} \f$. On - * exit, it is overwritten by the (n-2) elements of the second superdiagonal of the upper triangular matrix - * \f$ \mathbf{U} \f$ from the LU factorization of \f$ \mathbf{A} \f$. + * @param dl Input/Output vector. On entry, it must contain the \f$ n - 1 \f$ subdiagonal elements of \f$ \mathbf{A} + * \f$. On exit, it is overwritten by the \f$ n - 2 \f$ elements of the second superdiagonal of the upper triangular + * matrix \f$ \mathbf{U} \f$ from the LU factorization of \f$ \mathbf{A} \f$. * @param d Input/Output vector. On entry, it must contain the diagonal elements of \f$ \mathbf{A} \f$. On exit, it is - * overwritten by the n diagonal elements of \f$ \mathbf{U} \f$. - * @param du Input/Output vector. On entry, it must contain the (n-1) superdiagonal elements of \f$ \mathbf{A} \f$. On - * exit, it is overwritten by the (n-1) elements of the first superdiagonal of \f$ \mathbf{U} \f$ . - * @param b Input/Output array. On entry, the n-by-nrhs right hand side matrix \f$ \mathbf{B} \f$. On exit, if - * `INFO == 0`, the n-by-nrhs solution matrix \f$ \mathbf{X} \f$. + * overwritten by the \f$ n \f$ diagonal elements of \f$ \mathbf{U} \f$. + * @param du Input/Output vector. On entry, it must contain the \f$ n - 1 \f$ superdiagonal elements of \f$ \mathbf{A} + * \f$. On exit, it is overwritten by the \f$ n - 1 \f$ elements of the first superdiagonal of \f$ \mathbf{U} \f$ . + * @param b Input/Output array. On entry, the \f$ n \times n_{\mathrm{rhs}} \f$ right hand side matrix \f$ \mathbf{B} + * \f$. On exit, if `INFO == 0`, the \f$ n \times n_{\mathrm{rhs}} \f$ solution matrix \f$ \mathbf{X} \f$. * @return Integer return code from the LAPACK call. */ template - requires(have_same_value_type_v and mem::on_host and is_blas_lapack_v>) + requires(have_same_value_type_v and mem::have_host_compatible_addr_space and is_blas_lapack_v>) int gtsv(DL &&dl, D &&d, DU &&du, B &&b) { // NOLINT (temporary views are allowed here) - static_assert((get_rank == 1 or get_rank == 2), "Error in nda::lapack::gtsv: B must be an matrix/array/view of rank 1 or 2"); + static_assert((get_rank == 1 or get_rank == 2), "Error in nda::lapack::gtsv: B must be a matrix/array/view of rank 1 or 2"); + static_assert(has_F_layout, "Error in nda::lapack::gtsv: B must have Fortran layout"); - // get and check dimensions of input arrays - EXPECTS(dl.extent(0) == d.extent(0) - 1); // "gtsv : dimension mismatch between sub-diagonal and diagonal vectors " - EXPECTS(du.extent(0) == d.extent(0) - 1); // "gtsv : dimension mismatch between super-diagonal and diagonal vectors " - EXPECTS(b.extent(0) == d.extent(0)); // "gtsv : dimension mismatch between diagonal vector and RHS matrix, " + // check the dimensions of the input/output arrays/views + auto const n = d.size(); + EXPECTS(dl.size() == n - 1); + EXPECTS(du.size() == n - 1); + EXPECTS(b.extent(0) == n); // perform actual library call - int N = d.extent(0); - int NRHS = (get_rank == 2 ? 
b.extent(1) : 1); int info = 0; - f77::gtsv(N, NRHS, dl.data(), d.data(), du.data(), b.data(), N, info); + f77::gtsv(n, (get_rank == 2 ? b.extent(1) : 1), dl.data(), d.data(), du.data(), b.data(), get_ld(b), info); + return info; } diff --git a/c++/nda/lapack/heev.hpp b/c++/nda/lapack/heev.hpp new file mode 100644 index 000000000..4b986d558 --- /dev/null +++ b/c++/nda/lapack/heev.hpp @@ -0,0 +1,80 @@ +// Copyright (c) 2024--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic interface to the LAPACK `heev` routine. + */ + +#pragma once + +#include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../traits.hpp" + +#include +#include +#include +#include + +namespace nda::lapack { + + /** + * @ingroup linalg_lapack + * @brief Interface to the LAPACK `heev` routine. + * + * @details Computes all eigenvalues \f$ \lambda_i \f$ and, optionally, eigenvectors \f$ \mathbf{v}_i \f$ of a complex + * hermitian matrix eigenvalue problem of the form + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; , + * \f] + * for a given complex hermitian matrix \f$ \mathbf{A} \f$. + * + * @tparam A nda::MemoryMatrix with complex value type. + * @param a Input/output matrix. On entry, the hermitian matrix \f$ \mathbf{A} \f$. On exit, if `jobz = V`, \f$ + * \mathbf{A} \f$ contains the orthonormal eigenvectors of the matrix \f$ \mathbf{A} \f$. If `jobz = N`, then on + * exit \f$ \mathbf{A} \f$ is destroyed. + * @param w Output vector. The eigenvalues in ascending order. + * @param jobz Character indicating whether to compute eigenvectors and eigenvalues ('V') or eigenvalues only ('N'). + * @return Integer return code from the LAPACK call. 
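A minimal usage sketch for the new heev wrapper documented above, illustrative only; the header and nda::F_layout spellings are assumptions:

    // illustrative sketch only: eigenvalues/eigenvectors of a Hermitian matrix via the heev wrapper
    #include <nda/nda.hpp>
    #include <complex>

    int main() {
      using namespace std::complex_literals;
      auto A = nda::matrix<std::complex<double>, nda::F_layout>{{2.0 + 0.0i, 1.0 - 1.0i}, {1.0 + 1.0i, 3.0 + 0.0i}};
      auto w = nda::vector<double>{}; // eigenvalues, resized by the wrapper

      // jobz = 'V': on exit the columns of A hold the orthonormal eigenvectors
      int info = nda::lapack::heev(A, w, 'V');
      return info;
    }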
+ */ + template + requires(mem::have_host_compatible_addr_space and is_complex_v> and std::same_as, get_value_t>) + int heev(A &&a, W &&w, char jobz = 'V') { // NOLINT (temporary views are allowed here) + static_assert(has_F_layout, "Error in nda::lapack::heev: A must have Fortran layout"); + + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + EXPECTS(m == n); + resize_or_check_if_view(w, {n}); + + // arrays/views must be LAPACK compatible + EXPECTS(a.indexmap().min_stride() == 1); + EXPECTS(w.indexmap().min_stride() == 1); + + // check other input parameters for consistency + EXPECTS(jobz == 'V' or jobz == 'N'); + + // first call to get the optimal buffer size + using fp_type = get_value_t; + array rwork(std::max(1l, 3 * n - 2)); + std::complex tmp_lwork{}; + int info = 0; + lapack::f77::heev(jobz, 'U', n, a.data(), get_ld(a), w.data(), &tmp_lwork, -1, rwork.data(), info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); + + // allocate work buffer and perform actual library call + array, 1> work(lwork); + lapack::f77::heev(jobz, 'U', n, a.data(), get_ld(a), w.data(), work.data(), lwork, rwork.data(), info); + + return info; + } + +} // namespace nda::lapack diff --git a/c++/nda/lapack/hegv.hpp b/c++/nda/lapack/hegv.hpp new file mode 100644 index 000000000..3988f2f51 --- /dev/null +++ b/c++/nda/lapack/hegv.hpp @@ -0,0 +1,93 @@ +// Copyright (c) 2024--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic interface to the LAPACK `hegv` routine. + */ + +#pragma once + +#include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../traits.hpp" + +#include +#include +#include +#include + +namespace nda::lapack { + + /** + * @ingroup linalg_lapack + * @brief Interface to the LAPACK `hegv` routine. + * + * @details Computes all eigenvalues \f$ \lambda_i \f$ and, optionally, eigenvectors \f$ \mathbf{v}_i \f$ of a complex + * generalized Hermitian-definite eigenvalue problem of the form + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * Here \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are assumed to be Hermitian and \f$ \mathbf{B} \f$ is also positive + * definite. + * + * @tparam A nda::MemoryMatrix with complex value type. + * @tparam B nda::MemoryMatrix with complex value type. + * @param a Input/output matrix. On entry, the Hermitian matrix \f$ \mathbf{A} \f$. On exit, if `jobz = V`, \f$ + * \mathbf{A} \f$ contains the matrix \f$ \mathbf{V} \f$ of normalized eigenvectors such that \f$ \mathbf{V}^H + * \mathbf{B} \mathbf{V} = \mathbf{I} \f$ (if `itype = 1` or `itype = 2`) or \f$ \mathbf{V}^H \mathbf{B}^{-1} + * \mathbf{V} = \mathbf{I} \f$ (if `itype = 3`). If `jobz = N`, then on exit \f$ \mathbf{A} \f$ is destroyed. + * @param b Input/output matrix. On entry, the symmetric positive definite matrix \f$ \mathbf{B} \f$. 
On exit, the + * part of \f$ \mathbf{B} \f$ containing the matrix is overwritten by the triangular factor \f$ \mathbf{U} \f$ or + * \f$ \mathbf{L} \f$ from a Cholesky factorization. + * @param w Output vector. The eigenvalues in ascending order. + * @param jobz Character indicating whether to compute eigenvectors and eigenvalues ('V') or eigenvalues only ('N'). + * @param itype Specifies the problem to be solved. + * @return Integer return code from the LAPACK call. + */ + template + requires(mem::have_host_compatible_addr_space and is_complex_v> and have_same_value_type_v + and std::same_as, get_value_t>) + int hegv(A &&a, B &&b, W &&w, char jobz = 'V', int itype = 1) { // NOLINT (temporary views are allowed here) + static_assert(has_F_layout and has_F_layout, "Error in nda::lapack::hegv: A and B must have Fortran layout"); + + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + EXPECTS(m == n); + EXPECTS(m == b.shape()[0]); + EXPECTS(n == b.shape()[1]); + resize_or_check_if_view(w, {n}); + + // arrays/views must be LAPACK compatible + EXPECTS(a.indexmap().min_stride() == 1); + EXPECTS(b.indexmap().min_stride() == 1); + EXPECTS(w.indexmap().min_stride() == 1); + + // check other input parameters for consistency + EXPECTS(itype == 1 or itype == 2 or itype == 3); + EXPECTS(jobz == 'V' or jobz == 'N'); + + // first call to get the optimal buffer size + using fp_type = get_fp_t; + array rwork(std::max(1l, 3 * n - 2)); + std::complex tmp_lwork{}; + int info = 0; + lapack::f77::hegv(itype, jobz, 'U', n, a.data(), get_ld(a), b.data(), get_ld(b), w.data(), &tmp_lwork, -1, rwork.data(), info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); + + // allocate work buffer and perform actual library call + array, 1> work(lwork); + lapack::f77::hegv(itype, jobz, 'U', n, a.data(), get_ld(a), b.data(), get_ld(b), w.data(), work.data(), lwork, rwork.data(), info); + + return info; + } + +} // namespace nda::lapack diff --git a/c++/nda/lapack/interface/cusolver_interface.hpp b/c++/nda/lapack/interface/cusolver_interface.hpp index c8e40d326..368a688eb 100644 --- a/c++/nda/lapack/interface/cusolver_interface.hpp +++ b/c++/nda/lapack/interface/cusolver_interface.hpp @@ -26,9 +26,6 @@ namespace nda::lapack::device { void getrf(int M, int N, double *A, int LDA, int *ipiv, int &info); void getrf(int M, int N, dcomplex *A, int LDA, int *ipiv, int &info); - void getri(int N, double *A, int LDA, int *ipiv, double *WORK, int LWORK, int &info); - void getri(int N, dcomplex *A, int LDA, int *ipiv, dcomplex *WORK, int LWORK, int &info); - void getrs(char op, int N, int NRHS, double const *A, int LDA, int const *ipiv, double *B, int LDB, int &info); void getrs(char op, int N, int NRHS, dcomplex const *A, int LDA, int const *ipiv, dcomplex *B, int LDB, int &info); diff --git a/c++/nda/lapack/interface/cxx_interface.cpp b/c++/nda/lapack/interface/cxx_interface.cpp index bb50487cf..deb8d49cb 100644 --- a/c++/nda/lapack/interface/cxx_interface.cpp +++ b/c++/nda/lapack/interface/cxx_interface.cpp @@ -16,6 +16,14 @@ namespace nda::lapack::f77 { + void gelss(int M, int N, int NRHS, float *A, int LDA, float *B, int LDB, float *S, float RCOND, int &RANK, float *WORK, int LWORK, + [[maybe_unused]] float *RWORK, int &INFO) { + LAPACK_sgelss(&M, &N, &NRHS, A, &LDA, B, &LDB, S, &RCOND, &RANK, WORK, &LWORK, &INFO); + } + void gelss(int M, int N, int NRHS, std::complex *A, int LDA, std::complex *B, int LDB, float *S, float RCOND, int &RANK, + std::complex 
*WORK, int LWORK, float *RWORK, int &INFO) { + LAPACK_cgelss(&M, &N, &NRHS, A, &LDA, B, &LDB, S, &RCOND, &RANK, WORK, &LWORK, RWORK, &INFO); + } void gelss(int M, int N, int NRHS, double *A, int LDA, double *B, int LDB, double *S, double RCOND, int &RANK, double *WORK, int LWORK, [[maybe_unused]] double *RWORK, int &INFO) { LAPACK_dgelss(&M, &N, &NRHS, A, &LDA, B, &LDB, S, &RCOND, &RANK, WORK, &LWORK, &INFO); @@ -25,6 +33,14 @@ namespace nda::lapack::f77 { LAPACK_zgelss(&M, &N, &NRHS, A, &LDA, B, &LDB, S, &RCOND, &RANK, WORK, &LWORK, RWORK, &INFO); } + void gesvd(char JOBU, char JOBVT, int M, int N, float *A, int LDA, float *S, float *U, int LDU, float *VT, int LDVT, float *WORK, int LWORK, + [[maybe_unused]] float *RWORK, int &INFO) { + LAPACK_sgesvd(&JOBU, &JOBVT, &M, &N, A, &LDA, S, U, &LDU, VT, &LDVT, WORK, &LWORK, &INFO); + } + void gesvd(char JOBU, char JOBVT, int M, int N, std::complex *A, int LDA, float *S, std::complex *U, int LDU, std::complex *VT, + int LDVT, std::complex *WORK, int LWORK, float *RWORK, int &INFO) { + LAPACK_cgesvd(&JOBU, &JOBVT, &M, &N, A, &LDA, S, U, &LDU, VT, &LDVT, WORK, &LWORK, RWORK, &INFO); + } void gesvd(char JOBU, char JOBVT, int M, int N, double *A, int LDA, double *S, double *U, int LDU, double *VT, int LDVT, double *WORK, int LWORK, [[maybe_unused]] double *RWORK, int &INFO) { LAPACK_dgesvd(&JOBU, &JOBVT, &M, &N, A, &LDA, S, U, &LDU, VT, &LDVT, WORK, &LWORK, &INFO); @@ -34,6 +50,13 @@ namespace nda::lapack::f77 { LAPACK_zgesvd(&JOBU, &JOBVT, &M, &N, A, &LDA, S, U, &LDU, VT, &LDVT, WORK, &LWORK, RWORK, &INFO); } + void geqp3(int M, int N, float *A, int LDA, int *JPVT, float *TAU, float *WORK, int LWORK, [[maybe_unused]] float *RWORK, int &INFO) { + LAPACK_sgeqp3(&M, &N, A, &LDA, JPVT, TAU, WORK, &LWORK, &INFO); + } + void geqp3(int M, int N, std::complex *A, int LDA, int *JPVT, std::complex *TAU, std::complex *WORK, int LWORK, float *RWORK, + int &INFO) { + LAPACK_cgeqp3(&M, &N, A, &LDA, JPVT, TAU, WORK, &LWORK, RWORK, &INFO); + } void geqp3(int M, int N, double *A, int LDA, int *JPVT, double *TAU, double *WORK, int LWORK, [[maybe_unused]] double *RWORK, int &INFO) { LAPACK_dgeqp3(&M, &N, A, &LDA, JPVT, TAU, WORK, &LWORK, &INFO); } @@ -42,17 +65,29 @@ namespace nda::lapack::f77 { LAPACK_zgeqp3(&M, &N, A, &LDA, JPVT, TAU, WORK, &LWORK, RWORK, &INFO); } + void orgqr(int M, int N, int K, float *A, int LDA, float *TAU, float *WORK, int LWORK, int &INFO) { + LAPACK_sorgqr(&M, &N, &K, A, &LDA, TAU, WORK, &LWORK, &INFO); + } void orgqr(int M, int N, int K, double *A, int LDA, double *TAU, double *WORK, int LWORK, int &INFO) { LAPACK_dorgqr(&M, &N, &K, A, &LDA, TAU, WORK, &LWORK, &INFO); } + void ungqr(int M, int N, int K, std::complex *A, int LDA, std::complex *TAU, std::complex *WORK, int LWORK, int &INFO) { + LAPACK_cungqr(&M, &N, &K, A, &LDA, TAU, WORK, &LWORK, &INFO); + } void ungqr(int M, int N, int K, std::complex *A, int LDA, std::complex *TAU, std::complex *WORK, int LWORK, int &INFO) { LAPACK_zungqr(&M, &N, &K, A, &LDA, TAU, WORK, &LWORK, &INFO); } + void getrf(int M, int N, float *A, int LDA, int *ipiv, int &info) { LAPACK_sgetrf(&M, &N, A, &LDA, ipiv, &info); } + void getrf(int M, int N, std::complex *A, int LDA, int *ipiv, int &info) { LAPACK_cgetrf(&M, &N, A, &LDA, ipiv, &info); } void getrf(int M, int N, double *A, int LDA, int *ipiv, int &info) { LAPACK_dgetrf(&M, &N, A, &LDA, ipiv, &info); } void getrf(int M, int N, std::complex *A, int LDA, int *ipiv, int &info) { LAPACK_zgetrf(&M, &N, A, &LDA, ipiv, &info); } + void getri(int N, float *A, 
int LDA, int const *ipiv, float *work, int lwork, int &info) { LAPACK_sgetri(&N, A, &LDA, ipiv, work, &lwork, &info); } + void getri(int N, std::complex *A, int LDA, int const *ipiv, std::complex *work, int lwork, int &info) { + LAPACK_cgetri(&N, A, &LDA, ipiv, work, &lwork, &info); + } void getri(int N, double *A, int LDA, int const *ipiv, double *work, int lwork, int &info) { LAPACK_dgetri(&N, A, &LDA, ipiv, work, &lwork, &info); } @@ -68,15 +103,43 @@ namespace nda::lapack::f77 { void stev(char J, int N, double *D, double *E, double *Z, int ldz, double *work, int &info) { LAPACK_dstev(&J, &N, D, E, Z, &ldz, work, &info); } - void syev(char JOBZ, char UPLO, int N, double *A, int LDA, double *W, double *work, int &lwork, int &info) { + void syev(char JOBZ, char UPLO, int N, float *A, int LDA, float *W, float *work, int lwork, int &info) { + LAPACK_ssyev(&JOBZ, &UPLO, &N, A, &LDA, W, work, &lwork, &info); + } + void syev(char JOBZ, char UPLO, int N, double *A, int LDA, double *W, double *work, int lwork, int &info) { LAPACK_dsyev(&JOBZ, &UPLO, &N, A, &LDA, W, work, &lwork, &info); } - void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, double *W, std::complex *work, int &lwork, double *work2, + void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, float *W, std::complex *work, int lwork, float *rwork, int &info) { + LAPACK_cheev(&JOBZ, &UPLO, &N, A, &LDA, W, work, &lwork, rwork, &info); + } + void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, double *W, std::complex *work, int lwork, double *rwork, int &info) { - LAPACK_zheev(&JOBZ, &UPLO, &N, A, &LDA, W, work, &lwork, work2, &info); + LAPACK_zheev(&JOBZ, &UPLO, &N, A, &LDA, W, work, &lwork, rwork, &info); } + void sygv(int ITYPE, char JOBZ, char UPLO, int N, float *A, int LDA, float *B, int LDB, float *W, float *work, int lwork, int &info) { + LAPACK_ssygv(&ITYPE, &JOBZ, &UPLO, &N, A, &LDA, B, &LDB, W, work, &lwork, &info); + } + void sygv(int ITYPE, char JOBZ, char UPLO, int N, double *A, int LDA, double *B, int LDB, double *W, double *work, int lwork, int &info) { + LAPACK_dsygv(&ITYPE, &JOBZ, &UPLO, &N, A, &LDA, B, &LDB, W, work, &lwork, &info); + } + + void hegv(int ITYPE, char JOBZ, char UPLO, int N, std::complex *A, int LDA, std::complex *B, int LDB, float *W, + std::complex *work, int lwork, float *rwork, int &info) { + LAPACK_chegv(&ITYPE, &JOBZ, &UPLO, &N, A, &LDA, B, &LDB, W, work, &lwork, rwork, &info); + } + void hegv(int ITYPE, char JOBZ, char UPLO, int N, std::complex *A, int LDA, std::complex *B, int LDB, double *W, + std::complex *work, int lwork, double *rwork, int &info) { + LAPACK_zhegv(&ITYPE, &JOBZ, &UPLO, &N, A, &LDA, B, &LDB, W, work, &lwork, rwork, &info); + } + + void getrs(char op, int N, int NRHS, float const *A, int LDA, int const *ipiv, float *B, int LDB, int &info) { + LAPACK_sgetrs(&op, &N, &NRHS, A, &LDA, ipiv, B, &LDB, &info); + } + void getrs(char op, int N, int NRHS, std::complex const *A, int LDA, int const *ipiv, std::complex *B, int LDB, int &info) { + LAPACK_cgetrs(&op, &N, &NRHS, A, &LDA, ipiv, B, &LDB, &info); + } void getrs(char op, int N, int NRHS, double const *A, int LDA, int const *ipiv, double *B, int LDB, int &info) { LAPACK_dgetrs(&op, &N, &NRHS, A, &LDA, ipiv, B, &LDB, &info); } diff --git a/c++/nda/lapack/interface/cxx_interface.hpp b/c++/nda/lapack/interface/cxx_interface.hpp index 7b8b75991..c746ba23b 100644 --- a/c++/nda/lapack/interface/cxx_interface.hpp +++ b/c++/nda/lapack/interface/cxx_interface.hpp @@ -20,27 +20,44 @@ namespace 
nda::lapack::f77 { + void gelss(int M, int N, int NRHS, float *A, int LDA, float *B, int LDB, float *S, float RCOND, int &RANK, float *WORK, int LWORK, float *RWORK, + int &INFO); + void gelss(int M, int N, int NRHS, std::complex *A, int LDA, std::complex *B, int LDB, float *S, float RCOND, int &RANK, + std::complex *WORK, int LWORK, float *RWORK, int &INFO); void gelss(int M, int N, int NRHS, double *A, int LDA, double *B, int LDB, double *S, double RCOND, int &RANK, double *WORK, int LWORK, double *RWORK, int &INFO); void gelss(int M, int N, int NRHS, std::complex *A, int LDA, std::complex *B, int LDB, double *S, double RCOND, int &RANK, std::complex *WORK, int LWORK, double *RWORK, int &INFO); + void gesvd(char JOBU, char JOBVT, int M, int N, float *A, int LDA, float *S, float *U, int LDU, float *VT, int LDVT, float *WORK, int LWORK, + float *RWORK, int &INFO); + void gesvd(char JOBU, char JOBVT, int M, int N, std::complex *A, int LDA, float *S, std::complex *U, int LDU, std::complex *VT, + int LDVT, std::complex *WORK, int LWORK, float *RWORK, int &INFO); void gesvd(char JOBU, char JOBVT, int M, int N, double *A, int LDA, double *S, double *U, int LDU, double *VT, int LDVT, double *WORK, int LWORK, double *RWORK, int &INFO); void gesvd(char JOBU, char JOBVT, int M, int N, std::complex *A, int LDA, double *S, std::complex *U, int LDU, std::complex *VT, int LDVT, std::complex *WORK, int LWORK, double *RWORK, int &INFO); + void geqp3(int M, int N, float *A, int LDA, int *JPVT, float *TAU, float *WORK, int LWORK, float *RWORK, int &INFO); + void geqp3(int M, int N, std::complex *A, int LDA, int *JPVT, std::complex *TAU, std::complex *WORK, int LWORK, float *RWORK, + int &INFO); void geqp3(int M, int N, double *A, int LDA, int *JPVT, double *TAU, double *WORK, int LWORK, double *RWORK, int &INFO); void geqp3(int M, int N, std::complex *A, int LDA, int *JPVT, std::complex *TAU, std::complex *WORK, int LWORK, double *RWORK, int &INFO); + void orgqr(int M, int N, int K, float *A, int LDA, float *TAU, float *WORK, int LWORK, int &INFO); void orgqr(int M, int N, int K, double *A, int LDA, double *TAU, double *WORK, int LWORK, int &INFO); + void ungqr(int M, int N, int K, std::complex *A, int LDA, std::complex *TAU, std::complex *WORK, int LWORK, int &INFO); void ungqr(int M, int N, int K, std::complex *A, int LDA, std::complex *TAU, std::complex *WORK, int LWORK, int &INFO); + void getrf(int M, int N, float *A, int LDA, int *ipiv, int &info); + void getrf(int M, int N, std::complex *A, int LDA, int *ipiv, int &info); void getrf(int M, int N, double *A, int LDA, int *ipiv, int &info); void getrf(int M, int N, std::complex *A, int LDA, int *ipiv, int &info); + void getri(int N, float *A, int LDA, int const *ipiv, float *work, int lwork, int &info); + void getri(int N, std::complex *A, int LDA, int const *ipiv, std::complex *work, int lwork, int &info); void getri(int N, double *A, int LDA, int const *ipiv, double *work, int lwork, int &info); void getri(int N, std::complex *A, int LDA, int const *ipiv, std::complex *work, int lwork, int &info); @@ -50,11 +67,23 @@ namespace nda::lapack::f77 { void stev(char J, int N, double *D, double *E, double *Z, int ldz, double *work, int &info); - void syev(char JOBZ, char UPLO, int N, double *A, int LDA, double *W, double *work, int &lwork, int &info); + void syev(char JOBZ, char UPLO, int N, float *A, int LDA, float *W, float *work, int lwork, int &info); + void syev(char JOBZ, char UPLO, int N, double *A, int LDA, double *W, double *work, int lwork, int 
&info); - void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, double *W, std::complex *work, int &lwork, double *work2, + void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, float *W, std::complex *work, int lwork, float *rwork, int &info); + void heev(char JOBZ, char UPLO, int N, std::complex *A, int LDA, double *W, std::complex *work, int lwork, double *rwork, int &info); + void sygv(int ITYPE, char JOBZ, char UPLO, int N, float *A, int LDA, float *B, int LDB, float *W, float *work, int lwork, int &info); + void sygv(int ITYPE, char JOBZ, char UPLO, int N, double *A, int LDA, double *B, int LDB, double *W, double *work, int lwork, int &info); + + void hegv(int ITYPE, char JOBZ, char UPLO, int N, std::complex *A, int LDA, std::complex *B, int LDB, float *W, + std::complex *work, int lwork, float *rwork, int &info); + void hegv(int ITYPE, char JOBZ, char UPLO, int N, std::complex *A, int LDA, std::complex *B, int LDB, double *W, + std::complex *work, int lwork, double *rwork, int &info); + + void getrs(char op, int N, int NRHS, float const *A, int LDA, int const *ipiv, float *B, int LDB, int &info); + void getrs(char op, int N, int NRHS, std::complex const *A, int LDA, int const *ipiv, std::complex *B, int LDB, int &info); void getrs(char op, int N, int NRHS, double const *A, int LDA, int const *ipiv, double *B, int LDB, int &info); void getrs(char op, int N, int NRHS, std::complex const *A, int LDA, int const *ipiv, std::complex *B, int LDB, int &info); @@ -81,4 +110,7 @@ namespace nda::lapack { // See nda::blas::is_conj_array_expr. using blas::is_conj_array_expr; + // See nda::blas::get_array. + using blas::get_array; + } // namespace nda::lapack diff --git a/c++/nda/lapack/orgqr.hpp b/c++/nda/lapack/orgqr.hpp index 45fe75deb..081fb9ff0 100644 --- a/c++/nda/lapack/orgqr.hpp +++ b/c++/nda/lapack/orgqr.hpp @@ -11,17 +11,15 @@ #pragma once #include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" -#include "../layout/policies.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" -#include "../mem/policies.hpp" #include "../traits.hpp" +#include #include -#include #include namespace nda::lapack { @@ -30,47 +28,58 @@ namespace nda::lapack { * @ingroup linalg_lapack * @brief Interface to the LAPACK `orgqr` routine. * - * @details Generates an m-by-n real matrix \f$ \mathbf{Q} \f$ with orthonormal columns, which is defined as the first - * n columns of a product of k elementary reflectors of order m: + * @details Generates an \f$ m \times \min(m,n) = k \f$ real matrix \f$ \mathbf{Q} \f$ with orthonormal columns, which + * is defined as the first \f$ k \f$ columns of a product of \f$ k \f$ elementary reflectors of order \f$ m \f$ * \f[ * \mathbf{Q} = \mathbf{H}(1) \mathbf{H}(2) \ldots \mathbf{H}(k) \; , * \f] - * as returned by `geqrf`. + * as returned by nda::lapack::geqp3. * - * @tparam A nda::MemoryMatrix with double value type. - * @tparam TAU nda::MemoryVector with double value type. - * @param a Input/output matrix. On entry, the i-th column must contain the vector which defines the elementary - * reflector \f$ H(i) \; , i = 1,2,...,k \f$, as returned by `geqrf` in the first k columns. On exit, the m-by-n - * matrix \f$ \mathbf{Q} \f$. - * @param tau Input vector. `tau(i)` must contain the scalar factor of the elementary reflector \f$ \mathbf{H}(i) \f$, - * as returned by `geqrf`. 
+ * Each \f$ \mathbf{H}(i) \f$ has the form + * \f[ + * \mathbf{H}(i) = \mathbf{I} - \tau_i * \mathbf{v}_i \mathbf{v}_i^H + * \f] + * where \f$ \tau_i \f$ is a real scalar, and \f$ \mathbf{v}_i \f$ is a real vector with + * - elements \f$ 1 \f$ to \f$ i - 1 \f$ equal to 0, + * - element \f$ i \f$ equal to 1 and + * - elements \f$ i + 1 \f$ to \f$ m \f$ stored in the elements \f$ i + 1 \f$ to \f$ m \f$ in column \f$ i \f$ of + * matrix \f$ \mathbf{A} \f$. + * + * @tparam A nda::MemoryMatrix with float or double value type. + * @tparam TAU nda::MemoryVector with float or double value type. + * @param a Input/output matrix. On entry, the ith column must contain the vector which defines the + * elementary reflector \f$ H(i) \; , i = 1,2,...,k \f$, as returned by nda::lapack::geqp3 in the first \f$ k \f$ + * columns. On exit, the \f$ m \times \min(m,n) = k \f$ matrix \f$ \mathbf{Q} \f$. + * @param tau Input vector. \f$ \tau_i \f$ must contain the scalar factor of the elementary reflector \f$ + * \mathbf{H}(i) \f$, as returned by nda::lapack::geqp3. * @return Integer return code from the LAPACK call. */ template - requires(mem::on_host and std::is_same_v> and have_same_value_type_v - and mem::have_compatible_addr_space) + requires(mem::have_host_compatible_addr_space and (std::is_same_v> or std::is_same_v>) // + and have_same_value_type_v) int orgqr(A &&a, TAU &&tau) { // NOLINT (temporary views are allowed here) - static_assert(has_F_layout, "Error in nda::lapack::orgqr: C order is not supported"); - static_assert(mem::have_host_compatible_addr_space, "Error in nda::lapack::orgqr: Only CPU is supported"); + static_assert(has_F_layout, "Error in nda::lapack::orgqr: A must have Fortran layout"); + + // check the dimensions of the input/output arrays/views + auto const [m, n] = a.shape(); + auto const k = std::min(m, n); + EXPECTS(tau.size() == k); - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(tau.indexmap().min_stride() == 1); - // first call to get the optimal buffersize + // first call to get the optimal buffer size using value_type = get_value_t; - value_type bufferSize_T{}; - auto [m, n] = a.shape(); - auto k = tau.size(); - int info = 0; - lapack::f77::orgqr(m, std::min(m, n), k, a.data(), get_ld(a), tau.data(), &bufferSize_T, -1, info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + value_type tmp_lwork{}; + int info = 0; + lapack::f77::orgqr(m, k, k, a.data(), get_ld(a), tau.data(), &tmp_lwork, -1, info); + int lwork = static_cast(std::ceil(tmp_lwork)); // allocate work buffer and perform actual library call - nda::array>> work(bufferSize); - lapack::f77::orgqr(m, std::min(m, n), k, a.data(), get_ld(a), tau.data(), work.data(), bufferSize, info); + array work(lwork); + lapack::f77::orgqr(m, k, k, a.data(), get_ld(a), tau.data(), work.data(), lwork, info); - if (info) NDA_RUNTIME_ERROR << "Error in nda::lapack::orgqr: info = " << info; return info; } diff --git a/c++/nda/lapack/syev.hpp b/c++/nda/lapack/syev.hpp new file mode 100644 index 000000000..a64d29b08 --- /dev/null +++ b/c++/nda/lapack/syev.hpp @@ -0,0 +1,78 @@ +// Copyright (c) 2024--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic interface to the LAPACK `syev` routine. 
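The rewritten orgqr wrapper above is normally applied to the output of nda::lapack::geqp3. A hedged sketch follows; the geqp3 wrapper signature (a, jpvt, tau) and the zero-initialization of jpvt are assumptions, as are the header and layout spellings. The complex counterpart nda::lapack::ungqr further down in this patch is used in the same way.

    // illustrative sketch only: build Q from a pivoted QR factorization (geqp3 call is assumed)
    #include <nda/nda.hpp>

    int main() {
      auto A    = nda::matrix<double, nda::F_layout>{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}; // 3 x 2
      auto jpvt = nda::vector<int>(2);
      auto tau  = nda::vector<double>(2); // k = min(m, n) = 2
      jpvt      = 0;

      int info = nda::lapack::geqp3(A, jpvt, tau);      // assumed signature
      if (info == 0) info = nda::lapack::orgqr(A, tau); // first k columns of A now hold Q
      return info;
    }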
+ */ + +#pragma once + +#include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../traits.hpp" + +#include +#include + +namespace nda::lapack { + + /** + * @ingroup linalg_lapack + * @brief Interface to the LAPACK `syev` routine. + * + * @details Computes all eigenvalues \f$ \lambda_i \f$ and, optionally, eigenvectors \f$ \mathbf{v}_i \f$ of a real + * symmetric eigenvalue problem of the form + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; , + * \f] + * for a given real symmetric matrix \f$ \mathbf{A} \f$. + * + * @tparam A nda::MemoryMatrix with float or double value type. + * @param a Input/output matrix. On entry, the symmetric matrix \f$ \mathbf{A} \f$. On exit, if `jobz = V`, \f$ + * \mathbf{A} \f$ contains the orthonormal eigenvectors of the matrix \f$ \mathbf{A} \f$. If `jobz = N`, then on + * exit \f$ \mathbf{A} \f$ is destroyed. + * @param w Output vector. The eigenvalues in ascending order. + * @param jobz Character indicating whether to compute eigenvectors and eigenvalues ('V') or eigenvalues only ('N'). + * @return Integer return code from the LAPACK call. + */ + template + requires(mem::have_host_compatible_addr_space and (std::is_same_v> or std::is_same_v>) + and have_same_value_type_v) + int syev(A &&a, W &&w, char jobz = 'V') { // NOLINT (temporary views are allowed here) + static_assert(has_F_layout, "Error in nda::lapack::syev: A must have Fortran layout"); + + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + EXPECTS(m == n); + resize_or_check_if_view(w, {n}); + + // arrays/views must be LAPACK compatible + EXPECTS(a.indexmap().min_stride() == 1); + EXPECTS(w.indexmap().min_stride() == 1); + + // check other input parameters for consistency + EXPECTS(jobz == 'V' or jobz == 'N'); + + // first call to get the optimal buffer size + using value_type = get_value_t; + value_type tmp_lwork{}; + int info = 0; + lapack::f77::syev(jobz, 'U', n, a.data(), get_ld(a), w.data(), &tmp_lwork, -1, info); + int lwork = static_cast(std::ceil(tmp_lwork)); + + // allocate work buffer and perform actual library call + array work(lwork); + lapack::f77::syev(jobz, 'U', n, a.data(), get_ld(a), w.data(), work.data(), lwork, info); + + return info; + } + +} // namespace nda::lapack diff --git a/c++/nda/lapack/sygv.hpp b/c++/nda/lapack/sygv.hpp new file mode 100644 index 000000000..c0a58cc19 --- /dev/null +++ b/c++/nda/lapack/sygv.hpp @@ -0,0 +1,90 @@ +// Copyright (c) 2024--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic interface to the LAPACK `sygv` routine. + */ + +#pragma once + +#include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../traits.hpp" + +#include +#include + +namespace nda::lapack { + + /** + * @ingroup linalg_lapack + * @brief Interface to the LAPACK `sygv` routine. 
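A minimal usage sketch for the new syev wrapper above, illustrative only; header and layout spellings are assumptions:

    // illustrative sketch only: eigenvalues/eigenvectors of a real symmetric matrix via the syev wrapper
    #include <nda/nda.hpp>

    int main() {
      auto A = nda::matrix<double, nda::F_layout>{{2.0, 1.0}, {1.0, 3.0}}; // symmetric
      auto w = nda::vector<double>{};                                      // resized by the wrapper

      int info = nda::lapack::syev(A, w); // default jobz = 'V'
      return info;
    }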
+ * + * @details Computes all eigenvalues \f$ \lambda_i \f$ and, optionally, eigenvectors \f$ \mathbf{v}_i \f$ of a real + * generalized symmetric-definite eigenvalue problem of the form + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * Here \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are assumed to be symmetric and \f$ \mathbf{B} \f$ is also positive + * definite. + * + * @tparam A nda::MemoryMatrix with float or double value type. + * @tparam B nda::MemoryMatrix with float or double value type. + * @param a Input/output matrix. On entry, the symmetric matrix \f$ \mathbf{A} \f$. On exit, if `jobz = V`, \f$ + * \mathbf{A} \f$ contains the matrix \f$ \mathbf{V} \f$ of normalized eigenvectors such that \f$ \mathbf{V}^T + * \mathbf{B} \mathbf{V} = \mathbf{I} \f$ (if `itype = 1` or `itype = 2`) or \f$ \mathbf{V}^T \mathbf{B}^{-1} + * \mathbf{V} = \mathbf{I} \f$ (if `itype = 3`). If `jobz = N`, then on exit \f$ \mathbf{A} \f$ is destroyed. + * @param b Input/output matrix. On entry, the symmetric positive definite matrix \f$ \mathbf{B} \f$. On exit, the + * part of \f$ \mathbf{B} \f$ containing the matrix is overwritten by the triangular factor \f$ \mathbf{U} \f$ or + * \f$ \mathbf{L} \f$ from a Cholesky factorization. + * @param w Output vector. The eigenvalues in ascending order. + * @param jobz Character indicating whether to compute eigenvectors and eigenvalues ('V') or eigenvalues only ('N'). + * @param itype Specifies the problem to be solved. + * @return Integer return code from the LAPACK call. + */ + template + requires(mem::have_host_compatible_addr_space and (std::same_as> or std::same_as>) + and have_same_value_type_v) + int sygv(A &&a, B &&b, W &&w, char jobz = 'V', int itype = 1) { // NOLINT (temporary views are allowed here) + static_assert(has_F_layout and has_F_layout, "Error in nda::lapack::sygv: A and B must have Fortran layout"); + + // check the dimensions of the input/output arrays/views and resize if necessary + auto const [m, n] = a.shape(); + EXPECTS(m == n); + EXPECTS(m == b.shape()[0]); + EXPECTS(n == b.shape()[1]); + resize_or_check_if_view(w, {n}); + + // arrays/views must be LAPACK compatible + EXPECTS(a.indexmap().min_stride() == 1); + EXPECTS(b.indexmap().min_stride() == 1); + EXPECTS(w.indexmap().min_stride() == 1); + + // check other input parameters for consistency + EXPECTS(itype == 1 or itype == 2 or itype == 3); + EXPECTS(jobz == 'V' or jobz == 'N'); + + // first call to get the optimal buffer size + using value_type = get_value_t; + value_type tmp_lwork{}; + int info = 0; + lapack::f77::sygv(itype, jobz, 'U', n, a.data(), get_ld(a), b.data(), get_ld(b), w.data(), &tmp_lwork, -1, info); + int lwork = static_cast(std::ceil(tmp_lwork)); + + // allocate work buffer and perform actual library call + array work(lwork); + lapack::f77::sygv(itype, jobz, 'U', n, a.data(), get_ld(a), b.data(), get_ld(b), w.data(), work.data(), lwork, info); + + return info; + } + +} // namespace nda::lapack diff --git a/c++/nda/lapack/ungqr.hpp b/c++/nda/lapack/ungqr.hpp index e7326bcd7..6f6bdc054 100644 --- a/c++/nda/lapack/ungqr.hpp +++ b/c++/nda/lapack/ungqr.hpp @@ -11,15 +11,14 @@ #pragma once #include "./interface/cxx_interface.hpp" +#include "../basic_array.hpp" #include "../concepts.hpp" #include "../declarations.hpp" -#include "../exceptions.hpp" 
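A minimal usage sketch for the new sygv wrapper solving A v = lambda B v (itype = 1); the complex analogue nda::lapack::hegv above is called in the same way. Illustrative only; header and layout spellings are assumptions.

    // illustrative sketch only: generalized symmetric-definite eigenproblem via the sygv wrapper
    #include <nda/nda.hpp>

    int main() {
      auto A = nda::matrix<double, nda::F_layout>{{2.0, 1.0}, {1.0, 3.0}}; // symmetric
      auto B = nda::matrix<double, nda::F_layout>{{4.0, 1.0}, {1.0, 2.0}}; // symmetric positive definite
      auto w = nda::vector<double>{};

      int info = nda::lapack::sygv(A, B, w, 'V', 1); // on exit A holds V with V^T B V = I
      return info;
    }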
-#include "../layout/policies.hpp" #include "../macros.hpp" #include "../mem/address_space.hpp" -#include "../mem/policies.hpp" #include "../traits.hpp" +#include #include #include #include @@ -30,47 +29,57 @@ namespace nda::lapack { * @ingroup linalg_lapack * @brief Interface to the LAPACK `ungqr` routine. * - * @details Generates an m-by-n complex matrix \f$ \mathbf{Q} \f$ with orthonormal columns, which is defined as the - * first n columns of a product of k elementary reflectors of order m + * @details Generates an \f$ m \times \min(m,n) = k \f$ complex matrix \f$ \mathbf{Q} \f$ with orthonormal columns, + * which is defined as the first \f$ k \f$ columns of a product of \f$ k \f$ elementary reflectors of order \f$ m \f$ * \f[ * \mathbf{Q} = \mathbf{H}(1) \mathbf{H}(2) \ldots \mathbf{H}(k) \; , * \f] - * as returned by `geqrf`. + * as returned by nda::lapack::geqp3. + * + * Each \f$ \mathbf{H}(i) \f$ has the form + * \f[ + * \mathbf{H}(i) = \mathbf{I} - \tau_i * \mathbf{v}_i \mathbf{v}_i^H + * \f] + * where \f$ \tau_i \f$ is a complex scalar, and \f$ \mathbf{v}_i \f$ is a complex vector with + * - elements \f$ 1 \f$ to \f$ i - 1 \f$ equal to 0, + * - element \f$ i \f$ equal to 1 and + * - elements \f$ i + 1 \f$ to \f$ m \f$ stored in the elements \f$ i + 1 \f$ to \f$ m \f$ in column \f$ i \f$ of + * matrix \f$ \mathbf{A} \f$. * * @tparam A nda::MemoryMatrix with complex value type. * @tparam TAU nda::MemoryVector with complex value type. - * @param a Input/output matrix. On entry, the i-th column must contain the vector which defines the elementary - * reflector \f$ H(i) \; , i = 1,2,...,K \f$, as returned by `geqrf` in the first k columns. On exit, the m-by-n - * matrix \f$ \mathbf{Q} \f$. - * @param tau Input vector. `tau(i)` must contain the scalar factor of the elementary reflector \f$ \mathbf{H}(i) \f$, - * as returned by `geqrf`. + * @param a Input/output matrix. On entry, the ith column must contain the vector which defines the + * elementary reflector \f$ H(i) \; , i = 1,2,...,k \f$, as returned by nda::lapack::geqp3 in the first \f$ k \f$ + * columns. On exit, the \f$ m \times \min(m,n) = k \f$ matrix \f$ \mathbf{Q} \f$. + * @param tau Input vector. \f$ \tau_i \f$ must contain the scalar factor of the elementary reflector \f$ + * \mathbf{H}(i) \f$, as returned by nda::lapack::geqp3. * @return Integer return code from the LAPACK call. 
*/ template - requires(mem::on_host and std::is_same_v, get_value_t> and have_same_value_type_v - and mem::have_compatible_addr_space) + requires(mem::have_host_compatible_addr_space and is_complex_v> and have_same_value_type_v) int ungqr(A &&a, TAU &&tau) { // NOLINT (temporary views are allowed here) - static_assert(has_F_layout, "Error in nda::lapack::ungqr: C order is not supported"); - static_assert(mem::have_host_compatible_addr_space, "Error in nda::lapack::ungqr: Only CPU is supported"); + static_assert(has_F_layout, "Error in nda::lapack::ungqr: A must have Fortran layout"); + + // check the dimensions of the input/output arrays/views + auto const [m, n] = a.shape(); + auto const k = std::min(m, n); + EXPECTS(tau.size() == k); - // must be lapack compatible + // arrays/views must be LAPACK compatible EXPECTS(a.indexmap().min_stride() == 1); EXPECTS(tau.indexmap().min_stride() == 1); - // first call to get the optimal buffersize + // first call to get the optimal buffer size using value_type = get_value_t; - value_type bufferSize_T{}; - auto [m, n] = a.shape(); - auto k = tau.size(); - int info = 0; - lapack::f77::ungqr(m, std::min(m, n), k, a.data(), get_ld(a), tau.data(), &bufferSize_T, -1, info); - int bufferSize = static_cast(std::ceil(std::real(bufferSize_T))); + value_type tmp_lwork{}; + int info = 0; + lapack::f77::ungqr(m, k, k, a.data(), get_ld(a), tau.data(), &tmp_lwork, -1, info); + int lwork = static_cast(std::ceil(std::real(tmp_lwork))); // allocate work buffer and perform actual library call - nda::array>> work(bufferSize); - lapack::f77::ungqr(m, std::min(m, n), k, a.data(), get_ld(a), tau.data(), work.data(), bufferSize, info); + nda::array work(lwork); + lapack::f77::ungqr(m, k, k, a.data(), get_ld(a), tau.data(), work.data(), lwork, info); - if (info) NDA_RUNTIME_ERROR << "Error in nda::lapack::ungqr: info = " << info; return info; } diff --git a/c++/nda/linalg.hpp b/c++/nda/linalg.hpp index 625552488..d56450166 100644 --- a/c++/nda/linalg.hpp +++ b/c++/nda/linalg.hpp @@ -14,8 +14,13 @@ #include "./lapack.hpp" #include "./linalg/cross_product.hpp" -#include "./linalg/det_and_inverse.hpp" +#include "./linalg/det.hpp" #include "./linalg/dot.hpp" -#include "./linalg/eigenelements.hpp" +#include "./linalg/eigh.hpp" +#include "./linalg/inv.hpp" #include "./linalg/matmul.hpp" +#include "./linalg/matvecmul.hpp" #include "./linalg/norm.hpp" +#include "./linalg/solve.hpp" +#include "./linalg/svd.hpp" +#include "./linalg/outer_product.hpp" diff --git a/c++/nda/linalg/cross_product.hpp b/c++/nda/linalg/cross_product.hpp index 675dead3b..b6ea66275 100644 --- a/c++/nda/linalg/cross_product.hpp +++ b/c++/nda/linalg/cross_product.hpp @@ -5,35 +5,40 @@ /** * @file - * @brief Provides a cross product for 3-dimensional vectors or other arrays/views of rank 1. + * @brief Provides a cross product for 3-dimensional vectors. */ #pragma once +#include "../basic_array.hpp" +#include "../concepts.hpp" #include "../declarations.hpp" #include "../macros.hpp" -#include "../traits.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" namespace nda::linalg { /** * @ingroup linalg_tools - * @brief Compute the cross product of two 3-dimensional vectors. + * @brief Compute the cross product \f$ \mathbf{x} \times \mathbf{y} \f$ of two 3-dimensional vectors \f$ \mathbf{x} + * \f$ and \f$ \mathbf{y} \f$. * - * @tparam V Vector type. - * @param x Left hand side vector. - * @param y Right hand side vector. 
- * @return nda::array of rank 1 containing the cross product of the two vectors. + * @tparam X nda::Vector type. + * @tparam Y nda::Vector type. + * @param x Input vector \f$ \mathbf{x} \f$. + * @param y Input vector \f$ \mathbf{y} \f$. + * @return nda::vector containing the cross product of the two vectors. */ - template - auto cross_product(V const &x, V const &y) { - EXPECTS_WITH_MESSAGE(x.shape()[0] == 3, "nda::linalg::cross_product: Only defined for 3-dimensional vectors"); - EXPECTS_WITH_MESSAGE(y.shape()[0] == 3, "nda::linalg::cross_product: Only defined for 3-dimensional vectors"); - array, 1> r(3); - r(0) = x(1) * y(2) - y(1) * x(2); - r(1) = -x(0) * y(2) + y(0) * x(2); - r(2) = x(0) * y(1) - y(0) * x(1); - return r; + template + requires(nda::mem::have_host_compatible_addr_space) + auto cross_product(X const &x, Y const &y) { + EXPECTS(x.size() == 3 and y.size() == 3); + auto res = vector>>(3); + res(0) = x(1) * y(2) - y(1) * x(2); + res(1) = -x(0) * y(2) + y(0) * x(2); + res(2) = x(0) * y(1) - y(0) * x(1); + return res; } } // namespace nda::linalg diff --git a/c++/nda/linalg/det.hpp b/c++/nda/linalg/det.hpp new file mode 100644 index 000000000..3d77b4a6a --- /dev/null +++ b/c++/nda/linalg/det.hpp @@ -0,0 +1,146 @@ +// Copyright (c) 2019--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides functions to compute the determinant of a matrix. + */ + +#pragma once + +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../exceptions.hpp" +#include "../lapack/getrf.hpp" +#include "../matrix_functions.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + /** + * @brief Compute the determinant of a \f$ 1 \times 1 \f$ matrix \f$ \mathbf{M} \f$. + * + * @tparam M nda::Matrix type. + * @param m Input matrix. The matrix \f$ \mathbf{M} \f$. + * @return The determinant \f$ \det(\mathbf{M}) \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + auto det_1d(M const &m) { + EXPECTS(is_matrix_square(m) and m.shape()[0] == 1); + return m(0, 0); + } + + /** + * @brief Compute the determinant of a \f$ 2 \times 2 \f$ matrix \f$ \mathbf{M} \f$. + * + * @tparam M nda::Matrix type. + * @param m Input matrix. The matrix \f$ \mathbf{M} \f$. + * @return The determinant \f$ \det(\mathbf{M}) \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + auto det_2d(M const &m) { + EXPECTS(is_matrix_square(m) and m.shape()[0] == 2); + return m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0); + } + + /** + * @brief Compute the inverse of a \f$ 3 \times 3 \f$ matrix \f$ \mathbf{M} \f$. + * + * @tparam M nda::Matrix type. + * @param m Input matrix. The matrix \f$ \mathbf{M} \f$. + * @return The determinant \f$ \det(\mathbf{M}) \f$. 
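A minimal usage sketch for the reworked nda::linalg::cross_product above, illustrative only; the header spelling is an assumption:

    // illustrative sketch only: cross product of two 3-dimensional vectors
    #include <nda/nda.hpp>

    int main() {
      auto x = nda::vector<double>{1.0, 0.0, 0.0};
      auto y = nda::vector<double>{0.0, 1.0, 0.0};
      auto z = nda::linalg::cross_product(x, y); // expected: (0, 0, 1)
      return z(2) == 1.0 ? 0 : 1;
    }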
+ */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + auto det_3d(M const &m) { + EXPECTS(is_matrix_square(m) and m.shape()[0] == 3); + return m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) - m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) + + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0)); + } + + /** + * @brief Compute the determinant of an \f$ n \times n \f$ matrix \f$ \mathbf{M} \f$. + * + * @details For small matrices (\f$ 1 \times 1 \f$, \f$ 2 \times 2 \f$ or \f$ 3 \times 3 \f$), it directly computes + * the determinant using one of the optimized routines nda::linalg::det_1d, nda::linalg::det_2d and + * nda::linalg::det_3d. + * + * For larger matrices, it calls nda::lapack::getrf and calculates the determinant from its LU decomposition. + * + * It throws an exception if the call to nda::lapack::getrf fails. + * + * @note The matrix \f$ \mathbf{M} \f$ is modified if its number of rows/columns is greater than 3. + * + * @tparam M nda::Matrix type. + * @param m Input/output matrix. On entry, the matrix \f$ \mathbf{M} \f$. On exit, the matrix \f$ \mathbf{M} \f$ or + * the LU decomposition of \f$ \mathbf{M} \f$ from nda::lapack::getrf. + * @return The determinant \f$ \det(\mathbf{M}) \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + auto det_in_place(M &&m) { // NOLINT (temporary views are allowed here) + EXPECTS(is_matrix_square(m)); + + // use optimized routines for small matrices, otherwise use LAPACK routine + auto const dim = m.shape()[0]; + if (dim == 1) { + return det_1d(m); + } else if (dim == 2) { + return det_2d(m); + } else if (dim == 3) { + return det_3d(m); + } else if (dim > 3) { + // LU factorization with getrf + auto ipiv = vector>(dim); + int info = nda::lapack::getrf(m, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::det_in_place: getrf routine failed: info = " << info; + + // calculate the determinant from the LU decomposition + auto det = get_value_t{1}; + int n_flips = 0; + for (int i = 0; i < dim; i++) { + det *= m(i, i); + // count the number of column interchanges performed by getrf + if (ipiv(i) != i + 1) ++n_flips; + } + + return ((n_flips % 2 == 1) ? -det : det); + } else { + // empty matrix + return get_value_t{1}; + } + } + + /** + * @brief Compute the determinant of an \f$ n \times n \f$ matrix \f$ \mathbf{M} \f$. + * + * @details The given matrix/view is not modified. It first makes a copy of the matrix/view and then calls + * nda::linalg::det_in_place. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input matrix. The matrix \f$ \mathbf{M} \f$. + * @return The determinant \f$ \det(\mathbf{M}) \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + auto det(M const &m) { + EXPECTS(is_matrix_square(m)); + auto m_copy = make_regular(m); + return det_in_place(m_copy); + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/linalg/det_and_inverse.hpp b/c++/nda/linalg/det_and_inverse.hpp deleted file mode 100644 index 018524088..000000000 --- a/c++/nda/linalg/det_and_inverse.hpp +++ /dev/null @@ -1,313 +0,0 @@ -// Copyright (c) 2019--present, The Simons Foundation -// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. -// SPDX-License-Identifier: Apache-2.0 -// See LICENSE in the root of this distribution for details. 
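A minimal usage sketch for the new determinant helpers in nda::linalg, illustrative only; the header spelling is an assumption:

    // illustrative sketch only: determinant of a small matrix via the new det/det_in_place helpers
    #include <nda/nda.hpp>

    int main() {
      auto M = nda::matrix<double>{{1.0, 2.0}, {3.0, 4.0}};

      auto d1 = nda::linalg::det(M);          // works on a copy, returns -2.0
      auto d2 = nda::linalg::det_in_place(M); // same value; M may be overwritten for sizes larger than 3 x 3
      return (d1 == d2) ? 0 : 1;
    }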
- -/** - * @file - * @brief Provides functions to compute the determinant and inverse of a matrix. - */ - -#pragma once - -#include "../basic_array.hpp" -#include "../basic_functions.hpp" -#include "../clef/make_lazy.hpp" -#include "../concepts.hpp" -#include "../exceptions.hpp" -#include "../lapack/getrf.hpp" -#include "../lapack/getri.hpp" -#include "../layout/policies.hpp" -#include "../matrix_functions.hpp" -#include "../mem/address_space.hpp" -#include "../mem/policies.hpp" -#include "../print.hpp" -#include "../traits.hpp" - -#include -#include -#include - -namespace nda { - - /** - * @addtogroup linalg_tools - * @{ - */ - - /** - * @brief Check if a given array/view is square, i.e. if the first dimension has the same extent as the second - * dimension. - * - * @note It does not check if the array/view has rank 2. - * - * @tparam A Array/View type. - * @param a Array/View object. - * @param print_error If true, print an error message if the matrix is not square. - * @return True if the array/view is square, false otherwise. - */ - template - bool is_matrix_square(A const &a, bool print_error = false) { - bool r = (a.shape()[0] == a.shape()[1]); - if (not r and print_error) - std::cerr << "Error in nda::detail::is_matrix_square: Dimensions are: (" << a.shape()[0] << "," << a.shape()[1] << ")\n" << std::endl; - return r; - } - - /** - * @brief Check if a given array/view is diagonal, i.e. if it is square (see nda::is_matrix_square) and all the the - * off-diagonal elements are zero. - * - * @note It does not check if the array/view has rank 2. - * - * @tparam A Array/View type. - * @param a Array/View object. - * @param print_error If true, print an error message if the matrix is not diagonal. - * @return True if the array/view is diagonal, false otherwise. - */ - template - bool is_matrix_diagonal(A const &a, bool print_error = false) { - bool r = is_matrix_square(a) and a == diag(diagonal(a)); - if (not r and print_error) std::cerr << "Error in nda::detail::is_matrix_diagonal: Non-diagonal matrix: " << a << std::endl; - return r; - } - - /** - * @brief Compute the determinant of a square matrix/view. - * - * @details It uses nda::lapack::getrf to compute the LU decomposition of the matrix and then calculates the - * determinant by multiplying the diagonal elements of the \f$ \mathbf{U} \f$ matrix and taking into account that - * `getrf` may change the ordering of the rows/columns of the matrix. - * - * The given matrix/view is modified in place. - * - * @tparam M Type of the matrix/view. - * @param m Matrix/view object. - * @return Determinant of the matrix/view. 
- */ - template - auto determinant_in_place(M &m) - requires(is_matrix_or_view_v) - { - using value_t = get_value_t; - static_assert(std::is_convertible_v or std::is_convertible_v>, - "Error in nda::determinant_in_place: Value type needs to be convertible to double or std::complex"); - static_assert(not std::is_const_v, "Error in nda::determinant_in_place: Value type cannot be const"); - - // special case for an empty matrix - if (m.empty()) return value_t{1}; - - // check if the matrix is square - if (m.extent(0) != m.extent(1)) NDA_RUNTIME_ERROR << "Error in nda::determinant_in_place: Matrix is not square: " << m.shape(); - - // calculate the LU decomposition using lapack getrf - const int dim = m.extent(0); - basic_array> ipiv(dim); - int info = lapack::getrf(m, ipiv); // it is ok to be in C order - if (info < 0) NDA_RUNTIME_ERROR << "Error in nda::determinant_in_place: info = " << info; - - // calculate the determinant from the LU decomposition - auto det = value_t{1}; - int n_flips = 0; - for (int i = 0; i < dim; i++) { - det *= m(i, i); - // count the number of column interchanges performed by getrf - if (ipiv(i) != i + 1) ++n_flips; - } - - return ((n_flips % 2 == 1) ? -det : det); - } - - /** - * @brief Compute the determinant of a square matrix/view. - * - * @details The given matrix/view is not modified. It first makes a copy of the given matrix/view and then calls - * nda::determinant_in_place with the copy. - * - * @tparam M Type of the matrix/view. - * @param m Matrix/view object. - * @return Determinant of the matrix/view. - */ - template - auto determinant(M const &m) { - auto m_copy = make_regular(m); - return determinant_in_place(m_copy); - } - - // For small matrices (2x2 and 3x3), we directly - // compute the matrix inversion rather than calling the - // LaPack routine - // ---------- Small Inverse Benchmarks --------- - // Run on (16 X 2400 MHz CPUs) (see benchmarks/small_inv.cpp) - // --------------------------------------------- - // Matrix Size Time (old) Time (new) - // 1 502 ns 59.0 ns - // 2 595 ns 61.7 ns - // 3 701 ns 67.5 ns - - /** - * @brief Compute the inverse of a 1-by-1 matrix. - * - * @details The inversion is performed in place. - * - * @tparam M nda::MemoryMatrix type. - * @param m nda::MemoryMatrix object to be inverted. - */ - template - requires(get_algebra == 'M' and mem::on_host) - void inverse1_in_place(M &&m) { // NOLINT (temporary views are allowed here) - if (m(0, 0) == 0.0) NDA_RUNTIME_ERROR << "Error in nda::inverse1_in_place: Matrix is not invertible"; - m(0, 0) = 1.0 / m(0, 0); - } - - /** - * @brief Compute the inverse of a 2-by-2 matrix. - * - * @details The inversion is performed in place. - * - * @tparam M nda::MemoryMatrix type. - * @param m nda::MemoryMatrix object to be inverted. - */ - template - requires(get_algebra == 'M' and mem::on_host) - void inverse2_in_place(M &&m) { // NOLINT (temporary views are allowed here) - // calculate the adjoint of the matrix - std::swap(m(0, 0), m(1, 1)); - - // calculate the inverse determinant of the matrix - auto det = (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)); - if (det == 0.0) NDA_RUNTIME_ERROR << "Error in nda::inverse2_in_place: Matrix is not invertible"; - auto detinv = 1.0 / det; - - // multiply the adjoint by the inverse determinant - m(0, 0) *= +detinv; - m(1, 1) *= +detinv; - m(1, 0) *= -detinv; - m(0, 1) *= -detinv; - } - - /** - * @brief Compute the inverse of a 3-by-3 matrix. - * - * @details The inversion is performed in place. - * - * @tparam M nda::MemoryMatrix type. 
- * @param m nda::MemoryMatrix object to be inverted. - */ - template - requires(get_algebra == 'M' and mem::on_host) - void inverse3_in_place(M &&m) { // NOLINT (temporary views are allowed here) - // calculate the cofactors of the matrix - auto b00 = +m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1); - auto b10 = -m(1, 0) * m(2, 2) + m(1, 2) * m(2, 0); - auto b20 = +m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0); - auto b01 = -m(0, 1) * m(2, 2) + m(0, 2) * m(2, 1); - auto b11 = +m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0); - auto b21 = -m(0, 0) * m(2, 1) + m(0, 1) * m(2, 0); - auto b02 = +m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1); - auto b12 = -m(0, 0) * m(1, 2) + m(0, 2) * m(1, 0); - auto b22 = +m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0); - - // calculate the inverse determinant of the matrix - auto det = m(0, 0) * b00 + m(0, 1) * b10 + m(0, 2) * b20; - if (det == 0.0) NDA_RUNTIME_ERROR << "Error in nda::inverse3_in_place: Matrix is not invertible"; - auto detinv = 1.0 / det; - - // fill the matrix by multiplying the cofactors by the inverse determinant - m(0, 0) = detinv * b00; - m(0, 1) = detinv * b01; - m(0, 2) = detinv * b02; - m(1, 0) = detinv * b10; - m(1, 1) = detinv * b11; - m(1, 2) = detinv * b12; - m(2, 0) = detinv * b20; - m(2, 1) = detinv * b21; - m(2, 2) = detinv * b22; - } - - /** - * @brief Compute the inverse of an n-by-n matrix. - * - * @details The inversion is performed in place. - * - * For small matrices (1-by-1, 2-by-2, 3-by-3), we directly compute the matrix inversion using the optimized routines: - * nda::inverse1_in_place, nda::inverse2_in_place, nda::inverse3_in_place. - * - * For larger matrices, it uses nda::lapack::getrf and nda::lapack::getri. - * - * @tparam M nda::MemoryMatrix type. - * @param m nda::MemoryMatrix object to be inverted. - */ - template - requires(get_algebra == 'M') - void inverse_in_place(M &&m) { // NOLINT (temporary views are allowed here) - EXPECTS(is_matrix_square(m, true)); - - // nothing to do if the matrix/view is empty - if (m.empty()) return; - - // use optimized routines for small matrices - if constexpr (mem::on_host) { - if (m.shape()[0] == 1) { - inverse1_in_place(m); - return; - } - - if (m.shape()[0] == 2) { - inverse2_in_place(m); - return; - } - - if (m.shape()[0] == 3) { - inverse3_in_place(m); - return; - } - } - - // use getrf and getri from lapack for larger matrices - array ipiv(m.extent(0)); - int info = lapack::getrf(m, ipiv); // it is ok to be in C order - if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::inverse_in_place: Matrix is not invertible: info = " << info; - info = lapack::getri(m, ipiv); - if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::inverse_in_place: Matrix is not invertible: info = " << info; - } - - /** - * @brief Compute the inverse of an n-by-n matrix. - * - * @details The given matrix/view is not modified. It first makes copy of the given matrix/view and then calls - * nda::inverse_in_place with the copy. - * - * @tparam M nda::MemoryMatrix type. - * @param m nda::MemoryMatrix object to be inverted. - * @return Inverse of the matrix. - */ - template - auto inverse(M const &m) - requires(get_algebra == 'M') - { - EXPECTS(is_matrix_square(m, true)); - auto r = make_regular(m); - inverse_in_place(r); - return r; - } - - /** @} */ - -} // namespace nda - -namespace nda::clef { - - /** - * @ingroup linalg_tools - * @brief Lazy version of nda::determinant. - */ - CLEF_MAKE_FNT_LAZY(determinant) - - /** - * @ingroup linalg_tools - * @brief Lazy version of nda::inverse. 
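Since det_and_inverse.hpp is removed, callers of the old free functions presumably migrate to the new nda::linalg helpers. A hedged migration sketch, assuming nda::linalg::inv(m) as the replacement for nda::inverse(m) based on the include changes elsewhere in this patch:

    // illustrative migration sketch only: old helpers in the comment, assumed replacements below
    #include <nda/nda.hpp>

    int main() {
      auto M = nda::matrix<double>{{1.0, 2.0}, {3.0, 4.0}};
      // before: auto d = nda::determinant(M);   auto Minv = nda::inverse(M);
      auto d    = nda::linalg::det(M);
      auto Minv = nda::linalg::inv(M); // assumed signature, see linalg/inv.hpp in this patch
      return (d != 0.0 and Minv.shape()[0] == 2) ? 0 : 1;
    }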
- */ - CLEF_MAKE_FNT_LAZY(inverse) - -} // namespace nda::clef diff --git a/c++/nda/linalg/dot.hpp b/c++/nda/linalg/dot.hpp index 4b1f97803..559ca0ac5 100644 --- a/c++/nda/linalg/dot.hpp +++ b/c++/nda/linalg/dot.hpp @@ -5,112 +5,135 @@ /** * @file - * @brief Provides a dot product for two arrays, a scalar and an array, or two scalars. + * @brief Provides a generic dot product. */ #pragma once #include "../blas/dot.hpp" -#include "../declarations.hpp" -#include "../layout/policies.hpp" +#include "../concepts.hpp" +#include "../macros.hpp" #include "../mem/address_space.hpp" -#include "../mem/policies.hpp" +#include "../traits.hpp" -#include +#include +#include -namespace nda { +namespace nda::linalg { /** * @addtogroup linalg_tools * @{ */ + namespace detail { + + // Implementation of a generic dot/dotc product. + template + requires(Scalar> and Scalar> and mem::have_host_compatible_addr_space) + auto dot_generic(X const &x, Y const &y) { + // check the dimensions of the input arrays/views + EXPECTS(x.size() == y.size()); + + // conditional conjugation + auto cond_conj = [](auto z) __attribute__((always_inline)) { + if constexpr (star and is_complex_v) { + return std::conj(z); + } else { + return z; + } + }; + + // early return for zero-sized vectors + long const N = x.size(); + if (N == 0) return decltype(cond_conj(x(0)) * y(0)){0}; + + // loop over vectors and sum up element-wise products + if constexpr (has_layout_smallest_stride_is_one and has_layout_smallest_stride_is_one) { + if constexpr (is_regular_or_view_v and is_regular_or_view_v) { + auto *__restrict px = x.data(); + auto *__restrict py = y.data(); + auto res = cond_conj(px[0]) * py[0]; + for (size_t i = 1; i < N; ++i) res += cond_conj(px[i]) * py[i]; + return res; + } else { + auto res = cond_conj(x(_linear_index_t{0})) * y(_linear_index_t{0}); + for (long i = 1; i < N; ++i) res += cond_conj(x(_linear_index_t{i})) * y(_linear_index_t{i}); + return res; + } + } else { + auto res = cond_conj(x(0)) * y(0); + for (long i = 1; i < N; ++i) res += cond_conj(x(i)) * y(i); + return res; + } + } + + } // namespace detail + /** - * @brief Compute the dot product of two real arrays/views. + * @brief Compute the dot product of two nda::vector objects or the product of two scalars. * - * @details It is generic in the sense that it allows the input arrays to belong to a different nda::mem::AddressSpace - * (as long as they are compatible). + * @details The behaviour of this function is identical to nda::blas::dot, except that it allows + * - the two input objects to be scalars, + * - lazy expressions as input vectors, + * - the value types of the input vectors to be different from each other and + * - the value types of the input vectors to be different from nda::is_double_or_complex_v. * - * If possible, it uses nda::blas::dot, otherwise it calls nda::blas::dot_generic. + * For vectors, it calls nda::blas::dot if possible, otherwise it simply loops over the input arrays/views and sums up + * the element-wise products. * - * @tparam X Type of the left hand side array/view. - * @tparam Y Type of the right hand side array/view. - * @param x Left hand side array/view. - * @param y Right hand side array/view. - * @return The dot product of the two arrays/views. + * @note The first argument is never conjugated. Even for complex types. Use nda::linalg::dotc for that. + * + * @tparam X nda::Vector or nda::Scalar type. + * @tparam Y nda::Vector or nda::Scalar type. + * @param x Input vector/scalar. + * @param y Input vector/scalar. 
+ * @return Result of the dot product. */ template - auto dot(X &&x, Y &&y) { // NOLINT (temporary views are allowed here) - // check address space compatibility - static constexpr auto L_adr_spc = mem::get_addr_space; - static constexpr auto R_adr_spc = mem::get_addr_space; - static_assert(L_adr_spc != mem::None); - static_assert(R_adr_spc != mem::None); - - // get resulting value type and vector type - using value_t = decltype(get_value_t{} * get_value_t{}); - using vector_t = basic_array>>; - - if constexpr (is_blas_lapack_v) { - // for double value types we use blas::dot - // lambda to form a new vector with the correct value type if necessary - auto as_container = [](A const &a) -> decltype(auto) { - if constexpr (is_regular_or_view_v and std::is_same_v, value_t>) - return a; - else - return vector_t{a}; - }; - - return blas::dot(as_container(x), as_container(y)); + requires((Scalar and Scalar) or (Vector and Vector)) + auto dot(X const &x, Y const &y) { + if constexpr (Scalar) { + return x * y; + } else if constexpr (requires { nda::blas::dot(x, y); }) { + return nda::blas::dot(x, y); } else { - // for other value types we use a generic implementation - return blas::dot_generic(x, y); + return detail::dot_generic(x, y); } } /** - * @brief Compute the dot product of two complex arrays/views. + * @brief Compute the dotc (LHS operand is conjugated) product of two nda::vector objects or the product of two + * scalars. * - * @details It is generic in the sense that it allows the input arrays to belong to a different nda::mem::AddressSpace - * (as long as they are compatible). + * @details The behaviour of this function is identical to nda::blas::dotc, except that it allows + * - the two input objects to be scalars, + * - lazy expressions as input vectors, + * - the value types of the input vectors to be different from each other and + * - the value types of the input vectors to be different from nda::is_double_or_complex_v. * - * If possible, it uses nda::blas::dotc, otherwise it calls nda::blas::dotc_generic. + * For vectors, it calls nda::blas::dotc if possible, otherwise it simply loops over the input arrays/views and sums + * up the element-wise products with the LHS operand conjugated. * - * @tparam X Type of the left hand side array/view. - * @tparam Y Type of the right hand side array/view. - * @param x Left hand side array/view. - * @param y Right hand side array/view. - * @return The dot product of the two arrays/views. + * @tparam X nda::Vector or nda::Scalar type. + * @tparam Y nda::Vector or nda::Scalar type. + * @param x Input vector/scalar. + * @param y Input vector/scalar. + * @return Result of the dotc product. 
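A brief usage sketch of the generalized nda::linalg::dot and nda::linalg::dotc described above (dotc is implemented just below). The umbrella header <nda/nda.hpp> is assumed; only signatures introduced in this diff are used.

#include <nda/nda.hpp>
#include <complex>

int main() {
  using namespace std::complex_literals;
  nda::vector<std::complex<double>> x = {1.0 + 1.0i, 2.0i};
  nda::vector<std::complex<double>> y = {2.0 + 0.0i, 1.0 - 1.0i};

  auto d  = nda::linalg::dot(x, y);  // sum_i x(i) * y(i), the first argument is NOT conjugated
  auto dc = nda::linalg::dotc(x, y); // sum_i conj(x(i)) * y(i)

  // both functions also accept two scalars and simply multiply them
  auto s = nda::linalg::dot(2.0, 3.0); // 6.0
}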
*/ template - auto dotc(X &&x, Y &&y) { // NOLINT (temporary views are allowed here) - // check address space compatibility - static constexpr auto L_adr_spc = mem::get_addr_space; - static constexpr auto R_adr_spc = mem::get_addr_space; - static_assert(L_adr_spc != mem::None); - static_assert(R_adr_spc != mem::None); - - // get resulting value type and vector type - using value_t = decltype(get_value_t{} * get_value_t{}); - using vector_t = basic_array>>; - - if constexpr (is_blas_lapack_v) { - // for double or complex value types we use blas::dotc - // lambda to form a new vector with the correct value type if necessary - auto as_container = [](A const &a) -> decltype(auto) { - if constexpr (is_regular_or_view_v and std::is_same_v, value_t>) - return a; - else - return vector_t{a}; - }; - - return blas::dotc(as_container(x), as_container(y)); + requires((Scalar and Scalar) or (Vector and Vector)) + auto dotc(X const &x, Y const &y) { + if constexpr (Scalar) { + if constexpr (is_complex_v) return std::conj(x) * y; + return x * y; + } else if constexpr (requires { nda::blas::dotc(x, y); }) { + return nda::blas::dotc(x, y); } else { - // for other value types we use a generic implementation - return blas::dotc_generic(x, y); + return detail::dot_generic(x, y); } } /** @} */ -} // namespace nda +} // namespace nda::linalg diff --git a/c++/nda/linalg/eigenelements.hpp b/c++/nda/linalg/eigenelements.hpp deleted file mode 100644 index bc946db00..000000000 --- a/c++/nda/linalg/eigenelements.hpp +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2019--present, The Simons Foundation -// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. -// SPDX-License-Identifier: Apache-2.0 -// See LICENSE in the root of this distribution for details. - -/** - * @file - * @brief Provides eigenvalues and eigenvectors of a symmetric or hermitian matrix. - */ - -#pragma once - -#include "./det_and_inverse.hpp" -#include "../basic_array.hpp" -#include "../declarations.hpp" -#include "../exceptions.hpp" -#include "../lapack/interface/cxx_interface.hpp" -#include "../layout/policies.hpp" -#include "../macros.hpp" -#include "../traits.hpp" - -#include -#include - -namespace nda::linalg { - - namespace detail { - - // Dispatch the call to the appropriate LAPACK routine based on the value type of the matrix. - template - auto _eigen_element_impl(M &&m, char compz) { // NOLINT (temporary views are allowed here) - using value_type = typename std::decay_t::value_type; - - // runtime checks - EXPECTS((not m.empty())); - EXPECTS(is_matrix_square(m, true)); - EXPECTS(m.is_contiguous()); - EXPECTS(m.has_positive_strides()); - - // set up the workspace - int dim = m.extent(0); - int lwork = 64 * dim; - array ev(dim); - array work(lwork); - array work2(is_complex_v ? 
lwork : 0); - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) - work2 = 0; - work = 0; - ev = 0; -#endif -#endif - - // call the correct LAPACK routine - int info = 0; - if constexpr (not is_complex_v) { - lapack::f77::syev(compz, 'U', dim, m.data(), dim, ev.data(), work.data(), lwork, info); - } else { - lapack::f77::heev(compz, 'U', dim, m.data(), dim, ev.data(), work.data(), lwork, work2.data(), info); - } - if (info) NDA_RUNTIME_ERROR << "Error in nda::linalg::detail::_eigen_element_impl: Diagonalization error"; - return ev; - } - - } // namespace detail - - /** - * @addtogroup linalg_tools - * @{ - */ - - /** - * @brief Find the eigenvalues and eigenvectors of a symmetric (real) or hermitian (complex) matrix/view. - * - * @details For a real symmetric matrix/view, it calls the LAPACK routine `syev`. For a complex hermitian matrix/view, - * it calls the LAPACK routine `heev`. - * - * The given matrix/view is copied and the original is not modified. - * - * @tparam M Type of the matrix/view. - * @param m Matrix/View to diagonalize. - * @return std::pair consisting of the array of eigenvalues in ascending order and the matrix containing the - * eigenvectors in its columns. - */ - template - auto eigenelements(M const &m) { - auto m_copy = matrix(m); - auto ev = detail::_eigen_element_impl(m_copy, 'V'); - return std::pair, typename M::regular_type>{ev, m_copy}; - } - - /** - * @brief Find the eigenvalues of a symmetric (real) or hermitian (complex) matrix/view. - * - * @details For a real symmetric matrix/view, it calls the LAPACK routine `syev`. For a complex hermitian matrix/view, - * it calls the LAPACK routine `heev`. - * - * The given matrix/view is copied and the original is not modified. - * - * @tparam M Type of the matrix/view. - * @param m Matrix/View to diagonalize. - * @return An nda::array of rank 1 containing the eigenvalues in ascending order. - */ - template - auto eigenvalues(M const &m) { - auto m_copy = matrix(m); - return detail::_eigen_element_impl(m_copy, 'N'); - } - - /** - * @brief Find the eigenvalues of a symmetric (real) or hermitian (complex) matrix/view. - * - * @details For a real symmetric matrix/view, it calls the LAPACK routine `syev`. For a complex hermitian matrix/view, - * it calls the LAPACK routine `heev`. - * - * The given matrix/view will be modified by the diagonalization process. - * - * @tparam M Type of the matrix/view. - * @param m Matrix/View to diagonalize. - * @return An nda::array of rank 1 containing the eigenvalues in ascending order. - */ - template - auto eigenvalues_in_place(M &m) { - return detail::_eigen_element_impl(m, 'N'); - } - - /** @} */ - -} // namespace nda::linalg diff --git a/c++/nda/linalg/eigh.hpp b/c++/nda/linalg/eigh.hpp new file mode 100644 index 000000000..2d5f2e0d8 --- /dev/null +++ b/c++/nda/linalg/eigh.hpp @@ -0,0 +1,314 @@ +// Copyright (c) 2019--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides functions to solve (generalized) eigenvalue problems with a symmetric/hermitian matrices. 
+ */ + +#pragma once + +#include "../basic_array.hpp" +#include "../blas/tools.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../exceptions.hpp" +#include "../lapack/syev.hpp" +#include "../lapack/sygv.hpp" +#include "../lapack/heev.hpp" +#include "../lapack/hegv.hpp" +#include "../layout/policies.hpp" +#include "../macros.hpp" +#include "../matrix_functions.hpp" +#include "../mem/address_space.hpp" +#include "../traits.hpp" + +#include +#include +#include + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + namespace detail { + + // Perform the call to the LAPACK routines syev/heev for eigh_in_place and eigvalsh_in_place. + template + auto eigh_impl(A &&a, char jobz) { // NOLINT (temporary views are allowed here) + // early return if the matrix is empty + using fp_t = get_fp_t; + if (a.empty()) return array{}; + + // make the call to syev/heev + auto lambda = array(a.extent(0)); + int info = 0; + if constexpr (is_complex_v>) { + info = nda::lapack::heev(a, lambda, jobz); + } else { + info = nda::lapack::syev(a, lambda, jobz); + } + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::detail::eigh_impl: syev/heev routine failed: info = " << info; + + return lambda; + } + + // Perform the call to the LAPACK routines sygv/hegv for eigh_in_place and eigvalsh_in_place. + template + auto eigh_impl(A &&a, B &&b, char jobz, int itype) { // NOLINT (temporary views are allowed here) + // early return if the matrix is empty + using fp_t = get_fp_t; + if (a.empty()) return array{}; + + // make the call to sygv/hegv + auto lambda = array(a.extent(0)); + int info = 0; + if constexpr (is_complex_v>) { + info = nda::lapack::hegv(a, b, lambda, jobz, itype); + } else { + info = nda::lapack::sygv(a, b, lambda, jobz, itype); + } + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::detail::eigh_impl: sygv/hegv routine failed: info = " << info; + + return lambda; + } + + } // namespace detail + + /** + * @brief Compute the eigenvalues and eigenvectors of a real symmetric or complex hermitian matrix. + * + * @details It computes the eigenvectors \f$ \mathbf{v}_i \f$ and eigenvalues \f$ \lambda_i \f$ of the matrix \f$ + * \mathbf{A} \f$ such that + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; . + * \f] + * + * If the elements of \f$ \mathbf{A} \f$ are real, it calls nda::lapack::syev. If the elements of \f$ \mathbf{A} \f$ + * are complex, it calls nda::lapack::heev. + * + * It throws an exception if the call to LAPACK fails. + * + * @note The given matrix/view is modified and contains the eigenvectors in its columns after the call. + * + * @tparam A nda::MemoryMatrix type. + * @param a Input/output matrix. On entry, the matrix \f$ \mathbf{A} \f$. On exit, it contains the orthonormal + * eigenvectors \f$ \mathbf{v}_i \f$ in its columns. + * @return An nda::array containing the real eigenvalues \f$ \lambda_i \f$ in ascending order. + */ + template + requires(nda::mem::have_host_compatible_addr_space and is_blas_lapack_v> and nda::blas::has_F_layout) + auto eigh_in_place(A &&a) { + return detail::eigh_impl(std::forward(a), 'V'); + } + + /** + * @brief Compute the eigenvalues and eigenvectors of a generalized real symmetric-definite or complex + * hermitian-definite eigenvalue problem. 
+ * + * @details It computes the eigenvectors \f$ \mathbf{v}_i \f$ and eigenvalues \f$ \lambda_i \f$ of one of the + * following eigenvalue problems: + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * Here \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are assumed to be real symmetric or complex hermitian. In addition, + * \f$ \mathbf{B} \f$ is assumed to be positive definite. + * + * If \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are real, it calls nda::lapack::sygv. Otherwise, it calls + * nda::lapack::hegv. + * + * It throws an exception if the call to LAPACK fails. + * + * @note The given matrices/views are modified. + * + * @tparam A nda::MemoryMatrix type. + * @tparam B nda::MemoryMatrix type. + * @param a Input/output matrix. On entry, the matrix \f$ \mathbf{A} \f$. On exit, it contains the normalized + * eigenvectors \f$ \mathbf{v}_i \f$ in its columns (see nda::lapack::sygv or nda::lapack::hegv for details). + * @param b Input/output matrix. On entry, the matrix \f$ \mathbf{B} \f$. On exit, it is overwritten (see + * nda::lapack::sygv or nda::lapack::hegv for details). + * @param itype Specifies the problem to be solved. + * @return An nda::array containing the real eigenvalues \f$ \lambda_i \f$ in ascending order. + */ + template + requires(nda::mem::have_host_compatible_addr_space and is_blas_lapack_v> and have_same_value_type_v + and nda::blas::has_F_layout and nda::blas::has_F_layout) + auto eigh_in_place(A &&a, B &&b, int itype = 1) { + return detail::eigh_impl(std::forward(a), std::forward(b), 'V', itype); + } + + /** + * @brief Compute the eigenvalues and eigenvectors of a real symmetric or complex hermitian matrix. + * + * @details It computes the eigenvectors \f$ \mathbf{v}_i \f$ and eigenvalues \f$ \lambda_i \f$ of the matrix \f$ + * \mathbf{A} \f$ such that + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; . + * \f] + * + * It makes a copy of the given matrix/view and calls nda::linalg::eigh_in_place with the copy. + * + * @tparam A nda::Matrix type. + * @param a Input matrix. The matrix \f$ \mathbf{A} \f$. + * @return `std::pair` containing an nda::array with the real eigenvalues \f$ \lambda_i \f$ in ascending order and an + * nda::matrix in nda::F_layout containing the eigenvectors \f$ \mathbf{v}_i \f$ in its columns. + */ + template + requires(Scalar>) + auto eigh(A const &a) { + using value_t = get_value_t; + auto a_copy = matrix{a}; + auto lambda = eigh_in_place(a_copy); + return std::make_pair(lambda, a_copy); + } + + /** + * @brief Compute the eigenvalues and eigenvectors of a generalized real symmetric-definite or complex + * hermitian-definite eigenvalue problem. + * + * @details It computes the eigenvectors \f$ \mathbf{v}_i \f$ and eigenvalues \f$ \lambda_i \f$ of one of the + * following eigenvalue problems: + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * It makes a copy of the given matrices/views and calls nda::linalg::eigh_in_place(A &&, B&&, int) with the copies. + * + * @tparam A nda::Matrix type. + * @tparam B nda::Matrix type. + * @param a Input matrix. The matrix \f$ \mathbf{A} \f$. 
+ * @param b Input matrix. The matrix \f$ \mathbf{B} \f$. + * @param itype Specifies the problem to be solved. + * @return `std::pair` containing an nda::array with the real eigenvalues \f$ \lambda_i \f$ in ascending order and an + * nda::matrix in nda::F_layout containing the eigenvectors \f$ \mathbf{v}_i \f$ in its columns. + */ + template + requires(Scalar> and Scalar> and std::is_same_v, get_fp_t>) + auto eigh(A const &a, B const &b, int itype = 1) { + using value_t = std::conditional_t> or is_complex_v>, std::complex>, get_fp_t>; + auto a_copy = matrix{a}; + auto b_copy = matrix{b}; + auto lambda = eigh_in_place(a_copy, b_copy, itype); + return std::make_pair(lambda, a_copy); + } + + /** + * @brief Compute the eigenvalues of a real symmetric or complex hermitian matrix. + * + * @details It computes the eigenvalues \f$ \lambda_i \f$ of the matrix \f$ \mathbf{A} \f$ such that + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; . + * \f] + * + * If the elements of \f$ \mathbf{A} \f$ are real, it calls nda::lapack::syev. If the elements of \f$ \mathbf{A} \f$ + * are complex, it calls nda::lapack::heev. + * + * It throws an exception if the call to LAPACK fails. + * + * @note The given matrix/view is modified. + * + * @tparam A nda::MemoryMatrix type. + * @param a Input/output matrix. On entry, the matrix \f$ \mathbf{A} \f$. On exit, the contents of \f$ \mathbf{A} \f$ + * are destroyed. + * @return An nda::array containing the real eigenvalues in ascending order. + */ + template + requires(nda::mem::have_host_compatible_addr_space and is_blas_lapack_v> and nda::blas::has_F_layout) + auto eigvalsh_in_place(A &&a) { + return detail::eigh_impl(std::forward(a), 'N'); + } + + /** + * @brief Compute the eigenvalues of a generalized real symmetric-definite or complex hermitian-definite eigenvalue + * problem. + * + * @details It computes the eigenvalues \f$ \lambda_i \f$ of one of the following eigenvalue problems: + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * Here \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are assumed to be real symmetric or complex hermitian. In addition, + * \f$ \mathbf{B} \f$ is assumed to be positive definite. + * + * If \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are real, it calls nda::lapack::sygv. Otherwise, it calls + * nda::lapack::hegv. + * + * It throws an exception if the call to LAPACK fails. + * + * @note The given matrices/views are modified. + * + * @tparam A nda::MemoryMatrix type. + * @tparam B nda::MemoryMatrix type. + * @param a Input/output matrix. On entry, the matrix \f$ \mathbf{A} \f$. On exit, the contents of \f$ \mathbf{A} \f$ + * are destroyed. + * @param b Input/output matrix. On entry, the matrix \f$ \mathbf{B} \f$. On exit, it is overwritten (see + * nda::lapack::sygv or nda::lapack::hegv for details). + * @param itype Specifies the problem to be solved. + * @return An nda::array containing the real eigenvalues \f$ \lambda_i \f$ in ascending order. 
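A short usage sketch of the new nda::linalg::eigh interface, which supersedes the nda::linalg::eigenelements / eigenvalues functions removed from eigenelements.hpp earlier in this diff. It assumes the umbrella header <nda/nda.hpp> and nested-initializer-list construction of nda::matrix; the non-in-place overloads copy their arguments, so the inputs may have any layout.

#include <nda/nda.hpp>

int main() {
  // real symmetric matrix
  nda::matrix<double> A = {{2.0, 1.0}, {1.0, 2.0}};

  // standard problem A v = lambda v: eigenvalues in ascending order, eigenvectors column-wise (F_layout)
  auto [lambda, vecs] = nda::linalg::eigh(A);

  // generalized problem A v = lambda B v (itype = 1) with a positive definite B
  nda::matrix<double> B = {{2.0, 0.0}, {0.0, 1.0}};
  auto [lambda_gen, vecs_gen] = nda::linalg::eigh(A, B, 1);
}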
+ */ + template + requires(nda::mem::have_host_compatible_addr_space and is_blas_lapack_v> and have_same_value_type_v + and nda::blas::has_F_layout and nda::blas::has_F_layout) + auto eigvalsh_in_place(A &&a, B &&b, int itype = 1) { + return detail::eigh_impl(std::forward(a), std::forward(b), 'N', itype); + } + + /** + * @brief Compute the eigenvalues of a real symmetric or complex hermitian matrix. + * + * @details It computes the eigenvalues \f$ \lambda_i \f$ of the matrix \f$ \mathbf{A} \f$ such that + * \f[ + * \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \; . + * \f] + * + * It makes a copy of the given matrix/view and calls nda::linalg::eigvalsh_in_place with the copy. + * + * @tparam A nda::Matrix type. + * @param a Input matrix. The matrix \f$ \mathbf{A} \f$. + * @return An nda::array containing the real eigenvalues in ascending order. + */ + template + requires(Scalar>) + auto eigvalsh(A const &a) { + using value_t = get_value_t; + auto a_copy = matrix{a}; + return eigvalsh_in_place(a_copy); + } + + /** + * @brief Compute the eigenvalues of a generalized real symmetric-definite or complex hermitian-definite eigenvalue + * problem. + * + * @details It computes the eigenvalues \f$ \lambda_i \f$ of one of the following eigenvalue problems: + * - \f$ \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{B} \mathbf{v}_i \f$ (`itype = 1`), + * - \f$ \mathbf{A} \mathbf{B} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 2`) or + * - \f$ \mathbf{B} \mathbf{A} \mathbf{v}_i = \lambda_i \mathbf{v}_i \f$ (`itype = 3`). + * + * It makes a copy of the given matrices/views and calls nda::linalg::eigvalsh_in_place(A &&, B&&, int) with the + * copies. + * + * @tparam A nda::Matrix type. + * @tparam B nda::Matrix type. + * @param a Input matrix. The matrix \f$ \mathbf{A} \f$. + * @param b Input matrix. The matrix \f$ \mathbf{B} \f$. + * @param itype Specifies the problem to be solved. + * @return An nda::array containing the real eigenvalues in ascending order. + */ + template + requires(Scalar> and Scalar> and std::is_same_v, get_fp_t>) + auto eigvalsh(A const &a, B const &b, int itype = 1) { + using value_t = std::conditional_t> or is_complex_v>, std::complex>, get_fp_t>; + auto a_copy = matrix{a}; + auto b_copy = matrix{b}; + return eigvalsh_in_place(a_copy, b_copy, itype); + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/linalg/inv.hpp b/c++/nda/linalg/inv.hpp new file mode 100644 index 000000000..40c18e05b --- /dev/null +++ b/c++/nda/linalg/inv.hpp @@ -0,0 +1,209 @@ +// Copyright (c) 2019--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides functions to compute the inverse of a matrix. + */ + +#pragma once + +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../concepts.hpp" +#include "../exceptions.hpp" +#include "../lapack/getrf.hpp" +#include "../lapack/getri.hpp" +#include "../lapack/getrs.hpp" +#include "../layout/policies.hpp" +#include "../macros.hpp" +#include "../matrix_functions.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +#include + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + /** + * @brief Compute the inverse of a \f$ 1 \times 1 \f$ matrix \f$ \mathbf{M} \f$. + * + * @details It throws an exception if the matrix is not invertible (i.e. 
if \f$ \det(\mathbf{M}) = 0 \f$). + * + * @note The inversion is performed in place. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input/output matrix. On entry, the matrix \f$ \mathbf{M} \f$. On exit, the matrix \f$ \mathbf{M}^{-1} \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + void inv_in_place_1d(M &&m) { // NOLINT (temporary views are allowed here) + EXPECTS(is_matrix_square(m) and m.extent(0) == 1); + if (m(0, 0) == get_value_t{0.0}) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv_in_place_1d: Matrix is not invertible"; + m(0, 0) = 1.0 / m(0, 0); + } + + /** + * @brief Compute the inverse of a \f$ 2 \times 2 \f$ matrix \f$ \mathbf{M} \f$. + * + * @details It throws an exception if the matrix is not invertible (i.e. if \f$ \det(\mathbf{M}) = 0 \f$). + * + * @note The inversion is performed in place. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input/output matrix. On entry, the matrix \f$ \mathbf{M} \f$. On exit, the matrix \f$ \mathbf{M}^{-1} \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + void inv_in_place_2d(M &&m) { // NOLINT (temporary views are allowed here) + EXPECTS(is_matrix_square(m) and m.extent(0) == 2); + + // calculate the determinant of the matrix + auto const det = (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)); + if (det == get_value_t{0.0}) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv_in_place_2d: Matrix is not invertible"; + auto const detinv = 1.0 / det; + + // multiply the adjoint by the inverse determinant + std::swap(m(0, 0), m(1, 1)); + m(0, 0) *= +detinv; + m(1, 1) *= +detinv; + m(1, 0) *= -detinv; + m(0, 1) *= -detinv; + } + + /** + * @brief Compute the inverse of a \f$ 3 \times 3 \f$ matrix \f$ \mathbf{M} \f$. + * + * @details It throws an exception if the matrix is not invertible (i.e. if \f$ \det(\mathbf{M}) = 0 \f$). + * + * @note The inversion is performed in place. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input/output matrix. On entry, the matrix \f$ \mathbf{M} \f$. On exit, the matrix \f$ \mathbf{M}^{-1} \f$. 
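A minimal usage sketch of the inversion API in this header: the general-purpose nda::linalg::inv_in_place and the copying nda::linalg::inv used below are defined further down in this file and dispatch to the fixed-size kernels above for n <= 3 and to getrf/getri otherwise. The umbrella header <nda/nda.hpp> is assumed; double-valued host matrices satisfy the stated constraints.

#include <nda/nda.hpp>

int main() {
  nda::matrix<double> M = {{1.0, 2.0}, {3.0, 4.0}};

  // in-place inversion: a 2x2 matrix goes through the optimized kernel
  nda::linalg::inv_in_place(M);

  // copying version: the input is left untouched and the inverse is returned
  nda::matrix<double> A = {{2.0, 0.0}, {0.0, 4.0}};
  auto Ainv = nda::linalg::inv(A); // {{0.5, 0}, {0, 0.25}}
}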
+ */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + void inv_in_place_3d(M &&m) { // NOLINT (temporary views are allowed here) + EXPECTS(is_matrix_square(m) and m.extent(0) == 3); + + // calculate the cofactors of the matrix + auto const b00 = +m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1); + auto const b10 = -m(1, 0) * m(2, 2) + m(1, 2) * m(2, 0); + auto const b20 = +m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0); + auto const b01 = -m(0, 1) * m(2, 2) + m(0, 2) * m(2, 1); + auto const b11 = +m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0); + auto const b21 = -m(0, 0) * m(2, 1) + m(0, 1) * m(2, 0); + auto const b02 = +m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1); + auto const b12 = -m(0, 0) * m(1, 2) + m(0, 2) * m(1, 0); + auto const b22 = +m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0); + + // calculate the determinant of the matrix + auto const det = m(0, 0) * b00 + m(0, 1) * b10 + m(0, 2) * b20; + if (det == get_value_t{0.0}) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv_in_place_3d: Matrix is not invertible"; + auto const detinv = 1.0 / det; + + // multiply the cofactors by the inverse determinant + m(0, 0) = detinv * b00; + m(0, 1) = detinv * b01; + m(0, 2) = detinv * b02; + m(1, 0) = detinv * b10; + m(1, 1) = detinv * b11; + m(1, 2) = detinv * b12; + m(2, 0) = detinv * b20; + m(2, 1) = detinv * b21; + m(2, 2) = detinv * b22; + } + + /** + * @brief Compute the inverse of an \f$ n \times n \f$ matrix \f$ \mathbf{M} \f$. + * + * @details For small matrices (\f$ 1 \times 1 \f$, \f$ 2 \times 2 \f$ or \f$ 3 \times 3 \f$), it directly computes + * the matrix inversion using one of the optimized routines nda::linalg::inv_in_place_1d, nda::linalg::inv_in_place_2d + * or nda::linalg::inv_in_place_3d. + * + * For larger matrices, it calls nda::lapack::getrf and nda::lapack::getri. + * + * It throws an exception if the matrix is not invertible, i.e. if \f$ \det(\mathbf{M}) = 0 \f$, or if a call to + * LAPACK fails. + * + * @note The inversion is performed in place. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input/output matrix. On entry, the matrix \f$ \mathbf{M} \f$. On exit, the matrix \f$ \mathbf{M}^{-1} \f$. + */ + template + requires(get_algebra == 'M' and nda::mem::have_host_compatible_addr_space and is_blas_lapack_v>) + void inv_in_place(M &&m) { // NOLINT (temporary views are allowed here) + EXPECTS(is_matrix_square(m)); + + // use optimized routines for small matrices, otherwise use LAPACK routines + auto const dim = m.shape()[0]; + if (dim == 1) { + inv_in_place_1d(m); + } else if (dim == 2) { + inv_in_place_2d(m); + } else if (dim == 3) { + inv_in_place_3d(m); + } else if (dim > 3) { + // LU factorization with getrf + auto ipiv = vector>>(dim); + int info = nda::lapack::getrf(m, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv_in_place: getrf routine failed: info = " << info; + + // calculate the inverse with getri + info = nda::lapack::getri(m, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv_in_place: getri routine failed: info = " << info; + } + } + + /** + * @brief Compute the inverse of an \f$ n \times n \f$ matrix \f$ \mathbf{M} \f$. + * + * @details The given matrix/view is not modified. It first makes a copy of the matrix/view and then + * + * - uses nda::lapack::getrf and nda::lapack::getrs to compute the inverse in case the matrix's memory space is + * compatible with the device memory space. The resulting inverse matrix is always in nda::F_layout. 
+ * - calls nda::linalg::inv_in_place if the matrix is stored on the host memory space. + * + * @warning This function makes copies of the input arrays/views. When working on the device memory space, this may + * lead to runtime errors if the copying fails. + * + * @tparam M nda::MemoryMatrix type. + * @param m Input matrix. The matrix \f$ \mathbf{M} \f$. + * @return The inverse matrix \f$ \mathbf{M}^{-1} \f$. + */ + template + requires(get_algebra == 'M' and is_blas_lapack_v>) + auto inv(M const &m) { + EXPECTS(is_matrix_square(m)); + auto m_copy = make_regular(m); + + // for device compatible address spaces, we use getrf and getrs, otherwise we call inv_in_place + if constexpr (nda::mem::have_device_compatible_addr_space) { + // LU factorization with getrf + auto ipiv = vector>>(m_copy.extent(0)); + int info = nda::lapack::getrf(m_copy, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv: getrf routine failed: info = " << info; + + // calculate the inverse with getrs and the identity matrix + auto B = matrix, F_layout, heap>>{transpose(eye>(m_copy.extent(0)))}; + info = nda::lapack::getrs(m_copy, B, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::linalg::inv: getrs routine failed: info = " << info; + return B; + } else { + inv_in_place(m_copy); + return m_copy; + } + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/linalg/matmul.hpp b/c++/nda/linalg/matmul.hpp index 307447aab..aff31f196 100644 --- a/c++/nda/linalg/matmul.hpp +++ b/c++/nda/linalg/matmul.hpp @@ -5,17 +5,18 @@ /** * @file - * @brief Provides matrix-matrix an matrix-vector multiplication. + * @brief Provides a generic matrix-matrix multiplication. */ #pragma once +#include "../basic_array.hpp" #include "../basic_functions.hpp" #include "../blas/gemm.hpp" -#include "../blas/gemv.hpp" #include "../blas/tools.hpp" #include "../concepts.hpp" #include "../declarations.hpp" +#include "../exceptions.hpp" #include "../layout/policies.hpp" #include "../mem/address_space.hpp" #include "../mem/policies.hpp" @@ -24,7 +25,7 @@ #include #include -namespace nda { +namespace nda::linalg { /** * @addtogroup linalg_tools @@ -33,159 +34,134 @@ namespace nda { namespace detail { - // Helper variable template to check if the three matrix types can be passed to gemm. - // The following combinations are allowed (gemm can only be called with 'N', 'T' or 'C' op tags): - // - C in Fortran layout: - // -- A/B is not a conj expression and has Fortran layout - // -- A/B is a conj expression and has C layout - // - C in C layout: - // -- A/B is not a conj expression and has C layout - // -- A/B is a conj expression and has Fortran layout - template , bool conj_B = blas::is_conj_array_expr> - requires((MemoryMatrix or conj_A) and (MemoryMatrix or conj_B)) - static constexpr bool is_valid_gemm_triple = []() { - using blas::has_F_layout; - if constexpr (has_F_layout) { - return !(conj_A and has_F_layout) and !(conj_B and has_F_layout); + /// Generic matrix-matrix multiplication for types not supported by BLAS. 
+ template + requires(mem::have_host_compatible_addr_space) + void gemm_generic(auto alpha, A const &a, B const &b, auto beta, C &&c) { // NOLINT (temporary views are allowed here) + // check the dimensions of the input/output arrays/views + auto const [m, k] = a.shape(); + auto const [l, n] = b.shape(); + EXPECTS(k == l); + EXPECTS(m == c.extent(0)); + EXPECTS(n == c.extent(1)); + + // perform the matrix-matrix multiplication + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + c(i, j) = beta * c(i, j); + for (int r = 0; r < k; ++r) c(i, j) += alpha * a(i, r) * b(r, j); + } + } + } + + // Make compile time checks if blas::gemm can handle the given input matrix. If it can, simply forward the matrix. + // Otherwise, return a copy with the given value type T, layout policy LP and container policy CP. + template + decltype(auto) get_gemm_matrix(A &&a) { + if constexpr (requires { blas::get_array(a); } and std::is_same_v, T>) { + if constexpr (MemoryMatrix + or (blas::is_conj_array_expr + and ((blas::has_F_layout and blas::has_C_layout) or (blas::has_C_layout and blas::has_F_layout)))) { + return std::forward(a); + } else { + return matrix{a}; + } } else { - return !(conj_B and !has_F_layout) and !(conj_A and !has_F_layout); + return matrix{a}; } - }(); + } + + // Make the call to nda::blas::gemm (with copies of the matrices if they are not contiguous). + template + void make_gemm_call(A const &a, B const &b, C &c) { + auto try_gemm = [](A2 &&a2, B2 &&b2, C2 &&c2) { + if constexpr (requires { blas::gemm(1, std::forward(a2), std::forward(b2), 0, std::forward(c2)); }) { + blas::gemm(1, std::forward(a2), std::forward(b2), 0, std::forward(c2)); + } else { + NDA_RUNTIME_ERROR << "Error in nda::linalg::matmul: Cannot call blas::gemm with the given input arrys/views."; + } + }; + if (blas::get_array(a).is_contiguous()) { + if (blas::get_array(b).is_contiguous()) { + try_gemm(a, b, c); + } else { + try_gemm(a, nda::make_regular(b), c); + } + } else { + if (blas::get_array(b).is_contiguous()) { + try_gemm(nda::make_regular(a), b, c); + } else { + try_gemm(nda::make_regular(a), nda::make_regular(b), c); + } + } + } // Get the layout policy for a given array type. template - using get_layout_policy = typename std::remove_reference_t()))>::layout_policy_t; + using get_layout_policy = typename std::remove_cvref_t()))>::layout_policy_t; } // namespace detail /** - * @brief Perform a matrix-matrix multiplication. + * @brief Compute the matrix-matrix product of two nda::matrix objects. * - * @details It is generic in the sense that it allows the input matrices to belong to a different - * nda::mem::AddressSpace (as long as they are compatible). + * @details This function computes the matrix-matrix product + * \f[ + * \mathrm{op}_A(\mathbf{A}) \mathrm{op}_B(\mathbf{B}) \; , + * \f] + * where \f$ \mathrm{op}_A(\mathbf{A}) \f$ and \f$ \mathrm{op}_B(\mathbf{B}) \f$ are \f$ m \times k \f$ and \f$ k + * \times n \f$ matrices, respectively. \f$ \mathrm{op}_i \f$ can be some lazy operation, e.g. nda::conj, nda::sin, + * etc. * - * If possible, it uses nda::blas::gemm, otherwise it calls nda::blas::gemm_generic. + * We try to call nda::blas::gemm whenever possible, i.e. when the value type of the result is compatible with + * nda::is_blas_lapack_v, even if this requires to make copies of the input arrays/views. Otherwise, we perform a very + * naive and inefficient matrix-matrix multiplication manually. * - * @tparam A nda::Matrix type of lhs operand. - * @tparam B nda::Matrix type of rhs operand. 
- * @param a Left hand side matrix operand. - * @param b Right hand side matrix operand. - * @return Result of the matrix-matrix multiplication. + * Therefore, if performance is important, users should make sure to pass input arrays/views which are compatible with + * nda::blas::gemm. + * + * @note The layout of the returned matrix depends on the layout of the input matrices. If both input matrices are in + * nda::F_layout, the returned matrix is also in nda::F_layout. Otherwise, it is in nda::C_layout. + * + * @warning This function might make copies of the input arrays/views. When working on the device memory space, this + * may lead to runtime errors if the copying fails. + * + * @tparam A nda::Matrix type. + * @tparam B nda::Matrix type. + * @param a Input matrix \f$ \mathrm{op}_A(\mathbf{A}) \f$ of size \f$ m \times k \f$. + * @param b Input matrix \f$ \mathrm{op}_B(\mathbf{B}) \f$ of size \f$ k \times n \f$. + * @return Resulting matrix of the matrix-matrix multiplication of size \f$ m \times n \f$. */ template auto matmul(A &&a, B &&b) { // NOLINT (temporary views are allowed here) - // check dimensions - EXPECTS_WITH_MESSAGE(a.shape()[1] == b.shape()[0], "Error in nda::matmul: Dimension mismatch in matrix-matrix product"); - - // check address space compatibility - static constexpr auto L_adr_spc = mem::get_addr_space; - static constexpr auto R_adr_spc = mem::get_addr_space; - mem::check_adr_sp_valid(); - - // get resulting value type, layout policy and matrix type - using value_t = decltype(get_value_t{} * get_value_t{}); - using layout_policy = - std::conditional_t.stride_order == get_layout_info.stride_order, detail::get_layout_policy, C_layout>; - using matrix_t = basic_array>>; - - // perform matrix-matrix multiplication - auto result = matrix_t(a.shape()[0], b.shape()[1]); - if constexpr (is_blas_lapack_v) { - // for double or complex value types we use blas::gemm - // lambda to form a new matrix with the correct value type if necessary - auto as_container = [](M &&m) -> decltype(auto) { - if constexpr (std::is_same_v, value_t> and (MemoryMatrix or blas::is_conj_array_expr)) - return std::forward(m); - else - return matrix_t{std::forward(m)}; - }; - - // MSAN has no way to know that we are calling with beta = 0, hence this is not necessary. - // Of course, in production code, we do NOT waste time to do this. + // get the return type + using value_t = decltype(a(0, 0) * b(0, 0)); + using cont_pol = heap>; + using layout_pol = std::conditional_t.stride_order == get_layout_info.stride_order, detail::get_layout_policy, C_layout>; + using return_t = matrix; + + // result matrix (MSAN complains if it is not initialized) + auto res = return_t(a.shape()[0], b.shape()[1]); #if defined(__has_feature) #if __has_feature(memory_sanitizer) - result = 0; + res = 0; #endif #endif - // check if we can call gemm directly - if constexpr (detail::is_valid_gemm_triple) { - blas::gemm(1, as_container(a), as_container(b), 0, result); - } else { - // otherwise, turn the lhs and rhs first into regular matrices and then call gemm - blas::gemm(1, make_regular(as_container(a)), make_regular(as_container(b)), 0, result); - } - - } else { - // for other value types we use a generic implementation - blas::gemm_generic(1, a, b, 0, result); - } - return result; - } - - /** - * @brief Perform a matrix-vector multiplication. - * - * @details It is generic in the sense that it allows the input matrix and vector to belong to a different - * nda::mem::AddressSpace (as long as they are compatible). 
- * - * If possible, it uses nda::blas::gemv, otherwise it calls nda::blas::gemv_generic. - * - * @tparam A nda::Matrix type of lhs operand. - * @tparam X nda::Vector type of rhs operand. - * @param a Left hand side matrix operand. - * @param x Right hand side vector operand. - * @return Result of the matrix-vector multiplication. - */ - template - auto matvecmul(A &&a, X &&x) { // NOLINT (temporary views are allowed here) - // check dimensions - EXPECTS_WITH_MESSAGE(a.shape()[1] == x.shape()[0], "Error in nda::matvecmul: Dimension mismatch in matrix-vector product"); - - // check address space compatibility - static constexpr auto L_adr_spc = mem::get_addr_space; - static constexpr auto R_adr_spc = mem::get_addr_space; - static_assert(L_adr_spc == R_adr_spc, "Error in nda::matvecmul: Matrix-vector product requires arguments with same address spaces"); - static_assert(L_adr_spc != mem::None); - - // get resulting value type and vector type - using value_t = decltype(get_value_t{} * get_value_t{}); - using vector_t = vector>; - - // perform matrix-matrix multiplication - auto result = vector_t(a.shape()[0]); + // perform matrix-matrix multiplication (if possible we try to call blas::gemv even if this requires making copies) if constexpr (is_blas_lapack_v) { - // for double or complex value types we use blas::gemv - // lambda to form a new array with the correct value type if necessary - auto as_container = [](B &&b) -> decltype(auto) { - if constexpr (std::is_same_v, value_t> and (MemoryMatrix or (Matrix and blas::is_conj_array_expr))) - return std::forward(b); - else - return basic_array, C_layout, 'A', heap>{std::forward(b)}; - }; - - // MSAN has no way to know that we are calling with beta = 0, hence this is not necessary. - // Of course, in production code, we do NOT waste time to do this. -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) - result = 0; -#endif -#endif + // check at compile time if we need to make a copy of the input matrices + auto &&a_mat = detail::get_gemm_matrix(a); + auto &&b_mat = detail::get_gemm_matrix(b); - // for expressions of the kind 'conj(M) * V' with a Matrix in Fortran Layout, we have to explicitly - // form the conj operation in memory as gemv only provides op tags 'N', 'T' and 'C' (hermitian conjugate) - if constexpr (blas::is_conj_array_expr and blas::has_F_layout) { - blas::gemv(1, make_regular(as_container(a)), as_container(x), 0, result); - } else { - blas::gemv(1, as_container(a), as_container(x), 0, result); - } + // check at runtime if the input matrices are contiguous, make copies if not and call blas::gemm + detail::make_gemm_call(a_mat, b_mat, res); } else { - // for other value types we use a generic implementation - blas::gemv_generic(1, a, x, 0, result); + detail::gemm_generic(1, a, b, 0, res); } - return result; + return res; } /** @} */ -} // namespace nda +} // namespace nda::linalg diff --git a/c++/nda/linalg/matvecmul.hpp b/c++/nda/linalg/matvecmul.hpp new file mode 100644 index 000000000..89a28eb7c --- /dev/null +++ b/c++/nda/linalg/matvecmul.hpp @@ -0,0 +1,155 @@ +// Copyright (c) 2019--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic matrix-vector multiplication. 
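A short usage sketch of nda::linalg::matmul as rewritten above, assuming the umbrella header <nda/nda.hpp> and nested-initializer-list construction. With double values the call goes through blas::gemm (copies are made if the operands are not gemm-compatible); other value types fall back to the naive loop.

#include <nda/nda.hpp>

int main() {
  nda::matrix<double> A = {{1.0, 2.0}, {3.0, 4.0}};
  nda::matrix<double> B = {{0.0, 1.0}, {1.0, 0.0}};

  // C(i, j) = sum_k A(i, k) * B(k, j); both inputs are C_layout, so C is C_layout as well
  auto C = nda::linalg::matmul(A, B); // {{2, 1}, {4, 3}}
}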
+ */ + +#pragma once + +#include "../basic_array.hpp" +#include "../basic_functions.hpp" +#include "../blas/gemv.hpp" +#include "../blas/tools.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../exceptions.hpp" +#include "../layout/policies.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +#include +#include + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + namespace detail { + + // Generic matrix-vector multiplication for types not supported by BLAS. + template + requires(nda::mem::have_host_compatible_addr_space) + void gemv_generic(auto alpha, A const &a, X const &x, auto beta, Y &&y) { // NOLINT (temporary views are allowed here) + // check the dimensions of the input/output arrays/views + auto const [m, n] = a.shape(); + EXPECTS(n == x.size()); + EXPECTS(m == y.size()); + + // perform the matrix-vector multiplication + for (int i = 0; i < m; ++i) { + y(i) = beta * y(i); + for (int j = 0; j < n; ++j) y(i) += alpha * a(i, j) * x(j); + } + } + + // Make compile time checks if blas::gemv can handle the given input vector. If it can, simply forward the vector. + // Otherwise, return a copy with the given value type T and container policy CP. + template + decltype(auto) get_gemv_vector(X &&x) { + if constexpr (std::is_same_v, T> and MemoryVector) { + return std::forward(x); + } else { + return vector{x}; + } + } + + // Make compile time checks if blas::gemv can handle the given input matrix. If it can, simply forward the matrix. + // Otherwise, return a copy with the given value type T and container policy CP. + template + decltype(auto) get_gemv_matrix(A &&a) { + if constexpr (requires { blas::get_array(a); } and std::is_same_v, T>) { + if constexpr (MemoryMatrix or (blas::is_conj_array_expr and blas::has_C_layout)) { + return std::forward(a); + } else { + return matrix{a}; + } + } else { + return matrix{a}; + } + } + + // Make the call to nda::blas::gemv with a copy of the matrix if it is not contiguous. + template + void make_gemv_call(A const &a, X const &x, Y &y) { + auto try_gemv = [](A2 &&a2, X2 &&x2, Y2 &&y2) { + if constexpr (requires { blas::gemv(1, std::forward(a2), std::forward(x2), 0, std::forward(y2)); }) { + blas::gemv(1, std::forward(a2), std::forward(x2), 0, std::forward(y2)); + } else { + NDA_RUNTIME_ERROR << "Error in nda::linalg::matvecmul: Cannot call blas::gemv with the given input arrys/views."; + } + }; + if (blas::get_array(a).is_contiguous()) { + try_gemv(a, x, y); + } else { + try_gemv(nda::make_regular(a), x, y); + } + } + + } // namespace detail + + /** + * @brief Compute the matrix-vector product of an nda::matrix and an nda::vector object. + * + * @details This function computes the matrix-vector product + * \f[ + * \mathrm{op}_A(\mathbf{A}) \mathrm{op}_x(\mathbf{x}) \; , + * \f] + * where \f$ \mathrm{op}_A(\mathbf{A}) \f$ is an \f$ m \times n \f$ matrix and \f$ \mathrm{op}_x(\mathbf{x}) \f$ is a + * vector of size \f$ n \f$. \f$ \mathrm{op}_i \f$ can be some lazy operation, e.g. nda::conj, nda::sin, etc. + * + * We try to call nda::blas::gemv whenever possible, i.e. when the value type of the result is compatible with + * nda::is_blas_lapack_v, even if this requires to make copies of the input arrays/views. Otherwise, we perform a very + * naive and inefficient matrix-vector multiplication manually. + * + * Therefore, if performance is important, users should make sure to pass input arrays/views which are compatible with + * nda::blas::gemv. 
+ * + * @warning This function might make copies of the input arrays/views. When working on the device memory space, this + * may lead to runtime errors if the copying fails. + * + * @tparam A nda::Matrix type. + * @tparam X nda::Vector type. + * @param a Input matrix \f$ \mathrm{op}_A(\mathbf{A}) \f$ of size \f$ m \times n \f$. + * @param x Input vector \f$ \mathrm{op}_x(\mathbf{x}) \f$ of size \f$ n \f$. + * @return Resulting vector of the matrix-vector multiplication of size \f$ m \f$. + */ + template + requires(mem::have_compatible_addr_space) + auto matvecmul(A const &a, X const &x) { + // get the return type + using value_t = decltype(a(0, 0) * x(0)); + using cont_pol = heap>; + using return_t = vector; + + // result vector (MSAN complains if it is not initialized) + auto res = return_t(a.shape()[0]); +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + res = 0; +#endif +#endif + + // perform matrix-vector multiplication (if possible we try to call blas::gemv even if this requires making copies) + if constexpr (is_blas_lapack_v) { + auto &&a_mat = detail::get_gemv_matrix(a); + auto &&x_vec = detail::get_gemv_vector(x); + + // check at runtime if the input matrix is contiguous, make copies if not and call blas::gemv + detail::make_gemv_call(a_mat, x_vec, res); + } else { + detail::gemv_generic(1, a, x, 0, res); + } + return res; + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/linalg/norm.hpp b/c++/nda/linalg/norm.hpp index eeea37a08..d017957cc 100644 --- a/c++/nda/linalg/norm.hpp +++ b/c++/nda/linalg/norm.hpp @@ -3,8 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 // See LICENSE in the root of this distribution for details. -#pragma once - /** * @file * @brief Provides the p-norm for general arrays/views of rank 1 and with scalar elements. @@ -23,30 +21,29 @@ #include #include -namespace nda { +namespace nda::linalg { /** * @ingroup linalg_tools - * @brief Calculate the p-norm of an nda::ArrayOfRank<1> object \f$ \mathbf{x} \f$ with scalar values. - * The p-norm is defined as + * @brief Calculate the p-norm of an nda::ArrayOfRank<1> object \f$ \mathbf{x} \f$ with scalar values. The p-norm is + * defined as * \f[ * || \mathbf{x} ||_p = \left( \sum_{i=0}^{N-1} |x_i|^p \right)^{1/p} * \f] - * with the special cases (following numpy.linalg.norm convention) + * with the special cases (following `numpy.linalg.norm` convention) * * - \f$ || \mathbf{x} ||_0 = \text{number of non-zero elements} \f$, - * - \f$ || \mathbf{x} ||_{\infty} = \max_i |x_i| \f$, - * - \f$ || \mathbf{x} ||_{-\infty} = \min_i |x_i| \f$. + * - \f$ || \mathbf{x} ||_{\infty} = \max \{ |x_i| : i = 0, \dots, N - 1 \} \f$, + * - \f$ || \mathbf{x} ||_{-\infty} = \min \{ |x_i| : i = 0, \dots, N - 1 \} \f$. * * @tparam A nda::ArrayOfRank<1> type. - * @param a nda::ArrayOfRank<1> object. + * @param a nda::ArrayOfRank<1> object \f$ \mathbf{x} \f$. * @param p Order of the norm. - * @return Norm of the array/view as a double. + * @return p-norm of the array/view. 
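A brief usage sketch of the new nda::linalg::matvecmul defined above; the umbrella header <nda/nda.hpp> is assumed. For double/complex values the call is forwarded to blas::gemv (with copies if needed), otherwise the naive loop is used.

#include <nda/nda.hpp>

int main() {
  nda::matrix<double> A = {{1.0, 2.0}, {3.0, 4.0}};
  nda::vector<double> x = {1.0, 1.0};

  // y(i) = sum_j A(i, j) * x(j)
  auto y = nda::linalg::matvecmul(A, x); // {3, 7}
}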
*/ template A> + requires(Scalar>) double norm(A const &a, double p = 2.0) { - static_assert(Scalar>, "Error in nda::norm: Only scalar value types are allowed"); - if (p == 2.0) [[likely]] { if constexpr (MemoryArray) return std::sqrt(std::real(nda::blas::dotc(a, a))); @@ -69,4 +66,4 @@ namespace nda { } } -} // namespace nda +} // namespace nda::linalg diff --git a/c++/nda/linalg/outer_product.hpp b/c++/nda/linalg/outer_product.hpp new file mode 100644 index 000000000..73642e665 --- /dev/null +++ b/c++/nda/linalg/outer_product.hpp @@ -0,0 +1,80 @@ +// Copyright (c) 2020--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +/** + * @file + * @brief Provides a generic outer product function. + */ + +#pragma once + +#include "../blas/ger.hpp" +#include "../blas/tools.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +namespace nda::linalg { + + /** + * @ingroup linalg_tools + * @brief Outer product of two arrays/views. + * + * @details It calculates the outer product \f$ \mathbf{C} = \mathbf{A} \otimes \mathbf{B} \f$, such that + * \f[ + * \mathbf{C}_{i_1 \ldots i_k j_1 \ldots j_l} = \mathbf{A}_{i_1 \ldots i_k} \mathbf{B}_{j_1 \ldots j_l} \; . + * \f] + * Here, \f$ \mathbf{A} \f$ and \f$ \mathbf{B} \f$ are the input arrays with shape \f$ (m_1, \ldots, m_k) \f$ and \f$ + * (n_1, \ldots, n_l) \f$, respectively. The resulting array \f$ \mathbf{C} \f$ has shape \f$ (m_1, \ldots, m_k, n_1, + * \ldots, n_l) \f$. + * + * The outer product is performed by calling nda::blas::ger, which imposes various constraints on the supported input + * arrays/views, e.g. + * - their memory layouts have to be the same and either nda::C_layout or nda::F_layout, + * - they have to be contiguous in memory, + * - etc. + * + * See nda::blas::ger for more details. + * + * @tparam A nda::MemoryArray type. + * @tparam B nda::MemoryArray type. + * @param a Input array/view \f$ \mathbf{A} \f$. + * @param b Input array/view \f$ \mathbf{B} \f$. + * @return Outer product \f$ \mathbf{A} \otimes \mathbf{B} \f$. 
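A quick sketch of nda::linalg::norm, which now lives in the nda::linalg namespace; the umbrella header <nda/nda.hpp> is assumed. The special cases follow the numpy.linalg.norm convention documented above.

#include <nda/nda.hpp>
#include <limits>

int main() {
  nda::vector<double> v = {3.0, -4.0, 0.0};

  double n2   = nda::linalg::norm(v);      // default p = 2: sqrt(9 + 16) = 5
  double n1   = nda::linalg::norm(v, 1.0); // sum of absolute values: 7
  double n0   = nda::linalg::norm(v, 0.0); // number of non-zero elements: 2
  double ninf = nda::linalg::norm(v, std::numeric_limits<double>::infinity()); // max |v_i|: 4
}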
+ */ + template + requires((nda::blas::has_C_layout or nda::blas::has_F_layout) and nda::blas::has_C_layout == nda::blas::has_C_layout) + auto outer_product(A const &a, B const &b) { + // check the input arrays/views + EXPECTS(a.is_contiguous()); + EXPECTS(b.is_contiguous()); + + // get the return type + auto constexpr rank = get_rank + get_rank; + auto constexpr algebra = []() { + if constexpr (get_algebra == 'V' and get_algebra == 'V') { + return 'M'; + } else { + return 'A'; + } + }(); + using layout_pol = typename A::layout_policy_t::contiguous_t; + using cont_pol = heap>; + using return_t = basic_array, rank, layout_pol, algebra, cont_pol>; + + // use ger to calculate the outer product + auto res = return_t::zeros(stdutil::join(a.shape(), b.shape())); + auto a_vec = reshape(a, std::array{a.size()}); + auto b_vec = reshape(b, std::array{b.size()}); + auto mat = reshape(res, std::array{a.size(), b.size()}); + nda::blas::ger(1.0, a_vec, b_vec, mat); + + return res; + } + +} // namespace nda::linalg diff --git a/c++/nda/linalg/solve.hpp b/c++/nda/linalg/solve.hpp new file mode 100644 index 000000000..6964afd44 --- /dev/null +++ b/c++/nda/linalg/solve.hpp @@ -0,0 +1,141 @@ +// Copyright (c) 2019-2024 Simons Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Olivier Parcollet, Nils Wentzell + +/** + * @file + * @brief Provides functions to solve linear systems of equations. + */ + +#pragma once + +#include "../basic_array.hpp" +#include "../blas/tools.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../exceptions.hpp" +#include "../lapack/getrf.hpp" +#include "../lapack/getrs.hpp" +#include "../layout/policies.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +#include + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + /** + * @brief Solve a system of linear equations. + * + * @details The function solves a system of linear equations + * + * - \f$ \mathbf{A X} = \mathbf{B} \f$ or + * - \f$ \mathbf{A x} = \mathbf{b} \f$, + * + * with a general \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ and either + * + * - \f$ n \times m \f$ matrices \f$ \mathbf{X} \f$ and \f$ \mathbf{B} \f$ or + * - vectors \f$ \mathbf{x} \f$ and \f$ \mathbf{b} \f$ of size \f$ n \f$. + * + * It uses nda::lapack::getrf to compute the LU factorization of the matrix \f$ \mathbf{A} \f$ and then + * nda::lapack::getrs to solve the system of linear equations. + * + * An exception is thrown, if the LAPACK calls return a non-zero value. + * + * @note Right hand side matrix \f$ \mathbf{B} \f$ must have nda::F_layout. + * + * @tparam A nda::MemoryMatrix type. + * @tparam B nda::MemoryArray type of rank 1 or 2. + * @param a Input/Output matrix. On entry, the \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ determining the linear + * system. On exit, the result of the LU factorization from nda::lapack::getrf. + * @param b Input/Output matrix. 
On entry, the right hand side matrix \f$ \mathbf{B} \f$ (vector \f$ \mathbf{b} \f$). + * On exit, the solution matrix \f$ \mathbf{X} \f$ (vector \f$ \mathbf{x} \f$). + */ + template + requires(have_same_value_type_v and nda::mem::have_compatible_addr_space and is_blas_lapack_v>) + void solve_in_place(A &&a, B &&b) { // NOLINT (temporary views are allowed here) + constexpr auto addr_space = nda::mem::common_addr_space; + + // check the dimensions of the input/output arrays/views + EXPECTS_WITH_MESSAGE(a.shape()[0] == a.shape()[1], "Error in nda::solve_in_place: Matrix A is not square"); + EXPECTS_WITH_MESSAGE(a.shape()[0] == b.extent(0), "Error in nda::solve_in_place: Dimension mismatch between matrix A and B"); + + // pivot indices vector + auto ipiv = vector>(a.extent(0)); + + // call lapack getrf + int info = lapack::getrf(a, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::solve_in_place: getrf returned a non-zero value: info = " << info; + + // call lapack getrs + info = lapack::getrs(a, b, ipiv); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::solve_in_place: getrs returned a non-zero value: info = " << info; + } + + /** + * @brief Solve a system of linear equations. + * + * @details The function solves a system of linear equations + * + * - \f$ \mathbf{A X} = \mathbf{B} \f$ or + * - \f$ \mathbf{A x} = \mathbf{b} \f$, + * + * with a general \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ and either + * + * - \f$ n \times m \f$ matrices \f$ \mathbf{X} \f$ and \f$ \mathbf{B} \f$ or + * - vectors \f$ \mathbf{x} \f$ and \f$ \mathbf{b} \f$ of size \f$ n \f$. + * + * It calls nda::linalg::solve_in_place with a copy of the input matrix \f$ \mathbf{A} \f$ and input matrix/vector \f$ + * \mathbf{B} \f$/\f$ \mathbf{b} \f$. + * + * @note The solution matrix \f$ \mathbf{X} \f$ is always in nda::F_layout. + * + * @warning This function makes copies of the input arrays/views. When working on the device memory space, this may + * lead to runtime errors if the copying fails. + * + * @tparam A nda::Matrix type. + * @tparam B nda::Array type of rank 1 or 2. + * @param a Input matrix. The \f$ n \times n \f$ matrix \f$ \mathbf{A} \f$ determining the linear system. + * @param b Input matrix. The right hand side matrix \f$ \mathbf{B} \f$ (vector \f$ \mathbf{b} \f$). + * @return Solution matrix \f$ \mathbf{X} \f$ (vector \f$ \mathbf{x} \f$). + */ + template + requires(have_same_value_type_v and nda::mem::have_compatible_addr_space and is_blas_lapack_v>) + auto solve(A const &a, B const &b) { // NOLINT (temporary views are allowed here) + // copy A and preserve its layout + using a_layout_policy = nda::detail::layout_to_policy::layout_t>::type; + auto a_copy = matrix, a_layout_policy, heap>>(a); + + // copy B and enforce Fortran layout if it is a matrix + using vector_t = vector, heap>>; + using matrix_t = matrix, F_layout, heap>>; + using b_type = std::conditional_t == 1, vector_t, matrix_t>; + auto b_copy = b_type(b); + + // call solve_in_place with the copies + solve_in_place(a_copy, b_copy); + return b_copy; + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/linalg/svd.hpp b/c++/nda/linalg/svd.hpp new file mode 100644 index 000000000..a9f69c358 --- /dev/null +++ b/c++/nda/linalg/svd.hpp @@ -0,0 +1,120 @@ +// Copyright (c) 2019-2024 Simons Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
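A minimal usage sketch of the new nda::linalg::solve / solve_in_place routines defined above; the umbrella header <nda/nda.hpp> is assumed and the system is small enough to verify by hand.

#include <nda/nda.hpp>

int main() {
  // solve A x = b
  nda::matrix<double> A = {{3.0, 1.0}, {1.0, 2.0}};
  nda::vector<double> b = {9.0, 8.0};

  // copying version: A and b are left untouched
  auto x = nda::linalg::solve(A, b); // {2, 3}

  // in-place version: A is overwritten by its LU factors, b by the solution
  nda::linalg::solve_in_place(A, b);
}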
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Authors: Thomas Hahn, Olivier Parcollet, Nils Wentzell + +/** + * @file + * @brief Provides functions to compute the singular value decomposition of a matrix. + */ + +#pragma once + +#include "../basic_array.hpp" +#include "../blas/tools.hpp" +#include "../concepts.hpp" +#include "../declarations.hpp" +#include "../exceptions.hpp" +#include "../lapack/gesvd.hpp" +#include "../layout/policies.hpp" +#include "../macros.hpp" +#include "../mem/address_space.hpp" +#include "../mem/policies.hpp" +#include "../traits.hpp" + +#include +#include +#include +#include + +namespace nda::linalg { + + /** + * @addtogroup linalg_tools + * @{ + */ + + /** + * @brief Compute the singular value decomposition (SVD) of a matrix in place. + * + * @details The function computes the SVD of a given \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$: + * \f[ + * \mathbf{A} = \mathbf{U} \mathbf{S} \mathbf{V}^H \; , + * \f] + * where \f$ \mathbf{U} \f$ is a unitary \f$ m \times m \f$ matrix, \f$ \mathbf{V} \f$ is a unitary \f$ n \times n \f$ + * matrix and \f$ \mathbf{S} \f$ is an \f$ m \times n \f$ matrix with non-negative real numbers on the diagonal. + * + * It first constructs the output vector \f$ \mathbf{s} \f$, which contains the singular values, and the output + * matrices \f$ \mathbf{U} \f$ and \f$ \mathbf{V}^H \f$. It then calls nda::lapack::gesvd to compute the SVD. + * + * An exception is thrown, if the LAPACK call returns a non-zero value. + * + * @note If the input matrix \f$ \mathbf{A} \f$ is in nda::F_layout, the output matrices \f$ \mathbf{U} \f$ and + * \f$ \mathbf{V}^H \f$ are also in nda::F_layout. Otherwise, they are in nda::C_layout. + * + * @tparam A nda::MemoryMatrix type. + * @param a Input/output matrix. On entry, the \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$. On exit, the contents of + * \f$ \mathbf{A} \f$ are destroyed. + * @return `std::tuple` containing \f$ \mathbf{U} \f$, \f$ \mathbf{s} \f$ and \f$ \mathbf{V}^H \f$. + */ + template + requires(is_blas_lapack_v>) + auto svd_in_place(A &&a) { // NOLINT (temporary views are allowed here) + using layout_policy = nda::detail::layout_to_policy::layout_t>::type; + constexpr auto addr_space = nda::mem::get_addr_space; + + // vector s and matrices U and V^H + auto const [m, n] = a.shape(); + auto s = vector, heap>(std::min(m, n)); + auto U = matrix, layout_policy, heap>(m, m); + auto VH = matrix, layout_policy, heap>(n, n); + + // call lapack gesvd + int info = lapack::gesvd(a, s, U, VH); + if (info != 0) NDA_RUNTIME_ERROR << "Error in nda::svd_in_place: gesvd returned a non-zero value: info = " << info; + + return std::make_tuple(U, s, VH); + } + + /** + * @brief Compute the singular value decomposition (SVD) of a matrix. 
+ * + * @details The function computes the SVD of a given \f$ m \times n \f$ matrix \f$ \mathbf{A} \f$: + * \f[ + * \mathbf{A} = \mathbf{U} \mathbf{S} \mathbf{V}^H \; , + * \f] + * where \f$ \mathbf{U} \f$ is a unitary \f$ m \times m \f$ matrix, \f$ \mathbf{V} \f$ is a unitary \f$ n \times n \f$ + * matrix and \f$ \mathbf{S} \f$ is an \f$ m \times n \f$ matrix with non-negative real numbers on the diagonal. + * + * It calls nda::linalg::svd_in_place with a copy of the input matrix \f$ \mathbf{A} \f$. + * + * @note If the input matrix \f$ \mathbf{A} \f$ is in nda::F_layout, the output matrices \f$ \mathbf{U} \f$ and + * \f$ \mathbf{V}^H \f$ are also in nda::F_layout. Otherwise, they are in nda::C_layout. + * + * @warning This function makes copies of the input arrays/views. When working on the device memory space, this may + * lead to runtime errors if the copying fails. + * + * @tparam A nda::MemoryMatrix type. + * @param a Input matrix \f$ \mathbf{A} \f$. + * @return `std::tuple` containing \f$ \mathbf{U} \f$, \f$ \mathbf{s} \f$ and \f$ \mathbf{V}^H \f$. + */ + template + requires(is_blas_lapack_v>) + auto svd(A const &a) { // NOLINT (temporary views are allowed here) + return svd_in_place(basic_array{a}); + } + + /** @} */ + +} // namespace nda::linalg diff --git a/c++/nda/matrix_functions.hpp b/c++/nda/matrix_functions.hpp index 5dea59408..85b8595a5 100644 --- a/c++/nda/matrix_functions.hpp +++ b/c++/nda/matrix_functions.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -92,7 +93,7 @@ namespace nda { * @return A view with the 'V' algebra of the diagonal of the array/view. */ template M> - ArrayOfRank<1> auto diagonal(M &&m) { + ArrayOfRank<1> auto diagonal(M &&m) { // NOLINT long dim = std::min(m.shape()[0], m.shape()[1]); long stride = stdutil::sum(m.indexmap().strides()); using vector_view_t = @@ -148,6 +149,38 @@ namespace nda { return res; } + /** + * @brief Check if a given matrix is square, i.e. if the first dimension has the same extent as the second + * dimension. + * + * @tparam A nda::Matrix type. + * @param a Matrix to be checked. + * @param print_error If true, print an error message if the matrix is not square. + * @return True if the matrix is square, false otherwise. + */ + template + bool is_matrix_square(A const &a, bool print_error = false) { + auto const [m, n] = a.shape(); + if (m != n and print_error) std::cerr << "Error in nda::is_matrix_square: Dimensions are: (" << m << "," << n << ")\n" << std::endl; + return m == n; + } + + /** + * @brief Check if a given matrix is diagonal, i.e. if it is square (see nda::is_matrix_square) and all the + * off-diagonal elements are zero. + * + * @tparam A nda::Matrix type. + * @param a Matrix to be checked. + * @param print_error If true, print an error message if the matrix is not diagonal. + * @return True if the matrix is diagonal, false otherwise. + */ + template + bool is_matrix_diagonal(A const &a, bool print_error = false) { + bool const r = is_matrix_square(a) and a == diag(diagonal(a)); + if (not r and print_error) std::cerr << "Error in nda::is_matrix_diagonal: Non-diagonal matrix: " << a << std::endl; + return r; + } + + /** @} */ } // namespace nda diff --git a/c++/nda/mem/address_space.hpp b/c++/nda/mem/address_space.hpp index 9d8d48370..b55d4b0ca 100644 --- a/c++/nda/mem/address_space.hpp +++ b/c++/nda/mem/address_space.hpp @@ -98,14 +98,14 @@ namespace nda::mem { }(); /** - * @brief Get common address space for a number of given nda::MemoryArray types. 
+ * @brief Get common address space for a number of given nda::Array types. * * @details See nda::mem::combine for how the address spaces are combined. * - * @tparam A1 nda::MemoryArray type. - * @tparam As nda::MemoryArray types. + * @tparam A1 nda::Array type. + * @tparam As nda::Array types. */ - template + template constexpr AddressSpace common_addr_space = combine, get_addr_space...>; /// Specialization of nda::mem::get_addr_space for nda::Memory Array types. diff --git a/c++/nda/mem/fill.hpp b/c++/nda/mem/fill.hpp index 7bdb66bdd..17bc39a62 100644 --- a/c++/nda/mem/fill.hpp +++ b/c++/nda/mem/fill.hpp @@ -16,28 +16,34 @@ #pragma once -#include +#include "./address_space.hpp" +#include "../device.hpp" +#include "../traits.hpp" + #include -#include -#include +#include +#include +#include #include - -#include "address_space.hpp" -#include "../traits.hpp" +#include +#include namespace nda::mem { /** - * @brief Fills a range of memory with a specified value. + * @brief Fill a range of memory with a specified value. * - * The behavior depends on the AddressSpace (Host, Device, or Unified). + * @details The behaviour of the function depends on the address spaces: + * - For `Host`, it simply calls `std::fill_n`. + * - For `Device` and `Unified`, it calls `cudaMemset` or `cudaMemset2D` to transfer each byte of the value to the + * destination memory. * - * @tparam AdrSp The address space (e.g., Host, Device, Unified). - * @tparam T The type of the elements to fill. - * @param first Pointer to the beginning of the range. + * @tparam AdrSp nda::mem::AddressSpace of the destination. + * @tparam T Value type. + * @param first Pointer to the beginning of the destination memory. * @param count Number of elements to fill. - * @param value The value to fill the range with. - * @return Pointer to the end of the filled range. + * @param value Value to fill the memory with. + * @return Pointer one past the last element filled. */ template requires(nda::is_scalar_or_convertible_v) @@ -54,8 +60,7 @@ namespace nda::mem { device_error_check(cudaMemset(first, 0, count * sizeof(T)), "cudaMemset"); } else { for (int n = 0; n < sizeof(T); ++n) { - const int byte_value [[maybe_unused]] = static_cast(value_bytes[n]); - device_error_check(cudaMemset2D((char *)(first) + n, sizeof(T), byte_value, 1, count), "cudaMemset2D"); + device_error_check(cudaMemset2D((char *)(first) + n, sizeof(T), static_cast(value_bytes[n]), 1, count), "cudaMemset2D"); } } return first + count; @@ -63,16 +68,16 @@ namespace nda::mem { } /** - * @brief Fills a range of memory between two pointers with a specified value. + * @brief Fill a range of memory between two pointers with a specified value. * - * Internally calls `fill_n`. + * @details It simply calls nda::mem::fill_n with the number of elements calculated from the pointers. * - * @tparam AdrSp The address space (e.g., Host, Device, Unified). - * @tparam T The type of the elements to fill. - * @param first Pointer to the beginning of the range. - * @param end Pointer to the end of the range. - * @param value The value to fill the range with. - * @return Pointer to the end of the filled range. + * @tparam AdrSp nda::mem::AddressSpace of the destination. + * @tparam T Value type. + * @param first Pointer to the beginning of the destination memory. + * @param end Pointer to the end of the destination memory. + * @param value Value to fill the memory with. + * @return Pointer one past the last element filled. 
*/ template requires(nda::is_scalar_or_convertible_v) @@ -82,17 +87,20 @@ namespace nda::mem { } /** - * @brief Fills a 2D memory region with a specified value. + * @brief Fill a 2D memory region with a specified value. * - * The behavior depends on the AddressSpace (Host, Device, or Unified). + * @details The behaviour of the function depends on the address spaces: + * - For `Host`, the function is not implemented. + * - For `Device` and `Unified`, it calls `cudaMemset2D` or `cudaMemcpy2D` to fill the 2D memory region with the + * specified value. * - * @tparam AdrSp The address space (e.g., Host, Device, Unified). - * @tparam T The type of the elements to fill. - * @param first Pointer to the beginning of the 2D memory region. - * @param pitch The memory pitch between rows. - * @param width The number of elements to fill in each row. - * @param height The number of rows to fill. - * @param value The value to fill the 2D region with. + * @tparam AdrSp nda::mem::AddressSpace of the destination. + * @tparam T Value type. + * @param first Pointer to the beginning of the destination memory. + * @param pitch Pitch of destination memory. + * @param width Number of elements to fill in each row. + * @param height Number of rows to fill. + * @param value Value to fill the memory with. */ template requires(nda::is_scalar_or_convertible_v) diff --git a/c++/nda/mem/malloc.hpp b/c++/nda/mem/malloc.hpp index f761008e1..2c69aed00 100644 --- a/c++/nda/mem/malloc.hpp +++ b/c++/nda/mem/malloc.hpp @@ -42,7 +42,7 @@ namespace nda::mem { void *ptr = nullptr; if constexpr (AdrSp == Host) { ptr = std::malloc(size); // NOLINT (we want to return a void*) - } else if constexpr (AdrSp == Device) { + } else if constexpr (AdrSp == Device) { // NOLINT (branch is not repeated) device_error_check(cudaMalloc((void **)&ptr, size), "cudaMalloc"); } else { device_error_check(cudaMallocManaged((void **)&ptr, size), "cudaMallocManaged"); diff --git a/c++/nda/mem/memcpy.hpp b/c++/nda/mem/memcpy.hpp index f730277c6..b08eb1959 100644 --- a/c++/nda/mem/memcpy.hpp +++ b/c++/nda/mem/memcpy.hpp @@ -63,7 +63,7 @@ namespace nda::mem { * @tparam DestAdrSp nda::mem::AddressSpace of the destination. * @tparam SrcAdrSp nda::mem::AddressSpace of the source. * @param dest Pointer to the destination memory. - * @param dpitch Pitch of destination memory + * @param dpitch Pitch of destination memory. * @param src Pointer to the source memory. * @param spitch Pitch of source memory. * @param width Width of matrix transfer (columns in bytes). diff --git a/c++/nda/traits.hpp b/c++/nda/traits.hpp index 4a2072fe6..daa9cc0d2 100644 --- a/c++/nda/traits.hpp +++ b/c++/nda/traits.hpp @@ -87,9 +87,10 @@ namespace nda { template inline constexpr bool is_double_or_complex_v = is_complex_v or std::is_same_v>; - /// Alias for nda::is_double_or_complex_v. + /// Constexpr variable that is true if type `T` is a real (float64/float32) or complex type template - inline constexpr bool is_blas_lapack_v = is_double_or_complex_v; + inline constexpr bool is_blas_lapack_v = + is_complex_v or std::is_same_v> or std::is_same_v>; /** @} */ @@ -181,6 +182,18 @@ namespace nda { template using get_value_t = std::decay_t()))>; + template + decltype(auto) get_fp_type() { + if constexpr (is_complex_v>) { + return std::remove_cvref_t::value_type>{}; + } else { + return std::remove_cvref_t>{}; + } + } + + template + using get_fp_t = std::decay_t())>; + /// Constexpr variable that is true if all types in `As` have the same value type as `A0`. 
template inline constexpr bool have_same_value_type_v = (std::is_same_v, get_value_t> and ... and true); diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index dd7e1b208..32cdce4af 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -126,6 +126,7 @@ + diff --git a/doc/ex1.md b/doc/ex1.md index 68e4f18cd..c5cea4f28 100644 --- a/doc/ex1.md +++ b/doc/ex1.md @@ -17,7 +17,7 @@ int main(int argc, char *argv[]) { } ``` -@subsection ex1_p1 Creating and initializing an array +@section ex1_p1 Creating and initializing an array In its simplest form, an nda::array has 2 template parameters @@ -45,7 +45,7 @@ We could have achieved the same using the constructor which takes `std::initiali auto A2 = nda::array{{0, 1}, {2, 3}, {4, 5}}; ``` -@subsection ex1_p2 Choosing a memory layout +@section ex1_p2 Choosing a memory layout By default, nda::array stores its elements in C-order. To create an array in Fortran-order, we can specify a third template parameter: @@ -61,7 +61,7 @@ Here, nda::F_layout is one of the @ref layout_pols. While in 2-dimensions, the only possibilities are C-order or Fortran-order, in higher dimensions one can also specify other stride orders (see nda::basic_layout and nda::basic_layout_str). -@subsection ex1_p3 Printing an array +@section ex1_p3 Printing an array Let's check the contents, sizes and shapes of the arrays using the overloaded streaming operators: @@ -115,7 +115,7 @@ You can see the difference between the memory layouts of the array: > [4,5]] > ``` -@subsection ex1_p4 Accessing single elements +@section ex1_p4 Accessing single elements We can access single elements of the array using the function call operator of the array object. For a 2-dimensional array, we have to pass exactly two indices, otherwise it won't compile: @@ -153,7 +153,7 @@ B(2, 1) = 100 > A(3, 2) = 9 > ``` -@subsection ex1_p5 Assigning to an array +@section ex1_p5 Assigning to an array It is straightforward to assign a scalar or another array to an existing array: @@ -180,7 +180,7 @@ A = [2,100]] ``` -@subsection ex1_p6 Working with views +@section ex1_p6 Working with views Views offer a lightweight and efficient way to manipulate and operate on existing arrays since they do not own their data, i.e. there is no memory allocation or copying involved when creating a view (see nda::basic_array_view). @@ -228,7 +228,7 @@ A = In most cases, views will just behave like arrays and the majority of functions and operations that can be performed with arrays, also work with views. -@subsection ex1_p7 Working with slices +@section ex1_p7 Working with slices A slice is a view on only some parts of an existing array. @@ -282,7 +282,7 @@ A = [2,100]] ``` -@subsection ex1_p8 Performing arithmetic operations +@section ex1_p8 Performing arithmetic operations We can perform various @ref av_ops with arrays and views. Arithmetic operations are (mostly) implemented as lazy expressions. That means the operations are not performed right @@ -370,7 +370,7 @@ M1 * M2 = Here, an nda::matrix is the same as an nda::array of rank 2, except that it belongs to the ``'M'`` algebra instead of the ``'A'`` algebra. 
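To make the difference between the two algebras concrete, here is a small sketch (not part of the original example program; it only uses the types and functions introduced above):

```
nda::array<double, 2> A1{{1, 2}, {3, 4}}, A2{{5, 6}, {7, 8}};
nda::matrix<double> M1{{1, 2}, {3, 4}}, M2{{5, 6}, {7, 8}};

// 'A' algebra: element-wise product, returned as a lazy expression
auto A3 = nda::make_regular(A1 * A2); // [[5, 12], [21, 32]]

// 'M' algebra: matrix-matrix product, evaluated immediately by calling the BLAS routine
auto M3 = M1 * M2; // [[19, 22], [43, 50]]
```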
-@subsection ex1_p9 Applying mathematical functions and algorithms +@section ex1_p9 Applying mathematical functions and algorithms Similar to arithmetic operations, most of the @ref av_math that can operate on arrays and views return a lazy function call expression (nda::expr_call) and are sensitive to their algebras: @@ -433,7 +433,7 @@ They expect the given array elements to have a `bool` value type which is not th We therefore use nda::map to create a lazy nda::expr_call that returns a `bool` upon calling it. Since nda::expr_call fulfills the nda::Array concept, it can be passed to the algorithms. -@subsection ex1_p10 Writing/Reading HDF5 +@section ex1_p10 Writing/Reading HDF5 Writing arrays and views to HDF5 files using **nda's** @ref av_hdf5 is as easy as @@ -478,7 +478,7 @@ C_copy = [3,4]] ``` -@subsection ex1_p11 Doing linear algebra with arrays +@section ex1_p11 Doing linear algebra with arrays **nda** has a @ref linalg_blas and an @ref linalg_lapack and provides the nda::matrix and nda::vector types. While nda::matrix is a 2-dimensional array belonging to the ``'M'`` algebra, nda::vector is a 1-dimensional array @@ -534,7 +534,7 @@ M3 * x = [5,8] > **Note**: Matrix-matrix and matrix-vector multiplication do not return lazy expressions, since they call the > corresponding BLAS routines directly, while element-wise array-array multiplication does return a lazy expression. -@subsection ex1_p12 Initializing with CLEF's automatic assignment +@section ex1_p12 Initializing with CLEF's automatic assignment **nda** contains the @ref clef (CLEF) library which is a more or less standalone implementation of general lazy expressions. @@ -567,7 +567,7 @@ assign the result to the corresponding array element, e.g. `F(3, 4) = 3 * 7 + 4 This is especially helpful for high-dimensional arrays where the element at `(i, j, ..., k)` can be written as some function \f$ g \f$ of its indices, i.e. \f$ F_{ij \dots k} = g(i, j, \dots, k) \f$. -@subsection ex1_p13 Further examples +@section ex1_p13 Further examples The above features constitute only a fraction of what you can do with **nda**. diff --git a/doc/ex2.md b/doc/ex2.md index 9bd7a0ed8..ed3320019 100644 --- a/doc/ex2.md +++ b/doc/ex2.md @@ -16,7 +16,7 @@ int main(int argc, char *argv[]) { } ``` -@subsection ex2_p1 Default constructor +@section ex2_p1 Default constructor The default constructor creates an empty array of size 0: @@ -65,7 +65,7 @@ A.size() = 100 A.shape() = (10 10) ``` -@subsection ex2_p2 Constructing an array with a given shape +@section ex2_p2 Constructing an array with a given shape The usual way to create an array is by specifying its shape. While the shape is a runtime parameter, the rank of the array still has to be known at compile-time (it is a template @@ -105,7 +105,7 @@ v1.size() = 5 v1.shape() = (5) ``` -@subsection ex2_p3 Copy/Move constructors +@section ex2_p3 Copy/Move constructors The copy and move constructors behave as expected: @@ -142,7 +142,7 @@ v2.empty() = 1 > **Note**: After moving, the vector `v2` is empty, i.e. its memory handle does not manage any memory at the moment. To > use it, one should again resize it, so that new memory is allocated. 
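As a small sketch (not part of the original example), reusing a moved-from vector could look like this:

```
nda::vector<int> v2{1, 2, 3, 4, 5};
auto v3 = std::move(v2); // v2 is now empty: its memory handle manages no memory

v2.resize(5); // allocate new memory before reusing v2
v2 = 0;       // now it is safe to assign to it again
```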
-@subsection ex2_p4 Constructing an array from its data +@section ex2_p4 Constructing an array from its data 1-, 2- and 3-dimensional arrays can be constructed directly from their data using `std::initializer_list` objects: - a 1-dimensional array is constructed from a single list, @@ -186,7 +186,7 @@ A3.size() = 12 A3.shape() = (2 3 2) ``` -@subsection ex2_p5 Constructing an array from an nda::Array +@section ex2_p5 Constructing an array from an nda::Array We can also construct an array from any object that satisfies the nda::Array concept, has a compatible value type and the same rank. @@ -225,7 +225,7 @@ A2_f.size() = 6 A2_f.shape() = (3 2) ``` -@subsection ex2_p6 Factories and transformations +@section ex2_p6 Factories and transformations **nda** provides various factory functions and transformations that allow us to construct special arrays very easily either from scratch or from some other input arrays. diff --git a/doc/ex3.md b/doc/ex3.md index 06b4c4f8a..8efc30b37 100644 --- a/doc/ex3.md +++ b/doc/ex3.md @@ -16,7 +16,7 @@ int main(int argc, char *argv[]) { } ``` -@subsection ex3_p1 Assigning a scalar to an array +@section ex3_p1 Assigning a scalar to an array The simplest way to initialize an array is to assign a scalar to it: @@ -72,7 +72,7 @@ Output: A_arr = [[1,2,3],[1,2,3],[1,2,3],[1,2,3]] ``` -@subsection ex3_p2 Copy/Move assignment +@section ex3_p2 Copy/Move assignment The copy and move assignment operations behave as expected: @@ -108,7 +108,7 @@ M_copy.empty() = 1 > **Note**: Be careful when reusing an object after it has been moved (see the note at @ref ex2_p3)! -@subsection ex3_p3 Assigning an nda::Array to an array +@section ex3_p3 Assigning an nda::Array to an array To assign an object that satisfies the nda::Array concept to an array is similar to @ref ex2_p5. @@ -145,7 +145,7 @@ correct shape. This is not true for the second assignment. `A2` has been default constructed and therefore has a size of 0. -@subsection ex3_p4 Assigning a contiguous range +@section ex3_p4 Assigning a contiguous range It is possible to assign an object that satisfies the `std::ranges::contiguous_range` concept to an 1-dimensional array: @@ -165,7 +165,7 @@ A_vec = [1,2,3,4,5] As expected, the elements of the range are simply copied into the array. -@subsection ex3_p5 Initializing an array manually +@section ex3_p5 Initializing an array manually We can also initialize an array by assigning to each element manually. This can be done in different ways. @@ -230,7 +230,7 @@ For example, While the traditional for-loops are perhaps the most flexible option, it becomes tedious quite fast with increasing dimensionality. -@subsection ex3_p6 Initializing an array using automatic assignment +@section ex3_p6 Initializing an array using automatic assignment This has already been explained in @ref ex1_p12. diff --git a/doc/ex4.md b/doc/ex4.md index f75c12a79..15c23aea0 100644 --- a/doc/ex4.md +++ b/doc/ex4.md @@ -15,7 +15,7 @@ int main(int argc, char *argv[]) { } ``` -@subsection ex4_p1 Creating a full view on an array/view +@section ex4_p1 Creating a full view on an array/view We have already seen in @ref ex1_p6 how we can get a full view on an existing array by doing an empty function call: @@ -63,7 +63,7 @@ A_vv = [20,21,22,23,24]] ``` -@subsection ex4_p2 Value type of views +@section ex4_p2 Value type of views While the value type of an array is always non-const, views can have const or non-const value types. 
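A minimal sketch of the difference (not taken from the original example code):

```
nda::array<int, 2> A(3, 3);
A = 1;

auto A_v = A(); // view of a non-const array: non-const value type, elements can be modified
A_v(0, 0) = 42;

auto const &A_cr = A;
auto A_vc = A_cr(); // view of a const array: const value type, read-only
// A_vc(0, 0) = -2; // does not compile
```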
@@ -106,7 +106,7 @@ As expected, we cannot assign to const arrays/views or views with a const value A_vc(0, 0) = -2; ``` -@subsection ex4_p3 Creating a slice of an array/view +@section ex4_p3 Creating a slice of an array/view @ref ex1_p7 has already explained what a slice is and how we can create one. In the following, we will give some more examples to show how slices can be used in practice. @@ -183,7 +183,7 @@ S_3 = [20,22,24]] ``` -@subsection ex4_p4 Assigning to views +@section ex4_p4 Assigning to views Before assigning to the view `S_3`, let's make a copy of its contents so that we can restore everything later on: @@ -261,7 +261,7 @@ A = > **Note**: In contrast to arrays, views cannot be resized. When assigning some general nda::Array object to a view, > their shapes have to match, otherwise this may result in undefined behavior. -@subsection ex4_p5 Copy/Move operations +@section ex4_p5 Copy/Move operations The copy and move operations of views are a little bit different than their array counterparts: @@ -307,7 +307,7 @@ C = [20,22,24]] ``` -@subsection ex4_p6 Operating on views/slices +@section ex4_p6 Operating on views/slices We can perform various arithmetic operations, mathematical functions and algorithms with views and slices just like we did with arrays in @ref ex1_p8 and @ref ex1_p9. @@ -336,7 +336,7 @@ sum(S_3) = 108 product(S_3) = 0 ``` -@subsection ex4_p7 Rebinding a view to another array/view +@section ex4_p7 Rebinding a view to another array/view If we want to bind an existing view to a new array/view/memory location, we cannot simply use the copy assignment (since it makes a deep copy of the view's contents). Instead we have to call nda::basic_array_view::rebind: @@ -355,7 +355,7 @@ S_3.data() == C_v.data() = 0 S_3.data() == C_v.data() = 1 ``` -@subsection ex4_p8 Viewing generic 1-dimensional ranges +@section ex4_p8 Viewing generic 1-dimensional ranges The views in **nda** can also view generic 1-dimensional ranges like `std::vector` or `std::array`. The only requirement is that they are contiguous: @@ -378,7 +378,7 @@ arr_v = [1,2,3,4,5] arr = (2 4 6 8 10) ``` -@subsection ex4_p9 Factories and transformations +@section ex4_p9 Factories and transformations @ref av_factories contain various functions to create new and transform existing views. diff --git a/doc/ex5.md b/doc/ex5.md index 956c8fdb2..7fa7f3ac4 100644 --- a/doc/ex5.md +++ b/doc/ex5.md @@ -47,7 +47,7 @@ A = > writing/reading to/from HDF5, the interface always checks if the arrays/views are in C-order. If this is not the case, > it will use a temporary C-order array to perform the writing/reading. -@subsection ex5_p1 Writing an array/view +@section ex5_p1 Writing an array/view Writing an array to an HDF5 file is as simple as @@ -115,7 +115,7 @@ In this case, a 3-by-3 view. > h5::write(file, "A", A, /* compression off */ false); > ``` -@subsection ex5_p2 Reading into an array/view +@section ex5_p2 Reading into an array/view Reading a full dataset into an array is straightforward: @@ -166,7 +166,7 @@ B = Here, we read the 3-by-3 dataset `A_v` into a view `B_v` consisting of every other column and the rows 1, 2 and 3 of the underlying 5-by-5 array `B`. -@subsection ex5_p3 Writing to a slice of an existing dataset +@section ex5_p3 Writing to a slice of an existing dataset So far we have only written to an automatically created dataset with exactly the same size and shape as the array/view that is being written. 
@@ -259,7 +259,7 @@ DATASET "/B" { } ``` -@subsection ex5_p4 Reading a slice from an existing dataset +@section ex5_p4 Reading a slice from an existing dataset Instead of reading the full dataset as we have done before, it is possible to specify a slice of the dataset that should be read. @@ -297,7 +297,7 @@ C = [15,16,17,18,19]] ``` -@subsection ex5_p5 Writing/Reading 1-dimensional arrays/views of strings +@section ex5_p5 Writing/Reading 1-dimensional arrays/views of strings For the user, writing and reading an 1-dimensional array/view of strings works exactly the same way as with an array/view of arithmetic scalars: @@ -338,7 +338,7 @@ DATASET "/S" { } ``` -@subsection ex5_p6 Writing/Reading arrays/views of generic types +@section ex5_p6 Writing/Reading arrays/views of generic types **nda** allows us to write/read arbitrary arrays/views as long as the objects contained in the array have specialized `h5_write` and `h5_read` functions (see [h5 docs](https://triqs.github.io/h5/unstable/group__rw__generic.html)). diff --git a/doc/ex6.md b/doc/ex6.md index d656f9181..e59a2965b 100644 --- a/doc/ex6.md +++ b/doc/ex6.md @@ -34,7 +34,7 @@ The examples below are run on 4 processes. > **Note**: Only regular arrays and views are allowed in the **nda** MPI routines, no lazy expressions. You can use > nda::make_regular to turn your lazy expressions into regular arrays. -@subsection ex6_p1 Broadcasting an array/view +@section ex6_p1 Broadcasting an array/view Let us first default construct an array on all MPI ranks and then resize and initialize it on rank 0, the root rank: @@ -166,7 +166,7 @@ contiguous arrays/views, so that **nda** can make the MPI calls as efficiently a > **Note**: All MPI routines have certain requirements for the arrays/views involved in the operation. Please check out > the documentation of the individual function, e.g. in this case nda::mpi_broadcast, if you have doubts. -@subsection ex6_p2 Gathering an array/view +@section ex6_p2 Gathering an array/view Suppose we have a 1-dimensional array with rank specific elements and sizes: @@ -313,7 +313,7 @@ Rank 3: [] ``` -@subsection ex6_p3 Scattering an array/view +@section ex6_p3 Scattering an array/view Scattering of an array/view is basically the inverse operation of gathering. It takes an array/view and splits it along the first dimensions as evenly as possible among the processes. @@ -373,7 +373,7 @@ Here, a 2-by-2 array is scattered from rank 2 to all other processes. It is split along the first dimension and the resulting 1-by-2 subarrays are sent to the ranks 0 and 1 while the ranks 2 and 3 do not receive any data. -@subsection ex6_p4 Reducing an array/view +@section ex6_p4 Reducing an array/view Let us reduce the same 2-by-2 arrays from above. Be default, `mpi::reduce` performs an element-wise summation among the ranks in the communicator and makes the result @@ -441,7 +441,7 @@ Rank 1: In contrast to the standard `mpi::all_reduce` function, the in-place operation does not create and return a new array. Instead the result is directly written into the input array. -@subsection ex6_p5 Using existing arrays/views +@section ex6_p5 Using existing arrays/views Note that the functions nda::mpi_reduce, nda::mpi_gather and nda::mpi_scatter all return a newly constructed array which contains the result of the respective MPI operation. 
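For example, the difference between the returning and the in-place reduction can be sketched as follows (this snippet is not part of the original example; it assumes the communicator object `comm` used in the earlier snippets and the `mpi::all_reduce_in_place` helper of the **mpi** library):

```
nda::array<int, 2> A(2, 2);
A = comm.rank();

// standard reduction: returns a newly constructed array on every rank
auto B = mpi::all_reduce(A, comm);

// in-place reduction: the result is written directly into A, no new array is created
mpi::all_reduce_in_place(A, comm);
```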
diff --git a/doc/ex7.md b/doc/ex7.md index 50537da9c..9d636cc7f 100644 --- a/doc/ex7.md +++ b/doc/ex7.md @@ -35,7 +35,7 @@ Here, we have defined some basic quantities/types: Let's take a look at a simple example. -@subsection ex7_p1 Defining the symmetry +@section ex7_p1 Defining the symmetry We start by defining a hermitian symmetry for our matrix. A symmetry is simply an object of type `sym_func_t` that specifies how elements in an array are related to one another. @@ -53,7 +53,7 @@ The two lines have the following meaning: 1. Diagonal elements are not related to any other elements (except to themselves by the identity operation). 2. Off-diagonal elements with an index `(i,j)` are related to the element `(j,i)` by a complex conjugation. -@subsection ex7_p2 Constructing the symmetry group +@section ex7_p2 Constructing the symmetry group Once all of the desired symmetries have been defined, we can construct an nda::sym_grp object: @@ -137,7 +137,7 @@ Output: Now we can see why the elements 0, 4 and 8 are in their own symmetry class and elements 1 and 3, 2 and 6 and elements 5 and 7 are related to each other via a complex conjugation. -@subsection ex7_p3 Initializing an array +@section ex7_p3 Initializing an array One of the features of symmetry groups is that they can be used to initialize or assign to an existing array with an initializer function satisfying the nda::NdaInitFunc concept: @@ -196,7 +196,7 @@ In our simple example, this does not really make a difference but for large arra is much smaller than the size of the array and where evaluating `init_func` is expensive, this can give a considerable speedup. -@subsection ex7_p4 Representative data +@section ex7_p4 Representative data We have already learned above about representative elements of symmetry classes. The symmetry group object gives us access to those elements: @@ -238,7 +238,7 @@ B = Here, we first multiplied the original representative data by 2 and then initialized a new array with it. -@subsection ex7_p5 Symmetrizing an array +@section ex7_p5 Symmetrizing an array The nda::sym_grp class provides a method that let's us symmetrize an existing array and simultaneously obtain the maximum symmetry violation. diff --git a/doc/ex8.md b/doc/ex8.md index 42b825d2b..a61409996 100644 --- a/doc/ex8.md +++ b/doc/ex8.md @@ -19,7 +19,7 @@ int main(int argc, char *argv[]) { Before showing some linear algebra related operations, let us first introduce the nda::matrix and nda::vector types and highlight their similarities and differences with respect to nda::array. -@subsection ex8_p1 Matrix and vector types +@section ex8_p1 Matrix and vector types As already mentioned in the quick introduction @ref ex1_p11, **nda** provides an nda::matrix and an nda::vector type. Both are specializations of the general nda::basic_array with the following features: @@ -30,7 +30,7 @@ Their algebras determine how matrices and vectors behave in certain operations a Otherwise, everything that is true for nda::basic_array objects is also true for nda::matrix and nda::vector objects. -@subsection ex8_p2 Constructing matrices and vectors +@section ex8_p2 Constructing matrices and vectors To construct a matrix or a vector, we can use the same methods as for nda::array objects. We refer the user to @ref ex2 for more information. @@ -72,7 +72,7 @@ D = nda::eye constructs an identity matrix of a certain size and nda::diag takes a 1-dimensional array and constructs a square diagonal matrix containing the values of the given array. 
-@subsection ex8_p3 Initializing and assigning to matrices and vectors +@section ex8_p3 Initializing and assigning to matrices and vectors Again, initializing and assigning to matrices and vectors works (almost) exactly in the same way as it does for arrays (see @ref ex3) @@ -100,7 +100,7 @@ M = v = [2.7182,2.7182,2.7182] ``` -@subsection ex8_p4 Views on matrices and vectors +@section ex8_p4 Views on matrices and vectors There are some @ref av_factories for views that are specific to matrices and vectors, otherwise everything mentioned in @ref ex4 still applies: @@ -131,7 +131,7 @@ Algebra of A: A Algebra of A_mv: M ``` -@subsection ex8_p5 HDF5, MPI and symmetry support for matrices and vectors +@section ex8_p5 HDF5, MPI and symmetry support for matrices and vectors We refer the user to the examples - @ref ex5, @@ -140,7 +140,7 @@ We refer the user to the examples There are no mentionable differences between arrays, matrices and vectors regarding those features. -@subsection ex8_p6 Arithmetic operations with matrices and vectors +@section ex8_p6 Arithmetic operations with matrices and vectors Here the algebra of the involved types becomes important. In section @ref ex1_p8, we have shortly introduced how arithmetic operations are implemented in **nda** in terms of lazy @@ -183,7 +183,7 @@ Then the following operations are allowed (all operations are lazy unless menti - `s1 / M1`: multiplies (lazy) the scalar with the inverse (non-lazy) of `M1`, only square matrices are allowed (since `M1` is inverted), result is also square with the same size -@subsection ex8_p7 Using linear algebra tools +@section ex8_p7 Using linear algebra tools In addition to the basic matrix-matrix and matrix-vector multiplications described above, **nda** provides some useful @ref linalg_tools. @@ -324,7 +324,7 @@ M1_reconstructed = [-12,13,1]] ``` -@subsection ex8_p8 Using the BLAS/LAPACK interface +@section ex8_p8 Using the BLAS/LAPACK interface While the functions in @ref linalg_tools offer a very user-friendly experience, @ref linalg_blas and @ref linalg_lapack are more low-level and usually require more input from the users. diff --git a/doc/groups.dox b/doc/groups.dox index 7fa081aae..6b1c7308f 100644 --- a/doc/groups.dox +++ b/doc/groups.dox @@ -401,6 +401,12 @@ * @brief Interface to parts of the BLAS library. */ +/** + * @defgroup linalg_blas_utils BLAS utilities + * @ingroup linalg + * @brief Utilities for the BLAS/LAPACK interface. 
+ */ + /** * @defgroup linalg_lapack LAPACK interface * @ingroup linalg diff --git a/test/c++/nda_applications.cpp b/test/c++/nda_applications.cpp index 6889d6105..e165930eb 100644 --- a/test/c++/nda_applications.cpp +++ b/test/c++/nda_applications.cpp @@ -19,14 +19,14 @@ TEST(NDA, InverseOfTensorProductOfMatrices) { nda::matrix B(2, 2), C(3, 3), Binv, Cinv; C(i_, j_) << 1.7 / (3.4 * i_ - 2.3 * j_ + 1); B(i_, j_) << 2 * i_ + j_; - Binv = inverse(B); - Cinv = inverse(C); + Binv = nda::linalg::inv(B); + Cinv = nda::linalg::inv(C); { nda::array A(2, 3, 2, 3); A(i_, j_, k_, l_) << B(i_, k_) * C(j_, l_); auto M = make_matrix_view(group_indices_view(A, nda::idx_group<0, 1>, nda::idx_group<2, 3>)); - M = inverse(M); + M = nda::linalg::inv(M); nda::array R(A.shape()); R(i_, j_, k_, l_) << Binv(i_, k_) * Cinv(j_, l_); EXPECT_ARRAY_NEAR(R, A, 5.e-15); @@ -36,7 +36,7 @@ TEST(NDA, InverseOfTensorProductOfMatrices) { nda::array A(2, 3, 2, 3); A(i_, j_, k_, l_) << B(i_, k_) * C(j_, l_); auto M = make_matrix_view(group_indices_view(A, nda::idx_group<2, 3>, nda::idx_group<0, 1>)); - M = inverse(M); + M = nda::linalg::inv(M); nda::array R(A.shape()); R(i_, j_, k_, l_) << Binv(i_, k_) * Cinv(j_, l_); EXPECT_ARRAY_NEAR(R, A, 5.e-15); @@ -46,7 +46,7 @@ TEST(NDA, InverseOfTensorProductOfMatrices) { nda::array> A(2, 3, 2, 3); A(i_, j_, k_, l_) << B(i_, k_) * C(j_, l_); auto M = make_matrix_view(group_indices_view(A, nda::idx_group<0, 1>, nda::idx_group<2, 3>)); - M = inverse(M); + M = nda::linalg::inv(M); nda::array R(A.shape()); R(i_, j_, k_, l_) << Binv(i_, k_) * Cinv(j_, l_); EXPECT_ARRAY_NEAR(R, A, 5.e-15); @@ -56,7 +56,7 @@ TEST(NDA, InverseOfTensorProductOfMatrices) { nda::array> A(2, 3, 2, 3); A(i_, j_, k_, l_) << B(i_, k_) * C(j_, l_); auto M = make_matrix_view(group_indices_view(A, nda::idx_group<2, 3>, nda::idx_group<0, 1>)); - M = inverse(M); + M = nda::linalg::inv(M); nda::array R(A.shape()); R(i_, j_, k_, l_) << Binv(i_, k_) * Cinv(j_, l_); EXPECT_ARRAY_NEAR(R, A, 5.e-15); @@ -66,7 +66,7 @@ TEST(NDA, InverseOfTensorProductOfMatrices) { nda::array> A(2, 2, 3, 3); A(i_, k_, j_, l_) << B(i_, k_) * C(j_, l_); auto M = make_matrix_view(group_indices_view(A, nda::idx_group<0, 2>, nda::idx_group<1, 3>)); - M = inverse(M); + M = nda::linalg::inv(M); nda::array R(A.shape()); R(i_, k_, j_, l_) << Binv(i_, k_) * Cinv(j_, l_); EXPECT_ARRAY_NEAR(R, A, 5.e-15); @@ -88,7 +88,7 @@ TEST(NDA, MatrixVectorMultiplicationWithPermutedViews) { auto Cmat = nda::matrix_view{C(k, nda::ellipsis{})}; EXPECT_EQ_ARRAY(Amat, Bmat); EXPECT_EQ_ARRAY(Amat * v, Bmat * v); - EXPECT_DEBUG_DEATH(Cmat * v, "gemv"); + EXPECT_EQ_ARRAY(Amat * v, Cmat * v); } } diff --git a/test/c++/nda_arithmetic_ops.cpp b/test/c++/nda_arithmetic_ops.cpp index 0afc08b04..8108efa80 100644 --- a/test/c++/nda_arithmetic_ops.cpp +++ b/test/c++/nda_arithmetic_ops.cpp @@ -178,7 +178,7 @@ TEST_F(NDAArithmeticOps, ScalarDivision) { // scalar - matrix division auto N_d = x / M_d_sq; - auto inv_M_d_sq = nda::inverse(M_d_sq); + auto inv_M_d_sq = nda::linalg::inv(M_d_sq); nda::for_each(mat_sq_shape, [&](auto... 
idxs) { EXPECT_DOUBLE_EQ(N_d(idxs...), x * inv_M_d_sq(idxs...)); }); // matrix - scalar division @@ -425,7 +425,7 @@ TEST_F(NDAArithmeticOps, MatrixExpressions) { EXPECT_ARRAY_NEAR(nda::matrix(A_d * (B_d + C_d)), (nda::matrix{{22, 56}, {262, 666}})); // matrix division - EXPECT_ARRAY_NEAR(nda::matrix(2 * nda::inverse(A_d)), nda::matrix(2 / A_d)); + EXPECT_ARRAY_NEAR(nda::matrix(2 * nda::linalg::inv(A_d)), nda::matrix(2 / A_d)); // scalar division EXPECT_ARRAY_NEAR(nda::matrix(A_d / 2), (nda::matrix{{0.0, 0.5}, {5.0, 5.5}})); diff --git a/test/c++/nda_blas.cpp b/test/c++/nda_blas.cpp index 4ed0671aa..afc854f1d 100644 --- a/test/c++/nda_blas.cpp +++ b/test/c++/nda_blas.cpp @@ -10,195 +10,392 @@ #include #include +#include -// Test the BLAS gemm function and its generic implementation. -template +using namespace std::complex_literals; + +// Test the BLAS gemm function. +template void test_gemm() { - nda::matrix M1{{0, 1}, {1, 2}}, M2{{1, 1}, {1, 1}}, M3{{1, 0}, {0, 1}}, M3_gen; - M3_gen = M3; + constexpr auto a_is_f_layout = std::same_as; + constexpr auto b_is_f_layout = std::same_as; + constexpr auto c_is_f_layout = std::same_as; + auto A = nda::matrix{{1, 2, 3}, {4, 5, 6}}; + auto B = nda::matrix{{1, 2}, {3, 4}, {5, 6}}; + auto exp_C = nda::matrix{{22, 28}, {49, 64}}; + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + B *= 2 - 1i; + exp_C *= (1 - 1i) * (2 - 1i); + } - nda::blas::gemm(1.0, M1, M2, 1.0, M3); - EXPECT_ARRAY_NEAR(M1, nda::matrix{{0, 1}, {1, 2}}); - EXPECT_ARRAY_NEAR(M2, nda::matrix{{1, 1}, {1, 1}}); - EXPECT_ARRAY_NEAR(M3, nda::matrix{{2, 1}, {3, 4}}); + // C = A * B + auto C = nda::matrix(2, 2); + nda::blas::gemm(1.0, A, B, 0.0, C); + EXPECT_ARRAY_NEAR(C, exp_C); + + // C = 3 * A * B + 2 * C + nda::blas::gemm(3, A, B, 2, C); + EXPECT_ARRAY_NEAR(C, 5 * exp_C); + + // C_t = B^T * A^T + auto C_t = nda::matrix(2, 2); + nda::blas::gemm(1.0, nda::transpose(B), nda::transpose(A), 0.0, C_t); + EXPECT_ARRAY_NEAR(C_t, nda::transpose(exp_C)); + + // C_h = B^H * A^H + if constexpr ((a_is_f_layout and b_is_f_layout and c_is_f_layout) or (!a_is_f_layout and !b_is_f_layout and !c_is_f_layout)) { + auto C_h = nda::matrix(2, 2); + nda::blas::gemm(1.0, nda::dagger(B), nda::dagger(A), 0.0, C_h); + EXPECT_ARRAY_NEAR(C_h, nda::dagger(exp_C)); + } - nda::blas::gemm_generic(1.0, M1, M2, 1.0, M3_gen); - EXPECT_ARRAY_NEAR(M1, nda::matrix{{0, 1}, {1, 2}}); - EXPECT_ARRAY_NEAR(M2, nda::matrix{{1, 1}, {1, 1}}); - EXPECT_ARRAY_NEAR(M3_gen, nda::matrix{{2, 1}, {3, 4}}); + // contiguous matrix views + if constexpr (a_is_f_layout and !b_is_f_layout and !c_is_f_layout) { + auto exp_C_v = nda::matrix{{13, 16}, {37, 46}}; + if constexpr (nda::is_complex_v) exp_C_v *= (1 - 1i) * (2 - 1i); + auto C_v = nda::matrix(5, 2); + nda::blas::gemm(1.0, A(nda::range::all, nda::range(0, 2)), B(nda::range(1, 3), nda::range::all), 0.0, C_v(nda::range(2, 4), nda::range::all)); + EXPECT_ARRAY_NEAR(C_v(nda::range(2, 4), nda::range::all), exp_C_v); + } } +template +constexpr auto test_gemm_layouts = []() { + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); +}; + TEST(NDA, BLASGemm) { - test_gemm(); - test_gemm(); - test_gemm, nda::C_layout>(); - test_gemm, nda::F_layout>(); + test_gemm_layouts(); + test_gemm_layouts>(); + test_gemm_layouts(); + test_gemm_layouts>(); } -// Test the BLAS gemm_batch function. -template +// Test the BLAS gemm_batch, gemm_vbatch and gemm_batch_strided functions. 
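+// Each batch entry is an independent GEMM, i.e. C[i] = alpha * A[i] * B[i] + beta * C[i]. Below, gemm_batch is
+// exercised with a common matrix size for all entries, gemm_vbatch with sizes that vary per entry, and
+// gemm_batch_strided with all entries stored contiguously in a single rank-3 array.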
+template void test_gemm_batch() { - int batch_count = 10; - long size = 64; + int const batch_count = 4; + long size = 2; + long fac = 2; + if constexpr (!is_vbatch) { + size = 16; + fac = 1; + } - auto vec_A = std::vector(batch_count, nda::matrix::rand({size, size})); - auto vec_B = std::vector(batch_count, nda::matrix::rand({size, size})); - auto vec_C = std::vector(batch_count, nda::matrix::zeros({size, size})); - nda::blas::gemm_batch(1.0, vec_A, vec_B, 0.0, vec_C); + // create vector of matrices + std::vector> vec_A, vec_B, vec_C, exp_C; + for ([[maybe_unused]] auto i : nda::range(batch_count)) { + vec_A.push_back(nda::matrix::rand({size, size})); + vec_B.push_back(nda::matrix::rand({size, size})); + vec_C.push_back(nda::matrix::zeros({size, size})); + auto tmp = nda::matrix::zeros({size, size}); + nda::blas::gemm(1.0, vec_A.back(), vec_B.back(), 0.0, tmp); + exp_C.push_back(std::move(tmp)); + size *= fac; + } - for (auto i : nda::range(batch_count)) EXPECT_ARRAY_NEAR(make_regular(vec_A[i] * vec_B[i]), vec_C[i]); + // test batched gemm routines + if constexpr (is_vbatch) { + nda::blas::gemm_vbatch(1.0, vec_A, vec_B, 0.0, vec_C); + } else { + nda::blas::gemm_batch(1.0, vec_A, vec_B, 0.0, vec_C); + } + for (auto i : nda::range(batch_count)) EXPECT_ARRAY_NEAR(vec_C[i], exp_C[i]); } TEST(NDA, BLASGemmBatch) { - test_gemm_batch(); - test_gemm_batch(); - test_gemm_batch, nda::C_layout>(); - test_gemm_batch, nda::F_layout>(); + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, false>(); + test_gemm_batch, nda::F_layout, false>(); } -// Test the BLAS gemm_vbatch function. -template -void test_gemm_vbatch() { - int batch_count = 10; - long size = 64; - - auto vec_A = std::vector(batch_count, nda::matrix::rand({size, size})); - auto vec_B = std::vector(batch_count, nda::matrix::rand({size, size})); - auto vec_C = std::vector(batch_count, nda::matrix::zeros({size, size})); - nda::blas::gemm_vbatch(1.0, vec_A, vec_B, 0.0, vec_C); +TEST(NDA, BLASGemmVbatch) { + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, true>(); + test_gemm_batch, nda::F_layout, true>(); +} - for (auto i : nda::range(batch_count)) EXPECT_ARRAY_NEAR(make_regular(vec_A[i] * vec_B[i]), vec_C[i]); +template +void test_gemm_batch_strided() { + int const batch_count = 10; + long const size = 16; + + // create arrays + auto arr_A = nda::array::rand({batch_count, size, size}); + auto arr_B = nda::array::rand({batch_count, size, size}); + auto arr_C = nda::array::zeros({batch_count, size, size}); + + // test strided, batched gemm routine + nda::blas::gemm_batch_strided(1.0, arr_A, arr_B, 0.0, arr_C); + for (auto i : nda::range(batch_count)) { + auto tmp = nda::matrix::zeros({size, size}); + nda::blas::gemm(1.0, arr_A(i, nda::range::all, nda::range::all), arr_B(i, nda::range::all, nda::range::all), 0.0, tmp); + EXPECT_ARRAY_NEAR(arr_C(i, nda::range::all, nda::range::all), tmp); + } } -TEST(NDA, BLASGemmVbatch) { - test_gemm_vbatch(); - test_gemm_vbatch(); - test_gemm_vbatch, nda::C_layout>(); - test_gemm_vbatch, nda::F_layout>(); +TEST(NDA, BLASGemmBatchStrided) { + test_gemm_batch_strided(); + test_gemm_batch_strided, nda::C_layout>(); } -// Test the BLAS gemv function and its generic implementation. -template +// Test the BLAS gemv function. 
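+// gemv computes y = alpha * op(A) * x + beta * y, where op(A) is A, its transpose or its conjugate transpose.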
+template void test_gemv() { - using namespace nda::clef::literals; - - nda::matrix A(5, 5); - A(i_, j_) << i_ + 2 * j_ + 1; - - nda::vector v(5), w(5); - v() = 1; - w() = 0; - - nda::range rg(1, 3); - nda::blas::gemv(1, A(rg, rg), v(rg), 0, w(rg)); - EXPECT_ARRAY_NEAR(w, nda::vector{0, 10, 12, 0, 0}); - - nda::vector w_gen(5); - w_gen() = 0; - nda::blas::gemv_generic(1, A(rg, rg), v(rg), 0, w_gen(rg)); - EXPECT_ARRAY_NEAR(w_gen, nda::vector{0, 10, 12, 0, 0}); - - auto AT = nda::make_regular(transpose(A)); - nda::blas::gemv(1, AT(rg, rg), v(rg), 0, w(rg)); - EXPECT_ARRAY_NEAR(w, nda::vector{0, 9, 13, 0, 0}); - - nda::blas::gemv_generic(1, AT(rg, rg), v(rg), 0, w_gen(rg)); - EXPECT_ARRAY_NEAR(w_gen, nda::vector{0, 9, 13, 0, 0}); + auto x = nda::vector{1, 2, 3}; + auto x_t = nda::vector{1, 2, 3, 4}; + auto exp_y = nda::vector{14, 32, 50, 68}; + auto exp_y_t = nda::vector{70, 80, 90}; + auto A = nda::matrix(4, 3); + nda::for_each(A.shape(), [&A](auto i, auto j) { A(i, j) = i * 3 + j + 1; }); + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + x *= 2 - 1i; + x_t *= 2 - 1i; + exp_y *= (1 - 1i) * (2 - 1i); + exp_y_t *= (1 - 1i) * (2 - 1i); + } - // test operator* - w() = -8; - w(rg) = AT(rg, rg) * v(rg); - EXPECT_ARRAY_NEAR(w, nda::vector{-8, 9, 13, -8, -8}); + // y = A * x + auto y = nda::vector(4); + nda::blas::gemv(1.0, A, x, 0.0, y); + EXPECT_ARRAY_NEAR(y, exp_y); + + // y = 3 * A * x + 2y + nda::blas::gemv(3, A, x, 2, y); + EXPECT_ARRAY_NEAR(y, 5 * exp_y); + + // y_t = A^T * x_t + auto y_t = nda::vector(3); + nda::blas::gemv(1.0, nda::transpose(A), x_t, 0.0, y_t); + EXPECT_ARRAY_NEAR(y_t, exp_y_t); + + if constexpr (std::same_as) { + // y_h = A^H * x_t + auto exp_y_h = exp_y_t; + if constexpr (nda::is_complex_v) exp_y_h = nda::vector{T{210 + 70i}, T{240 + 80i}, T{270 + 90i}}; + auto y_h = nda::vector(3); + nda::blas::gemv(1.0, nda::dagger(A), x_t, 0.0, y_h); + EXPECT_ARRAY_NEAR(y_h, exp_y_h); + } else { + // contiguous matrix view * strided vector view + auto x_v = nda::vector(6); + x_v(nda::range(0, 6, 2)) = x; + auto y_v = nda::vector(4); + nda::blas::gemv(1, A(nda::range(2), nda::range::all), x_v(nda::range(0, 6, 2)), 0, y(nda::range(0, 4, 2))); + EXPECT_ARRAY_NEAR(y(nda::range(0, 4, 2)), exp_y(nda::range(2))); + } } TEST(NDA, BLASGemv) { + test_gemv(); + test_gemv(); + test_gemv, nda::C_layout>(); + test_gemv, nda::F_layout>(); test_gemv(); test_gemv(); test_gemv, nda::C_layout>(); test_gemv, nda::F_layout>(); } -// Test the BLAS ger function. -template -void test_ger() { - nda::matrix M(2, 2); - M = 0; - nda::array v{1, 2}; - - nda::blas::ger(1.0, v, v, M); - EXPECT_ARRAY_NEAR(M, nda::matrix{{1, 2}, {2, 4}}); +// Test the BLAS ger/gerc function. 
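+// ger performs the rank-1 update M += alpha * x * y^T, gerc the conjugated update M += alpha * x * y^H.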
+template +void test_ger(auto ger) { + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; + + // resulting 2 x 2 matrix + auto exp_M1 = nda::matrix{{1, 2}, {2, 4}}; + if constexpr (nda::is_complex_v and not star) exp_M1 *= -1; + auto M1 = nda::matrix::zeros(2, 2); + nda::vector v{1, 2}; + v *= fac; + ger(1.0, v, v, M1); + EXPECT_ARRAY_NEAR(M1, exp_M1); + ger(1.0, v, v, M1); + EXPECT_ARRAY_NEAR(M1, exp_M1 * 2); + + // resulting 2 x 3 matrix + auto exp_M2 = nda::matrix{{3, 4, 5}, {6, 8, 10}}; + if constexpr (nda::is_complex_v) exp_M2 *= fac; + auto M2 = nda::matrix::zeros(2, 3); + nda::vector w{3, 4, 5}; + ger(1.0, v, w, M2); + EXPECT_ARRAY_NEAR(M2, exp_M2); + ger(1.0, v, w, M2); + EXPECT_ARRAY_NEAR(M2, exp_M2 * 2); + + // resulting 3 x 2 matrix + auto exp_M3 = nda::matrix{{3, 6}, {4, 8}, {5, 10}}; + if constexpr (nda::is_complex_v and not star) exp_M3 *= fac; + if constexpr (nda::is_complex_v and star) exp_M3 *= -fac; + auto M3 = nda::matrix::zeros(3, 2); + ger(1.0, w, v, M3); + EXPECT_ARRAY_NEAR(M3, exp_M3); + ger(1.0, w, v, M3); + EXPECT_ARRAY_NEAR(M3, exp_M3 * 2); + + // outer product of strided views + auto exp_M4 = nda::matrix{{6, 8, 10}, {12, 16, 20}}; + if constexpr (nda::is_complex_v and not star) exp_M4 *= -1.0; + auto M4 = nda::matrix::zeros(2, 3); + auto v_strided = nda::vector{0, 1, 0, 2, 0}; + v_strided *= fac; + auto w_strided = nda::vector{3, 0, 0, 4, 0, 0, 5}; + w_strided *= fac; + ger(2.0, v_strided(nda::range(1, 5, 2)), w_strided(nda::range(0, 7, 3)), M4); + EXPECT_ARRAY_NEAR(M4, exp_M4); } TEST(NDA, BLASGer) { - test_ger(); - test_ger(); - test_ger, nda::C_layout>(); - test_ger, nda::C_layout>(); + auto ger = [](auto alpha, auto &&x, auto &&y, auto &&m) { return nda::blas::ger(alpha, x, y, m); }; + test_ger(ger); + test_ger(ger); + test_ger, nda::C_layout, false>(ger); + test_ger, nda::F_layout, false>(ger); + test_ger(ger); + test_ger(ger); + test_ger, nda::C_layout, false>(ger); + test_ger, nda::F_layout, false>(ger); } -TEST(NDA, BLASOuterProduct) { - auto N = nda::rand(2, 3); - auto M = nda::rand(4, 5); - - nda::array P(2, 3, 4, 5); - for (auto [i, j] : N.indices()) - for (auto [k, l] : M.indices()) P(i, j, k, l) = N(i, j) * M(k, l); - - EXPECT_ARRAY_NEAR(P, nda::blas::outer_product(N, M)); +TEST(NDA, BLASGerc) { + auto gerc = [](auto alpha, auto &&x, auto &&y, auto &&m) { return nda::blas::gerc(alpha, x, y, m); }; + test_ger(gerc); + test_ger, nda::F_layout, true>(gerc); + test_ger(gerc); + test_ger, nda::F_layout, true>(gerc); } -// Test the BLAS dot function and its generic implementation. -template -void test_dot() { - nda::vector a{1, 2, 3, 4, 5}; - nda::vector b{10, 20, 30, 40, 50}; - if constexpr (nda::is_complex_v) { +// Test the BLAS dot/dotc function. 
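+// dot computes sum_i x(i) * y(i), while dotc conjugates the first argument, i.e. sum_i conj(x(i)) * y(i).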
+template +void test_dot(auto dot) { + auto exp_dot = [](auto const &a, auto const &b) { + T res = 0.0; + for (size_t i = 0; i < a.size(); ++i) { + if constexpr (star and nda::is_complex_v) { + res += std::conj(a(i)) * b(i); + } else { + res += a(i) * b(i); + } + } + return res; + }; + nda::vector a{1, 2, 3, 4, 5}; + nda::vector b{10, 20, 30, 40, 50}; + if constexpr (nda::is_complex_v) { a *= 1 + 1i; b *= 1 + 2i; } - EXPECT_COMPLEX_NEAR(nda::blas::dot(a, b), nda::blas::dot_generic(a, b), 1.e-14); + // vector dot vector + EXPECT_COMPLEX_NEAR(dot(a, b), exp_dot(a, b), 1.e-14); + + // size 0 vectors + EXPECT_EQ(dot(nda::vector{}, nda::vector{}), T(0)); + + // strided vector dot strided vector + auto a_v = a(nda::range(0, 5, 2)); + auto b_v = b(nda::range(0, 5, 2)); + EXPECT_COMPLEX_NEAR(dot(a_v, b_v), exp_dot(a_v, b_v), 1.e-14); } TEST(NDA, BLASDot) { - test_dot(); - test_dot>(); + auto dot = [](A &&a, B &&b) { return nda::blas::dot(std::forward(a), std::forward(b)); }; + test_dot(dot); + test_dot, false>(dot); + test_dot(dot); + test_dot, false>(dot); } -// Test the BLAS dotc function and its generic implementation. -template -void test_dotc() { - nda::vector a{1, 2, 3, 4, 5}; - nda::vector b{10, 20, 30, 40, 50}; - if constexpr (nda::is_complex_v) { - a *= 1 + 1i; - b *= 1 + 2i; - } +TEST(NDA, BLASDotc) { + auto dotc = [](A &&a, B &&b) { return nda::blas::dotc(std::forward(a), std::forward(b)); }; + test_dot(dotc); + test_dot, true>(dotc); + test_dot(dotc); + test_dot, true>(dotc); +} - EXPECT_COMPLEX_NEAR(nda::blas::dotc(a, b), nda::blas::dotc_generic(a, b), 1.e-14); +// Test the BLAS scal function. +TEST(NDA, BLASScalEmptyVector) { + nda::vector v; + nda::blas::scal(3.0, v); + EXPECT_TRUE(v.empty()); } -TEST(NDA, BLASDotc) { - test_dotc(); - test_dotc>(); +TEST(NDA, BLASScalFloat) { + nda::vector v{1, 2, 3, 4, 5}; + + // scale by a float + auto v1 = v; + auto xd = 3.0f; + nda::blas::scal(xd, v1); + EXPECT_ARRAY_NEAR(v1, xd * v); + + // scale by an integer + auto v2 = v; + // int32/auto is fine, but int16 avoids type narrowing + int16_t xi = 3; + nda::blas::scal(xi, v2); + EXPECT_ARRAY_NEAR(v2, xi * v); } -// Test the BLAS scal function. 
-template -void test_scal() { - nda::vector a{1, 2, 3, 4, 5}; - value_t x = 3.0; - if constexpr (nda::is_complex_v) { - a *= 1 + 1i; - x = 3.0 + 2.0i; - } +TEST(NDA, BLASScalSComplex) { + nda::vector> v{1, 2, 3, 4, 5}; + v *= 1 - 1i; + + // scale by a float + auto v1 = v; + auto xd = 3.0f; + nda::blas::scal(xd, v1); + EXPECT_ARRAY_NEAR(v1, xd * v); + + // scale by a complex float + auto v2 = v; + auto xc = 3.0f + 2.0if; + nda::blas::scal(xc, v2); + EXPECT_ARRAY_NEAR(v2, xc * v); +} + +TEST(NDA, BLASScalDouble) { + nda::vector v{1, 2, 3, 4, 5}; + + // scale by a double + auto v1 = v; + auto xd = 3.0; + nda::blas::scal(xd, v1); + EXPECT_ARRAY_NEAR(v1, xd * v); - auto exp = nda::make_regular(x * a); - nda::blas::scal(x, a); - EXPECT_ARRAY_NEAR(a, exp); + // scale by an integer + auto v2 = v; + auto xi = 3; + nda::blas::scal(xi, v2); + EXPECT_ARRAY_NEAR(v2, xi * v); } -TEST(NDA, BLASScal) { - test_scal(); - test_scal>(); +TEST(NDA, BLASScalDComplex) { + nda::vector> v{1, 2, 3, 4, 5}; + v *= 1 - 1i; + + // scale by a double + auto v1 = v; + auto xd = 3.0; + nda::blas::scal(xd, v1); + EXPECT_ARRAY_NEAR(v1, xd * v); + + // scale by a complex double + auto v2 = v; + auto xc = 3.0 + 2.0i; + nda::blas::scal(xc, v2); + EXPECT_ARRAY_NEAR(v2, xc * v); } diff --git a/test/c++/nda_cublas.cpp b/test/c++/nda_cublas.cpp index f076f4fd7..fe5a85f2c 100644 --- a/test/c++/nda_cublas.cpp +++ b/test/c++/nda_cublas.cpp @@ -9,182 +9,441 @@ #include #include +#include +#include #include // Test the CUBLAS gemm function. -template +template void test_gemm() { - nda::matrix M1{{0, 1}, {1, 2}}, M2{{1, 1}, {1, 1}}, M3{{1, 0}, {0, 1}}; - nda::cumatrix M1_d{M1}, M2_d{M2}, M3_d{M3}; - - nda::blas::gemm(1.0, M1_d, M2_d, 1.0, M3_d); - M3 = M3_d; + constexpr auto a_is_f_layout = std::same_as; + constexpr auto b_is_f_layout = std::same_as; + constexpr auto c_is_f_layout = std::same_as; + auto A = nda::matrix{{1, 2, 3}, {4, 5, 6}}; + auto B = nda::matrix{{1, 2}, {3, 4}, {5, 6}}; + auto exp_C = nda::matrix{{22, 28}, {49, 64}}; + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + B *= 2 - 1i; + exp_C *= (1 - 1i) * (2 - 1i); + } + auto A_d = to_addr_space(A); + auto B_d = to_addr_space(B); + + // C = A * B + auto C_d = to_addr_space(nda::matrix(2, 2)); + nda::blas::gemm(1.0, A_d, B_d, 0.0, C_d); + EXPECT_ARRAY_NEAR(nda::to_host(C_d), exp_C); + + // C = 3 * A * B + 2 * C + nda::blas::gemm(3, A_d, B_d, 2, C_d); + EXPECT_ARRAY_NEAR(nda::to_host(C_d), 5 * exp_C); + + // C_t = B^T * A^T + auto C_t_d = to_addr_space(nda::matrix(2, 2)); + nda::blas::gemm(1.0, nda::transpose(B_d), nda::transpose(A_d), 0.0, C_t_d); + EXPECT_ARRAY_NEAR(nda::to_host(C_t_d), nda::transpose(exp_C)); + + // C_h = B^H * A^H + if constexpr ((a_is_f_layout and b_is_f_layout and c_is_f_layout) or (!a_is_f_layout and !b_is_f_layout and !c_is_f_layout)) { + auto C_h_d = to_addr_space(nda::matrix(2, 2)); + nda::blas::gemm(1.0, nda::dagger(B_d), nda::dagger(A_d), 0.0, C_h_d); + EXPECT_ARRAY_NEAR(nda::to_host(C_h_d), nda::dagger(exp_C)); + } - EXPECT_ARRAY_NEAR(M3, nda::matrix{{2, 1}, {3, 4}}); + // contiguous matrix views + if constexpr (a_is_f_layout and !b_is_f_layout and !c_is_f_layout) { + auto exp_C_v = nda::matrix{{13, 16}, {37, 46}}; + if constexpr (nda::is_complex_v) exp_C_v *= (1 - 1i) * (2 - 1i); + auto C_v_d = to_addr_space(nda::matrix(5, 2)); + nda::blas::gemm(1.0, A_d(nda::range::all, nda::range(0, 2)), B_d(nda::range(1, 3), nda::range::all), 0.0, + C_v_d(nda::range(2, 4), nda::range::all)); + EXPECT_ARRAY_NEAR(nda::to_host(C_v_d)(nda::range(2, 4), 
nda::range::all), exp_C_v); + } } TEST(NDA, CUBLASGemm) { - test_gemm(); - test_gemm(); - test_gemm, nda::C_layout>(); - test_gemm, nda::F_layout>(); + // double, C-layout + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + + // double, F-layout + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + + // double, mixed layout + test_gemm(); + test_gemm(); + test_gemm(); + test_gemm(); + + // complex, C-layout + test_gemm, nda::C_layout, nda::C_layout, nda::C_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device>(); + test_gemm, nda::C_layout, nda::C_layout, nda::C_layout, nda::mem::Device, nda::mem::Unified, nda::mem::Device>(); + test_gemm, nda::C_layout, nda::C_layout, nda::C_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified>(); + test_gemm, nda::C_layout, nda::C_layout, nda::C_layout, nda::mem::Host, nda::mem::Unified, nda::mem::Unified>(); + + // complex, F-layout + test_gemm, nda::F_layout, nda::F_layout, nda::F_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device>(); + test_gemm, nda::F_layout, nda::F_layout, nda::F_layout, nda::mem::Device, nda::mem::Device, nda::mem::Unified>(); + test_gemm, nda::F_layout, nda::F_layout, nda::F_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified>(); + test_gemm, nda::F_layout, nda::F_layout, nda::F_layout, nda::mem::Unified, nda::mem::Host, nda::mem::Unified>(); + + // complex, mixed layout + test_gemm, nda::C_layout, nda::F_layout, nda::C_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device>(); + test_gemm, nda::F_layout, nda::C_layout, nda::F_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified>(); + test_gemm, nda::C_layout, nda::F_layout, nda::F_layout, nda::mem::Host, nda::mem::Unified, nda::mem::Unified>(); + test_gemm, nda::F_layout, nda::C_layout, nda::C_layout, nda::mem::Unified, nda::mem::Host, nda::mem::Unified>(); } -// Test the CUBLAS gemm_batch function. -template +// Test the CUBLAS/Magma gemm_batch, gemm_vbatch and gemm_batch_strided functions. 
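// Editorial sketch (not part of the patch): the batched routines tested below all apply the same
// update C_k <- alpha * A_k * B_k + beta * C_k to every entry k of a batch. gemm_vbatch
// additionally allows per-entry matrix sizes, and gemm_batch_strided takes the batch as a single
// 3-dimensional array. The loop below is the host-side reference the tests compare against
// (the umbrella header names are assumed):
#include <cstddef>
#include <vector>
#include <nda/nda.hpp>
#include <nda/blas.hpp>

void reference_gemm_batch(double alpha, std::vector<nda::matrix<double>> const &A,
                          std::vector<nda::matrix<double>> const &B, double beta,
                          std::vector<nda::matrix<double>> &C) {
  // one plain gemm per batch entry
  for (std::size_t k = 0; k < A.size(); ++k) nda::blas::gemm(alpha, A[k], B[k], beta, C[k]);
}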
+template void test_gemm_batch() { - int batch_count = 10; - long size = 64; + int const batch_count = 4; + long size = 2; + long fac = 2; + if constexpr (!is_vbatch) { + size = 16; + fac = 1; + } - auto vec_A_d = std::vector(batch_count, nda::to_device(nda::matrix::rand({size, size}))); - auto vec_B_d = std::vector(batch_count, nda::to_device(nda::matrix::rand({size, size}))); - auto vec_C_d = std::vector(batch_count, nda::to_device(nda::matrix::zeros({size, size}))); - nda::blas::gemm_batch(1.0, vec_A_d, vec_B_d, 0.0, vec_C_d); + // create vector of matrices + std::vector>> vec_A, vec_B, vec_C; + std::vector> exp_C; + for ([[maybe_unused]] auto i : nda::range(batch_count)) { + auto A = nda::matrix::rand({size, size}); + auto B = nda::matrix::rand({size, size}); + auto C = nda::matrix::zeros({size, size}); + vec_A.push_back(A); + vec_B.push_back(B); + vec_C.push_back(C); + nda::blas::gemm(1.0, A, B, 0.0, C); + exp_C.push_back(std::move(C)); + size *= fac; + } - for (auto i : nda::range(batch_count)) - EXPECT_ARRAY_NEAR(nda::make_regular(nda::to_host(vec_A_d[i]) * nda::to_host(vec_B_d[i])), nda::to_host(vec_C_d[i])); + // test batched gemm routines + if constexpr (is_vbatch) { + nda::blas::gemm_vbatch(1.0, vec_A, vec_B, 0.0, vec_C); + } else { + nda::blas::gemm_batch(1.0, vec_A, vec_B, 0.0, vec_C); + } + for (auto i : nda::range(batch_count)) EXPECT_ARRAY_NEAR(nda::to_host(vec_C[i]), exp_C[i]); } TEST(NDA, CUBLASGemmBatch) { - test_gemm_batch(); - test_gemm_batch(); - test_gemm_batch, nda::C_layout>(); - test_gemm_batch, nda::F_layout>(); + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, nda::mem::Device, false>(); + test_gemm_batch, nda::F_layout, nda::mem::Device, false>(); + + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, nda::mem::Unified, false>(); + test_gemm_batch, nda::F_layout, nda::mem::Unified, false>(); } #ifdef NDA_HAVE_MAGMA -template -void test_gemm_vbatch() { - int batch_count = 10; - long size = 64; - - auto vec_A_d = std::vector(batch_count, nda::to_device(nda::matrix::rand({size, size}))); - auto vec_B_d = std::vector(batch_count, nda::to_device(nda::matrix::rand({size, size}))); - auto vec_C_d = std::vector(batch_count, nda::to_device(nda::matrix::zeros({size, size}))); - nda::blas::gemm_vbatch(1.0, vec_A_d, vec_B_d, 0.0, vec_C_d); - - for (auto i : nda::range(batch_count)) - EXPECT_ARRAY_NEAR(nda::make_regular(nda::to_host(vec_A_d[i]) * nda::to_host(vec_B_d[i])), nda::to_host(vec_C_d[i])); +TEST(NDA, MAGMAGemmVbatch) { + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, nda::mem::Device, true>(); + test_gemm_batch, nda::F_layout, nda::mem::Device, true>(); + + test_gemm_batch(); + test_gemm_batch(); + test_gemm_batch, nda::C_layout, nda::mem::Unified, true>(); + test_gemm_batch, nda::F_layout, nda::mem::Unified, true>(); +} +#endif // NDA_HAVE_MAGMA + +template +void test_gemm_batch_strided() { + int const batch_count = 10; + long const size = 16; + + // create arrays + auto arr_A = nda::array::rand({batch_count, size, size}); + auto arr_B = nda::array::rand({batch_count, size, size}); + auto arr_C = nda::array::zeros({batch_count, size, size}); + auto arr_A_d = nda::array>{arr_A}; + auto arr_B_d = nda::array>{arr_B}; + auto arr_C_d = nda::array>{arr_C}; + + // test strided, batched gemm routine + nda::blas::gemm_batch_strided(1.0, arr_A_d, arr_B_d, 0.0, arr_C_d); + nda::blas::gemm_batch_strided(1.0, arr_A, arr_B, 0.0, arr_C); + for (auto i : nda::range(batch_count)) { + 
EXPECT_ARRAY_NEAR(nda::to_host(arr_C_d(i, nda::range::all, nda::range::all)), arr_C(i, nda::range::all, nda::range::all)); + } } -TEST(NDA, CUBLASGemmVbatch) { - test_gemm_vbatch(); - test_gemm_vbatch(); - test_gemm_vbatch, nda::C_layout>(); - test_gemm_vbatch, nda::F_layout>(); +TEST(NDA, BLASGemmBatchStrided) { + test_gemm_batch_strided(); + test_gemm_batch_strided(); + test_gemm_batch_strided, nda::C_layout, nda::mem::Device>(); + test_gemm_batch_strided, nda::C_layout, nda::mem::Unified>(); } -#endif // Test the CUBLAS gemv function. -template +template void test_gemv() { - using namespace nda::clef::literals; - - nda::matrix A(5, 5); - A(i_, j_) << i_ + 2 * j_ + 1; - - nda::vector v(5), w(5); - v() = 1; - w() = 0; - - nda::cumatrix A_d{A}; - nda::cuvector v_d{v}, w_d{w}; - - nda::range rg(1, 3); - nda::blas::gemv(1, A_d(rg, rg), v_d(rg), 0, w_d(rg)); - w = w_d; - EXPECT_ARRAY_NEAR(w, nda::vector{0, 10, 12, 0, 0}); - - auto AT_d = nda::transpose(A_d); - nda::blas::gemv(1, AT_d(rg, rg), v_d(rg), 0, w_d(rg)); - w = w_d; - EXPECT_ARRAY_NEAR(w, nda::vector{0, 9, 13, 0, 0}); - - // test operator* - w_d(rg) = AT_d(rg, rg) * v_d(rg); - w() = -8; - w(rg) = w_d(rg); - EXPECT_ARRAY_NEAR(w, nda::vector{-8, 9, 13, -8, -8}); + auto x = nda::vector{1, 2, 3}; + auto x_t = nda::vector{1, 2, 3, 4}; + auto exp_y = nda::vector{14, 32, 50, 68}; + auto exp_y_t = nda::vector{70, 80, 90}; + auto A = nda::matrix(4, 3); + nda::for_each(A.shape(), [&A](auto i, auto j) { A(i, j) = i * 3 + j + 1; }); + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + x *= 2 - 1i; + x_t *= 2 - 1i; + exp_y *= (1 - 1i) * (2 - 1i); + exp_y_t *= (1 - 1i) * (2 - 1i); + } + auto A_d = to_addr_space(A); + auto x_d = to_addr_space(x); + auto x_t_d = to_addr_space(x_t); + + // y = A * x + auto y_d = to_addr_space(nda::vector(4)); + nda::blas::gemv(1.0, A_d, x_d, 0.0, y_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_d), exp_y); + + // y = 3 * A * x + 2y + nda::blas::gemv(3, A_d, x_d, 2, y_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_d), 5 * exp_y); + + // y_t = A^T * x_t + auto y_t_d = to_addr_space(nda::vector(3)); + nda::blas::gemv(1.0, nda::transpose(A_d), x_t_d, 0.0, y_t_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_t_d), exp_y_t); + + if constexpr (std::same_as) { + // y_h = A^H * x_t + auto exp_y_h = exp_y_t; + if constexpr (nda::is_complex_v) exp_y_h = nda::vector{210 + 70i, 240 + 80i, 270 + 90i}; + auto y_h_d = to_addr_space(nda::vector(3)); + nda::blas::gemv(1.0, nda::dagger(A_d), x_t_d, 0.0, y_h_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_h_d), exp_y_h); + } else { + // contiguous matrix view * strided vector view + auto x_v = nda::vector(6); + x_v(nda::range(0, 6, 2)) = x; + auto x_v_d = to_addr_space(x_v); + auto y_v_d = to_addr_space(nda::vector(4)); + nda::blas::gemv(1, A_d(nda::range(2), nda::range::all), x_v_d(nda::range(0, 6, 2)), 0, y_v_d(nda::range(0, 4, 2))); + EXPECT_ARRAY_NEAR(nda::to_host(y_v_d)(nda::range(0, 4, 2)), exp_y(nda::range(2))); + } } TEST(NDA, CUBLASGemv) { - test_gemv(); - test_gemv(); - test_gemv, nda::C_layout>(); - test_gemv, nda::F_layout>(); + test_gemv(); + test_gemv(); + test_gemv(); + test_gemv(); + + test_gemv(); + test_gemv(); + test_gemv(); + test_gemv(); + + test_gemv, nda::C_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device>(); + test_gemv, nda::C_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Device>(); + test_gemv, nda::C_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified>(); + test_gemv, nda::C_layout, nda::mem::Host, nda::mem::Host, nda::mem::Unified>(); + + test_gemv, 
nda::F_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device>(); + test_gemv, nda::F_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Device>(); + test_gemv, nda::F_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified>(); + test_gemv, nda::F_layout, nda::mem::Unified, nda::mem::Host, nda::mem::Host>(); } -// Test the CUBLAS ger function. -template -void test_ger() { - nda::matrix M(2, 2); - M = 0; - nda::array v{1, 2}; - - nda::cumatrix M_d{M}; - nda::cuvector v_d{v}; - - nda::blas::ger(1.0, v_d, v_d, M_d); - - M = M_d; - EXPECT_ARRAY_NEAR(M, nda::matrix{{1, 2}, {2, 4}}); +// Test the CUBLAS ger/gerc function. +template +void test_ger(auto ger) { + using matrix_t = nda::matrix; + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; + + // resulting 2 x 2 matrix + auto exp_M1 = nda::matrix{{1, 2}, {2, 4}}; + if constexpr (nda::is_complex_v and not star) exp_M1 *= -1; + auto M1_d = to_addr_space(matrix_t::zeros(2, 2)); + nda::vector v{1, 2}; + v *= fac; + auto v_d = to_addr_space(v); + ger(1.0, v_d, v_d, M1_d); + EXPECT_ARRAY_NEAR(nda::to_host(M1_d), exp_M1); + ger(1.0, v_d, v_d, M1_d); + EXPECT_ARRAY_NEAR(nda::to_host(M1_d), exp_M1 * 2); + + // resulting 2 x 3 matrix + auto exp_M2 = nda::matrix{{3, 4, 5}, {6, 8, 10}}; + if constexpr (nda::is_complex_v) exp_M2 *= fac; + auto M2_d = to_addr_space(matrix_t::zeros(2, 3)); + auto w_d = to_addr_space(nda::vector{3, 4, 5}); + ger(1.0, v_d, w_d, M2_d); + EXPECT_ARRAY_NEAR(nda::to_host(M2_d), exp_M2); + ger(1.0, v_d, w_d, M2_d); + EXPECT_ARRAY_NEAR(nda::to_host(M2_d), exp_M2 * 2); + + // resulting 3 x 2 matrix + auto exp_M3 = nda::matrix{{3, 6}, {4, 8}, {5, 10}}; + if constexpr (nda::is_complex_v and not star) exp_M3 *= fac; + if constexpr (nda::is_complex_v and star) exp_M3 *= -fac; + auto M3_d = to_addr_space(matrix_t::zeros(3, 2)); + ger(1.0, w_d, v_d, M3_d); + EXPECT_ARRAY_NEAR(nda::to_host(M3_d), exp_M3); + ger(1.0, w_d, v_d, M3_d); + EXPECT_ARRAY_NEAR(nda::to_host(M3_d), exp_M3 * 2); + + // outer product of strided views + auto exp_M4 = nda::matrix{{6, 8, 10}, {12, 16, 20}}; + if constexpr (nda::is_complex_v and not star) exp_M4 *= -1.0; + auto M4_d = to_addr_space(matrix_t::zeros(2, 3)); + auto v_strided = nda::vector{0, 1, 0, 2, 0}; + v_strided *= fac; + auto v_strided_d = to_addr_space(v_strided); + auto w_strided = nda::vector{3, 0, 0, 4, 0, 0, 5}; + w_strided *= fac; + auto w_strided_d = to_addr_space(w_strided); + ger(2.0, v_strided_d(nda::range(1, 5, 2)), w_strided_d(nda::range(0, 7, 3)), M4_d); + EXPECT_ARRAY_NEAR(nda::to_host(M4_d), exp_M4); } TEST(NDA, CUBLASGer) { - test_ger(); - test_ger(); - test_ger, nda::C_layout>(); - test_ger, nda::C_layout>(); + auto ger = [](auto alpha, auto &&x, auto &&y, auto &&m) { return nda::blas::ger(alpha, x, y, m); }; + test_ger(ger); + test_ger(ger); + test_ger(ger); + test_ger(ger); + + test_ger(ger); + test_ger(ger); + test_ger(ger); + test_ger(ger); + + test_ger, nda::C_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device, false>(ger); + test_ger, nda::C_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Device, false>(ger); + test_ger, nda::C_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified, false>(ger); + test_ger, nda::C_layout, nda::mem::Host, nda::mem::Unified, nda::mem::Unified, false>(ger); + + test_ger, nda::F_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device, false>(ger); + test_ger, nda::F_layout, nda::mem::Unified, nda::mem::Device, nda::mem::Device, false>(ger); + test_ger, nda::F_layout, nda::mem::Unified, 
nda::mem::Unified, nda::mem::Unified, false>(ger); + test_ger, nda::F_layout, nda::mem::Unified, nda::mem::Host, nda::mem::Host, false>(ger); } -TEST(NDA, CUBLASOuterProduct) { - auto N = nda::rand(2, 3); - auto M = nda::rand(4, 5); - - nda::array P(2, 3, 4, 5); - for (auto [i, j] : N.indices()) - for (auto [k, l] : M.indices()) P(i, j, k, l) = N(i, j) * M(k, l); - - nda::cumatrix M_d{M}, N_d{N}; - auto Res_d = nda::blas::outer_product(N_d, M_d); - auto Res = nda::array{Res_d}; - EXPECT_ARRAY_NEAR(P, Res); +TEST(NDA, CUBLASGerc) { + auto gerc = [](auto alpha, auto &&x, auto &&y, auto &&m) { return nda::blas::gerc(alpha, x, y, m); }; + test_ger(gerc); + test_ger(gerc); + test_ger(gerc); + test_ger(gerc); + + test_ger, nda::F_layout, nda::mem::Device, nda::mem::Device, nda::mem::Device, true>(gerc); + test_ger, nda::F_layout, nda::mem::Unified, nda::mem::Device, nda::mem::Unified, true>(gerc); + test_ger, nda::F_layout, nda::mem::Unified, nda::mem::Unified, nda::mem::Unified, true>(gerc); + test_ger, nda::F_layout, nda::mem::Host, nda::mem::Unified, nda::mem::Unified, true>(gerc); } -// Test the CUBLAS dot function. -template -void test_dot() { - nda::vector a{1, 2, 3, 4, 5}; - nda::vector b{10, 20, 30, 40, 50}; - if constexpr (nda::is_complex_v) { +// Test the CUBLAS dot/dotc function. +template +void test_dot(auto dot) { + auto exp_dot = [](auto const &a, auto const &b) { + T res = 0.0; + for (size_t i = 0; i < a.size(); ++i) { + if constexpr (star and nda::is_complex_v) { + res += std::conj(a(i)) * b(i); + } else { + res += a(i) * b(i); + } + } + return res; + }; + nda::vector a{1, 2, 3, 4, 5}; + nda::vector b{10, 20, 30, 40, 50}; + if constexpr (nda::is_complex_v) { a *= 1 + 1i; b *= 1 + 2i; } + auto a_d = to_addr_space(a); + auto b_d = to_addr_space(b); - nda::cuvector a_d{a}, b_d{b}; - EXPECT_COMPLEX_NEAR((nda::blas::dot(a_d, b_d)), (nda::blas::dot_generic(a, b)), 1.e-14); + // vector dot vector + EXPECT_COMPLEX_NEAR(dot(a_d, b_d), exp_dot(a, b), 1.e-14); + + // size 0 vectors + EXPECT_EQ(dot(to_addr_space(nda::vector{}), to_addr_space(nda::vector{})), T(0)); + + // strided vector dot strided vector + EXPECT_COMPLEX_NEAR(dot(a_d(nda::range(0, 5, 2)), b_d(nda::range(0, 5, 2))), exp_dot(a(nda::range(0, 5, 2)), b(nda::range(0, 5, 2))), 1.e-14); } TEST(NDA, CUBLASDot) { - test_dot(); - test_dot>(); + auto dot = [](A &&a, B &&b) { return nda::blas::dot(std::forward(a), std::forward(b)); }; + test_dot(dot); + test_dot(dot); + test_dot(dot); + test_dot(dot); + test_dot, nda::mem::Device, nda::mem::Device, false>(dot); + test_dot, nda::mem::Unified, nda::mem::Device, false>(dot); + test_dot, nda::mem::Unified, nda::mem::Unified, false>(dot); + test_dot, nda::mem::Host, nda::mem::Unified, false>(dot); } -// Test the CUBLAS dotc function. -template -void test_dotc() { - nda::vector a{1, 2, 3, 4, 5}; - nda::vector b{10, 20, 30, 40, 50}; - if constexpr (nda::is_complex_v) { - a *= 1 + 1i; - b *= 1 + 2i; - } +TEST(NDA, CUBLASDotc) { + auto dotc = [](A &&a, B &&b) { return nda::blas::dotc(std::forward(a), std::forward(b)); }; + test_dot(dotc); + test_dot(dotc); + test_dot(dotc); + test_dot(dotc); + test_dot, nda::mem::Device, nda::mem::Device, true>(dotc); + test_dot, nda::mem::Unified, nda::mem::Device, true>(dotc); + test_dot, nda::mem::Unified, nda::mem::Unified, true>(dotc); + test_dot, nda::mem::Host, nda::mem::Unified, true>(dotc); +} - nda::cuvector a_d{a}, b_d{b}; - EXPECT_COMPLEX_NEAR((nda::blas::dotc(a_d, b_d)), (nda::blas::dotc_generic(a, b)), 1.e-14); +// Test the CUBLAS scal function. 
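// Editorial sketch (not part of the patch): blas::scal rescales a vector in place, x <- alpha * x.
// The tests below run it on device/unified memory; the host-side usage is simply (values and
// header names are illustrative/assumed):
#include <complex>
#include <nda/nda.hpp>
#include <nda/blas.hpp>

void scal_usage() {
  nda::vector<double> x{1, 2, 3};
  nda::blas::scal(2.0, x); // x is now {2, 4, 6}

  nda::vector<std::complex<double>> z{1, 2, 3};
  nda::blas::scal(std::complex<double>{0, 1}, z); // z is now {1i, 2i, 3i}
}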
+template +void test_scal() { + // empty vector + nda::vector v; + auto v_d = to_addr_space(v); + nda::blas::scal(3.0, v_d); + EXPECT_TRUE(v_d.empty()); + + // scale a double vector by a double + v = {1, 2, 3, 4, 5}; + v_d = to_addr_space(v); + nda::blas::scal(3.0, v_d); + EXPECT_ARRAY_NEAR(nda::to_host(v_d), 3.0 * v); + + // scale a double vector by an integer + v_d = to_addr_space(v); + nda::blas::scal(3, v_d); + EXPECT_ARRAY_NEAR(nda::to_host(v_d), 3 * v); + + // scale a complex double vector by a double + auto vc = nda::vector>{1, 2, 3, 4, 5}; + vc *= 1 - 1i; + auto vc_d = to_addr_space(vc); + nda::blas::scal(3.0, vc_d); + EXPECT_ARRAY_NEAR(nda::to_host(vc_d), 3.0 * vc); + + // scale by a complex double + vc_d = to_addr_space(vc); + nda::blas::scal(3.0 + 2.0i, vc_d); + EXPECT_ARRAY_NEAR(nda::to_host(vc_d), (3.0 + 2.0i) * vc); } -TEST(NDA, CUBLASDotc) { - test_dotc(); - test_dotc>(); +TEST(NDA, CUBLASScal) { + test_scal(); + test_scal(); } diff --git a/test/c++/nda_culapack.cpp b/test/c++/nda_culapack.cpp index fe29cd2cd..fb64e30bc 100644 --- a/test/c++/nda_culapack.cpp +++ b/test/c++/nda_culapack.cpp @@ -10,65 +10,148 @@ #include #include +#include +#include +#include // Test the CULAPACK gesvd function. -template +template void test_gesvd() { - using matrix_t = nda::matrix; - - auto A = matrix_t{{{1, 1, 1}, {2, 3, 4}, {3, 5, 2}, {4, 2, 5}, {5, 4, 3}}}; + auto A = nda::matrix{{{1, 1, 1}, {2, 3, 4}, {3, 5, 2}, {4, 2, 5}, {5, 4, 3}}}; + if constexpr (std::same_as) { + // CUDA cannot handle when m < n + A = nda::matrix(nda::transpose(A)); + } auto [m, n] = A.shape(); - auto U = matrix_t(m, m); - auto VT = matrix_t(n, n); - - auto s = nda::vector(std::min(m, n)); - - auto A_d = to_device(A); - auto s_d = to_device(s); - auto U_d = to_device(U); - auto VT_d = to_device(VT); - nda::lapack::gesvd(A_d, s_d, U_d, VT_d); - s = s_d; - U = U_d; - VT = VT_d; + auto A_d = to_addr_space(A); + auto U_d = to_addr_space(nda::matrix(m, m)); + auto VT_d = to_addr_space(nda::matrix(n, n)); + auto S_d = to_addr_space(nda::vector(std::min(m, n))); + nda::lapack::gesvd(A_d, S_d, U_d, VT_d); - auto Sigma = matrix_t::zeros(A.shape()); - for (auto i : nda::range(std::min(m, n))) Sigma(i, i) = s(i); - EXPECT_ARRAY_NEAR(A, U * Sigma * VT, 1e-14); + auto S = nda::to_host(S_d); + auto Sigma = nda::matrix::zeros(A.shape()); + for (auto i : nda::range(std::min(m, n))) Sigma(i, i) = S(i); + EXPECT_ARRAY_NEAR(A, nda::to_host(U_d) * Sigma * nda::to_host(VT_d), 1e-14); } TEST(NDA, CULAPACKGesvd) { - test_gesvd(); - test_gesvd>(); + test_gesvd(); + test_gesvd(); + test_gesvd, nda::C_layout, nda::mem::Device>(); + test_gesvd, nda::F_layout, nda::mem::Device>(); + + test_gesvd(); + test_gesvd(); + test_gesvd, nda::C_layout, nda::mem::Unified>(); + test_gesvd, nda::F_layout, nda::mem::Unified>(); } // Test the CULAPACK getrs and getrf functions. 
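// Editorial sketch (not part of the patch): getrf overwrites a matrix with its pivoted LU factors
// (P * A = L * U, row pivots returned in ipiv), and getrs reuses those factors to solve
// A * X = B (or A^T * X = B / A^H * X = B) by two triangular solves. A host-side sequence
// mirroring what the tests below run on the GPU (values and header names are illustrative/assumed):
#include <nda/nda.hpp>
#include <nda/lapack.hpp>

void lu_solve_usage() {
  auto A    = nda::matrix<double, nda::F_layout>{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}};
  auto B    = nda::matrix<double, nda::F_layout>{{1, 5}, {4, 5}, {3, 6}};
  auto ipiv = nda::array<int, 1>(3);
  nda::lapack::getrf(A, ipiv);    // A now holds the L and U factors, ipiv the pivots
  nda::lapack::getrs(A, B, ipiv); // B is overwritten with the solution X
}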
-template +template void test_getrs_getrf() { - using matrix_t = nda::matrix; - - auto A = matrix_t{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; - auto B = matrix_t{{1, 5}, {4, 5}, {3, 6}}; - - // solve A * x = B using exact matrix inverse - auto Ainv = matrix_t{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; - auto X1 = matrix_t{Ainv * B}; - EXPECT_ARRAY_NEAR(matrix_t{A * X1}, B); - - // solve A * x = B using getrf and getrs - auto A_d = to_device(A); - auto B_d = to_device(B); - nda::cuarray ipiv(3); - nda::lapack::getrf(A_d, ipiv); - nda::lapack::getrs(A_d, B_d, ipiv); - - auto X2 = to_host(B_d); - EXPECT_ARRAY_NEAR(matrix_t{A * X2}, B); - EXPECT_ARRAY_NEAR(X1, X2); + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; + + auto A = nda::matrix{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; + A *= fac; + auto Ainv = nda::matrix{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; + Ainv /= fac; + auto B = nda::matrix{{1, 5}, {4, 5}, {3, 6}}; + + // solve A * X = B using getrf and getrs + auto A_d = to_addr_space(A); + auto B_d = to_addr_space(B); + auto ipiv_d = to_addr_space(nda::array(3)); + nda::lapack::getrf(A_d, ipiv_d); + nda::lapack::getrs(A_d, B_d, ipiv_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(B_d), B); + EXPECT_ARRAY_NEAR(Ainv * B, nda::to_host(B_d)); + + // solve A^T * X = B using getrf and getrs + A_d = A; + B_d = B; + nda::lapack::getrf(A_d, ipiv_d); + nda::lapack::getrs(nda::transpose(A_d), B_d, ipiv_d); + EXPECT_ARRAY_NEAR(nda::transpose(A) * nda::to_host(B_d), B); + EXPECT_ARRAY_NEAR(nda::transpose(Ainv) * B, nda::to_host(B_d)); + + // solve A^H * X = B using getrf and getrs + if constexpr (std::same_as) { + A_d = A; + B_d = B; + nda::lapack::getrf(A_d, ipiv_d); + nda::lapack::getrs(nda::conj(nda::transpose(A_d)), B_d, ipiv_d); + EXPECT_ARRAY_NEAR(nda::conj(nda::transpose(A)) * nda::to_host(B_d), B); + EXPECT_ARRAY_NEAR(nda::conj(nda::transpose(Ainv)) * B, nda::to_host(B_d)); + } + + // solve A * x = b using getrf and getrs + A_d = A; + auto b_d = to_addr_space(nda::vector{B(nda::range::all, 0)}); + nda::lapack::getrf(A_d, ipiv_d); + nda::lapack::getrs(A_d, b_d, ipiv_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(b_d), B(nda::range::all, 0)); + EXPECT_ARRAY_NEAR((Ainv * B)(nda::range::all, 0), nda::to_host(b_d)); } TEST(NDA, CULAPACKGetrsAndGetrf) { - test_getrs_getrf(); - test_getrs_getrf>(); + test_getrs_getrf(); + test_getrs_getrf(); + test_getrs_getrf, nda::C_layout, nda::mem::Unified, nda::mem::Device>(); + test_getrs_getrf, nda::F_layout, nda::mem::Unified, nda::mem::Unified>(); + + test_getrs_getrf(); + test_getrs_getrf(); + test_getrs_getrf, nda::C_layout, nda::mem::Unified, nda::mem::Host>(); + test_getrs_getrf, nda::F_layout, nda::mem::Device, nda::mem::Device>(); +} + +TEST(NDA, CULAPACKGetrfWithRectangularMatrix) { + auto A = nda::matrix{{1, 5}, {4, 5}, {3, 6}}; + auto AT = nda::matrix(nda::transpose(A)); + auto A_c = nda::matrix{A}; + auto AT_c = nda::matrix{AT}; + auto ipiv_d = nda::cuarray(2); + + // get the matrices P, L, U from getrf output + auto get_plu = [](auto const &M, auto const &ipiv, int m, int n) { + using layout_t = std::conditional_t, nda::C_layout, nda::F_layout>; + auto P = nda::matrix::zeros(m, m); + auto L = nda::matrix::zeros(m, m); + auto U = nda::matrix::zeros(m, n); + nda::diagonal(P) = 1; + nda::diagonal(L) = 1; + for (int i = 0; i < ipiv.size(); ++i) deep_swap(P(i, nda::range::all), P(ipiv(i) - 1, nda::range::all)); + for (int i = 0; i < m; ++i) { + L(i, nda::range(i)) = (nda::blas::has_C_layout ? 
M(nda::range(i), i) : M(i, nda::range(i))); + U(i, nda::range(i, n)) = (nda::blas::has_C_layout ? M(nda::range(i, n), i) : M(i, nda::range(i, n))); + } + return std::make_tuple(P, L, U); + }; + + // LU decomposition for 3x2 Fortran layout matrix + auto LU_f_32 = nda::to_device(A); + nda::lapack::getrf(LU_f_32, ipiv_d); + auto [P_f_32, L_f_32, U_f_32] = get_plu(nda::to_host(LU_f_32), nda::to_host(ipiv_d), 3, 2); + EXPECT_ARRAY_NEAR(P_f_32 * A, L_f_32 * U_f_32); + + // LU decomposition for 2x3 Fortran layout matrix + auto LU_f_23 = nda::to_device(AT); + nda::lapack::getrf(LU_f_23, ipiv_d); + auto [P_f_23, L_f_23, U_f_23] = get_plu(nda::to_host(LU_f_23), nda::to_host(ipiv_d), 2, 3); + EXPECT_ARRAY_NEAR(P_f_23 * AT, L_f_23 * U_f_23); + + // LU decomposition for 3x2 C layout matrix + auto LU_c_32 = nda::to_device(A_c); + nda::lapack::getrf(LU_c_32, ipiv_d); + auto [P_c_32, L_c_32, U_c_32] = get_plu(nda::to_host(LU_c_32), nda::to_host(ipiv_d), 2, 3); + EXPECT_ARRAY_NEAR(P_c_32 * nda::transpose(A_c), L_c_32 * U_c_32); + + // LU decomposition for 2x3 C layout matrix + auto LU_c_23 = nda::to_device(AT_c); + nda::lapack::getrf(LU_c_23, ipiv_d); + auto [P_c_23, L_c_23, U_c_23] = get_plu(nda::to_host(LU_c_23), nda::to_host(ipiv_d), 3, 2); + EXPECT_ARRAY_NEAR(P_c_23 * nda::transpose(AT_c), L_c_23 * U_c_23); } diff --git a/test/c++/nda_culinear_algebra.cpp b/test/c++/nda_culinear_algebra.cpp new file mode 100644 index 000000000..f2373dda6 --- /dev/null +++ b/test/c++/nda_culinear_algebra.cpp @@ -0,0 +1,392 @@ +// Copyright (c) 2019--present, The Simons Foundation +// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0. +// SPDX-License-Identifier: Apache-2.0 +// See LICENSE in the root of this distribution for details. + +#include "./test_common.hpp" + +#include +#include + +#include +#include + +using namespace std::complex_literals; + +// Test the generic dot/dotc function. 
+auto exp_dot(auto const &a, auto const &b) { + auto res = a(0) * b(0); + for (size_t i = 1; i < a.size(); ++i) res += a(i) * b(i); + return res; +} + +auto exp_dotc(auto const &a, auto const &b) { + auto res = std::conj(a(0)) * b(0); + for (size_t i = 1; i < a.size(); ++i) res += std::conj(a(i)) * b(i); + return res; +} + +template +void test_dot() { + // BLAS compatible vectors + auto a = nda::vector{1, 2, 3, 4, 5}; + auto b = nda::vector{10, 20, 30, 40, 50}; + auto a_d = to_addr_space(a); + auto b_d = to_addr_space(b); + EXPECT_DOUBLE_EQ(nda::linalg::dot(a_d, b_d), nda::blas::dot(a, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(a_d, b_d), nda::blas::dotc(a, b)); + + auto c = nda::vector>{a * (1.1 - 2.1i)}; + auto d = nda::vector>{b * (3 + 4i)}; + auto c_d = to_addr_space(c); + auto d_d = to_addr_space(d); + EXPECT_COMPLEX_NEAR(nda::linalg::dot(c_d, d_d), exp_dot(c, d)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(c_d, d_d), exp_dotc(c, d)); + + // vectors with different value types + if constexpr (nda::mem::have_host_compatible_addr_space) { + EXPECT_COMPLEX_NEAR(nda::linalg::dot(a_d, c_d), exp_dot(a, c)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(a_d, c_d), exp_dotc(a, c)); + + auto e = nda::vector{1, 2, 3, 4, 5}; + auto e_d = to_addr_space(e); + EXPECT_EQ(nda::linalg::dot(e_d, e_d), exp_dot(e, e)); + EXPECT_DOUBLE_EQ(nda::linalg::dot(e_d, b_d), exp_dot(e, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(e_d, b_d), exp_dotc(e, b)); + + // lazy expressions + auto sin_a = nda::make_regular(nda::sin(a)); + EXPECT_DOUBLE_EQ(nda::linalg::dot(nda::sin(a_d), b), exp_dot(sin_a, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(nda::sin(a_d), b), exp_dotc(sin_a, b)); + } + + // (strided) vector views + auto rg1 = nda::range(0, 5, 2); + auto rg2 = nda::range(1, 4); + EXPECT_COMPLEX_NEAR(nda::linalg::dot(c_d(rg1), d_d(rg2)), exp_dot(c(rg1), d(rg2))); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(c_d(rg1), d_d(rg2)), exp_dotc(c(rg1), d(rg2))); +} + +TEST(NDA, CULinearAlgebraDotProduct) { + test_dot(); + test_dot(); + test_dot(); + test_dot(); + test_dot(); + test_dot(); +} + +// Test the generic matvecmul function. 
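// Editorial sketch (not part of the patch): linalg::matvecmul computes y(i) = sum_j A(i, j) * x(j),
// dispatching to BLAS gemv when value types, layouts and address spaces allow it and to a generic
// loop otherwise. The naive reference below spells out the operation being verified (types are
// illustrative):
#include <nda/nda.hpp>

nda::vector<double> naive_matvecmul(nda::matrix<double> const &A, nda::vector<double> const &x) {
  auto y = nda::vector<double>(A.extent(0));
  y()    = 0.0;
  for (long i = 0; i < A.extent(0); ++i)
    for (long j = 0; j < A.extent(1); ++j) y(i) += A(i, j) * x(j);
  return y;
}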
+template +void test_matvecmul() { + auto x = nda::vector{1, 2, 3}; + auto x_t = nda::vector{1, 2, 3, 4}; + auto exp_y = nda::vector{14, 32, 50, 68}; + auto exp_y_t = nda::vector{70, 80, 90}; + auto A = nda::matrix(4, 3); + nda::for_each(A.shape(), [&A](auto i, auto j) { A(i, j) = i * 3 + j + 1; }); + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + x *= 2 - 1i; + x_t *= 2 - 1i; + exp_y *= (1 - 1i) * (2 - 1i); + exp_y_t *= (1 - 1i) * (2 - 1i); + } + auto A_d = to_addr_space(A); + auto x_d = to_addr_space(x); + auto x_t_d = to_addr_space(x_t); + + // y = A * x + auto y_d = nda::linalg::matvecmul(A_d, x_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_d), exp_y); + + // y_t = A^T * x_t + auto y_t_d = nda::linalg::matvecmul(nda::transpose(A_d), x_t_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_t_d), exp_y_t); + + // y_h = A^H * x_t + if constexpr (nda::blas::has_F_layout) { + auto exp_y_h = exp_y_t; + if constexpr (nda::is_complex_v) exp_y_h = nda::vector{210 + 70i, 240 + 80i, 270 + 90i}; + auto y_h_d = nda::linalg::matvecmul(nda::conj(nda::transpose(A_d)), x_t_d); + EXPECT_ARRAY_NEAR(nda::to_host(y_h_d), exp_y_h); + } + + // strided matrix and vector views + if constexpr (nda::mem::have_host_compatible_addr_space) { + auto y_v_d = nda::linalg::matvecmul(A_d(nda::range(0, 4, 2), nda::range(0, 3, 2)), x_d(nda::range(0, 3, 2))); + if constexpr (nda::is_complex_v) { + EXPECT_ARRAY_EQ(nda::to_host(y_v_d), (nda::vector{10 - 30i, 34 - 102i})); + } else { + EXPECT_ARRAY_EQ(nda::to_host(y_v_d), (nda::vector{10, 34})); + } + } +} + +TEST(NDA, CULinearAlgebraMatvecmulGenericGemvBranch) { + test_matvecmul(); + test_matvecmul(); + test_matvecmul(); + + test_matvecmul(); + test_matvecmul(); + test_matvecmul(); +} + +TEST(NDA, CULinearAlgebraMatvecmulBLASBranch) { + test_matvecmul(); + test_matvecmul(); + test_matvecmul(); + test_matvecmul, nda::C_layout, nda::mem::Device, nda::mem::Device>(); + test_matvecmul, nda::F_layout, nda::mem::Unified, nda::mem::Device>(); + test_matvecmul, nda::C_layout, nda::mem::Device, nda::mem::Unified>(); + + test_matvecmul(); + test_matvecmul(); + test_matvecmul(); + test_matvecmul, nda::C_layout, nda::mem::Unified, nda::mem::Unified>(); + test_matvecmul, nda::F_layout, nda::mem::Unified, nda::mem::Host>(); + test_matvecmul, nda::C_layout, nda::mem::Host, nda::mem::Unified>(); +} + +// Test the generic matmul function. 
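// Editorial sketch (not part of the patch): linalg::matmul returns C with
// C(i, j) = sum_k A(i, k) * B(k, j), using BLAS gemm when possible and a generic fallback
// otherwise. The triple loop below is the mathematical reference (types are illustrative):
#include <nda/nda.hpp>

nda::matrix<double> naive_matmul(nda::matrix<double> const &A, nda::matrix<double> const &B) {
  auto C = nda::matrix<double>(A.extent(0), B.extent(1));
  C()    = 0.0;
  for (long i = 0; i < A.extent(0); ++i)
    for (long k = 0; k < A.extent(1); ++k)
      for (long j = 0; j < B.extent(1); ++j) C(i, j) += A(i, k) * B(k, j);
  return C;
}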
+template +void test_matmul() { + auto A = nda::matrix{{1, 2, 3}, {4, 5, 6}}; + auto B = nda::matrix{{1, 2}, {3, 4}, {5, 6}}; + auto exp_C = nda::matrix{{22, 28}, {49, 64}}; + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + B *= 2 - 1i; + exp_C *= (1 - 1i) * (2 - 1i); + } + auto A_d = to_addr_space(A); + auto B_d = to_addr_space(B); + + // C = A * B + auto C_d = nda::linalg::matmul(A_d, B_d); + EXPECT_ARRAY_NEAR(nda::to_host(C_d), exp_C); + + // C_t = B^T * A^T + auto C_t_d = nda::linalg::matmul(nda::transpose(B_d), nda::transpose(A_d)); + EXPECT_ARRAY_NEAR(nda::to_host(C_t_d), nda::transpose(exp_C)); + + // C_h = B^H * A^H --> not working right now because of how we determine the layout of C + // if constexpr (std::same_as) { + // auto C_h_d = nda::linalg::matmul(nda::dagger(B_d), nda::dagger(A_d)); + // EXPECT_ARRAY_NEAR(nda::to_host(C_h_d), nda::dagger(exp_C)); + // } + + // strided matrix views + if constexpr (nda::mem::have_host_compatible_addr_space) { + auto exp_C_v = nda::matrix{{16, 20}, {34, 44}}; + if constexpr (nda::is_complex_v) exp_C_v *= (1 - 1i) * (2 - 1i); + auto C_v_d = nda::matrix(4, 4); + C_v_d(nda::range(0, 4, 2), nda::range(0, 4, 2)) = + nda::linalg::matmul(A_d(nda::range::all, nda::range(0, 3, 2)), B_d(nda::range(0, 3, 2), nda::range::all)); + EXPECT_ARRAY_NEAR(C_v_d(nda::range(0, 4, 2), nda::range(0, 4, 2)), exp_C_v); + } +} + +TEST(NDA, CULinearAlgebraMatmulGenericGemmBranch) { + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul(); +} + +TEST(NDA, CULinearAlgebraMatmulBLASBranch) { + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul, nda::C_layout, nda::C_layout, nda::mem::Device, nda::mem::Device>(); + test_matmul, nda::C_layout, nda::F_layout, nda::mem::Unified, nda::mem::Device>(); + test_matmul, nda::F_layout, nda::F_layout, nda::mem::Device, nda::mem::Unified>(); + test_matmul, nda::F_layout, nda::C_layout, nda::mem::Device, nda::mem::Device>(); + + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul, nda::C_layout, nda::C_layout, nda::mem::Unified, nda::mem::Unified>(); + test_matmul, nda::C_layout, nda::F_layout, nda::mem::Host, nda::mem::Unified>(); + test_matmul, nda::F_layout, nda::F_layout, nda::mem::Unified, nda::mem::Host>(); + test_matmul, nda::F_layout, nda::C_layout, nda::mem::Unified, nda::mem::Unified>(); +} + +// Test general inverse functions. 
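// Editorial sketch (not part of the patch): linalg::inv is the LU-based inverse introduced by this
// changeset; the tests below compare it against hand-computed inverses and check that applying it
// twice recovers the original matrix. The defining property is that inv(A) * A reproduces the
// identity up to round-off:
#include <nda/nda.hpp>

void inverse_identity_check() {
  auto A                   = nda::matrix<double>{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}};
  auto Ainv                = nda::linalg::inv(A);
  [[maybe_unused]] auto Id = nda::linalg::matmul(Ainv, A); // close to the 3x3 identity
}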
+template +void test_inv() { + using matrix_t = nda::matrix; + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; + + // lambda that checks inverse function + auto check_inv = [](auto const &M, auto const &Minv) { + auto M_d = to_addr_space(M); + auto Minv_d = nda::linalg::inv(M_d); + EXPECT_ARRAY_NEAR(Minv, nda::to_host(Minv_d)); + auto M2_d = nda::linalg::inv(Minv_d); + EXPECT_ARRAY_NEAR(M, nda::to_host(M2_d)); + }; + + // 1x1 matrix + auto C = matrix_t{{3}}; + C *= fac; + auto Cinv = matrix_t{{1.0 / 3.0}}; + Cinv /= fac; + check_inv(C, Cinv); + + // 2x2 matrix + auto B = matrix_t{{1, 2}, {0, 1}}; + B *= fac; + auto Binv = matrix_t{{1, -2}, {0, 1}}; + Binv /= fac; + check_inv(B, Binv); + + // 3x3 matrix + auto A = matrix_t{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; + A *= fac; + auto Ainv = matrix_t{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; + Ainv /= fac; + check_inv(A, Ainv); + + // 4x4 matrix + auto D = matrix_t{{2, 2, 2, 2}, {2, 4, 6, 8}, {2, 6, 12, 20}, {2, 8, 20, 40}}; + D *= fac; + auto Dinv = matrix_t{{2, -3, 2, -0.5}, {-3, 7, -5.5, 1.5}, {2, -5.5, 5, -1.5}, {-0.5, 1.5, -1.5, 0.5}}; + Dinv /= fac; + check_inv(D, Dinv); +} + +TEST(NDA, CULinearAlgebraInv) { + test_inv(); + test_inv(); + test_inv, nda::C_layout, nda::mem::Device>(); + test_inv, nda::F_layout, nda::mem::Device>(); + + test_inv(); + test_inv(); + test_inv, nda::C_layout, nda::mem::Unified>(); + test_inv, nda::F_layout, nda::mem::Unified>(); +} + +// Test the outer product function. +template +void test_outer_product() { + // outer product of two arrays + auto A = nda::array::rand(2, 3); + auto B = nda::array::rand(4, 5, 6); + auto C = nda::array(2, 3, 4, 5, 6); + for (auto [i, j] : A.indices()) + for (auto [k, l, m] : B.indices()) C(i, j, k, l, m) = A(i, j) * B(k, l, m); + auto A_d = to_addr_space(A); + auto B_d = to_addr_space(B); + EXPECT_ARRAY_NEAR(C, nda::to_host(nda::linalg::outer_product(A_d, B_d))); + + // outer product of two vectors + nda::vector v{1, 2}; + nda::vector w{3, 4, 5}; + auto v_d = to_addr_space(v); + auto w_d = to_addr_space(w); + auto M_d = nda::linalg::outer_product(v_d, w_d); + static_assert(nda::get_algebra == 'M'); + static_assert(nda::blas::has_C_layout); + EXPECT_ARRAY_NEAR(nda::matrix{{3, 4, 5}, {6, 8, 10}}, nda::to_host(M_d)); +} + +TEST(NDA, CULinearAlgebraOuterProduct) { + test_outer_product(); + test_outer_product(); + test_outer_product, nda::C_layout, nda::mem::Unified, nda::mem::Device>(); + test_outer_product, nda::F_layout, nda::mem::Unified, nda::mem::Unified>(); + test_outer_product(); + test_outer_product, nda::F_layout, nda::mem::Host, nda::mem::Unified>(); +} + +// Test the generic solve and solve_in_place functions. 
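// Editorial sketch (not part of the patch): linalg::solve(A, B) returns X with A * X = B, and
// linalg::solve_in_place(A, B) overwrites B with that solution (and A with its LU factors), as
// exercised by the tests below. A host-side usage sketch (values are illustrative):
#include <nda/nda.hpp>

void solve_usage() {
  auto A                  = nda::matrix<double>{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}};
  auto B                  = nda::matrix<double>{{1, 5}, {4, 5}, {3, 6}};
  [[maybe_unused]] auto X = nda::linalg::solve(A, B); // A * X == B up to round-off
  nda::linalg::solve_in_place(A, B);                  // B now holds X, A its LU factors
}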
+template +void test_solve() { + auto A = nda::matrix{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; + auto B = nda::matrix{{1, 5}, {4, 5}, {3, 6}}; + auto A_d = to_addr_space(A); + auto B_d = to_addr_space(B); + auto b_d = to_addr_space(nda::make_regular(B(nda::range::all, 0))); + + // solve A * X = B using the exact matrix inverse + auto Ainv = nda::matrix{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; + auto X = nda::matrix{Ainv * B}; + EXPECT_ARRAY_NEAR(A * X, B); + + // solve A * X = B using solve_in_place + auto A2_d = A_d; + auto B2_d = B_d; + nda::linalg::solve_in_place(A2_d, B2_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(B2_d), B); + EXPECT_ARRAY_NEAR(X, nda::to_host(B2_d)); + + // solve A * x = b using solve_in_place + A2_d = A; + auto b2_d = b_d; + nda::linalg::solve_in_place(A2_d, b2_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(b2_d), B(nda::range::all, 0)); + EXPECT_ARRAY_NEAR(X(nda::range::all, 0), nda::to_host(b2_d)); + + // solve A * X = B using solve + auto X_d = nda::linalg::solve(A_d, B_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(X_d), B); + EXPECT_ARRAY_NEAR(X, nda::to_host(X_d)); + + // solve A * x = b using solve + auto x_d = nda::linalg::solve(A_d, b_d); + EXPECT_ARRAY_NEAR(A * nda::to_host(x_d), B(nda::range::all, 0)); + EXPECT_ARRAY_NEAR(X(nda::range::all, 0), nda::to_host(x_d)); +} + +TEST(NDA, CULinearAlgebraSolve) { + test_solve(); + test_solve(); + test_solve, nda::C_layout, nda::mem::Unified, nda::mem::Device>(); + test_solve, nda::F_layout, nda::mem::Unified, nda::mem::Unified>(); + test_solve(); + test_solve, nda::C_layout, nda::mem::Unified, nda::mem::Host>(); +} + +// Test the svd and svd_in_place functions. +template +void test_svd() { + auto A = nda::matrix{{2, -2, 1}, {-4, -8, -8}}; + if constexpr (std::same_as) { + // CUDA cannot handle when m < n + A = nda::matrix(nda::transpose(A)); + } + auto s = nda::vector{12, 3}; + + // compute the SVD of A + auto A_d = to_addr_space(A); + auto [U_d, s_d, VH_d] = nda::linalg::svd(A_d); + auto S = nda::matrix::zeros(A.shape()); + nda::diagonal(S) = nda::to_host(s_d); + EXPECT_ARRAY_NEAR(s, nda::to_host(s_d), 1e-14); + EXPECT_ARRAY_NEAR(A, nda::to_host(U_d) * S * nda::to_host(VH_d), 1e-14); + + // compute the SVD of A in place + auto [U_d2, s_d2, VH_d2] = nda::linalg::svd_in_place(A_d); + auto S2 = nda::matrix::zeros(A.shape()); + nda::diagonal(S2) = nda::to_host(s_d2); + EXPECT_ARRAY_NEAR(s, nda::to_host(s_d2), 1e-14); + EXPECT_ARRAY_NEAR(A, nda::to_host(U_d2) * S2 * nda::to_host(VH_d2), 1e-14); +} + +TEST(NDA, CULinearAlgebraSVD) { + test_svd(); + test_svd(); + test_svd, nda::C_layout, nda::mem::Device>(); + test_svd, nda::F_layout, nda::mem::Device>(); + + test_svd(); + test_svd(); + test_svd, nda::C_layout, nda::mem::Unified>(); + test_svd, nda::F_layout, nda::mem::Unified>(); +} diff --git a/test/c++/nda_lapack.cpp b/test/c++/nda_lapack.cpp index 1603f0468..da290b2f3 100644 --- a/test/c++/nda_lapack.cpp +++ b/test/c++/nda_lapack.cpp @@ -4,116 +4,90 @@ // See LICENSE in the root of this distribution for details. #include "./test_common.hpp" +#include "nda/traits.hpp" +#include #include #include #include #include #include +#include +#include using namespace nda; +using namespace std::complex_literals; // Test LAPACK gtsv function. 
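// Editorial sketch (not part of the patch): gtsv solves a tridiagonal system given the
// sub-diagonal dl, the main diagonal d and the super-diagonal du; the right-hand side is
// overwritten with the solution, the diagonals with factorization data, and the returned info is
// 0 on success. Usage mirroring the tests below (values taken from the test data; the umbrella
// header names are assumed):
#include <nda/nda.hpp>
#include <nda/lapack.hpp>

void gtsv_usage() {
  nda::vector<double> dl{4, 3, 2, 1}, d{1, 2, 3, 4, 5}, du{1, 2, 3, 4};
  nda::vector<double> b{6, 2, 7, 4, 5};
  [[maybe_unused]] int info = nda::lapack::gtsv(dl, d, du, b); // b now holds the solution
}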
-template -void test_gtsv() { - // sub-diagonal, diagonal, and super-diagonal elements - vector subdiag_vec = {4, 3, 2, 1}; - vector diag_vec = {1, 2, 3, 4, 5}; - vector superdiag_vec = {1, 2, 3, 4}; - - // right hand side - vector B1 = {6, 2, 7, 4, 5}; - vector B2 = {1, 3, 8, 9, 10}; - auto B = matrix(5, 2); - B(range::all, 0) = B1; - B(range::all, 1) = B2; - - // reference solutions - vector ref_sol_1 = {43.0 / 33.0, 155.0 / 33.0, -208.0 / 33.0, 130.0 / 33.0, 7.0 / 33.0}; - vector ref_sol_2 = {-28.0 / 33.0, 61.0 / 33.0, 89.0 / 66.0, -35.0 / 66.0, 139.0 / 66.0}; - matrix ref_sol(5, 2); - ref_sol(range::all, 0) = ref_sol_1; - ref_sol(range::all, 1) = ref_sol_2; - - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B1); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B1, ref_sol_1); - } - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B2); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B2, ref_sol_2); - } - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B, ref_sol); - } +void test_gtsv(auto dl, auto d, auto du, auto B, auto exp) { + int info = lapack::gtsv(dl, d, du, B); + EXPECT_EQ(info, 0); + EXPECT_ARRAY_NEAR(B, exp); } -TEST(NDA, LAPACKGtsv) { - test_gtsv(); - test_gtsv>(); - - // test cgtsv - vector> subdiag_vec = {-4i, -3i, -2i, -1i}; - vector> diag_vec = {1, 2, 3, 4, 5}; - vector> superdiag_vec = {1i, 2i, 3i, 4i}; - - // right hand side - vector> B1 = {6 + 0i, 2i, 7 + 0i, 4i, 5 + 0i}; - vector> B2 = {1i, 3 + 0i, 8i, 9 + 0i, 10i}; - matrix, F_layout> B(5, 2); - B(range::all, 0) = B1; - B(range::all, 1) = B2; - - // reference solutions - vector> ref_sol_1 = {137.0 / 33.0 + 0i, -61i / 33.0, 368.0 / 33.0 + 0i, 230i / 33.0, -13.0 / 33.0 + 0i}; - vector> ref_sol_2 = {-35i / 33.0, 68.0 / 33.0 + 0i, -103i / 66.0, 415.0 / 66.0 + 0i, 215i / 66.0}; - matrix, F_layout> ref_sol(5, 2); - ref_sol(range::all, 0) = ref_sol_1; - ref_sol(range::all, 1) = ref_sol_2; - - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B1); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B1, ref_sol_1); - } - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B2); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B2, ref_sol_2); - } - { - auto dl(subdiag_vec); - auto d(diag_vec); - auto du(superdiag_vec); - int info = lapack::gtsv(dl, d, du, B); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(B, ref_sol); - } + +TEST(NDA, LAPACKGtsvDouble) { + auto check = []() { + // sub-diagonal, diagonal, and super-diagonal elements + auto dl = vector{4, 3, 2, 1}; + auto d = vector{1, 2, 3, 4, 5}; + auto du = vector{1, 2, 3, 4}; + + // right hand sides + auto b1 = vector{6, 2, 7, 4, 5}; + auto b2 = vector{1, 3, 8, 9, 10}; + auto B = matrix(5, 2); + B(range::all, 0) = b1; + B(range::all, 1) = b2; + + // expected solutions + auto exp_x1 = vector{43.0 / 33.0, 155.0 / 33.0, -208.0 / 33.0, 130.0 / 33.0, 7.0 / 33.0}; + auto exp_x2 = vector{-28.0 / 33.0, 61.0 / 33.0, 89.0 / 66.0, -35.0 / 66.0, 139.0 / 66.0}; + auto exp_X = matrix(5, 2); + exp_X(range::all, 0) = exp_x1; + exp_X(range::all, 1) = exp_x2; + + test_gtsv(dl, d, du, b1, exp_x1); + test_gtsv(dl, d, du, b2, exp_x2); + test_gtsv(dl, d, du, B, exp_X); + }; + + check.operator()(); + check.operator()>(); +} + +TEST(NDA, LAPACKGtsvComplex) { + // sub-diagonal, diagonal, and 
super-diagonal elements + auto dl = vector>{-4i, -3i, -2i, -1i}; + auto d = vector>{1, 2, 3, 4, 5}; + auto du = vector>{1i, 2i, 3i, 4i}; + + // right hand sides + auto b1 = vector>{6 + 0i, 2i, 7 + 0i, 4i, 5 + 0i}; + auto b2 = vector>{1i, 3 + 0i, 8i, 9 + 0i, 10i}; + auto B = matrix, F_layout>(5, 2); + B(range::all, 0) = b1; + B(range::all, 1) = b2; + + // expected solutions + auto exp_x1 = vector>{137.0 / 33.0 + 0i, -61i / 33.0, 368.0 / 33.0 + 0i, 230i / 33.0, -13.0 / 33.0 + 0i}; + auto exp_x2 = vector>{-35i / 33.0, 68.0 / 33.0 + 0i, -103i / 66.0, 415.0 / 66.0 + 0i, 215i / 66.0}; + auto exp_X = matrix, F_layout>(5, 2); + exp_X(range::all, 0) = exp_x1; + exp_X(range::all, 1) = exp_x2; + + test_gtsv(dl, d, du, b1, exp_x1); + test_gtsv(dl, d, du, b2, exp_x2); + test_gtsv(dl, d, du, B, exp_X); } // Test LAPACK gesvd function. -template +template void test_gesvd() { - using matrix_t = matrix; + using matrix_t = matrix; + using fp_type = nda::get_fp_t; + // condition number is ~7, and magnitude 5 with absolute error checking + constexpr double eps_close = 7 * 5 * std::numeric_limits::epsilon(); auto A = matrix_t{{{1, 1, 1}, {2, 3, 4}, {3, 5, 2}, {4, 2, 5}, {5, 4, 3}}}; auto [m, n] = A.shape(); @@ -121,60 +95,75 @@ void test_gesvd() { auto U = matrix_t(m, m); auto VT = matrix_t(n, n); - auto S = vector(std::min(m, n)); + auto S = vector(std::min(m, n)); auto Acopy = matrix_t{A}; lapack::gesvd(Acopy, S, U, VT); auto Sigma = matrix_t::zeros(A.shape()); for (auto i : range(std::min(m, n))) Sigma(i, i) = S(i); - EXPECT_ARRAY_NEAR(A, U * Sigma * VT, 1e-14); + EXPECT_ARRAY_NEAR(A, U * Sigma * VT, eps_close); } TEST(NDA, LAPACKGesvd) { - test_gesvd(); - test_gesvd>(); + test_gesvd(); + test_gesvd(); + test_gesvd, C_layout>(); + test_gesvd, F_layout>(); + test_gesvd(); + test_gesvd(); + test_gesvd, C_layout>(); + test_gesvd, F_layout>(); } // Test LAPACK geqp3, orgqr and ungqr functions. -template +template void test_geqp3_orgqr_ungqr() { - using matrix_t = matrix; + using matrix_t = matrix; + using fp_type = nda::get_fp_t; + // condition number is ~7, and magnitude 5 with absolute error checking + constexpr double eps_close = 7 * 5 * std::numeric_limits::epsilon(); auto A = matrix_t{{{1, 1, 1}, {3, 2, 4}, {5, 3, 2}, {2, 4, 5}, {4, 5, 3}}}; - if (wide_matrix) A = matrix_t{transpose(A)}; + if constexpr (wide_matrix) A = matrix_t{transpose(A)}; auto [m, n] = A.shape(); // compute QR factorization with column pivoting, i.e. A * P = Q * R auto jpvt = nda::zeros(n); - auto tau = nda::vector(std::min(m, n)); + auto tau = nda::vector(std::min(m, n)); auto Q = matrix_t{A}; lapack::geqp3(Q, jpvt, tau); // compute A * P by permuting columns of A + jpvt -= 1; auto AP = matrix_t{A}; for (int j = 0; j < n; ++j) { AP(range::all, j) = A(range::all, jpvt(j)); } // extract upper triangular matrix R - auto R = nda::matrix::zeros(std::min(m, n), n); + auto R = nda::matrix::zeros(std::min(m, n), n); for (int i = 0; i < std::min(m, n); ++i) { for (int j = i; j < n; ++j) { R(i, j) = Q(i, j); } } // extract matrix Q with orthonormal columns - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v or std::is_same_v) { lapack::orgqr(Q, tau); } else { lapack::ungqr(Q, tau); } - EXPECT_ARRAY_NEAR(AP, Q(range::all, range(std::min(m, n))) * R, 1e-14); + EXPECT_ARRAY_NEAR(AP, Q(range::all, range(std::min(m, n))) * R, eps_close); } + TEST(NDA, LAPACKGeqp3UngqrAndOrgqr) { // tall matrix, i.e. 
n_rows > n_cols + test_geqp3_orgqr_ungqr(); + test_geqp3_orgqr_ungqr>(); test_geqp3_orgqr_ungqr(); test_geqp3_orgqr_ungqr>(); // wide matrix, i.e. n_rows < n_cols + test_geqp3_orgqr_ungqr(); + test_geqp3_orgqr_ungqr, true>(); test_geqp3_orgqr_ungqr(); test_geqp3_orgqr_ungqr, true>(); } @@ -182,6 +171,10 @@ TEST(NDA, LAPACKGeqp3UngqrAndOrgqr) { // Test LAPACK gelss function and the gelss_worker class. template void test_gelss() { + using fp_type = nda::get_fp_t; + // condition number of B is ~8, and magnitude ~10 with absolute error checking + constexpr double eps_close = 8 * 10 * std::numeric_limits::epsilon(); + // Cf. https://www.netlib.org/lapack/lapack-3.9.0/LAPACKE/example/example_DGELS_colmajor.c auto A = matrix{{1, 1, 1}, {2, 3, 4}, {3, 5, 2}, {4, 2, 5}, {5, 4, 3}}; auto B = matrix{{-10, -3}, {12, 14}, {14, 12}, {16, 16}, {18, 16}}; @@ -189,54 +182,85 @@ void test_gelss() { auto [m, n] = A.shape(); auto x_exact = matrix{{2, 1}, {1, 1}, {1, 2}}; - auto s = vector(std::min(m, n)); + auto s = vector(std::min(m, n)); // using the gelss_worker class auto worker = lapack::gelss_worker{A}; auto [x_1, eps_1] = worker(B); - EXPECT_ARRAY_NEAR(x_exact, x_1, 1e-14); + EXPECT_ARRAY_NEAR(x_exact, x_1, eps_close); auto [x_2, eps_2] = worker(b); - EXPECT_ARRAY_NEAR(x_exact(range::all, 0), x_2, 1e-14); + EXPECT_ARRAY_NEAR(x_exact(range::all, 0), x_2, eps_close); // call the gelss function directly int rank{}; matrix A_f{A}, B_f{B}; lapack::gelss(A_f, B_f, s, 1e-18, rank); - EXPECT_ARRAY_NEAR(x_exact, B_f(range(n), range::all), 1e-14); + EXPECT_ARRAY_NEAR(x_exact, B_f(range(n), range::all), eps_close); A_f = A; lapack::gelss(A_f, b, s, 1e-18, rank); - EXPECT_ARRAY_NEAR(x_exact(range::all, 0), b(range(n)), 1e-14); + EXPECT_ARRAY_NEAR(x_exact(range::all, 0), b(range(n)), eps_close); } TEST(NDA, LAPACKGelss) { + test_gelss(); + test_gelss>(); test_gelss(); test_gelss>(); } // Test LAPACK getrs, getrf and getri functions. 
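// Editorial sketch (not part of the patch): besides solving linear systems with getrs, the LU
// factors produced by getrf can be turned into the explicit inverse with getri, which is what the
// getri part of the tests below verifies. Host-side sequence (values are illustrative; getri is
// assumed to take the factored matrix and the pivot array, as in the existing nda wrappers):
#include <nda/nda.hpp>
#include <nda/lapack.hpp>

void getri_usage() {
  auto A    = nda::matrix<double, nda::F_layout>{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}};
  auto ipiv = nda::array<int, 1>(3);
  nda::lapack::getrf(A, ipiv); // A now holds its L and U factors
  nda::lapack::getri(A, ipiv); // A now holds the inverse of the original matrix
}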
-template +template void test_getrs_getrf_getri() { - using matrix_t = matrix; + using matrix_t = matrix; + using f_matrix_t = matrix; + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; auto A = matrix_t{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; - auto B = matrix_t{{1, 5}, {4, 5}, {3, 6}}; - - // solve A * x = B using the exact matrix inverse + A *= fac; auto Ainv = matrix_t{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; - auto X1 = matrix_t{Ainv * B}; - EXPECT_ARRAY_NEAR(matrix_t{A * X1}, B); + Ainv /= fac; + auto B = matrix_t{{1, 5}, {4, 5}, {3, 6}}; - // solve A * x = B using getrf and getrs + // solve A * X = B using getrf and getrs auto Acopy = matrix_t{A}; - auto Bcopy = matrix_t{B}; + auto Bcopy = f_matrix_t{B}; array ipiv(3); lapack::getrf(Acopy, ipiv); lapack::getrs(Acopy, Bcopy, ipiv); - auto X2 = matrix_t{Bcopy}; - EXPECT_ARRAY_NEAR(matrix_t{A * X2}, B); - EXPECT_ARRAY_NEAR(X1, X2); + auto X = matrix_t{Bcopy}; + EXPECT_ARRAY_NEAR(matrix_t{A * X}, B); + EXPECT_ARRAY_NEAR(matrix_t{Ainv * B}, X); + + // solve A^T * X = B using getrf and getrs + Acopy = A; + Bcopy = B; + lapack::getrf(Acopy, ipiv); + lapack::getrs(nda::transpose(Acopy), Bcopy, ipiv); + X = matrix_t{Bcopy}; + EXPECT_ARRAY_NEAR(matrix_t{nda::transpose(A) * X}, B); + EXPECT_ARRAY_NEAR(matrix_t{nda::transpose(Ainv) * B}, X); + + // solve A^H * X = B using getrf and getrs + if constexpr (blas::has_F_layout) { + Acopy = A; + Bcopy = B; + lapack::getrf(Acopy, ipiv); + lapack::getrs(nda::conj(nda::transpose(Acopy)), Bcopy, ipiv); + X = matrix_t{Bcopy}; + EXPECT_ARRAY_NEAR(matrix_t{nda::conj(nda::transpose(A)) * X}, B); + EXPECT_ARRAY_NEAR(matrix_t{nda::conj(nda::transpose(Ainv)) * B}, X); + } + + // solve A * x = b using getrf and getrs + Acopy = A; + auto b = vector{B(range::all, 0)}; + lapack::getrf(Acopy, ipiv); + lapack::getrs(Acopy, b, ipiv); + EXPECT_ARRAY_NEAR(A * b, B(range::all, 0)); + EXPECT_ARRAY_NEAR((Ainv * B)(range::all, 0), b); // compute the inverse of A using getrf and getri auto Ainv2 = Acopy; @@ -244,7 +268,206 @@ void test_getrs_getrf_getri() { EXPECT_ARRAY_NEAR(Ainv, Ainv2); } -TEST(NDA, LAPAKCGetrsGetrfAndGetri) { - test_getrs_getrf_getri(); - test_getrs_getrf_getri>(); +TEST(NDA, LAPACKGetrsGetrfAndGetri) { + test_getrs_getrf_getri(); + test_getrs_getrf_getri(); + test_getrs_getrf_getri, C_layout>(); + test_getrs_getrf_getri, F_layout>(); +} + +TEST(NDA, LAPACKGetrfWithRectangularMatrix) { + auto A = matrix{{1, 5}, {4, 5}, {3, 6}}; + auto AT = matrix(nda::transpose(A)); + auto A_c = matrix{A}; + auto AT_c = matrix{AT}; + auto ipiv = array(2); + + // get the matrices P, L, U from getrf output + auto get_plu = [](auto const &M, auto const &ipiv, int m, int n) { + using layout_t = std::conditional_t, C_layout, F_layout>; + auto P = matrix::zeros(m, m); + auto L = matrix::zeros(m, m); + auto U = matrix::zeros(m, n); + nda::diagonal(P) = 1; + nda::diagonal(L) = 1; + for (int i = 0; i < ipiv.size(); ++i) deep_swap(P(i, nda::range::all), P(ipiv(i) - 1, nda::range::all)); + for (int i = 0; i < m; ++i) { + L(i, nda::range(i)) = (blas::has_C_layout ? M(nda::range(i), i) : M(i, nda::range(i))); + U(i, nda::range(i, n)) = (blas::has_C_layout ? 
M(nda::range(i, n), i) : M(i, nda::range(i, n))); + } + return std::make_tuple(P, L, U); + }; + + // LU decomposition for 3x2 Fortran layout matrix + auto LU_f_32 = A; + lapack::getrf(LU_f_32, ipiv); + auto [P_f_32, L_f_32, U_f_32] = get_plu(LU_f_32, ipiv, 3, 2); + EXPECT_ARRAY_NEAR(P_f_32 * A, L_f_32 * U_f_32); + + // LU decomposition for 2x3 Fortran layout matrix + auto LU_f_23 = AT; + lapack::getrf(LU_f_23, ipiv); + auto [P_f_23, L_f_23, U_f_23] = get_plu(LU_f_23, ipiv, 2, 3); + EXPECT_ARRAY_NEAR(P_f_23 * AT, L_f_23 * U_f_23); + + // LU decomposition for 3x2 C layout matrix + auto LU_c_32 = A_c; + lapack::getrf(LU_c_32, ipiv); + auto [P_c_32, L_c_32, U_c_32] = get_plu(LU_c_32, ipiv, 2, 3); + EXPECT_ARRAY_NEAR(P_c_32 * nda::transpose(A_c), L_c_32 * U_c_32); + + // LU decomposition for 2x3 C layout matrix + auto LU_c_23 = AT_c; + lapack::getrf(LU_c_23, ipiv); + auto [P_c_23, L_c_23, U_c_23] = get_plu(LU_c_23, ipiv, 3, 2); + EXPECT_ARRAY_NEAR(P_c_23 * nda::transpose(AT_c), L_c_23 * U_c_23); +} + +// Check that the eigenvectors/values are correct. +void check_eigen(auto const &A, auto const &V, auto const &l, double eps_close) { + for (auto i : nda::range(0, A.extent(0))) { EXPECT_ARRAY_NEAR(A * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); } +} + +void check_eigen(auto const &A, auto const &B, auto const &V, auto const &l, int itype, double eps_close) { + for (auto i : nda::range(0, A.extent(0))) { + if (itype == 1) { + EXPECT_ARRAY_NEAR(A * V(nda::range::all, i), l(i) * B * V(nda::range::all, i), eps_close); + } else if (itype == 2) { + EXPECT_ARRAY_NEAR(A * B * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); + } else { + EXPECT_ARRAY_NEAR(B * A * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); + } + } +} + +// Create a symmetric or hermitian matrix with restricted eigenvalues. +template +auto syhe_matrix(int n, double a = 1e-6, double b = 1.0) { + using matrix_t = nda::matrix; + + // orthogonal/unitary matrix Q + auto jpvt = nda::zeros(n); + auto tau = nda::vector(n); + auto Q = nda::matrix::rand(n, n); + nda::lapack::geqp3(Q, jpvt, tau); + if constexpr (nda::is_complex_v) { + nda::lapack::ungqr(Q, tau); + } else { + nda::lapack::orgqr(Q, tau); + } + + // diagonal matrix containing the eigenvalues + auto D = nda::eye(n) * a + nda::diag(nda::vector::rand(n)) * (b - a); + + // return Q * D * Q^H (hermitian/symmetric) + return matrix_t{Q * D * nda::dagger(Q)}; +} + +// Test LAPACK syev and heev functions. 
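// Editorial sketch (not part of the patch): syev (real symmetric) and heev (complex Hermitian)
// fill w with the eigenvalues and, unless called with 'N', overwrite the matrix with the
// eigenvectors, one per column; the generalized sygv/hegv variants further down solve
// A v = l B v (itype 1), A B v = l v (itype 2) or B A v = l v (itype 3). A small host-side usage
// mirroring the two-argument call form used in the tests (values are illustrative):
#include <nda/nda.hpp>
#include <nda/lapack.hpp>

void syev_usage() {
  auto A                   = nda::matrix<double, nda::F_layout>{{2, 1}, {1, 3}};
  [[maybe_unused]] auto A0 = A; // keep the original to check A0 * v_i == w(i) * v_i
  auto w                   = nda::vector<double>(2);
  nda::lapack::syev(A, w); // columns of A are now the eigenvectors, w the eigenvalues
}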
+template +void test_syev_heev(auto xxev) { + using fp_type = nda::get_fp_t; + // 100*epsilon is heuristic, since we're not actually applying the eigenvectors, we don't + // need the condition number + constexpr double eps_close = 100 * std::numeric_limits::epsilon(); + + for (auto i : nda::range(1, 6)) { + auto A = syhe_matrix(i, -1, 1); + + // compute eigenvalues and eigenvectors + auto A1 = A; + auto w1 = nda::vector(i); + xxev(A1, w1); + check_eigen(A, A1, w1, eps_close); + + // compute eigenvalues only + auto A2 = A; + auto w2 = nda::vector{}; + xxev(A2, w2, 'N'); + EXPECT_ARRAY_NEAR(w2, w1, eps_close); + + // compute eigenvalues and eigenvectors of the transpose + auto A3 = nda::matrix{A}; + auto w3 = nda::vector(i); + xxev(nda::transpose(A3), w3); + EXPECT_ARRAY_NEAR(w3, w1, eps_close); + if constexpr (nda::is_complex_v) { + check_eigen(nda::transpose(A), nda::transpose(A3), w3, eps_close); + } else { + check_eigen(A, nda::transpose(A3), w3, eps_close); + EXPECT_ARRAY_NEAR(nda::transpose(A3), A1, eps_close); + } + + // compute eigenvalues and eigenvectors of a view + if (i > 3) { + auto A4 = A; + auto w4 = nda::vector{}; + xxev(A4(nda::range(3), nda::range(3)), w4); + check_eigen(A(nda::range(3), nda::range(3)), A4(nda::range(3), nda::range(3)), w4, eps_close); + } + } +} + +TEST(NDA, LAPACKSyevAndHeev) { + constexpr auto syev = [](auto &&...ts) { return lapack::syev(ts...); }; + constexpr auto heev = [](auto &&...ts) { return lapack::heev(ts...); }; + test_syev_heev(syev); + test_syev_heev>(heev); + test_syev_heev(syev); + test_syev_heev>(heev); +} + +// Test LAPACK sygv and hegv functions. +template +void test_sygv_hegv(int itype, auto xxgv) { + using fp_type = nda::get_fp_t; + // 100*epsilon is heuristic, since we're not actually applying the eigenvectors, we don't + // need the condition number + constexpr double eps_close = 100 * std::numeric_limits::epsilon(); + + for (auto i : nda::range(1, 6)) { + auto A = syhe_matrix(i, -1, 1); + auto B = syhe_matrix(i, 1e-6, 1); + + // compute eigenvalues and eigenvectors + auto A1 = A; + auto B1 = B; + auto w1 = nda::vector(i); + xxgv(A1, B1, w1, 'V', itype); + check_eigen(A, B, A1, w1, itype, eps_close); + + // compute eigenvalues only + auto A2 = A; + auto B2 = B; + auto w2 = nda::vector{}; + xxgv(A2, B2, w2, 'N', itype); + EXPECT_ARRAY_NEAR(w2, w1, eps_close); + + // compute eigenvalues and eigenvectors of a view + if (i > 3) { + auto A3 = A; + auto B3 = B; + auto w3 = nda::vector{}; + auto rg = nda::range(3); + xxgv(A3(rg, rg), B3(rg, rg), w3, 'V', itype); + check_eigen(A(rg, rg), B(rg, rg), A3(rg, rg), w3, itype, eps_close); + } + } +} + +TEST(NDA, LAPACKSyegvAndHegv) { + auto sygv = [](auto &&...ts) { return lapack::sygv(ts...); }; + auto hegv = [](auto &&...ts) { return lapack::hegv(ts...); }; + test_sygv_hegv(1, sygv); + test_sygv_hegv(2, sygv); + test_sygv_hegv(3, sygv); + test_sygv_hegv>(1, hegv); + test_sygv_hegv>(2, hegv); + test_sygv_hegv>(3, hegv); + test_sygv_hegv(1, sygv); + test_sygv_hegv(2, sygv); + test_sygv_hegv(3, sygv); + test_sygv_hegv>(1, hegv); + test_sygv_hegv>(2, hegv); + test_sygv_hegv>(3, hegv); } diff --git a/test/c++/nda_linear_algebra.cpp b/test/c++/nda_linear_algebra.cpp index 3d48a5826..1a2cb5e84 100644 --- a/test/c++/nda_linear_algebra.cpp +++ b/test/c++/nda_linear_algebra.cpp @@ -4,357 +4,536 @@ // See LICENSE in the root of this distribution for details. 
#include "./test_common.hpp" +#include "nda/traits.hpp" #include #include +#include #include +#include #include -#include -TEST(NDA, LinearAlgebraDotProduct) { - nda::array a(2), a2(2), c(2); - a() = 2.0; - c() = 1; - nda::array b(2); - b() = 3; - a2 = 2 * a; +using namespace std::complex_literals; - EXPECT_DOUBLE_EQ(dot(a, b), 12); - EXPECT_DOUBLE_EQ(dot(a2, a), 16); - EXPECT_DOUBLE_EQ(dot(a2, b), 24); - EXPECT_DOUBLE_EQ(dot(a2 - a, b), 12); +// Test the generic dot/dotc function. +auto exp_dot(auto const &a, auto const &b) { + auto res = a(0) * b(0); + for (size_t i = 1; i < a.size(); ++i) res += a(i) * b(i); + return res; } -TEST(NDA, LinearAlgebraComplexDotProduct) { - // added by I. Krivenko, #122 - // test the complex version, especially with the zdotu workaround on macOS - nda::array, 1> v(2); - v(0) = 0; - v(1) = {0, 1}; - - EXPECT_COMPLEX_NEAR(nda::blas::dot(v, v), -1); - EXPECT_COMPLEX_NEAR(nda::blas::dotc(v, v), 1); +auto exp_dotc(auto const &a, auto const &b) { + auto res = std::conj(a(0)) * b(0); + for (size_t i = 1; i < a.size(); ++i) res += std::conj(a(i)) * b(i); + return res; } -// Test matrix-matrix multiplication for specific memory layouts. -template -void test_matmul() { - nda::matrix M1(2, 3); - nda::matrix M2(3, 4); - nda::matrix M3(2, 4), M3b(2, 4); - for (int i = 0; i < 2; ++i) - for (int j = 0; j < 3; ++j) { M1(i, j) = i + j; } - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 4; ++j) { M2(i, j) = 1 + i - j; } - - // BLAS gemm - M3b = 0; - if constexpr (nda::is_blas_lapack_v) { nda::blas::gemm(1, M1, M2, 0, M3b); } - - // operator* - M3 = 0; - M3 = M1 * M2; - - // brute force - auto M4 = M3; - M4 = 0; - for (int i = 0; i < 2; ++i) - for (int k = 0; k < 3; ++k) - for (int j = 0; j < 4; ++j) M4(i, j) += M1(i, k) * M2(k, j); - EXPECT_ARRAY_NEAR(M4, M3, 1.e-13); - if constexpr (nda::is_blas_lapack_v) { EXPECT_ARRAY_NEAR(M4, M3b, 1.e-13); } - - // generic gemm implementation - nda::blas::gemm_generic(1, M1, M2, 0, M4); - EXPECT_ARRAY_NEAR(M4, M3, 1.e-13); -} - -// Call test_matmul for all various memory layouts. 
template -void all_test_matmul() { - test_matmul(); - test_matmul(); - test_matmul(); - test_matmul(); - test_matmul(); - test_matmul(); - test_matmul(); - test_matmul(); -} - -TEST(NDA, LinearAlgebraMatmul) { - all_test_matmul(); - all_test_matmul>(); - all_test_matmul(); -} - -TEST(NDA, LinearAlgebraMatumulPromoteValueType) { - nda::matrix A_d = {{1.0, 2.3}, {3.1, 4.3}}; - nda::matrix B_i = {{1, 2}, {3, 4}}; - nda::matrix B_d = {{1, 2}, {3, 4}}; - - auto C = nda::make_regular(A_d * B_i); - auto D = nda::make_regular(A_d * B_d); - static_assert(std::is_same_v, double>); - static_assert(std::is_same_v, double>); - EXPECT_ARRAY_NEAR(C, D, 1.e-13); -} - -TEST(NDA, LinearAlgebraMatmulCache) { - // test with view for possible cache issue - nda::array, 3> A(2, 2, 5); - A() = -1; - nda::matrix_view> A_v(A(nda::range::all, nda::range::all, 2)); - nda::matrix> M1(2, 2), Res(2, 2); - M1() = 0; - M1(0, 0) = 2; - M1(1, 1) = 3.2; - Res() = 0; - Res(0, 0) = 8; - Res(1, 1) = 16.64; - A_v() = 0; - A_v() = nda::matrix>{M1 * (M1 + 2.0)}; - EXPECT_ARRAY_NEAR(A_v(), Res, 1.e-13); - - // not matmul, just recheck diagonal unity - Res() = 0; - Res(0, 0) = 4; - Res(1, 1) = 5.2; - A_v() = 0; - A_v() = nda::matrix>{(M1 + 2.0)}; - EXPECT_ARRAY_NEAR(A_v(), Res, 1.e-13); +void test_dotproduct() { + // scalars + std::complex u{1, 2}; + std::complex v{3, -4}; + EXPECT_EQ(nda::linalg::dot(1, 2), 2); + EXPECT_EQ(nda::linalg::dotc(1, 2), 2); + EXPECT_EQ(nda::linalg::dot(2, -5.0), -10.0); + EXPECT_EQ(nda::linalg::dotc(2, -5.0), -10.0); + EXPECT_COMPLEX_NEAR(nda::linalg::dot(u, v), u * v); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(u, v), std::conj(u) * v); + + // BLAS compatible vectors + nda::vector a{1, 2, 3, 4, 5}; + nda::vector b{10, 20, 30, 40, 50}; + EXPECT_EQ(nda::linalg::dot(a, b), nda::blas::dot(a, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(a, b), nda::blas::dotc(a, b)); + + nda::vector> c = a * (1.1 - 2.1i); + nda::vector> d = b * (3 + 4i); + EXPECT_COMPLEX_NEAR(nda::linalg::dot(c, d), exp_dot(c, d)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(c, d), exp_dotc(c, d)); + + // vectors with different value types + EXPECT_COMPLEX_NEAR(nda::linalg::dot(a, c), exp_dot(a, c)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(a, c), exp_dotc(a, c)); + + nda::vector e{1, 2, 3, 4, 5}; + EXPECT_EQ(nda::linalg::dot(e, e), exp_dot(e, e)); + EXPECT_EQ(nda::linalg::dot(e, b), exp_dot(e, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(e, b), exp_dotc(e, b)); + + // lazy expressions + auto sin_a = nda::make_regular(nda::sin(a)); + EXPECT_EQ(nda::linalg::dot(nda::sin(a), b), exp_dot(sin_a, b)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(nda::sin(a), b), exp_dotc(sin_a, b)); + + // (strided) vector views + auto c_v = c(nda::range(0, 5, 2)); + auto d_v = d(nda::range(1, 4)); + EXPECT_COMPLEX_NEAR(nda::linalg::dot(c_v, d_v), exp_dot(c_v, d_v)); + EXPECT_COMPLEX_NEAR(nda::linalg::dotc(c_v, d_v), exp_dotc(c_v, d_v)); } -TEST(NDA, LinearAlgebraMatmulAlias) { - nda::array, 3> A(10, 2, 2); - A() = -1; +TEST(NDA, LinearAlgebraDotProduct) { + test_dotproduct(); + test_dotproduct(); +} - A(4, nda::range::all, nda::range::all) = 1; - A(5, nda::range::all, nda::range::all) = 2; +// Test the generic matvecmul function. 
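+// matvecmul(A, x) should give y(i) = sum_j A(i, j) * x(j); e.g. in the real case below, with the
+// 4x3 matrix A(i, j) = 3 * i + j + 1 and x = (1, 2, 3), this yields y = (14, 32, 50, 68).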
+template +void test_matvecmul() { + auto x = nda::vector{1, 2, 3}; + auto x_t = nda::vector{1, 2, 3, 4}; + auto exp_y = nda::vector{14, 32, 50, 68}; + auto exp_y_t = nda::vector{70, 80, 90}; + auto A = nda::matrix(4, 3); + nda::for_each(A.shape(), [&A](auto i, auto j) { A(i, j) = i * 3 + j + 1; }); + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + x *= 2 - 1i; + x_t *= 2 - 1i; + exp_y *= (1 - 1i) * (2 - 1i); + exp_y_t *= (1 - 1i) * (2 - 1i); + } - nda::matrix_view> M1 = A(4, nda::range::all, nda::range::all); - nda::matrix_view> M2 = A(5, nda::range::all, nda::range::all); + // y = A * x + auto y = nda::linalg::matvecmul(A, x); + EXPECT_ARRAY_NEAR(y, exp_y); + + // y_t = A^T * x_t + auto y_t = nda::linalg::matvecmul(nda::transpose(A), x_t); + EXPECT_ARRAY_NEAR(y_t, exp_y_t); + + // y_h = A^H * x_t + auto exp_y_h = exp_y_t; + if constexpr (nda::is_complex_v) exp_y_h = nda::vector{T{210 + 70i}, T{240 + 80i}, T{270 + 90i}}; + auto y_h = nda::linalg::matvecmul(nda::conj(nda::transpose(A)), x_t); + EXPECT_ARRAY_NEAR(y_h, exp_y_h); + + // strided matrix and vector views + auto y_v = nda::linalg::matvecmul(A(nda::range(0, 4, 2), nda::range(0, 3, 2)), x(nda::range(0, 3, 2))); + if constexpr (nda::is_complex_v) { + EXPECT_ARRAY_EQ(y_v, (nda::vector{T{10 - 30i}, T{34 - 102i}})); + } else { + EXPECT_ARRAY_EQ(y_v, (nda::vector{10, 34})); + } +} - M1 = M1 * M2; - EXPECT_ARRAY_NEAR(M1, nda::matrix>{{4, 4}, {4, 4}}); - EXPECT_ARRAY_NEAR(M2, nda::matrix>{{2, 2}, {2, 2}}); +template +constexpr auto test_matvecmul_layouts = []() { + test_matvecmul(); + test_matvecmul(); +}; + +TEST(NDA, LinearAlgebraMatvecmulGenericGemvBranch) { test_matvecmul_layouts(); } + +TEST(NDA, LinearAlgebraMatvecmulBLASBranch) { + test_matvecmul_layouts(); + test_matvecmul_layouts>(); + test_matvecmul_layouts(); + test_matvecmul_layouts>(); +} - nda::matrix B1(2, 2), B2(2, 2); - B1() = 2; - B2() = 3; +template +void test_matvecmul_promotion() { + auto A_i = nda::matrix{{1, 2}, {3, 4}}; + auto A_d = nda::matrix{{1, 2}, {3, 4}}; + auto w_i = nda::vector{1, 1}; + auto w_d = nda::vector{1, 1}; + + auto v_d1 = nda::linalg::matvecmul(A_d, w_i); + static_assert(std::same_as, T>); + EXPECT_ARRAY_NEAR(v_d1, (nda::vector{3, 7}), std::numeric_limits::epsilon()); + + auto v_d2 = nda::linalg::matvecmul(A_i, w_d); + static_assert(std::same_as, T>); + EXPECT_ARRAY_NEAR(v_d2, (nda::vector{3, 7}), std::numeric_limits::epsilon()); + + auto v_i = nda::linalg::matvecmul(A_i, w_i); + static_assert(std::same_as, int>); + EXPECT_ARRAY_EQ(v_i, (nda::vector{3, 7})); +} - B1 = nda::make_regular(B1) * B2; - EXPECT_ARRAY_NEAR(B1, nda::matrix{{6, 0}, {0, 6}}); +TEST(NDA, LinearAlgebraMatvecmulPromotion) { + test_matvecmul_promotion(); + test_matvecmul_promotion(); } -// Test determinant for a specific memory layout. -template -void test_determinant() { - nda::matrix W1(1, 1); - W1(0, 0) = 1.0; - EXPECT_NEAR(determinant(W1), 1.0, 1.e-12); +TEST(NDA, LinearAlgebraMatvecmulWithLazyExpressions) { + auto A = nda::array{{1, 2}, {3, 4}}; + auto A_sin = nda::array{nda::sin(A)}; + auto w = nda::vector{1, 1}; + auto w_sin = nda::vector{nda::sin(w)}; + EXPECT_ARRAY_NEAR(nda::linalg::matvecmul(nda::sin(A), nda::sin(w)), nda::linalg::matvecmul(A_sin, w_sin), 1.e-13); +} - nda::matrix W2{{1.0, 2.0}, {3.0, 4.0}}; - EXPECT_NEAR(determinant(W2), -2.0, 1.e-12); +// Test the generic matmul function. 
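+// matmul(A, B) should give C(i, j) = sum_k A(i, k) * B(k, j); e.g. in the real case below,
+// A = [[1, 2, 3], [4, 5, 6]] times B = [[1, 2], [3, 4], [5, 6]] yields C = [[22, 28], [49, 64]].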
+template +void test_matmul() { + auto A = nda::matrix{{1, 2, 3}, {4, 5, 6}}; + auto B = nda::matrix{{1, 2}, {3, 4}, {5, 6}}; + auto exp_C = nda::matrix{{22, 28}, {49, 64}}; + if constexpr (nda::is_complex_v) { + A *= 1 - 1i; + B *= 2 - 1i; + exp_C *= (1 - 1i) * (2 - 1i); + } - nda::matrix W3(3, 3); - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 3; ++j) W3(i, j) = (i > j ? i + 2.5 * j : i * 0.8 - j); - EXPECT_NEAR(determinant(W3), -7.8, 1.e-12); + // C = A * B + auto C = nda::linalg::matmul(A, B); + EXPECT_ARRAY_NEAR(C, exp_C); + + // C_t = B^T * A^T + auto C_t = nda::linalg::matmul(nda::transpose(B), nda::transpose(A)); + EXPECT_ARRAY_NEAR(C_t, nda::transpose(exp_C)); + + // C_h = B^H * A^H + auto C_h = nda::linalg::matmul(nda::dagger(B), nda::dagger(A)); + EXPECT_ARRAY_NEAR(C_h, nda::dagger(exp_C)); + + // strided matrix views + auto exp_C_v = nda::matrix{{16, 20}, {34, 44}}; + if constexpr (nda::is_complex_v) exp_C_v *= (1 - 1i) * (2 - 1i); + auto C_v = nda::matrix(4, 4); + C_v(nda::range(0, 4, 2), nda::range(0, 4, 2)) = + nda::linalg::matmul(A(nda::range::all, nda::range(0, 3, 2)), B(nda::range(0, 3, 2), nda::range::all)); + EXPECT_ARRAY_NEAR(C_v(nda::range(0, 4, 2), nda::range(0, 4, 2)), exp_C_v); } -TEST(NDA, LinearAlgebraDeterminant) { - test_determinant(); - test_determinant(); +template +constexpr auto test_matmul_all_layouts = []() { + test_matmul(); + test_matmul(); + test_matmul(); + test_matmul(); +}; + +TEST(NDA, LinearAlgebraMatmulGenericGemmBranch) { test_matmul_all_layouts(); } + +TEST(NDA, LinearAlgebraMatmulBLASBranch) { + test_matmul_all_layouts(); + test_matmul_all_layouts>(); + test_matmul_all_layouts(); + test_matmul_all_layouts>(); } -// Test inverse for a specific memory layout. -template -void test_inverse() { - using matrix_t = nda::matrix; - - matrix_t W(3, 3), Winv(3, 3); - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 3; ++j) W(i, j) = (i > j ? i + 2.5 * j : i * 0.8 - j); +template +void test_matmul_promotion() { + auto A_i = nda::matrix{{1, 2}, {3, 4}}; + auto A_d = nda::matrix{{1, 2}, {3, 4}}; - Winv = inverse(W); - EXPECT_NEAR(determinant(Winv), -1 / 7.8, 1.e-12); + auto B_d1 = nda::linalg::matmul(A_d, A_i); + static_assert(std::same_as, T>); + EXPECT_ARRAY_NEAR(B_d1, (nda::matrix{{7, 10}, {15, 22}}), std::numeric_limits::epsilon()); - nda::matrix id(W * Winv); - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 3; ++j) EXPECT_NEAR(std::abs(id(i, j)), (i == j ? 1 : 0), 1.e-13); + auto B_d2 = nda::linalg::matmul(A_d, A_d); + static_assert(std::same_as, T>); + EXPECT_ARRAY_NEAR(B_d2, (nda::matrix{{7, 10}, {15, 22}}), std::numeric_limits::epsilon()); - // calculate the inverse of the inverse by calling the lapack routines directly - nda::array ipiv(3); - ipiv = 0; - int info = nda::lapack::getrf(Winv, ipiv); - EXPECT_EQ(info, 0); - info = nda::lapack::getri(Winv, ipiv); - EXPECT_EQ(info, 0); - EXPECT_ARRAY_NEAR(Winv, W, 1.e-12); + auto B_i = nda::linalg::matmul(A_i, A_i); + static_assert(std::same_as, int>); + EXPECT_ARRAY_NEAR(B_i, (nda::matrix{{7, 10}, {15, 22}}), std::numeric_limits::epsilon()); } -TEST(NDA, LinearAlgebraInverse) { - test_inverse(); - test_inverse(); +TEST(NDA, LinearAlgebraMatmulPromoteValueType) { + test_matmul_promotion(); + test_matmul_promotion(); } -TEST(NDA, LinearAlgebraInverseInvolution) { - using matrix_t = nda::matrix; - - matrix_t W(3, 3); - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 3; ++j) W(i, j) = (i > j ? 
i + 2.5 * j : i * 0.8 - j); - auto W_copy = W; - - W = inverse(W); - W = inverse(W); - EXPECT_ARRAY_NEAR(W, W_copy, 1.e-12); +TEST(NDA, LinearAlgebraMatmulWithLazyExpressions) { + auto A = nda::array{{1, 2}, {3, 4}}; + auto A_sin = nda::array{nda::sin(A)}; + EXPECT_ARRAY_NEAR(nda::linalg::matmul(nda::sin(A), nda::sin(A)), nda::linalg::matmul(A_sin, A_sin), 1.e-13); } -TEST(NDA, LinearAlgebraInverseSlice) { - using matrix_t = nda::matrix; - - matrix_t W(3, 3); - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 3; ++j) W(i, j) = (i > j ? i + 2.5 * j : i * 0.8 - j); - - auto V = W(nda::range(0, 3, 2), nda::range(0, 3, 2)); - matrix_t Vinv = inverse(V); - matrix_t Vinv_ref{{-0.1, 0.5}, {-0.5, 0.0}}; - EXPECT_ARRAY_NEAR(Vinv, Vinv_ref, 1.e-12); - - W = inverse(W); +// Test general inverse and determinant functions. +template +void test_inv_and_det() { + using matrix_t = nda::matrix; + using fp_t = nda::get_fp_t; + T fac = 1.0; + if constexpr (nda::is_complex_v) fac = 1.0i; + // condition number for matrix is ~275, but magnitudes are ~5 and we use absolute error + constexpr double eps_close = 5 * 275 * std::numeric_limits>::epsilon(); + + // A is 3x3, B is 2x2, C is 1x1 + auto A = matrix_t{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; + A *= fac; + auto Ainv = matrix_t{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; + Ainv /= fac; + T detA = std::pow(fac, fp_t{3}); + auto B = matrix_t{{1, 2}, {0, 1}}; + B *= fac; + auto Binv = matrix_t{{1, -2}, {0, 1}}; + Binv /= fac; + T detB = std::pow(fac, fp_t{2}); + auto C = matrix_t{{3}}; + C *= fac; + auto Cinv = matrix_t{{1.0 / 3.0}}; + Cinv /= fac; + T detC = 3 * fac; + + // lambda that checks inverse functions for small matrices + auto check_small_mat = [](auto const &M, auto const &Minv, auto detM, auto opt_inv, auto opt_det) { + auto Minv2 = nda::linalg::inv(M); + EXPECT_ARRAY_NEAR(Minv, Minv2, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det(Minv2), 1.0 / detM, eps_close); + Minv2 = nda::linalg::inv(Minv2); + static_assert(std::is_same_v, T>); + EXPECT_ARRAY_NEAR(M, Minv2, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det(Minv2), detM, eps_close); + + auto Minv3 = M; + nda::linalg::inv_in_place(Minv3); + EXPECT_ARRAY_NEAR(Minv, Minv3, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det_in_place(Minv3), 1.0 / detM, eps_close); + nda::linalg::inv_in_place(Minv3); + EXPECT_ARRAY_NEAR(M, Minv3, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det_in_place(Minv3), detM, eps_close); + + auto Minv4 = M; + opt_inv(Minv4); + EXPECT_ARRAY_NEAR(Minv, Minv4, eps_close); + EXPECT_COMPLEX_NEAR(opt_det(Minv4), 1.0 / detM, eps_close); + opt_inv(Minv4); + EXPECT_ARRAY_NEAR(M, Minv4, eps_close); + EXPECT_COMPLEX_NEAR(opt_det(Minv4), detM, eps_close); + }; - auto U = W(nda::range(0, 3, 2), nda::range(0, 3, 2)); - matrix_t Uinv = inverse(U); - matrix_t Uinv_ref{{-5.0, 4.0}, {24.5, -27.4}}; - EXPECT_ARRAY_NEAR(Uinv, Uinv_ref, 1.e-12); + check_small_mat(A, Ainv, detA, [](auto &M) { return nda::linalg::inv_in_place_3d(M); }, [](auto &M) { return nda::linalg::det_3d(M); }); + check_small_mat(B, Binv, detB, [](auto &M) { return nda::linalg::inv_in_place_2d(M); }, [](auto &M) { return nda::linalg::det_2d(M); }); + check_small_mat(C, Cinv, detC, [](auto &M) { return nda::linalg::inv_in_place_1d(M); }, [](auto &M) { return nda::linalg::det_1d(M); }); + + // matrix view + EXPECT_ARRAY_NEAR(nda::linalg::inv(A(nda::range(0, 2), nda::range(0, 2))), Binv, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det(A(nda::range(0, 2), nda::range(0, 2))), detB, eps_close); + + // 4x4 matrix + auto D = 
matrix_t{{2, 2, 2, 2}, {2, 4, 6, 8}, {2, 6, 12, 20}, {2, 8, 20, 40}}; + D *= fac; + auto Dinv = matrix_t{{2, -3, 2, -0.5}, {-3, 7, -5.5, 1.5}, {2, -5.5, 5, -1.5}, {-0.5, 1.5, -1.5, 0.5}}; + Dinv /= fac; + T detD = 16 * std::pow(fac, fp_t{4}); + + auto Dinv2 = nda::linalg::inv(D); + static_assert(std::is_same_v, T>); + EXPECT_ARRAY_NEAR(Dinv, Dinv2, eps_close); + EXPECT_COMPLEX_NEAR(nda::linalg::det(Dinv2), 1.0 / detD, eps_close); + Dinv2 = nda::linalg::inv(Dinv2); + EXPECT_ARRAY_NEAR(D, Dinv2, eps_close); + // This checks absolute error, but 16 is fairly 'large' in fp32, so scale down to relative error + EXPECT_COMPLEX_NEAR(nda::linalg::det(Dinv2) / fp_t(16), detD / fp_t(16), eps_close); + + auto Dinv3 = D; + nda::linalg::inv_in_place(Dinv3); + EXPECT_ARRAY_NEAR(Dinv, Dinv3, eps_close); + nda::linalg::inv_in_place(Dinv3); + EXPECT_ARRAY_NEAR(D, Dinv3, eps_close); + // This checks absolute error, but 16 is fairly 'large' in fp32, so scale down to relative error + EXPECT_COMPLEX_NEAR(nda::linalg::det_in_place(Dinv3) / fp_t{16}, detD / fp_t{16}, eps_close); } -TEST(NDA, LinearAlgebraInverseSmall) { - for (auto n : {1, 2, 3}) { - - nda::matrix W(n, n); - for (int i = 0; i < n; ++i) - for (int j = 0; j < n; ++j) W(i, j) = (i > j ? 0.5 + i + 2.5 * j : i * 0.8 - j - 0.5); +TEST(NDA, LinearAlgebraInvAndDet) { + test_inv_and_det(); + test_inv_and_det(); + test_inv_and_det, nda::C_layout>(); + test_inv_and_det, nda::F_layout>(); + test_inv_and_det(); + test_inv_and_det(); + test_inv_and_det, nda::C_layout>(); + test_inv_and_det, nda::F_layout>(); +} - auto Winv = inverse(W); - EXPECT_NEAR(determinant(Winv), 1.0 / determinant(W), 1.e-12); - EXPECT_ARRAY_NEAR(W * Winv, nda::eye(n), 1.e-13); +// Check that the eigenvectors/values are correct. +void check_eigen(auto const &A, auto const &V, auto const &l, double eps_close) { + for (auto i : nda::range(0, A.extent(0))) { EXPECT_ARRAY_NEAR(A * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); } +} - auto Winv_inv = inverse(Winv); - EXPECT_ARRAY_NEAR(Winv_inv, W, 1.e-12); +void check_eigen(auto const &A, auto const &B, auto const &V, auto const &l, int itype, double eps_close) { + for (auto i : nda::range(0, A.extent(0))) { + if (itype == 1) { + EXPECT_ARRAY_NEAR(A * V(nda::range::all, i), l(i) * B * V(nda::range::all, i), eps_close); + } else if (itype == 2) { + EXPECT_ARRAY_NEAR(A * B * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); + } else { + EXPECT_ARRAY_NEAR(B * A * V(nda::range::all, i), l(i) * V(nda::range::all, i), eps_close); + } } } -TEST(NDA, LinearAlgebraMatvecmulPromotion) { - nda::matrix A_i = {{1, 2}, {3, 4}}; - nda::matrix A_d = {{1, 2}, {3, 4}}; - nda::array v_i, w_i = {1, 1}; - nda::array v_d, w_d = {1, 1}; +// Create a symmetric or hermitian matrix with restricted eigenvalues. 
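+// The matrix is built as Q * D * Q^H, where Q is the orthogonal (orgqr) or unitary (ungqr) factor of a
+// pivoted QR factorization (geqp3) of a random n x n matrix and D = a * I + (b - a) * diag(rand), so the
+// eigenvalues of the result are confined to the interval [a, b] (assuming rand produces values in [0, 1)).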
+template +auto syhe_matrix(int n, double a = 1e-6, double b = 1.0) { + using matrix_t = nda::matrix; + + // orthogonal/unitary matrix Q + auto jpvt = nda::zeros(n); + auto tau = nda::vector(n); + auto Q = nda::matrix::rand(n, n); + nda::lapack::geqp3(Q, jpvt, tau); + if constexpr (nda::is_complex_v) { + nda::lapack::ungqr(Q, tau); + } else { + nda::lapack::orgqr(Q, tau); + } - v_d = matvecmul(A_d, w_i); - v_i = matvecmul(A_i, w_i); + // diagonal matrix containing the eigenvalues + using fp_type = nda::get_fp_t; + auto D = nda::eye(n) * a + nda::diag(nda::vector::rand(n)) * (b - a); - EXPECT_ARRAY_NEAR(v_d, v_i, 1.e-13); + // return Q * D * Q^H (hermitian/symmetric) + return matrix_t{Q * D * nda::dagger(Q)}; } -// Check that the eigenvectors/values are correct. -template -void check_eig(M const &m, V1 const &vectors, V2 const &values) { - for (auto i : nda::range(0, m.extent(0))) { - EXPECT_ARRAY_NEAR(matvecmul(m, vectors(nda::range::all, i)), values(i) * vectors(nda::range::all, i), 1.e-13); +// Test the eigh and eigvalsh functions. +template +void test_eigh_eigvalsh() { + // Max condition number I got for the syhe_matrix is ~300, matrix vals are ~1 + constexpr double eps_close = 300 * std::numeric_limits>::epsilon(); + + for (auto i : nda::range(1, 6)) { + auto A = syhe_matrix(i, -1, 1); + + // use eigh to compute eigenvalues and eigenvectors + auto [w1, V1] = nda::linalg::eigh(A); + check_eigen(A, V1, w1, eps_close); + + // use eigh_in_place to compute eigenvalues and eigenvectors + auto V2 = A; + auto w2 = nda::linalg::eigh_in_place(V2); + check_eigen(A, V2, w2, eps_close); + // Eigenvectors are only the same up to a sign, so some columns in V1 are minus that in V2 + // checking the absolute values should be sufficient in any non-trivial case + EXPECT_ARRAY_NEAR(nda::abs(V1), nda::abs(V2), eps_close); + EXPECT_ARRAY_NEAR(w1, w2, eps_close); + + // use eigvalsh to compute eigenvalues only + auto w3 = nda::linalg::eigvalsh(A); + EXPECT_ARRAY_NEAR(w1, w3, eps_close); + + // use eigvalsh_in_place to compute eigenvalues only + auto A4 = A; + auto w4 = nda::linalg::eigvalsh_in_place(A4); + EXPECT_ARRAY_NEAR(w1, w4, eps_close); + + // use eigh with a C-layout matrix + auto A5 = nda::matrix{A}; + auto [w5, V5] = nda::linalg::eigh(A5); + check_eigen(A5, V5, w5, eps_close); + EXPECT_ARRAY_NEAR(V1, V5, eps_close); + EXPECT_ARRAY_NEAR(w1, w5, eps_close); + + // use eigvalsh with a C-layout matrix + auto w6 = nda::linalg::eigvalsh(A5); + EXPECT_ARRAY_NEAR(w1, w6, eps_close); } } -TEST(NDA, LinearAlgebraEigenelements) { - // calculate eigenvalues and eigenvectors and check that they are correct - auto test_eigenelements = [](auto &&M) { - auto [ev1, vecs] = nda::linalg::eigenelements(M); - check_eig(M, vecs, ev1); - auto Mcopy = M; - auto ev2 = nda::linalg::eigenvalues_in_place(Mcopy); - EXPECT_ARRAY_NEAR(ev1, ev2); - }; +TEST(NDA, LinearAlgebraEighAndEigvalsh) { + test_eigh_eigvalsh(); + test_eigh_eigvalsh>(); + test_eigh_eigvalsh(); + test_eigh_eigvalsh>(); +} - // double matrix in C layout - nda::matrix A(3, 3); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j <= i; ++j) { - A(i, j) = (i > j ? i + 2 * j : i - j); - A(j, i) = A(i, j); - } +// Test the eigh and eigvalsh functions for generalized eigenvalue problems. 
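+// The itype parameter follows the LAPACK convention checked in check_eigen above:
+// itype = 1 solves A * x = lambda * B * x, itype = 2 solves A * B * x = lambda * x, itype = 3 solves B * A * x = lambda * x.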
+template +void test_generalized_eigh_eigvalsh(int itype) { + constexpr double eps_close = 100 * std::numeric_limits>::epsilon(); + + for (auto i : nda::range(1, 6)) { + auto A = syhe_matrix(i, -1, 1); + auto B = syhe_matrix(i, 1e-6, 1); + + // use eigh to compute eigenvalues and eigenvectors + auto [w1, V1] = nda::linalg::eigh(A, B, itype); + check_eigen(A, B, V1, w1, itype, eps_close); + + // use eigh_in_place to compute eigenvalues and eigenvectors + auto V2 = A; + auto B2 = B; + auto w2 = nda::linalg::eigh_in_place(V2, B2, itype); + check_eigen(A, B, V2, w2, itype, eps_close); + EXPECT_ARRAY_NEAR(V1, V2, eps_close); + EXPECT_ARRAY_NEAR(w1, w2, eps_close); + + // use eigvalsh to compute eigenvalues only + auto w3 = nda::linalg::eigvalsh(A, B, itype); + EXPECT_ARRAY_NEAR(w1, w3, eps_close); + + // use eigvalsh_in_place to compute eigenvalues only + auto A4 = A; + auto B4 = B; + auto w4 = nda::linalg::eigvalsh_in_place(A4, B4, itype); + EXPECT_ARRAY_NEAR(w1, w4, eps_close); + + // use eigh with a C-layout matrices + auto A5 = nda::matrix{A}; + auto B5 = nda::matrix{B}; + auto [w5, V5] = nda::linalg::eigh(A5, B5, itype); + check_eigen(A, B, V5, w5, itype, eps_close); + EXPECT_ARRAY_NEAR(V1, V5, eps_close); + EXPECT_ARRAY_NEAR(w1, w5, eps_close); + + // use eigvalsh with a C-layout matrices + auto w6 = nda::linalg::eigvalsh(A5, B5, itype); + EXPECT_ARRAY_NEAR(w1, w6, eps_close); } - test_eigenelements(A); - - A() = 0; - A(0, 1) = 1; - A(1, 0) = 1; - A(2, 2) = 8; - A(0, 2) = 2; - A(2, 0) = 2; - test_eigenelements(A); - - A() = 0; - A(0, 1) = 1; - A(1, 0) = 1; - A(2, 2) = 8; - test_eigenelements(A); - - // double matrix in Fortran layout - nda::matrix D{{1.3, 1.2}, {1.2, 2.2}}; - test_eigenelements(D); - - // complex matrix in C layout - nda::matrix> B{{{1.0, 0.0}, {0.0, 1.0}}, {{0.0, -1.0}, {2.0, 0.0}}}; - test_eigenelements(B); +} - // complex matrix in Fortran layout - nda::matrix, nda::F_layout> C{{{1.3, 0.0}, {0.0, 1.1}}, {{0.0, -1.1}, {2.4, 0.0}}}; - test_eigenelements(C); +TEST(NDA, LinearAlgebraGeneralizedEighAndEigvalsh) { + test_generalized_eigh_eigvalsh(1); + test_generalized_eigh_eigvalsh(2); + test_generalized_eigh_eigvalsh(3); + test_generalized_eigh_eigvalsh>(1); + test_generalized_eigh_eigvalsh>(2); + test_generalized_eigh_eigvalsh>(3); + test_generalized_eigh_eigvalsh(1); + test_generalized_eigh_eigvalsh(2); + test_generalized_eigh_eigvalsh(3); + test_generalized_eigh_eigvalsh>(1); + test_generalized_eigh_eigvalsh>(2); + test_generalized_eigh_eigvalsh>(3); } +// Test the norm function. 
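+// check_norm_p compares norm(v, p) against the definition ||v||_p = (sum_i |v_i|^p)^(1/p); the special
+// cases p = +/-inf (largest/smallest |v_i|) and p = 0 (number of nonzero entries) are checked directly.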
+bool check_norm_p(auto &v, double p) { return nda::linalg::norm(v, p) == std::pow(nda::sum(nda::pow(nda::abs(v), p)), 1.0 / p); }; + TEST(NDA, LinearAlgebraNormZeros) { const int size = 100; auto v = nda::zeros(size); - EXPECT_EQ(nda::norm(v), nda::norm(v, 2.0)); - EXPECT_EQ(nda::norm(v, 0.0), 0.0); - EXPECT_EQ(nda::norm(v, 1.0), 0.0); - EXPECT_EQ(nda::norm(v, 2.0), 0.0); - EXPECT_EQ(nda::norm(v, std::numeric_limits::infinity()), 0.0); - EXPECT_EQ(nda::norm(v, -std::numeric_limits::infinity()), 0.0); - EXPECT_EQ(nda::norm(v, 1.5), 0.0); + EXPECT_EQ(nda::linalg::norm(v), nda::linalg::norm(v, 2.0)); + EXPECT_EQ(nda::linalg::norm(v, 0.0), 0.0); + EXPECT_EQ(nda::linalg::norm(v, 1.0), 0.0); + EXPECT_EQ(nda::linalg::norm(v, 2.0), 0.0); + EXPECT_EQ(nda::linalg::norm(v, std::numeric_limits::infinity()), 0.0); + EXPECT_EQ(nda::linalg::norm(v, -std::numeric_limits::infinity()), 0.0); + EXPECT_EQ(nda::linalg::norm(v, 1.5), 0.0); } TEST(NDA, LinearAlgebraNormOnes) { const int size = 100; auto v = nda::ones(size); - EXPECT_EQ(nda::norm(v), nda::norm(v, 2.0)); - EXPECT_EQ(nda::norm(v, 0.0), size); - EXPECT_EQ(nda::norm(v, 1.0), size); - EXPECT_EQ(nda::norm(v, 2.0), std::sqrt(size)); - EXPECT_EQ(nda::norm(v, std::numeric_limits::infinity()), 1); - EXPECT_EQ(nda::norm(v, -std::numeric_limits::infinity()), 1); - EXPECT_EQ(nda::norm(v, 1.5), std::pow(double(size), 1.0 / 1.5)); + EXPECT_EQ(nda::linalg::norm(v), nda::linalg::norm(v, 2.0)); + EXPECT_EQ(nda::linalg::norm(v, 0.0), size); + EXPECT_EQ(nda::linalg::norm(v, 1.0), size); + EXPECT_EQ(nda::linalg::norm(v, 2.0), std::sqrt(size)); + EXPECT_EQ(nda::linalg::norm(v, std::numeric_limits::infinity()), 1); + EXPECT_EQ(nda::linalg::norm(v, -std::numeric_limits::infinity()), 1); + EXPECT_EQ(nda::linalg::norm(v, 1.5), std::pow(double(size), 1.0 / 1.5)); } -// Check that the p-norm is correct by comparing it to its definition. 
-bool check_norm_p(auto &v, double p) { return norm(v, p) == std::pow(nda::sum(nda::pow(nda::abs(v), p)), 1.0 / p); }; - TEST(NDA, LinearAlgebraNormRand) { const int size = 100; auto v = nda::rand(size); - EXPECT_EQ(nda::norm(v), nda::norm(v, 2.0)); - EXPECT_EQ(nda::norm(v, 0.0), size); - EXPECT_EQ(nda::norm(v, 1.0), nda::sum(abs(v))); - EXPECT_EQ(nda::norm(v, 2.0), std::sqrt(std::real(nda::blas::dotc(v, v)))); - EXPECT_EQ(nda::norm(v, std::numeric_limits::infinity()), nda::max_element(v)); - EXPECT_EQ(nda::norm(v, -std::numeric_limits::infinity()), nda::min_element(v)); + EXPECT_EQ(nda::linalg::norm(v), nda::linalg::norm(v, 2.0)); + EXPECT_EQ(nda::linalg::norm(v, 0.0), size); + EXPECT_EQ(nda::linalg::norm(v, 1.0), nda::sum(abs(v))); + EXPECT_EQ(nda::linalg::norm(v, 2.0), std::sqrt(std::real(nda::blas::dotc(v, v)))); + EXPECT_EQ(nda::linalg::norm(v, std::numeric_limits::infinity()), nda::max_element(v)); + EXPECT_EQ(nda::linalg::norm(v, -std::numeric_limits::infinity()), nda::min_element(v)); EXPECT_TRUE((check_norm_p(v, -1.5))); EXPECT_TRUE((check_norm_p(v, -1.0))); @@ -362,12 +541,11 @@ TEST(NDA, LinearAlgebraNormRand) { } TEST(NDA, LinearAlgebraNormExample) { - // check various p-norms of a vector auto run_checks = [](auto const &v) { - EXPECT_EQ(nda::norm(v), nda::norm(v, 2.0)); - EXPECT_EQ(nda::norm(v, 0.0), 3); - EXPECT_EQ(nda::norm(v, 1.0), 4); - EXPECT_NEAR(nda::norm(v, 2.0), std::sqrt(7.5), 1e-15); + EXPECT_EQ(nda::linalg::norm(v), nda::linalg::norm(v, 2.0)); + EXPECT_EQ(nda::linalg::norm(v, 0.0), 3); + EXPECT_EQ(nda::linalg::norm(v, 1.0), 4); + EXPECT_NEAR(nda::linalg::norm(v, 2.0), std::sqrt(7.5), 1e-15); EXPECT_TRUE((check_norm_p(v, -1.5))); EXPECT_TRUE((check_norm_p(v, -1.0))); @@ -377,7 +555,134 @@ TEST(NDA, LinearAlgebraNormExample) { auto v = nda::array{-0.5, 0.0, 1.0, 2.5}; run_checks(v); run_checks(1i * v); - run_checks((1 + 1i) / sqrt(2) * v); - EXPECT_EQ(nda::norm(v, std::numeric_limits::infinity()), 2.5); - EXPECT_EQ(nda::norm(v, -std::numeric_limits::infinity()), 0.0); + run_checks((1 + 1i) / std::sqrt(2) * v); + EXPECT_EQ(nda::linalg::norm(v, std::numeric_limits::infinity()), 2.5); + EXPECT_EQ(nda::linalg::norm(v, -std::numeric_limits::infinity()), 0.0); +} + +// Test the outer product function. +template +void test_outer_product() { + // outer product of two arrays + auto A = nda::array::rand(2, 3); + auto B = nda::array::rand(4, 5, 6); + auto C = nda::array(2, 3, 4, 5, 6); + for (auto [i, j] : A.indices()) + for (auto [k, l, m] : B.indices()) C(i, j, k, l, m) = A(i, j) * B(k, l, m); + EXPECT_ARRAY_NEAR(C, nda::linalg::outer_product(A, B)); + + // outer product of two vectors + nda::vector v{1, 2}; + nda::vector w{3, 4, 5}; + auto M = nda::linalg::outer_product(v, w); + static_assert(nda::get_algebra == 'M'); + static_assert(nda::blas::has_C_layout); + EXPECT_ARRAY_NEAR(nda::matrix{{3, 4, 5}, {6, 8, 10}}, M); +} + +TEST(NDA, LinearAlgebraOuterProduct) { + test_outer_product(); + test_outer_product(); + test_outer_product, nda::C_layout>(); + test_outer_product, nda::F_layout>(); +} + +// Test the generic solve and solve_in_place functions. 
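+// solve(A, B) should return X such that A * X = B; the tests also compare against X = Ainv * B computed
+// from the exact inverse of the 3x3 matrix A = [[1, 2, 3], [0, 1, 4], [5, 6, 0]].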
+template +void test_solve() { + using matrix_t = nda::matrix; + using vector_t = nda::vector; + // condition number for matrix is ~275, but magnitudes are ~5 and we use absolute error + constexpr double eps_close = 5 * 275 * std::numeric_limits>::epsilon(); + + auto A = matrix_t{{1, 2, 3}, {0, 1, 4}, {5, 6, 0}}; + auto B = matrix_t{{1, 5}, {4, 5}, {3, 6}}; + + // solve A * X = B using the exact matrix inverse + auto Ainv = matrix_t{{-24, 18, 5}, {20, -15, -4}, {-5, 4, 1}}; + auto X = matrix_t{Ainv * B}; + EXPECT_ARRAY_NEAR(matrix_t{A * X}, B, eps_close); + + // solve A * X = B using solve_in_place + if constexpr (nda::blas::has_F_layout) { + auto Acopy = matrix_t{A}; + auto Bcopy = matrix_t{B}; + nda::linalg::solve_in_place(Acopy, Bcopy); + EXPECT_ARRAY_NEAR(matrix_t{A * Bcopy}, B, eps_close); + EXPECT_ARRAY_NEAR(X, Bcopy, eps_close); + + // solve A * x = b using solve_in_place + Acopy = A; + auto b = vector_t{B(nda::range::all, 0)}; + nda::linalg::solve_in_place(Acopy, b); + EXPECT_ARRAY_NEAR(A * b, B(nda::range::all, 0), eps_close); + EXPECT_ARRAY_NEAR(X(nda::range::all, 0), b, eps_close); + } + + // solve A * X = B using solve + auto X2 = nda::linalg::solve(A, B); + EXPECT_ARRAY_NEAR(matrix_t{A * X2}, B, eps_close); + EXPECT_ARRAY_NEAR(X, X2, eps_close); + + // solve A * x = b using solve + auto x = nda::linalg::solve(A, B(nda::range::all, 0)); + EXPECT_ARRAY_NEAR(A * x, B(nda::range::all, 0), eps_close); + EXPECT_ARRAY_NEAR(X(nda::range::all, 0), x, eps_close); +} + +TEST(NDA, LinearAlgebraSolve) { + test_solve(); + test_solve(); + test_solve, nda::C_layout>(); + test_solve, nda::F_layout>(); + test_solve(); + test_solve(); + test_solve, nda::C_layout>(); + test_solve, nda::F_layout>(); +} + +// Test the svd and svd_in_place functions. +template +void test_svd() { + using matrix_t = nda::matrix; + // condition number for matrix is 4, but max magnitude is 8 and we use absolute error + constexpr double eps_close = 4 * 8 * std::numeric_limits>::epsilon(); + + auto A = matrix_t{{2, -2, 1}, {-4, -8, -8}}; + auto s = nda::vector{12, 3}; + + // compute the SVD of A + auto [U_1, s_1, VH_1] = nda::linalg::svd(A); + auto S_1 = matrix_t::zeros(A.shape()); + diagonal(S_1) = s_1; + EXPECT_ARRAY_NEAR(s_1, s, eps_close); + EXPECT_ARRAY_NEAR(A, U_1 * S_1 * VH_1, eps_close); + + // compute the SVD of A in place + auto A_copy = A; + auto [U_2, s_2, VH_2] = nda::linalg::svd_in_place(A_copy); + auto S_2 = matrix_t::zeros(A.shape()); + diagonal(S_2) = s_2; + EXPECT_ARRAY_NEAR(s, s_2, eps_close); + EXPECT_ARRAY_NEAR(A, U_2 * S_2 * VH_2, eps_close); +} + +TEST(NDA, LinearAlgebraSVD) { + test_svd(); + test_svd(); + test_svd(); + test_svd(); + test_svd, nda::C_layout>(); + test_svd, nda::F_layout>(); +} + +// Test the cross product function. 
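+// For the standard (right-handed) basis vectors the cross product is cyclic: e1 x e2 = e3, e2 x e3 = e1, e3 x e1 = e2.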
+TEST(NDA, LinearAlgebraCrossProduct) { + nda::vector e1{1, 0, 0}; + nda::vector e2{0, 1, 0}; + nda::vector e3{0, 0, 1}; + + EXPECT_ARRAY_NEAR(nda::linalg::cross_product(e1, e2), e3); + EXPECT_ARRAY_NEAR(nda::linalg::cross_product(e2, e3), e1); + EXPECT_ARRAY_NEAR(nda::linalg::cross_product(e3, e1), e2); } diff --git a/test/c++/nda_matrix.cpp b/test/c++/nda_matrix.cpp index 918e22cd7..dda4e3bda 100644 --- a/test/c++/nda_matrix.cpp +++ b/test/c++/nda_matrix.cpp @@ -143,13 +143,19 @@ TEST(NDA, MatrixSliceDagger) { EXPECT_ARRAY_NEAR(M_slice_dag, exp_slice, 1.e-14); } -TEST(NDA, IdentityMatrix) { EXPECT_EQ_ARRAY(nda::eye(3), (nda::matrix{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}})); } +TEST(NDA, IdentityMatrix) { + EXPECT_EQ_ARRAY(nda::eye(3), (nda::matrix{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}})); + EXPECT_TRUE(nda::is_matrix_square(nda::eye(3))); + EXPECT_TRUE(nda::is_matrix_diagonal(nda::eye(3))); +} TEST(NDA, DiagonalMatrix) { auto v = nda::vector{1, 2, 3}; auto M = nda::diag(v); EXPECT_EQ_ARRAY(M, (nda::matrix{{1, 0, 0}, {0, 2, 0}, {0, 0, 3}})); EXPECT_EQ_ARRAY(nda::diagonal(M), v); + EXPECT_TRUE(nda::is_matrix_diagonal(M)); + EXPECT_TRUE(nda::is_matrix_square(M)); nda::diagonal(M) += v; EXPECT_EQ_ARRAY(nda::diagonal(M), 2 * v); @@ -160,6 +166,8 @@ TEST(NDA, MatrixSlice) { auto v = M(nda::range(2, 4), 7); static_assert(decltype(v)::layout_t::layout_prop == nda::layout_prop_e::strided_1d); static_assert(nda::has_contiguous(decltype(v)::layout_t::layout_prop) == false); + EXPECT_TRUE(nda::is_matrix_square(M)); + EXPECT_FALSE(nda::is_matrix_square(M(nda::range(2, 4), nda::range::all))); } TEST(NDA, MatrixAlgebra) { @@ -167,5 +175,5 @@ TEST(NDA, MatrixAlgebra) { auto M2 = nda::matrix{{1, 2}, {2, 1}}; auto prod = nda::matrix{{5, 4}, {11, 10}}; EXPECT_EQ(prod, nda::make_regular(M1 * M2)); - EXPECT_EQ(nda::make_regular(M1 / M2), nda::make_regular(M1 * nda::inverse(M2))); + EXPECT_EQ(nda::make_regular(M1 / M2), nda::make_regular(M1 * nda::linalg::inv(M2))); } diff --git a/test/c++/test_common.hpp b/test/c++/test_common.hpp index e1edf536b..19a160863 100644 --- a/test/c++/test_common.hpp +++ b/test/c++/test_common.hpp @@ -5,11 +5,6 @@ #pragma once -#include - -#include -#include - #ifndef NDA_DEBUG #define NDA_DEBUG #endif // NDA_DEBUG @@ -18,6 +13,12 @@ #define NDA_ENFORCE_BOUNDCHECK #endif // NDA_ENFORCE_BOUNDCHECK +#include +#include + +#include +#include + // Check if function arguments are equal. template bool are_equal(const T &a, const Ts &...args) { @@ -99,3 +100,15 @@ struct array_of_rank { [[nodiscard]] auto size() const { return std::accumulate(shape_.begin(), shape_.end(), 1l, std::multiplies<>{}); } [[nodiscard]] auto operator()(auto &&...) const { return static_cast(R); } }; + +// Return the array in the given address space. +template +decltype(auto) to_addr_space(A &&a) { + if constexpr (AS == nda::mem::Host) { + return nda::get_regular_host_t{std::forward(a)}; + } else if constexpr (AS == nda::mem::Device) { + return nda::get_regular_device_t{std::forward(a)}; + } else if constexpr (AS == nda::mem::Unified) { + return nda::get_regular_unified_t{std::forward(a)}; + } +}
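+// Illustrative usage (assuming the address space is passed as the first template argument):
+// to_addr_space<nda::mem::Device>(A) returns a regular array holding a copy of A in device memory,
+// while to_addr_space<nda::mem::Host>(A) returns a host copy.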