diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c6a097d9a..66c006a28b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,13 +12,13 @@ jobs: strategy: fail-fast: false matrix: - os : [ macos-latest, ubuntu-22.04 ] + os : [ macos-latest, ubuntu-24.04 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] include: - - os: ubuntu-22.04 - cc: /usr/bin/gcc-12 - cxx: /usr/bin/g++-12 + - os: ubuntu-24.04 + cc: /usr/bin/gcc-14 + cxx: /usr/bin/g++-14 - os: macos-latest cc: clang cxx: clang++ @@ -36,12 +36,12 @@ jobs: -DMADNESS_TASK_BACKEND=${{ matrix.task_backend }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' - -DCMAKE_PREFIX_PATH="/usr/local/opt/bison;/usr/local/opt/scalapack" + -DCMAKE_PREFIX_PATH="/usr/local/opt/bison;/usr/local/opt/scalapack;/usr/local/opt/boost" -DTA_ASSERT_POLICY=TA_ASSERT_THROW -DENABLE_SCALAPACK=ON steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Host system info shell: bash @@ -55,19 +55,28 @@ jobs: echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV - name: Install prerequisites Ubuntu packages - if: ${{ matrix.os == 'ubuntu-22.04' }} + if: ${{ matrix.os == 'ubuntu-24.04' }} run: | wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen + sudo apt-get -y install ninja-build g++-14 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen sudo ln -s 
/usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 + - name: Prepare ccache timestamp + id: ccache_cache_timestamp + shell: cmake -P {0} + run: | + string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) + file(APPEND "$ENV{GITHUB_OUTPUT}" "timestamp=${current_date}\n") + - name: Setup ccache cache files + uses: actions/cache@v4 with: - key: ccache-${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }} + path: ${{github.workspace}}/build/.ccache + key: ${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }} + restore-keys: | + ${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }}-ccache- - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index a130211293..730d92abea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,6 +147,9 @@ add_feature_info(TENSOR_MEM_TRACE TA_TENSOR_MEM_TRACE "instrumented tracing of T option(TA_TENSOR_MEM_PROFILE "Turn on instrumented profiling of TA::Tensor memory use" ${TA_TENSOR_MEM_TRACE}) add_feature_info(TENSOR_MEM_PROFILE TA_TENSOR_MEM_PROFILE "instrumented profiling of TA::Tensor memory use") +option(TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "Turn on TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data" OFF) +add_feature_info(TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data") + option(TA_EXPERT "TiledArray Expert mode: disables automatically downloading or building dependencies" OFF) option(TA_SIGNED_1INDEX_TYPE "Enables the use of signed 1-index coordinate type (OFF in 1.0.0-alpha.2 and older)" ON) diff --git a/INSTALL.md b/INSTALL.md index 49564b9eb0..742d967f71 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -35,15 +35,15 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - [CMake](https://cmake.org/), version 3.15 or higher; if {CUDA,HIP} support is needed, CMake {3.18,3.21} or higher is required. - [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub) - [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing) -- [Boost libraries](www.boost.org/), version 1.59 or higher (will be downloaded automatically, if missing). The following principal Boost components are used: +- [Boost libraries](www.boost.org/), version 1.81 or higher (will be downloaded automatically, if missing). The following principal Boost components are used: - Boost.Iterator: header-only - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 62d57d9b1e0c733b4b547bc9cfdd07047159dbca . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag ef97ad1f0080da04f9592f03185c1a331cd5e001 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag bd84a52766ab497dedc2f15f2162fb0eb7ec4653 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. 
*This is the recommended way to compile MADNESS for all users*. diff --git a/external/boost.cmake b/external/boost.cmake index c89b2e3667..d78d909fe5 100644 --- a/external/boost.cmake +++ b/external/boost.cmake @@ -1,3 +1,18 @@ +# -*- mode: cmake -*- + +# update the Boost version that we can tolerate +if (NOT DEFINED Boost_OLDEST_BOOST_VERSION) + set(Boost_OLDEST_BOOST_VERSION ${TA_OLDEST_BOOST_VERSION}) +else() + if (${Boost_OLDEST_BOOST_VERSION} VERSION_LESS ${TA_OLDEST_BOOST_VERSION}) + if (DEFINED CACHE{Boost_OLDEST_BOOST_VERSION}) + set(Boost_OLDEST_BOOST_VERSION "${TA_OLDEST_BOOST_VERSION}" CACHE STRING "Oldest Boost version to use" FORCE) + else() + set(Boost_OLDEST_BOOST_VERSION ${TA_OLDEST_BOOST_VERSION}) + endif() + endif() +endif() + # Boost can be discovered by every (sub)package but only the top package can *build* it ... # in either case must declare the components used by TA set(required_components diff --git a/external/versions.cmake b/external/versions.cmake index d375cedf42..72386d61f6 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG ef97ad1f0080da04f9592f03185c1a331cd5e001) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 1cfcb12647c768ccd83b098c64cda723e1275e49) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) +set(TA_TRACKED_BTAS_TAG 62d57d9b1e0c733b4b547bc9cfdd07047159dbca) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 1cfcb12647c768ccd83b098c64cda723e1275e49) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) @@ -34,3 +34,7 @@ set(TA_TRACKED_RANGEV3_PREVIOUS_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) set(TA_TRACKED_TTG_TAG 3fe4a06dbf4b05091269488aab38223da1f8cb8e) set(TA_TRACKED_TTG_PREVIOUS_TAG 26da9b40872660b864794658d4fdeee1a95cb4d6) + +# 
oldest Boost we can tolerate ... old is fine but if Boost is missing build it requires something much younger +# SeQuant requires at least 1.81, so go with that +set(TA_OLDEST_BOOST_VERSION 1.81) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 776b85f4a1..a1688f5862 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -133,7 +133,6 @@ TiledArray/expressions/index_list.h TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h -TiledArray/host/env.cpp TiledArray/host/env.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h @@ -162,6 +161,8 @@ TiledArray/tensor/complex.h TiledArray/tensor/kernels.h TiledArray/tensor/operators.h TiledArray/tensor/permute.h +TiledArray/tensor/print.ipp +TiledArray/tensor/print.h TiledArray/tensor/shift_wrapper.h TiledArray/tensor/tensor.h TiledArray/tensor/tensor_interface.h @@ -204,11 +205,32 @@ TiledArray/util/time.h TiledArray/util/vector.h ) +set(TILEDARRAY_SOURCE_FILES + TiledArray/array_impl.cpp + TiledArray/dist_array.cpp + TiledArray/range.cpp + TiledArray/sparse_shape.cpp + TiledArray/tensor_impl.cpp + TiledArray/tiledarray.cpp + TiledArray/version.cpp + TiledArray/einsum/index.cpp + TiledArray/expressions/permopt.cpp + TiledArray/host/env.cpp + TiledArray/math/linalg/basic.cpp + TiledArray/math/linalg/rank-local.cpp + TiledArray/tensor/print.cpp + TiledArray/tensor/tensor.cpp + TiledArray/util/backtrace.cpp + TiledArray/util/bug.cpp + TiledArray/util/ptr_registry.cpp + TiledArray/util/random.cpp + TiledArray/util/threads.cpp +) + if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/device.h TiledArray/external/librett.h - TiledArray/device/blas.cpp TiledArray/device/blas.h TiledArray/device/btas.h TiledArray/device/btas_um_tensor.h @@ -219,7 +241,11 @@ if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) TiledArray/device/kernel/thrust/reduce_kernel.h TiledArray/device/platform.h TiledArray/device/thrust.h - 
TiledArray/device/um_storage.h) + TiledArray/device/um_storage.h + ) + list(APPEND TILEDARRAY_SOURCE_FILES + TiledArray/device/blas.cpp + ) if(TILEDARRAY_HAS_CUDA) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h @@ -227,24 +253,6 @@ if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) endif(TILEDARRAY_HAS_CUDA) endif(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) -set(TILEDARRAY_SOURCE_FILES -TiledArray/tiledarray.cpp -TiledArray/tensor/tensor.cpp -TiledArray/sparse_shape.cpp -TiledArray/tensor_impl.cpp -TiledArray/array_impl.cpp -TiledArray/dist_array.cpp -TiledArray/version.cpp -TiledArray/einsum/index.cpp -TiledArray/expressions/permopt.cpp -TiledArray/math/linalg/basic.cpp -TiledArray/math/linalg/rank-local.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp -TiledArray/util/ptr_registry.cpp -TiledArray/util/random.cpp -TiledArray/util/threads.cpp -) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( TiledArray/version.cpp diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index 7d5b59d7c1..78ced80eed 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -129,8 +129,9 @@ bool operator!=(const TileReference& a, const TileReference& b) { } /// redirect operator to std::ostream for TileReference objects -template -std::ostream& operator<<(std::ostream& os, const TileReference& a) { +template +std::basic_ostream& operator<<( + std::basic_ostream& os, const TileReference& a) { os << a.get(); return os; } @@ -192,8 +193,10 @@ bool operator!=(const TileConstReference& a, } /// redirect operator to std::ostream for TileConstReference objects -template -std::ostream& operator<<(std::ostream& os, const TileConstReference& a) { +template +std::basic_ostream& operator<<( + std::basic_ostream& os, + const TileConstReference& a) { os << a.get(); return os; } diff --git a/src/TiledArray/bitset.h 
b/src/TiledArray/bitset.h index 3449c68f59..a214172624 100644 --- a/src/TiledArray/bitset.h +++ b/src/TiledArray/bitset.h @@ -613,8 +613,9 @@ Bitset operator^(Bitset left, const Bitset& right) { return left; } -template -std::ostream& operator<<(std::ostream& os, const Bitset& bitset) { +template +std::basic_ostream& operator<<( + std::basic_ostream& os, const Bitset& bitset) { os << std::hex; for (long i = bitset.num_blocks() - 1l; i >= 0l; --i) os << std::setfill('0') << std::setw(sizeof(Block) * 2) << bitset.get()[i] diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 483847067f..f0399b5b01 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -97,6 +97,9 @@ /* Is TA::Tensor memory tracing enabled? */ #cmakedefine TA_TENSOR_MEM_TRACE 1 +/* TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data? */ +#cmakedefine TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED 1 + /* Is TTG available? */ #cmakedefine TILEDARRAY_HAS_TTG 1 diff --git a/src/TiledArray/dense_shape.h b/src/TiledArray/dense_shape.h index 730f649663..9ab1ccaf50 100644 --- a/src/TiledArray/dense_shape.h +++ b/src/TiledArray/dense_shape.h @@ -408,7 +408,9 @@ constexpr inline bool is_replicated(World& world, const DenseShape& t) { /// \param os The output stream /// \param shape the DenseShape object /// \return A reference to the output stream -inline std::ostream& operator<<(std::ostream& os, const DenseShape& shape) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const DenseShape& shape) { os << "DenseShape:" << std::endl; return os; } diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c6d38abf7c..bb75523c93 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1776,9 +1776,10 @@ extern template class DistArray>, SparsePolicy>; /// \param a The array to be put in the output stream /// \return A reference to the output stream /// \note this is a 
collective operation -template -inline std::ostream& operator<<(std::ostream& os, - const DistArray& a) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, + const DistArray& a) { if (a.world().rank() == 0) { for (std::size_t i = 0; i < a.size(); ++i) if (!a.is_zero(i)) { diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 3d0ef11c10..907a1632fd 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -279,10 +279,9 @@ class ContEngine : public BinaryEngine { outer_size(left_indices_), outer_size(right_indices_), (!implicit_permute_outer_ ? std::move(outer_perm) : Permutation{})); } else { - auto make_total_perm = [this]() -> BipartitePermutation { - if (this->product_type() != TensorProduct::Contraction - || this->implicit_permute_inner_) + if (this->product_type() != TensorProduct::Contraction || + this->implicit_permute_inner_) return this->implicit_permute_outer_ ? 
BipartitePermutation() : BipartitePermutation(outer(this->perm_)); @@ -299,11 +298,9 @@ class ContEngine : public BinaryEngine { auto total_perm = make_total_perm(); // factor_ is absorbed into inner_tile_nonreturn_op_ - op_ = op_type( - left_op, right_op, scalar_type(1), outer_size(indices_), - outer_size(left_indices_), outer_size(right_indices_), - total_perm, - this->element_nonreturn_op_); + op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), + outer_size(left_indices_), outer_size(right_indices_), + total_perm, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer_perm); shape_ = ContEngine_::make_shape(outer_perm); @@ -314,10 +311,9 @@ class ContEngine : public BinaryEngine { op_ = op_type(left_op, right_op, factor_, outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_)); } else { - auto make_total_perm = [this]() -> BipartitePermutation { - if (this->product_type() != TensorProduct::Contraction - || this->implicit_permute_inner_) + if (this->product_type() != TensorProduct::Contraction || + this->implicit_permute_inner_) return {}; // Here, @@ -547,7 +543,7 @@ class ContEngine : public BinaryEngine { inner_size(this->right_indices_)); this->element_nonreturn_op_ = [contrreduce_op, permute_inner = this->product_type() != - TensorProduct::Contraction]( + TensorProduct::Contraction]( result_tile_element_type& result, const left_tile_element_type& left, const right_tile_element_type& right) { @@ -582,11 +578,11 @@ class ContEngine : public BinaryEngine { [mult_op, outer_prod](result_tile_element_type& result, const left_tile_element_type& left, const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); if (outer_prod == TensorProduct::Hadamard) result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); + else { // outer_prod == 
TensorProduct::Contraction // there is currently no fused MultAdd ternary Op, only Add // and Mult thus implement this as 2 separate steps // TODO optimize by implementing (ternary) MultAdd @@ -677,6 +673,7 @@ class ContEngine : public BinaryEngine { const left_tile_element_type& left, const right_tile_element_type& right) { if (outer_prod == TensorProduct::Contraction) { + // TODO implement X-permuting AXPY if (empty(result)) result = scal_op(left, right); else { diff --git a/src/TiledArray/expressions/expr_trace.h b/src/TiledArray/expressions/expr_trace.h index e9010ea6d7..7a42420232 100644 --- a/src/TiledArray/expressions/expr_trace.h +++ b/src/TiledArray/expressions/expr_trace.h @@ -122,8 +122,8 @@ class ExprTraceTarget { /// \param os The output stream for the expression trace /// \param tsr The tensor that will be the target of the expression /// \return The expression trace object -template -inline ExprTraceTarget operator<<(std::ostream& os, +template +inline ExprTraceTarget operator<<(std::basic_ostream& os, const TsrExpr& tsr) { return ExprTraceTarget(os, tsr.annotation()); } diff --git a/src/TiledArray/expressions/index_list.h b/src/TiledArray/expressions/index_list.h index e364e16560..91d74aaccb 100644 --- a/src/TiledArray/expressions/index_list.h +++ b/src/TiledArray/expressions/index_list.h @@ -340,7 +340,9 @@ inline IndexList operator*(const ::TiledArray::Permutation& p, } /// ostream IndexList output operator. -inline std::ostream& operator<<(std::ostream& out, const IndexList& v) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& out, const IndexList& v) { out << "("; std::size_t d; std::size_t n = v.size() - 1; @@ -791,8 +793,9 @@ inline BipartiteIndexList operator*(const ::TiledArray::Permutation& p, /// \param[in,out] out the stream that \c v will be written to. /// \param[in] v The BipartiteIndexList instance to insert into the stream. /// \return \c out will be returned after adding \c v to it. 
-inline std::ostream& operator<<(std::ostream& out, - const BipartiteIndexList& v) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& out, const BipartiteIndexList& v) { const std::string str = "(" + static_cast(v) + ")"; return out << str; } diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index c22afd3813..9a49362916 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -255,6 +255,13 @@ inline btas::Tensor& shift_to( return arg; } +template +inline btas::Tensor&& shift_to( + btas::Tensor&& arg, const Index& range_shift) { + const_cast(arg.range()).inplace_shift(range_shift); + return std::move(arg); +} + /// result[i] = arg1[i] + arg2[i] template inline btas::Tensor add( @@ -388,6 +395,16 @@ inline btas::Tensor& subt_to( return result; } +template +inline btas::Tensor&& subt_to( + btas::Tensor&& result, + const btas::Tensor& arg) { + auto result_view = make_ti(result); + auto arg_view = make_ti(arg); + result_view.subt_to(arg_view); + return std::move(result); +} + template >::type* = nullptr> @@ -400,6 +417,18 @@ inline btas::Tensor& subt_to( return result; } +template >::type* = nullptr> +inline btas::Tensor&& subt_to( + btas::Tensor&& result, + const btas::Tensor& arg, const Scalar factor) { + auto result_view = make_ti(result); + auto arg_view = make_ti(arg); + result_view.subt_to(arg_view, factor); + return std::move(result); +} + /// result[i] = arg1[i] * arg2[i] template inline btas::Tensor mult( @@ -460,6 +489,16 @@ inline btas::Tensor& mult_to( return result; } +template +inline btas::Tensor&& mult_to( + btas::Tensor&& result, + const btas::Tensor& arg) { + auto result_view = make_ti(result); + auto arg_view = make_ti(arg); + result_view.mult_to(arg_view); + return std::move(result); +} + /// result[i] *= arg[i] * factor template & mult_to( return result; } +template >::type* = nullptr> +inline btas::Tensor&& mult_to( + btas::Tensor&& result, + const btas::Tensor& arg, 
const Scalar factor) { + auto result_view = make_ti(result); + auto arg_view = make_ti(arg); + result_view.mult_to(arg_view, factor); + return std::move(result); +} + // Generic element-wise binary operations // --------------------------------------------- @@ -540,6 +591,14 @@ inline btas::Tensor& neg_to( return result; } +template +inline btas::Tensor&& neg_to( + btas::Tensor&& result) { + auto result_view = make_ti(result); + result_view.neg_to(); + return std::move(result); +} + template inline btas::Tensor neg( const btas::Tensor& arg) { @@ -600,6 +659,14 @@ inline btas::Tensor& conj_to( return arg; } +template +inline btas::Tensor&& conj_to( + btas::Tensor&& arg) { + auto arg_view = make_ti(arg); + arg_view.conj_to(); + return std::move(arg); +} + template >* = nullptr> inline btas::Tensor& conj_to( @@ -609,6 +676,15 @@ inline btas::Tensor& conj_to( return arg; } +template >* = nullptr> +inline btas::Tensor&& conj_to( + btas::Tensor&& arg, const Scalar factor) { + auto arg_view = make_ti(arg); + arg_view.conj_to(factor); + return std::move(arg); +} + // Generic element-wise unary operations // --------------------------------------------- diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index d70b283034..6be12e0a00 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -36,7 +36,9 @@ namespace TiledArray { class Permutation; bool operator==(const Permutation&, const Permutation&); -std::ostream& operator<<(std::ostream&, const Permutation&); +template +std::basic_ostream& operator<<( + std::basic_ostream&, const Permutation&); template inline std::array operator*(const Permutation&, const std::array&); template @@ -475,7 +477,9 @@ inline bool operator<(const Permutation& p1, const Permutation& p2) { /// \param[out] output The output stream /// \param[in] p The permutation to be added to the output stream /// \return The output stream -inline std::ostream& operator<<(std::ostream& output, const Permutation& p) { 
+template +inline std::basic_ostream& operator<<( + std::basic_ostream& output, const Permutation& p) { std::size_t n = p.size(); output << "{"; for (unsigned int dim = 0; dim < n - 1; ++dim) diff --git a/src/TiledArray/range.cpp b/src/TiledArray/range.cpp new file mode 100644 index 0000000000..5993146ad9 --- /dev/null +++ b/src/TiledArray/range.cpp @@ -0,0 +1,31 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2025 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#include "TiledArray/range.h" +#include <sstream> + +namespace TiledArray { + +std::string to_string(const Range& r) { + std::ostringstream oss; + oss << r; + return oss.str(); +} + +} // namespace TiledArray diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index cdebd7ddfc..2235da948e 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1321,7 +1321,9 @@ inline bool operator!=(const Range& r1, const Range& r2) { /// \param os The output stream that will be used to print \c r /// \param r The range to be printed /// \return A reference to the output stream -inline std::ostream& operator<<(std::ostream& os, const Range& r) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const Range& r) { os << "[ "; detail::print_array(os, r.lobound_data(), r.rank()); os << ", "; @@ -1330,6 +1332,12 @@ inline std::ostream& operator<<(std::ostream& os, const Range& r) { return os; } +/// creates a string using operator<<(basic_ostream,Range) + +/// \param r a Range +/// \return string representation of \p r +std::string to_string(const Range& r); + /// Test the two ranges are congruent /// This function tests that the rank and extent of diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index a29e0d607c..0086868dc6 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -215,7 +215,9 @@ inline void swap(Range1& r0, Range1& r1) { // no throw } /// Range1 ostream operator -inline std::ostream& operator<<(std::ostream& out, const Range1& rng) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& out, const Range1& rng) { out << "[ " << rng.first << ", " << rng.second << " )"; return out; } diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index ef2ed1e121..8ae7f44109 100644 --- a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -482,9 +482,9 @@ inline std::vector operator*(const Permutation& perm, return result; } -template -inline std::ostream& 
operator<<(std::ostream& os, - const SizeArray& size_array) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const SizeArray& size_array) { print_array(os, size_array); return os; } diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 100483eeed..7da071a88b 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -1734,8 +1734,9 @@ typename SparseShape::value_type SparseShape::threshold_ = /// \param os The output stream /// \param shape the SparseShape object /// \return A reference to the output stream -template -inline std::ostream& operator<<(std::ostream& os, const SparseShape& shape) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const SparseShape& shape) { os << "SparseShape<" << typeid(T).name() << ">:" << std::endl << shape.data() << std::endl; return os; diff --git a/src/TiledArray/special/kronecker_delta.h b/src/TiledArray/special/kronecker_delta.h index 35a8da6e57..44ce5e5107 100644 --- a/src/TiledArray/special/kronecker_delta.h +++ b/src/TiledArray/special/kronecker_delta.h @@ -168,7 +168,11 @@ Tensor mult(const KroneckerDeltaTile& arg1, const Tensor& arg2, template Tensor& mult_to(Tensor& result, const KroneckerDeltaTile& arg1) { abort(); - return result; +} + +template +Tensor&& mult_to(Tensor&& result, const KroneckerDeltaTile& arg1) { + abort(); } // dense_result[i] = binary(dense_arg1[i], sparse_arg2[i], op) diff --git a/src/TiledArray/symm/permutation.h b/src/TiledArray/symm/permutation.h index cb3e2e1000..dbcf2befa5 100644 --- a/src/TiledArray/symm/permutation.h +++ b/src/TiledArray/symm/permutation.h @@ -134,8 +134,9 @@ class Permutation { } return output; } - friend inline std::ostream& operator<<(std::ostream& output, - const Permutation& p); + template + friend inline std::basic_ostream& operator<<( + std::basic_ostream& output, const Permutation& p); /// Validate permutation specified in one-line form as an iterator range /// 
\return \c true if each element of \c [first,last) is non-negative and @@ -436,7 +437,7 @@ class Permutation { /// \param[in,out] ar The serialization archive template void serialize(Archive& ar) { - ar& p_; + ar & p_; } }; // class Permutation @@ -478,7 +479,9 @@ inline bool operator<(const Permutation& p1, const Permutation& p2) { /// \param[out] output The output stream /// \param[in] p The permutation to be added to the output stream /// \return The output stream -inline std::ostream& operator<<(std::ostream& output, const Permutation& p) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& output, const Permutation& p) { output << "{"; Permutation::print_map(output, p.data()); output << "}"; diff --git a/src/TiledArray/symm/permutation_group.h b/src/TiledArray/symm/permutation_group.h index 1aba4b6e4c..0d55abb388 100644 --- a/src/TiledArray/symm/permutation_group.h +++ b/src/TiledArray/symm/permutation_group.h @@ -245,8 +245,9 @@ inline bool operator<(const PermutationGroup& p1, const PermutationGroup& p2) { /// \param[out] output The output stream /// \param[in] p The permutation group to be added to the output stream /// \return The output stream -inline std::ostream& operator<<(std::ostream& output, - const PermutationGroup& p) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& output, const PermutationGroup& p) { output << "{"; for (auto i = p.cbegin(); i != p.cend();) { output << *i; diff --git a/src/TiledArray/tensor.h b/src/TiledArray/tensor.h index 20ecab9e0e..6d7b75eee2 100644 --- a/src/TiledArray/tensor.h +++ b/src/TiledArray/tensor.h @@ -30,6 +30,7 @@ #include +#include #include #include @@ -49,80 +50,6 @@ template using TensorConstView = detail::TensorInterface::type, BlockRange>; -/// Tensor output operator - -/// Output tensor \c t to the output stream, \c os . 
-/// \tparam T The tensor type -/// \param os The output stream -/// \param t The tensor to be output -/// \return A reference to the output stream -template ::value && - detail::is_contiguous_tensor< - T>::value>::type* = nullptr> -inline std::ostream& operator<<(std::ostream& os, const T& t) { - os << t.range() << " { "; - const auto n = t.range().volume(); - std::size_t offset = 0ul; - const auto more_than_1_batch = t.nbatch() > 1; - for (auto b = 0ul; b != t.nbatch(); ++b) { - if (more_than_1_batch) { - os << "[batch " << b << "]{ "; - } - for (auto ord = 0ul; ord < n; ++ord) { - os << t.data()[offset + ord] << " "; - } - if (more_than_1_batch) { - os << "} "; - } - offset += n; - } - os << "}"; - - return os; -} - -/// Tensor output operator - -/// Output tensor \c t to the output stream, \c os . -/// \tparam T The tensor type -/// \param os The output stream -/// \param t The tensor to be output -/// \return A reference to the output stream -template ::value && - !detail::is_contiguous_tensor< - T>::value>::type* = nullptr> -inline std::ostream& operator<<(std::ostream& os, const T& t) { - const auto stride = inner_size(t); - const auto volume = t.range().volume(); - - auto tensor_print_range = - [&os, stride](typename T::const_pointer MADNESS_RESTRICT const t_data) { - for (decltype(t.range().volume()) i = 0ul; i < stride; ++i) - os << t_data[i] << " "; - }; - - os << t.range() << " { "; - - for (decltype(t.range().volume()) i = 0ul; i < volume; i += stride) - tensor_print_range(t.data() + t.range().ordinal(i)); - - os << "}"; - - return os; -} - -template >> -inline std::ostream& operator<<(std::ostream& os, const T& t) { - os << t.range() << " {" << std::endl; // Outer tensor's range - for (auto idx : t.range()) { // Loop over inner tensors - const auto& inner_t = t(idx); - os << " " << idx << ":" << inner_t << std::endl; - } - os << "}"; // End outer tensor - return os; -} - } // namespace TiledArray #endif // TILEDARRAY_SRC_TILEDARRAY_TENSOR_H__INCLUDED 
diff --git a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index 676327427f..a7a25787bb 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -126,8 +126,9 @@ class ComplexConjugate { return ComplexConjugate(-factor_); } - friend std::ostream& operator<<(std::ostream& os, - const ComplexConjugate& cc) { + template + friend std::basic_ostream& operator<<( + std::basic_ostream& os, const ComplexConjugate& cc) { os << "conj()] [" << cc.factor_; return os; } @@ -157,8 +158,10 @@ class ComplexConjugate { inline ComplexConjugate operator-() const; - friend std::ostream& operator<<(std::ostream& os, - const ComplexConjugate& cc) { + template + friend std::basic_ostream& operator<<( + std::basic_ostream& os, + const ComplexConjugate& cc) { os << "conj()"; return os; } @@ -173,8 +176,10 @@ class ComplexConjugate { public: inline ComplexConjugate operator-() const; - friend std::ostream& operator<<(std::ostream& os, - const ComplexConjugate& cc) { + template + friend std::basic_ostream& operator<<( + std::basic_ostream& os, + const ComplexConjugate& cc) { os << "conj()] [-1"; return os; } diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 1fa12552bc..34e3ea0c9a 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -23,8 +23,8 @@ * */ -#ifndef TILEDARRAY_TENSOR_KENERLS_H__INCLUDED -#define TILEDARRAY_TENSOR_KENERLS_H__INCLUDED +#ifndef TILEDARRAY_TENSOR_KERNELS_H__INCLUDED +#define TILEDARRAY_TENSOR_KERNELS_H__INCLUDED #include #include @@ -167,7 +167,7 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, std::unique_ptr data_copy; size_t tile_volume; if (twostep) { - tile_volume = C.range().volume(); + tile_volume = C.total_size(); data_copy = std::make_unique(tile_volume); std::copy(C.data(), C.data() + tile_volume, data_copy.get()); } @@ -261,6 +261,7 @@ inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... 
tensors) { return std::forward(op)(tensor1, tensors...); } else { static_assert(detail::is_nested_tensor_v); + TA_ASSERT(!empty(tensor1, tensors...)); return TiledArray::detail::transform()(std::forward(op), tensor1, tensors...); } @@ -419,10 +420,6 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { auto volume = result.total_size(); for (decltype(volume) ord = 0; ord < volume; ++ord) { - if constexpr (is_tensor_of_tensor_v) - if (result.data()[ord].range().volume() == 0) continue; - if constexpr (is_tensor_of_tensor_v) - if (((tensors.data()[ord].range().volume() == 0) || ...)) continue; if constexpr (std::is_invocable_r_v) op(result.data()[ord], tensors.data()[ord]...); @@ -914,20 +911,25 @@ template < std::decay_t, std::decay_t...>>* = nullptr> auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, const T1& tensor1, const Ts&... tensors) { - TA_ASSERT(!empty(tensor1, tensors...)); - TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - - const auto volume = [&tensor1]() { - if constexpr (detail::has_total_size_v) - return tensor1.total_size(); - else - return tensor1.size(); - }(); - auto init = std::forward(identity); - math::reduce_op(std::forward(reduce_op), - std::forward(join_op), init, volume, init, - tensor1.data(), tensors.data()...); + + // early exit if any tensors are empty + // WARNING some operations make sense with empty arguments (e.g. 
max), but not + // supported for now since this is only used for multiply (`*`) + if (!empty(tensor1, tensors...)) { + TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); + + const auto volume = [&tensor1]() { + if constexpr (detail::has_member_function_total_size_anyreturn_v) + return tensor1.total_size(); + else + return tensor1.size(); + }(); + + math::reduce_op(std::forward(reduce_op), + std::forward(join_op), init, volume, init, + tensor1.data(), tensors.data()...); + } return init; } @@ -991,7 +993,7 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_total_size_v) + if constexpr (detail::has_member_function_total_size_anyreturn_v) return tensor1.total_size(); else return tensor1.size(); @@ -999,9 +1001,6 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { - if (tensor1.data()[ord].range().volume() == 0 || - ((tensors.data()[ord].range().volume() == 0) || ...)) - continue; auto temp = tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], tensors.data()[ord]...); join_op(result, temp); @@ -1041,7 +1040,7 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_total_size_v) + if constexpr (detail::has_member_function_total_size_anyreturn_v) return tensor1.total_size(); else return tensor1.size(); @@ -1110,7 +1109,7 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, // remaining tensors const auto volume = [&tensor1]() { - if constexpr (detail::has_total_size_v) + if constexpr (detail::has_member_function_total_size_anyreturn_v) return tensor1.total_size(); else return tensor1.size(); @@ -1301,4 +1300,4 @@ auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, } // namespace 
detail } // namespace TiledArray -#endif // TILEDARRAY_TENSOR_KENERLS_H__INCLUDED +#endif // TILEDARRAY_TENSOR_KERNELS_H__INCLUDED diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index b8ed77671d..97f9e5cdd8 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -32,7 +32,7 @@ namespace TiledArray { // Tensor arithmetic operators -/// Tensor plus operator +/// Tensor plus Tensor operator /// Add two tensors /// \tparam T1 The left-hand tensor type @@ -47,9 +47,37 @@ inline decltype(auto) operator+(T1&& left, T2&& right) { return add(std::forward(left), std::forward(right)); } -/// Tensor minus operator +/// Tensor plus number operator + +/// Adds a number to a tensor +/// \tparam T1 A tensor type +/// \param tensor The tensor argument +/// \param number The number argument +/// \return A tensor where element \c i is equal to tensor[i] + number +template >>> +inline decltype(auto) operator+( + T1&& tensor, detail::numeric_t> number) { + return std::forward(tensor).add(number); +} + +/// Number plus Tensor operator + +/// Adds a number to a tensor +/// \tparam T1 A tensor type +/// \param number The number argument +/// \param tensor The tensor argument +/// \return A tensor where element \c i is equal to tensor[i] + number +template >>> +inline decltype(auto) operator+( + detail::numeric_t> number, T1&& tensor) { + return std::forward(tensor).add(number); +} + +/// Tensor minus Tensor operator -/// Subtract two tensors +/// Subtracts two tensors /// \tparam T1 The left-hand tensor type /// \tparam T2 The right-hand tensor type /// \param left The left-hand tensor argument @@ -62,7 +90,21 @@ inline decltype(auto) operator-(T1&& left, T2&& right) { return subt(std::forward(left), std::forward(right)); } -/// Tensor multiplication operator +/// Tensor minus number operator + +/// Subtracts a number from a tensor +/// \tparam T1 A tensor type +/// \param tensor The tensor argument +/// \param number 
The number argument +/// \return A tensor where element \c i is equal to tensor[i] - number +template >>> +inline decltype(auto) operator-( + T1&& tensor, detail::numeric_t> number) { + return std::forward(tensor).subt(number); +} + +/// Element-wise multiplication operator for Tensors /// Element-wise multiplication of two tensors /// \tparam T1 The left-hand tensor type @@ -238,6 +280,88 @@ inline decltype(auto) operator*=(T&& left, N right) { return scale_to(std::forward(left), right); } +/// Tensor output operator + +/// Output tensor \c t to the output stream, \c os . +/// \tparam T The tensor type +/// \param os The output stream +/// \param t The tensor to be output +/// \return A reference to the output stream +template < + typename Char, typename CharTraits, typename T, + typename std::enable_if && + detail::is_contiguous_tensor_v>::type* = nullptr> +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const T& t) { + os << t.range() << " {\n"; + const auto n = t.range().volume(); + std::size_t offset = 0ul; + std::size_t nbatch = 1; + if constexpr (detail::has_member_function_nbatch_anyreturn_v) + nbatch = t.nbatch(); + const auto more_than_1_batch = nbatch > 1; + for (auto b = 0ul; b != nbatch; ++b) { + if (more_than_1_batch) { + os << " [batch " << b << "]{\n"; + } + if constexpr (detail::is_tensor_v) { // tensor of scalars + detail::NDArrayPrinter{}.print( + t.data() + offset, t.range().rank(), t.range().extent_data(), + t.range().stride_data(), os, more_than_1_batch ? 4 : 2); + } else { // tensor of tensors, need to annotate each element by its index + for (auto&& idx : t.range()) { // Loop over inner tensors + const auto& inner_t = *(t.data() + offset + t.range().ordinal(idx)); + os << " " << idx << ":"; + detail::NDArrayPrinter{}.print(inner_t.data(), inner_t.range().rank(), + inner_t.range().extent_data(), + inner_t.range().stride_data(), os, + more_than_1_batch ? 
6 : 4); + os << "\n"; + } + } + if (more_than_1_batch) { + os << "\n }"; + if (b + 1 != nbatch) os << "\n"; // not last batch + } + offset += n; + } + os << "\n}\n"; + + return os; +} + +/// Tensor output operator + +/// Output tensor \c t to the output stream, \c os . +/// \tparam T The tensor type +/// \param os The output stream +/// \param t The tensor to be output +/// \return A reference to the output stream +template ::value && + !detail::is_contiguous_tensor::value>::type* = nullptr> +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const T& t) { + const auto stride = inner_size(t); + const auto volume = t.range().volume(); + + auto tensor_print_range = + [&os, stride](typename T::const_pointer MADNESS_RESTRICT const t_data) { + for (decltype(t.range().volume()) i = 0ul; i < stride; ++i) + os << t_data[i] << " "; + }; + + os << t.range() << " { "; + + for (decltype(t.range().volume()) i = 0ul; i < volume; i += stride) + tensor_print_range(t.data() + t.range().ordinal(i)); + + os << "}\n"; + + return os; +} + } // namespace TiledArray #endif // TILEDARRAY_TENSOR_OPERATORS_H__INCLUDED diff --git a/src/TiledArray/tensor/print.cpp b/src/TiledArray/tensor/print.cpp new file mode 100644 index 0000000000..7b02a2caaf --- /dev/null +++ b/src/TiledArray/tensor/print.cpp @@ -0,0 +1,56 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2025 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * print.cpp + * Mar 14, 2025 + * + */ + +#include + +namespace TiledArray { + +namespace detail { + +#define TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(T, C) \ + template void NDArrayPrinter::print( \ + const T* data, const std::size_t order, \ + const Range1::index1_type* extents, const Range1::index1_type* strides, \ + std::basic_ostream&, std::size_t); \ + template std::basic_string NDArrayPrinter::toString( \ + const T* data, const std::size_t order, \ + const Range1::index1_type* extents, const Range1::index1_type* strides); + +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(double, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(double, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(float, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(float, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(int, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(int, wchar_t); + +#undef TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION + +} // namespace detail + +} // namespace TiledArray diff --git a/src/TiledArray/tensor/print.h b/src/TiledArray/tensor/print.h new file mode 100644 index 0000000000..a2497bff15 --- /dev/null +++ b/src/TiledArray/tensor/print.h @@ -0,0 +1,103 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2025 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * print.h + * Mar 14, 2025 + * + */ + +#ifndef TILEDARRAY_SRC_TILEDARRAY_TENSOR_PRINT_H__INCLUDED +#define TILEDARRAY_SRC_TILEDARRAY_TENSOR_PRINT_H__INCLUDED + +#include + +#include +#include +#include + +namespace TiledArray { + +namespace detail { + +// Class to print n-dimensional arrays in NumPy style but with curly braces +class NDArrayPrinter { + public: + NDArrayPrinter(int width = 10, int precision = 6) + : width(width), precision(precision) {} + + private: + int width = 10; + int precision = 10; + + // Helper function to recursively print the array + template > + void printArray(const T* data, const std::size_t order, const Index* extents, + const Index* strides, + std::basic_ostream& os, size_t level = 0, + size_t offset = 0, size_t extra_indentation = 0); + + public: + // Print a row-major array to a stream + template > + void print(const T* data, const std::size_t order, const Index* extents, + const Index* strides, std::basic_ostream& os, + std::size_t extra_indentation = 0); + + // Helper function to create a string representation + template > + std::basic_string toString(const T* data, + const std::size_t order, + const Index* extents, + const Index* strides); +}; + +// Explicit template instantiations +#define 
TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(T, C) \ + extern template void NDArrayPrinter::print( \ + const T* data, const std::size_t order, \ + const Range1::index1_type* extents, const Range1::index1_type* strides, \ + std::basic_ostream&, std::size_t); \ + extern template std::basic_string NDArrayPrinter::toString( \ + const T* data, const std::size_t order, \ + const Range1::index1_type* extents, const Range1::index1_type* strides); + +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(double, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(double, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(float, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(float, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(std::complex, wchar_t); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(int, char); +TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION(int, wchar_t); + +#undef TILEDARRAY_MAKE_NDARRAY_PRINTER_INSTANTIATION + +} // namespace detail + +} // namespace TiledArray + +#endif // TILEDARRAY_SRC_TILEDARRAY_TENSOR_PRINT_H__INCLUDED diff --git a/src/TiledArray/tensor/print.ipp b/src/TiledArray/tensor/print.ipp new file mode 100644 index 0000000000..8634418138 --- /dev/null +++ b/src/TiledArray/tensor/print.ipp @@ -0,0 +1,94 @@ +/* +* This file is a part of TiledArray. +* Copyright (C) 2025 Virginia Tech +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program. If not, see . +* +* Eduard Valeyev +* Department of Chemistry, Virginia Tech +* +* print.ipp +* Mar 14, 2025 +* +*/ + +#include + +#include +#include +#include + +namespace TiledArray { + +namespace detail { + +// Class to print n-dimensional arrays in NumPy style but with curly braces +template +void NDArrayPrinter::printArray(const T* data, const std::size_t order, + const Index* extents, const Index* strides, + std::basic_ostream& os, + size_t level, size_t offset, + size_t extra_indentation) { + if (level >= order) { + return; + } + + if (level == 0 && extra_indentation > 0) + os << std::basic_string(extra_indentation, ' '); + os << "{"; + + for (size_t i = 0; i < extents[level]; ++i) { + if (level == order - 1) { + // At the deepest level, print the actual values + os << std::fixed << std::setprecision(precision) << std::setw(width) << std::setfill(Char(' ')) + << data[offset + i * strides[level]]; + if (i < extents[level] - 1) { + os << ", "; + } + } else { + // For higher levels, recurse deeper + printArray(data, order, extents, strides, os, level + 1, offset + i * strides[level], + extra_indentation); + if (i < extents[level] - 1) { + os << ",\n" << std::basic_string(level + 1 + extra_indentation, ' '); + } + } + } + os << "}"; +} + +// Print a row-major array to a stream +template +void NDArrayPrinter::print(const T* data, const std::size_t order, + const Index* extents, const Index* strides, + std::basic_ostream& os, + std::size_t extra_indentation) { + // Note: Can't validate data size with raw pointers, caller must ensure data has sufficient size + + printArray(data, order, extents, strides, os, 0, 0, extra_indentation); +} + +// Helper function to create a string representation +template +std::basic_string NDArrayPrinter::toString( + const T* data, const std::size_t order, const Index* extents, + 
const Index* strides) { + std::basic_stringstream oss; + print(data, order, extents, strides, oss, /* extra_indentation = */ 0); + return oss.str(); +} + +} // namespace detail + +} // namespace TiledArray diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 2ea580e69d..dc43137797 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -43,6 +43,44 @@ namespace detail { template struct TraceIsDefined, enable_if_numeric_t> : std::true_type {}; +template >>> +To clone_or_cast(From&& f) { + if constexpr (std::is_same_v>) + return std::forward(f).clone(); + else if constexpr (detail::is_convertible_v) { + return static_cast(std::forward(f)); + } else if constexpr (detail::is_range_v && + detail::is_range_v>) { + using std::begin; + using std::data; + using std::end; + + To t(f.range()); + if constexpr (detail::is_contiguous_tensor_v>) { + const auto n = f.range().volume(); + if constexpr (detail::is_contiguous_tensor_v) { + std::copy(data(f), data(f) + n, data(t)); + } else { + std::copy(data(f), data(f) + n, begin(t)); + } + } else { + if constexpr (detail::is_contiguous_tensor_v) { + std::copy(begin(f), end(f), data(t)); + } else + std::copy(begin(f), end(f), begin(t)); + } + return t; + } else { + static_assert( + !std::is_void_v, + "clone_or_cast: could not figure out how to convert From to " + "To, either overload of a member function of Tensor is missing or From " + "need to provide a conversion operator to To"); + } +} + } // namespace detail /// An N-dimensional tensor object @@ -50,8 +88,9 @@ struct TraceIsDefined, enable_if_numeric_t> : std::true_type {}; /// A contiguous row-major tensor with __shallow-copy__ semantics. /// As of TiledArray 1.1 Tensor represents a batch of tensors with same Range /// (the default batch size = 1). 
-/// \tparam T the value type of this tensor -/// \tparam A The allocator type for the data +/// \tparam T The value type of this tensor +/// \tparam A The allocator type for the data; only default-constructible +/// allocators are supported to save space template class Tensor { // meaningful error if T& is not assignable, see @@ -59,6 +98,11 @@ class Tensor { static_assert(std::is_assignable, T>::value, "Tensor: T must be an assignable type (e.g. " "cannot be const)"); + // default-constructible Allocator allows to reduce the size of default Tensor + // and minimize the overhead of null elements in Tensor> + static_assert( + std::is_default_constructible_v, + "Tensor: only default-constructible Allocator is supported"); #ifdef TA_TENSOR_MEM_TRACE template @@ -289,13 +333,26 @@ class Tensor { #endif } + struct nbatches { + template >> + nbatches(Int n) : n(n) {} + template >> + nbatches& operator=(Int n) { + this->n = n; + return *this; + } + + size_type n = 1; + }; + /// Construct a tensor with a range equal to \c range. The data is /// default-initialized (which, for `T` with trivial default constructor, /// means data is uninitialized). 
/// \param range The range of the tensor /// \param nbatch The number of batches (default is 1) - explicit Tensor(const range_type& range, size_type nbatch = 1) - : Tensor(range, nbatch, default_construct{true}) {} + explicit Tensor(const range_type& range, nbatches nb = 1) + : Tensor(range, nb.n, default_construct{true}) {} /// Construct a tensor of tensor values, setting all elements to the same /// value @@ -437,12 +494,13 @@ class Tensor { } } - /// Copy and modify the data from \c other + /// "Element-wise" unary transform of \c other /// \tparam T1 A tensor type - /// \tparam Op An element-wise operation type + /// \tparam Op A unary callable /// \param other The tensor argument - /// \param op The element-wise operation + /// \param op Unary operation that can be invoked on elements of \p other ; + /// if it is not, it will be "threaded" over \p other via `tensor_op` template ::value && @@ -452,13 +510,15 @@ class Tensor { detail::tensor_init(op, *this, other); } - /// Copy, modify, and permute the data from \c other + /// "Element-wise" unary transform of \c other fused with permutation + /// equivalent, but more efficient, than `Tensor(other, op).permute(perm)` /// \tparam T1 A tensor type - /// \tparam Op An element-wise operation type + /// \tparam Op A unary callable /// \tparam Perm A permutation type /// \param other The tensor argument - /// \param op The element-wise operation + /// \param op Unary operation that can be invoked as` op(other[i]))`; + /// if it is not, it will be "threaded" over \p other via `tensor_op` template < typename T1, typename Op, typename Perm, typename std::enable_if_t::value && @@ -480,14 +540,15 @@ class Tensor { } } - /// Copy and modify the data from \c left, and \c right + /// "Element-wise" binary transform of \c {left,right} /// \tparam T1 A tensor type /// \tparam T2 A tensor type - /// \tparam Op An element-wise operation type + /// \tparam Op A binary callable /// \param left The left-hand tensor argument /// 
\param right The right-hand tensor argument - /// \param op The element-wise operation + /// \param op Binary operation that can be invoked as `op(left[i],right[i]))`; + /// if it is not, it will be "threaded" over \p other via `tensor_op` template >> Tensor(const T1& left, const T2& right, Op&& op) @@ -495,15 +556,16 @@ class Tensor { detail::tensor_init(op, *this, left, right); } - /// Copy, modify, and permute the data from \c left, and \c right + /// "Element-wise" binary transform of \c {left,right} fused with permutation /// \tparam T1 A tensor type /// \tparam T2 A tensor type - /// \tparam Op An element-wise operation type + /// \tparam Op A binary callable /// \tparam Perm A permutation tile /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument - /// \param op The element-wise operation + /// \param op Binary operation that can be invoked as `op(left[i],right[i]))`; + /// if it is not, it will be "threaded" over \p other via `tensor_op` /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, @@ -585,7 +647,7 @@ class Tensor { } /// @return a deep copy of `*this` - Tensor clone() const { + Tensor clone() const& { Tensor result; if (data_) { if constexpr (detail::is_tensor_of_tensor_v) { @@ -603,6 +665,11 @@ class Tensor { return result; } + /// cloning an rvalue ref forwards the contents of this + /// @return a deep copy of `*this` + /// @post this is in a moved-from state + Tensor clone() && { return std::move(*this); } + template ::value>::type* = nullptr> Tensor& operator=(const T1& other) { @@ -1383,6 +1450,11 @@ class Tensor { template >* = nullptr> Tensor& shift_to(const Index& bound_shift) { +// although shift_to is currently fine on shared objects since ranges are +// not shared, this will change in the future +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(data_.use_count() <= 1); +#endif 
this->range_.inplace_shift(bound_shift); return *this; } @@ -1395,6 +1467,11 @@ class Tensor { template >* = nullptr> Tensor& shift_to(const std::initializer_list& bound_shift) { + // although shift_to is currently fine on shared objects since ranges are + // not shared, this will change in the future +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(data_.use_count() <= 1); +#endif this->range_.template inplace_shift>( bound_shift); return *this; @@ -1507,6 +1584,9 @@ class Tensor { typename std::enable_if>::type* = nullptr> Tensor& inplace_binary(const Right& right, Op&& op) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(data_.use_count() <= 1); +#endif detail::inplace_tensor_op(op, *this, right); return *this; } @@ -1519,10 +1599,23 @@ class Tensor { /// \c op(*this[i]) /// \throw TiledArray::Exception When this tensor is empty. template - Tensor unary(Op&& op) const { + Tensor unary(Op&& op) const& { return Tensor(*this, op); } + /// Use a unary, element wise operation to construct a new tensor + + /// \tparam Op The unary operation type + /// \param op The unary element-wise operation + /// \return A tensor where element \c i of the new tensor is equal to + /// \c op(*this[i]) + /// \throw TiledArray::Exception When this tensor is empty. + template + Tensor unary(Op&& op) && { + inplace_unary(std::forward(op)); + return std::move(*this); + } + /// Use a unary, element wise operation to construct a new, permuted tensor /// \tparam Op The unary operation type @@ -1542,7 +1635,7 @@ class Tensor { detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor if constexpr (!is_tot) { - if (empty()) return *this; + if (empty()) return {}; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation return Tensor(*this, op, outer(std::forward(perm))); @@ -1564,6 +1657,9 @@ class Tensor { /// \throw TiledArray::Exception When this tensor is empty. 
template Tensor& inplace_unary(Op&& op) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(data_.use_count() <= 1); +#endif detail::inplace_tensor_op(op, *this); return *this; } @@ -1578,14 +1674,29 @@ /// \c factor template >::type* = nullptr> - Tensor scale(const Scalar factor) const { - if (range().volume() == 0) return *this; - return unary([factor](const value_type& a) -> decltype(auto) { + Tensor scale(const Scalar factor) const& { + // early exit for empty this + if (empty()) return {}; + + return unary([factor](const value_type& a) { using namespace TiledArray::detail; return a * factor; }); } + /// Construct a scaled copy of this tensor + + /// \tparam Scalar A scalar type + /// \param factor The scaling factor + /// \return A new tensor where the elements of this tensor are scaled by + /// \c factor + template >::type* = nullptr> + Tensor scale(const Scalar factor) && { + scale_to(factor); + return std::move(*this); + } + /// Construct a scaled and permuted copy of this tensor /// \tparam Scalar A scalar type @@ -1598,8 +1709,11 @@ typename = std::enable_if_t && detail::is_permutation_v>> Tensor scale(const Scalar factor, const Perm& perm) const { + // early exit for empty this + if (empty()) return {}; + return unary( - [factor](const numeric_type a) -> numeric_type { + [factor](const value_type& a) { using namespace TiledArray::detail; return a * factor; }, @@ -1614,6 +1728,9 @@ template >::type* = nullptr> Tensor& scale_to(const Scalar factor) { + // early exit for empty this + if (empty()) return *this; + return inplace_unary( [factor](value_type& MADNESS_RESTRICT res) { res *= factor; }); } @@ -1629,7 +1746,12 @@ template ::value>::type* = nullptr> Tensor add(const Right& right) const& { - if (right.empty()) return *this; + // early exit for empty right + if (right.empty()) return this->clone(); + + // early exit for empty this + if (empty()) return detail::clone_or_cast(right); + 
return binary( right, [](const value_type& l, const value_t& r) -> decltype(l + r) { @@ -1638,10 +1760,10 @@ class Tensor { if (r.empty()) return {}; else - return r; + return r.clone(); } else { if (r.empty()) - return l; + return l.clone(); else return l + r; } @@ -1677,10 +1799,7 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> Tensor add(const Right& right, const Perm& perm) const { return binary( - right, - [](const value_type& l, const value_type& r) -> decltype(auto) { - return l + r; - }, + right, [](const value_type& l, const value_type& r) { return l + r; }, perm); } @@ -1697,11 +1816,9 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor add(const Right& right, const Scalar factor) const { - return binary( - right, - [factor](const value_type& l, const value_type& r) -> decltype(auto) { - return (l + r) * factor; - }); + return binary(right, [factor](const value_type& l, const value_type& r) { + return (l + r) * factor; + }); } /// Scale and add this and \c other to construct a new, permuted tensor @@ -1721,7 +1838,7 @@ class Tensor { Tensor add(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const value_type& l, const value_type& r) -> decltype(auto) { + [factor](const value_type& l, const value_type& r) { return (l + r) * factor; }, perm); @@ -1733,8 +1850,10 @@ class Tensor { /// \return A new tensor where the elements are the sum of the elements of /// \c this and \c value Tensor add(const numeric_type value) const { - return unary( - [value](const numeric_type a) -> numeric_type { return a + value; }); + // early exit for empty this + if (empty()) return {}; + + return unary([value](const value_type& a) { return a + value; }); } /// Add a constant to a permuted copy of this tensor @@ -1747,9 +1866,10 @@ class Tensor { template >> Tensor add(const numeric_type value, const Perm& perm) const { - return unary( - [value](const numeric_type a) 
-> numeric_type { return a + value; }, - perm); + // early exit for empty this + if (empty()) return {}; + + return unary([value](const value_type& a) { return a + value; }, perm); } /// Add \c other to this tensor @@ -1760,10 +1880,15 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { + // early exit for empty right if (right.empty()) return *this; + + // early exit for empty this if (empty()) { - *this = Tensor{right.range(), value_type{}}; + *this = detail::clone_or_cast(right); + return *this; } + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t r) { l += r; }); } @@ -1789,7 +1914,9 @@ class Tensor { /// \param value The constant to be added /// \return A reference to this tensor - Tensor& add_to(const numeric_type value) { + template >> + Tensor& add_to(const Scalar value) { return inplace_unary( [value](numeric_type& MADNESS_RESTRICT res) { res += value; }); } @@ -1817,7 +1944,7 @@ class Tensor { return -r; } else { if (r.empty()) - return l; + return l.clone(); else return l - r; } @@ -1841,10 +1968,7 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> Tensor subt(const Right& right, const Perm& perm) const { return binary( - right, - [](const value_type& l, const value_type& r) -> decltype(auto) { - return l - r; - }, + right, [](const value_type& l, const value_type& r) { return l - r; }, perm); } @@ -1862,11 +1986,9 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor) const { - return binary( - right, - [factor](const value_type& l, const value_type& r) -> decltype(auto) { - return (l - r) * factor; - }); + return binary(right, [factor](const value_type& l, const value_type& r) { + return (l - r) * factor; + }); } /// Subtract \c right from this and return the result scaled by a scaling \c @@ -1887,7 +2009,7 @@ class Tensor { Tensor subt(const Right& right, const Scalar factor, const 
Perm& perm) const { return binary( right, - [factor](const value_type& l, const value_type& r) -> decltype(auto) { + [factor](const value_type& l, const value_type& r) { return (l - r) * factor; }, perm); @@ -1920,6 +2042,9 @@ class Tensor { template ::value>::type* = nullptr> Tensor& subt_to(const Right& right) { + // early exit for empty right + if (right.empty()) return *this; + return inplace_binary( right, [](auto& MADNESS_RESTRICT l, const auto& r) { l -= r; }); } @@ -1936,6 +2061,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor& subt_to(const Right& right, const Scalar factor) { + // early exit for empty right + if (right.empty()) { + return this->scale_to(factor); + } + return inplace_binary(right, [factor](auto& MADNESS_RESTRICT l, const auto& r) { (l -= r) *= factor; @@ -1959,8 +2089,7 @@ class Tensor { typename std::enable_if>::type* = nullptr> decltype(auto) mult(const Right& right) const { - auto mult_op = [](const value_type& l, - const value_t& r) -> decltype(auto) { + auto mult_op = [](const value_type& l, const value_t& r) { return l * r; }; @@ -1988,9 +2117,7 @@ class Tensor { decltype(auto) mult(const Right& right, const Perm& perm) const { return binary( right, - [](const value_type& l, const value_t& r) -> decltype(auto) { - return l * r; - }, + [](const value_type& l, const value_t& r) { return l * r; }, perm); } @@ -2008,8 +2135,9 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> decltype(auto) mult(const Right& right, const Scalar factor) const { return binary(right, - [factor](const value_type& l, const value_t& r) - -> decltype(auto) { return (l * r) * factor; }); + [factor](const value_type& l, const value_t& r) { + return (l * r) * factor; + }); } /// Scale and multiply this by \c right to create a new, permuted tensor @@ -2031,8 +2159,9 @@ class Tensor { const Perm& perm) const { return binary( right, - [factor](const value_type& l, const value_t& r) - -> decltype(auto) { 
return (l * r) * factor; }, + [factor](const value_type& l, const value_t& r) { + return (l * r) * factor; + }, perm); } @@ -2045,6 +2174,15 @@ class Tensor { typename std::enable_if>::type* = nullptr> Tensor& mult_to(const Right& right) { + // early exit for empty right + if (right.empty()) { + *this = Tensor{}; + return *this; + } + + // early exit for empty this + if (empty()) return *this; + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t& r) { l *= r; }); } @@ -2061,6 +2199,9 @@ class Tensor { typename std::enable_if && detail::is_numeric_v>::type* = nullptr> Tensor& mult_to(const Right& right, const Scalar factor) { + // early exit for empty this + if (empty()) return *this; + return inplace_binary( right, [factor](value_type& MADNESS_RESTRICT l, const value_t& r) { (l *= r) *= factor; }); @@ -2072,7 +2213,10 @@ class Tensor { /// \return A new tensor that contains the negative values of this tensor Tensor neg() const { - return unary([](const numeric_type r) -> numeric_type { return -r; }); + // early exit for empty this + if (empty()) return this->clone(); + + return unary([](const value_type r) { return -r; }); } /// Create a negated and permuted copy of this tensor @@ -2083,13 +2227,19 @@ class Tensor { template >> Tensor neg(const Perm& perm) const { - return unary([](const numeric_type l) -> numeric_type { return -l; }, perm); + // early exit for empty this + if (empty()) return this->clone(); + + return unary([](const value_type l) { return -l; }, perm); } /// Negate elements of this tensor /// \return A reference to this tensor Tensor& neg_to() { + // early exit for empty this + if (empty()) return *this; + return inplace_unary([](numeric_type& MADNESS_RESTRICT l) { l = -l; }); } @@ -2487,17 +2637,6 @@ class Tensor { /// \return The vector norm of this tensor scalar_type squared_norm() const { - if constexpr (detail::is_tensor_v) { - // If uninitialized tensor of tensor return zero. 
- // All elements of this->data() are empty tensors in this case, - // however, we only look at the first element. - // Because - // - It is expensive to look at all elements. - // - The state of the array having only some empty elements - // is ill-defined and should never happen. - if (detail::empty(*data())) return 0; - } - auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res += TiledArray::detail::squared_norm(arg); diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index a32de32e4a..059ad141c8 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -532,23 +532,6 @@ struct ordinal_traits>> { std::decay_t().range())>>::type; }; -template -class has_total_size { - /// true case - template - static auto __test(U* p) -> decltype(p->total_size(), std::true_type()); - /// false case - template - static std::false_type __test(...); - - public: - static constexpr const bool value = - std::is_same(0))>::value; -}; - -template -constexpr inline bool has_total_size_v = has_total_size::value; - } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 90f7366bbc..f07365ccd2 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -32,12 +32,18 @@ namespace TiledArray { * @{ */ -/// An N-dimensional shallow copy wrapper for tile objects - -/// \c Tile represents a block of an \c Array. The rank of the tile block is -/// the same as the owning \c Array object. In order for a user defined tensor -/// object to be used in TiledArray expressions, users must also define the -/// following functions: +/// An N-dimensional shallow-copy wrapper for Tensor-like types that, unlike +/// Tensor, have deep-copy semantics. Like Tensor, Tile is +/// default-constructible. The default constructor produced a Tile in +/// null state (not referring to any tensor object). 
The name refers to its +/// intended use as a tile of DistArray. +/// +/// \tparam T a tensor type. It may provide a subset of the full operation +/// set of Tensor, since only those operations that are actually used +/// need to be defined. For full equivalence to Tensor \p T must define the +/// following functions, either as members or as non-member functions (see the +/// \ref NonIntrusiveTileInterface "non-intrusive tile interface" +/// documentation for more details on the latter): /// \li \c add /// \li \c add_to (in-place add) /// \li \c subt @@ -62,10 +68,7 @@ namespace TiledArray { /// \li \c abs_min /// \li \c abs_max /// \li \c dot -/// as for the intrusive or non-instrusive interface. See the -/// \ref NonIntrusiveTileInterface "non-intrusive tile interface" -/// documentation for more details. -/// \tparam T The tensor type used to represent tile data +/// template class Tile { public: @@ -171,8 +174,25 @@ class Tile { // State accessor ---------------------------------------------------------- + /// \return true if this is null (default-constructed or + /// after reset()) OR if the referred object is in null state (i.e. if + /// `tensor().empty()` is true. + /// \note use use_count() to check if this is in a null state bool empty() const { return pimpl_ ? pimpl_->empty() : true; } + /// \return the number of Tile objects that refer to the same tensor + /// as this (if any); `0` is returned if this is in a null state + /// (default-constructed or + /// after reset()). + long use_count() const { return pimpl_.use_count(); } + + // State operations -------------------------------------------------------- + + /// release the reference to the managed tensor, and delete it + /// if this is the last Tile object that refers to it. 
+ /// \post this object is in a null state + void reset() { pimpl_.reset(); } + // Tile accessor ----------------------------------------------------------- tensor_type& tensor() { return *pimpl_; } @@ -710,6 +730,26 @@ class Tile { }; // class Tile +namespace detail { + +template +inline constexpr bool is_tile_v = false; + +template +inline constexpr bool is_tile_v> = true; +template +inline constexpr bool is_tile_v> = true; +template +inline constexpr bool is_tile_v&> = true; +template +inline constexpr bool is_tile_v&> = true; +template +inline constexpr bool is_tile_v&&> = true; +template +inline constexpr bool is_tile_v&&> = true; + +} // namespace detail + // The following functions define the non-intrusive interface used to apply // math operations to Tiles. These functions in turn use the non-intrusive // interface functions to evaluate tiles. @@ -800,31 +840,39 @@ inline decltype(auto) shift(const Tile& arg, /// Shift the range of \c arg in place -/// \tparam Arg The tensor argument type +/// \tparam TileResult A Tile<> type instance /// \tparam Index An integral range type /// \param arg The tile argument to be shifted /// \param range_shift The offset to be applied to the argument range /// \return A copy of the tile with a new range -template >> -inline Tile& shift_to(Tile& arg, const Index& range_shift) { +template && + detail::is_tile_v>> +inline decltype(auto) shift_to(TileResult&& arg, const Index& range_shift) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(arg.use_count() <= 1); +#endif shift_to(arg.tensor(), range_shift); - return arg; + return std::forward(arg); } /// Shift the range of \c arg in place -/// \tparam Arg The tensor argument type +/// \tparam TileResult A Tile<> type instance /// \tparam Index An integral type /// \param arg The tile argument to be shifted /// \param range_shift The offset to be applied to the argument range /// \return A copy of the tile with a new range -template >> -inline Tile& shift_to(Tile& 
arg, - const std::initializer_list& range_shift) { +template +inline decltype(auto) shift_to( + TileResult&& arg, const std::initializer_list& range_shift, + std::enable_if_t && + detail::is_tile_v>* = nullptr) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(arg.use_count() <= 1); +#endif shift_to(arg.tensor(), range_shift); - return arg; + return std::forward(arg); } // Addition operations ------------------------------------------------------- @@ -928,20 +976,24 @@ inline decltype(auto) add(const Tile& arg, const Scalar value, /// Add to the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg The argument tile type /// \param result The result tile /// \param arg The argument to be added to the result /// \return A tile that is equal to result[i] += arg[i] -template -inline Tile& add_to(Tile& result, const Tile& arg) { +template >> +inline decltype(auto) add_to(TileResult&& result, const Tile& arg) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif add_to(result.tensor(), arg.tensor()); - return result; + return std::forward(result); } /// Add and scale to the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg The argument tile type /// \tparam Scalar A scalar type /// \param result The result tile @@ -949,27 +1001,35 @@ inline Tile& add_to(Tile& result, const Tile& arg) { /// \param factor The scaling factor /// \return A tile that is equal to (result[i] += arg[i]) * factor template < - typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& add_to(Tile& result, const Tile& arg, - const Scalar factor) { + typename TileResult, typename Arg, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) add_to(TileResult&& result, const Tile& arg, + const Scalar factor) { 
+#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif add_to(result.tensor(), arg.tensor(), factor); - return result; + return std::forward(result); } /// Add constant scalar to the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Scalar A scalar type /// \param result The result tile /// \param value The constant scalar to be added to \c result /// \return A tile that is equal to (result[i] += arg[i]) *= factor template < - typename Result, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& add_to(Tile& result, const Scalar value) { + typename TileResult, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) add_to(TileResult&& result, const Scalar value) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif add_to(result.tensor(), value); - return result; + return std::forward(result); } // Subtraction --------------------------------------------------------------- @@ -1069,46 +1129,58 @@ inline decltype(auto) subt(const Tile& arg, const Scalar value, /// Subtract from the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg The argument tile type /// \param result The result tile /// \param arg The argument to be subtracted from the result /// \return A tile that is equal to result[i] -= arg[i] -template -inline Tile& subt_to(Tile& result, const Tile& arg) { +template >> +inline decltype(auto) subt_to(TileResult&& result, const Tile& arg) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif subt_to(result.tensor(), arg.tensor()); - return result; + return std::forward(result); } /// Subtract and scale from the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg 
The argument tile type /// \param result The result tile /// \param arg The argument to be subtracted from \c result /// \param factor The scaling factor /// \return A tile that is equal to (result -= arg) *= factor template < - typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& subt_to(Tile& result, const Tile& arg, - const Scalar factor) { + typename TileResult, typename Arg, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) subt_to(TileResult&& result, const Tile& arg, + const Scalar factor) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif subt_to(result.tensor(), arg.tensor(), factor); - return result; + return std::forward(result); } /// Subtract constant scalar from the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \param result The result tile /// \param value The constant scalar to be subtracted from \c result /// \return A tile that is equal to (result -= arg) *= factor template < - typename Result, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& subt_to(Tile& result, const Scalar value) { + typename TileResult, typename Scalar, + typename std::enable_if && + detail::is_type_v>::type* = nullptr> +inline decltype(auto) subt_to(TileResult&& result, const Scalar value) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif subt_to(result.tensor(), value); - return result; + return std::forward(result); } // Multiplication operations ------------------------------------------------- @@ -1178,32 +1250,40 @@ inline decltype(auto) mult(const Tile& left, const Tile& right, /// Multiply to the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg The argument tile type /// \param result The result tile to be 
multiplied /// \param arg The argument to be multiplied by the result /// \return A tile that is equal to result *= arg -template -inline Tile& mult_to(Tile& result, const Tile& arg) { +template >> +inline decltype(auto) mult_to(TileResult&& result, const Tile& arg) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif mult_to(result.tensor(), arg.tensor()); - return result; + return std::forward(result); } /// Multiply and scale to the result tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Arg The argument tile type /// \param result The result tile to be multiplied /// \param arg The argument to be multiplied by \c result /// \param factor The scaling factor /// \return A tile that is equal to (result *= arg) *= factor template < - typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& mult_to(Tile& result, const Tile& arg, - const Scalar factor) { + typename TileResult, typename Arg, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) mult_to(TileResult&& result, const Tile& arg, + const Scalar factor) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif mult_to(result.tensor(), arg.tensor(), factor); - return result; + return std::forward(result); } // Generic element-wise binary operations @@ -1262,6 +1342,9 @@ inline decltype(auto) binary(const Tile& left, const Tile& right, template inline Tile& inplace_binary(Tile& left, const Tile& right, Op&& op) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(left.use_count() <= 1); +#endif inplace_binary(left.tensor(), right.tensor(), std::forward(op)); return left; } @@ -1300,16 +1383,20 @@ inline decltype(auto) scale(const Tile& arg, const Scalar factor, /// Scale to the result tile -/// \tparam Result The result tile type +/// \tparam 
TileResult A Tile<> type instance /// \param result The result tile to be scaled /// \param factor The scaling factor /// \return A tile that is equal to result *= factor template < - typename Result, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Tile& scale_to(Tile& result, const Scalar factor) { + typename TileResult, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) scale_to(TileResult&& result, const Scalar factor) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif scale_to(result.tensor(), factor); - return result; + return std::forward(result); } // Negation operations ------------------------------------------------------- @@ -1341,14 +1428,19 @@ inline decltype(auto) neg(const Tile& arg, const Perm& perm) { /// In-place negate tile -/// \tparam Result The result tile type +/// \tparam TileResult A Tile<> type instance /// \param result The result tile to be negated /// \return negated result /// \note equivalent to @c scale_to(arg,-1) -template -inline Tile& neg_to(Tile& result) { +template +inline decltype(auto) neg_to( + TileResult&& result, + std::enable_if_t>* = nullptr) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif neg_to(result.tensor()); - return result; + return std::forward(result); } // Complex conjugate operations --------------------------------------------- @@ -1410,28 +1502,37 @@ inline decltype(auto) conj(const Tile& arg, const Scalar factor, /// In-place complex conjugate a tile -/// \tparam Result The tile type +/// \tparam TileResult A Tile<> type instance /// \param result The tile to be conjugated /// \return A reference to `result` -template -inline Tile& conj_to(Tile& result) { +template +inline decltype(auto) conj_to( + TileResult&& result, + std::enable_if_t>* = nullptr) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + 
TA_ASSERT(result.use_count() <= 1); +#endif conj_to(result.tensor()); - return result; + return std::forward(result); } /// In-place complex conjugate and scale a tile -/// \tparam Result The tile type +/// \tparam TileResult A Tile<> type instance /// \tparam Scalar A scalar type /// \param result The tile to be conjugated /// \param factor The scaling factor /// \return A reference to `result` -template >::type* = nullptr> -inline Tile& conj_to(Tile& result, const Scalar factor) { +template < + typename TileResult, typename Scalar, + typename std::enable_if && + detail::is_tile_v>::type* = nullptr> +inline decltype(auto) conj_to(TileResult&& result, const Scalar factor) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(result.use_count() <= 1); +#endif conj_to(result.tensor(), factor); - return result; + return std::forward(result); } // Generic element-wise unary operations @@ -1478,6 +1579,9 @@ inline decltype(auto) unary(const Tile& arg, Op&& op, const Perm& perm) { // clang-format on template inline Tile& inplace_unary(Tile& arg, Op&& op) { +#ifdef TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED + TA_ASSERT(arg.use_count() <= 1); +#endif inplace_unary(arg.tensor(), std::forward(op)); return arg; } @@ -1721,8 +1825,9 @@ inline decltype(auto) inner_product(const Tile& left, /// \param os The output stream /// \param tile The tile to be printed /// \return The modified output stream -template -inline std::ostream& operator<<(std::ostream& os, const Tile& tile) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const Tile& tile) { os << tile.tensor(); return os; } diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h index 879d2ed9d2..d621e2731e 100644 --- a/src/TiledArray/tile_interface/add.h +++ b/src/TiledArray/tile_interface/add.h @@ -355,6 +355,10 @@ class AddTo { using TiledArray::add_to; return add_to(left, right); } + result_type&& operator()(left_type&& left, const right_type& 
right) const { + using TiledArray::add_to; + return add_to(std::move(left), right); + } }; template diff --git a/src/TiledArray/tile_interface/scale.h b/src/TiledArray/tile_interface/scale.h index cbfa48c972..9e7303397e 100644 --- a/src/TiledArray/tile_interface/scale.h +++ b/src/TiledArray/tile_interface/scale.h @@ -76,9 +76,13 @@ inline auto scale(const Arg& arg, const Scalar factor, const Perm& perm) { /// \param factor The scaling factor /// \return A tile that is equal to result *= factor template >* = nullptr> -inline Result& scale_to(Result& result, const Scalar factor) { - return result.scale_to(factor); + std::enable_if_t && + detail::has_member_function_scale_to_anyreturn_v< + Result&&, Scalar>>* = nullptr> +inline decltype(auto) scale_to(Result&& result, const Scalar factor) { + static_assert(!std::is_const_v>, + "TA::scale_to(result,factor): result cannot be const"); + return std::forward(result).scale_to(factor); } namespace tile_interface { diff --git a/src/TiledArray/tile_interface/shift.h b/src/TiledArray/tile_interface/shift.h index 6fad3b9428..08715608eb 100644 --- a/src/TiledArray/tile_interface/shift.h +++ b/src/TiledArray/tile_interface/shift.h @@ -65,10 +65,13 @@ inline auto shift(const Arg& arg, /// \param arg The tile argument to be shifted /// \param range_shift The offset to be applied to the argument range /// \return A copy of the tile with a new range -template >> -inline auto shift_to(Arg& arg, const Index& range_shift) { - return arg.shift_to(range_shift); +template < + typename Arg, typename Index, + typename = std::enable_if_t< + detail::is_integral_range_v && + detail::has_member_function_shift_to_anyreturn_v>> +inline auto shift_to(Arg&& arg, const Index& range_shift) { + return std::forward(arg).shift_to(range_shift); } /// Shift the range of \c arg in place @@ -79,10 +82,13 @@ inline auto shift_to(Arg& arg, const Index& range_shift) { /// \param range_shift The offset to be applied to the argument range /// \return A copy of the 
tile with a new range template >> -inline auto shift_to(Arg& arg, - const std::initializer_list& range_shift) { - return arg.shift_to(range_shift); + typename = + std::enable_if_t && + detail::has_member_function_shift_to_anyreturn_v< + Arg&&, const std::initializer_list&>>> +inline decltype(auto) shift_to( + Arg&& arg, const std::initializer_list& range_shift) { + return std::forward(arg).shift_to(range_shift); } namespace tile_interface { @@ -140,6 +146,10 @@ class ShiftTo { result_type operator()(argument_type& arg, const Index& range_shift) const { return shift_to(arg, range_shift); } + template + result_type operator()(argument_type&& arg, const Index& range_shift) const { + return shift_to(std::move(arg), range_shift); + } }; template @@ -158,6 +168,10 @@ class ShiftTo + result_type operator()(argument_type&& arg, const Index& range_shift) const { + return Cast_::operator()(shift_to(std::move(arg), range_shift)); + } }; template diff --git a/src/TiledArray/tile_op/add.h b/src/TiledArray/tile_op/add.h index f801193fe3..140a43c71e 100644 --- a/src/TiledArray/tile_op/add.h +++ b/src/TiledArray/tile_op/add.h @@ -109,14 +109,14 @@ class Add { template ::type* = nullptr> static result_type eval(left_type& first, const right_type& second) { using TiledArray::add_to; - return add_to(first, second); + return add_to(std::move(first), second); } template ::type* = nullptr> static result_type eval(const left_type& first, right_type& second) { using TiledArray::add_to; - return add_to(second, first); + return add_to(std::move(second), first); } template ::type* = nullptr> @@ -296,14 +296,14 @@ class ScalAdd { template ::type* = nullptr> result_type eval(left_type& first, const right_type& second) const { using TiledArray::add_to; - return add_to(first, second, factor_); + return add_to(std::move(first), second, factor_); } template ::type* = nullptr> result_type eval(const left_type& first, right_type& second) const { using TiledArray::add_to; - return add_to(second, 
first, factor_); + return add_to(std::move(second), first, factor_); } template ::type* = nullptr> @@ -315,7 +315,7 @@ class ScalAdd { template ::type* = nullptr> result_type eval(const ZeroTensor&, right_type& second) const { using TiledArray::scale_to; - return scale_to(second, factor_); + return scale_to(std::move(second), factor_); } template ::type* = nullptr> @@ -327,7 +327,7 @@ class ScalAdd { template ::type* = nullptr> result_type eval(left_type& first, const ZeroTensor&) const { using TiledArray::scale_to; - return scale_to(first, factor_); + return scale_to(std::move(first), factor_); } public: diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index 07dd9d19fd..c9d4cb0876 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -224,7 +224,8 @@ class BinaryWrapper { madness::future_to_ref_t r) { return BinaryWrapper_::operator()(l, r); }; - return detail::invoke(continuation, eval_left, eval_right); + return detail::invoke(continuation, std::move(eval_left), + std::move(eval_right)); } /// Evaluate lazy and non-lazy tiles @@ -249,7 +250,8 @@ class BinaryWrapper { R&& r) { return BinaryWrapper_::operator()(l, std::forward(r)); }; - return detail::invoke(continuation, eval_left, std::forward(right)); + return detail::invoke(continuation, std::move(eval_left), + std::forward(right)); } /// Evaluate non-lazy and lazy tiles @@ -273,7 +275,8 @@ class BinaryWrapper { [this](L&& l, madness::future_to_ref_t r) { return BinaryWrapper_::operator()(std::forward(l), r); }; - return detail::invoke(continuation, std::forward(left), eval_right); + return detail::invoke(continuation, std::forward(left), + std::move(eval_right)); } /// Evaluate two lazy-array tiles @@ -294,7 +297,9 @@ class BinaryWrapper { auto eval_left = invoke_cast(std::forward(left)); auto eval_right = invoke_cast(std::forward(right)); - if (perm_) return detail::invoke(op_, eval_left, eval_right, perm_); + if 
(perm_) + return detail::invoke(op_, std::move(eval_left), std::move(eval_right), + perm_); auto op_left = [this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); @@ -308,7 +313,7 @@ class BinaryWrapper { if (is_consumable_tile>::value && right.is_consumable()) return detail::invoke(op_right, eval_left, eval_right); - return detail::invoke(op_, eval_left, eval_right); + return detail::invoke(op_, std::move(eval_left), std::move(eval_right)); } template < @@ -325,7 +330,7 @@ class BinaryWrapper { if (is_consumable_tile>::value && left.is_consumable()) return op_.consume_left(eval_left, std::forward(right)); - return op_(eval_left, std::forward(right)); + return op_(std::move(eval_left), std::forward(right)); } template < @@ -342,7 +347,7 @@ class BinaryWrapper { if (is_consumable_tile>::value && left.is_consumable()) return op_.consume_left(eval_left, eval_right); - return op_(eval_left, eval_right); + return op_(std::move(eval_left), eval_right); } template < diff --git a/src/TiledArray/tile_op/mult.h b/src/TiledArray/tile_op/mult.h index 577ea94115..329bf96e58 100644 --- a/src/TiledArray/tile_op/mult.h +++ b/src/TiledArray/tile_op/mult.h @@ -130,11 +130,12 @@ class Mult { result_type eval(left_type& first, const right_type& second) const { if (!element_op_) { using TiledArray::mult_to; - return mult_to(first, second); + return mult_to(std::move(first), second); } else { // TODO figure out why this does not compiles!!! 
- // using TiledArray::inplace_binary; - // return inplace_binary(first, second, element_op_); + // using TiledArray::inplace_binary; + // return inplace_binary(std::move(first), second, + // element_op_); using TiledArray::binary; return binary(first, second, element_op_); } @@ -145,7 +146,7 @@ class Mult { result_type eval(const left_type& first, right_type& second) const { if (!element_op_) { using TiledArray::mult_to; - return mult_to(second, first); + return mult_to(std::move(second), first); } else { // WARNING: element_op_ might be noncommuting, so can't swap first // and second! for GEMM could optimize, but can't introspect // element_op_ @@ -340,14 +341,14 @@ class ScalMult { template ::type* = nullptr> result_type eval(left_type& first, const right_type& second) const { using TiledArray::mult_to; - return mult_to(first, second, factor_); + return mult_to(std::move(first), second, factor_); } template ::type* = nullptr> result_type eval(const left_type& first, right_type& second) const { using TiledArray::mult_to; - return mult_to(second, first, factor_); + return mult_to(std::move(second), first, factor_); } template ::type* = nullptr> diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h index a89770c5a7..854b592504 100644 --- a/src/TiledArray/tile_op/scal.h +++ b/src/TiledArray/tile_op/scal.h @@ -79,7 +79,7 @@ class Scal { template ::type* = nullptr> result_type eval(argument_type& arg) const { using TiledArray::scale_to; - return scale_to(arg, factor_); + return scale_to(std::move(arg), factor_); } public: diff --git a/src/TiledArray/tile_op/shift.h b/src/TiledArray/tile_op/shift.h index 316c8a323e..cb5a614e14 100644 --- a/src/TiledArray/tile_op/shift.h +++ b/src/TiledArray/tile_op/shift.h @@ -82,15 +82,13 @@ class Shift { template ::type> auto eval(argument_type& arg) const { TiledArray::ShiftTo shift_to; - shift_to(arg, range_shift_); - return arg; + return shift_to(std::move(arg), range_shift_); } template ::type> auto 
eval(argument_type&& arg) const { TiledArray::ShiftTo shift_to; - shift_to(arg, range_shift_); - return arg; + return shift_to(std::move(arg), range_shift_); } public: @@ -182,7 +180,7 @@ class ScalShift { using TiledArray::scale; using TiledArray::shift_to; result_type result = scale(arg, factor_, perm); - return shift_to(result, range_shift_); + return shift_to(std::move(result), range_shift_); } // Non-permuting tile evaluation functions @@ -195,16 +193,14 @@ class ScalShift { using TiledArray::scale; using TiledArray::shift_to; result_type result = scale(arg, factor_); - return shift_to(result, range_shift_); + return shift_to(std::move(result), range_shift_); } template typename std::enable_if::type eval(argument_type& arg) const { using TiledArray::scale_to; using TiledArray::shift_to; - scale_to(arg, factor_); - shift_to(arg, range_shift_); - return arg; + return shift_to(scale_to(std::move(arg), factor_), range_shift_); } public: diff --git a/src/TiledArray/tile_op/subt.h b/src/TiledArray/tile_op/subt.h index 3c8755f588..c3e9071976 100644 --- a/src/TiledArray/tile_op/subt.h +++ b/src/TiledArray/tile_op/subt.h @@ -107,14 +107,14 @@ class Subt { template ::type* = nullptr> static result_type eval(left_type& first, const right_type& second) { using TiledArray::subt_to; - return subt_to(first, second); + return subt_to(std::move(first), second); } template ::type* = nullptr> static result_type eval(const left_type& first, right_type& second) { using TiledArray::subt_to; - return subt_to(second, first, -1); + return subt_to(std::move(second), first, -1); } template ::type* = nullptr> @@ -126,7 +126,7 @@ class Subt { template ::type* = nullptr> static result_type eval(ZeroTensor, right_type& second) { using TiledArray::neg_to; - return neg_to(second); + return neg_to(std::move(second)); } template ::type* = nullptr> @@ -293,14 +293,14 @@ class ScalSubt { template ::type* = nullptr> result_type eval(left_type& first, const right_type& second) const { using 
TiledArray::subt_to; - return subt_to(first, second, factor_); + return subt_to(std::move(first), second, factor_); } template ::type* = nullptr> result_type eval(const left_type& first, right_type& second) const { using TiledArray::subt_to; - return subt_to(second, first, -factor_); + return subt_to(std::move(second), first, -factor_); } template ::type* = nullptr> @@ -312,7 +312,7 @@ class ScalSubt { template ::type* = nullptr> result_type eval(ZeroTensor, right_type& second) const { using TiledArray::scale_to; - return scale_to(second, -factor_); + return scale_to(std::move(second), -factor_); } template ::type* = nullptr> @@ -324,7 +324,7 @@ class ScalSubt { template ::type* = nullptr> result_type eval(left_type& first, ZeroTensor) const { using TiledArray::scale_to; - return scale_to(first, factor_); + return scale_to(std::move(first), factor_); } public: diff --git a/src/TiledArray/tile_op/tile_interface.h b/src/TiledArray/tile_op/tile_interface.h index ee8c1093a2..6ab18a2384 100644 --- a/src/TiledArray/tile_op/tile_interface.h +++ b/src/TiledArray/tile_op/tile_interface.h @@ -371,8 +371,13 @@ inline auto subt(const Arg& arg, const Scalar value, const Perm& perm) { /// \param result The result tile /// \param arg The argument to be subtracted from the result /// \return A tile that is equal to result[i] -= arg[i] -template +template < + typename Result, typename Arg, + typename = std::enable_if_t< + detail::has_member_function_subt_to_anyreturn_v>> inline decltype(auto) subt_to(Result&& result, const Arg& arg) { + static_assert(!std::is_const_v>, + "TA::subt_to(result, arg): result cannot be const"); return std::forward(result).subt_to(arg); } @@ -387,9 +392,14 @@ inline decltype(auto) subt_to(Result&& result, const Arg& arg) { /// \return A tile that is equal to (result -= arg) *= factor template < typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Result& subt_to(Result& result, const Arg& arg, const Scalar 
factor) { - return result.subt_to(arg, factor); + typename std::enable_if && + detail::has_member_function_subt_to_anyreturn_v< + Result&&, const Arg&, Scalar>>::type* = nullptr> +inline decltype(auto) subt_to(Result&& result, const Arg& arg, + const Scalar factor) { + static_assert(!std::is_const_v>, + "TA::subt_to(result,arg,factor): result cannot be const"); + return std::forward(result).subt_to(arg, factor); } /// Subtract constant scalar from the result tile @@ -401,9 +411,13 @@ inline Result& subt_to(Result& result, const Arg& arg, const Scalar factor) { /// \return A tile that is equal to (result -= arg) *= factor template < typename Result, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Result& subt_to(Result& result, const Scalar value) { - return result.subt_to(value); + typename std::enable_if && + detail::has_member_function_subt_to_anyreturn_v< + Result&&, Scalar>>::type* = nullptr> +inline decltype(auto) subt_to(Result&& result, const Scalar value) { + static_assert(!std::is_const_v>, + "TA::subt_to(result,value): result cannot be const"); + return std::forward(result).subt_to(value); } template @@ -483,9 +497,12 @@ inline auto mult(const Left& left, const Right& right, const Scalar factor, /// \param result The result tile to be multiplied /// \param arg The argument to be multiplied by the result /// \return A tile that is equal to result *= arg -template -inline Result& mult_to(Result& result, const Arg& arg) { - return result.mult_to(arg); +template < + typename Result, typename Arg, + typename = std::enable_if_t< + detail::has_member_function_mult_to_anyreturn_v>> +inline decltype(auto) mult_to(Result&& result, const Arg& arg) { + return std::forward(result).mult_to(arg); } /// Multiply and scale to the result tile @@ -498,9 +515,12 @@ inline Result& mult_to(Result& result, const Arg& arg) { /// \param factor The scaling factor /// \return A tile that is equal to (result *= arg) *= factor template >* = nullptr> -inline 
Result& mult_to(Result& result, const Arg& arg, const Scalar factor) { - return result.mult_to(arg, factor); + std::enable_if_t && + detail::has_member_function_mult_to_anyreturn_v< + Result&&, const Arg&, Scalar>>* = nullptr> +inline decltype(auto) mult_to(Result&& result, const Arg& arg, + const Scalar factor) { + return std::forward(result).mult_to(arg, factor); } template @@ -559,9 +579,12 @@ inline decltype(auto) binary(const Left& left, const Right& right, Op&& op, /// \param op An element-wise operation /// \return reference to \p left // clang-format on -template -inline Left& inplace_binary(Left& left, const Right& right, Op&& op) { - return left.inplace_binary(right, std::forward(op)); +template >> +inline decltype(auto) inplace_binary(Left&& left, const Right& right, Op&& op) { + return std::forward(left).inplace_binary(right, std::forward(op)); } template @@ -604,9 +627,13 @@ inline auto neg(const Arg& arg, const Perm& perm) { /// \tparam Result The result tile type /// \param result The result tile to be negated /// \return Reference to \p result -template -inline Result& neg_to(Result& result) { - return result.neg_to(); +template >> +inline decltype(auto) neg_to(Result&& result) { + static_assert(!std::is_const_v>, + "TA::neg_to(result): result cannot be const"); + return std::forward(result).neg_to(); } template @@ -674,9 +701,11 @@ inline auto conj(const Arg& arg, const Scalar factor, const Perm& perm) { /// \tparam Result The tile type /// \param result The tile to be conjugated /// \return A reference to `result` -template -inline Result& conj_to(Result& result) { - return result.conj_to(); +template >> +inline decltype(auto) conj_to(Result&& result) { + return std::forward(result).conj_to(); } /// In-place complex conjugate and scale a tile @@ -686,11 +715,13 @@ inline Result& conj_to(Result& result) { /// \param result The tile to be conjugated /// \param factor The scaling factor /// \return A reference to `result` -template >::type* = 
nullptr> -inline Result& conj_to(Result& result, const Scalar factor) { - return result.conj_to(factor); +template < + typename Result, typename Scalar, + typename std::enable_if && + detail::has_member_function_conj_to_anyreturn_v< + Result&&>>::type* = nullptr> +inline decltype(auto) conj_to(Result&& result, const Scalar factor) { + return std::forward(result).conj_to(factor); } template @@ -741,9 +772,12 @@ inline decltype(auto) unary(const Arg& arg, Op&& op, const Perm& perm) { /// \param op An element-wise operation /// \return \c reference to \p arg // clang-format on -template -inline Result& inplace_unary(Result& arg, Op&& op) { - return arg.inplace_unary(std::forward(op)); +template < + typename Result, typename Op, + typename = std::enable_if_t< + detail::has_member_function_inplace_unary_anyreturn_v>> +inline decltype(auto) inplace_unary(Result&& arg, Op&& op) { + return std::forward(arg).inplace_unary(std::forward(op)); } template diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index fb73512560..d4db1c4911 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -440,7 +440,9 @@ inline bool operator!=(const TiledRange& r1, const TiledRange& r2) { return !operator==(r1, r2); } -inline std::ostream& operator<<(std::ostream& out, const TiledRange& rng) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& out, const TiledRange& rng) { out << "(" << " tiles = " << rng.tiles_range() << ", elements = " << rng.elements_range() << " )"; diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index aa75916442..3bd3af1e54 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -481,7 +481,9 @@ class TiledRange1 { } } - friend std::ostream& operator<<(std::ostream&, const TiledRange1&); + template + friend std::basic_ostream& operator<<( + std::basic_ostream&, const TiledRange1&); // TiledRange1 data range_type range_; ///< the range of tile indices @@ 
-511,7 +513,9 @@ inline bool operator!=(const TiledRange1& r1, const TiledRange1& r2) { } /// TiledRange1 ostream operator -inline std::ostream& operator<<(std::ostream& out, const TiledRange1& rng) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& out, const TiledRange1& rng) { out << "( tiles = " << rng.tiles_range() << ", elements = " << rng.elements_range() << " )"; return out; diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index 80c6bd924f..f3842387c3 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -322,8 +322,6 @@ GENERATE_HAS_MEMBER_TYPE(mapped_type) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(size) GENERATE_HAS_MEMBER_FUNCTION(size) -GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(total_size) -GENERATE_HAS_MEMBER_FUNCTION(total_size) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(data) GENERATE_HAS_MEMBER_FUNCTION(data) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(empty) @@ -332,6 +330,12 @@ GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(clear) GENERATE_HAS_MEMBER_FUNCTION(clear) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(resize) +// Tensor-only +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(total_size) +GENERATE_HAS_MEMBER_FUNCTION(total_size) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(nbatch) +GENERATE_HAS_MEMBER_FUNCTION(nbatch) + GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(begin) GENERATE_HAS_MEMBER_FUNCTION(begin) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(end) @@ -378,6 +382,30 @@ GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(mult) GENERATE_HAS_MEMBER_FUNCTION(mult) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(mult_to) GENERATE_HAS_MEMBER_FUNCTION(mult_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(scale) +GENERATE_HAS_MEMBER_FUNCTION(scale) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(scale_to) +GENERATE_HAS_MEMBER_FUNCTION(scale_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(neg) +GENERATE_HAS_MEMBER_FUNCTION(neg) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(neg_to) +GENERATE_HAS_MEMBER_FUNCTION(neg_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(conj) 
+GENERATE_HAS_MEMBER_FUNCTION(conj) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(conj_to) +GENERATE_HAS_MEMBER_FUNCTION(conj_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(shift) +GENERATE_HAS_MEMBER_FUNCTION(shift) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(shift_to) +GENERATE_HAS_MEMBER_FUNCTION(shift_to) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(unary) +GENERATE_HAS_MEMBER_FUNCTION(unary) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(inplace_unary) +GENERATE_HAS_MEMBER_FUNCTION(inplace_unary) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(binary) +GENERATE_HAS_MEMBER_FUNCTION(binary) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(inplace_binary) +GENERATE_HAS_MEMBER_FUNCTION(inplace_binary) GENERATE_IS_FREE_FUNCTION_ANYRETURN(permute) diff --git a/src/TiledArray/util/logger.h b/src/TiledArray/util/logger.h index f33f96a35f..8993767539 100644 --- a/src/TiledArray/util/logger.h +++ b/src/TiledArray/util/logger.h @@ -63,7 +63,9 @@ struct TileOpsLogger : public Singleton> { return *this; } - TileOpsLogger& operator<<(std::ostream& (*func)(std::ostream&)) { + template + TileOpsLogger& operator<<(std::basic_ostream& (*func)( + std::basic_ostream&)) { *log << func; return *this; } diff --git a/src/TiledArray/util/vector.h b/src/TiledArray/util/vector.h index 6e69f523f4..d0e0651ecc 100644 --- a/src/TiledArray/util/vector.h +++ b/src/TiledArray/util/vector.h @@ -126,9 +126,9 @@ decltype(auto) operator-(const boost::container::small_vector& v1, namespace TiledArray { /// Vector output stream operator -template -inline std::ostream& operator<<(std::ostream& os, - const std::vector& vec) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const std::vector& vec) { TiledArray::detail::print_array(os, vec); return os; } @@ -139,9 +139,10 @@ namespace boost { namespace container { /// Vector output stream operator -template -inline std::ostream& operator<<( - std::ostream& os, const boost::container::small_vector& vec) { +template +inline std::basic_ostream& operator<<( + 
std::basic_ostream& os, + const boost::container::small_vector& vec) { TiledArray::detail::print_array(os, vec); return os; } diff --git a/src/TiledArray/utility.h b/src/TiledArray/utility.h index 24086eeb19..9e515c59dc 100644 --- a/src/TiledArray/utility.h +++ b/src/TiledArray/utility.h @@ -44,8 +44,9 @@ namespace detail { /// \param out A standard output stream /// \param a The array-like container to be printed /// \param n The number of elements in the array. -template -inline void print_array(std::ostream& out, const A& a, const std::size_t n) { +template +inline void print_array(std::basic_ostream& out, const A& a, + const std::size_t n) { out << "["; for (std::size_t i = 0; i < n; ++i) { out << a[i]; @@ -59,8 +60,8 @@ inline void print_array(std::ostream& out, const A& a, const std::size_t n) { /// \tparam A The array container type /// \param out A standard output stream /// \param a The array-like container to be printed -template -inline void print_array(std::ostream& out, const A& a) { +template +inline void print_array(std::basic_ostream& out, const A& a) { using std::size; print_array(out, a, size(a)); } diff --git a/src/TiledArray/val_array.h b/src/TiledArray/val_array.h index 9a8620443d..2c9bea5b74 100644 --- a/src/TiledArray/val_array.h +++ b/src/TiledArray/val_array.h @@ -464,16 +464,16 @@ class ValArray : private SizeArray { typename = std::enable_if_t>> void serialize(Archive& ar) { size_t sz = 0; - ar& sz; + ar & sz; init(sz); ar& madness::archive::wrap(data(), size()); } }; // class ValArray -template -inline std::ostream& operator<<(std::ostream& os, - const ValArray& val_array) { +template +inline std::basic_ostream& operator<<( + std::basic_ostream& os, const ValArray& val_array) { print_array(os, val_array); return os; } diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 70897d7ca1..ef5bd13dff 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -296,23 +296,6 @@ EigenSparseTile add(const EigenSparseTile& arg1, 
arg1.range()); } -//// dense_result[i] = dense_arg1[i] + sparse_arg2[i] -// template -// TiledArray::Tensor add(const TiledArray::Tensor& arg1, -// const EigenSparseTile& arg2) { -// TA_ASSERT(arg1.range() == arg2.range()); -// -// // this could be done better ... -// return TiledArray::add(arg1, static_cast>(arg2)); -// } -// -//// dense_result[i] = sparse_arg1[i] + dense_arg2[i] -// template -// TiledArray::Tensor add(const EigenSparseTile& arg1, -// const TiledArray::Tensor& arg2) { -// return TiledArray::add(arg2, static_cast>(arg1)); -// } - // dense_result[perm ^ i] = dense_arg1[i] + sparse_arg2[i] template < typename T, typename TagType, typename Perm, diff --git a/tests/tensor.cpp b/tests/tensor.cpp index 99b10fc7b7..0909004e00 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -31,6 +31,14 @@ const TensorFixture::range_type TensorFixture::r = make_range(81); BOOST_FIXTURE_TEST_SUITE(tensor_suite, TensorFixture, TA_UT_LABEL_SERIAL) +BOOST_AUTO_TEST_CASE(anatomy) { + // Tensor = Range + nbatch + shared_ptr to data + BOOST_CHECK(sizeof(TensorD) == sizeof(Range) + sizeof(size_t) + + sizeof(std::shared_ptr)); + // std::wcout << "sizeof(TensorD) = " << sizeof(TensorD) << " sizeof(TensorI) + // = " << sizeof(TensorN) << std::endl; +} + BOOST_AUTO_TEST_CASE(default_constructor) { // check constructor BOOST_REQUIRE_NO_THROW(TensorN x); @@ -294,18 +302,36 @@ BOOST_AUTO_TEST_CASE(binary_perm_constructor) { } BOOST_AUTO_TEST_CASE(clone) { - // check default constructor + // clone non-default-constructed TensorN tc; BOOST_CHECK(tc.empty()); BOOST_REQUIRE_NO_THROW(tc = t.clone()); - BOOST_CHECK_EQUAL(tc.empty(), t.empty()); - - // Check that range data is correct. 
BOOST_CHECK_NE(tc.data(), t.data()); BOOST_CHECK_EQUAL(tc.size(), t.size()); BOOST_CHECK_EQUAL(tc.range(), t.range()); BOOST_CHECK_EQUAL_COLLECTIONS(tc.begin(), tc.end(), t.begin(), t.end()); + + // clone default-constructed tensor + { + TensorN tnull; + BOOST_REQUIRE_NO_THROW(tc = tnull.clone()); + BOOST_CHECK_EQUAL(tc.empty(), tnull.empty()); + } + + // clone rvalue (e.g. temporary) tensor = move + { + TensorN t2 = t.clone(); + const auto t2_data = t2.data(); + BOOST_REQUIRE_NO_THROW(tc = std::move(t2).clone()); + BOOST_CHECK(t2.empty()); // t2 is moved-from state + BOOST_CHECK(!tc.empty()); + BOOST_CHECK_NE(tc.data(), t.data()); + BOOST_CHECK_EQUAL(tc.data(), t2_data); + BOOST_CHECK_EQUAL(tc.size(), t.size()); + BOOST_CHECK_EQUAL(tc.range(), t.range()); + BOOST_CHECK_EQUAL_COLLECTIONS(tc.begin(), tc.end(), t.begin(), t.end()); + } } BOOST_AUTO_TEST_CASE(copy_assignment_operator) { @@ -744,4 +770,17 @@ BOOST_AUTO_TEST_CASE(rebind) { static_assert(std::is_same_v, TensorD>); } +BOOST_AUTO_TEST_CASE(print) { + std::ostringstream oss; + std::wostringstream woss; + BOOST_REQUIRE_NO_THROW(oss << t); + BOOST_REQUIRE_NO_THROW(woss << t); + // std::cout << t; + decltype(t) tb(t.range(), decltype(t)::nbatches{2}); + rand_fill(1, tb.total_size(), tb.data()); + BOOST_REQUIRE_NO_THROW(oss << tb); + BOOST_REQUIRE_NO_THROW(woss << tb); + // std::cout << tb; +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index f0aa8be3e8..9faa155cc9 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -1316,4 +1316,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(rebind, ITensor, itensor_types) { Tensor>); } +BOOST_AUTO_TEST_CASE_TEMPLATE(print, ITensor, itensor_types) { + const auto& t = ToT(0); + + BOOST_REQUIRE_NO_THROW(std::cout << t); + // std::wcout << t; +} + BOOST_AUTO_TEST_SUITE_END()