From ba9580a62ceea0d4310b4ffbef210aaf6e7e9a0b Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 20 Nov 2024 07:10:48 -0800 Subject: [PATCH 01/96] test and quick fix for zero batch --- .../primitives/equivariant_tensor_product.py | 5 +---- .../primitives/symmetric_tensor_product.py | 6 ++---- .../cuequivariance_torch/primitives/tensor_product.py | 5 ++++- .../tests/operations/symmetric_contraction_test.py | 10 +++++----- .../tests/operations/tp_channel_wise_test.py | 6 ++++-- docs/changelog.md | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index cd0df78..113fc39 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import * +from typing import Optional, Union import torch @@ -70,9 +70,6 @@ def __init__( optimize_fallback: Optional[bool] = None, ): super().__init__() - cue.descriptors.fully_connected_tensor_product( - cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1") - ) if not isinstance(layout_in, tuple): layout_in = (layout_in,) * e.num_inputs if len(layout_in) != e.num_inputs: diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 9d67863..8738ac1 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -14,15 +14,13 @@ # limitations under the License. import logging import math -import warnings -from typing import * +from typing import Optional import torch import torch.fx import cuequivariance.segmented_tensor_product as stp import cuequivariance_torch as cuet -from cuequivariance import segmented_tensor_product as stp logger = logging.getLogger(__name__) @@ -341,7 +339,7 @@ def forward( f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" ) out = self.f(x1, x0, i0) - out = out.reshape(out.shape[0], -1) + out = out.reshape(out.shape[0], out.shape[1] * self.u) return out diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index b50db4f..b3c0a20 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -88,6 +88,9 @@ def forward(self, *args, use_fallback: Optional[bool] = None): Raises: RuntimeError: If `use_fallback` is `False` and either no CUDA kernel is available or the input tensor is not on CUDA. 
""" + if any(x.numel() == 0 for x in args): + use_fallback = True # Empty tensors are not supported by the CUDA kernel + if ( args and args[0].device.type == "cuda" @@ -285,7 +288,7 @@ def forward(self, *args): (math.prod(shape), arg.shape[-1]) ) if math.prod(arg.shape[:-1]) > 1 - else arg.reshape((1, arg.shape[-1])) + else arg.reshape((math.prod(arg.shape[:-1]), arg.shape[-1])) ) for arg in args ] diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 2ff42f3..62ba30e 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -30,7 +30,8 @@ @pytest.mark.parametrize("dtype", [torch.float64, torch.float32]) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("original_mace", [True, False]) -def test_symmetric_contraction(dtype, layout, original_mace): +@pytest.mark.parametrize("batch", [0, 32]) +def test_symmetric_contraction(dtype, layout, original_mace, batch): mul = 64 irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") irreps_out = mul * cue.Irreps("O3", "0e + 1o") @@ -48,12 +49,11 @@ def test_symmetric_contraction(dtype, layout, original_mace): original_mace=original_mace, ) - Z = 32 - x = torch.randn((Z, irreps_in.dim), dtype=dtype).cuda() - indices = torch.randint(0, 5, (Z,), dtype=torch.int32).cuda() + x = torch.randn((batch, irreps_in.dim), dtype=dtype).cuda() + indices = torch.randint(0, 5, (batch,), dtype=torch.int32).cuda() out = m(x, indices) - assert out.shape == (Z, irreps_out.dim) + assert out.shape == (batch, irreps_out.dim) def from64(shape: tuple[int, ...], data: str) -> torch.Tensor: diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index 155c73b..aa39e25 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -31,12 +31,14 @@ @pytest.mark.parametrize("irreps3", list_of_irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) +@pytest.mark.parametrize("batch", [0, 32]) def test_channel_wise( irreps1: cue.Irreps, irreps2: cue.Irreps, irreps3: cue.Irreps, layout: cue.IrrepsLayout, use_fallback: bool, + batch: int, ): m = cuet.ChannelWiseTensorProduct( irreps1, @@ -49,8 +51,8 @@ def test_channel_wise( dtype=torch.float64, ) - x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).cuda() - x2 = torch.randn(32, irreps2.dim, dtype=torch.float64).cuda() + x1 = torch.randn(batch, irreps1.dim, dtype=torch.float64).cuda() + x2 = torch.randn(batch, irreps2.dim, dtype=torch.float64).cuda() out1 = m(x1, x2, use_fallback=use_fallback) diff --git a/docs/changelog.md b/docs/changelog.md index e1636f8..e1a8414 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,3 +1,3 @@ -# Change Log +# Changelog ```{include} ../CHANGELOG.md From 0bfada92c66dd5e74ffe029fc459e64725b986ad Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 20 Nov 2024 07:17:40 -0800 Subject: [PATCH 02/96] trigger uniform 1d in test --- cuequivariance_torch/tests/operations/tp_channel_wise_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index aa39e25..64c08c3 100644 --- 
a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -20,7 +20,7 @@ from cuequivariance import descriptors list_of_irreps = [ - cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "2x1o + 5x0e + 2e + 1e + 1o"), cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), ] From fd097c67e0ebae227987bda1352baa8af88fbbaa Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 00:05:48 -0800 Subject: [PATCH 03/96] satisfy linter Signed-off-by: Mario Geiger --- .../experimental/mace/symmetric_contractions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py b/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py index 59c8e41..ebfc5c7 100644 --- a/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py +++ b/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import cache -from typing import * +from typing import Optional import numpy as np @@ -164,7 +164,7 @@ def U_matrix_real( assert isinstance(ir_out, cue.Irrep) if correlation == 4: - filter_ir_mid = frozenset([G(l, (-1) ** l) for l in range(11 + 1)]) + filter_ir_mid = frozenset([G(l, (-1) ** l) for l in range(11 + 1)]) # noqa E741 else: filter_ir_mid = None From 251fc4d6146e1ffa55e9287b23e294441f873d8e Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 06:19:43 -0800 Subject: [PATCH 04/96] from typing import --- .../cuequivariance_torch/primitives/tensor_product.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index b3c0a20..86f04a0 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -15,7 +15,7 @@ import logging import math import warnings -from typing import * +from typing import Optional, OrderedDict, Tuple import torch import torch.fx From 3498a32a2e124b499aa4ed7cfe842e83492db91a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 06:24:46 -0800 Subject: [PATCH 05/96] determine math_dtype earlier --- .../primitives/tensor_product.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 86f04a0..affa7ce 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -47,6 +47,9 @@ def __init__( super().__init__() self.descriptor = descriptor + if math_dtype is None: + math_dtype = torch.get_default_dtype() + try: self.f_cuda = _tensor_product_cuda(descriptor, device, math_dtype) except NotImplementedError as e: @@ -116,7 +119,7 @@ def forward(self, *args, use_fallback: Optional[bool] = None): def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], - math_dtype: Optional[torch.dtype], + math_dtype: torch.dtype, optimize_einsums: bool, ) -> torch.nn.Module: """ @@ -124,10 +127,6 @@ def _tensor_product_fx( - at least one input 
operand should have a batch dimension (ndim=2) - the output operand will have a batch dimension (ndim=2) """ - - if math_dtype is None: - math_dtype = torch.get_default_dtype() - descriptor = descriptor.remove_zero_paths() descriptor = descriptor.remove_empty_segments() @@ -313,7 +312,7 @@ def _sum(tensors, *, shape=None, like=None): def _tensor_product_cuda( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], - math_dtype: Optional[torch.dtype], + math_dtype: torch.dtype, ) -> torch.nn.Module: logger.debug(f"Starting search for a cuda kernel for {descriptor}") @@ -326,9 +325,6 @@ def _tensor_product_cuda( f" Got {descriptor.subscripts}." ) - if math_dtype is None: - math_dtype = torch.get_default_dtype() - if not torch.cuda.is_available(): raise NotImplementedError("CUDA is not available.") From 7f3cf05c1fe200078385a7fc8ce555035a1f9ed7 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 06:30:41 -0800 Subject: [PATCH 06/96] warning with pip commands --- .../cuequivariance_torch/primitives/tensor_product.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index affa7ce..4ac91af 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -57,6 +57,12 @@ def __init__( self.f_cuda = None except ImportError as e: logger.warning(f"CUDA implementation not available: {e}") + logger.warning( + "Did you forget to install the CUDA version of cuequivariance-ops-torch?\n" + "Install it with one of the following commands:\n" + "pip install cuequivariance-ops-torch-cu11\n" + "pip install cuequivariance-ops-torch-cu12" + ) self.f_cuda = None self.f_fx = _tensor_product_fx( From 262433557ca53b9d3a9dcae00bca8639b97f8d76 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 06:41:47 -0800 Subject: [PATCH 07/96] remove unused argument --- .../cuequivariance_torch/primitives/tensor_product.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 4ac91af..736c2aa 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -443,12 +443,10 @@ def forward( self, x0: torch.Tensor, x1: torch.Tensor, - b2: Optional[torch.Tensor] = None, ) -> torch.Tensor: x0, x1 = self._perm(x0, x1) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim - assert b2 is None shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1]) x0 = _reshape(x0, shape) @@ -504,13 +502,11 @@ def forward( x0: torch.Tensor, x1: torch.Tensor, x2: torch.Tensor, - b3: Optional[torch.Tensor] = None, ) -> torch.Tensor: x0, x1, x2 = self._perm(x0, x1, x2) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim assert x2.ndim >= 1, x2.ndim - assert b3 is None shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]) x0 = _reshape(x0, shape) From 91f7fce1457de45fe19547f329e0ceda86c0dd1a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 06:43:18 -0800 Subject: [PATCH 08/96] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb7a140..0c32d2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 
Latest Changes + +- Add support for empty batch dimension in `cuequivariance-torch`. + ## 0.1.0 (2024-11-18) - Beta version of cuEquivariance released. From 4401048d23e2028a7ec5ea0c1717fffae292075a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 07:08:49 -0800 Subject: [PATCH 09/96] list of inputs --- .../cuequivariance_torch/operations/linear.py | 2 +- .../operations/rotation.py | 7 ++----- .../operations/spherical_harmonics.py | 2 +- .../operations/symmetric_contraction.py | 2 +- .../operations/tp_channel_wise.py | 2 +- .../operations/tp_fully_connected.py | 2 +- .../primitives/equivariant_tensor_product.py | 12 ++++++------ .../primitives/symmetric_tensor_product.py | 4 ++-- .../primitives/tensor_product.py | 18 +++++++++--------- .../tests/operations/tp_channel_wise_test.py | 2 +- .../operations/tp_fully_connected_test.py | 4 +--- .../equivariant_tensor_product_test.py | 10 +++++----- .../tests/primitives/tensor_product_test.py | 4 ++-- docs/tutorials/etp.rst | 2 +- docs/tutorials/stp.rst | 2 +- 15 files changed, 35 insertions(+), 40 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index e3e34da..197f5ac 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -126,4 +126,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f(weight, x, use_fallback=use_fallback) + return self.f([weight, x], use_fallback=use_fallback) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py index a9c13b8..9fb4d86 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py @@ -96,10 +96,7 @@ def forward( encodings_alpha = encode_rotation_angle(alpha, self.lmax) return self.f( - encodings_gamma, - encodings_beta, - encodings_alpha, - x, + [encodings_gamma, encodings_beta, encodings_alpha, x], use_fallback=use_fallback, ) @@ -194,4 +191,4 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: """Apply the inversion layer.""" - return self.f(x) + return self.f([x]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index 7a8a6e2..bfd0163 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -55,6 +55,6 @@ def spherical_harmonics( math_dtype=x.dtype, optimize_fallback=optimize_fallback, ) - y = m(x) + y = m([x]) y = y.reshape(vectors.shape[:-1] + (y.shape[-1],)) return y diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index 9c02c19..35f1b56 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -188,4 +188,4 @@ def forward( weight = self.weight weight = weight.flatten(1) - return self.f(weight, x, indices=indices, use_fallback=use_fallback) + return self.f([weight, x], indices=indices, use_fallback=use_fallback) diff --git 
a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index fcd3643..76402e7 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -147,4 +147,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f(weight, x1, x2, use_fallback=use_fallback) + return self.f([weight, x1, x2], use_fallback=use_fallback) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index 4f1dcf4..f33c7f6 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -148,4 +148,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f(weight, x1, x2, use_fallback=use_fallback) + return self.f([weight, x1, x2], use_fallback=use_fallback) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 113fc39..18461b7 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Union +from typing import List, Optional, Union import torch @@ -41,7 +41,7 @@ class EquivariantTensorProduct(torch.nn.Module): >>> x1 = torch.ones(17, e.inputs[1].irreps.dim) >>> x2 = torch.ones(17, e.inputs[2].irreps.dim) >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul) - >>> tp(w, x1, x2) + >>> tp([w, x1, x2]) tensor([[0., 0., 0., 0., 0., 0.], ... [0., 0., 0., 0., 0., 0.]]) @@ -50,7 +50,7 @@ class EquivariantTensorProduct(torch.nn.Module): >>> w = torch.ones(3, e.inputs[0].irreps.dim) >>> indices = torch.randint(3, (17,)) - >>> tp(w, x1, x2, indices=indices) + >>> tp([w, x1, x2], indices=indices) tensor([[0., 0., 0., 0., 0., 0.], ... [0., 0., 0., 0., 0., 0.]]) @@ -136,14 +136,14 @@ def extra_repr(self) -> str: def forward( self, - *inputs: torch.Tensor, + inputs: List[torch.Tensor], indices: Optional[torch.Tensor] = None, use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ If ``indices`` is not None, the first input is indexed by ``indices``. 
""" - inputs: list[torch.Tensor] = list(inputs) + inputs: List[torch.Tensor] = list(inputs) assert len(inputs) == len(self.etp.inputs) for a, b in zip(inputs, self.etp.inputs): @@ -162,7 +162,7 @@ def forward( # TODO: at some point we will have kernel for this assert len(inputs) >= 1 inputs[0] = inputs[0][indices] - output = self.tp(*inputs, use_fallback=use_fallback) + output = self.tp(inputs, use_fallback=use_fallback) if self.symm_tp is not None: if len(inputs) == 1: diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 8738ac1..386d04a 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -109,7 +109,7 @@ def forward( use_fallback=use_fallback, ) if self.f0 is not None: - out += self.f0() + out += self.f0([]) return out @@ -368,6 +368,6 @@ def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: return sum( - f(x0[i0], *[x1] * (f.descriptor.num_operands - 2), use_fallback=True) + f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2), use_fallback=True) for f in self.fs ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 736c2aa..644c612 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -15,7 +15,7 @@ import logging import math import warnings -from typing import Optional, OrderedDict, Tuple +from typing import List, Optional, OrderedDict, Tuple import torch import torch.fx @@ -76,12 +76,12 @@ def __repr__(self): ) return f"TensorProduct({self.descriptor} {has_cuda_kernel})" - def forward(self, *args, use_fallback: Optional[bool] = None): + def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = None): r""" Perform the tensor product based on the specified descriptor. Args: - args (list of torch.Tensor): The input tensors. The number of input tensors should match the number of operands in the descriptor minus one. + inputs (list of torch.Tensor): The input tensors. The number of input tensors should match the number of operands in the descriptor minus one. Each input tensor should have a shape of ((batch,) operand_size), where `operand_size` corresponds to the size of each operand as defined in the tensor product descriptor. use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available and the input @@ -97,16 +97,16 @@ def forward(self, *args, use_fallback: Optional[bool] = None): Raises: RuntimeError: If `use_fallback` is `False` and either no CUDA kernel is available or the input tensor is not on CUDA. """ - if any(x.numel() == 0 for x in args): + if any(x.numel() == 0 for x in inputs): use_fallback = True # Empty tensors are not supported by the CUDA kernel if ( - args - and args[0].device.type == "cuda" + inputs + and inputs[0].device.type == "cuda" and self.f_cuda is not None and (use_fallback is not True) ): - return self.f_cuda(*args) + return self.f_cuda(*inputs) if use_fallback is False: if self.f_cuda is not None: @@ -119,7 +119,7 @@ def forward(self, *args, use_fallback: Optional[bool] = None): "The fallback method is used but it has not been optimized. 
" "Consider setting optimize_fallback=True when creating the TensorProduct module." ) - return self.f_fx(*args) + return self.f_fx(inputs) def _tensor_product_fx( @@ -278,7 +278,7 @@ def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProdu self.module = module self.descriptor = descriptor - def forward(self, *args): + def forward(self, args): for oid, arg in enumerate(args): torch._assert( arg.shape[-1] == self.descriptor.operands[oid].size, diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index 64c08c3..9540c73 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -62,7 +62,7 @@ def test_channel_wise( if layout == cue.mul_ir: d = d.add_or_transpose_modes("u,ui,j,uk+ijk") mfx = cuet.TensorProduct(d, math_dtype=torch.float64).cuda() - out2 = mfx(m.weight, x1, x2, use_fallback=True) + out2 = mfx([m.weight, x1, x2], use_fallback=True) torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index d00e6ba..64944fb 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -59,9 +59,7 @@ def test_fully_connected( d = d.add_or_transpose_modes("uvw,ui,vj,wk+ijk") mfx = cuet.TensorProduct(d, math_dtype=torch.float64).cuda() out2 = mfx( - m.weight.to(torch.float64), - x1.to(torch.float64), - x2.to(torch.float64), + [m.weight.to(torch.float64), x1.to(torch.float64), x2.to(torch.float64)], use_fallback=True, ).to(out1.dtype) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index aa1b0dd..0700a2c 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -84,11 +84,11 @@ def test_performance_cuda_vs_fx( ] for _ in range(10): - m(*inputs, use_fallback=False) - m(*inputs, use_fallback=True) + m(inputs, use_fallback=False) + m(inputs, use_fallback=True) def f(ufb: bool): - m(*inputs, use_fallback=ufb) + m(inputs, use_fallback=ufb) torch.cuda.synchronize() t0 = timeit.Timer(lambda: f(False)).timeit(number=10) @@ -130,7 +130,7 @@ def test_precision_cuda_vs_fx( device=device, math_dtype=math_dtype, ) - y0 = m(*inputs, use_fallback=False) + y0 = m(inputs, use_fallback=False) m = cuet.EquivariantTensorProduct( e, @@ -140,6 +140,6 @@ def test_precision_cuda_vs_fx( optimize_fallback=True, ) inputs = map(lambda x: x.to(torch.float64), inputs) - y1 = m(*inputs, use_fallback=True).to(dtype) + y1 = m(inputs, use_fallback=True).to(dtype) torch.testing.assert_close(y0, y1, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 1171f8b..53e8bfc 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -111,12 +111,12 @@ def test_primitive_tensor_product_cuda_vs_fx( m = cuet.TensorProduct( d, device=device, math_dtype=math_dtype, optimize_fallback=False ) - out1 = m(*inputs, use_fallback=False) + out1 = m(inputs, use_fallback=False) m = cuet.TensorProduct( d, 
device=device, math_dtype=torch.float64, optimize_fallback=False ) inputs_ = [inp.clone().to(torch.float64) for inp in inputs] - out2 = m(*inputs_, use_fallback=True) + out2 = m(inputs_, use_fallback=True) assert out1.shape[:-1] == torch.broadcast_shapes(*batches) assert out1.dtype == dtype diff --git a/docs/tutorials/etp.rst b/docs/tutorials/etp.rst index 1f2b20b..529eda8 100644 --- a/docs/tutorials/etp.rst +++ b/docs/tutorials/etp.rst @@ -94,6 +94,6 @@ We can execute an :class:`cuequivariance.EquivariantTensorProduct` with PyTorch. w = torch.randn(e.inputs[0].irreps.dim) x = torch.randn(e.inputs[1].irreps.dim) - module(w, x) + module([w, x]) Note that you have to specify the layout. If the layout specified is different from the one in the descriptor, the module will transpose the inputs/output to match the layout. diff --git a/docs/tutorials/stp.rst b/docs/tutorials/stp.rst index 2a098a1..b9516ee 100644 --- a/docs/tutorials/stp.rst +++ b/docs/tutorials/stp.rst @@ -112,7 +112,7 @@ Now we can execute the linear layer with random input and weight tensors. w = torch.randn(d.operands[0].size) x1 = torch.randn(3000, irreps1.dim) - x2 = linear_torch(w, x1) + x2 = linear_torch([w, x1]) assert x2.shape == (3000, irreps2.dim) From ad2db8d5a540ba2ca54020248395bd6ba6c821b1 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 07:12:23 -0800 Subject: [PATCH 10/96] add Fixed subtite --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c32d2c..0d75a9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## Latest Changes +### Fixed + - Add support for empty batch dimension in `cuequivariance-torch`. ## 0.1.0 (2024-11-18) From 889051ad16c421bc912cb6733b18665c72b91341 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 21 Nov 2024 07:13:37 -0800 Subject: [PATCH 11/96] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d75a9c..867796d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## Latest Changes +### Changed + +- `cuequivariance_torch.TensorProduct` and `cuequivariance_torch.EquivariantTensorProduct` now require lists of `torch.Tensor` as input. + ### Fixed - Add support for empty batch dimension in `cuequivariance-torch`. 
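The variadic-to-list API change introduced in [PATCH 09/96] (and recorded in the changelog hunks above) touches every `TensorProduct` and `EquivariantTensorProduct` call site. Below is a minimal sketch of the new calling convention, modeled on the doctest updated in `equivariant_tensor_product.py`; the descriptor, irreps, and batch size are illustrative choices, and the snippet reflects the API as of this point in the series:

```python
import torch

import cuequivariance as cue
import cuequivariance_torch as cuet

# Small fully connected tensor product descriptor (same call as the one
# removed from EquivariantTensorProduct.__init__ in PATCH 01); sizes are
# illustrative only.
e = cue.descriptors.fully_connected_tensor_product(
    cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1")
)
tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul)

w = torch.ones(17, e.inputs[0].irreps.dim)   # weight operand
x1 = torch.ones(17, e.inputs[1].irreps.dim)  # first input
x2 = torch.ones(17, e.inputs[2].irreps.dim)  # second input

# Before PATCH 09: out = tp(w, x1, x2)
# After PATCH 09: the operands are passed as a single list of torch.Tensor.
out = tp([w, x1, x2])
```
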
From bc6b405c7e23fd8ab18b2a935e4c4bfffdec0c88 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 3 Dec 2024 01:24:15 -0800 Subject: [PATCH 12/96] add test for torch.jit.script --- .../primitives/equivariant_tensor_product_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 110b3cf..d08af4b 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -154,3 +154,14 @@ def test_compile(): input1 = torch.randn(100, e.inputs[0].irreps.dim) input2 = torch.randn(100, e.inputs[1].irreps.dim) m_compile(input1, input2) + + +def test_script(): + e = cue.descriptors.symmetric_contraction( + cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] + ) + m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, optimize_fallback=False) + m_script = torch.jit.script(m) + input1 = torch.randn(100, e.inputs[0].irreps.dim) + input2 = torch.randn(100, e.inputs[1].irreps.dim) + m_script(input1, input2) From c8de1858760b8112d3fde2766110b089c9e1bdf0 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 3 Dec 2024 01:43:47 -0800 Subject: [PATCH 13/96] fix --- .../primitives/equivariant_tensor_product.py | 4 ++++ .../primitives/symmetric_tensor_product.py | 2 +- .../tests/primitives/equivariant_tensor_product_test.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 2381dca..ed0bbd1 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -174,6 +174,10 @@ def forward( if len(inputs) == 2: [x0, x1] = inputs if indices is None: + torch._assert( + x0.ndim == 2, + f"Expected x0 to have shape (batch, dim), got {x0.shape}", + ) if x0.shape[0] == 1: indices = torch.zeros( (x1.shape[0],), dtype=torch.int32, device=x1.device diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 386d04a..79d92a6 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -201,7 +201,7 @@ def forward( torch._assert( x0.ndim == 2, - f"Expected 2 dims (i0.max() + 1, x0_size), got {x0.ndim}", + f"Expected 2 dims (i0.max() + 1, x0_size), got shape {x0.shape}", ) shape = torch.broadcast_shapes(i0.shape, x1.shape[:-1]) i0 = i0.expand(shape).reshape((math.prod(shape),)) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 110b3cf..04d3aef 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -153,4 +153,4 @@ def test_compile(): m_compile = torch.compile(m, fullgraph=True) input1 = torch.randn(100, e.inputs[0].irreps.dim) input2 = torch.randn(100, e.inputs[1].irreps.dim) - m_compile(input1, input2) + m_compile([input1, input2]) From 
16e4450b2acbeec082b8d6d4b9080c5388467298 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 3 Dec 2024 01:49:47 -0800 Subject: [PATCH 14/96] remove keyword-only and import in the forward --- .../primitives/transpose.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 8f6546f..4c40036 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import * +from typing import Optional import torch import torch.fx @@ -59,7 +59,7 @@ def __repr__(self): return f"TransposeIrrepsLayout({self.source} -> {self.target})" def forward( - self, x: torch.Tensor, *, use_fallback: Optional[bool] = None + self, x: torch.Tensor, use_fallback: Optional[bool] = None ) -> torch.Tensor: r""" Perform the transposition. @@ -92,7 +92,7 @@ def __init__( if info is not None: try: - import cuequivariance_ops_torch + import cuequivariance_ops_torch # noqa: F401 except ImportError: self.f_cuda = None else: @@ -104,10 +104,10 @@ def __init__( self.f = torch.nn.Identity() def __repr__(self): - return f"TransposeSegments()" + return "TransposeSegments()" def forward( - self, x: torch.Tensor, *, use_fallback: Optional[bool] = None + self, x: torch.Tensor, use_fallback: Optional[bool] = None ) -> torch.Tensor: """ Perform the transposition of the input tensor using either a CUDA kernel or a PyTorch fallback. @@ -184,12 +184,16 @@ def _transpose_info( return torch.IntTensor(info).to(device=device) +try: + from cuequivariance_ops_torch import segmented_transpose +except ImportError: + pass + + class _transpose(torch.nn.Module): def __init__(self, info: torch.IntTensor): super().__init__() self.register_buffer("_info", info, persistent=False) def forward(self, x: torch.Tensor) -> torch.Tensor: - from cuequivariance_ops_torch import segmented_transpose - return segmented_transpose(x, self._info, True) From b2c4fbb8653fdceb5f60aedd2c73949483065741 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 4 Dec 2024 00:54:32 -0800 Subject: [PATCH 15/96] low lvl script tests --- .../primitives/symmetric_tensor_product.py | 8 +-- .../equivariant_tensor_product_test.py | 11 ---- .../tests/primitives/script_test.py | 66 +++++++++++++++++++ 3 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 cuequivariance_torch/tests/primitives/script_test.py diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 79d92a6..1417b67 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -138,6 +138,9 @@ def __init__( ): super().__init__() + if math_dtype is None: + math_dtype = torch.get_default_dtype() + _check_descriptors(descriptors) self.descriptors = descriptors @@ -258,13 +261,10 @@ def __init__( self, stps: list[stp.SegmentedTensorProduct], device: Optional[torch.device], - math_dtype: Optional[torch.dtype], + math_dtype: torch.dtype, ): super().__init__() - if math_dtype is None: - math_dtype = 
torch.get_default_dtype() - max_degree = max(d.num_operands - 2 for d in stps) if max_degree > 6: raise NotImplementedError("Correlation > 6 is not implemented.") diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index e040687..04d3aef 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -154,14 +154,3 @@ def test_compile(): input1 = torch.randn(100, e.inputs[0].irreps.dim) input2 = torch.randn(100, e.inputs[1].irreps.dim) m_compile([input1, input2]) - - -def test_script(): - e = cue.descriptors.symmetric_contraction( - cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] - ) - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, optimize_fallback=False) - m_script = torch.jit.script(m) - input1 = torch.randn(100, e.inputs[0].irreps.dim) - input2 = torch.randn(100, e.inputs[1].irreps.dim) - m_script([input1, input2]) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py new file mode 100644 index 0000000..8829041 --- /dev/null +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -0,0 +1,66 @@ +import torch + +import cuequivariance as cue +from cuequivariance_torch.primitives.symmetric_tensor_product import ( + CUDAKernel as SymmetricTensorProduct, +) +from cuequivariance_torch.primitives.tensor_product import ( + FusedTensorProductOp3, + TensorProductUniform3x1d, +) + + +def test_script_symmetric_contraction(): + ds = cue.descriptors.symmetric_contraction( + 32 * cue.Irreps("SO3", "0 + 1"), 32 * cue.Irreps("SO3", "0 + 1"), [1, 2, 3] + ).ds + + batch = 12 + x0 = torch.randn(3, ds[0].operands[0].size, device="cuda:0", dtype=torch.float32) + i0 = torch.zeros(batch, device="cuda:0", dtype=torch.int32) + x1 = torch.randn( + batch, ds[0].operands[1].size, device="cuda:0", dtype=torch.float32 + ) + + module = SymmetricTensorProduct(ds, torch.device("cuda:0"), torch.float32) + module = torch.jit.script(module) + + assert module(x0, i0, x1).shape == (batch, ds[0].operands[-1].size) + + +def test_script_fused_tp(): + d = ( + cue.descriptors.full_tensor_product( + cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") + ) + .d.flatten_coefficient_modes() + .squeeze_modes("v") + ) + + batch = 12 + x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + + module = FusedTensorProductOp3(d, (0, 1), torch.device("cuda:0"), torch.float32) + module = torch.jit.script(module) + + assert module(x0, x1).shape == (batch, d.operands[2].size) + + +def test_script_uniform_tp(): + d = ( + cue.descriptors.full_tensor_product( + cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") + ) + .d.flatten_coefficient_modes() + .squeeze_modes("v") + ) + + batch = 12 + x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + + module = TensorProductUniform3x1d(d, torch.device("cuda:0"), torch.float32) + module = torch.jit.script(module) + + assert module(x0, x1).shape == (batch, d.operands[2].size) From 4669a86f8da4cbe3bc0cd328731deee1ba9bd237 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 4 Dec 2024 01:07:19 -0800 Subject: [PATCH 16/96] TensorProduct working with script() 
Signed-off-by: Boris Fomitchev --- .../primitives/equivariant_tensor_product.py | 1 - .../primitives/symmetric_tensor_product.py | 8 +- .../primitives/tensor_product.py | 163 ++++++++++++------ .../tests/primitives/tensor_product_test.py | 7 +- 4 files changed, 118 insertions(+), 61 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index ed0bbd1..4c540b6 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -145,7 +145,6 @@ def forward( """ If ``indices`` is not None, the first input is indexed by ``indices``. """ - inputs: List[torch.Tensor] = list(inputs) assert len(inputs) == len(self.etp.inputs) for a, dim in zip(inputs, self.operands_dims): diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 79d92a6..7011d71 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -203,7 +203,7 @@ def forward( x0.ndim == 2, f"Expected 2 dims (i0.max() + 1, x0_size), got shape {x0.shape}", ) - shape = torch.broadcast_shapes(i0.shape, x1.shape[:-1]) + shape = broadcast_shapes(i0.shape, x1.shape[:-1]) i0 = i0.expand(shape).reshape((math.prod(shape),)) x1 = x1.expand(shape + (x1.shape[-1],)).reshape( (math.prod(shape), x1.shape[-1]) @@ -335,9 +335,9 @@ def forward( i0 = i0.to(torch.int32) x0 = x0.reshape(x0.shape[0], x0.shape[1] // self.u, self.u) x1 = x1.reshape(x1.shape[0], x1.shape[1] // self.u, self.u) - logger.debug( - f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" - ) + # logger.debug( + # f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" + # ) out = self.f(x1, x0, i0) out = out.reshape(out.shape[0], out.shape[1] * self.u) return out diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 1dceb9e..10b7080 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -19,11 +19,48 @@ import torch import torch.fx - +from torch.jit import Final from cuequivariance import segmented_tensor_product as stp logger = logging.getLogger(__name__) +def prod(numbers: List[int]): + product = 1 + for num in numbers: + product *= num + return product + +def broadcast_shapes(shapes: List[List[int]]): + if torch.jit.is_scripting(): + max_len = 0 + for shape in shapes: + if isinstance(shape, int): + if max_len < 1: + max_len = 1 + elif isinstance(shape, (tuple, list)): + s = len(shape) + if max_len < s: + max_len = s + result = [1] * max_len + for shape in shapes: + if isinstance(shape, int): + shape = (shape,) + if isinstance(shape, (tuple, list)): + for i in range(-1, -1 - len(shape), -1): + if shape[i] < 0: + raise RuntimeError("Trying to create tensor with negative dimension ({}): ({})" + .format(shape[i], shape[i])) + if shape[i] == 1 or shape[i] == result[i]: + continue + if result[i] != 1: + raise RuntimeError("Shape mismatch: objects cannot be broadcast to a 
single shape") + result[i] = shape[i] + else: + raise RuntimeError("Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", shape) + return torch.Size(result) + else: + return torch.functional.broadcast_shapes(*shapes) + class TensorProduct(torch.nn.Module): """ @@ -36,25 +73,30 @@ class TensorProduct(torch.nn.Module): optimize_fallback (bool, optional): If `True`, the fallback method is optimized. If `False`, the fallback method is used without optimization. """ + num_operands: Final[int] + def __init__( self, descriptor: stp.SegmentedTensorProduct, *, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() self.descriptor = descriptor - + # for script() + self.num_operands = descriptor.num_operands if math_dtype is None: math_dtype = torch.get_default_dtype() try: - self.f_cuda = _tensor_product_cuda(descriptor, device, math_dtype) + self.f_cuda3, self.f_cuda4 = _tensor_product_cuda(descriptor, device, math_dtype) except NotImplementedError as e: logger.info(f"CUDA implementation not available: {e}") - self.f_cuda = None + self.f_cuda3 = None + self.f_cuda4 = None except ImportError as e: logger.warning(f"CUDA implementation not available: {e}") logger.warning( @@ -63,16 +105,20 @@ def __init__( "pip install cuequivariance-ops-torch-cu11\n" "pip install cuequivariance-ops-torch-cu12" ) - self.f_cuda = None + self.f_cuda3 = None + self.f_cuda4 = None - self.f_fx = _tensor_product_fx( - descriptor, device, math_dtype, optimize_fallback is True - ) + if use_fallback == True: + self.f_fx = _tensor_product_fx( + descriptor, device, math_dtype, optimize_fallback is True + ) + else: + self.f_fx = None self._optimize_fallback = optimize_fallback def __repr__(self): has_cuda_kernel = ( - "(with CUDA kernel)" if self.f_cuda is not None else "(without CUDA kernel)" + "(with CUDA kernel)" if self.f_cuda3 is not None or self.f_cuda4 is not None else "(without CUDA kernel)" ) return f"TensorProduct({self.descriptor} {has_cuda_kernel})" @@ -103,13 +149,15 @@ def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = Non if ( inputs and inputs[0].device.type == "cuda" - and self.f_cuda is not None and (use_fallback is not True) ): - return self.f_cuda(*inputs) + if self.f_cuda3 is not None: + return self.f_cuda3(inputs[0], inputs[1]) + else: + return self.f_cuda4(inputs[0], inputs[1], inputs[2]) if use_fallback is False: - if self.f_cuda is not None: + if self.f_cuda3 is not None and self.f_cuda4 is not None: raise RuntimeError("CUDA kernel available but input is not on CUDA") else: raise RuntimeError("No CUDA kernel available") @@ -119,6 +167,8 @@ def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = Non "The fallback method is used but it has not been optimized. " "Consider setting optimize_fallback=True when creating the TensorProduct module." 
) + if self.f_fx is None: + raise RuntimeError("No fallback method available") return self.f_fx(inputs) @@ -190,7 +240,7 @@ def _tensor_product_fx( seg_shape = descriptor.get_segment_shape(-1, path) outputs += [ out.reshape( - out.shape[: out.ndim - len(seg_shape)] + (math.prod(seg_shape),) + out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),) ) ] @@ -206,7 +256,7 @@ def _tensor_product_fx( for out, path in zip(outputs, descriptor.paths) if path.indices[-1] == i ], - shape=batch_shape + (math.prod(descriptor.operands[-1][i]),), + shape=batch_shape + (prod(descriptor.operands[-1][i]),), like=outputs[0], ) for i in range(descriptor.operands[-1].num_segments) @@ -252,7 +302,7 @@ def __init__(self, descriptor: stp.SegmentedTensorProduct): ) def forward(self, *args): - shape = torch.broadcast_shapes(*[arg.shape[:-1] for arg in args]) + shape = broadcast_shapes([arg.shape[:-1] for arg in args]) output = torch.zeros( shape + (descriptor.operands[-1].size,), device=device, @@ -278,22 +328,23 @@ def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProdu self.module = module self.descriptor = descriptor - def forward(self, args): - for oid, arg in enumerate(args): - torch._assert( - arg.shape[-1] == self.descriptor.operands[oid].size, - "input shape[-1] does not match operand size", - ) + def forward(self, args:List[torch.Tensor]): + if not torch.jit.is_scripting(): + for oid, arg in enumerate(args): + torch._assert( + arg.shape[-1] == self.descriptor.operands[oid].size, + "input shape[-1] does not match operand size", + ) - shape = torch.broadcast_shapes(*[arg.shape[:-1] for arg in args]) + shape = broadcast_shapes([arg.shape[:-1] for arg in args]) args = [ ( arg.expand(shape + (arg.shape[-1],)).reshape( - (math.prod(shape), arg.shape[-1]) + (prod(shape), arg.shape[-1]) ) - if math.prod(arg.shape[:-1]) > 1 - else arg.reshape((math.prod(arg.shape[:-1]), arg.shape[-1])) + if prod(arg.shape[:-1]) > 1 + else arg.reshape((prod(arg.shape[:-1]), arg.shape[-1])) ) for arg in args ] @@ -353,9 +404,9 @@ def _tensor_product_cuda( operand_num_segments=[o.num_segments for o in d.operands], ): if descriptor.num_operands == 3: - return TensorProductUniform3x1d(d, device, math_dtype) + return TensorProductUniform3x1d(d, device, math_dtype), None else: - return TensorProductUniform4x1d(d, device, math_dtype) + return None, TensorProductUniform4x1d(d, device, math_dtype) supported_targets = [ stp.Subscripts(subscripts) @@ -385,18 +436,18 @@ def _tensor_product_cuda( ) if descriptor.num_operands == 3: - return FusedTensorProductOp3(descriptor, perm[:2], device, math_dtype) + return FusedTensorProductOp3(descriptor, perm[:2], device, math_dtype), None elif descriptor.num_operands == 4: - return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) - + return None, FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) -def _reshape(x: torch.Tensor, leading_shape: tuple[int, ...]) -> torch.Tensor: + +def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: # Make x have shape (Z, x.shape[-1]) or (x.shape[-1],) - if math.prod(leading_shape) > 1 and math.prod(x.shape[:-1]) == 1: + if prod(leading_shape) > 1 and prod(x.shape[:-1]) == 1: return x.reshape((x.shape[-1],)) else: return x.expand(leading_shape + (x.shape[-1],)).reshape( - (math.prod(leading_shape), x.shape[-1]) + (prod(leading_shape), x.shape[-1]) ) @@ -434,7 +485,7 @@ def __init__( ).to(device=device) def __repr__(self) -> str: - return f"TensorProductCUDA({self.descriptor} (output last 
operand))" + return f"FusedTensorProductOp3({self.descriptor} (output last operand))" def forward( self, @@ -445,13 +496,14 @@ def forward( assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim - shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1]) + shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1]]) x0 = _reshape(x0, shape) x1 = _reshape(x1, shape) - logger.debug( - f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" - ) + if not torch.jit.is_scripting(): + logger.debug( + f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" + ) out = self._f(x0, x1) @@ -492,7 +544,7 @@ def __init__( ).to(device=device) def __repr__(self) -> str: - return f"TensorProductCUDA({self.descriptor} (output last operand))" + return f"FusedTensorProductOp4({self.descriptor} (output last operand))" def forward( self, @@ -505,14 +557,15 @@ def forward( assert x1.ndim >= 1, x1.ndim assert x2.ndim >= 1, x2.ndim - shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]) + shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]]) x0 = _reshape(x0, shape) x1 = _reshape(x1, shape) x2 = _reshape(x2, shape) - logger.debug( - f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" - ) + if not torch.jit.is_scripting(): + logger.debug( + f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" + ) out = self._f(x0, x1, x2) @@ -546,13 +599,13 @@ def __init__( ).to(device=device) def __repr__(self): - return f"TensorProductCUDA({self.descriptor} (output last operand))" + return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" - def forward(self, x0, x1): + def forward(self, x0:torch.Tensor, x1:torch.Tensor): assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim - shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1]) + shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1]]) x0 = _reshape(x0, shape) x1 = _reshape(x1, shape) @@ -561,9 +614,10 @@ def forward(self, x0, x1): if x1.ndim == 1: x1 = x1.unsqueeze(0) - logger.debug( - f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" - ) + if not torch.jit.is_scripting(): + logger.debug( + f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" + ) out = self._f(x0, x1) @@ -597,14 +651,14 @@ def __init__( ).to(device=device) def __repr__(self): - return f"TensorProductCUDA({self.descriptor} (output last operand))" + return f"TensorProductUniform4x1d({self.descriptor} (output last operand))" def forward(self, x0, x1, x2): assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim assert x2.ndim >= 1, x2.ndim - shape = torch.broadcast_shapes(x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]) + shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]]) x0 = _reshape(x0, shape) x1 = _reshape(x1, shape) x2 = _reshape(x2, shape) @@ -616,9 +670,10 @@ def forward(self, x0, x1, x2): if x2.ndim == 1: x2 = x2.unsqueeze(0) - logger.debug( - f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" - ) + if not torch.jit.is_scripting(): + logger.debug( + f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" + ) out = self._f(x0, x1, x2) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py 
index 53e8bfc..dc6c2e9 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -111,9 +111,11 @@ def test_primitive_tensor_product_cuda_vs_fx( m = cuet.TensorProduct( d, device=device, math_dtype=math_dtype, optimize_fallback=False ) - out1 = m(inputs, use_fallback=False) + m = torch.jit.script(m) + out1 = m(inputs) + m = cuet.TensorProduct( - d, device=device, math_dtype=torch.float64, optimize_fallback=False + d, device=device, math_dtype=torch.float64, use_fallback=True, optimize_fallback=False ) inputs_ = [inp.clone().to(torch.float64) for inp in inputs] out2 = m(inputs_, use_fallback=True) @@ -134,3 +136,4 @@ def test_primitive_tensor_product_cuda_vs_fx( for g1, g2 in zip(double_grad1, double_grad2): torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) + From dc9d5b0e77164ad8e968742deac70fccdfa1fe8c Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 4 Dec 2024 01:26:41 -0800 Subject: [PATCH 17/96] add 4 operands tests --- .../tests/primitives/script_test.py | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py index 8829041..44e880f 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -6,7 +6,9 @@ ) from cuequivariance_torch.primitives.tensor_product import ( FusedTensorProductOp3, + FusedTensorProductOp4, TensorProductUniform3x1d, + TensorProductUniform4x1d, ) @@ -28,7 +30,7 @@ def test_script_symmetric_contraction(): assert module(x0, i0, x1).shape == (batch, ds[0].operands[-1].size) -def test_script_fused_tp(): +def test_script_fused_tp_3(): d = ( cue.descriptors.full_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") @@ -47,7 +49,28 @@ def test_script_fused_tp(): assert module(x0, x1).shape == (batch, d.operands[2].size) -def test_script_uniform_tp(): +def test_script_fused_tp_4(): + d = ( + cue.descriptors.fully_connected_tensor_product( + cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1"), cue.Irreps("SO3", "32x1") + ) + .d.flatten_coefficient_modes() + .squeeze_modes("v") + .permute_operands([1, 2, 0, 3]) + ) + + batch = 12 + x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + x2 = torch.randn(batch, d.operands[2].size, device="cuda:0", dtype=torch.float32) + + module = FusedTensorProductOp4(d, (0, 1, 2), torch.device("cuda:0"), torch.float32) + module = torch.jit.script(module) + + assert module(x0, x1, x2).shape == (batch, d.operands[3].size) + + +def test_script_uniform_tp_3(): d = ( cue.descriptors.full_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") @@ -64,3 +87,23 @@ def test_script_uniform_tp(): module = torch.jit.script(module) assert module(x0, x1).shape == (batch, d.operands[2].size) + + +def test_script_uniform_tp_4(): + d = ( + cue.descriptors.channelwise_tensor_product( + cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1"), cue.Irreps("SO3", "32x1") + ) + .d.flatten_coefficient_modes() + .squeeze_modes("v") + ) + + batch = 12 + x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + x2 = torch.randn(batch, d.operands[2].size, device="cuda:0", dtype=torch.float32) + + module = 
TensorProductUniform4x1d(d, torch.device("cuda:0"), torch.float32) + module = torch.jit.script(module) + + assert module(x0, x1, x2).shape == (batch, d.operands[3].size) From 334b4604dd506537155de9bb49d4482ef2e4c2a8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 4 Dec 2024 20:12:41 -0800 Subject: [PATCH 18/96] Unit tests run Signed-off-by: Boris Fomitchev --- .../layers/tp_conv_fully_connected.py | 6 + .../cuequivariance_torch/operations/linear.py | 13 +- .../operations/rotation.py | 17 +- .../operations/spherical_harmonics.py | 6 + .../operations/symmetric_contraction.py | 13 +- .../operations/tp_channel_wise.py | 13 +- .../operations/tp_fully_connected.py | 12 +- .../primitives/equivariant_tensor_product.py | 152 +++++++++++------- .../primitives/symmetric_tensor_product.py | 81 ++++------ .../primitives/tensor_product.py | 125 ++++++-------- .../primitives/transpose.py | 43 ++--- .../equivariant_tensor_product_test.py | 48 ++++-- .../symmetric_tensor_product_test.py | 15 +- .../tests/primitives/tensor_product_test.py | 2 +- .../tests/primitives/transpose_test.py | 4 +- 15 files changed, 286 insertions(+), 264 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py index 3d842ea..e4b3a58 100644 --- a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py @@ -56,6 +56,10 @@ class FullyConnectedTensorProductConv(nn.Module): mlp_channels (Sequence of int, optional): A sequence of integers defining the number of neurons in each layer in MLP before the output layer. If None, no MLP will be added. The input layer contains edge embeddings and node scalar features. Defaults to None. mlp_activation (``nn.Module`` or Sequence of ``nn.Module``, optional): A sequence of functions to be applied in between linear layers in MLP, e.g., ``nn.Sequential(nn.ReLU(), nn.Dropout(0.4))``. Defaults to ``nn.GELU()``. layout (IrrepsLayout, optional): The layout of the input and output irreps. Default is ``cue.mul_ir`` which is the layout corresponding to e3nn. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. 
Examples: >>> in_irreps = cue.Irreps("O3", "4x0e + 4x1o") @@ -121,6 +125,7 @@ def __init__( mlp_channels: Optional[Sequence[int]] = None, mlp_activation: Union[nn.Module, Sequence[nn.Module], None] = nn.GELU(), layout: cue.IrrepsLayout = None, # e3nn_compat_mode + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -141,6 +146,7 @@ def __init__( out_irreps, layout=self.layout, shared_weights=False, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index 197f5ac..977ff2d 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -32,6 +32,10 @@ class Linear(torch.nn.Module): layout (IrrepsLayout, optional): The layout of the irreducible representations, by default ``cue.mul_ir``. This is the layout used in the e3nn library. shared_weights (bool, optional): Whether to use shared weights, by default True. internal_weights (bool, optional): Whether to use internal weights, by default True if shared_weights is True, otherwise False. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. """ def __init__( @@ -47,6 +51,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -84,6 +89,7 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) @@ -94,8 +100,6 @@ def forward( self, x: torch.Tensor, weight: Optional[torch.Tensor] = None, - *, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ Forward pass of the linear layer. @@ -103,9 +107,6 @@ def forward( Args: x (torch.Tensor): The input tensor. weight (torch.Tensor, optional): The weight tensor. If None, the internal weight tensor is used. - use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. Returns: torch.Tensor: The output tensor after applying the linear transformation. 
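With this change the CUDA-vs-fallback decision for `cuet.Linear` is fixed once at construction time instead of per call. Below is a minimal usage sketch; the irreps, shapes, and the positional (irreps_in, irreps_out) argument order are illustrative assumptions rather than text from the patch.

```python
import torch
import cuequivariance as cue
import cuequivariance_torch as cuet

irreps_in = cue.Irreps("O3", "8x0e + 8x1o")
irreps_out = cue.Irreps("O3", "16x0e + 8x1o")

# Assumed positional order (irreps_in, irreps_out); use_fallback is now a constructor flag.
lin = cuet.Linear(
    irreps_in,
    irreps_out,
    layout=cue.mul_ir,
    device="cuda",
    dtype=torch.float32,
    use_fallback=True,        # force the pure-PyTorch fallback path
    optimize_fallback=True,
)

x = torch.randn(32, irreps_in.dim, device="cuda", dtype=torch.float32)
y = lin(x)                    # forward() no longer accepts a use_fallback keyword
assert y.shape == (32, irreps_out.dim)
```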
@@ -126,4 +127,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f([weight, x], use_fallback=use_fallback) + return self.f([weight, x]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py index 9fb4d86..9c03468 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py @@ -40,6 +40,7 @@ def __init__( layout_out: Optional[cue.IrrepsLayout] = None, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -60,6 +61,7 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) @@ -69,8 +71,6 @@ def forward( beta: torch.Tensor, alpha: torch.Tensor, x: torch.Tensor, - *, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ Forward pass of the rotation layer. @@ -80,9 +80,6 @@ def forward( beta (torch.Tensor): The beta angles. Second rotation around the x-axis. alpha (torch.Tensor): The alpha angles. Third rotation around the y-axis. x (torch.Tensor): The input tensor. - use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. Returns: torch.Tensor: The rotated tensor. @@ -97,7 +94,6 @@ def forward( return self.f( [encodings_gamma, encodings_beta, encodings_alpha, x], - use_fallback=use_fallback, ) @@ -159,6 +155,11 @@ class Inversion(torch.nn.Module): Args: irreps (Irreps): The irreducible representations of the tensor to invert. layout (IrrepsLayout, optional): The memory layout of the tensor, ``cue.ir_mul`` is preferred. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. 
""" def __init__( @@ -170,6 +171,8 @@ def __init__( layout_out: Optional[cue.IrrepsLayout] = None, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, + optimize_fallback: Optional[bool] = None, ): super().__init__() (irreps,) = default_irreps(irreps) @@ -187,6 +190,8 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=optimize_fallback, ) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index bfd0163..b2ecc93 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -25,6 +25,7 @@ def spherical_harmonics( ls: list[int], vectors: torch.Tensor, normalize: bool = True, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ) -> torch.Tensor: r"""Compute the spherical harmonics of the input vectors. @@ -33,6 +34,10 @@ def spherical_harmonics( ls (list of int): List of spherical harmonic degrees. vectors (torch.Tensor): Input vectors of shape (..., 3). normalize (bool, optional): Whether to normalize the input vectors. Defaults to True. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Returns: @@ -53,6 +58,7 @@ def spherical_harmonics( layout=cue.ir_mul, device=x.device, math_dtype=x.dtype, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) y = m([x]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index 35f1b56..fac5739 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -38,6 +38,10 @@ class SymmetricContraction(torch.nn.Module): layout (IrrepsLayout, optional): The layout of the input and output irreps. If not provided, a default layout is used. math_dtype (torch.dtype, optional): The data type for mathematical operations. If not specified, the default data type from the torch environment is used. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. 
Examples: >>> irreps_in = cue.Irreps("O3", "32x0e + 32x1o") @@ -102,6 +106,7 @@ def __init__( dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, original_mace: bool = False, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -147,6 +152,7 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype or dtype, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) @@ -160,8 +166,6 @@ def forward( self, x: torch.Tensor, indices: torch.Tensor, - *, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ Perform the forward pass of the symmetric contraction operation. @@ -170,9 +174,6 @@ def forward( x (torch.Tensor): The input tensor. It should have shape (..., irreps_in.dim). indices (torch.Tensor): The index of the weight to use for each batch element. It should have shape (...). - use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. Returns: torch.Tensor: The output tensor. It has shape (batch, irreps_out.dim). @@ -188,4 +189,4 @@ def forward( weight = self.weight weight = weight.flatten(1) - return self.f([weight, x], indices=indices, use_fallback=use_fallback) + return self.f([weight, x], indices=indices) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index 76402e7..169a248 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -33,6 +33,10 @@ class ChannelWiseTensorProduct(torch.nn.Module): layout (IrrepsLayout, optional): The layout of the input and output irreps. Default is ``cue.mul_ir`` which is the layout corresponding to e3nn. shared_weights (bool, optional): Whether to share weights across the batch dimension. Default is True. internal_weights (bool, optional): Whether to create module parameters for weights. Default is None. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Note: In e3nn there was a irrep_normalization and path_normalization parameters. @@ -54,6 +58,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -95,6 +100,7 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) @@ -110,8 +116,6 @@ def forward( x1: torch.Tensor, x2: torch.Tensor, weight: Optional[torch.Tensor] = None, - *, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ Perform the forward pass of the fully connected tensor product operation. @@ -122,9 +126,6 @@ def forward( weight (torch.Tensor, optional): Weights for the tensor product. It should have the shape (batch_size, weight_numel) if shared_weights is False, or (weight_numel,) if shared_weights is True. 
If None, the internal weights are used. - use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. Returns: torch.Tensor: @@ -147,4 +148,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f([weight, x1, x2], use_fallback=use_fallback) + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index f33c7f6..fd7706f 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -33,6 +33,10 @@ class FullyConnectedTensorProduct(torch.nn.Module): layout (IrrepsLayout, optional): The layout of the input and output irreps. Default is ``cue.mul_ir`` which is the layout corresponding to e3nn. shared_weights (bool, optional): Whether to share weights across the batch dimension. Default is True. internal_weights (bool, optional): Whether to create module parameters for weights. Default is None. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Note: In e3nn there was a irrep_normalization and path_normalization parameters. @@ -54,6 +58,7 @@ def __init__( device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -111,8 +116,6 @@ def forward( x1: torch.Tensor, x2: torch.Tensor, weight: Optional[torch.Tensor] = None, - *, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ Perform the forward pass of the fully connected tensor product operation. @@ -123,9 +126,6 @@ def forward( weight (torch.Tensor, optional): Weights for the tensor product. It should have the shape (batch_size, weight_numel) if shared_weights is False, or (weight_numel,) if shared_weights is True. If None, the internal weights are used. - use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. 
Returns: torch.Tensor: @@ -148,4 +148,4 @@ def forward( if not self.shared_weights and weight.ndim != 2: raise ValueError("Weights should be 2D tensor") - return self.f([weight, x1, x2], use_fallback=use_fallback) + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 4c540b6..06ea215 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -21,6 +21,59 @@ from cuequivariance.irreps_array.misc_ui import default_layout +class Dispatcher(torch.nn.Module): + def __init__(self, tp): + super().__init__() + self.tp = tp + + +class TPDispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor], + indices: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if indices is not None: + # TODO: at some point we will have kernel for this + assert len(inputs) >= 1 + inputs[0] = inputs[0][indices] + return self.tp(inputs) + + +class SymmetricTPDispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor], + indices: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + assert indices is None + return self.tp(inputs[0]) + +class IWeightedSymmetricTPDispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor], + indices: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + x0 = inputs[0] + x1 = inputs[1] + if indices is None: + torch._assert( + x0.ndim == 2, + f"Expected x0 to have shape (batch, dim), got {x0.shape}", + ) + if x0.shape[0] == 1: + indices = torch.zeros( + (x1.shape[0],), dtype=torch.int32, device=x1.device + ) + else: # x0.shape[0] == x1.shape[0]: + indices = torch.arange( + x1.shape[0], dtype=torch.int32, device=x1.device + ) + # borisf : why was it here ? + # if indices is not None: + return self.tp(x0, indices, x1) + class EquivariantTensorProduct(torch.nn.Module): r"""Equivariant tensor product. @@ -31,7 +84,10 @@ class EquivariantTensorProduct(torch.nn.Module): layout_out (IrrepsLayout): layout for output. device (torch.device): device of the Module. math_dtype (torch.dtype): dtype for internal computations. + use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. optimize_fallback (bool): whether to optimize the fallback implementation. + Raises: + RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. Examples: >>> e = cue.descriptors.fully_connected_tensor_product( @@ -55,7 +111,7 @@ class EquivariantTensorProduct(torch.nn.Module): ... 
[0., 0., 0., 0., 0., 0.]]) """ - + def __init__( self, e: cue.EquivariantTensorProduct, @@ -67,6 +123,7 @@ def __init__( layout_out: Optional[cue.IrrepsLayout] = None, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -92,6 +149,7 @@ def __init__( source=layout_used, target=input_expected.layout, device=device, + use_fallback = use_fallback ) ) self.transpose_out = cuet.TransposeIrrepsLayout( @@ -99,37 +157,42 @@ def __init__( source=e.output.layout, target=layout_out, device=device, + use_fallback = use_fallback ) if any(d.num_operands != e.num_inputs + 1 for d in e.ds): - self.tp = None - if e.num_inputs == 1: - self.symm_tp = cuet.SymmetricTensorProduct( - e.ds, - device=device, - math_dtype=math_dtype, - optimize_fallback=optimize_fallback, + self.tp = SymmetricTPDispatcher( + cuet.SymmetricTensorProduct( + e.ds, + device=device, + math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=optimize_fallback, + ) ) elif e.num_inputs == 2: - self.symm_tp = cuet.IWeightedSymmetricTensorProduct( - e.ds, - device=device, - math_dtype=math_dtype, - optimize_fallback=optimize_fallback, + self.tp = IWeightedSymmetricTPDispatcher( + cuet.IWeightedSymmetricTensorProduct( + e.ds, + device=device, + math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=optimize_fallback, + ) ) else: raise NotImplementedError("This should not happen") else: - [d] = e.ds - - self.tp = cuet.TensorProduct( - d, - device=device, - math_dtype=math_dtype, - optimize_fallback=optimize_fallback, + self.tp = TPDispatcher( + cuet.TensorProduct( + e.ds[0], + device=device, + math_dtype=math_dtype, + use_fallback = use_fallback, + optimize_fallback=optimize_fallback, + ) ) - self.symm_tp = None self.operands_dims = [op.irreps.dim for op in e.operands] @@ -140,59 +203,24 @@ def forward( self, inputs: List[torch.Tensor], indices: Optional[torch.Tensor] = None, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: """ If ``indices`` is not None, the first input is indexed by ``indices``. 
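For the indexed path described above, a short sketch of how the new dispatchers are exercised. The descriptor and shapes mirror the symmetric-contraction tests elsewhere in this series; treating `w` as a table of weight rows selected per batch element is an illustrative assumption.

```python
import torch
import cuequivariance as cue
import cuequivariance_torch as cuet

e = cue.descriptors.symmetric_contraction(
    cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3]
)
m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, device="cuda")

w = torch.randn(5, e.inputs[0].irreps.dim, device="cuda")    # 5 rows of weights
x = torch.randn(100, e.inputs[1].irreps.dim, device="cuda")
idx = torch.randint(0, 5, (100,), dtype=torch.int32, device="cuda")

out = m([w, x], indices=idx)   # batch element i uses weight row w[idx[i]]
assert out.shape == (100, e.output.irreps.dim)
```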
""" - assert len(inputs) == len(self.etp.inputs) + # assert len(inputs) == len(self.etp.inputs) for a, dim in zip(inputs, self.operands_dims): assert a.shape[-1] == dim # Transpose inputs - inputs = [ - t(a, use_fallback=use_fallback) for t, a in zip(self.transpose_in, inputs) - ] + inputs[0] = self.transpose_in[0](inputs[0]) + if len(self.transpose_in) > 1: + inputs[1] = self.transpose_in[1](inputs[1]) # Compute tensor product - output = None - - if self.tp is not None: - if indices is not None: - # TODO: at some point we will have kernel for this - assert len(inputs) >= 1 - inputs[0] = inputs[0][indices] - output = self.tp(inputs, use_fallback=use_fallback) - - if self.symm_tp is not None: - if len(inputs) == 1: - assert indices is None - output = self.symm_tp(inputs[0], use_fallback=use_fallback) - - if len(inputs) == 2: - [x0, x1] = inputs - if indices is None: - torch._assert( - x0.ndim == 2, - f"Expected x0 to have shape (batch, dim), got {x0.shape}", - ) - if x0.shape[0] == 1: - indices = torch.zeros( - (x1.shape[0],), dtype=torch.int32, device=x1.device - ) - elif x0.shape[0] == x1.shape[0]: - indices = torch.arange( - x1.shape[0], dtype=torch.int32, device=x1.device - ) - - if indices is not None: - output = self.symm_tp(x0, indices, x1, use_fallback=use_fallback) - - if output is None: - raise NotImplementedError("This should not happen") + output = self.tp(inputs, indices) # Transpose output - output = self.transpose_out(output, use_fallback=use_fallback) + output = self.transpose_out(output) return output diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 7011d71..8f8ea0d 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -21,6 +21,7 @@ import cuequivariance.segmented_tensor_product as stp import cuequivariance_torch as cuet +from cuequivariance_torch.primitives.tensor_product import broadcast_shapes, prod logger = logging.getLogger(__name__) @@ -41,6 +42,7 @@ def __init__( *, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -55,6 +57,7 @@ def __init__( d0, device=device, math_dtype=math_dtype, + use_fallback = use_fallback, optimize_fallback=optimize_fallback, ) else: @@ -86,7 +89,7 @@ def __init__( ) def forward( - self, x0: torch.Tensor, use_fallback: Optional[bool] = None + self, x0: torch.Tensor ) -> torch.Tensor: r""" Perform the forward pass of the indexed symmetric tensor product operation. 
@@ -105,8 +108,7 @@ def forward( out = self.f( torch.ones((1, 1), dtype=x0.dtype, device=x0.device), torch.zeros((x0.shape[0],), dtype=torch.int32, device=x0.device), - x0, - use_fallback=use_fallback, + x0 ) if self.f0 is not None: out += self.f0([]) @@ -134,6 +136,7 @@ def __init__( *, device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, + use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -141,30 +144,35 @@ def __init__( _check_descriptors(descriptors) self.descriptors = descriptors - try: - self.f_cuda = CUDAKernel(descriptors, device, math_dtype) - except NotImplementedError as e: - logger.info(f"Failed to initialize CUDA implementation: {e}") - self.f_cuda = None - except ImportError as e: - logger.warning(f"Failed to initialize CUDA implementation: {e}") - self.f_cuda = None - - self.f_fx = FallbackImpl( - descriptors, - device, - math_dtype=math_dtype, - optimize_fallback=optimize_fallback, - ) - d = next(d for d in descriptors if d.num_operands >= 3) self.x0_size = d.operands[0].size self.x1_size = d.operands[1].size self.x2_size = d.operands[-1].size + self.has_cuda = False + + if not use_fallback == True: + try: + self.f = CUDAKernel(descriptors, device, math_dtype) + self.has_cuda = True + return + except NotImplementedError as e: + logger.info(f"Failed to initialize CUDA implementation: {e}") + except ImportError as e: + logger.warning(f"Failed to initialize CUDA implementation: {e}") + + if use_fallback == False: + raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available") + else: + self.f = FallbackImpl( + descriptors, + device, + math_dtype=math_dtype, + optimize_fallback=optimize_fallback, + ) def __repr__(self): has_cuda_kernel = ( - "(with CUDA kernel)" if self.f_cuda is not None else "(without CUDA kernel)" + "(with CUDA kernel)" if self.has_cuda is not None else "(without CUDA kernel)" ) return f"IWeightedSymmetricTensorProduct({has_cuda_kernel})" @@ -173,7 +181,6 @@ def forward( x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor, - use_fallback: Optional[bool] = None, ) -> torch.Tensor: r""" Perform the forward pass of the indexed symmetric tensor product operation. @@ -187,10 +194,6 @@ def forward( The index tensor for the first operand. It should have the shape (...). x1 : torch.Tensor The repeated input tensor. It should have the shape (..., x1_size). - use_fallback : Optional[bool], optional - If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. 
Returns ------- @@ -203,32 +206,18 @@ def forward( x0.ndim == 2, f"Expected 2 dims (i0.max() + 1, x0_size), got shape {x0.shape}", ) - shape = broadcast_shapes(i0.shape, x1.shape[:-1]) - i0 = i0.expand(shape).reshape((math.prod(shape),)) + shape = broadcast_shapes([i0.shape, x1.shape[:-1]]) + i0 = i0.expand(shape).reshape((prod(shape),)) x1 = x1.expand(shape + (x1.shape[-1],)).reshape( - (math.prod(shape), x1.shape[-1]) + (prod(shape), x1.shape[-1]) ) - - if ( - x0.device.type == "cuda" - and self.f_cuda is not None - and (use_fallback is not True) - ): - out = self.f_cuda(x0, i0, x1) - out = out.reshape(shape + (self.x2_size,)) - return out - - if use_fallback is False: - if self.f_cuda is not None: - raise RuntimeError("CUDA kernel available but input is not on CUDA") - else: - raise RuntimeError("No CUDA kernel available") - - out = self.f_fx(x0, i0, x1) + + out = self.f(x0, i0, x1) out = out.reshape(shape + (self.x2_size,)) return out + def _check_descriptors(descriptors: list[stp.SegmentedTensorProduct]): if len(descriptors) == 0: raise ValueError("stps must contain at least one STP.") @@ -368,6 +357,6 @@ def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: return sum( - f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2), use_fallback=True) + f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) for f in self.fs ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 10b7080..941b34c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -70,11 +70,13 @@ class TensorProduct(torch.nn.Module): descriptor (SegmentedTensorProduct): The descriptor of the segmented tensor product. math_dtype (torch.dtype, optional): The data type of the coefficients and calculations. device (torch.device, optional): The device on which the calculations are performed. - optimize_fallback (bool, optional): If `True`, the fallback method is optimized. If `False`, the fallback method is used without optimization. - """ + use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - num_operands: Final[int] + optimize_fallback (bool, optional): If `True`, the fallback method is optimized. If `False`, the fallback method is used without optimization. + Raises: + RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. 
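Since `use_fallback` now lives on the constructor, selecting the execution path looks like the sketch below. The descriptor is built the same way as in tests/primitives/script_test.py, and availability of the CUDA kernel is assumed for the `use_fallback=False` case.

```python
import torch
import cuequivariance as cue
import cuequivariance_torch as cuet

d = (
    cue.descriptors.full_tensor_product(
        cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1")
    )
    .d.flatten_coefficient_modes()
    .squeeze_modes("v")
)

# CUDA kernel required (raises if unavailable) vs. optimized PyTorch fallback.
m_cuda = cuet.TensorProduct(d, device="cuda", math_dtype=torch.float32, use_fallback=False)
m_fx = cuet.TensorProduct(
    d, device="cuda", math_dtype=torch.float32, use_fallback=True, optimize_fallback=True
)

x0 = torch.randn(12, d.operands[0].size, device="cuda")
x1 = torch.randn(12, d.operands[1].size, device="cuda")

out = m_cuda([x0, x1])   # forward() no longer takes a use_fallback keyword
ref = m_fx([x0, x1])     # same computation on the pure-PyTorch path
assert out.shape == ref.shape == (12, d.operands[2].size)
```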
+ """ def __init__( self, descriptor: stp.SegmentedTensorProduct, @@ -86,43 +88,47 @@ def __init__( ): super().__init__() self.descriptor = descriptor - # for script() - self.num_operands = descriptor.num_operands if math_dtype is None: math_dtype = torch.get_default_dtype() - - try: - self.f_cuda3, self.f_cuda4 = _tensor_product_cuda(descriptor, device, math_dtype) - except NotImplementedError as e: - logger.info(f"CUDA implementation not available: {e}") - self.f_cuda3 = None - self.f_cuda4 = None - except ImportError as e: - logger.warning(f"CUDA implementation not available: {e}") - logger.warning( - "Did you forget to install the CUDA version of cuequivariance-ops-torch?\n" - "Install it with one of the following commands:\n" - "pip install cuequivariance-ops-torch-cu11\n" - "pip install cuequivariance-ops-torch-cu12" - ) - self.f_cuda3 = None - self.f_cuda4 = None - - if use_fallback == True: - self.f_fx = _tensor_product_fx( + self.f = None + self.has_cuda = False + + if not use_fallback == True: + try: + self.f = _tensor_product_cuda(descriptor, device, math_dtype) + self.has_cuda = True + return + except NotImplementedError as e: + logger.info(f"CUDA implementation not available: {e}") + except ImportError as e: + logger.warning(f"CUDA implementation not available: {e}") + logger.warning( + "Did you forget to install the CUDA version of cuequivariance-ops-torch?\n" + "Install it with one of the following commands:\n" + "pip install cuequivariance-ops-torch-cu11\n" + "pip install cuequivariance-ops-torch-cu12" + ) + + if use_fallback == False: + raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available!") + else: + self.f = _tensor_product_fx( descriptor, device, math_dtype, optimize_fallback is True ) - else: - self.f_fx = None - self._optimize_fallback = optimize_fallback + if optimize_fallback is None: + warnings.warn( + "The fallback method is used but it has not been optimized. " + "Consider setting optimize_fallback=True when creating the TensorProduct module." + ) + self._optimize_fallback = optimize_fallback def __repr__(self): has_cuda_kernel = ( - "(with CUDA kernel)" if self.f_cuda3 is not None or self.f_cuda4 is not None else "(without CUDA kernel)" + "(with CUDA kernel)" if self.has_cuda else "(without CUDA kernel)" ) return f"TensorProduct({self.descriptor} {has_cuda_kernel})" - def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = None): + def forward(self, inputs: List[torch.Tensor]): r""" Perform the tensor product based on the specified descriptor. @@ -130,9 +136,6 @@ def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = Non inputs (list of torch.Tensor): The input tensors. The number of input tensors should match the number of operands in the descriptor minus one. Each input tensor should have a shape of ((batch,) operand_size), where `operand_size` corresponds to the size of each operand as defined in the tensor product descriptor. - use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available and the input - is on CUDA. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available or the - input is not on CUDA. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. 
Returns: torch.Tensor: @@ -140,36 +143,11 @@ def forward(self, inputs: List[torch.Tensor], use_fallback: Optional[bool] = Non It has a shape of (batch, last_operand_size), where `last_operand_size` is the size of the last operand in the descriptor. - Raises: - RuntimeError: If `use_fallback` is `False` and either no CUDA kernel is available or the input tensor is not on CUDA. """ - if any(x.numel() == 0 for x in inputs): - use_fallback = True # Empty tensors are not supported by the CUDA kernel - - if ( - inputs - and inputs[0].device.type == "cuda" - and (use_fallback is not True) - ): - if self.f_cuda3 is not None: - return self.f_cuda3(inputs[0], inputs[1]) - else: - return self.f_cuda4(inputs[0], inputs[1], inputs[2]) + # if any(x.numel() == 0 for x in inputs): + # use_fallback = True # Empty tensors are not supported by the CUDA kernel - if use_fallback is False: - if self.f_cuda3 is not None and self.f_cuda4 is not None: - raise RuntimeError("CUDA kernel available but input is not on CUDA") - else: - raise RuntimeError("No CUDA kernel available") - - if self._optimize_fallback is None: - warnings.warn( - "The fallback method is used but it has not been optimized. " - "Consider setting optimize_fallback=True when creating the TensorProduct module." - ) - if self.f_fx is None: - raise RuntimeError("No fallback method available") - return self.f_fx(inputs) + return self.f(inputs) def _tensor_product_fx( @@ -404,9 +382,9 @@ def _tensor_product_cuda( operand_num_segments=[o.num_segments for o in d.operands], ): if descriptor.num_operands == 3: - return TensorProductUniform3x1d(d, device, math_dtype), None + return TensorProductUniform3x1d(d, device, math_dtype) else: - return None, TensorProductUniform4x1d(d, device, math_dtype) + return TensorProductUniform4x1d(d, device, math_dtype) supported_targets = [ stp.Subscripts(subscripts) @@ -436,9 +414,9 @@ def _tensor_product_cuda( ) if descriptor.num_operands == 3: - return FusedTensorProductOp3(descriptor, perm[:2], device, math_dtype), None + return FusedTensorProductOp3(descriptor, perm[:2], device, math_dtype) elif descriptor.num_operands == 4: - return None, FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) + return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: @@ -489,10 +467,9 @@ def __repr__(self) -> str: def forward( self, - x0: torch.Tensor, - x1: torch.Tensor, + inputs: List[torch.Tensor] ) -> torch.Tensor: - x0, x1 = self._perm(x0, x1) + x0, x1 = self._perm(inputs[0], inputs[1]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -548,11 +525,9 @@ def __repr__(self) -> str: def forward( self, - x0: torch.Tensor, - x1: torch.Tensor, - x2: torch.Tensor, + inputs: List[torch.Tensor] ) -> torch.Tensor: - x0, x1, x2 = self._perm(x0, x1, x2) + x0, x1, x2 = self._perm(inputs[0], inputs[1], inputs[2]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim assert x2.ndim >= 1, x2.ndim @@ -601,7 +576,8 @@ def __init__( def __repr__(self): return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" - def forward(self, x0:torch.Tensor, x1:torch.Tensor): + def forward(self, inputs: List[torch.Tensor]): + x0, x1 = inputs assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -653,7 +629,8 @@ def __init__( def __repr__(self): return f"TensorProductUniform4x1d({self.descriptor} (output last operand))" - def forward(self, x0, x1, x2): + def forward(self, inputs: List[torch.Tensor]): + x0, x1, x2 = inputs 
assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim assert x2.ndim >= 1, x2.ndim diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 4c40036..252e45e 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -36,19 +36,22 @@ def __init__( source: cue.IrrepsLayout, target: cue.IrrepsLayout, device: Optional[torch.device] = None, + use_fallback: Optional[bool] = False ): super().__init__() if (source, target) == (cue.mul_ir, cue.ir_mul): self.f = TransposeSegments( - [(mul, ir.dim) for mul, ir in irreps], device=device + [(mul, ir.dim) for mul, ir in irreps], device=device, + use_fallback = use_fallback ) elif (source, target) == (cue.ir_mul, cue.mul_ir): self.f = TransposeSegments( - [(ir.dim, mul) for mul, ir in irreps], device=device + [(ir.dim, mul) for mul, ir in irreps], device=device, + use_fallback = use_fallback ) else: - self.f = _Identity() + self.f = torch.nn.Identity() self.source, self.target = source, target @@ -59,7 +62,7 @@ def __repr__(self): return f"TransposeIrrepsLayout({self.source} -> {self.target})" def forward( - self, x: torch.Tensor, use_fallback: Optional[bool] = None + self, x: torch.Tensor ) -> torch.Tensor: r""" Perform the transposition. @@ -74,17 +77,13 @@ def forward( torch.Tensor: The transposed tensor. """ - return self.f(x, use_fallback=use_fallback) - - -class _Identity(torch.nn.Module): - def forward(self, x: torch.Tensor, **kwargs): - return x + return self.f(x) class TransposeSegments(torch.nn.Module): def __init__( - self, segments: list[tuple[int, int]], device: Optional[torch.device] = None + self, segments: list[tuple[int, int]], device: Optional[torch.device] = None, + use_fallback: Optional[bool] = False ): super().__init__() @@ -97,8 +96,8 @@ def __init__( self.f_cuda = None else: self.f_cuda = _transpose(info).to(device=device) - - self.f = _transpose_segments_fx(segments).to(device=device) + if use_fallback: + self.f = _transpose_segments_fx(segments).to(device=device) else: self.f_cuda = torch.nn.Identity() self.f = torch.nn.Identity() @@ -107,7 +106,7 @@ def __repr__(self): return "TransposeSegments()" def forward( - self, x: torch.Tensor, use_fallback: Optional[bool] = None + self, x: torch.Tensor ) -> torch.Tensor: """ Perform the transposition of the input tensor using either a CUDA kernel or a PyTorch fallback. @@ -131,20 +130,10 @@ def forward( RuntimeError If `use_fallback` is `False` and a CUDA kernel is not available or the input is not on CUDA. 
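A small concrete sketch of what TransposeSegments computes, using the same values as the transpose test later in this patch; the segment shapes [(2, 3), (2, 2)] and the row-major source layout are inferred from that test rather than shown in it.

```python
import torch
import cuequivariance_torch as cuet

segments = [(2, 3), (2, 2)]                    # assumed segment shapes
m = cuet.TransposeSegments(segments, use_fallback=True).cuda()

# Each segment of the last dimension is read as a (u, v) block and written back as (v, u).
x = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 11.0, 12.0, 13.0]]).cuda()
print(m(x))
# -> [[1., 4., 2., 5., 3., 6., 10., 12., 11., 13.]]
```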
""" - if ( - x.device.type == "cuda" - and self.f_cuda is not None - and (use_fallback is not True) - ): + if self.f_cuda is not None: return self.f_cuda(x) - - if use_fallback is False: - if self.f_cuda is not None: - raise RuntimeError("CUDA kernel available but input is not on CUDA") - else: - raise RuntimeError("No CUDA kernel available") - - return self.f(x) + else: + return self.f(x) def _transpose_segments_fx(segments: list[tuple[int, int]]) -> torch.nn.Module: diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index e040687..f9f6a7b 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -75,6 +75,15 @@ def test_performance_cuda_vs_fx( layout=cue.ir_mul, device=device, math_dtype=math_dtype, + use_fallback=False, + optimize_fallback=True, + ) + m1 = cuet.EquivariantTensorProduct( + e, + layout=cue.ir_mul, + device=device, + math_dtype=math_dtype, + use_fallback=True, optimize_fallback=True, ) @@ -84,15 +93,19 @@ def test_performance_cuda_vs_fx( ] for _ in range(10): - m(inputs, use_fallback=False) - m(inputs, use_fallback=True) + m(inputs) + m1(inputs) + + def f(): + m(inputs) + torch.cuda.synchronize() - def f(ufb: bool): - m(inputs, use_fallback=ufb) + def f1(): + m1(inputs) torch.cuda.synchronize() - t0 = timeit.Timer(lambda: f(False)).timeit(number=10) - t1 = timeit.Timer(lambda: f(True)).timeit(number=10) + t0 = timeit.Timer(f).timeit(number=10) + t1 = timeit.Timer(f1).timeit(number=10) assert t0 < t1 @@ -129,18 +142,20 @@ def test_precision_cuda_vs_fx( layout=cue.ir_mul, device=device, math_dtype=math_dtype, + use_fallback=False ) - y0 = m(inputs, use_fallback=False) + y0 = m(inputs) m = cuet.EquivariantTensorProduct( e, layout=cue.ir_mul, device=device, math_dtype=torch.float64, + use_fallback=True, optimize_fallback=True, ) - inputs = map(lambda x: x.to(torch.float64), inputs) - y1 = m(inputs, use_fallback=True).to(dtype) + inputs = [x.to(torch.float64) for x in inputs] + y1 = m(inputs).to(dtype) torch.testing.assert_close(y0, y1, atol=atol, rtol=rtol) @@ -149,10 +164,10 @@ def test_compile(): e = cue.descriptors.symmetric_contraction( cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] ) - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, optimize_fallback=False) + m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, device="cuda", optimize_fallback=False) m_compile = torch.compile(m, fullgraph=True) - input1 = torch.randn(100, e.inputs[0].irreps.dim) - input2 = torch.randn(100, e.inputs[1].irreps.dim) + input1 = torch.randn(100, e.inputs[0].irreps.dim).cuda() + input2 = torch.randn(100, e.inputs[1].irreps.dim).cuda() m_compile([input1, input2]) @@ -160,8 +175,11 @@ def test_script(): e = cue.descriptors.symmetric_contraction( cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] ) - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, optimize_fallback=False) + m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, + use_fallback=False, + device="cuda", + optimize_fallback=False) m_script = torch.jit.script(m) - input1 = torch.randn(100, e.inputs[0].irreps.dim) - input2 = torch.randn(100, e.inputs[1].irreps.dim) + input1 = torch.randn(100, e.inputs[0].irreps.dim).cuda() + input2 = torch.randn(100, e.inputs[1].irreps.dim).cuda() m_script([input1, input2]) diff --git 
a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 95dfc4d..909d71d 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -64,8 +64,9 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( device = torch.device("cuda:0") m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=math_dtype, device=device, optimize_fallback=False + ds, math_dtype=math_dtype, device=device, use_fallback=False ) + m = torch.jit.script(m) x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) i0 = torch.tensor([0, 1, 0], dtype=torch.int32, device=device) @@ -75,11 +76,11 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( x0_ = x0.clone().to(torch.float64) x1_ = x1.clone().to(torch.float64) - out1 = m(x0, i0, x1, use_fallback=False) + out1 = m(x0, i0, x1) m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=torch.float64, device=device, optimize_fallback=True + ds, math_dtype=torch.float64, device=device, use_fallback=True, optimize_fallback=True ) - out2 = m(x0_, i0, x1_, use_fallback=True) + out2 = m(x0_, i0, x1_) assert out1.dtype == dtype @@ -121,19 +122,19 @@ def test_math_dtype( ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] ).ds - m = cuet.IWeightedSymmetricTensorProduct(ds, math_dtype=math_dtype, device=device) + m = cuet.IWeightedSymmetricTensorProduct(ds, math_dtype=math_dtype, device=device, use_fallback=False) x0 = torch.randn((20, m.x0_size), dtype=dtype, device=device) i0 = torch.randint(0, m.x0_size, (1000,), dtype=torch.int32, device=device) x1 = torch.randn((i0.size(0), m.x1_size), dtype=dtype, device=device) - out1 = m(x0, i0, x1, use_fallback=False) + out1 = m(x0, i0, x1) # .to should have no effect for param in m.parameters(): assert False # no parameters m = m.to(torch.float64) - out2 = m(x0, i0, x1, use_fallback=False) + out2 = m(x0, i0, x1) assert out1.dtype == dtype assert out2.dtype == dtype diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index dc6c2e9..c7bf8e2 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -118,7 +118,7 @@ def test_primitive_tensor_product_cuda_vs_fx( d, device=device, math_dtype=torch.float64, use_fallback=True, optimize_fallback=False ) inputs_ = [inp.clone().to(torch.float64) for inp in inputs] - out2 = m(inputs_, use_fallback=True) + out2 = m(inputs_) assert out1.shape[:-1] == torch.broadcast_shapes(*batches) assert out1.dtype == dtype diff --git a/cuequivariance_torch/tests/primitives/transpose_test.py b/cuequivariance_torch/tests/primitives/transpose_test.py index cd39cfc..67ad700 100644 --- a/cuequivariance_torch/tests/primitives/transpose_test.py +++ b/cuequivariance_torch/tests/primitives/transpose_test.py @@ -42,5 +42,5 @@ def test_transpose(use_fallback: bool, dtype: torch.dtype): [[1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 10, 12, 11, 13]], dtype=dtype ).cuda() - m = cuet.TransposeSegments(segments).cuda() - torch.testing.assert_close(m(x, use_fallback=use_fallback), xt) + m = cuet.TransposeSegments(segments, use_fallback=use_fallback).cuda() + torch.testing.assert_close(m(x), xt) From 79e7c5f750879c0b61f1d8b9e8b1ebdd15ada7ac Mon Sep 17 00:00:00 2001 From: Boris 
Fomitchev Date: Wed, 4 Dec 2024 20:32:47 -0800 Subject: [PATCH 19/96] Restoring debug logging Signed-off-by: Boris Fomitchev --- .../primitives/symmetric_tensor_product.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 8f8ea0d..c5ac0f0 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -324,9 +324,10 @@ def forward( i0 = i0.to(torch.int32) x0 = x0.reshape(x0.shape[0], x0.shape[1] // self.u, self.u) x1 = x1.reshape(x1.shape[0], x1.shape[1] // self.u, self.u) - # logger.debug( - # f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" - # ) + if not torch.jit.is_scripting(): + logger.debug( + f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" + ) out = self.f(x1, x0, i0) out = out.reshape(out.shape[0], out.shape[1] * self.u) return out From 6c5cdb023d1b57a937195265112d0c51ee0a05eb Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 5 Dec 2024 01:01:37 -0800 Subject: [PATCH 20/96] Parameterized script test Signed-off-by: Boris Fomitchev --- .../primitives/equivariant_tensor_product.py | 3 +- .../primitives/tensor_product.py | 34 +++------------- .../equivariant_tensor_product_test.py | 39 +++++++++++++------ .../tests/primitives/script_test.py | 8 ++-- 4 files changed, 37 insertions(+), 47 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 06ea215..549e38c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -55,8 +55,7 @@ def forward( inputs: List[torch.Tensor], indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: - x0 = inputs[0] - x1 = inputs[1] + x0, x1 = inputs if indices is None: torch._assert( x0.ndim == 2, diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 941b34c..0a246cc 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -546,8 +546,7 @@ def forward( return out.reshape(shape + (out.shape[-1],)) - -class TensorProductUniform3x1d(torch.nn.Module): +class TensorProductUniform1d(torch.nn.Module): def __init__( self, descriptor: stp.SegmentedTensorProduct, @@ -573,10 +572,11 @@ def __init__( math_dtype=math_dtype, ).to(device=device) +class TensorProductUniform3x1d(TensorProductUniform1d): def __repr__(self): return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" - def forward(self, inputs: List[torch.Tensor]): + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1 = inputs assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -595,36 +595,12 @@ def forward(self, inputs: List[torch.Tensor]): f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) - out = self._f(x0, x1) + out = self._f(x0, x1, x0) return out.reshape(shape + (out.shape[-1],)) -class 
TensorProductUniform4x1d(torch.nn.Module): - def __init__( - self, - descriptor: stp.SegmentedTensorProduct, - device: Optional[torch.device], - math_dtype: torch.dtype, - ): - super().__init__() - import cuequivariance_ops_torch as ops - - self.descriptor = descriptor - - assert len(descriptor.subscripts.modes()) == 1 - assert descriptor.all_same_segment_shape() - assert descriptor.coefficient_subscripts == "" - u = next(iter(descriptor.get_dims(descriptor.subscripts.modes()[0]))) - - self._f = ops.TensorProductUniform1d( - operand_dim=[ope.ndim for ope in descriptor.operands], - operand_extent=u, - operand_num_segments=[ope.num_segments for ope in descriptor.operands], - path_indices=[path.indices for path in descriptor.paths], - path_coefficients=[float(path.coefficients) for path in descriptor.paths], - math_dtype=math_dtype, - ).to(device=device) +class TensorProductUniform4x1d(TensorProductUniform1d): def __repr__(self): return f"TensorProductUniform4x1d({self.descriptor} (output last operand))" diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 42736a6..518d150 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -76,8 +76,8 @@ def test_performance_cuda_vs_fx( device=device, math_dtype=math_dtype, use_fallback=False, - optimize_fallback=True, ) + m1 = cuet.EquivariantTensorProduct( e, layout=cue.ir_mul, @@ -95,14 +95,17 @@ def test_performance_cuda_vs_fx( for _ in range(10): m(inputs) m1(inputs) + torch.cuda.synchronize() def f(): - m(inputs) - torch.cuda.synchronize() + ret = m(inputs) + ret = torch.sum(ret) + return ret def f1(): - m1(inputs) - torch.cuda.synchronize() + ret = m1(inputs) + ret = torch.sum(ret) + return ret t0 = timeit.Timer(f).timeit(number=10) t1 = timeit.Timer(f1).timeit(number=10) @@ -170,16 +173,28 @@ def test_compile(): input2 = torch.randn(100, e.inputs[1].irreps.dim).cuda() m_compile([input1, input2]) -def test_script(): - e = cue.descriptors.symmetric_contraction( - cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] - ) +@pytest.mark.parametrize("e", make_descriptors()) +@pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) +def test_script( + e: cue.EquivariantTensorProduct, + dtype: torch.dtype, + math_dtype: torch.dtype, + atol: float, + rtol: float, +): + + device = torch.device("cuda:0") + m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, use_fallback=False, device="cuda", optimize_fallback=False) + inputs = [ + torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) + for inp in e.inputs + ] + res = m(inputs) m_script = torch.jit.script(m) - input1 = torch.randn(100, e.inputs[0].irreps.dim).cuda() - input2 = torch.randn(100, e.inputs[1].irreps.dim).cuda() - m_script([input1, input2]) + # res_script = m_script(inputs) + # torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py index 44e880f..37b2a0c 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -46,7 +46,7 @@ def test_script_fused_tp_3(): module = FusedTensorProductOp3(d, (0, 1), torch.device("cuda:0"), torch.float32) module = torch.jit.script(module) - assert module(x0, x1).shape == (batch, 
d.operands[2].size) + assert module([x0, x1]).shape == (batch, d.operands[2].size) def test_script_fused_tp_4(): @@ -67,7 +67,7 @@ def test_script_fused_tp_4(): module = FusedTensorProductOp4(d, (0, 1, 2), torch.device("cuda:0"), torch.float32) module = torch.jit.script(module) - assert module(x0, x1, x2).shape == (batch, d.operands[3].size) + assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) def test_script_uniform_tp_3(): @@ -86,7 +86,7 @@ def test_script_uniform_tp_3(): module = TensorProductUniform3x1d(d, torch.device("cuda:0"), torch.float32) module = torch.jit.script(module) - assert module(x0, x1).shape == (batch, d.operands[2].size) + assert module([x0, x1]).shape == (batch, d.operands[2].size) def test_script_uniform_tp_4(): @@ -106,4 +106,4 @@ def test_script_uniform_tp_4(): module = TensorProductUniform4x1d(d, torch.device("cuda:0"), torch.float32) module = torch.jit.script(module) - assert module(x0, x1, x2).shape == (batch, d.operands[3].size) + assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) From e21c45f57d7a8c3a4cbc5f7affc9445cb3b39eb7 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 5 Dec 2024 12:04:35 -0800 Subject: [PATCH 21/96] Fixed transpose for script(), script_test successful Signed-off-by: Boris Fomitchev --- .../primitives/equivariant_tensor_product.py | 49 +++++++++++++------ .../primitives/symmetric_tensor_product.py | 9 ++-- .../primitives/tensor_product.py | 1 - .../equivariant_tensor_product_test.py | 8 +-- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 549e38c..a8dbd5e 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -26,6 +26,31 @@ def __init__(self, tp): super().__init__() self.tp = tp +class Transpose1Dispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor] + ): + inputs[0] = self.tp[0](inputs[0]) + +class Transpose2Dispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor] + ): + inputs[0] = self.tp[0](inputs[0]) + inputs[1] = self.tp[1](inputs[1]) + +class Transpose3Dispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor] + ): + inputs[0] = self.tp[0](inputs[0]) + inputs[1] = self.tp[1](inputs[1]) + inputs[2] = self.tp[2](inputs[2]) + +TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher] class TPDispatcher(Dispatcher): def forward( @@ -61,16 +86,9 @@ def forward( x0.ndim == 2, f"Expected x0 to have shape (batch, dim), got {x0.shape}", ) - if x0.shape[0] == 1: - indices = torch.zeros( - (x1.shape[0],), dtype=torch.int32, device=x1.device - ) - else: # x0.shape[0] == x1.shape[0]: - indices = torch.arange( - x1.shape[0], dtype=torch.int32, device=x1.device - ) - # borisf : why was it here ? 
- # if indices is not None: + indices = torch.arange( + x1.shape[0], dtype=torch.int32, device=x1.device + ) return self.tp(x0, indices, x1) class EquivariantTensorProduct(torch.nn.Module): @@ -140,9 +158,9 @@ def __init__( self.layout_in = layout_in = tuple(map(default_layout, layout_in)) self.layout_out = layout_out = default_layout(layout_out) - self.transpose_in = torch.nn.ModuleList() + transpose_in = torch.nn.ModuleList() for layout_used, input_expected in zip(layout_in, e.inputs): - self.transpose_in.append( + transpose_in.append( cuet.TransposeIrrepsLayout( input_expected.irreps, source=layout_used, @@ -151,6 +169,9 @@ def __init__( use_fallback = use_fallback ) ) + # script() requires literal addressing and fails to eliminate dead branches + self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in)-1](transpose_in) + self.transpose_out = cuet.TransposeIrrepsLayout( e.output.irreps, source=e.output.layout, @@ -212,9 +233,7 @@ def forward( assert a.shape[-1] == dim # Transpose inputs - inputs[0] = self.transpose_in[0](inputs[0]) - if len(self.transpose_in) > 1: - inputs[1] = self.transpose_in[1](inputs[1]) + self.transpose_in.forward(inputs) # Compute tensor product output = self.tp(inputs, indices) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index fc7eb47..4553d01 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -85,6 +85,7 @@ def __init__( descriptors, device=device, math_dtype=math_dtype, + use_fallback = use_fallback, optimize_fallback=optimize_fallback, ) @@ -153,7 +154,7 @@ def __init__( self.x2_size = d.operands[-1].size self.has_cuda = False - if not use_fallback == True: + if use_fallback is None or not use_fallback: try: self.f = CUDAKernel(descriptors, device, math_dtype) self.has_cuda = True @@ -163,15 +164,15 @@ def __init__( except ImportError as e: logger.warning(f"Failed to initialize CUDA implementation: {e}") - if use_fallback == False: - raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available") - else: + if use_fallback is None or use_fallback: self.f = FallbackImpl( descriptors, device, math_dtype=math_dtype, optimize_fallback=optimize_fallback, ) + else: + raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available") def __repr__(self): has_cuda_kernel = ( diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 0a246cc..410c323 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -19,7 +19,6 @@ import torch import torch.fx -from torch.jit import Final from cuequivariance import segmented_tensor_product as stp logger = logging.getLogger(__name__) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 518d150..16f3cf2 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -187,14 +187,14 @@ def test_script( m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, use_fallback=False, - device="cuda", - optimize_fallback=False) 
+ device="cuda") inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs ] + copy_inputs = [i.clone() for i in inputs] res = m(inputs) m_script = torch.jit.script(m) - # res_script = m_script(inputs) - # torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + res_script = m_script(copy_inputs) + torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) From 779dd9ccd02f8e6585ca2485b908cf5717421ca6 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 5 Dec 2024 12:48:57 -0800 Subject: [PATCH 22/96] Fixed input mutation Signed-off-by: Boris Fomitchev --- .../primitives/equivariant_tensor_product.py | 22 ++++++++++++------- .../equivariant_tensor_product_test.py | 3 +-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index a8dbd5e..3a18f76 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -31,24 +31,30 @@ def forward( self, inputs: List[torch.Tensor] ): - inputs[0] = self.tp[0](inputs[0]) - + ret = inputs.copy() + ret[0] = self.tp[0](ret[0]) + return ret + class Transpose2Dispatcher(Dispatcher): def forward( self, inputs: List[torch.Tensor] ): - inputs[0] = self.tp[0](inputs[0]) - inputs[1] = self.tp[1](inputs[1]) + ret = inputs.copy() + ret[0] = self.tp[0](ret[0]) + ret[1] = self.tp[1](ret[1]) + return ret class Transpose3Dispatcher(Dispatcher): def forward( self, inputs: List[torch.Tensor] ): - inputs[0] = self.tp[0](inputs[0]) - inputs[1] = self.tp[1](inputs[1]) - inputs[2] = self.tp[2](inputs[2]) + ret = inputs.copy() + ret[0] = self.tp[0](ret[0]) + ret[1] = self.tp[1](ret[1]) + ret[2] = self.tp[1](ret[2]) + return ret TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher] @@ -233,7 +239,7 @@ def forward( assert a.shape[-1] == dim # Transpose inputs - self.transpose_in.forward(inputs) + inputs = self.transpose_in(inputs) # Compute tensor product output = self.tp(inputs, indices) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 16f3cf2..60bbacf 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -192,9 +192,8 @@ def test_script( torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs ] - copy_inputs = [i.clone() for i in inputs] res = m(inputs) m_script = torch.jit.script(m) - res_script = m_script(copy_inputs) + res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) From c315857c2128380d562b231af7b198d2ecacbad1 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 5 Dec 2024 18:10:49 -0800 Subject: [PATCH 23/96] Fixed tests Signed-off-by: Boris Fomitchev --- .../primitives/equivariant_tensor_product.py | 17 +++++++- .../primitives/symmetric_tensor_product.py | 2 +- .../primitives/tensor_product.py | 10 ++--- .../tests/operations/linear_test.py | 37 ++++++++++------ .../operations/spherical_harmonics_test.py | 8 ++-- .../operations/symmetric_contraction_test.py | 2 +- .../tests/operations/tp_channel_wise_test.py | 42 ++++++++++++------- 
.../operations/tp_fully_connected_test.py | 14 ++++--- .../equivariant_tensor_product_test.py | 29 +++++++++---- 9 files changed, 106 insertions(+), 55 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 3a18f76..cacbd9c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -53,10 +53,22 @@ def forward( ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) - ret[2] = self.tp[1](ret[2]) + ret[2] = self.tp[2](ret[2]) return ret -TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher] +class Transpose4Dispatcher(Dispatcher): + def forward( + self, + inputs: List[torch.Tensor] + ): + ret = inputs.copy() + ret[0] = self.tp[0](ret[0]) + ret[1] = self.tp[1](ret[1]) + ret[2] = self.tp[2](ret[2]) + ret[3] = self.tp[3](ret[3]) + return ret + +TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher, Transpose4Dispatcher] class TPDispatcher(Dispatcher): def forward( @@ -175,6 +187,7 @@ def __init__( use_fallback = use_fallback ) ) + # script() requires literal addressing and fails to eliminate dead branches self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in)-1](transpose_in) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 4553d01..f02be9c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -325,7 +325,7 @@ def forward( i0 = i0.to(torch.int32) x0 = x0.reshape(x0.shape[0], x0.shape[1] // self.u, self.u) x1 = x1.reshape(x1.shape[0], x1.shape[1] // self.u, self.u) - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling SymmetricTensorContraction: {self.descriptors}, input shapes: {x0.shape}, {i0.shape}, {x1.shape}" ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 410c323..0540eda 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -306,7 +306,7 @@ def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProdu self.descriptor = descriptor def forward(self, args:List[torch.Tensor]): - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): for oid, arg in enumerate(args): torch._assert( arg.shape[-1] == self.descriptor.operands[oid].size, @@ -476,7 +476,7 @@ def forward( x0 = _reshape(x0, shape) x1 = _reshape(x1, shape) - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) @@ -536,7 +536,7 @@ def forward( x1 = _reshape(x1, shape) x2 = _reshape(x2, shape) - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, 
{x1.shape}, {x2.shape}" ) @@ -589,7 +589,7 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: if x1.ndim == 1: x1 = x1.unsqueeze(0) - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) @@ -622,7 +622,7 @@ def forward(self, inputs: List[torch.Tensor]): if x2.ndim == 1: x2 = x2.unsqueeze(0) - if not torch.jit.is_scripting(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index 1f786b7..d0d8a41 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -45,16 +45,26 @@ def test_linear_fwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, + use_fallback=False + ) + linear_fx = cuet.Linear( + irreps_in, + irreps_out, + layout=layout, + shared_weights=shared_weights, + device="cuda", + dtype=torch.float64, + use_fallback=True ) x = torch.randn(10, irreps_in.dim, dtype=torch.float64).cuda() if shared_weights: y = linear(x) - y_fx = linear(x, use_fallback=True) + y_fx = linear_fx(x) else: w = torch.randn(10, linear.weight_numel, dtype=torch.float64).cuda() y = linear(x, w) - y_fx = linear(x, w, use_fallback=True) + y_fx = linear_fx(x, w) assert y.shape == (10, irreps_out.dim) @@ -71,17 +81,18 @@ def test_linear_bwd_bwd( layout: cue.IrrepsLayout, shared_weights: bool, ): - linear = cuet.Linear( - irreps_in, - irreps_out, - layout=layout, - shared_weights=shared_weights, - device="cuda", - dtype=torch.float64, - ) - outputs = dict() for use_fallback in [True, False]: + linear = cuet.Linear( + irreps_in, + irreps_out, + layout=layout, + shared_weights=shared_weights, + device="cuda", + dtype=torch.float64, + use_fallback=use_fallback + ) + # reset the seed to ensure the same initialization torch.manual_seed(0) @@ -90,12 +101,12 @@ def test_linear_bwd_bwd( ) if shared_weights: - y = linear(x, use_fallback=use_fallback) + y = linear(x) else: w = torch.randn( 10, linear.weight_numel, requires_grad=True, dtype=torch.float64 ).cuda() - y = linear(x, w, use_fallback=use_fallback) + y = linear(x, w) (grad,) = torch.autograd.grad( y.pow(2).sum(), diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index e1f07ab..6024401 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -26,15 +26,15 @@ ) @pytest.mark.parametrize("l", [1, 2, 3]) def test_spherical_harmonics(l: int, dtype, tol): - vec = torch.randn(3, dtype=dtype) + vec = torch.randn(3, dtype=dtype, device="cuda") axis = np.random.randn(3) angle = np.random.rand() scale = 1.3 yl = cuet.spherical_harmonics([l], vec, False) - R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype) - Rl = torch.from_numpy(cue.SO3(l).rotation(axis, angle)).to(dtype) + R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).cuda() + Rl = torch.from_numpy(cue.SO3(l).rotation(axis, angle)).to(dtype).cuda() yl1 = cuet.spherical_harmonics([l], scale * R @ vec, False) yl2 = scale**l * Rl @ yl @@ -43,7 +43,7 @@ def test_spherical_harmonics(l: int, 
dtype, tol): def test_spherical_harmonics_full(): - vec = torch.randn(3) + vec = torch.randn(3, device="cuda") ls = [0, 1, 2, 3] yl = cuet.spherical_harmonics(ls, vec, False) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 62ba30e..5576397 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -30,7 +30,7 @@ @pytest.mark.parametrize("dtype", [torch.float64, torch.float32]) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("original_mace", [True, False]) -@pytest.mark.parametrize("batch", [0, 32]) +@pytest.mark.parametrize("batch", [1, 32]) def test_symmetric_contraction(dtype, layout, original_mace, batch): mul = 64 irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index 9540c73..4e12f73 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("irreps3", list_of_irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) -@pytest.mark.parametrize("batch", [0, 32]) +@pytest.mark.parametrize("batch", [1, 32]) def test_channel_wise( irreps1: cue.Irreps, irreps2: cue.Irreps, @@ -50,19 +50,30 @@ def test_channel_wise( device="cuda", dtype=torch.float64, ) + m_fx = cuet.ChannelWiseTensorProduct( + irreps1, + irreps2, + irreps3, + shared_weights=True, + internal_weights=True, + layout=layout, + device="cuda", + dtype=torch.float64, + use_fallback=True + ) x1 = torch.randn(batch, irreps1.dim, dtype=torch.float64).cuda() x2 = torch.randn(batch, irreps2.dim, dtype=torch.float64).cuda() - out1 = m(x1, x2, use_fallback=use_fallback) + out1 = m(x1, x2) d = descriptors.channelwise_tensor_product(irreps1, irreps2, irreps3).d d = d.squeeze_modes("v") assert d.subscripts == "u,iu,j,ku+ijk" if layout == cue.mul_ir: d = d.add_or_transpose_modes("u,ui,j,uk+ijk") - mfx = cuet.TensorProduct(d, math_dtype=torch.float64).cuda() - out2 = mfx([m.weight, x1, x2], use_fallback=True) + mfx = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).cuda() + out2 = mfx([m.weight, x1, x2]) torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) @@ -72,17 +83,6 @@ def test_channel_wise_bwd_bwd(): irreps2 = cue.Irreps("SO3", "0 + 1") irreps3 = cue.Irreps("SO3", "0 + 1") - m = cuet.ChannelWiseTensorProduct( - irreps1, - irreps2, - irreps3, - shared_weights=True, - internal_weights=False, - layout=cue.ir_mul, - device="cuda", - dtype=torch.float64, - ) - x1 = torch.randn( 32, irreps1.dim, device="cuda", requires_grad=True, dtype=torch.float64 ) @@ -95,6 +95,18 @@ def test_channel_wise_bwd_bwd(): outputs = {} for use_fallback in [True, False]: + m = cuet.ChannelWiseTensorProduct( + irreps1, + irreps2, + irreps3, + shared_weights=True, + internal_weights=False, + layout=cue.ir_mul, + device="cuda", + dtype=torch.float64, + use_fallback=use_fallback + ) + (grad1, grad2, grad3) = torch.autograd.grad( m(x1, x2, w).pow(2).sum(), (x1, x2, w), create_graph=True ) diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index 4e197fd..49c65a0 100644 --- 
a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -47,35 +47,37 @@ def test_fully_connected( layout=layout, device="cuda", dtype=torch.float64, + use_fallback=use_fallback ) x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).cuda() x2 = torch.randn(32, irreps2.dim, dtype=torch.float64).cuda() - out1 = m(x1, x2, use_fallback=use_fallback) + out1 = m(x1, x2) d = descriptors.fully_connected_tensor_product(irreps1, irreps2, irreps3).d if layout == cue.mul_ir: d = d.add_or_transpose_modes("uvw,ui,vj,wk+ijk") - mfx = cuet.TensorProduct(d, math_dtype=torch.float64).cuda() + mfx = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).cuda() out2 = mfx( [m.weight.to(torch.float64), x1.to(torch.float64), x2.to(torch.float64)], - use_fallback=True, ).to(out1.dtype) torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) def test_compile(): + device = "cuda" m = cuet.FullyConnectedTensorProduct( irreps_in1=cue.Irreps("O3", "32x0e + 32x1o"), irreps_in2=cue.Irreps("O3", "32x0e + 32x1o"), irreps_out=cue.Irreps("O3", "32x0e + 32x1o"), layout=cue.mul_ir, - optimize_fallback=False, + device=device, + use_fallback=False ) m_compile = torch.compile(m, fullgraph=True) - input1 = torch.randn(100, m.irreps_in1.dim) - input2 = torch.randn(100, m.irreps_in2.dim) + input1 = torch.randn(100, m.irreps_in1.dim, device=device) + input2 = torch.randn(100, m.irreps_in2.dim, device=device) m_compile(input1, input2) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 60bbacf..8f3ea2f 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -163,15 +163,28 @@ def test_precision_cuda_vs_fx( torch.testing.assert_close(y0, y1, atol=atol, rtol=rtol) -def test_compile(): - e = cue.descriptors.symmetric_contraction( - cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "32x0e + 32x1o"), [1, 2, 3] - ) - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, device="cuda", optimize_fallback=False) +@pytest.mark.parametrize("e", make_descriptors()) +@pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) +def test_compile( + e: cue.EquivariantTensorProduct, + dtype: torch.dtype, + math_dtype: torch.dtype, + atol: float, + rtol: float, +): + device = torch.device("cuda:0") + + m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, + use_fallback=False, + device="cuda") + inputs = [ + torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) + for inp in e.inputs + ] + res = m(inputs) m_compile = torch.compile(m, fullgraph=True) - input1 = torch.randn(100, e.inputs[0].irreps.dim).cuda() - input2 = torch.randn(100, e.inputs[1].irreps.dim).cuda() - m_compile([input1, input2]) + res_script = m_compile(inputs) + torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) From ab590c8d8d0e875e8d88dfe267e746d225782b5f Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 01:37:44 -0800 Subject: [PATCH 24/96] format with black --- .../primitives/equivariant_tensor_product.py | 65 +++++++++---------- .../primitives/symmetric_tensor_product.py | 34 +++++----- .../primitives/tensor_product.py | 53 ++++++++------- .../primitives/transpose.py | 26 ++++---- 
.../tests/operations/linear_test.py | 6 +- .../tests/operations/tp_channel_wise_test.py | 4 +- .../operations/tp_fully_connected_test.py | 4 +- .../equivariant_tensor_product_test.py | 22 +++---- .../symmetric_tensor_product_test.py | 10 ++- .../tests/primitives/tensor_product_test.py | 9 ++- 10 files changed, 121 insertions(+), 112 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index cacbd9c..1808ada 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -26,41 +26,33 @@ def __init__(self, tp): super().__init__() self.tp = tp + class Transpose1Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) return ret - + + class Transpose2Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) return ret + class Transpose3Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) ret[2] = self.tp[2](ret[2]) return ret + class Transpose4Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) @@ -68,7 +60,14 @@ def forward( ret[3] = self.tp[3](ret[3]) return ret -TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher, Transpose4Dispatcher] + +TRANSPOSE_DISPATCHERS = [ + Transpose1Dispatcher, + Transpose2Dispatcher, + Transpose3Dispatcher, + Transpose4Dispatcher, +] + class TPDispatcher(Dispatcher): def forward( @@ -80,9 +79,9 @@ def forward( # TODO: at some point we will have kernel for this assert len(inputs) >= 1 inputs[0] = inputs[0][indices] - return self.tp(inputs) + return self.tp(inputs) + - class SymmetricTPDispatcher(Dispatcher): def forward( self, @@ -91,12 +90,13 @@ def forward( ) -> torch.Tensor: assert indices is None return self.tp(inputs[0]) - + + class IWeightedSymmetricTPDispatcher(Dispatcher): def forward( - self, - inputs: List[torch.Tensor], - indices: Optional[torch.Tensor] = None, + self, + inputs: List[torch.Tensor], + indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: x0, x1 = inputs if indices is None: @@ -104,11 +104,10 @@ def forward( x0.ndim == 2, f"Expected x0 to have shape (batch, dim), got {x0.shape}", ) - indices = torch.arange( - x1.shape[0], dtype=torch.int32, device=x1.device - ) + indices = torch.arange(x1.shape[0], dtype=torch.int32, device=x1.device) return self.tp(x0, indices, x1) + class EquivariantTensorProduct(torch.nn.Module): r"""Equivariant tensor product. @@ -146,7 +145,7 @@ class EquivariantTensorProduct(torch.nn.Module): ... 
[0., 0., 0., 0., 0., 0.]]) """ - + def __init__( self, e: cue.EquivariantTensorProduct, @@ -184,19 +183,19 @@ def __init__( source=layout_used, target=input_expected.layout, device=device, - use_fallback = use_fallback + use_fallback=use_fallback, ) ) # script() requires literal addressing and fails to eliminate dead branches - self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in)-1](transpose_in) - + self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in) - 1](transpose_in) + self.transpose_out = cuet.TransposeIrrepsLayout( e.output.irreps, source=e.output.layout, target=layout_out, device=device, - use_fallback = use_fallback + use_fallback=use_fallback, ) if any(d.num_operands != e.num_inputs + 1 for d in e.ds): @@ -228,7 +227,7 @@ def __init__( e.ds[0], device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index f02be9c..bb30e39 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -57,7 +57,7 @@ def __init__( d0, device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) else: @@ -85,13 +85,11 @@ def __init__( descriptors, device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) - def forward( - self, x0: torch.Tensor - ) -> torch.Tensor: + def forward(self, x0: torch.Tensor) -> torch.Tensor: r""" Perform the forward pass of the indexed symmetric tensor product operation. 
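The constructor above picks one of the fixed-arity transpose dispatchers with a literal index, TRANSPOSE_DISPATCHERS[len(transpose_in) - 1], because, as the in-diff comment notes, torch.jit.script() needs literal addressing and does not eliminate dead branches. A minimal sketch of that pattern, with illustrative names that are not part of cuequivariance_torch:

    from typing import List
    import torch

    class Apply2(torch.nn.Module):
        # Fixed-arity wrapper: every ModuleList access uses a literal index,
        # which TorchScript can resolve when the module is scripted.
        def __init__(self, mods: torch.nn.ModuleList):
            super().__init__()
            self.mods = mods

        def forward(self, xs: List[torch.Tensor]) -> List[torch.Tensor]:
            out = xs.copy()                # copy so the caller's list is not mutated
            out[0] = self.mods[0](out[0])  # literal indices only
            out[1] = self.mods[1](out[1])
            return out

    mods = torch.nn.ModuleList([torch.nn.Identity(), torch.nn.Identity()])
    scripted = torch.jit.script(Apply2(mods))
    ys = scripted([torch.randn(4), torch.randn(4)])

The arity is known when the module is built, so selecting the matching wrapper from a list at construction time keeps the scripted forward free of data-dependent module indexing.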
@@ -109,7 +107,7 @@ def forward( out = self.f( torch.ones((1, 1), dtype=x0.dtype, device=x0.device), torch.zeros((x0.shape[0],), dtype=torch.int32, device=x0.device), - x0 + x0, ) if self.f0 is not None: out += self.f0([]) @@ -153,7 +151,7 @@ def __init__( self.x1_size = d.operands[1].size self.x2_size = d.operands[-1].size self.has_cuda = False - + if use_fallback is None or not use_fallback: try: self.f = CUDAKernel(descriptors, device, math_dtype) @@ -163,7 +161,7 @@ def __init__( logger.info(f"Failed to initialize CUDA implementation: {e}") except ImportError as e: logger.warning(f"Failed to initialize CUDA implementation: {e}") - + if use_fallback is None or use_fallback: self.f = FallbackImpl( descriptors, @@ -172,11 +170,15 @@ def __init__( optimize_fallback=optimize_fallback, ) else: - raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available") + raise RuntimeError( + "`use_fallback` is `False` and no CUDA kernel is available" + ) def __repr__(self): has_cuda_kernel = ( - "(with CUDA kernel)" if self.has_cuda is not None else "(without CUDA kernel)" + "(with CUDA kernel)" + if self.has_cuda is not None + else "(without CUDA kernel)" ) return f"IWeightedSymmetricTensorProduct({has_cuda_kernel})" @@ -212,16 +214,13 @@ def forward( ) shape = broadcast_shapes([i0.shape, x1.shape[:-1]]) i0 = i0.expand(shape).reshape((prod(shape),)) - x1 = x1.expand(shape + (x1.shape[-1],)).reshape( - (prod(shape), x1.shape[-1]) - ) - - out = self.f(x0, i0, x1) + x1 = x1.expand(shape + (x1.shape[-1],)).reshape((prod(shape), x1.shape[-1])) + + out = self.f(x0, i0, x1) out = out.reshape(shape + (self.x2_size,)) return out - def _check_descriptors(descriptors: list[stp.SegmentedTensorProduct]): if len(descriptors) == 0: raise ValueError("stps must contain at least one STP.") @@ -359,6 +358,5 @@ def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: return sum( - f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) - for f in self.fs + f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) for f in self.fs ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 0540eda..e0d5e98 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -23,12 +23,14 @@ logger = logging.getLogger(__name__) + def prod(numbers: List[int]): product = 1 for num in numbers: product *= num return product + def broadcast_shapes(shapes: List[List[int]]): if torch.jit.is_scripting(): max_len = 0 @@ -47,15 +49,23 @@ def broadcast_shapes(shapes: List[List[int]]): if isinstance(shape, (tuple, list)): for i in range(-1, -1 - len(shape), -1): if shape[i] < 0: - raise RuntimeError("Trying to create tensor with negative dimension ({}): ({})" - .format(shape[i], shape[i])) + raise RuntimeError( + "Trying to create tensor with negative dimension ({}): ({})".format( + shape[i], shape[i] + ) + ) if shape[i] == 1 or shape[i] == result[i]: continue if result[i] != 1: - raise RuntimeError("Shape mismatch: objects cannot be broadcast to a single shape") + raise RuntimeError( + "Shape mismatch: objects cannot be broadcast to a single shape" + ) result[i] = shape[i] else: - raise RuntimeError("Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", shape) + raise RuntimeError( + "Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", + 
shape, + ) return torch.Size(result) else: return torch.functional.broadcast_shapes(*shapes) @@ -76,6 +86,7 @@ class TensorProduct(torch.nn.Module): RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. """ + def __init__( self, descriptor: stp.SegmentedTensorProduct, @@ -91,8 +102,8 @@ def __init__( math_dtype = torch.get_default_dtype() self.f = None self.has_cuda = False - - if not use_fallback == True: + + if not use_fallback == True: try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) self.has_cuda = True @@ -107,9 +118,11 @@ def __init__( "pip install cuequivariance-ops-torch-cu11\n" "pip install cuequivariance-ops-torch-cu12" ) - - if use_fallback == False: - raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available!") + + if use_fallback == False: + raise RuntimeError( + "`use_fallback` is `False` and no CUDA kernel is available!" + ) else: self.f = _tensor_product_fx( descriptor, device, math_dtype, optimize_fallback is True @@ -118,7 +131,7 @@ def __init__( warnings.warn( "The fallback method is used but it has not been optimized. " "Consider setting optimize_fallback=True when creating the TensorProduct module." - ) + ) self._optimize_fallback = optimize_fallback def __repr__(self): @@ -216,9 +229,7 @@ def _tensor_product_fx( seg_shape = descriptor.get_segment_shape(-1, path) outputs += [ - out.reshape( - out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),) - ) + out.reshape(out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),)) ] if len(outputs) == 0: @@ -305,7 +316,7 @@ def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProdu self.module = module self.descriptor = descriptor - def forward(self, args:List[torch.Tensor]): + def forward(self, args: List[torch.Tensor]): if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): for oid, arg in enumerate(args): torch._assert( @@ -417,7 +428,7 @@ def _tensor_product_cuda( elif descriptor.num_operands == 4: return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) - + def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: # Make x have shape (Z, x.shape[-1]) or (x.shape[-1],) if prod(leading_shape) > 1 and prod(x.shape[:-1]) == 1: @@ -464,10 +475,7 @@ def __init__( def __repr__(self) -> str: return f"FusedTensorProductOp3({self.descriptor} (output last operand))" - def forward( - self, - inputs: List[torch.Tensor] - ) -> torch.Tensor: + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1 = self._perm(inputs[0], inputs[1]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -522,10 +530,7 @@ def __init__( def __repr__(self) -> str: return f"FusedTensorProductOp4({self.descriptor} (output last operand))" - def forward( - self, - inputs: List[torch.Tensor] - ) -> torch.Tensor: + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1, x2 = self._perm(inputs[0], inputs[1], inputs[2]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -545,6 +550,7 @@ def forward( return out.reshape(shape + (out.shape[-1],)) + class TensorProductUniform1d(torch.nn.Module): def __init__( self, @@ -571,6 +577,7 @@ def __init__( math_dtype=math_dtype, ).to(device=device) + class TensorProductUniform3x1d(TensorProductUniform1d): def __repr__(self): return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 
252e45e..9e7156e 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -36,19 +36,21 @@ def __init__( source: cue.IrrepsLayout, target: cue.IrrepsLayout, device: Optional[torch.device] = None, - use_fallback: Optional[bool] = False + use_fallback: Optional[bool] = False, ): super().__init__() if (source, target) == (cue.mul_ir, cue.ir_mul): self.f = TransposeSegments( - [(mul, ir.dim) for mul, ir in irreps], device=device, - use_fallback = use_fallback + [(mul, ir.dim) for mul, ir in irreps], + device=device, + use_fallback=use_fallback, ) elif (source, target) == (cue.ir_mul, cue.mul_ir): self.f = TransposeSegments( - [(ir.dim, mul) for mul, ir in irreps], device=device, - use_fallback = use_fallback + [(ir.dim, mul) for mul, ir in irreps], + device=device, + use_fallback=use_fallback, ) else: self.f = torch.nn.Identity() @@ -61,9 +63,7 @@ def __init__( def __repr__(self): return f"TransposeIrrepsLayout({self.source} -> {self.target})" - def forward( - self, x: torch.Tensor - ) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r""" Perform the transposition. @@ -82,8 +82,10 @@ def forward( class TransposeSegments(torch.nn.Module): def __init__( - self, segments: list[tuple[int, int]], device: Optional[torch.device] = None, - use_fallback: Optional[bool] = False + self, + segments: list[tuple[int, int]], + device: Optional[torch.device] = None, + use_fallback: Optional[bool] = False, ): super().__init__() @@ -105,9 +107,7 @@ def __init__( def __repr__(self): return "TransposeSegments()" - def forward( - self, x: torch.Tensor - ) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """ Perform the transposition of the input tensor using either a CUDA kernel or a PyTorch fallback. 
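Across these hunks, use_fallback becomes a constructor argument with three states rather than a per-call keyword: None tries the CUDA kernel and quietly falls back to the PyTorch implementation, False requires the CUDA kernel and raises if it cannot be built, and True skips the CUDA path entirely. A condensed sketch of that selection logic (the helper names here are placeholders, not library API):

    from typing import Callable, Optional

    def select_impl(
        use_fallback: Optional[bool],
        make_cuda_impl: Callable[[], object],
        make_fallback_impl: Callable[[], object],
    ):
        if use_fallback is not True:  # None (auto) or False: try the CUDA kernel first
            try:
                return make_cuda_impl()
            except (ImportError, NotImplementedError):
                if use_fallback is False:  # CUDA kernel was explicitly required
                    raise RuntimeError(
                        "`use_fallback` is `False` and no CUDA kernel is available"
                    )
        return make_fallback_impl()  # auto fallback, or fallback forced with True

Consistent with this, the tests below no longer pass use_fallback=True at call time; they build a CUDA-backed module and a fallback module up front and compare their outputs.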
diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index d0d8a41..26b9e5b 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -45,7 +45,7 @@ def test_linear_fwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=False + use_fallback=False, ) linear_fx = cuet.Linear( irreps_in, @@ -54,7 +54,7 @@ def test_linear_fwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=True + use_fallback=True, ) x = torch.randn(10, irreps_in.dim, dtype=torch.float64).cuda() @@ -90,7 +90,7 @@ def test_linear_bwd_bwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) # reset the seed to ensure the same initialization diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index 4e12f73..c48e1e1 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -59,7 +59,7 @@ def test_channel_wise( layout=layout, device="cuda", dtype=torch.float64, - use_fallback=True + use_fallback=True, ) x1 = torch.randn(batch, irreps1.dim, dtype=torch.float64).cuda() @@ -104,7 +104,7 @@ def test_channel_wise_bwd_bwd(): layout=cue.ir_mul, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) (grad1, grad2, grad3) = torch.autograd.grad( diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index 49c65a0..d9b19b4 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -47,7 +47,7 @@ def test_fully_connected( layout=layout, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).cuda() @@ -74,7 +74,7 @@ def test_compile(): irreps_out=cue.Irreps("O3", "32x0e + 32x1o"), layout=cue.mul_ir, device=device, - use_fallback=False + use_fallback=False, ) m_compile = torch.compile(m, fullgraph=True) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 8f3ea2f..59d44b8 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -77,7 +77,7 @@ def test_performance_cuda_vs_fx( math_dtype=math_dtype, use_fallback=False, ) - + m1 = cuet.EquivariantTensorProduct( e, layout=cue.ir_mul, @@ -141,11 +141,7 @@ def test_precision_cuda_vs_fx( for inp in e.inputs ] m = cuet.EquivariantTensorProduct( - e, - layout=cue.ir_mul, - device=device, - math_dtype=math_dtype, - use_fallback=False + e, layout=cue.ir_mul, device=device, math_dtype=math_dtype, use_fallback=False ) y0 = m(inputs) @@ -174,9 +170,9 @@ def test_compile( ): device = torch.device("cuda:0") - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, - use_fallback=False, - device="cuda") + m = cuet.EquivariantTensorProduct( + e, layout=cue.mul_ir, use_fallback=False, device="cuda" + ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs @@ -186,6 +182,7 @@ def test_compile( 
res_script = m_compile(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) def test_script( @@ -198,9 +195,9 @@ def test_script( device = torch.device("cuda:0") - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, - use_fallback=False, - device="cuda") + m = cuet.EquivariantTensorProduct( + e, layout=cue.mul_ir, use_fallback=False, device="cuda" + ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs @@ -209,4 +206,3 @@ def test_script( m_script = torch.jit.script(m) res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) - diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 909d71d..f5ab6aa 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -78,7 +78,11 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( out1 = m(x0, i0, x1) m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=torch.float64, device=device, use_fallback=True, optimize_fallback=True + ds, + math_dtype=torch.float64, + device=device, + use_fallback=True, + optimize_fallback=True, ) out2 = m(x0_, i0, x1_) @@ -122,7 +126,9 @@ def test_math_dtype( ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] ).ds - m = cuet.IWeightedSymmetricTensorProduct(ds, math_dtype=math_dtype, device=device, use_fallback=False) + m = cuet.IWeightedSymmetricTensorProduct( + ds, math_dtype=math_dtype, device=device, use_fallback=False + ) x0 = torch.randn((20, m.x0_size), dtype=dtype, device=device) i0 = torch.randint(0, m.x0_size, (1000,), dtype=torch.int32, device=device) x1 = torch.randn((i0.size(0), m.x1_size), dtype=dtype, device=device) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index c7bf8e2..e4ba7cc 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -113,9 +113,13 @@ def test_primitive_tensor_product_cuda_vs_fx( ) m = torch.jit.script(m) out1 = m(inputs) - + m = cuet.TensorProduct( - d, device=device, math_dtype=torch.float64, use_fallback=True, optimize_fallback=False + d, + device=device, + math_dtype=torch.float64, + use_fallback=True, + optimize_fallback=False, ) inputs_ = [inp.clone().to(torch.float64) for inp in inputs] out2 = m(inputs_) @@ -136,4 +140,3 @@ def test_primitive_tensor_product_cuda_vs_fx( for g1, g2 in zip(double_grad1, double_grad2): torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) - From ec1eb27425f7aea8f1289dffde94f180ae0925b2 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 01:39:39 -0800 Subject: [PATCH 25/96] format with black --- .../primitives/equivariant_tensor_product.py | 65 +++++++++---------- .../primitives/symmetric_tensor_product.py | 34 +++++----- .../primitives/tensor_product.py | 53 ++++++++------- .../primitives/transpose.py | 26 ++++---- .../tests/operations/linear_test.py | 6 +- .../tests/operations/tp_channel_wise_test.py | 4 +- .../operations/tp_fully_connected_test.py | 4 +- .../equivariant_tensor_product_test.py | 22 +++---- 
.../symmetric_tensor_product_test.py | 10 ++- .../tests/primitives/tensor_product_test.py | 9 ++- 10 files changed, 121 insertions(+), 112 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index cacbd9c..1808ada 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -26,41 +26,33 @@ def __init__(self, tp): super().__init__() self.tp = tp + class Transpose1Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) return ret - + + class Transpose2Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) return ret + class Transpose3Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) ret[2] = self.tp[2](ret[2]) return ret + class Transpose4Dispatcher(Dispatcher): - def forward( - self, - inputs: List[torch.Tensor] - ): + def forward(self, inputs: List[torch.Tensor]): ret = inputs.copy() ret[0] = self.tp[0](ret[0]) ret[1] = self.tp[1](ret[1]) @@ -68,7 +60,14 @@ def forward( ret[3] = self.tp[3](ret[3]) return ret -TRANSPOSE_DISPATCHERS = [Transpose1Dispatcher, Transpose2Dispatcher, Transpose3Dispatcher, Transpose4Dispatcher] + +TRANSPOSE_DISPATCHERS = [ + Transpose1Dispatcher, + Transpose2Dispatcher, + Transpose3Dispatcher, + Transpose4Dispatcher, +] + class TPDispatcher(Dispatcher): def forward( @@ -80,9 +79,9 @@ def forward( # TODO: at some point we will have kernel for this assert len(inputs) >= 1 inputs[0] = inputs[0][indices] - return self.tp(inputs) + return self.tp(inputs) + - class SymmetricTPDispatcher(Dispatcher): def forward( self, @@ -91,12 +90,13 @@ def forward( ) -> torch.Tensor: assert indices is None return self.tp(inputs[0]) - + + class IWeightedSymmetricTPDispatcher(Dispatcher): def forward( - self, - inputs: List[torch.Tensor], - indices: Optional[torch.Tensor] = None, + self, + inputs: List[torch.Tensor], + indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: x0, x1 = inputs if indices is None: @@ -104,11 +104,10 @@ def forward( x0.ndim == 2, f"Expected x0 to have shape (batch, dim), got {x0.shape}", ) - indices = torch.arange( - x1.shape[0], dtype=torch.int32, device=x1.device - ) + indices = torch.arange(x1.shape[0], dtype=torch.int32, device=x1.device) return self.tp(x0, indices, x1) + class EquivariantTensorProduct(torch.nn.Module): r"""Equivariant tensor product. @@ -146,7 +145,7 @@ class EquivariantTensorProduct(torch.nn.Module): ... 
[0., 0., 0., 0., 0., 0.]]) """ - + def __init__( self, e: cue.EquivariantTensorProduct, @@ -184,19 +183,19 @@ def __init__( source=layout_used, target=input_expected.layout, device=device, - use_fallback = use_fallback + use_fallback=use_fallback, ) ) # script() requires literal addressing and fails to eliminate dead branches - self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in)-1](transpose_in) - + self.transpose_in = TRANSPOSE_DISPATCHERS[len(transpose_in) - 1](transpose_in) + self.transpose_out = cuet.TransposeIrrepsLayout( e.output.irreps, source=e.output.layout, target=layout_out, device=device, - use_fallback = use_fallback + use_fallback=use_fallback, ) if any(d.num_operands != e.num_inputs + 1 for d in e.ds): @@ -228,7 +227,7 @@ def __init__( e.ds[0], device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index f02be9c..bb30e39 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -57,7 +57,7 @@ def __init__( d0, device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) else: @@ -85,13 +85,11 @@ def __init__( descriptors, device=device, math_dtype=math_dtype, - use_fallback = use_fallback, + use_fallback=use_fallback, optimize_fallback=optimize_fallback, ) - def forward( - self, x0: torch.Tensor - ) -> torch.Tensor: + def forward(self, x0: torch.Tensor) -> torch.Tensor: r""" Perform the forward pass of the indexed symmetric tensor product operation. 
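The forward pass in this file routes the unweighted symmetric product through the indexed, weighted kernel by passing a single all-ones weight row together with an all-zero index vector, so every batch element picks up the same constant weights. A small illustrative sketch of that reduction (f stands for any such indexed kernel; the name is a placeholder):

    import torch

    def unweighted_from_weighted(f, x0: torch.Tensor) -> torch.Tensor:
        # One shared weight row of ones ...
        ones = torch.ones((1, 1), dtype=x0.dtype, device=x0.device)
        # ... selected by every row of the batch via a zero index.
        idx = torch.zeros((x0.shape[0],), dtype=torch.int32, device=x0.device)
        return f(ones, idx, x0)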
@@ -109,7 +107,7 @@ def forward( out = self.f( torch.ones((1, 1), dtype=x0.dtype, device=x0.device), torch.zeros((x0.shape[0],), dtype=torch.int32, device=x0.device), - x0 + x0, ) if self.f0 is not None: out += self.f0([]) @@ -153,7 +151,7 @@ def __init__( self.x1_size = d.operands[1].size self.x2_size = d.operands[-1].size self.has_cuda = False - + if use_fallback is None or not use_fallback: try: self.f = CUDAKernel(descriptors, device, math_dtype) @@ -163,7 +161,7 @@ def __init__( logger.info(f"Failed to initialize CUDA implementation: {e}") except ImportError as e: logger.warning(f"Failed to initialize CUDA implementation: {e}") - + if use_fallback is None or use_fallback: self.f = FallbackImpl( descriptors, @@ -172,11 +170,15 @@ def __init__( optimize_fallback=optimize_fallback, ) else: - raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available") + raise RuntimeError( + "`use_fallback` is `False` and no CUDA kernel is available" + ) def __repr__(self): has_cuda_kernel = ( - "(with CUDA kernel)" if self.has_cuda is not None else "(without CUDA kernel)" + "(with CUDA kernel)" + if self.has_cuda is not None + else "(without CUDA kernel)" ) return f"IWeightedSymmetricTensorProduct({has_cuda_kernel})" @@ -212,16 +214,13 @@ def forward( ) shape = broadcast_shapes([i0.shape, x1.shape[:-1]]) i0 = i0.expand(shape).reshape((prod(shape),)) - x1 = x1.expand(shape + (x1.shape[-1],)).reshape( - (prod(shape), x1.shape[-1]) - ) - - out = self.f(x0, i0, x1) + x1 = x1.expand(shape + (x1.shape[-1],)).reshape((prod(shape), x1.shape[-1])) + + out = self.f(x0, i0, x1) out = out.reshape(shape + (self.x2_size,)) return out - def _check_descriptors(descriptors: list[stp.SegmentedTensorProduct]): if len(descriptors) == 0: raise ValueError("stps must contain at least one STP.") @@ -359,6 +358,5 @@ def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: return sum( - f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) - for f in self.fs + f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) for f in self.fs ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 0540eda..e0d5e98 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -23,12 +23,14 @@ logger = logging.getLogger(__name__) + def prod(numbers: List[int]): product = 1 for num in numbers: product *= num return product + def broadcast_shapes(shapes: List[List[int]]): if torch.jit.is_scripting(): max_len = 0 @@ -47,15 +49,23 @@ def broadcast_shapes(shapes: List[List[int]]): if isinstance(shape, (tuple, list)): for i in range(-1, -1 - len(shape), -1): if shape[i] < 0: - raise RuntimeError("Trying to create tensor with negative dimension ({}): ({})" - .format(shape[i], shape[i])) + raise RuntimeError( + "Trying to create tensor with negative dimension ({}): ({})".format( + shape[i], shape[i] + ) + ) if shape[i] == 1 or shape[i] == result[i]: continue if result[i] != 1: - raise RuntimeError("Shape mismatch: objects cannot be broadcast to a single shape") + raise RuntimeError( + "Shape mismatch: objects cannot be broadcast to a single shape" + ) result[i] = shape[i] else: - raise RuntimeError("Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", shape) + raise RuntimeError( + "Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", + 
shape, + ) return torch.Size(result) else: return torch.functional.broadcast_shapes(*shapes) @@ -76,6 +86,7 @@ class TensorProduct(torch.nn.Module): RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. """ + def __init__( self, descriptor: stp.SegmentedTensorProduct, @@ -91,8 +102,8 @@ def __init__( math_dtype = torch.get_default_dtype() self.f = None self.has_cuda = False - - if not use_fallback == True: + + if not use_fallback == True: try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) self.has_cuda = True @@ -107,9 +118,11 @@ def __init__( "pip install cuequivariance-ops-torch-cu11\n" "pip install cuequivariance-ops-torch-cu12" ) - - if use_fallback == False: - raise RuntimeError("`use_fallback` is `False` and no CUDA kernel is available!") + + if use_fallback == False: + raise RuntimeError( + "`use_fallback` is `False` and no CUDA kernel is available!" + ) else: self.f = _tensor_product_fx( descriptor, device, math_dtype, optimize_fallback is True @@ -118,7 +131,7 @@ def __init__( warnings.warn( "The fallback method is used but it has not been optimized. " "Consider setting optimize_fallback=True when creating the TensorProduct module." - ) + ) self._optimize_fallback = optimize_fallback def __repr__(self): @@ -216,9 +229,7 @@ def _tensor_product_fx( seg_shape = descriptor.get_segment_shape(-1, path) outputs += [ - out.reshape( - out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),) - ) + out.reshape(out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),)) ] if len(outputs) == 0: @@ -305,7 +316,7 @@ def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProdu self.module = module self.descriptor = descriptor - def forward(self, args:List[torch.Tensor]): + def forward(self, args: List[torch.Tensor]): if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): for oid, arg in enumerate(args): torch._assert( @@ -417,7 +428,7 @@ def _tensor_product_cuda( elif descriptor.num_operands == 4: return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) - + def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: # Make x have shape (Z, x.shape[-1]) or (x.shape[-1],) if prod(leading_shape) > 1 and prod(x.shape[:-1]) == 1: @@ -464,10 +475,7 @@ def __init__( def __repr__(self) -> str: return f"FusedTensorProductOp3({self.descriptor} (output last operand))" - def forward( - self, - inputs: List[torch.Tensor] - ) -> torch.Tensor: + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1 = self._perm(inputs[0], inputs[1]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -522,10 +530,7 @@ def __init__( def __repr__(self) -> str: return f"FusedTensorProductOp4({self.descriptor} (output last operand))" - def forward( - self, - inputs: List[torch.Tensor] - ) -> torch.Tensor: + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1, x2 = self._perm(inputs[0], inputs[1], inputs[2]) assert x0.ndim >= 1, x0.ndim assert x1.ndim >= 1, x1.ndim @@ -545,6 +550,7 @@ def forward( return out.reshape(shape + (out.shape[-1],)) + class TensorProductUniform1d(torch.nn.Module): def __init__( self, @@ -571,6 +577,7 @@ def __init__( math_dtype=math_dtype, ).to(device=device) + class TensorProductUniform3x1d(TensorProductUniform1d): def __repr__(self): return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 
252e45e..9e7156e 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -36,19 +36,21 @@ def __init__( source: cue.IrrepsLayout, target: cue.IrrepsLayout, device: Optional[torch.device] = None, - use_fallback: Optional[bool] = False + use_fallback: Optional[bool] = False, ): super().__init__() if (source, target) == (cue.mul_ir, cue.ir_mul): self.f = TransposeSegments( - [(mul, ir.dim) for mul, ir in irreps], device=device, - use_fallback = use_fallback + [(mul, ir.dim) for mul, ir in irreps], + device=device, + use_fallback=use_fallback, ) elif (source, target) == (cue.ir_mul, cue.mul_ir): self.f = TransposeSegments( - [(ir.dim, mul) for mul, ir in irreps], device=device, - use_fallback = use_fallback + [(ir.dim, mul) for mul, ir in irreps], + device=device, + use_fallback=use_fallback, ) else: self.f = torch.nn.Identity() @@ -61,9 +63,7 @@ def __init__( def __repr__(self): return f"TransposeIrrepsLayout({self.source} -> {self.target})" - def forward( - self, x: torch.Tensor - ) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r""" Perform the transposition. @@ -82,8 +82,10 @@ def forward( class TransposeSegments(torch.nn.Module): def __init__( - self, segments: list[tuple[int, int]], device: Optional[torch.device] = None, - use_fallback: Optional[bool] = False + self, + segments: list[tuple[int, int]], + device: Optional[torch.device] = None, + use_fallback: Optional[bool] = False, ): super().__init__() @@ -105,9 +107,7 @@ def __init__( def __repr__(self): return "TransposeSegments()" - def forward( - self, x: torch.Tensor - ) -> torch.Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """ Perform the transposition of the input tensor using either a CUDA kernel or a PyTorch fallback. 
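The transpose module reformatted above converts features between the mul_ir and ir_mul layouts by transposing each (multiplicity, irrep-dimension) segment of the flat feature axis. A rough pure-PyTorch illustration of that operation, not the library implementation (segment sizes below are made up for the example):

    import torch

    def transpose_segments(x: torch.Tensor, segments: list) -> torch.Tensor:
        # x: (batch, sum(rows * cols)); each segment is stored as a flattened
        # (rows, cols) block in the source layout and emitted as (cols, rows).
        outputs, offset = [], 0
        for rows, cols in segments:
            block = x[:, offset : offset + rows * cols].reshape(-1, rows, cols)
            outputs.append(block.transpose(1, 2).reshape(x.shape[0], rows * cols))
            offset += rows * cols
        return torch.cat(outputs, dim=1)

    x = torch.randn(8, 2 * 1 + 3 * 3)            # e.g. 2x0e + 3x1o stored as mul_ir
    y = transpose_segments(x, [(2, 1), (3, 3)])  # same data reordered to ir_mul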
diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index d0d8a41..26b9e5b 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -45,7 +45,7 @@ def test_linear_fwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=False + use_fallback=False, ) linear_fx = cuet.Linear( irreps_in, @@ -54,7 +54,7 @@ def test_linear_fwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=True + use_fallback=True, ) x = torch.randn(10, irreps_in.dim, dtype=torch.float64).cuda() @@ -90,7 +90,7 @@ def test_linear_bwd_bwd( shared_weights=shared_weights, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) # reset the seed to ensure the same initialization diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index 4e12f73..c48e1e1 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -59,7 +59,7 @@ def test_channel_wise( layout=layout, device="cuda", dtype=torch.float64, - use_fallback=True + use_fallback=True, ) x1 = torch.randn(batch, irreps1.dim, dtype=torch.float64).cuda() @@ -104,7 +104,7 @@ def test_channel_wise_bwd_bwd(): layout=cue.ir_mul, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) (grad1, grad2, grad3) = torch.autograd.grad( diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index 49c65a0..d9b19b4 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -47,7 +47,7 @@ def test_fully_connected( layout=layout, device="cuda", dtype=torch.float64, - use_fallback=use_fallback + use_fallback=use_fallback, ) x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).cuda() @@ -74,7 +74,7 @@ def test_compile(): irreps_out=cue.Irreps("O3", "32x0e + 32x1o"), layout=cue.mul_ir, device=device, - use_fallback=False + use_fallback=False, ) m_compile = torch.compile(m, fullgraph=True) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 8f3ea2f..59d44b8 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -77,7 +77,7 @@ def test_performance_cuda_vs_fx( math_dtype=math_dtype, use_fallback=False, ) - + m1 = cuet.EquivariantTensorProduct( e, layout=cue.ir_mul, @@ -141,11 +141,7 @@ def test_precision_cuda_vs_fx( for inp in e.inputs ] m = cuet.EquivariantTensorProduct( - e, - layout=cue.ir_mul, - device=device, - math_dtype=math_dtype, - use_fallback=False + e, layout=cue.ir_mul, device=device, math_dtype=math_dtype, use_fallback=False ) y0 = m(inputs) @@ -174,9 +170,9 @@ def test_compile( ): device = torch.device("cuda:0") - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, - use_fallback=False, - device="cuda") + m = cuet.EquivariantTensorProduct( + e, layout=cue.mul_ir, use_fallback=False, device="cuda" + ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs @@ -186,6 +182,7 @@ def test_compile( 
res_script = m_compile(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) def test_script( @@ -198,9 +195,9 @@ def test_script( device = torch.device("cuda:0") - m = cuet.EquivariantTensorProduct(e, layout=cue.mul_ir, - use_fallback=False, - device="cuda") + m = cuet.EquivariantTensorProduct( + e, layout=cue.mul_ir, use_fallback=False, device="cuda" + ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs @@ -209,4 +206,3 @@ def test_script( m_script = torch.jit.script(m) res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) - diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 909d71d..f5ab6aa 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -78,7 +78,11 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( out1 = m(x0, i0, x1) m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=torch.float64, device=device, use_fallback=True, optimize_fallback=True + ds, + math_dtype=torch.float64, + device=device, + use_fallback=True, + optimize_fallback=True, ) out2 = m(x0_, i0, x1_) @@ -122,7 +126,9 @@ def test_math_dtype( ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] ).ds - m = cuet.IWeightedSymmetricTensorProduct(ds, math_dtype=math_dtype, device=device, use_fallback=False) + m = cuet.IWeightedSymmetricTensorProduct( + ds, math_dtype=math_dtype, device=device, use_fallback=False + ) x0 = torch.randn((20, m.x0_size), dtype=dtype, device=device) i0 = torch.randint(0, m.x0_size, (1000,), dtype=torch.int32, device=device) x1 = torch.randn((i0.size(0), m.x1_size), dtype=dtype, device=device) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index c7bf8e2..e4ba7cc 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -113,9 +113,13 @@ def test_primitive_tensor_product_cuda_vs_fx( ) m = torch.jit.script(m) out1 = m(inputs) - + m = cuet.TensorProduct( - d, device=device, math_dtype=torch.float64, use_fallback=True, optimize_fallback=False + d, + device=device, + math_dtype=torch.float64, + use_fallback=True, + optimize_fallback=False, ) inputs_ = [inp.clone().to(torch.float64) for inp in inputs] out2 = m(inputs_) @@ -136,4 +140,3 @@ def test_primitive_tensor_product_cuda_vs_fx( for g1, g2 in zip(double_grad1, double_grad2): torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) - From faf235eb7e68484b4e37a56a5d09889ddb237119 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 01:58:46 -0800 Subject: [PATCH 26/96] fix tests --- cuequivariance_torch/tests/operations/linear_test.py | 4 ++++ .../tests/operations/tp_channel_wise_test.py | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index 26b9e5b..f06c8e5 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -38,6 +38,7 @@ 
def test_linear_fwd( layout: cue.IrrepsLayout, shared_weights: bool, ): + torch.manual_seed(0) linear = cuet.Linear( irreps_in, irreps_out, @@ -47,6 +48,8 @@ def test_linear_fwd( dtype=torch.float64, use_fallback=False, ) + + torch.manual_seed(0) linear_fx = cuet.Linear( irreps_in, irreps_out, @@ -83,6 +86,7 @@ def test_linear_bwd_bwd( ): outputs = dict() for use_fallback in [True, False]: + torch.manual_seed(0) linear = cuet.Linear( irreps_in, irreps_out, diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index c48e1e1..d3628ab 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -89,9 +89,6 @@ def test_channel_wise_bwd_bwd(): x2 = torch.randn( 32, irreps2.dim, device="cuda", requires_grad=True, dtype=torch.float64 ) - w = torch.randn( - m.weight_numel, device="cuda", requires_grad=True, dtype=torch.float64 - ) outputs = {} for use_fallback in [True, False]: @@ -107,6 +104,11 @@ def test_channel_wise_bwd_bwd(): use_fallback=use_fallback, ) + torch.manual_seed(0) + w = torch.randn( + m.weight_numel, device="cuda", requires_grad=True, dtype=torch.float64 + ) + (grad1, grad2, grad3) = torch.autograd.grad( m(x1, x2, w).pow(2).sum(), (x1, x2, w), create_graph=True ) From c476af9599a96a21a5650333838d810d30d5c2ec Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:11:36 -0800 Subject: [PATCH 27/96] fix missing parenthesis --- .../cuequivariance_torch/primitives/tensor_product.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index e0d5e98..ac8cf42 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -541,7 +541,7 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x1 = _reshape(x1, shape) x2 = _reshape(x2, shape) - if not torch.jit.is_scripting and not torch.compiler.is_compiling(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) @@ -607,7 +607,6 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: class TensorProductUniform4x1d(TensorProductUniform1d): - def __repr__(self): return f"TensorProductUniform4x1d({self.descriptor} (output last operand))" From 994b8d9640ee752938155f930dfa8161d0718dcc Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:11:53 -0800 Subject: [PATCH 28/96] fix tests: increase torch._dynamo.config.cache_size_limit --- .../tests/primitives/equivariant_tensor_product_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 59d44b8..5001a0c 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -16,11 +16,14 @@ import pytest import torch +import torch._dynamo import cuequivariance as cue import cuequivariance_torch as cuet from cuequivariance import descriptors +torch._dynamo.config.cache_size_limit = 100 + def make_descriptors(): # This ETP 
will trigger the fusedTP kernel @@ -171,7 +174,7 @@ def test_compile( device = torch.device("cuda:0") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda" + e, layout=cue.mul_ir, use_fallback=False, device="cuda", math_dtype=math_dtype ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -192,11 +195,10 @@ def test_script( atol: float, rtol: float, ): - device = torch.device("cuda:0") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda" + e, layout=cue.mul_ir, use_fallback=False, device="cuda", math_dtype=math_dtype ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) From f240eb8ca5a621adc2f1a1385f89427922ef8717 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:24:25 -0800 Subject: [PATCH 29/96] fix docstring tests --- .../layers/tp_conv_fully_connected.py | 22 +------------------ .../operations/symmetric_contraction.py | 5 +++-- .../primitives/equivariant_tensor_product.py | 18 ++++++--------- 3 files changed, 11 insertions(+), 34 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py index e4b3a58..c3eca17 100644 --- a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py @@ -72,27 +72,7 @@ class FullyConnectedTensorProductConv(nn.Module): >>> conv1 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, ... mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU(), layout=cue.ir_mul).cuda() >>> conv1 - FullyConnectedTensorProductConv( - (tp): FullyConnectedTensorProduct( - shared_weights=False, internal_weights=False, weight_numel=64 - (f): EquivariantTensorProduct( - EquivariantTensorProduct(64x0e x 4x0e+4x1o x 0e+1o -> 4x0e+4x1o) - (transpose_in): ModuleList( - (0-2): 3 x TransposeIrrepsLayout((irrep,mul) -> (irrep,mul)) - ) - (transpose_out): TransposeIrrepsLayout((irrep,mul) -> (irrep,mul)) - (tp): TensorProduct(uvw,iu,jv,kw+ijk sizes=64,16,4,16 num_segments=4,2,2,2 num_paths=4 i={1, 3} j={1, 3} k={1, 3} u=4 v=1 w=4 (with CUDA kernel)) - ) - ) - (batch_norm): BatchNorm(4x0e+4x1o, layout=(irrep,mul), eps=1e-05, momentum=0.1) - (mlp): Sequential( - (0): Linear(in_features=6, out_features=16, bias=True) - (1): ReLU() - (2): Linear(in_features=16, out_features=16, bias=True) - (3): ReLU() - (4): Linear(in_features=16, out_features=64, bias=True) - ) - ) + FullyConnectedTensorProductConv(...) >>> # out = conv1(src_features, edge_sh, edge_emb, graph) **Case 2**: If edge_emb is constructed by concatenating scalar features from diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index fac5739..e1b8d66 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -71,14 +71,15 @@ class SymmetricContraction(torch.nn.Module): ... layout_out=cue.mul_ir, ... original_mace=True, ... dtype=torch.float64, + ... device=torch.device("cuda"), ... 
) Then the execution is as follows: - >>> node_feats = torch.randn(128, 32, feats_irreps.dim // 32, dtype=torch.float64) + >>> node_feats = torch.randn(128, 32, feats_irreps.dim // 32, dtype=torch.float64).cuda() >>> # with node_attrs_index being the index version of node_attrs, sth like: >>> # node_attrs_index = torch.nonzero(node_attrs)[:, 1].int() - >>> node_attrs_index = torch.randint(0, 10, (128,), dtype=torch.int32) + >>> node_attrs_index = torch.randint(0, 10, (128,), dtype=torch.int32).cuda() >>> # OLD CALL: >>> # symmetric_contractions_old(node_feats, node_attrs) >>> # NEW CALL: diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 1808ada..48f4cdd 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -127,23 +127,19 @@ class EquivariantTensorProduct(torch.nn.Module): >>> e = cue.descriptors.fully_connected_tensor_product( ... cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1") ... ) - >>> w = torch.ones(e.inputs[0].irreps.dim) - >>> x1 = torch.ones(17, e.inputs[1].irreps.dim) - >>> x2 = torch.ones(17, e.inputs[2].irreps.dim) - >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul) + >>> w = torch.ones(e.inputs[0].irreps.dim).cuda() + >>> x1 = torch.ones(17, e.inputs[1].irreps.dim).cuda() + >>> x2 = torch.ones(17, e.inputs[2].irreps.dim).cuda() + >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul, device=torch.device("cuda")) >>> tp([w, x1, x2]) - tensor([[0., 0., 0., 0., 0., 0.], - ... - [0., 0., 0., 0., 0., 0.]]) + tensor([[0., 0., 0., 0., 0., 0.],...) You can optionally index the first input tensor: - >>> w = torch.ones(3, e.inputs[0].irreps.dim) + >>> w = torch.ones(3, e.inputs[0].irreps.dim).cuda() >>> indices = torch.randint(3, (17,)) >>> tp([w, x1, x2], indices=indices) - tensor([[0., 0., 0., 0., 0., 0.], - ... - [0., 0., 0., 0., 0., 0.]]) + tensor([[0., 0., 0., 0., 0., 0.],...) """ def __init__( From fbfb9d084187e82f95bd37fa216fdafcbcb3f76c Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:29:01 -0800 Subject: [PATCH 30/96] replace == by is --- .../cuequivariance_torch/primitives/tensor_product.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index ac8cf42..f5f783d 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -103,7 +103,7 @@ def __init__( self.f = None self.has_cuda = False - if not use_fallback == True: + if use_fallback is None or use_fallback is False: try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) self.has_cuda = True @@ -119,7 +119,7 @@ def __init__( "pip install cuequivariance-ops-torch-cu12" ) - if use_fallback == False: + if use_fallback is False: raise RuntimeError( "`use_fallback` is `False` and no CUDA kernel is available!" 
) From dc20be5f6285d0e6e9524748fcd5936ef543ecff Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:46:20 -0800 Subject: [PATCH 31/96] clean use_fallback conditions --- .../primitives/symmetric_tensor_product.py | 13 +++++----- .../primitives/tensor_product.py | 14 ++++++----- .../primitives/transpose.py | 25 ++++++++++--------- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index bb30e39..4b1485d 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -152,7 +152,7 @@ def __init__( self.x2_size = d.operands[-1].size self.has_cuda = False - if use_fallback is None or not use_fallback: + if use_fallback is None or use_fallback is False: try: self.f = CUDAKernel(descriptors, device, math_dtype) self.has_cuda = True @@ -162,17 +162,18 @@ def __init__( except ImportError as e: logger.warning(f"Failed to initialize CUDA implementation: {e}") - if use_fallback is None or use_fallback: + if use_fallback is False and not self.has_cuda: + raise RuntimeError( + "`use_fallback` is `False` and no CUDA kernel is available!" + ) + + if self.f is None: self.f = FallbackImpl( descriptors, device, math_dtype=math_dtype, optimize_fallback=optimize_fallback, ) - else: - raise RuntimeError( - "`use_fallback` is `False` and no CUDA kernel is available" - ) def __repr__(self): has_cuda_kernel = ( diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index f5f783d..23392eb 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -107,7 +107,6 @@ def __init__( try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) self.has_cuda = True - return except NotImplementedError as e: logger.info(f"CUDA implementation not available: {e}") except ImportError as e: @@ -119,19 +118,22 @@ def __init__( "pip install cuequivariance-ops-torch-cu12" ) - if use_fallback is False: + if use_fallback is False and not self.has_cuda: raise RuntimeError( "`use_fallback` is `False` and no CUDA kernel is available!" ) - else: - self.f = _tensor_product_fx( - descriptor, device, math_dtype, optimize_fallback is True - ) + + if self.f is None: if optimize_fallback is None: + optimize_fallback = False warnings.warn( "The fallback method is used but it has not been optimized. " "Consider setting optimize_fallback=True when creating the TensorProduct module." 
) + + self.f = _tensor_product_fx( + descriptor, device, math_dtype, optimize_fallback + ) self._optimize_fallback = optimize_fallback def __repr__(self): diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 9e7156e..848920c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -92,16 +92,20 @@ def __init__( info = _transpose_info(segments, device=device) if info is not None: - try: - import cuequivariance_ops_torch # noqa: F401 - except ImportError: - self.f_cuda = None - else: - self.f_cuda = _transpose(info).to(device=device) - if use_fallback: + if use_fallback is False or use_fallback is None: + try: + import cuequivariance_ops_torch # noqa: F401 + except ImportError: + self.f = None + else: + self.f = _transpose(info).to(device=device) + + if use_fallback is False and self.f is None: + raise RuntimeError("CUDA kernel not available for TransposeSegments.") + + if self.f is None: self.f = _transpose_segments_fx(segments).to(device=device) else: - self.f_cuda = torch.nn.Identity() self.f = torch.nn.Identity() def __repr__(self): @@ -130,10 +134,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: RuntimeError If `use_fallback` is `False` and a CUDA kernel is not available or the input is not on CUDA. """ - if self.f_cuda is not None: - return self.f_cuda(x) - else: - return self.f(x) + return self.f(x) def _transpose_segments_fx(segments: list[tuple[int, int]]) -> torch.nn.Module: From 4b201c35d6c90f383b91e692bef2316981b538e9 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:47:55 -0800 Subject: [PATCH 32/96] fix --- .../cuequivariance_torch/primitives/transpose.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py index 848920c..b23777a 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py @@ -90,13 +90,14 @@ def __init__( super().__init__() info = _transpose_info(segments, device=device) + self.f = None if info is not None: if use_fallback is False or use_fallback is None: try: import cuequivariance_ops_torch # noqa: F401 except ImportError: - self.f = None + pass else: self.f = _transpose(info).to(device=device) From b5b59b8daf3b3edcbe01127fc3764bafe7f2edac Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Fri, 6 Dec 2024 02:49:06 -0800 Subject: [PATCH 33/96] fix --- .../cuequivariance_torch/primitives/symmetric_tensor_product.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 4b1485d..b62d1b8 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -150,7 +150,9 @@ def __init__( self.x0_size = d.operands[0].size self.x1_size = d.operands[1].size self.x2_size = d.operands[-1].size + self.has_cuda = False + self.f = None if use_fallback is None or use_fallback is False: try: From 72baf17fe6ef76a6c4e5fa7ad84234f6b334e5d1 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 6 Dec 2024 19:09:44 -0800 Subject: [PATCH 34/96] 
Export test added, scripting fallback attempt Signed-off-by: Boris Fomitchev --- .../primitives/symmetric_tensor_product.py | 3 +- .../primitives/tensor_product.py | 39 ++- .../equivariant_tensor_product_test.py | 38 ++- .../tests/primitives/utils.py | 267 ++++++++++++++++++ 4 files changed, 338 insertions(+), 9 deletions(-) create mode 100644 cuequivariance_torch/tests/primitives/utils.py diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index bb30e39..2b9b564 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -348,6 +348,7 @@ def __init__( d, device=device, math_dtype=math_dtype, + use_fallback=True, optimize_fallback=optimize_fallback, ) for d in stps @@ -358,5 +359,5 @@ def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: return sum( - f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) for f in self.fs + f([x0[i0]] + [x1] * (f.num_operands - 2)) for f in self.fs ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index e0d5e98..064e286 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -102,7 +102,8 @@ def __init__( math_dtype = torch.get_default_dtype() self.f = None self.has_cuda = False - + self.num_operands = descriptor.num_operands + if not use_fallback == True: try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) @@ -289,7 +290,7 @@ def __init__(self, descriptor: stp.SegmentedTensorProduct): ), ) - def forward(self, *args): + def forward(self, args: List[torch.Tensor]): shape = broadcast_shapes([arg.shape[:-1] for arg in args]) output = torch.zeros( shape + (descriptor.operands[-1].size,), @@ -310,10 +311,37 @@ def forward(self, *args): return _Wrapper(graphmod, descriptor) +class _Caller(torch.nn.Module): + def __init__(self, module: torch.nn.Module): + super().__init__() + self.module = module + +class _NoArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module() + +class _OneArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0]) + +class _TwoArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0], args[1]) + +class _ThreeArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0], args[1], args[2]) + +class _FourArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0], args[1], args[2], args[3]) + +CALL_DISPATCHERS = [_NoArgCaller, _OneArgCaller, _TwoArgCaller, _ThreeArgCaller, _FourArgCaller] + class _Wrapper(torch.nn.Module): def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProduct): super().__init__() - self.module = module + self.module = CALL_DISPATCHERS[descriptor.num_operands-1](module) self.descriptor = descriptor def forward(self, args: List[torch.Tensor]): @@ -336,8 +364,7 @@ def forward(self, args: List[torch.Tensor]): ) for arg in args ] - - out = self.module(*args) + out = self.module(args) return out.reshape(shape + (out.shape[-1],)) @@ -541,7 +568,7 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x1 = 
_reshape(x1, shape) x2 = _reshape(x2, shape) - if not torch.jit.is_scripting and not torch.compiler.is_compiling(): + if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 59d44b8..a6099a1 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -21,6 +21,9 @@ import cuequivariance_torch as cuet from cuequivariance import descriptors +from utils import ( + module_with_mode, +) def make_descriptors(): # This ETP will trigger the fusedTP kernel @@ -171,7 +174,7 @@ def test_compile( device = torch.device("cuda:0") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda" + e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device="cuda" ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -196,7 +199,7 @@ def test_script( device = torch.device("cuda:0") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda" + e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device="cuda" ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -206,3 +209,34 @@ def test_script( m_script = torch.jit.script(m) res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + +# export_modes = ["onnx", "onnx_dynamo", "trt", "torch_trt", "jit"] +export_modes = ["trt","onnx"] + +@pytest.mark.parametrize("e", make_descriptors()) +@pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) +@pytest.mark.parametrize("mode", export_modes) + +def test_export( + e: cue.EquivariantTensorProduct, + dtype: torch.dtype, + math_dtype: torch.dtype, + atol: float, + rtol: float, + mode: str, + tmp_path +): + + device = torch.device("cuda:0") + + m = cuet.EquivariantTensorProduct( + e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device="cuda" + ) + inputs = [ + torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) + for inp in e.inputs + ] + res = m(inputs) + m_script = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) + res_script = m_script(inputs) + torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/utils.py b/cuequivariance_torch/tests/primitives/utils.py new file mode 100644 index 0000000..ce646fd --- /dev/null +++ b/cuequivariance_torch/tests/primitives/utils.py @@ -0,0 +1,267 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. 
+ +import os +import pytest +import torch +from typing import Sequence + +try: + import onnx # noqa: F401 + import onnxscript # noqa: F401 + import onnxruntime # noqa: F401 + import cuequivariance_ops_torch.onnx # noqa: F401 + from cuequivariance_ops_torch.tensorrt import register_plugins + + ONNX_AVAILABLE = True +except Exception: + ONNX_AVAILABLE = False + + +try: + import torch_tensorrt + + TORCH_TRT_AVAILABLE = True +except Exception: + TORCH_TRT_AVAILABLE = False + + +def verify_onnx(module, onnx_module, inputs, dtype): + if dtype != torch.float32: + pytest.skip("onnxrt only checked for float32") + from onnxruntime import SessionOptions + from onnxruntime_extensions import get_library_path + from torch.onnx.verification import ( + _compare_onnx_pytorch_model, + VerificationOptions, + ) + + original_init = SessionOptions.__init__ + + def new_init(self): + original_init(self) + try: + self.register_custom_ops_library(get_library_path()) + except Exception: + pass + + SessionOptions.__init__ = new_init + _compare_onnx_pytorch_model( + module, onnx_module, tuple(inputs), None, None, VerificationOptions() + ) + SessionOptions.__init__ = original_init + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +def verify_trt(module, onnx_module, inputs, dtype): + import tensorrt + from pkg_resources import parse_version + + if parse_version(tensorrt.__version__) < parse_version("10.3.0"): + pytest.skip("TRT < 10.3.0 is not supported!") + if dtype == torch.float64: + pytest.skip("TRT does not support float64") + + from polygraphy.backend.trt import ( + engine_from_network, + network_from_onnx_path, + TrtRunner, + CreateConfig, + ) + from polygraphy.backend.onnxrt import OnnxrtRunner + from polygraphy.comparator import Comparator, DataLoader + from onnxruntime import InferenceSession, SessionOptions + from onnxruntime_extensions import get_library_path + + register_plugins() + + network = network_from_onnx_path(onnx_module) + trt_engine = engine_from_network(network, config=CreateConfig()) + + if dtype != torch.float32: + pytest.skip("Comparator only supports float32") + + # Create runners for ONNX and TRT models + trt_runner = TrtRunner(trt_engine) + + options = SessionOptions() + options.register_custom_ops_library(get_library_path()) + onnx_runner = OnnxrtRunner(InferenceSession(onnx_module, sess_options=options)) + + results = Comparator.run([trt_runner, onnx_runner], data_loader=DataLoader()) + Comparator.compare_accuracy(results) + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +def module_with_mode( + mode, + module, + inputs, + math_dtype, + tmp_path, + grad_modes=["eager", "compile", "jit", "export"], +): + if isinstance(inputs[0], list): + dtype = inputs[0][0].dtype + else: + dtype = inputs[0].dtype + if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo", "export"]: + if not ONNX_AVAILABLE: + pytest.skip("ONNX not available!") + if dtype == torch.float64 or math_dtype == torch.float64: + pytest.skip("TRT/ORT do not support float64") + + with torch.set_grad_enabled(mode in grad_modes): + if mode == "compile": + import sys + + if sys.version_info.major == 3 and sys.version_info.minor >= 12: + pytest.skip("torch dynamo needs cpy <= 3.11") + module = torch.compile(module) + elif mode == "fx": + module = torch.fx.symbolic_trace(module) + elif mode == "jit": + module = torch.jit.trace(module, inputs) + fname = os.path.join(tmp_path, "test.ts") + torch.jit.save(module, fname) + module = torch.jit.load(fname) + elif mode == "export": + exp_program = torch.export.export(module, 
tuple(inputs)) + fname = os.path.join(tmp_path, "test.pt2") + torch.export.save(exp_program, fname) + del exp_program + module = torch.export.load(fname).module() + elif mode == "torch_trt": + if not TORCH_TRT_AVAILABLE: + pytest.skip("torch_tensorrt is not installed!") + register_plugins() + exp_program = torch_tensorrt.dynamo.trace(module, inputs) + module = torch_tensorrt.dynamo.compile( + exp_program, + inputs=inputs, + require_full_compilation=True, + min_block_size=1, + enabled_precisions={torch.float32, dtype}, + # dryrun=True + ) + elif mode == "onnx" or mode == "trt": + try: + onnx_path = os.path.join(tmp_path, "test.onnx") + torch.onnx.export( + module, tuple(inputs), onnx_path, opset_version=17, verbose=False + ) + if mode == "trt": + verify_trt(module, onnx_path, inputs, dtype) + else: + verify_onnx(module, onnx_path, inputs, dtype) + except ImportError: + pytest.skip("ONNX/TRT is not available") + + elif mode == "onnx_dynamo": + try: + from cuequivariance_ops_torch.onnx import ( + cuequivariance_ops_torch_onnx_registry, + ) + + export_options = torch.onnx.ExportOptions( + onnx_registry=cuequivariance_ops_torch_onnx_registry + ) + onnx_program = torch.onnx.dynamo_export( + module, *inputs, export_options=export_options + ) + onnx_path = os.path.join(tmp_path, "test.onnx") + onnx_program.save(onnx_path) + verify_onnx(module, onnx_path, inputs, dtype) + except ImportError: + pytest.skip("ONNX is not available") + elif mode == "eager": + pass + else: + raise ValueError(f"No such mode: {mode}") + + torch.cuda.synchronize() + torch.cuda.empty_cache() + + return module + + +def create_random_tensor_2d(batch_size, stride, requires_grad, dtype, is_shared): + data = torch.randn( + (stride,) if is_shared else (batch_size, stride), + dtype=dtype, + device="cuda", + ).requires_grad_(requires_grad) + + return data + + +def maybe_detach_and_to(tensor, *args, **kwargs): + if tensor is not None: + return tensor.clone().detach().to(*args, **kwargs) + return None + + +def run_fwd_test(module, x: Sequence): + with torch.no_grad(): + out = module(*x) + test_output = [maybe_detach_and_to(out, dtype=torch.float32)] + return test_output + + +def run_fwd_bwd_test(module, x: Sequence): + out = module(*x) + + loss = out.sum() + loss.backward() + + test_output = [maybe_detach_and_to(out, dtype=torch.float32)] + test_output.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) + + return test_output + + +def run_bwd_bwd_test(module, x: Sequence): + test_outputs = [] + out = module(*x) + grads = torch.autograd.grad(out.pow(2).sum(), x, create_graph=True) + test_outputs.extend([maybe_detach_and_to(g, dtype=torch.float32) for g in grads]) + loss = sum([g.sum() for g in grads]) + loss.backward() + test_outputs.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) + return test_outputs + + +def assert_close_modules(m_test, m_ref, inputs_test, procedure, tol_dict): + outs_test = procedure(m_test, inputs_test) + + inputs_ref = [ + x.clone() + .detach() + .to(device="cuda", dtype=torch.float32) + .requires_grad_(x.requires_grad) + for x in inputs_test + ] + outs_ref = procedure(m_ref, inputs_ref) + for out_test, out_ref in zip(outs_test, outs_ref): + torch.testing.assert_close(out_test, out_ref, **tol_dict) + + +tol_dict = { + # we compare against double for precision reasons + # hence FP64 and FP32 threshold are the same + (torch.float64, torch.float64): {"atol": 1e-9, "rtol": 1e-5}, + (torch.float32, torch.float64): {"atol": 1e-4, "rtol": 1e-5}, + (torch.float64, torch.float32): 
{"atol": 1e-4, "rtol": 1e-5}, + (torch.float32, torch.float32): {"atol": 1e-4, "rtol": 1e-5}, + (torch.bfloat16, torch.float32): {"atol": 4.0, "rtol": 1e-2}, + (torch.float16, torch.float32): {"atol": 0.25, "rtol": 1e-2}, +} From 8d319290985a1f2b0b2988f8cceb0a60b7661d39 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 13:48:46 +0100 Subject: [PATCH 35/96] enable tests on cpu --- .../primitives/tensor_product.py | 62 +++++++++++++++---- .../layers/tp_conv_fully_connected_test.py | 4 +- .../tests/operations/linear_test.py | 18 ++++-- .../tests/operations/rotation_test.py | 12 ++-- .../operations/spherical_harmonics_test.py | 10 +-- .../operations/symmetric_contraction_test.py | 14 +++-- .../tests/operations/tp_channel_wise_test.py | 39 ++++++------ .../operations/tp_fully_connected_test.py | 30 +++++---- .../equivariant_tensor_product_test.py | 42 +++++++------ .../tests/primitives/script_test.py | 56 +++++++++++------ .../symmetric_tensor_product_test.py | 32 ++++++---- .../tests/primitives/tensor_product_test.py | 15 +++-- .../tests/primitives/transpose_test.py | 18 +++--- 13 files changed, 223 insertions(+), 129 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index de5b971..b436161 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -19,6 +19,7 @@ import torch import torch.fx + from cuequivariance import segmented_tensor_product as stp logger = logging.getLogger(__name__) @@ -103,7 +104,7 @@ def __init__( self.f = None self.has_cuda = False self.num_operands = descriptor.num_operands - + if use_fallback is None or use_fallback is False: try: self.f = _tensor_product_cuda(descriptor, device, math_dtype) @@ -278,9 +279,9 @@ def _tensor_product_fx( for operand in descriptor.operands[:num_inputs] ] graphmod = opt_einsum_fx.optimize_einsums_full(graphmod, example_inputs) - else: + elif num_inputs == 0: - class _no_input_or_no_paths(torch.nn.Module): + class _no_input(torch.nn.Module): def __init__(self, descriptor: stp.SegmentedTensorProduct): super().__init__() @@ -292,12 +293,9 @@ def __init__(self, descriptor: stp.SegmentedTensorProduct): ), ) - def forward(self, args: List[torch.Tensor]): - shape = broadcast_shapes([arg.shape[:-1] for arg in args]) + def forward(self): output = torch.zeros( - shape + (descriptor.operands[-1].size,), - device=device, - dtype=math_dtype, + (descriptor.operands[-1].size,), device=device, dtype=math_dtype ) for pid in range(descriptor.num_paths): output += torch.einsum( @@ -308,7 +306,12 @@ def forward(self, args: List[torch.Tensor]): ) return output - graphmod = _no_input_or_no_paths(descriptor) + graphmod = _no_input(descriptor) + + else: + raise NotImplementedError( + "No FX implementation for empty paths and non-empty inputs" + ) return _Wrapper(graphmod, descriptor) @@ -317,33 +320,66 @@ class _Caller(torch.nn.Module): def __init__(self, module: torch.nn.Module): super().__init__() self.module = module - + + class _NoArgCaller(_Caller): def forward(self, args: List[torch.Tensor]): return self.module() + class _OneArgCaller(_Caller): def forward(self, args: List[torch.Tensor]): return self.module(args[0]) + class _TwoArgCaller(_Caller): def forward(self, args: List[torch.Tensor]): return self.module(args[0], args[1]) - + + class _ThreeArgCaller(_Caller): def forward(self, args: List[torch.Tensor]): return 
self.module(args[0], args[1], args[2]) + class _FourArgCaller(_Caller): def forward(self, args: List[torch.Tensor]): return self.module(args[0], args[1], args[2], args[3]) -CALL_DISPATCHERS = [_NoArgCaller, _OneArgCaller, _TwoArgCaller, _ThreeArgCaller, _FourArgCaller] + +class _FiveArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0], args[1], args[2], args[3], args[4]) + + +class _SixArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module(args[0], args[1], args[2], args[3], args[4], args[5]) + + +class _SevenArgCaller(_Caller): + def forward(self, args: List[torch.Tensor]): + return self.module( + args[0], args[1], args[2], args[3], args[4], args[5], args[6] + ) + + +CALL_DISPATCHERS = [ + _NoArgCaller, + _OneArgCaller, + _TwoArgCaller, + _ThreeArgCaller, + _FourArgCaller, + _FiveArgCaller, + _SixArgCaller, + _SevenArgCaller, +] + class _Wrapper(torch.nn.Module): def __init__(self, module: torch.nn.Module, descriptor: stp.SegmentedTensorProduct): super().__init__() - self.module = CALL_DISPATCHERS[descriptor.num_operands-1](module) + self.module = CALL_DISPATCHERS[descriptor.num_operands - 1](module) self.descriptor = descriptor def forward(self, args: List[torch.Tensor]): diff --git a/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py b/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py index 198c6d8..96602a3 100644 --- a/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py +++ b/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py @@ -21,7 +21,7 @@ import cuequivariance_torch as cuet from cuequivariance_torch.layers.tp_conv_fully_connected import scatter_reduce -device = torch.device("cuda:0") +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") @pytest.mark.parametrize("layout", [cue.mul_ir, cue.ir_mul]) @@ -133,7 +133,6 @@ def D(irreps, axis, angle): @pytest.mark.parametrize("reduce", ["sum", "mean", "prod", "amax", "amin"]) def test_scatter_reduce(reduce: str): - device = torch.device("cuda") src = torch.Tensor([3, 1, 0, 1, 1, 2]) index = torch.Tensor([0, 1, 2, 2, 3, 1]) @@ -153,7 +152,6 @@ def test_scatter_reduce(reduce: str): def test_scatter_reduce_empty(): - device = torch.device("cuda") src, index = torch.empty((0, 41)), torch.empty((0,)) src = src.to(device) index = index.to(device) diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index afd5632..2b78ff1 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -20,6 +20,8 @@ import cuequivariance as cue import cuequivariance_torch as cuet +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + list_of_irreps = [ cue.Irreps("SU2", "3x1/2 + 4x1"), cue.Irreps("SU2", "2x1/2 + 5x1 + 2x1/2"), @@ -37,13 +39,16 @@ def test_linear_fwd( layout: cue.IrrepsLayout, shared_weights: bool, ): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + torch.manual_seed(0) linear = cuet.Linear( irreps_in, irreps_out, layout=layout, shared_weights=shared_weights, - device="cuda", + device=device, dtype=torch.float64, use_fallback=False, ) @@ -54,7 +59,7 @@ def test_linear_fwd( irreps_out, layout=layout, shared_weights=shared_weights, - device="cuda", + device=device, dtype=torch.float64, use_fallback=True, ) @@ -83,6 +88,9 @@ def test_linear_bwd_bwd( layout: cue.IrrepsLayout, shared_weights: 
bool, ): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + outputs = dict() for use_fallback in [True, False]: torch.manual_seed(0) @@ -91,7 +99,7 @@ def test_linear_bwd_bwd( irreps_out, layout=layout, shared_weights=shared_weights, - device="cuda", + device=device, dtype=torch.float64, use_fallback=use_fallback, ) @@ -100,7 +108,7 @@ def test_linear_bwd_bwd( torch.manual_seed(0) x = torch.randn( - 10, irreps_in.dim, requires_grad=True, device="cuda", dtype=torch.float64 + 10, irreps_in.dim, requires_grad=True, device=device, dtype=torch.float64 ) if shared_weights: @@ -158,6 +166,6 @@ def test_linear_copy( irreps_out, layout=layout, shared_weights=shared_weights, - ).cuda() + ).to(device) copy.deepcopy(linear) diff --git a/cuequivariance_torch/tests/operations/rotation_test.py b/cuequivariance_torch/tests/operations/rotation_test.py index a73c68b..86d0230 100644 --- a/cuequivariance_torch/tests/operations/rotation_test.py +++ b/cuequivariance_torch/tests/operations/rotation_test.py @@ -17,16 +17,18 @@ import cuequivariance as cue import cuequivariance_torch as cuet +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + def test_rotation(): irreps = cue.Irreps("SO3", "3x0 + 1 + 0 + 4x2 + 4") - alpha = torch.tensor(0.3).cuda() - beta = torch.tensor(0.4).cuda() - gamma = torch.tensor(-0.5).cuda() + alpha = torch.tensor(0.3).to(device) + beta = torch.tensor(0.4).to(device) + gamma = torch.tensor(-0.5).to(device) - rot = cuet.Rotation(irreps, layout=cue.ir_mul).cuda() + rot = cuet.Rotation(irreps, layout=cue.ir_mul).to(device) - x = torch.randn(10, irreps.dim).cuda() + x = torch.randn(10, irreps.dim).to(device) rx = rot(gamma, beta, alpha, x) x_ = rot(-alpha, -beta, -gamma, rx) diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 73c3d40..955ee87 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -19,6 +19,8 @@ import cuequivariance as cue import cuequivariance_torch as cuet +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + @pytest.mark.parametrize( "dtype, tol", @@ -26,15 +28,15 @@ ) @pytest.mark.parametrize("ell", [1, 2, 3]) def test_spherical_harmonics(ell: int, dtype, tol): - vec = torch.randn(3, dtype=dtype, device="cuda") + vec = torch.randn(3, dtype=dtype, device=device) axis = np.random.randn(3) angle = np.random.rand() scale = 1.3 yl = cuet.spherical_harmonics([ell], vec, False) - R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).cuda() - Rl = torch.from_numpy(cue.SO3(ell).rotation(axis, angle)).to(dtype).cuda() + R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).to(device) + Rl = torch.from_numpy(cue.SO3(ell).rotation(axis, angle)).to(dtype).to(device) yl1 = cuet.spherical_harmonics([ell], scale * R @ vec, False) yl2 = scale**ell * Rl @ yl @@ -43,7 +45,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): def test_spherical_harmonics_full(): - vec = torch.randn(3, device="cuda") + vec = torch.randn(3, device=device) ls = [0, 1, 2, 3] yl = cuet.spherical_harmonics(ls, vec, False) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 46cf3f1..80a4065 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ 
b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -22,6 +22,8 @@ import cuequivariance_torch as cuet from cuequivariance.experimental.e3nn import O3_e3nn +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + USE_TF32 = False torch.backends.cuda.matmul.allow_tf32 = USE_TF32 torch.backends.cudnn.allow_tf32 = USE_TF32 @@ -45,12 +47,12 @@ def test_symmetric_contraction(dtype, layout, original_mace, batch): layout_out=layout, dtype=dtype, math_dtype=dtype, - device="cuda", + device=device, original_mace=original_mace, ) - x = torch.randn((batch, irreps_in.dim), dtype=dtype).cuda() - indices = torch.randint(0, 5, (batch,), dtype=torch.int32).cuda() + x = torch.randn((batch, irreps_in.dim), dtype=dtype).to(device) + indices = torch.randint(0, 5, (batch,), dtype=torch.int32).to(device) out = m(x, indices) assert out.shape == (batch, irreps_out.dim) @@ -58,7 +60,7 @@ def test_symmetric_contraction(dtype, layout, original_mace, batch): def from64(shape: tuple[int, ...], data: str) -> torch.Tensor: x = np.frombuffer(base64.b64decode(data), dtype=np.float32).reshape(shape) - return torch.from_numpy(x.copy()).cuda() + return torch.from_numpy(x.copy()).to(device) def test_mace_compatibility(): @@ -76,7 +78,7 @@ def test_mace_compatibility(): irreps_in = cue.Irreps(O3_e3nn, "0e + 1o + 2e") irreps_out = cue.Irreps(O3_e3nn, "0e + 1o") - i = (torch.arange(3) % num_elements).cuda() + i = (torch.arange(3) % num_elements).to(device) x = from64( (3, 36), "mHgaP1zHTz5kdhs/3ygQvwzZf77dhoU8+iP+PzzRRD8L9CY+qi9Fv5aiBz/sGJG/xwaev+5w4b2Mbg8+1jDOP4/cwj9rt/u/FedUP7H6qD4y9LM+i7yvPhcifz8coHE/Vkk1PwK0hb/BNig+GF4gP1FNaD94Uj++d+1qPtkrYD8m8o6/9zK9PihGBz9M6Ne9XgCXP/r6bzxTXJO/glIsQPQlDL/fN5w7VeeKP4iYlD/9Msa/GF/cvg+2gz/oRJ6/0Te4P7g+oz8YQ6g+k0q0vN8WEr41/u0/sa55PmAhvD9FZZw/ICJtvyxFkz+zOAq/8JtNPztZX74E9hK/xCdqv4+0Rz9Ah/g+5vmDv6mLL7+M5DI/xgP3PhWEnj5ZmZ0+DBkXwPa12D1mVPo9rDdWP4DkRD+L85Y9EJ01P+8Hiz6gxSM7/eoPwOQOtr8gjge+NBEYPrmg5L2XpO8/F2tCvjEyWL8gjLw+UOIuP5bhPr9qRvM+ADa5v3rqLLwSr/8+PbZhP4tn675SWVm/SMC1P5h/0r0D8v2/CNS7Pza7SL8PqJG+DsKCOpTKoT+xnLg/", @@ -95,7 +97,7 @@ def test_mace_compatibility(): layout_in=cue.ir_mul, layout_out=cue.mul_ir, original_mace=True, - device="cuda", + device=device, dtype=torch.float32, math_dtype=torch.float64, ) diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index d3628ab..d3e7cdd 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -19,6 +19,8 @@ import cuequivariance_torch as cuet from cuequivariance import descriptors +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + list_of_irreps = [ cue.Irreps("O3", "32x0e + 32x1o"), cue.Irreps("O3", "2x1o + 5x0e + 2e + 1e + 1o"), @@ -40,54 +42,49 @@ def test_channel_wise( use_fallback: bool, batch: int, ): - m = cuet.ChannelWiseTensorProduct( - irreps1, - irreps2, - irreps3, - shared_weights=True, - internal_weights=True, - layout=layout, - device="cuda", - dtype=torch.float64, - ) - m_fx = cuet.ChannelWiseTensorProduct( + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + m1 = cuet.ChannelWiseTensorProduct( irreps1, irreps2, irreps3, shared_weights=True, internal_weights=True, layout=layout, - device="cuda", + device=device, dtype=torch.float64, - use_fallback=True, + use_fallback=use_fallback, ) + x1 = torch.randn(batch, irreps1.dim, 
dtype=torch.float64).to(device) + x2 = torch.randn(batch, irreps2.dim, dtype=torch.float64).to(device) - x1 = torch.randn(batch, irreps1.dim, dtype=torch.float64).cuda() - x2 = torch.randn(batch, irreps2.dim, dtype=torch.float64).cuda() - - out1 = m(x1, x2) + out1 = m1(x1, x2) d = descriptors.channelwise_tensor_product(irreps1, irreps2, irreps3).d d = d.squeeze_modes("v") assert d.subscripts == "u,iu,j,ku+ijk" if layout == cue.mul_ir: d = d.add_or_transpose_modes("u,ui,j,uk+ijk") - mfx = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).cuda() - out2 = mfx([m.weight, x1, x2]) + m2 = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).to(device) + out2 = m2([m1.weight, x1, x2]) torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) def test_channel_wise_bwd_bwd(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + irreps1 = cue.Irreps("SO3", "2x0 + 3x1") irreps2 = cue.Irreps("SO3", "0 + 1") irreps3 = cue.Irreps("SO3", "0 + 1") x1 = torch.randn( - 32, irreps1.dim, device="cuda", requires_grad=True, dtype=torch.float64 + 32, irreps1.dim, device=device, requires_grad=True, dtype=torch.float64 ) x2 = torch.randn( - 32, irreps2.dim, device="cuda", requires_grad=True, dtype=torch.float64 + 32, irreps2.dim, device=device, requires_grad=True, dtype=torch.float64 ) outputs = {} diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index d9b19b4..832904b 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -19,6 +19,8 @@ import cuequivariance_torch as cuet from cuequivariance import descriptors +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + list_of_irreps = [ cue.Irreps("O3", "4x0e + 4x1o"), cue.Irreps("O3", "2x1o + 5x0e + 2e + 1e + 1o"), @@ -38,43 +40,49 @@ def test_fully_connected( layout: cue.IrrepsLayout, use_fallback: bool, ): - m = cuet.FullyConnectedTensorProduct( + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + m1 = cuet.FullyConnectedTensorProduct( irreps1, irreps2, irreps3, shared_weights=True, internal_weights=True, layout=layout, - device="cuda", + device=device, dtype=torch.float64, use_fallback=use_fallback, ) - x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).cuda() - x2 = torch.randn(32, irreps2.dim, dtype=torch.float64).cuda() + x1 = torch.randn(32, irreps1.dim, dtype=torch.float64).to(device) + x2 = torch.randn(32, irreps2.dim, dtype=torch.float64).to(device) - out1 = m(x1, x2) + out1 = m1(x1, x2) d = descriptors.fully_connected_tensor_product(irreps1, irreps2, irreps3).d if layout == cue.mul_ir: d = d.add_or_transpose_modes("uvw,ui,vj,wk+ijk") - mfx = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).cuda() - out2 = mfx( - [m.weight.to(torch.float64), x1.to(torch.float64), x2.to(torch.float64)], + m2 = cuet.TensorProduct(d, math_dtype=torch.float64, use_fallback=True).to(device) + out2 = m2( + [m1.weight.to(torch.float64), x1.to(torch.float64), x2.to(torch.float64)], ).to(out1.dtype) torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) -def test_compile(): - device = "cuda" +@pytest.mark.parametrize("use_fallback", [False, True]) +def test_compile(use_fallback: bool): + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + m = cuet.FullyConnectedTensorProduct( 
irreps_in1=cue.Irreps("O3", "32x0e + 32x1o"), irreps_in2=cue.Irreps("O3", "32x0e + 32x1o"), irreps_out=cue.Irreps("O3", "32x0e + 32x1o"), layout=cue.mul_ir, device=device, - use_fallback=False, + use_fallback=use_fallback, ) m_compile = torch.compile(m, fullgraph=True) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 01ca6d2..68531a7 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -17,17 +17,18 @@ import pytest import torch import torch._dynamo +from utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet from cuequivariance import descriptors +torch._dynamo.config.cache_size_limit = 100 -from utils import ( - module_with_mode, -) +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") -torch._dynamo.config.cache_size_limit = 100 def make_descriptors(): # This ETP will trigger the fusedTP kernel @@ -61,7 +62,7 @@ def make_descriptors(): (torch.float32, torch.float32), (torch.float64, torch.float64), ] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: settings1 += [ (torch.float16, torch.float32), (torch.bfloat16, torch.float32), @@ -75,7 +76,8 @@ def test_performance_cuda_vs_fx( dtype: torch.dtype, math_dtype: torch.dtype, ): - device = torch.device("cuda:0") + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") m = cuet.EquivariantTensorProduct( e, @@ -125,7 +127,7 @@ def f1(): (torch.float64, torch.float32, 1e-5, 1e-6), (torch.float64, torch.float64, 1e-12, 0), ] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: settings2 += [ (torch.float16, torch.float32, 1, 0.2), (torch.bfloat16, torch.float32, 1, 0.2), @@ -141,7 +143,8 @@ def test_precision_cuda_vs_fx( atol: float, rtol: float, ): - device = torch.device("cuda:0") + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -175,10 +178,11 @@ def test_compile( atol: float, rtol: float, ): - device = torch.device("cuda:0") + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda", math_dtype=math_dtype + e, layout=cue.mul_ir, use_fallback=False, device=device, math_dtype=math_dtype ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -199,10 +203,11 @@ def test_script( atol: float, rtol: float, ): - device = torch.device("cuda:0") + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device="cuda", math_dtype=math_dtype + e, layout=cue.mul_ir, use_fallback=False, device=device, math_dtype=math_dtype ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) @@ -213,13 +218,14 @@ def test_script( res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + # export_modes = ["onnx", "onnx_dynamo", "trt", "torch_trt", "jit"] -export_modes = ["trt","onnx"] +export_modes = ["trt", "onnx"] + @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, 
atol, rtol", settings2) @pytest.mark.parametrize("mode", export_modes) - def test_export( e: cue.EquivariantTensorProduct, dtype: torch.dtype, @@ -227,13 +233,13 @@ def test_export( atol: float, rtol: float, mode: str, - tmp_path + tmp_path, ): - - device = torch.device("cuda:0") + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device="cuda" + e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device=device ) inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py index 37b2a0c..4706bff 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -1,3 +1,4 @@ +import pytest import torch import cuequivariance as cue @@ -11,26 +12,32 @@ TensorProductUniform4x1d, ) +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + def test_script_symmetric_contraction(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + ds = cue.descriptors.symmetric_contraction( 32 * cue.Irreps("SO3", "0 + 1"), 32 * cue.Irreps("SO3", "0 + 1"), [1, 2, 3] ).ds batch = 12 - x0 = torch.randn(3, ds[0].operands[0].size, device="cuda:0", dtype=torch.float32) - i0 = torch.zeros(batch, device="cuda:0", dtype=torch.int32) - x1 = torch.randn( - batch, ds[0].operands[1].size, device="cuda:0", dtype=torch.float32 - ) + x0 = torch.randn(3, ds[0].operands[0].size, device=device, dtype=torch.float32) + i0 = torch.zeros(batch, device=device, dtype=torch.int32) + x1 = torch.randn(batch, ds[0].operands[1].size, device=device, dtype=torch.float32) - module = SymmetricTensorProduct(ds, torch.device("cuda:0"), torch.float32) + module = SymmetricTensorProduct(ds, device, torch.float32) module = torch.jit.script(module) assert module(x0, i0, x1).shape == (batch, ds[0].operands[-1].size) def test_script_fused_tp_3(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + d = ( cue.descriptors.full_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") @@ -40,16 +47,19 @@ def test_script_fused_tp_3(): ) batch = 12 - x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) - x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) - module = FusedTensorProductOp3(d, (0, 1), torch.device("cuda:0"), torch.float32) + module = FusedTensorProductOp3(d, (0, 1), device, torch.float32) module = torch.jit.script(module) assert module([x0, x1]).shape == (batch, d.operands[2].size) def test_script_fused_tp_4(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + d = ( cue.descriptors.fully_connected_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1"), cue.Irreps("SO3", "32x1") @@ -60,17 +70,20 @@ def test_script_fused_tp_4(): ) batch = 12 - x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) - x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) - x2 = torch.randn(batch, d.operands[2].size, device="cuda:0", dtype=torch.float32) + x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) + x1 = 
torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) + x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) - module = FusedTensorProductOp4(d, (0, 1, 2), torch.device("cuda:0"), torch.float32) + module = FusedTensorProductOp4(d, (0, 1, 2), device, torch.float32) module = torch.jit.script(module) assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) def test_script_uniform_tp_3(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + d = ( cue.descriptors.full_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1") @@ -80,16 +93,19 @@ def test_script_uniform_tp_3(): ) batch = 12 - x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) - x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) + x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) - module = TensorProductUniform3x1d(d, torch.device("cuda:0"), torch.float32) + module = TensorProductUniform3x1d(d, device, torch.float32) module = torch.jit.script(module) assert module([x0, x1]).shape == (batch, d.operands[2].size) def test_script_uniform_tp_4(): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + d = ( cue.descriptors.channelwise_tensor_product( cue.Irreps("SO3", "32x1"), cue.Irreps("SO3", "1"), cue.Irreps("SO3", "32x1") @@ -99,11 +115,11 @@ def test_script_uniform_tp_4(): ) batch = 12 - x0 = torch.randn(batch, d.operands[0].size, device="cuda:0", dtype=torch.float32) - x1 = torch.randn(batch, d.operands[1].size, device="cuda:0", dtype=torch.float32) - x2 = torch.randn(batch, d.operands[2].size, device="cuda:0", dtype=torch.float32) + x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) + x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) + x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) - module = TensorProductUniform4x1d(d, torch.device("cuda:0"), torch.float32) + module = TensorProductUniform4x1d(d, device, torch.float32) module = torch.jit.script(module) assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index f5ab6aa..7858576 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -20,6 +20,8 @@ import cuequivariance_torch as cuet from cuequivariance import descriptors +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + def make_descriptors(): yield descriptors.symmetric_contraction( @@ -47,7 +49,7 @@ def make_descriptors(): (torch.float32, torch.float64, 1e-5), ] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: settings1 += [ (torch.float16, torch.float32, 1.0), (torch.float16, torch.float64, 0.1), @@ -58,15 +60,22 @@ def make_descriptors(): @pytest.mark.parametrize("ds", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings1) +@pytest.mark.parametrize("use_fallback", [False, True]) def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( - ds: list[stp.SegmentedTensorProduct], dtype, math_dtype, tol: float + ds: 
list[stp.SegmentedTensorProduct], + dtype, + math_dtype, + tol: float, + use_fallback: bool, ): - device = torch.device("cuda:0") + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=math_dtype, device=device, use_fallback=False + ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback ) - m = torch.jit.script(m) + if use_fallback is False: + m = torch.jit.script(m) x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) i0 = torch.tensor([0, 1, 0], dtype=torch.int32, device=device) @@ -109,7 +118,7 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( (torch.float32, torch.float64), ] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: settings2 += [ (torch.float16, torch.float32), (torch.bfloat16, torch.float32), @@ -117,17 +126,16 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( @pytest.mark.parametrize("dtype, math_dtype", settings2) -def test_math_dtype( - dtype: torch.dtype, - math_dtype: torch.dtype, -): - device = torch.device("cuda:0") +@pytest.mark.parametrize("use_fallback", [False, True]) +def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: bool): + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] ).ds m = cuet.IWeightedSymmetricTensorProduct( - ds, math_dtype=math_dtype, device=device, use_fallback=False + ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback ) x0 = torch.randn((20, m.x0_size), dtype=dtype, device=device) i0 = torch.randint(0, m.x0_size, (1000,), dtype=torch.int32, device=device) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index e4ba7cc..d8c26ef 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -22,6 +22,8 @@ import cuequivariance_torch as cuet from cuequivariance import descriptors +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + def make_descriptors(): yield descriptors.fully_connected_tensor_product( @@ -80,7 +82,7 @@ def make_descriptors(): (torch.float64, torch.float64, 1e-12), ] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: settings += [ (torch.float16, torch.float32, 1.0), (torch.bfloat16, torch.float32, 1.0), @@ -89,13 +91,16 @@ def make_descriptors(): @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) +@pytest.mark.parametrize("use_fallback", [False, True]) def test_primitive_tensor_product_cuda_vs_fx( d: stp.SegmentedTensorProduct, dtype: torch.dtype, math_dtype: torch.dtype, tol: float, + use_fallback: bool, ): - device = torch.device("cuda:0") + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): inputs = [ @@ -109,9 +114,11 @@ def test_primitive_tensor_product_cuda_vs_fx( ] m = cuet.TensorProduct( - d, device=device, math_dtype=math_dtype, optimize_fallback=False + d, device=device, math_dtype=math_dtype, 
use_fallback=use_fallback ) - m = torch.jit.script(m) + if not use_fallback: + m = torch.jit.script(m) + out1 = m(inputs) m = cuet.TensorProduct( diff --git a/cuequivariance_torch/tests/primitives/transpose_test.py b/cuequivariance_torch/tests/primitives/transpose_test.py index 67ad700..31eb271 100644 --- a/cuequivariance_torch/tests/primitives/transpose_test.py +++ b/cuequivariance_torch/tests/primitives/transpose_test.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pytest import torch + import cuequivariance_torch as cuet -import pytest +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") dtypes = [torch.float32, torch.float64] -if torch.cuda.get_device_capability()[0] >= 8: +if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: dtypes += [torch.float16, torch.bfloat16] @@ -33,14 +35,16 @@ def test_transpose(use_fallback: bool, dtype: torch.dtype): 10 11 10 12 12 13 11 13 """ + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") x = torch.tensor( - [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10, 11, 12, 13]], dtype=dtype - ).cuda() + [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10, 11, 12, 13]], dtype=dtype, device=device + ) segments = [(2, 3), (2, 2)] xt = torch.tensor( - [[1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 10, 12, 11, 13]], dtype=dtype - ).cuda() + [[1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 10, 12, 11, 13]], dtype=dtype, device=device + ) - m = cuet.TransposeSegments(segments, use_fallback=use_fallback).cuda() + m = cuet.TransposeSegments(segments, device, use_fallback=use_fallback) torch.testing.assert_close(m(x), xt) From 8afa05674733a3a56e1e5b6a6f3c7c102077392e Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 13:51:01 +0100 Subject: [PATCH 36/96] fix tests --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dadac6c..828840a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -56,8 +56,8 @@ jobs: pytest --doctest-modules cuequivariance_jax cuequivariance-torch: - - runs-on: self-hosted + # runs-on: self-hosted (temporary unavailable) + runs-on: ubuntu-latest strategy: fail-fast: false matrix: From 09bbc8d95d08c88723500221ab29c8d2c1e30055 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 13:53:22 +0100 Subject: [PATCH 37/96] fix ruff --- .../cuequivariance_torch/primitives/symmetric_tensor_product.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 69f4a7c..dc61230 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging -import math from typing import Optional import torch From 9c38168cbc5f854ecaa579319b8825e395fdb58a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 13:55:24 +0100 Subject: [PATCH 38/96] fix --- .../cuequivariance_torch/layers/tp_conv_fully_connected.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py index dd1a255..b6a6fb0 100644 --- a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py @@ -70,7 +70,7 @@ class FullyConnectedTensorProductConv(nn.Module): having 16 channels. edge_emb.size(1) must match the size of the input layer: 6 >>> conv1 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, - ... mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU(), layout=cue.ir_mul).cuda() + ... mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU(), layout=cue.ir_mul) >>> conv1 FullyConnectedTensorProductConv(...) >>> # out = conv1(src_features, edge_sh, edge_emb, graph) @@ -92,7 +92,7 @@ class FullyConnectedTensorProductConv(nn.Module): **Case 3**: No MLP, edge_emb will be directly used as the tensor product weights: >>> conv3 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, - ... mlp_channels=None, layout=cue.ir_mul).cuda() + ... mlp_channels=None, layout=cue.ir_mul) >>> # out = conv3(src_features, edge_sh, edge_emb, graph) """ From de9af8f48c42ed7116b73612fbb507db17d89730 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 13:57:13 +0100 Subject: [PATCH 39/96] fix docstring tests --- .../operations/symmetric_contraction.py | 7 ++++--- .../primitives/equivariant_tensor_product.py | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index e1b8d66..a91a72a 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -52,6 +52,7 @@ class SymmetricContraction(torch.nn.Module): The argument `original_mace` can be set to `True` to emulate the original MACE implementation. + >>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") >>> feats_irreps = cue.Irreps("O3", "32x0e + 32x1o + 32x2e") >>> target_irreps = cue.Irreps("O3", "32x0e + 32x1o") >>> # OLD FUNCTION DEFINITION: @@ -71,15 +72,15 @@ class SymmetricContraction(torch.nn.Module): ... layout_out=cue.mul_ir, ... original_mace=True, ... dtype=torch.float64, - ... device=torch.device("cuda"), + ... device=device, ... 
) Then the execution is as follows: - >>> node_feats = torch.randn(128, 32, feats_irreps.dim // 32, dtype=torch.float64).cuda() + >>> node_feats = torch.randn(128, 32, feats_irreps.dim // 32, dtype=torch.float64, device=device) >>> # with node_attrs_index being the index version of node_attrs, sth like: >>> # node_attrs_index = torch.nonzero(node_attrs)[:, 1].int() - >>> node_attrs_index = torch.randint(0, 10, (128,), dtype=torch.int32).cuda() + >>> node_attrs_index = torch.randint(0, 10, (128,), dtype=torch.int32, device=device) >>> # OLD CALL: >>> # symmetric_contractions_old(node_feats, node_attrs) >>> # NEW CALL: diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 3bd3b63..16701ff 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -124,19 +124,20 @@ class EquivariantTensorProduct(torch.nn.Module): RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. Examples: + >>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") >>> e = cue.descriptors.fully_connected_tensor_product( ... cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1") ... ) - >>> w = torch.ones(e.inputs[0].irreps.dim).cuda() - >>> x1 = torch.ones(17, e.inputs[1].irreps.dim).cuda() - >>> x2 = torch.ones(17, e.inputs[2].irreps.dim).cuda() - >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul, device=torch.device("cuda")) + >>> w = torch.ones(e.inputs[0].irreps.dim, device=device) + >>> x1 = torch.ones(17, e.inputs[1].irreps.dim, device=device) + >>> x2 = torch.ones(17, e.inputs[2].irreps.dim, device=device) + >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul, device=device) >>> tp([w, x1, x2]) tensor([[0., 0., 0., 0., 0., 0.],...) You can optionally index the first input tensor: - >>> w = torch.ones(3, e.inputs[0].irreps.dim).cuda() + >>> w = torch.ones(3, e.inputs[0].irreps.dim, device=device) >>> indices = torch.randint(3, (17,)) >>> tp([w, x1, x2], indices=indices) tensor([[0., 0., 0., 0., 0., 0.],...) 
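The commits above converge on one device-selection pattern for tests and docstrings: pick `cuda:0` when available, fall back to CPU, and skip tests that genuinely need the CUDA kernels. A minimal sketch of that pattern follows; the test name and tensor shape are hypothetical, while the `device` line and the `pytest.skip` guard are copied from the diffs above.

    import pytest
    import torch

    # Same fallback used at module scope throughout the test files in these patches.
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


    def test_requires_cuda_kernel():  # hypothetical test name, for illustration only
        # Kernel-only code paths skip cleanly on CPU-only runners instead of erroring.
        if not torch.cuda.is_available():
            pytest.skip("CUDA is not available")
        # Allocate directly on `device` rather than calling .cuda() on the tensor.
        x = torch.randn(16, 3, device=device)
        assert x.shape == (16, 3)

Used consistently, this keeps `pytest --doctest-modules` runnable on the CPU-only `ubuntu-latest` runner configured in PATCH 36 while still exercising the CUDA kernels on GPU machines.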
From 999a31defde6112659f87fe24484328cbb26eb26 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 9 Dec 2024 14:00:11 +0100 Subject: [PATCH 40/96] add -x to tests --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 828840a..def808e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -29,7 +29,7 @@ jobs: python -m pip install ./cuequivariance - name: Test with pytest run: | - pytest --doctest-modules cuequivariance + pytest --doctest-modules -x cuequivariance cuequivariance-jax: @@ -53,7 +53,7 @@ jobs: python -m pip install ./cuequivariance_jax - name: Test with pytest run: | - pytest --doctest-modules cuequivariance_jax + pytest --doctest-modules -x cuequivariance_jax cuequivariance-torch: # runs-on: self-hosted (temporary unavailable) @@ -79,4 +79,4 @@ jobs: python -m pip install e3nn - name: Test with pytest run: | - pytest --doctest-modules cuequivariance_torch + pytest --doctest-modules -x cuequivariance_torch From 8c435fecdb11d28af9921a3aa32fd96b57303a5c Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 11 Dec 2024 12:56:13 -0800 Subject: [PATCH 41/96] Working around torch_tensorrt bugs Signed-off-by: Boris Fomitchev --- .../tests/primitives/equivariant_tensor_product_test.py | 3 +-- cuequivariance_torch/tests/primitives/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 68531a7..d581dd1 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -219,8 +219,7 @@ def test_script( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -# export_modes = ["onnx", "onnx_dynamo", "trt", "torch_trt", "jit"] -export_modes = ["trt", "onnx"] +export_modes = ["export", "onnx", "trt", "torch_trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) diff --git a/cuequivariance_torch/tests/primitives/utils.py b/cuequivariance_torch/tests/primitives/utils.py index ce646fd..1de6266 100644 --- a/cuequivariance_torch/tests/primitives/utils.py +++ b/cuequivariance_torch/tests/primitives/utils.py @@ -114,7 +114,7 @@ def module_with_mode( dtype = inputs[0][0].dtype else: dtype = inputs[0].dtype - if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo", "export"]: + if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo"]: if not ONNX_AVAILABLE: pytest.skip("ONNX not available!") if dtype == torch.float64 or math_dtype == torch.float64: @@ -144,7 +144,7 @@ def module_with_mode( if not TORCH_TRT_AVAILABLE: pytest.skip("torch_tensorrt is not installed!") register_plugins() - exp_program = torch_tensorrt.dynamo.trace(module, inputs) + exp_program = torch.export.export(module, tuple(inputs)) module = torch_tensorrt.dynamo.compile( exp_program, inputs=inputs, From ae0bff22af6b747008b97359f4d1839cd49caa4c Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 11 Dec 2024 13:52:11 -0800 Subject: [PATCH 42/96] Fixing utils.py import Signed-off-by: Boris Fomitchev --- cuequivariance_torch/tests/__init__.py | 0 .../equivariant_tensor_product_test.py | 193 +------------ .../tests/primitives/utils.py | 267 ------------------ 3 files changed, 4 insertions(+), 456 deletions(-) create mode 100644 cuequivariance_torch/tests/__init__.py delete mode 100644 
cuequivariance_torch/tests/primitives/utils.py diff --git a/cuequivariance_torch/tests/__init__.py b/cuequivariance_torch/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 602370c..b38c0f7 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -19,199 +19,14 @@ import torch import torch._dynamo +from tests.utils import ( + module_with_mode, +) + import cuequivariance as cue import cuequivariance_torch as cuet from cuequivariance import descriptors -torch._dynamo.config.cache_size_limit = 100 - -try: - import cuequivariance_ops_torch.onnx # noqa: F401 - import onnx # noqa: F401 - import onnxruntime # noqa: F401 - import onnxscript # noqa: F401 - from cuequivariance_ops_torch.tensorrt import register_plugins - - ONNX_AVAILABLE = True -except Exception: - ONNX_AVAILABLE = False - - -try: - import torch_tensorrt - - TORCH_TRT_AVAILABLE = True -except Exception: - TORCH_TRT_AVAILABLE = False - - -def verify_onnx(module, onnx_module, inputs, dtype): - if dtype != torch.float32: - pytest.skip("onnxrt only checked for float32") - from onnxruntime import SessionOptions - from onnxruntime_extensions import get_library_path - from torch.onnx.verification import ( - VerificationOptions, - _compare_onnx_pytorch_model, - ) - - original_init = SessionOptions.__init__ - - def new_init(self): - original_init(self) - try: - self.register_custom_ops_library(get_library_path()) - except Exception: - pass - - SessionOptions.__init__ = new_init - _compare_onnx_pytorch_model( - module, onnx_module, tuple(inputs), None, None, VerificationOptions() - ) - SessionOptions.__init__ = original_init - torch.cuda.synchronize() - torch.cuda.empty_cache() - - -def verify_trt(module, onnx_module, inputs, dtype): - import tensorrt - from pkg_resources import parse_version - - if parse_version(tensorrt.__version__) < parse_version("10.3.0"): - pytest.skip("TRT < 10.3.0 is not supported!") - if dtype == torch.float64: - pytest.skip("TRT does not support float64") - - from onnxruntime import InferenceSession, SessionOptions - from onnxruntime_extensions import get_library_path - from polygraphy.backend.onnxrt import OnnxrtRunner - from polygraphy.backend.trt import ( - CreateConfig, - TrtRunner, - engine_from_network, - network_from_onnx_path, - ) - from polygraphy.comparator import Comparator, DataLoader - - register_plugins() - - network = network_from_onnx_path(onnx_module) - trt_engine = engine_from_network(network, config=CreateConfig()) - - if dtype != torch.float32: - pytest.skip("Comparator only supports float32") - - # Create runners for ONNX and TRT models - trt_runner = TrtRunner(trt_engine) - - options = SessionOptions() - options.register_custom_ops_library(get_library_path()) - onnx_runner = OnnxrtRunner(InferenceSession(onnx_module, sess_options=options)) - - results = Comparator.run([trt_runner, onnx_runner], data_loader=DataLoader()) - Comparator.compare_accuracy(results) - torch.cuda.synchronize() - torch.cuda.empty_cache() - - -def module_with_mode( - mode, - module, - inputs, - math_dtype, - tmp_path, - grad_modes=["eager", "compile", "jit", "export"], -): - if isinstance(inputs[0], list): - dtype = inputs[0][0].dtype - else: - dtype = inputs[0].dtype - if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo", "export"]: - 
if not ONNX_AVAILABLE: - pytest.skip("ONNX not available!") - if dtype == torch.float64 or math_dtype == torch.float64: - pytest.skip("TRT/ORT do not support float64") - - with torch.set_grad_enabled(mode in grad_modes): - if mode == "compile": - import sys - - if sys.version_info.major == 3 and sys.version_info.minor >= 12: - pytest.skip("torch dynamo needs cpy <= 3.11") - module = torch.compile(module) - elif mode == "fx": - module = torch.fx.symbolic_trace(module) - elif mode == "jit": - module = torch.jit.trace(module, inputs) - fname = os.path.join(tmp_path, "test.ts") - torch.jit.save(module, fname) - module = torch.jit.load(fname) - elif mode == "export": - exp_program = torch.export.export(module, tuple(inputs)) - fname = os.path.join(tmp_path, "test.pt2") - torch.export.save(exp_program, fname) - del exp_program - module = torch.export.load(fname).module() - elif mode == "torch_trt": - if not TORCH_TRT_AVAILABLE: - pytest.skip("torch_tensorrt is not installed!") - register_plugins() - exp_program = torch_tensorrt.dynamo.trace(module, inputs) - module = torch_tensorrt.dynamo.compile( - exp_program, - inputs=inputs, - require_full_compilation=True, - min_block_size=1, - enabled_precisions={torch.float32, dtype}, - # dryrun=True - ) - elif mode == "onnx" or mode == "trt": - try: - onnx_path = os.path.join(tmp_path, "test.onnx") - torch.onnx.export( - module, tuple(inputs), onnx_path, opset_version=17, verbose=False - ) - if mode == "trt": - verify_trt(module, onnx_path, inputs, dtype) - else: - verify_onnx(module, onnx_path, inputs, dtype) - except ImportError: - pytest.skip("ONNX/TRT is not available") - - elif mode == "onnx_dynamo": - try: - from cuequivariance_ops_torch.onnx import ( - cuequivariance_ops_torch_onnx_registry, - ) - - export_options = torch.onnx.ExportOptions( - onnx_registry=cuequivariance_ops_torch_onnx_registry - ) - onnx_program = torch.onnx.dynamo_export( - module, *inputs, export_options=export_options - ) - onnx_path = os.path.join(tmp_path, "test.onnx") - onnx_program.save(onnx_path) - verify_onnx(module, onnx_path, inputs, dtype) - except ImportError: - pytest.skip("ONNX is not available") - elif mode == "eager": - pass - else: - raise ValueError(f"No such mode: {mode}") - - torch.cuda.synchronize() - torch.cuda.empty_cache() - - return module - - -def maybe_detach_and_to(tensor, *args, **kwargs): - if tensor is not None: - return tensor.clone().detach().to(*args, **kwargs) - return None - - device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") diff --git a/cuequivariance_torch/tests/primitives/utils.py b/cuequivariance_torch/tests/primitives/utils.py deleted file mode 100644 index 1de6266..0000000 --- a/cuequivariance_torch/tests/primitives/utils.py +++ /dev/null @@ -1,267 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NvidiaProprietary -# -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. 
- -import os -import pytest -import torch -from typing import Sequence - -try: - import onnx # noqa: F401 - import onnxscript # noqa: F401 - import onnxruntime # noqa: F401 - import cuequivariance_ops_torch.onnx # noqa: F401 - from cuequivariance_ops_torch.tensorrt import register_plugins - - ONNX_AVAILABLE = True -except Exception: - ONNX_AVAILABLE = False - - -try: - import torch_tensorrt - - TORCH_TRT_AVAILABLE = True -except Exception: - TORCH_TRT_AVAILABLE = False - - -def verify_onnx(module, onnx_module, inputs, dtype): - if dtype != torch.float32: - pytest.skip("onnxrt only checked for float32") - from onnxruntime import SessionOptions - from onnxruntime_extensions import get_library_path - from torch.onnx.verification import ( - _compare_onnx_pytorch_model, - VerificationOptions, - ) - - original_init = SessionOptions.__init__ - - def new_init(self): - original_init(self) - try: - self.register_custom_ops_library(get_library_path()) - except Exception: - pass - - SessionOptions.__init__ = new_init - _compare_onnx_pytorch_model( - module, onnx_module, tuple(inputs), None, None, VerificationOptions() - ) - SessionOptions.__init__ = original_init - torch.cuda.synchronize() - torch.cuda.empty_cache() - - -def verify_trt(module, onnx_module, inputs, dtype): - import tensorrt - from pkg_resources import parse_version - - if parse_version(tensorrt.__version__) < parse_version("10.3.0"): - pytest.skip("TRT < 10.3.0 is not supported!") - if dtype == torch.float64: - pytest.skip("TRT does not support float64") - - from polygraphy.backend.trt import ( - engine_from_network, - network_from_onnx_path, - TrtRunner, - CreateConfig, - ) - from polygraphy.backend.onnxrt import OnnxrtRunner - from polygraphy.comparator import Comparator, DataLoader - from onnxruntime import InferenceSession, SessionOptions - from onnxruntime_extensions import get_library_path - - register_plugins() - - network = network_from_onnx_path(onnx_module) - trt_engine = engine_from_network(network, config=CreateConfig()) - - if dtype != torch.float32: - pytest.skip("Comparator only supports float32") - - # Create runners for ONNX and TRT models - trt_runner = TrtRunner(trt_engine) - - options = SessionOptions() - options.register_custom_ops_library(get_library_path()) - onnx_runner = OnnxrtRunner(InferenceSession(onnx_module, sess_options=options)) - - results = Comparator.run([trt_runner, onnx_runner], data_loader=DataLoader()) - Comparator.compare_accuracy(results) - torch.cuda.synchronize() - torch.cuda.empty_cache() - - -def module_with_mode( - mode, - module, - inputs, - math_dtype, - tmp_path, - grad_modes=["eager", "compile", "jit", "export"], -): - if isinstance(inputs[0], list): - dtype = inputs[0][0].dtype - else: - dtype = inputs[0].dtype - if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo"]: - if not ONNX_AVAILABLE: - pytest.skip("ONNX not available!") - if dtype == torch.float64 or math_dtype == torch.float64: - pytest.skip("TRT/ORT do not support float64") - - with torch.set_grad_enabled(mode in grad_modes): - if mode == "compile": - import sys - - if sys.version_info.major == 3 and sys.version_info.minor >= 12: - pytest.skip("torch dynamo needs cpy <= 3.11") - module = torch.compile(module) - elif mode == "fx": - module = torch.fx.symbolic_trace(module) - elif mode == "jit": - module = torch.jit.trace(module, inputs) - fname = os.path.join(tmp_path, "test.ts") - torch.jit.save(module, fname) - module = torch.jit.load(fname) - elif mode == "export": - exp_program = torch.export.export(module, tuple(inputs)) 
- fname = os.path.join(tmp_path, "test.pt2") - torch.export.save(exp_program, fname) - del exp_program - module = torch.export.load(fname).module() - elif mode == "torch_trt": - if not TORCH_TRT_AVAILABLE: - pytest.skip("torch_tensorrt is not installed!") - register_plugins() - exp_program = torch.export.export(module, tuple(inputs)) - module = torch_tensorrt.dynamo.compile( - exp_program, - inputs=inputs, - require_full_compilation=True, - min_block_size=1, - enabled_precisions={torch.float32, dtype}, - # dryrun=True - ) - elif mode == "onnx" or mode == "trt": - try: - onnx_path = os.path.join(tmp_path, "test.onnx") - torch.onnx.export( - module, tuple(inputs), onnx_path, opset_version=17, verbose=False - ) - if mode == "trt": - verify_trt(module, onnx_path, inputs, dtype) - else: - verify_onnx(module, onnx_path, inputs, dtype) - except ImportError: - pytest.skip("ONNX/TRT is not available") - - elif mode == "onnx_dynamo": - try: - from cuequivariance_ops_torch.onnx import ( - cuequivariance_ops_torch_onnx_registry, - ) - - export_options = torch.onnx.ExportOptions( - onnx_registry=cuequivariance_ops_torch_onnx_registry - ) - onnx_program = torch.onnx.dynamo_export( - module, *inputs, export_options=export_options - ) - onnx_path = os.path.join(tmp_path, "test.onnx") - onnx_program.save(onnx_path) - verify_onnx(module, onnx_path, inputs, dtype) - except ImportError: - pytest.skip("ONNX is not available") - elif mode == "eager": - pass - else: - raise ValueError(f"No such mode: {mode}") - - torch.cuda.synchronize() - torch.cuda.empty_cache() - - return module - - -def create_random_tensor_2d(batch_size, stride, requires_grad, dtype, is_shared): - data = torch.randn( - (stride,) if is_shared else (batch_size, stride), - dtype=dtype, - device="cuda", - ).requires_grad_(requires_grad) - - return data - - -def maybe_detach_and_to(tensor, *args, **kwargs): - if tensor is not None: - return tensor.clone().detach().to(*args, **kwargs) - return None - - -def run_fwd_test(module, x: Sequence): - with torch.no_grad(): - out = module(*x) - test_output = [maybe_detach_and_to(out, dtype=torch.float32)] - return test_output - - -def run_fwd_bwd_test(module, x: Sequence): - out = module(*x) - - loss = out.sum() - loss.backward() - - test_output = [maybe_detach_and_to(out, dtype=torch.float32)] - test_output.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) - - return test_output - - -def run_bwd_bwd_test(module, x: Sequence): - test_outputs = [] - out = module(*x) - grads = torch.autograd.grad(out.pow(2).sum(), x, create_graph=True) - test_outputs.extend([maybe_detach_and_to(g, dtype=torch.float32) for g in grads]) - loss = sum([g.sum() for g in grads]) - loss.backward() - test_outputs.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) - return test_outputs - - -def assert_close_modules(m_test, m_ref, inputs_test, procedure, tol_dict): - outs_test = procedure(m_test, inputs_test) - - inputs_ref = [ - x.clone() - .detach() - .to(device="cuda", dtype=torch.float32) - .requires_grad_(x.requires_grad) - for x in inputs_test - ] - outs_ref = procedure(m_ref, inputs_ref) - for out_test, out_ref in zip(outs_test, outs_ref): - torch.testing.assert_close(out_test, out_ref, **tol_dict) - - -tol_dict = { - # we compare against double for precision reasons - # hence FP64 and FP32 threshold are the same - (torch.float64, torch.float64): {"atol": 1e-9, "rtol": 1e-5}, - (torch.float32, torch.float64): {"atol": 1e-4, "rtol": 1e-5}, - (torch.float64, torch.float32): {"atol": 1e-4, 
"rtol": 1e-5}, - (torch.float32, torch.float32): {"atol": 1e-4, "rtol": 1e-5}, - (torch.bfloat16, torch.float32): {"atol": 4.0, "rtol": 1e-2}, - (torch.float16, torch.float32): {"atol": 0.25, "rtol": 1e-2}, -} From 2bad3bc38d38cdcbcf799bb5e6c0eacefb145b64 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 11 Dec 2024 13:56:06 -0800 Subject: [PATCH 43/96] Adding utils.py Signed-off-by: Boris Fomitchev --- cuequivariance_torch/tests/utils.py | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 cuequivariance_torch/tests/utils.py diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py new file mode 100644 index 0000000..1de6266 --- /dev/null +++ b/cuequivariance_torch/tests/utils.py @@ -0,0 +1,267 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import os +import pytest +import torch +from typing import Sequence + +try: + import onnx # noqa: F401 + import onnxscript # noqa: F401 + import onnxruntime # noqa: F401 + import cuequivariance_ops_torch.onnx # noqa: F401 + from cuequivariance_ops_torch.tensorrt import register_plugins + + ONNX_AVAILABLE = True +except Exception: + ONNX_AVAILABLE = False + + +try: + import torch_tensorrt + + TORCH_TRT_AVAILABLE = True +except Exception: + TORCH_TRT_AVAILABLE = False + + +def verify_onnx(module, onnx_module, inputs, dtype): + if dtype != torch.float32: + pytest.skip("onnxrt only checked for float32") + from onnxruntime import SessionOptions + from onnxruntime_extensions import get_library_path + from torch.onnx.verification import ( + _compare_onnx_pytorch_model, + VerificationOptions, + ) + + original_init = SessionOptions.__init__ + + def new_init(self): + original_init(self) + try: + self.register_custom_ops_library(get_library_path()) + except Exception: + pass + + SessionOptions.__init__ = new_init + _compare_onnx_pytorch_model( + module, onnx_module, tuple(inputs), None, None, VerificationOptions() + ) + SessionOptions.__init__ = original_init + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +def verify_trt(module, onnx_module, inputs, dtype): + import tensorrt + from pkg_resources import parse_version + + if parse_version(tensorrt.__version__) < parse_version("10.3.0"): + pytest.skip("TRT < 10.3.0 is not supported!") + if dtype == torch.float64: + pytest.skip("TRT does not support float64") + + from polygraphy.backend.trt import ( + engine_from_network, + network_from_onnx_path, + TrtRunner, + CreateConfig, + ) + from polygraphy.backend.onnxrt import OnnxrtRunner + from polygraphy.comparator import Comparator, DataLoader + from onnxruntime import InferenceSession, SessionOptions + from onnxruntime_extensions import get_library_path + + register_plugins() + + network = network_from_onnx_path(onnx_module) + trt_engine = engine_from_network(network, config=CreateConfig()) + + if dtype != torch.float32: + pytest.skip("Comparator only supports float32") + + # Create runners for ONNX and TRT models + trt_runner = TrtRunner(trt_engine) + + options = SessionOptions() + 
options.register_custom_ops_library(get_library_path()) + onnx_runner = OnnxrtRunner(InferenceSession(onnx_module, sess_options=options)) + + results = Comparator.run([trt_runner, onnx_runner], data_loader=DataLoader()) + Comparator.compare_accuracy(results) + torch.cuda.synchronize() + torch.cuda.empty_cache() + + +def module_with_mode( + mode, + module, + inputs, + math_dtype, + tmp_path, + grad_modes=["eager", "compile", "jit", "export"], +): + if isinstance(inputs[0], list): + dtype = inputs[0][0].dtype + else: + dtype = inputs[0].dtype + if mode in ["trt", "torch_trt", "onnx", "onnx_dynamo"]: + if not ONNX_AVAILABLE: + pytest.skip("ONNX not available!") + if dtype == torch.float64 or math_dtype == torch.float64: + pytest.skip("TRT/ORT do not support float64") + + with torch.set_grad_enabled(mode in grad_modes): + if mode == "compile": + import sys + + if sys.version_info.major == 3 and sys.version_info.minor >= 12: + pytest.skip("torch dynamo needs cpy <= 3.11") + module = torch.compile(module) + elif mode == "fx": + module = torch.fx.symbolic_trace(module) + elif mode == "jit": + module = torch.jit.trace(module, inputs) + fname = os.path.join(tmp_path, "test.ts") + torch.jit.save(module, fname) + module = torch.jit.load(fname) + elif mode == "export": + exp_program = torch.export.export(module, tuple(inputs)) + fname = os.path.join(tmp_path, "test.pt2") + torch.export.save(exp_program, fname) + del exp_program + module = torch.export.load(fname).module() + elif mode == "torch_trt": + if not TORCH_TRT_AVAILABLE: + pytest.skip("torch_tensorrt is not installed!") + register_plugins() + exp_program = torch.export.export(module, tuple(inputs)) + module = torch_tensorrt.dynamo.compile( + exp_program, + inputs=inputs, + require_full_compilation=True, + min_block_size=1, + enabled_precisions={torch.float32, dtype}, + # dryrun=True + ) + elif mode == "onnx" or mode == "trt": + try: + onnx_path = os.path.join(tmp_path, "test.onnx") + torch.onnx.export( + module, tuple(inputs), onnx_path, opset_version=17, verbose=False + ) + if mode == "trt": + verify_trt(module, onnx_path, inputs, dtype) + else: + verify_onnx(module, onnx_path, inputs, dtype) + except ImportError: + pytest.skip("ONNX/TRT is not available") + + elif mode == "onnx_dynamo": + try: + from cuequivariance_ops_torch.onnx import ( + cuequivariance_ops_torch_onnx_registry, + ) + + export_options = torch.onnx.ExportOptions( + onnx_registry=cuequivariance_ops_torch_onnx_registry + ) + onnx_program = torch.onnx.dynamo_export( + module, *inputs, export_options=export_options + ) + onnx_path = os.path.join(tmp_path, "test.onnx") + onnx_program.save(onnx_path) + verify_onnx(module, onnx_path, inputs, dtype) + except ImportError: + pytest.skip("ONNX is not available") + elif mode == "eager": + pass + else: + raise ValueError(f"No such mode: {mode}") + + torch.cuda.synchronize() + torch.cuda.empty_cache() + + return module + + +def create_random_tensor_2d(batch_size, stride, requires_grad, dtype, is_shared): + data = torch.randn( + (stride,) if is_shared else (batch_size, stride), + dtype=dtype, + device="cuda", + ).requires_grad_(requires_grad) + + return data + + +def maybe_detach_and_to(tensor, *args, **kwargs): + if tensor is not None: + return tensor.clone().detach().to(*args, **kwargs) + return None + + +def run_fwd_test(module, x: Sequence): + with torch.no_grad(): + out = module(*x) + test_output = [maybe_detach_and_to(out, dtype=torch.float32)] + return test_output + + +def run_fwd_bwd_test(module, x: Sequence): + out = module(*x) + 
+ loss = out.sum() + loss.backward() + + test_output = [maybe_detach_and_to(out, dtype=torch.float32)] + test_output.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) + + return test_output + + +def run_bwd_bwd_test(module, x: Sequence): + test_outputs = [] + out = module(*x) + grads = torch.autograd.grad(out.pow(2).sum(), x, create_graph=True) + test_outputs.extend([maybe_detach_and_to(g, dtype=torch.float32) for g in grads]) + loss = sum([g.sum() for g in grads]) + loss.backward() + test_outputs.extend([maybe_detach_and_to(t.grad, dtype=torch.float32) for t in x]) + return test_outputs + + +def assert_close_modules(m_test, m_ref, inputs_test, procedure, tol_dict): + outs_test = procedure(m_test, inputs_test) + + inputs_ref = [ + x.clone() + .detach() + .to(device="cuda", dtype=torch.float32) + .requires_grad_(x.requires_grad) + for x in inputs_test + ] + outs_ref = procedure(m_ref, inputs_ref) + for out_test, out_ref in zip(outs_test, outs_ref): + torch.testing.assert_close(out_test, out_ref, **tol_dict) + + +tol_dict = { + # we compare against double for precision reasons + # hence FP64 and FP32 threshold are the same + (torch.float64, torch.float64): {"atol": 1e-9, "rtol": 1e-5}, + (torch.float32, torch.float64): {"atol": 1e-4, "rtol": 1e-5}, + (torch.float64, torch.float32): {"atol": 1e-4, "rtol": 1e-5}, + (torch.float32, torch.float32): {"atol": 1e-4, "rtol": 1e-5}, + (torch.bfloat16, torch.float32): {"atol": 4.0, "rtol": 1e-2}, + (torch.float16, torch.float32): {"atol": 0.25, "rtol": 1e-2}, +} From 7c82e20b87babf1bf3953f25d7fb3bb2a00220ff Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 11 Dec 2024 14:31:09 -0800 Subject: [PATCH 44/96] Style Signed-off-by: Boris Fomitchev --- .../cuequivariance/experimental/mace/symmetric_contractions.py | 2 +- .../cuequivariance/irreps_array/context_decorator.py | 2 +- cuequivariance/cuequivariance/irreps_array/irreps.py | 2 +- cuequivariance/cuequivariance/irreps_array/misc_ui.py | 2 +- .../cuequivariance/irreps_array/reduced_tensor_product.py | 2 +- cuequivariance/cuequivariance/misc/sympy_utils.py | 2 +- cuequivariance/cuequivariance/representation/irrep_so3.py | 2 +- cuequivariance/cuequivariance/representation/irrep_su2.py | 1 + .../cuequivariance/segmented_tensor_product/operand.py | 2 +- .../segmented_tensor_product/segmented_tensor_product.py | 2 +- cuequivariance/tests/context_test.py | 2 +- cuequivariance/tests/equivariant_tensor_products_test.py | 2 +- .../tests/primitives/equivariant_tensor_product_test.py | 3 +-- docs/conf.py | 2 -- 14 files changed, 13 insertions(+), 15 deletions(-) diff --git a/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py b/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py index ebfc5c7..d8c2398 100644 --- a/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py +++ b/cuequivariance/cuequivariance/experimental/mace/symmetric_contractions.py @@ -18,8 +18,8 @@ import numpy as np import cuequivariance as cue -from cuequivariance import descriptors import cuequivariance.segmented_tensor_product as stp +from cuequivariance import descriptors from cuequivariance.misc.linalg import round_to_sqrt_rational, triu_array diff --git a/cuequivariance/cuequivariance/irreps_array/context_decorator.py b/cuequivariance/cuequivariance/irreps_array/context_decorator.py index 3989905..4cbeef7 100644 --- a/cuequivariance/cuequivariance/irreps_array/context_decorator.py +++ b/cuequivariance/cuequivariance/irreps_array/context_decorator.py 
@@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import wraps -from typing import Optional, Union, Type +from typing import Optional, Type, Union import cuequivariance as cue import cuequivariance.irreps_array as irreps_array diff --git a/cuequivariance/cuequivariance/irreps_array/irreps.py b/cuequivariance/cuequivariance/irreps_array/irreps.py index ee26c79..1bb7792 100644 --- a/cuequivariance/cuequivariance/irreps_array/irreps.py +++ b/cuequivariance/cuequivariance/irreps_array/irreps.py @@ -16,7 +16,7 @@ import dataclasses import re -from typing import NamedTuple, Union, Type, Any, Sequence, Callable, Optional +from typing import Any, Callable, NamedTuple, Optional, Sequence, Type, Union import cuequivariance as cue diff --git a/cuequivariance/cuequivariance/irreps_array/misc_ui.py b/cuequivariance/cuequivariance/irreps_array/misc_ui.py index 4bf7297..d473eda 100644 --- a/cuequivariance/cuequivariance/irreps_array/misc_ui.py +++ b/cuequivariance/cuequivariance/irreps_array/misc_ui.py @@ -15,7 +15,7 @@ from __future__ import annotations import warnings -from typing import Generator, Optional, Union, Any +from typing import Any, Generator, Optional, Union import cuequivariance as cue diff --git a/cuequivariance/cuequivariance/irreps_array/reduced_tensor_product.py b/cuequivariance/cuequivariance/irreps_array/reduced_tensor_product.py index 9c24d82..4f48ba2 100644 --- a/cuequivariance/cuequivariance/irreps_array/reduced_tensor_product.py +++ b/cuequivariance/cuequivariance/irreps_array/reduced_tensor_product.py @@ -16,7 +16,7 @@ import itertools import logging from math import prod -from typing import FrozenSet, List, Optional, Sequence, Tuple, Iterator, Union +from typing import FrozenSet, Iterator, List, Optional, Sequence, Tuple, Union import numpy as np diff --git a/cuequivariance/cuequivariance/misc/sympy_utils.py b/cuequivariance/cuequivariance/misc/sympy_utils.py index 2596dba..c285d67 100644 --- a/cuequivariance/cuequivariance/misc/sympy_utils.py +++ b/cuequivariance/cuequivariance/misc/sympy_utils.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. from fractions import Fraction -import numpy as np +import numpy as np import sympy diff --git a/cuequivariance/cuequivariance/representation/irrep_so3.py b/cuequivariance/cuequivariance/representation/irrep_so3.py index 6345d40..5cd9cce 100644 --- a/cuequivariance/cuequivariance/representation/irrep_so3.py +++ b/cuequivariance/cuequivariance/representation/irrep_so3.py @@ -22,7 +22,7 @@ import numpy as np from cuequivariance.misc.linalg import round_to_sqrt_rational -from cuequivariance.representation import Irrep, SU2 +from cuequivariance.representation import SU2, Irrep # This function is copied from https://github.com/lie-nn/lie-nn/blob/70adebce44e3197ee17f780585c6570d836fc2fe/lie_nn/_src/irreps/so3_real.py diff --git a/cuequivariance/cuequivariance/representation/irrep_su2.py b/cuequivariance/cuequivariance/representation/irrep_su2.py index 9b3865e..08e3cc9 100644 --- a/cuequivariance/cuequivariance/representation/irrep_su2.py +++ b/cuequivariance/cuequivariance/representation/irrep_su2.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import annotations + import itertools import re from dataclasses import dataclass diff --git a/cuequivariance/cuequivariance/segmented_tensor_product/operand.py b/cuequivariance/cuequivariance/segmented_tensor_product/operand.py index 0511145..12770e1 100644 --- a/cuequivariance/cuequivariance/segmented_tensor_product/operand.py +++ b/cuequivariance/cuequivariance/segmented_tensor_product/operand.py @@ -16,7 +16,7 @@ import dataclasses import math -from typing import Optional, Union, Sequence +from typing import Optional, Sequence, Union from cuequivariance import segmented_tensor_product as stp diff --git a/cuequivariance/cuequivariance/segmented_tensor_product/segmented_tensor_product.py b/cuequivariance/cuequivariance/segmented_tensor_product/segmented_tensor_product.py index 3059d8f..3d468d3 100644 --- a/cuequivariance/cuequivariance/segmented_tensor_product/segmented_tensor_product.py +++ b/cuequivariance/cuequivariance/segmented_tensor_product/segmented_tensor_product.py @@ -25,7 +25,7 @@ import math import re import zlib -from typing import Any, Optional, Union, Callable, Sequence +from typing import Any, Callable, Optional, Sequence, Union import numpy as np import opt_einsum diff --git a/cuequivariance/tests/context_test.py b/cuequivariance/tests/context_test.py index 11ffe8f..5cc2d01 100644 --- a/cuequivariance/tests/context_test.py +++ b/cuequivariance/tests/context_test.py @@ -12,10 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import pytest import cuequivariance as cue -import numpy as np def test_rep_collection_context(): diff --git a/cuequivariance/tests/equivariant_tensor_products_test.py b/cuequivariance/tests/equivariant_tensor_products_test.py index ee9e192..b231844 100644 --- a/cuequivariance/tests/equivariant_tensor_products_test.py +++ b/cuequivariance/tests/equivariant_tensor_products_test.py @@ -16,8 +16,8 @@ import pytest import cuequivariance as cue -from cuequivariance import descriptors import cuequivariance.segmented_tensor_product as stp +from cuequivariance import descriptors def test_commutativity_squeeze_flatten(): diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index b38c0f7..e1a684c 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -12,13 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import timeit import pytest import torch import torch._dynamo - from tests.utils import ( module_with_mode, ) @@ -221,6 +219,7 @@ def test_script( export_modes = ["export", "onnx", "trt", "torch_trt", "jit"] + @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) @pytest.mark.parametrize("mode", export_modes) diff --git a/docs/conf.py b/docs/conf.py index b00ec7e..3dbe2ef 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,8 +21,6 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information import datetime -import nvidia_sphinx_theme - current_year = datetime.datetime.now().year From 68a84f8edcbf3e356bd0b8ab9fe0c54847c4ea0e Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 11 Dec 2024 14:45:10 -0800 Subject: [PATCH 45/96] import nvidia_sphinx_theme --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 3dbe2ef..94992cb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,6 +22,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information import datetime +import nvidia_sphinx_theme # noqa + current_year = datetime.datetime.now().year project = "cuEquivariance" From 5ca4edc95052342bf19fa6e3cd67964e31ad87ac Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 11 Dec 2024 14:57:50 -0800 Subject: [PATCH 46/96] spherical harmonics module --- .../cuequivariance_torch/__init__.py | 4 +- .../operations/spherical_harmonics.py | 85 ++++++++++--------- .../operations/spherical_harmonics_test.py | 8 +- docs/api/cuequivariance_torch.rst | 7 +- 4 files changed, 55 insertions(+), 49 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/__init__.py b/cuequivariance_torch/cuequivariance_torch/__init__.py index e1b524e..b045128 100644 --- a/cuequivariance_torch/cuequivariance_torch/__init__.py +++ b/cuequivariance_torch/cuequivariance_torch/__init__.py @@ -36,7 +36,7 @@ vector_to_euler_angles, Inversion, ) -from .operations.spherical_harmonics import spherical_harmonics +from .operations.spherical_harmonics import SphericalHarmonics from cuequivariance_torch import layers @@ -55,6 +55,6 @@ "Inversion", "encode_rotation_angle", "vector_to_euler_angles", - "spherical_harmonics", + "SphericalHarmonics", "layers", ] diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index da6be5d..6b9f094 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -15,52 +15,61 @@ from typing import Optional import torch +import torch.nn as nn import cuequivariance as cue import cuequivariance_torch as cuet from cuequivariance import descriptors -def spherical_harmonics( - ls: list[int], - vectors: torch.Tensor, - normalize: bool = True, - use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, -) -> torch.Tensor: - r"""Compute the spherical harmonics of the input vectors. +class SphericalHarmonics(nn.Module): + r"""Compute the spherical harmonics of the input vectors as a torch module.""" - Args: - ls (list of int): List of spherical harmonic degrees. - vectors (torch.Tensor): Input vectors of shape (..., 3). - normalize (bool, optional): Whether to normalize the input vectors. Defaults to True. 
- use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. - If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. - If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + def __init__( + self, + ls: list[int], + normalize: bool = True, + use_fallback: Optional[bool] = None, + optimize_fallback: Optional[bool] = None, + ): + """ + Args: + ls (list of int): List of spherical harmonic degrees. + normalize (bool, optional): Whether to normalize the input vectors. Defaults to True. + use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. + If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. + If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. + optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. + """ + super(SphericalHarmonics, self).__init__() + self.ls = ls if isinstance(ls, list) else [ls] + assert self.ls == sorted(set(self.ls)) + self.normalize = normalize + self.use_fallback = use_fallback + self.optimize_fallback = optimize_fallback - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. + self.m = cuet.EquivariantTensorProduct( + descriptors.spherical_harmonics(cue.SO3(1), self.ls), + layout=cue.ir_mul, + use_fallback=self.use_fallback, + optimize_fallback=self.optimize_fallback, + ) - Returns: - torch.Tensor: The spherical harmonics of the input vectors of shape (..., dim) - where dim is the sum of 2*l+1 for l in ls. - """ - if isinstance(ls, int): - ls = [ls] - assert ls == sorted(set(ls)) - assert vectors.shape[-1] == 3 + def forward(self, vectors: torch.Tensor) -> torch.Tensor: + """ + Args: + vectors (torch.Tensor): Input vectors of shape (..., 3). - if normalize: - vectors = torch.nn.functional.normalize(vectors, dim=-1) + Returns: + torch.Tensor: The spherical harmonics of the input vectors of shape (..., dim) + where dim is the sum of 2*l+1 for l in ls. 
+ """ + assert vectors.shape[-1] == 3 - x = vectors.reshape(-1, 3) - m = cuet.EquivariantTensorProduct( - descriptors.spherical_harmonics(cue.SO3(1), ls), - layout=cue.ir_mul, - device=x.device, - math_dtype=x.dtype, - use_fallback=use_fallback, - optimize_fallback=optimize_fallback, - ) - y = m([x]) - y = y.reshape(vectors.shape[:-1] + (y.shape[-1],)) - return y + if self.normalize: + vectors = torch.nn.functional.normalize(vectors, dim=-1) + + x = vectors.reshape(-1, 3) + y = self.m([x]) + y = y.reshape(vectors.shape[:-1] + (y.shape[-1],)) + return y diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 955ee87..27bad83 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -33,12 +33,13 @@ def test_spherical_harmonics(ell: int, dtype, tol): angle = np.random.rand() scale = 1.3 - yl = cuet.spherical_harmonics([ell], vec, False) + m = cuet.SphericalHarmonics([ell]) + yl = m(vec) R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).to(device) Rl = torch.from_numpy(cue.SO3(ell).rotation(axis, angle)).to(dtype).to(device) - yl1 = cuet.spherical_harmonics([ell], scale * R @ vec, False) + yl1 = m(scale * R @ vec) yl2 = scale**ell * Rl @ yl torch.testing.assert_close(yl1, yl2, rtol=tol, atol=tol) @@ -47,6 +48,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): def test_spherical_harmonics_full(): vec = torch.randn(3, device=device) ls = [0, 1, 2, 3] - yl = cuet.spherical_harmonics(ls, vec, False) + m = cuet.SphericalHarmonics(ls) + yl = m(ls, vec) assert abs(yl[0] - 1.0) < 1e-6 diff --git a/docs/api/cuequivariance_torch.rst b/docs/api/cuequivariance_torch.rst index ff33bfa..2785cf6 100644 --- a/docs/api/cuequivariance_torch.rst +++ b/docs/api/cuequivariance_torch.rst @@ -41,12 +41,7 @@ Special Cases of Tensor Products Linear SymmetricContraction TransposeIrrepsLayout - -.. 
autosummary:: - :toctree: generated/ - :template: function_template.rst - - spherical_harmonics + SphericalHarmonics Euclidean Operations -------------------- From 01914dd792c6bebbca476fc56ccba7e04ec97593 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 11 Dec 2024 14:59:37 -0800 Subject: [PATCH 47/96] fix tests --- .../tests/operations/spherical_harmonics_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 27bad83..2a24d4c 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -33,7 +33,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): angle = np.random.rand() scale = 1.3 - m = cuet.SphericalHarmonics([ell]) + m = cuet.SphericalHarmonics([ell], False) yl = m(vec) R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).to(device) @@ -49,6 +49,6 @@ def test_spherical_harmonics_full(): vec = torch.randn(3, device=device) ls = [0, 1, 2, 3] m = cuet.SphericalHarmonics(ls) - yl = m(ls, vec) + yl = m(vec) assert abs(yl[0] - 1.0) < 1e-6 From 50b75dc0504f3f8d1ce74bf0aaab26605d6cbbff Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 11 Dec 2024 15:50:36 -0800 Subject: [PATCH 48/96] test SymmetricContraction export --- .../operations/symmetric_contraction_test.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 80a4065..0776492 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -17,6 +17,9 @@ import numpy as np import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -108,3 +111,46 @@ def test_mace_compatibility(): output = n_sc(x, i) torch.testing.assert_close(output, expected_output, atol=1e-5, rtol=1e-5) + + +export_modes = ["export", "onnx", "trt", "torch_trt", "jit"] + + +@pytest.mark.parametrize( + "dtype, math_dtype, atol, rtol", + [ + (torch.float64, torch.float64, 1e-10, 1e-10), + (torch.float32, torch.float32, 1e-5, 1e-5), + ], +) +@pytest.mark.parametrize("mode", export_modes) +def test_export( + dtype: torch.dtype, + math_dtype: torch.dtype, + atol: float, + rtol: float, + mode: str, + tmp_path, +): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + m = cuet.SymmetricContraction( + cue.Irreps("O3", "0e + 1o + 2e"), + cue.Irreps("O3", "0e + 1o"), + 3, + 5, + layout_in=cue.ir_mul, + layout_out=cue.mul_ir, + dtype=dtype, + math_dtype=math_dtype, + device=device, + ) + + x = torch.randn((1024, 36), device=device, dtype=dtype) + i = torch.randint(0, 5, (1024,), dtype=torch.int32).to(device) + + res = m(x, i) + m_script = module_with_mode(mode, m, [x, i], math_dtype, tmp_path) + res_script = m_script(x, i) + torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) From bd16dbf100539499a9790b919c48f629949bf97d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 12 Dec 2024 14:29:23 -0800 Subject: [PATCH 49/96] Fixed symmetric_contraction test Signed-off-by: Boris Fomitchev --- .../operations/symmetric_contraction.py | 4 -- .../operations/spherical_harmonics_test.py | 6 +-- 
.../operations/symmetric_contraction_test.py | 42 +++++++++---------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index a91a72a..8a25b7c 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -180,10 +180,6 @@ def forward( Returns: torch.Tensor: The output tensor. It has shape (batch, irreps_out.dim). """ - torch._assert( - x.shape[-1] == self.irreps_in.dim, - f"Input tensor must have shape (..., {self.irreps_in.dim}), got {x.shape}", - ) if self.projection is not None: weight = torch.einsum("zau,ab->zbu", self.weight, self.projection) diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 2a24d4c..d09b33f 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -33,7 +33,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): angle = np.random.rand() scale = 1.3 - m = cuet.SphericalHarmonics([ell], False) + m = cuet.SphericalHarmonics([ell], False).to(device) yl = m(vec) R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).to(device) @@ -46,9 +46,9 @@ def test_spherical_harmonics(ell: int, dtype, tol): def test_spherical_harmonics_full(): - vec = torch.randn(3, device=device) + vec = torch.randn(3, device=device).to(device) ls = [0, 1, 2, 3] - m = cuet.SphericalHarmonics(ls) + m = cuet.SphericalHarmonics(ls).to(device) yl = m(vec) assert abs(yl[0] - 1.0) < 1e-6 diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 0776492..b499f56 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -123,34 +123,34 @@ def test_mace_compatibility(): (torch.float32, torch.float32, 1e-5, 1e-5), ], ) +@pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) +@pytest.mark.parametrize("original_mace", [True, False]) +@pytest.mark.parametrize("batch", [1, 32]) @pytest.mark.parametrize("mode", export_modes) -def test_export( - dtype: torch.dtype, - math_dtype: torch.dtype, - atol: float, - rtol: float, - mode: str, - tmp_path, -): - if not torch.cuda.is_available(): - pytest.skip("CUDA is not available") +def test_export(dtype, math_dtype, atol, rtol, layout, original_mace, batch, mode, tmp_path): + mul = 64 + irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") + irreps_out = mul * cue.Irreps("O3", "0e + 1o") m = cuet.SymmetricContraction( - cue.Irreps("O3", "0e + 1o + 2e"), - cue.Irreps("O3", "0e + 1o"), + irreps_in, + irreps_out, 3, 5, - layout_in=cue.ir_mul, - layout_out=cue.mul_ir, + layout_in=layout, + layout_out=layout, dtype=dtype, - math_dtype=math_dtype, + math_dtype=dtype, device=device, + original_mace=original_mace, ) - x = torch.randn((1024, 36), device=device, dtype=dtype) - i = torch.randint(0, 5, (1024,), dtype=torch.int32).to(device) + x = torch.randn((batch, irreps_in.dim), dtype=dtype).to(device) + indices = torch.randint(0, 5, (batch,), dtype=torch.int32).to(device) - res = m(x, i) - m_script = module_with_mode(mode, m, [x, i], math_dtype, tmp_path) - res_script = m_script(x, i) - 
torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) + out = m(x, indices) + assert out.shape == (batch, irreps_out.dim) + + m_script = module_with_mode(mode, m, [x, indices], dtype, tmp_path) + out_script = m_script(x, indices) + torch.testing.assert_close(out, out_script, atol=atol, rtol=rtol) From 245e59443ba963f0fba3a9547fef17c63bc38ef8 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 21:48:16 -0800 Subject: [PATCH 50/96] add device info --- .../operations/spherical_harmonics.py | 12 +++++++----- .../tests/operations/spherical_harmonics_test.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index 6b9f094..0cbad1a 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -29,6 +29,8 @@ def __init__( self, ls: list[int], normalize: bool = True, + device: Optional[torch.device] = None, + math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): @@ -41,18 +43,18 @@ def __init__( If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. """ - super(SphericalHarmonics, self).__init__() + super().__init__() self.ls = ls if isinstance(ls, list) else [ls] assert self.ls == sorted(set(self.ls)) self.normalize = normalize - self.use_fallback = use_fallback - self.optimize_fallback = optimize_fallback self.m = cuet.EquivariantTensorProduct( descriptors.spherical_harmonics(cue.SO3(1), self.ls), layout=cue.ir_mul, - use_fallback=self.use_fallback, - optimize_fallback=self.optimize_fallback, + device=device, + math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=optimize_fallback, ) def forward(self, vectors: torch.Tensor) -> torch.Tensor: diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 2a24d4c..01b9acb 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -33,7 +33,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): angle = np.random.rand() scale = 1.3 - m = cuet.SphericalHarmonics([ell], False) + m = cuet.SphericalHarmonics([ell], False, device=device) yl = m(vec) R = torch.from_numpy(cue.SO3(1).rotation(axis, angle)).to(dtype).to(device) From 6f0e1b512dc93fff776382ad9f04dce101cd1861 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 21:55:50 -0800 Subject: [PATCH 51/96] fix sh --- .../operations/spherical_harmonics.py | 12 +++++++----- .../tests/operations/spherical_harmonics_test.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index 6b9f094..0cbad1a 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -29,6 +29,8 @@ def __init__( self, ls: list[int], normalize: bool = True, + device: Optional[torch.device] = None, + math_dtype: Optional[torch.dtype] = None, use_fallback: 
Optional[bool] = None, optimize_fallback: Optional[bool] = None, ): @@ -41,18 +43,18 @@ def __init__( If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. """ - super(SphericalHarmonics, self).__init__() + super().__init__() self.ls = ls if isinstance(ls, list) else [ls] assert self.ls == sorted(set(self.ls)) self.normalize = normalize - self.use_fallback = use_fallback - self.optimize_fallback = optimize_fallback self.m = cuet.EquivariantTensorProduct( descriptors.spherical_harmonics(cue.SO3(1), self.ls), layout=cue.ir_mul, - use_fallback=self.use_fallback, - optimize_fallback=self.optimize_fallback, + device=device, + math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=optimize_fallback, ) def forward(self, vectors: torch.Tensor) -> torch.Tensor: diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index d09b33f..87e0997 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -48,7 +48,7 @@ def test_spherical_harmonics(ell: int, dtype, tol): def test_spherical_harmonics_full(): vec = torch.randn(3, device=device).to(device) ls = [0, 1, 2, 3] - m = cuet.SphericalHarmonics(ls).to(device) + m = cuet.SphericalHarmonics(ls, device=device) yl = m(vec) assert abs(yl[0] - 1.0) < 1e-6 From e296279a778a43b145ebd435bd00b45699511ad2 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 22:00:18 -0800 Subject: [PATCH 52/96] fix --- .../primitives/equivariant_tensor_product.py | 5 ++++- .../tests/operations/symmetric_contraction_test.py | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 16701ff..794435b 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -245,7 +245,10 @@ def forward( # assert len(inputs) == len(self.etp.inputs) for a, dim in zip(inputs, self.operands_dims): - assert a.shape[-1] == dim + torch._assert( + a.shape[-1] == dim, + f"Expected last dimension of input to be {dim}, got {a.shape[-1]}", + ) # Transpose inputs inputs = self.transpose_in(inputs) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index b499f56..e9135ef 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -127,7 +127,9 @@ def test_mace_compatibility(): @pytest.mark.parametrize("original_mace", [True, False]) @pytest.mark.parametrize("batch", [1, 32]) @pytest.mark.parametrize("mode", export_modes) -def test_export(dtype, math_dtype, atol, rtol, layout, original_mace, batch, mode, tmp_path): +def test_export( + dtype, math_dtype, atol, rtol, layout, original_mace, batch, mode, tmp_path +): mul = 64 irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") irreps_out = mul * cue.Irreps("O3", "0e + 1o") @@ -140,7 +142,7 @@ def test_export(dtype, math_dtype, atol, rtol, layout, original_mace, batch, mod layout_in=layout, layout_out=layout, dtype=dtype, - 
math_dtype=dtype, + math_dtype=math_dtype, device=device, original_mace=original_mace, ) @@ -150,7 +152,7 @@ def test_export(dtype, math_dtype, atol, rtol, layout, original_mace, batch, mod out = m(x, indices) assert out.shape == (batch, irreps_out.dim) - + m_script = module_with_mode(mode, m, [x, indices], dtype, tmp_path) out_script = m_script(x, indices) torch.testing.assert_close(out, out_script, atol=atol, rtol=rtol) From dc1f39453a361711d20d68ee1cfbd4c7d2fc799a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 22:09:04 -0800 Subject: [PATCH 53/96] skip --- .../tests/operations/symmetric_contraction_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index e9135ef..69ff71a 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -130,6 +130,9 @@ def test_mace_compatibility(): def test_export( dtype, math_dtype, atol, rtol, layout, original_mace, batch, mode, tmp_path ): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + mul = 64 irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") irreps_out = mul * cue.Irreps("O3", "0e + 1o") From ad91f3851a67bd9e97222f88eece7760f7e68f45 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 22:17:13 -0800 Subject: [PATCH 54/96] torch._dynamo.config.cache_size_limit = 100 --- cuequivariance_torch/tests/utils.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index 1de6266..e5e4d17 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -9,15 +9,19 @@ # its affiliates is strictly prohibited. 
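# Illustrative use of the helper defined in this file, as driven from the export tests in this
# series (the mode string selects e.g. "script", "jit", "export", "onnx" or "trt"; tensors are
# assumed to live on the GPU for the kernel-backed modes):
#
#     m_script = module_with_mode("script", m, [x, indices], torch.float32, tmp_path)
#     torch.testing.assert_close(m(x, indices), m_script(x, indices))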
import os +from typing import Sequence + import pytest import torch -from typing import Sequence +import torch._dynamo + +torch._dynamo.config.cache_size_limit = 100 try: + import cuequivariance_ops_torch.onnx # noqa: F401 import onnx # noqa: F401 - import onnxscript # noqa: F401 import onnxruntime # noqa: F401 - import cuequivariance_ops_torch.onnx # noqa: F401 + import onnxscript # noqa: F401 from cuequivariance_ops_torch.tensorrt import register_plugins ONNX_AVAILABLE = True @@ -39,8 +43,8 @@ def verify_onnx(module, onnx_module, inputs, dtype): from onnxruntime import SessionOptions from onnxruntime_extensions import get_library_path from torch.onnx.verification import ( - _compare_onnx_pytorch_model, VerificationOptions, + _compare_onnx_pytorch_model, ) original_init = SessionOptions.__init__ @@ -70,16 +74,16 @@ def verify_trt(module, onnx_module, inputs, dtype): if dtype == torch.float64: pytest.skip("TRT does not support float64") + from onnxruntime import InferenceSession, SessionOptions + from onnxruntime_extensions import get_library_path + from polygraphy.backend.onnxrt import OnnxrtRunner from polygraphy.backend.trt import ( + CreateConfig, + TrtRunner, engine_from_network, network_from_onnx_path, - TrtRunner, - CreateConfig, ) - from polygraphy.backend.onnxrt import OnnxrtRunner from polygraphy.comparator import Comparator, DataLoader - from onnxruntime import InferenceSession, SessionOptions - from onnxruntime_extensions import get_library_path register_plugins() From 9d224b0987dea1ba8f396fbaa54bbb3c6950a3e7 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 12 Dec 2024 22:48:37 -0800 Subject: [PATCH 55/96] fix test --- .../tests/operations/spherical_harmonics_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index 87e0997..afb1703 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -24,7 +24,7 @@ @pytest.mark.parametrize( "dtype, tol", - [(torch.float64, 1e-6), (torch.float32, 1e-4)], + [(torch.float64, 1e-5), (torch.float32, 1e-4)], ) @pytest.mark.parametrize("ell", [1, 2, 3]) def test_spherical_harmonics(ell: int, dtype, tol): From 6c03f29d5461c62137ce330f8186a0f17eb52be1 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 15 Dec 2024 20:35:56 -0800 Subject: [PATCH 56/96] Script compatibility for fallback Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/operations/linear.py | 14 +++-- .../primitives/symmetric_tensor_product.py | 5 +- .../primitives/tensor_product.py | 62 +++++++++++-------- .../tests/operations/linear_test.py | 2 +- .../equivariant_tensor_product_test.py | 40 ++++-------- .../symmetric_tensor_product_test.py | 4 +- .../tests/primitives/tensor_product_test.py | 24 +++++-- cuequivariance_torch/tests/utils.py | 5 ++ 8 files changed, 84 insertions(+), 72 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index 41c7e89..55789e5 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -121,10 +121,12 @@ def forward( raise ValueError("Internal weights are used, weight should be None") weight = self.weight - - if self.shared_weights and weight.ndim != 1: - raise ValueError("Shared weights should be 
1D tensor") - if not self.shared_weights and weight.ndim != 2: - raise ValueError("Weights should be 2D tensor") - + + if weight is not None: + if self.shared_weights and weight.ndim != 1: + raise ValueError("Shared weights should be 1D tensor") + if not self.shared_weights and weight.ndim != 2: + raise ValueError("Weights should be 2D tensor") + else: + raise ValueError("Weights should not be None") return self.f([weight, x]) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index dc61230..8efde5a 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -361,6 +361,5 @@ def __init__( def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: - return sum( - f([x0[i0]] + [x1] * (f.descriptor.num_operands - 2)) for f in self.fs - ) + fs: List[torch.Tensor] = [f([x0[i0]] + [x1] * (f.num_operands - 2)) for f in self.fs] + return torch.sum(torch.stack(fs), dim=0) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index b436161..153dc42 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) - +@torch.jit.script def prod(numbers: List[int]): product = 1 for num in numbers: @@ -206,26 +206,36 @@ def _tensor_product_fx( outputs = [] for path_idx, path in enumerate(descriptor.paths): - segments = [ - inputs[oid][..., slices[oid][path.indices[oid]]] - .reshape( - inputs[oid].shape[:-1] + descriptor.get_segment_shape(oid, path) + segments = [] + for oid in range(num_inputs): + seg_shape = descriptor.get_segment_shape(oid, path) + inp = inputs[oid][..., slices[oid][path.indices[oid]]] + if len(seg_shape) > 0: + inp = inp.reshape( + inputs[oid].shape[:-1] + seg_shape + ) + else: + inp = inp.reshape( + inputs[oid].shape[:-1] + ) + + segments.append( + inp.to(dtype=math_dtype) ) - .to(dtype=math_dtype) - for oid in range(num_inputs) - ] + + int_dtype = { + 2: torch.int16, + 4: torch.int32, + 8: torch.int64, + }[math_dtype.itemsize] + constants[f"c{path_idx}"] = torch.tensor( path.coefficients, dtype=math_dtype, device=device - ).view( - { - 2: torch.int16, - 4: torch.int32, - 8: torch.int64, - }[math_dtype.itemsize] - ) + ) # .view(dtype=int_dtype) + c = ( torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) - .view(math_dtype) + # .view(dtype=math_dtype) .clone() ) out = torch.einsum(formula, c, *segments) @@ -235,9 +245,17 @@ def _tensor_product_fx( outputs += [ out.reshape(out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),)) ] - + if len(outputs) == 0: raise NotImplementedError("No FX implementation for empty paths") + + def _sum(tensors, *, shape=None, like=None): + if len(tensors) == 0: + return like.new_zeros(shape) + out = tensors[0] + for t in tensors[1:]: + out = torch.add(out, t) + return out batch_shape = outputs[0].shape[:-1] output = torch.cat( @@ -407,15 +425,6 @@ def forward(self, args: List[torch.Tensor]): return out.reshape(shape + (out.shape[-1],)) -def _sum(tensors, *, shape=None, like=None): - if len(tensors) == 0: - return like.new_zeros(shape) - out = tensors[0] - for t in tensors[1:]: - out += t - return out - - def 
_tensor_product_cuda( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], @@ -494,6 +503,7 @@ def _tensor_product_cuda( return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) +@torch.jit.script def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: # Make x have shape (Z, x.shape[-1]) or (x.shape[-1],) if prod(leading_shape) > 1 and prod(x.shape[:-1]) == 1: diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index 2b78ff1..c288f80 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -64,7 +64,7 @@ def test_linear_fwd( use_fallback=True, ) x = torch.randn(10, irreps_in.dim, dtype=torch.float64).cuda() - + if shared_weights: y = linear(x) y_fx = linear_fx(x) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index e1a684c..8ee4b7d 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -192,36 +192,12 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -@pytest.mark.parametrize("e", make_descriptors()) -@pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) -def test_script( - e: cue.EquivariantTensorProduct, - dtype: torch.dtype, - math_dtype: torch.dtype, - atol: float, - rtol: float, -): - if not torch.cuda.is_available(): - pytest.skip("CUDA is not available") - - m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device=device, math_dtype=math_dtype - ) - inputs = [ - torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) - for inp in e.inputs - ] - res = m(inputs) - m_script = torch.jit.script(m) - res_script = m_script(inputs) - torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) - - -export_modes = ["export", "onnx", "trt", "torch_trt", "jit"] +export_modes = ["script" ] # , "export", "onnx", "trt" ] # , "torch_trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) +@pytest.mark.parametrize("use_fallback", [True, False]) @pytest.mark.parametrize("mode", export_modes) def test_export( e: cue.EquivariantTensorProduct, @@ -230,19 +206,27 @@ def test_export( atol: float, rtol: float, mode: str, + use_fallback: bool, tmp_path, ): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") + if use_fallback is True and not mode in ["eager", "script"]: + pytest.skip("Only eager, script and export modes are supported for the fallback!") + m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=False, device=device + e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=use_fallback, device=device ) + exp_inputs = [ + torch.randn((512, inp.irreps.dim), device=device, dtype=dtype) + for inp in e.inputs + ] inputs = [ torch.randn((1024, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs ] res = m(inputs) - m_script = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) + m_script = module_with_mode(mode, m, [exp_inputs], math_dtype, tmp_path) res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py 
b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 7858576..991a2fa 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -74,8 +74,8 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( m = cuet.IWeightedSymmetricTensorProduct( ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback ) - if use_fallback is False: - m = torch.jit.script(m) + + m = torch.jit.script(m) x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) i0 = torch.tensor([0, 1, 0], dtype=torch.int32, device=device) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index d8c26ef..72cb277 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -16,6 +16,9 @@ import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance.segmented_tensor_product as stp @@ -88,19 +91,26 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] +export_modes = ["script", "eager"] # , "export", "onnx", "trt", "torch_trt", "jit"] @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) -@pytest.mark.parametrize("use_fallback", [False, True]) +@pytest.mark.parametrize("use_fallback", [True, False]) +@pytest.mark.parametrize("mode", export_modes) + def test_primitive_tensor_product_cuda_vs_fx( d: stp.SegmentedTensorProduct, dtype: torch.dtype, math_dtype: torch.dtype, tol: float, use_fallback: bool, + mode: str, + tmp_path: str ): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") + if use_fallback is True and not mode in ["eager", "script", "export"]: + pytest.skip("Only eager, script and export modes are supported for the fallback!") for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): inputs = [ @@ -114,11 +124,12 @@ def test_primitive_tensor_product_cuda_vs_fx( ] m = cuet.TensorProduct( - d, device=device, math_dtype=math_dtype, use_fallback=use_fallback + d, device=device, math_dtype=math_dtype, + use_fallback=use_fallback, + optimize_fallback=True, ) - if not use_fallback: - m = torch.jit.script(m) - + m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) + out1 = m(inputs) m = cuet.TensorProduct( @@ -128,7 +139,8 @@ def test_primitive_tensor_product_cuda_vs_fx( use_fallback=True, optimize_fallback=False, ) - inputs_ = [inp.clone().to(torch.float64) for inp in inputs] + + inputs_ = [inp.to(torch.float64) for inp in inputs] out2 = m(inputs_) assert out1.shape[:-1] == torch.broadcast_shapes(*batches) diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index e5e4d17..b90207a 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -133,6 +133,11 @@ def module_with_mode( module = torch.compile(module) elif mode == "fx": module = torch.fx.symbolic_trace(module) + elif mode == "script": + module = torch.jit.script(module) + # fname = os.path.join(tmp_path, "test.ts") + # torch.jit.save(module, fname) + # module = torch.jit.load(fname) elif mode == "jit": module = torch.jit.trace(module, inputs) fname = os.path.join(tmp_path, "test.ts") From 21da7b0b0eb9d5f39c84f4256756d27113b6242c Mon Sep 17 00:00:00 2001 From: 
Boris Fomitchev Date: Sun, 15 Dec 2024 21:24:53 -0800 Subject: [PATCH 57/96] style Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/operations/linear.py | 2 +- .../primitives/symmetric_tensor_product.py | 7 ++-- .../primitives/tensor_product.py | 32 ++++++++----------- .../tests/operations/linear_test.py | 2 +- .../equivariant_tensor_product_test.py | 14 +++++--- .../symmetric_tensor_product_test.py | 2 +- .../tests/primitives/tensor_product_test.py | 18 +++++++---- cuequivariance_torch/tests/utils.py | 6 ++-- 8 files changed, 46 insertions(+), 37 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index 55789e5..d018767 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -121,7 +121,7 @@ def forward( raise ValueError("Internal weights are used, weight should be None") weight = self.weight - + if weight is not None: if self.shared_weights and weight.ndim != 1: raise ValueError("Shared weights should be 1D tensor") diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index 8efde5a..a59e09e 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import Optional +from typing import List, Optional import torch import torch.fx @@ -177,6 +177,7 @@ def __init__( optimize_fallback=optimize_fallback, ) + @torch.jit.ignore def __repr__(self): has_cuda_kernel = ( "(with CUDA kernel)" @@ -361,5 +362,7 @@ def __init__( def forward( self, x0: torch.Tensor, i0: torch.Tensor, x1: torch.Tensor ) -> torch.Tensor: - fs: List[torch.Tensor] = [f([x0[i0]] + [x1] * (f.num_operands - 2)) for f in self.fs] + fs: List[torch.Tensor] = [ + f([x0[i0]] + [x1] * (f.num_operands - 2)) for f in self.fs + ] return torch.sum(torch.stack(fs), dim=0) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 153dc42..4e5034b 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) + @torch.jit.script def prod(numbers: List[int]): product = 1 @@ -138,6 +139,7 @@ def __init__( ) self._optimize_fallback = optimize_fallback + @torch.jit.ignore def __repr__(self): has_cuda_kernel = ( "(with CUDA kernel)" if self.has_cuda else "(without CUDA kernel)" @@ -211,28 +213,22 @@ def _tensor_product_fx( seg_shape = descriptor.get_segment_shape(oid, path) inp = inputs[oid][..., slices[oid][path.indices[oid]]] if len(seg_shape) > 0: - inp = inp.reshape( - inputs[oid].shape[:-1] + seg_shape - ) + inp = inp.reshape(inputs[oid].shape[:-1] + seg_shape) else: - inp = inp.reshape( - inputs[oid].shape[:-1] - ) + inp = inp.reshape(inputs[oid].shape[:-1]) - segments.append( - inp.to(dtype=math_dtype) - ) + segments.append(inp.to(dtype=math_dtype)) - int_dtype = { - 2: torch.int16, - 4: torch.int32, - 8: torch.int64, - }[math_dtype.itemsize] + # int_dtype = { + # 2: torch.int16, + # 4: torch.int32, 
+ # 8: torch.int64, + # }[math_dtype.itemsize] constants[f"c{path_idx}"] = torch.tensor( path.coefficients, dtype=math_dtype, device=device - ) # .view(dtype=int_dtype) - + ) # .view(dtype=int_dtype) + c = ( torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) # .view(dtype=math_dtype) @@ -245,10 +241,10 @@ def _tensor_product_fx( outputs += [ out.reshape(out.shape[: out.ndim - len(seg_shape)] + (prod(seg_shape),)) ] - + if len(outputs) == 0: raise NotImplementedError("No FX implementation for empty paths") - + def _sum(tensors, *, shape=None, like=None): if len(tensors) == 0: return like.new_zeros(shape) diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index c288f80..2b78ff1 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -64,7 +64,7 @@ def test_linear_fwd( use_fallback=True, ) x = torch.randn(10, irreps_in.dim, dtype=torch.float64).cuda() - + if shared_weights: y = linear(x) y_fx = linear_fx(x) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 8ee4b7d..bab5fb3 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -192,7 +192,7 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -export_modes = ["script" ] # , "export", "onnx", "trt" ] # , "torch_trt", "jit"] +export_modes = ["script"] # , "export", "onnx", "trt" ] # , "torch_trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) @@ -212,11 +212,17 @@ def test_export( if not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and not mode in ["eager", "script"]: - pytest.skip("Only eager, script and export modes are supported for the fallback!") + if use_fallback is True and mode not in ["eager", "script"]: + pytest.skip( + "Only eager, script and export modes are supported for the fallback!" 
+ ) m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, math_dtype=math_dtype, use_fallback=use_fallback, device=device + e, + layout=cue.mul_ir, + math_dtype=math_dtype, + use_fallback=use_fallback, + device=device, ) exp_inputs = [ torch.randn((512, inp.irreps.dim), device=device, dtype=dtype) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 991a2fa..4807b86 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -74,7 +74,7 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( m = cuet.IWeightedSymmetricTensorProduct( ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback ) - + m = torch.jit.script(m) x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 72cb277..c2f8f4e 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -91,13 +91,13 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] -export_modes = ["script", "eager"] # , "export", "onnx", "trt", "torch_trt", "jit"] +export_modes = ["script", "eager"] # , "export", "onnx", "trt", "torch_trt", "jit"] + @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) @pytest.mark.parametrize("use_fallback", [True, False]) @pytest.mark.parametrize("mode", export_modes) - def test_primitive_tensor_product_cuda_vs_fx( d: stp.SegmentedTensorProduct, dtype: torch.dtype, @@ -105,12 +105,14 @@ def test_primitive_tensor_product_cuda_vs_fx( tol: float, use_fallback: bool, mode: str, - tmp_path: str + tmp_path: str, ): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and not mode in ["eager", "script", "export"]: - pytest.skip("Only eager, script and export modes are supported for the fallback!") + if use_fallback is True and mode not in ["eager", "script", "export"]: + pytest.skip( + "Only eager, script and export modes are supported for the fallback!" 
+ ) for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): inputs = [ @@ -124,12 +126,14 @@ def test_primitive_tensor_product_cuda_vs_fx( ] m = cuet.TensorProduct( - d, device=device, math_dtype=math_dtype, + d, + device=device, + math_dtype=math_dtype, use_fallback=use_fallback, optimize_fallback=True, ) m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) - + out1 = m(inputs) m = cuet.TensorProduct( diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index b90207a..3244a84 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -135,9 +135,9 @@ def module_with_mode( module = torch.fx.symbolic_trace(module) elif mode == "script": module = torch.jit.script(module) - # fname = os.path.join(tmp_path, "test.ts") - # torch.jit.save(module, fname) - # module = torch.jit.load(fname) + fname = os.path.join(tmp_path, "test.ts") + torch.jit.save(module, fname) + module = torch.jit.load(fname) elif mode == "jit": module = torch.jit.trace(module, inputs) fname = os.path.join(tmp_path, "test.ts") From d8a433645e4a088a386cecb203227daf955c609a Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 01:32:08 -0800 Subject: [PATCH 58/96] Trying to make trace() work Signed-off-by: Boris Fomitchev --- .../primitives/symmetric_tensor_product.py | 3 ++- .../primitives/tensor_product.py | 22 ++++++++++++++----- .../equivariant_tensor_product_test.py | 3 ++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index a59e09e..ea373b0 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -217,7 +217,8 @@ def forward( f"Expected 2 dims (i0.max() + 1, x0_size), got shape {x0.shape}", ) shape = broadcast_shapes([i0.shape, x1.shape[:-1]]) - i0 = i0.expand(shape).reshape((prod(shape),)) + i0 = i0.expand(shape) + i0 = i0.reshape((prod(shape),)) x1 = x1.expand(shape + (x1.shape[-1],)).reshape((prod(shape), x1.shape[-1])) out = self.f(x0, i0, x1) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 4e5034b..25459e6 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -25,15 +25,23 @@ logger = logging.getLogger(__name__) -@torch.jit.script def prod(numbers: List[int]): - product = 1 - for num in numbers: - product *= num - return product + """ + This method is a workaround for script() not recognizing math.prod() + """ + if torch.jit.is_scripting(): + product = 1 + for num in numbers: + product *= num + return product + else: + return math.prod(numbers) def broadcast_shapes(shapes: List[List[int]]): + """ + This method is a workaround for script() not recognizing torch.broadcast_shapes() + """ if torch.jit.is_scripting(): max_len = 0 for shape in shapes: @@ -543,6 +551,7 @@ def __init__( math_dtype=math_dtype, ).to(device=device) + @torch.jit.ignore def __repr__(self) -> str: return f"FusedTensorProductOp3({self.descriptor} (output last operand))" @@ -598,6 +607,7 @@ def __init__( math_dtype=math_dtype, ).to(device=device) + @torch.jit.ignore def __repr__(self) -> str: return 
f"FusedTensorProductOp4({self.descriptor} (output last operand))" @@ -650,6 +660,7 @@ def __init__( class TensorProductUniform3x1d(TensorProductUniform1d): + @torch.jit.ignore def __repr__(self): return f"TensorProductUniform3x1d({self.descriptor} (output last operand))" @@ -678,6 +689,7 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: class TensorProductUniform4x1d(TensorProductUniform1d): + @torch.jit.ignore def __repr__(self): return f"TensorProductUniform4x1d({self.descriptor} (output last operand))" diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index bab5fb3..ad827ca 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -192,7 +192,8 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -export_modes = ["script"] # , "export", "onnx", "trt" ] # , "torch_trt", "jit"] +# export_modes = ["script", "export", "onnx", "trt" ] # , "torch_trt", "jit"] +export_modes = ["script"] # , "torch_trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) From 6e518f6e4a55ef5dd5c9a0fa85a92582b7551726 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 02:20:59 -0800 Subject: [PATCH 59/96] Restoring integer cast Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 25459e6..7b3520c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -227,19 +227,19 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - # int_dtype = { - # 2: torch.int16, - # 4: torch.int32, - # 8: torch.int64, - # }[math_dtype.itemsize] + int_dtype = { + 2: torch.int16, + 4: torch.int32, + 8: torch.int64, + }[math_dtype.itemsize] constants[f"c{path_idx}"] = torch.tensor( path.coefficients, dtype=math_dtype, device=device - ) # .view(dtype=int_dtype) + ).view(dtype=int_dtype) c = ( torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) - # .view(dtype=math_dtype) + .view(dtype=math_dtype) .clone() ) out = torch.einsum(formula, c, *segments) From 9410be649fc28357f5d1a5c200a35c36d716febb Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 02:30:59 -0800 Subject: [PATCH 60/96] Skipping failing tests Signed-off-by: Boris Fomitchev --- .../tests/primitives/equivariant_tensor_product_test.py | 2 +- .../tests/primitives/tensor_product_test.py | 4 ++-- cuequivariance_torch/tests/utils.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index ad827ca..9044ee3 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -198,7 +198,7 @@ def test_compile( @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) -@pytest.mark.parametrize("use_fallback", [True, False]) +@pytest.mark.parametrize("use_fallback", [False]) 
@pytest.mark.parametrize("mode", export_modes) def test_export( e: cue.EquivariantTensorProduct, diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index c2f8f4e..dabe147 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -109,9 +109,9 @@ def test_primitive_tensor_product_cuda_vs_fx( ): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and mode not in ["eager", "script", "export"]: + if use_fallback is True and mode not in ["eager"]: pytest.skip( - "Only eager, script and export modes are supported for the fallback!" + "Only eager mode is supported for the fallback!" ) for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index 3244a84..b90207a 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -135,9 +135,9 @@ def module_with_mode( module = torch.fx.symbolic_trace(module) elif mode == "script": module = torch.jit.script(module) - fname = os.path.join(tmp_path, "test.ts") - torch.jit.save(module, fname) - module = torch.jit.load(fname) + # fname = os.path.join(tmp_path, "test.ts") + # torch.jit.save(module, fname) + # module = torch.jit.load(fname) elif mode == "jit": module = torch.jit.trace(module, inputs) fname = os.path.join(tmp_path, "test.ts") From d4a0842db65fb1af9541fd391cf342339d362167 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 02:52:31 -0800 Subject: [PATCH 61/96] disabling cast for fallback Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 14 +++++++------- .../primitives/symmetric_tensor_product_test.py | 2 ++ .../tests/primitives/tensor_product_test.py | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 7b3520c..25459e6 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -227,19 +227,19 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - int_dtype = { - 2: torch.int16, - 4: torch.int32, - 8: torch.int64, - }[math_dtype.itemsize] + # int_dtype = { + # 2: torch.int16, + # 4: torch.int32, + # 8: torch.int64, + # }[math_dtype.itemsize] constants[f"c{path_idx}"] = torch.tensor( path.coefficients, dtype=math_dtype, device=device - ).view(dtype=int_dtype) + ) # .view(dtype=int_dtype) c = ( torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) - .view(dtype=math_dtype) + # .view(dtype=math_dtype) .clone() ) out = torch.einsum(formula, c, *segments) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 4807b86..eac3a5c 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -130,6 +130,8 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: bool): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not 
available") + if use_fallback is True: + pytest.skip("Skipping dtype test for fallback") ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index dabe147..c2f8f4e 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -109,9 +109,9 @@ def test_primitive_tensor_product_cuda_vs_fx( ): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and mode not in ["eager"]: + if use_fallback is True and mode not in ["eager", "script", "export"]: pytest.skip( - "Only eager mode is supported for the fallback!" + "Only eager, script and export modes are supported for the fallback!" ) for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): From a6856db16fcb942659daa06919cafc7c44f4cca9 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 16 Dec 2024 14:07:12 -0800 Subject: [PATCH 62/96] optimize_fallback=use_fallback --- .../layers/tp_conv_fully_connected.py | 3 --- .../cuequivariance_torch/operations/linear.py | 3 --- .../cuequivariance_torch/operations/rotation.py | 6 ------ .../operations/spherical_harmonics.py | 3 --- .../operations/symmetric_contraction.py | 3 --- .../operations/tp_channel_wise.py | 3 --- .../operations/tp_fully_connected.py | 4 +--- .../primitives/equivariant_tensor_product.py | 5 ----- .../primitives/symmetric_tensor_product.py | 11 ----------- .../primitives/tensor_product.py | 17 +---------------- .../equivariant_tensor_product_test.py | 2 -- .../primitives/symmetric_tensor_product_test.py | 1 - .../tests/primitives/tensor_product_test.py | 2 -- 13 files changed, 2 insertions(+), 61 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py index b6a6fb0..a5b2e1a 100644 --- a/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/layers/tp_conv_fully_connected.py @@ -59,7 +59,6 @@ class FullyConnectedTensorProductConv(nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. 
Examples: >>> in_irreps = cue.Irreps("O3", "4x0e + 4x1o") @@ -106,7 +105,6 @@ def __init__( mlp_activation: Union[nn.Module, Sequence[nn.Module], None] = nn.GELU(), layout: cue.IrrepsLayout = None, # e3nn_compat_mode use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -127,7 +125,6 @@ def __init__( layout=self.layout, shared_weights=False, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) self.batch_norm = ( diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index d018767..0d41d90 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -35,7 +35,6 @@ class Linear(torch.nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. """ def __init__( @@ -52,7 +51,6 @@ def __init__( dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() irreps_in, irreps_out = default_irreps(irreps_in, irreps_out) @@ -90,7 +88,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def extra_repr(self) -> str: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py index cc2356f..62c72f8 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/rotation.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/rotation.py @@ -41,7 +41,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() (irreps,) = default_irreps(irreps) @@ -62,7 +61,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def forward( @@ -158,8 +156,6 @@ class Inversion(torch.nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. 
""" def __init__( @@ -172,7 +168,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() (irreps,) = default_irreps(irreps) @@ -191,7 +186,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py index 0cbad1a..e89a6fd 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/spherical_harmonics.py @@ -32,7 +32,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): """ Args: @@ -41,7 +40,6 @@ def __init__( use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. """ super().__init__() self.ls = ls if isinstance(ls, list) else [ls] @@ -54,7 +52,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def forward(self, vectors: torch.Tensor) -> torch.Tensor: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py index 8a25b7c..e1dd94f 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py @@ -41,7 +41,6 @@ class SymmetricContraction(torch.nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Examples: >>> irreps_in = cue.Irreps("O3", "32x0e + 32x1o") @@ -109,7 +108,6 @@ def __init__( math_dtype: Optional[torch.dtype] = None, original_mace: bool = False, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -155,7 +153,6 @@ def __init__( device=device, math_dtype=math_dtype or dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def extra_repr(self) -> str: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index a6ac80f..00f2851 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -36,7 +36,6 @@ class ChannelWiseTensorProduct(torch.nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. 
If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Note: In e3nn there was a irrep_normalization and path_normalization parameters. @@ -59,7 +58,6 @@ def __init__( dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() irreps_in1, irreps_in2 = default_irreps(irreps_in1, irreps_in2) @@ -101,7 +99,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def extra_repr(self) -> str: diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index 44c781d..5d7ff5b 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -36,7 +36,6 @@ class FullyConnectedTensorProduct(torch.nn.Module): use_fallback (bool, optional): If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): Whether to optimize fallback. Defaults to None. Note: In e3nn there was a irrep_normalization and path_normalization parameters. @@ -59,7 +58,6 @@ def __init__( dtype: Optional[torch.dtype] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() irreps_in1, irreps_in2, irreps_out = default_irreps( @@ -101,7 +99,7 @@ def __init__( layout_out=layout_out, device=device, math_dtype=math_dtype, - optimize_fallback=optimize_fallback, + use_fallback=use_fallback, ) def extra_repr(self) -> str: diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 794435b..9fa84cc 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -119,7 +119,6 @@ class EquivariantTensorProduct(torch.nn.Module): device (torch.device): device of the Module. math_dtype (torch.dtype): dtype for internal computations. use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool): whether to optimize the fallback implementation. Raises: RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. 
@@ -155,7 +154,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() if not isinstance(layout_in, tuple): @@ -203,7 +201,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) ) elif e.num_inputs == 2: @@ -213,7 +210,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) ) else: @@ -225,7 +221,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) ) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index ea373b0..ca1fb01 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -32,7 +32,6 @@ class SymmetricTensorProduct(torch.nn.Module): Args: descriptors (list of SegmentedTensorProduct): The list of SegmentedTensorProduct descriptors. math_dtype (torch.dtype, optional): The data type of the coefficients and calculations. - optimize_fallback (bool, optional): If `True`, the torch.fx graph will be optimized before execution. Because the optimization takes time, it is turned off by default. """ def __init__( @@ -42,7 +41,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -57,7 +55,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) else: self.f0 = None @@ -85,7 +82,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=optimize_fallback, ) def forward(self, x0: torch.Tensor) -> torch.Tensor: @@ -123,9 +119,6 @@ class IWeightedSymmetricTensorProduct(torch.nn.Module): The list of SegmentedTensorProduct descriptors math_dtype : torch.dtype, optional The data type of the coefficients and calculations - optimize_fallback : bool, optional - If `True`, the torch.fx graph will be optimized before execution - Because the optimization takes time, it is turned off by default. 
""" def __init__( @@ -135,7 +128,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() @@ -174,7 +166,6 @@ def __init__( descriptors, device, math_dtype=math_dtype, - optimize_fallback=optimize_fallback, ) @torch.jit.ignore @@ -344,7 +335,6 @@ def __init__( stps: list[stp.SegmentedTensorProduct], device: Optional[torch.device], math_dtype: Optional[torch.dtype], - optimize_fallback: Optional[bool], ): super().__init__() self.fs = torch.nn.ModuleList( @@ -354,7 +344,6 @@ def __init__( device=device, math_dtype=math_dtype, use_fallback=True, - optimize_fallback=optimize_fallback, ) for d in stps ] diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 25459e6..80aa80c 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -91,7 +91,6 @@ class TensorProduct(torch.nn.Module): device (torch.device, optional): The device on which the calculations are performed. use_fallback (bool, optional): Determines the computation method. If `None` (default), a CUDA kernel will be used if available. If `False`, a CUDA kernel will be used, and an exception is raised if it's not available. If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability. - optimize_fallback (bool, optional): If `True`, the fallback method is optimized. If `False`, the fallback method is used without optimization. Raises: RuntimeError: If `use_fallback` is `False` and no CUDA kernel is available. @@ -104,7 +103,6 @@ def __init__( device: Optional[torch.device] = None, math_dtype: Optional[torch.dtype] = None, use_fallback: Optional[bool] = None, - optimize_fallback: Optional[bool] = None, ): super().__init__() self.descriptor = descriptor @@ -135,17 +133,7 @@ def __init__( ) if self.f is None: - if optimize_fallback is None: - optimize_fallback = False - warnings.warn( - "The fallback method is used but it has not been optimized. " - "Consider setting optimize_fallback=True when creating the TensorProduct module." - ) - - self.f = _tensor_product_fx( - descriptor, device, math_dtype, optimize_fallback - ) - self._optimize_fallback = optimize_fallback + self.f = _tensor_product_fx(descriptor, device, math_dtype, True) @torch.jit.ignore def __repr__(self): @@ -170,9 +158,6 @@ def forward(self, inputs: List[torch.Tensor]): `last_operand_size` is the size of the last operand in the descriptor. 
""" - # if any(x.numel() == 0 for x in inputs): - # use_fallback = True # Empty tensors are not supported by the CUDA kernel - return self.f(inputs) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 9044ee3..4e2045f 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -91,7 +91,6 @@ def test_performance_cuda_vs_fx( device=device, math_dtype=math_dtype, use_fallback=True, - optimize_fallback=True, ) inputs = [ @@ -159,7 +158,6 @@ def test_precision_cuda_vs_fx( device=device, math_dtype=torch.float64, use_fallback=True, - optimize_fallback=True, ) inputs = [x.to(torch.float64) for x in inputs] y1 = m(inputs).to(dtype) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index eac3a5c..6a3c8ed 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -91,7 +91,6 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( math_dtype=torch.float64, device=device, use_fallback=True, - optimize_fallback=True, ) out2 = m(x0_, i0, x1_) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index c2f8f4e..46c1928 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -130,7 +130,6 @@ def test_primitive_tensor_product_cuda_vs_fx( device=device, math_dtype=math_dtype, use_fallback=use_fallback, - optimize_fallback=True, ) m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) @@ -141,7 +140,6 @@ def test_primitive_tensor_product_cuda_vs_fx( device=device, math_dtype=torch.float64, use_fallback=True, - optimize_fallback=False, ) inputs_ = [inp.to(torch.float64) for inp in inputs] From b8be9a2682634f35d271cd0127c8079b46176dd1 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 15:10:23 -0800 Subject: [PATCH 63/96] Fixing the reinterpret cast Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 25 +++++++++++-------- .../equivariant_tensor_product_test.py | 2 +- .../symmetric_tensor_product_test.py | 5 ++-- cuequivariance_torch/tests/utils.py | 6 ++--- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 25459e6..aaf9fa0 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -175,6 +175,15 @@ def forward(self, inputs: List[torch.Tensor]): return self.f(inputs) +class NoConvTensor(torch.Tensor): + def to(self, *args, **kwargs): + new_kwargs = kwargs.copy() + new_kwargs.pop('dtype', None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + result = super().to(*new_args, **new_kwargs) + return result + def clone(self): + return super().clone() def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, @@ -227,21 +236,17 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - # int_dtype = { - # 2: torch.int16, - # 4: torch.int32, - # 8: torch.int64, - 
# }[math_dtype.itemsize] - - constants[f"c{path_idx}"] = torch.tensor( + c_tensor = NoConvTensor(torch.tensor( path.coefficients, dtype=math_dtype, device=device - ) # .view(dtype=int_dtype) + )) + + constants[f"c{path_idx}"] = c_tensor c = ( torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) - # .view(dtype=math_dtype) .clone() ) + # out = torch.tensor(c) out = torch.einsum(formula, c, *segments) out = out.to(dtype=inputs[0].dtype) @@ -718,7 +723,7 @@ def forward(self, inputs: List[torch.Tensor]): out = self._f(x0, x1, x2) - return out.reshape(shape + (out.shape[-1],)) + return out # .reshape(shape + (out.shape[-1],)) def _permutation_module(permutation: Tuple[int, ...]): diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 9044ee3..ad827ca 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -198,7 +198,7 @@ def test_compile( @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) -@pytest.mark.parametrize("use_fallback", [False]) +@pytest.mark.parametrize("use_fallback", [True, False]) @pytest.mark.parametrize("mode", export_modes) def test_export( e: cue.EquivariantTensorProduct, diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index eac3a5c..28a5289 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -130,8 +130,8 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: bool): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True: - pytest.skip("Skipping dtype test for fallback") + # if use_fallback is True: + # pytest.skip("Skipping dtype test for fallback") ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] @@ -148,6 +148,7 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b # .to should have no effect for param in m.parameters(): assert False # no parameters + # breakpoint() m = m.to(torch.float64) out2 = m(x0, i0, x1) diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index b90207a..3244a84 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -135,9 +135,9 @@ def module_with_mode( module = torch.fx.symbolic_trace(module) elif mode == "script": module = torch.jit.script(module) - # fname = os.path.join(tmp_path, "test.ts") - # torch.jit.save(module, fname) - # module = torch.jit.load(fname) + fname = os.path.join(tmp_path, "test.ts") + torch.jit.save(module, fname) + module = torch.jit.load(fname) elif mode == "jit": module = torch.jit.trace(module, inputs) fname = os.path.join(tmp_path, "test.ts") From 3538231fbf2f94430c02417d56be10f2aecca96c Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 16 Dec 2024 15:14:59 -0800 Subject: [PATCH 64/96] Fixing clone() Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/primitives/tensor_product.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 343ca3b..7c0fffb 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -168,7 +168,7 @@ def to(self, *args, **kwargs): result = super().to(*new_args, **new_kwargs) return result def clone(self): - return super().clone() + return torch.Tensor(self) def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, From 0c0d7e93181466b7889ce74a83dd42cdf30c6ecf Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 04:01:17 -0800 Subject: [PATCH 65/96] delete broadcast_shapes --- .../primitives/symmetric_tensor_product.py | 22 +-- .../primitives/tensor_product.py | 133 ++---------------- 2 files changed, 25 insertions(+), 130 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py index ca1fb01..0e22707 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py @@ -20,7 +20,6 @@ import cuequivariance.segmented_tensor_product as stp import cuequivariance_torch as cuet -from cuequivariance_torch.primitives.tensor_product import broadcast_shapes, prod logger = logging.getLogger(__name__) @@ -192,9 +191,9 @@ def forward( x0 : torch.Tensor The input tensor for the first operand. It should have the shape (i0.max() + 1, x0_size). i0 : torch.Tensor - The index tensor for the first operand. It should have the shape (...). + The index tensor for the first operand. It should have the shape (batch). x1 : torch.Tensor - The repeated input tensor. It should have the shape (..., x1_size). + The repeated input tensor. It should have the shape (batch, x1_size). 
Returns ------- @@ -207,14 +206,15 @@ def forward( x0.ndim == 2, f"Expected 2 dims (i0.max() + 1, x0_size), got shape {x0.shape}", ) - shape = broadcast_shapes([i0.shape, x1.shape[:-1]]) - i0 = i0.expand(shape) - i0 = i0.reshape((prod(shape),)) - x1 = x1.expand(shape + (x1.shape[-1],)).reshape((prod(shape), x1.shape[-1])) - - out = self.f(x0, i0, x1) - out = out.reshape(shape + (self.x2_size,)) - return out + torch._assert( + i0.ndim == 1, + f"Expected 1 dim (batch), got shape {i0.shape}", + ) + torch._assert( + x1.ndim == 2, + f"Expected 2 dims (batch, x1_size), got shape {x1.shape}", + ) + return self.f(x0, i0, x1) def _check_descriptors(descriptors: list[stp.SegmentedTensorProduct]): diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 7c0fffb..184f559 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -38,49 +38,6 @@ def prod(numbers: List[int]): return math.prod(numbers) -def broadcast_shapes(shapes: List[List[int]]): - """ - This method is a workaround for script() not recognizing torch.broadcast_shapes() - """ - if torch.jit.is_scripting(): - max_len = 0 - for shape in shapes: - if isinstance(shape, int): - if max_len < 1: - max_len = 1 - elif isinstance(shape, (tuple, list)): - s = len(shape) - if max_len < s: - max_len = s - result = [1] * max_len - for shape in shapes: - if isinstance(shape, int): - shape = (shape,) - if isinstance(shape, (tuple, list)): - for i in range(-1, -1 - len(shape), -1): - if shape[i] < 0: - raise RuntimeError( - "Trying to create tensor with negative dimension ({}): ({})".format( - shape[i], shape[i] - ) - ) - if shape[i] == 1 or shape[i] == result[i]: - continue - if result[i] != 1: - raise RuntimeError( - "Shape mismatch: objects cannot be broadcast to a single shape" - ) - result[i] = shape[i] - else: - raise RuntimeError( - "Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", - shape, - ) - return torch.Size(result) - else: - return torch.functional.broadcast_shapes(*shapes) - - class TensorProduct(torch.nn.Module): """ PyTorch module that computes the last operand of the segmented tensor product defined by the descriptor. 
@@ -160,16 +117,19 @@ def forward(self, inputs: List[torch.Tensor]): """ return self.f(inputs) + class NoConvTensor(torch.Tensor): def to(self, *args, **kwargs): new_kwargs = kwargs.copy() - new_kwargs.pop('dtype', None) - new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + new_kwargs.pop("dtype", None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args] result = super().to(*new_args, **new_kwargs) return result + def clone(self): return torch.Tensor(self) + def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], @@ -221,16 +181,13 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - c_tensor = NoConvTensor(torch.tensor( - path.coefficients, dtype=math_dtype, device=device - )) + c_tensor = NoConvTensor( + torch.tensor(path.coefficients, dtype=math_dtype, device=device) + ) constants[f"c{path_idx}"] = c_tensor - c = ( - torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer) - .clone() - ) + c = torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer).clone() # out = torch.tensor(c) out = torch.einsum(formula, c, *segments) out = out.to(dtype=inputs[0].dtype) @@ -402,21 +359,7 @@ def forward(self, args: List[torch.Tensor]): "input shape[-1] does not match operand size", ) - shape = broadcast_shapes([arg.shape[:-1] for arg in args]) - - args = [ - ( - arg.expand(shape + (arg.shape[-1],)).reshape( - (prod(shape), arg.shape[-1]) - ) - if prod(arg.shape[:-1]) > 1 - else arg.reshape((prod(arg.shape[:-1]), arg.shape[-1])) - ) - for arg in args - ] - out = self.module(args) - - return out.reshape(shape + (out.shape[-1],)) + return self.module(args) def _tensor_product_cuda( @@ -547,21 +490,13 @@ def __repr__(self) -> str: def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1 = self._perm(inputs[0], inputs[1]) - assert x0.ndim >= 1, x0.ndim - assert x1.ndim >= 1, x1.ndim - - shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1]]) - x0 = _reshape(x0, shape) - x1 = _reshape(x1, shape) if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) - out = self._f(x0, x1) - - return out.reshape(shape + (out.shape[-1],)) + return self._f(x0, x1) class FusedTensorProductOp4(torch.nn.Module): @@ -603,23 +538,13 @@ def __repr__(self) -> str: def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1, x2 = self._perm(inputs[0], inputs[1], inputs[2]) - assert x0.ndim >= 1, x0.ndim - assert x1.ndim >= 1, x1.ndim - assert x2.ndim >= 1, x2.ndim - - shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]]) - x0 = _reshape(x0, shape) - x1 = _reshape(x1, shape) - x2 = _reshape(x2, shape) if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) - out = self._f(x0, x1, x2) - - return out.reshape(shape + (out.shape[-1],)) + return self._f(x0, x1, x2) class TensorProductUniform1d(torch.nn.Module): @@ -656,26 +581,13 @@ def __repr__(self): def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: x0, x1 = inputs - assert x0.ndim >= 1, x0.ndim - assert x1.ndim >= 1, x1.ndim - - shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1]]) - x0 = _reshape(x0, shape) - x1 = _reshape(x1, shape) - - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) if not torch.jit.is_scripting() and not 
torch.compiler.is_compiling(): logger.debug( f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) - out = self._f(x0, x1, x0) - - return out.reshape(shape + (out.shape[-1],)) + return self._f(x0, x1, x0) class TensorProductUniform4x1d(TensorProductUniform1d): @@ -685,30 +597,13 @@ def __repr__(self): def forward(self, inputs: List[torch.Tensor]): x0, x1, x2 = inputs - assert x0.ndim >= 1, x0.ndim - assert x1.ndim >= 1, x1.ndim - assert x2.ndim >= 1, x2.ndim - - shape = broadcast_shapes([x0.shape[:-1], x1.shape[:-1], x2.shape[:-1]]) - x0 = _reshape(x0, shape) - x1 = _reshape(x1, shape) - x2 = _reshape(x2, shape) - - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) - if x2.ndim == 1: - x2 = x2.unsqueeze(0) if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): logger.debug( f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) - out = self._f(x0, x1, x2) - - return out # .reshape(shape + (out.shape[-1],)) + return self._f(x0, x1, x2) def _permutation_module(permutation: Tuple[int, ...]): From 97ca27ac464f0051abdd9cc8c1e67a6e0076c1ee Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 04:04:18 -0800 Subject: [PATCH 66/96] delete _reshape --- .../cuequivariance_torch/primitives/tensor_product.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 184f559..8160b08 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -440,17 +440,6 @@ def _tensor_product_cuda( return FusedTensorProductOp4(descriptor, perm[:3], device, math_dtype) -@torch.jit.script -def _reshape(x: torch.Tensor, leading_shape: List[int]) -> torch.Tensor: - # Make x have shape (Z, x.shape[-1]) or (x.shape[-1],) - if prod(leading_shape) > 1 and prod(x.shape[:-1]) == 1: - return x.reshape((x.shape[-1],)) - else: - return x.expand(leading_shape + (x.shape[-1],)).reshape( - (prod(leading_shape), x.shape[-1]) - ) - - class FusedTensorProductOp3(torch.nn.Module): def __init__( self, From 64bd41f06f348ad5c1d5f3d650f1d0e3370d3448 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 04:09:48 -0800 Subject: [PATCH 67/96] rename --- cuequivariance_torch/tests/primitives/tensor_product_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 46c1928..b89510c 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -21,7 +21,6 @@ ) import cuequivariance as cue -import cuequivariance.segmented_tensor_product as stp import cuequivariance_torch as cuet from cuequivariance import descriptors @@ -65,7 +64,7 @@ def make_descriptors(): "u,,u", ",v,v", ]: - d = stp.SegmentedTensorProduct.from_subscripts(subscripts) + d = cue.SegmentedTensorProduct.from_subscripts(subscripts) for i in range(3): d.add_path( *[None] * d.num_operands, @@ -99,7 +98,7 @@ def make_descriptors(): @pytest.mark.parametrize("use_fallback", [True, False]) @pytest.mark.parametrize("mode", export_modes) def test_primitive_tensor_product_cuda_vs_fx( - d: stp.SegmentedTensorProduct, + d: 
cue.SegmentedTensorProduct, dtype: torch.dtype, math_dtype: torch.dtype, tol: float, From d527020f1a725ee67519d909f4d4463da5ff2d7b Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 17 Dec 2024 10:41:07 -0800 Subject: [PATCH 68/96] Using alternative disable type change fixture Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 15 +++++++++++++-- .../primitives/symmetric_tensor_product_test.py | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 7c0fffb..3170d79 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -170,6 +170,17 @@ def to(self, *args, **kwargs): def clone(self): return torch.Tensor(self) +def disable_type_conv(t: torch.Tensor): + original_to = t.to + def to_notypeconv(self, *args, **kwargs): + new_kwargs = kwargs.copy() + new_kwargs.pop('dtype', None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + result = original_to(self, *new_args, **new_kwargs) + return result + t.to = to_notypeconv + return t + def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], @@ -221,7 +232,7 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - c_tensor = NoConvTensor(torch.tensor( + c_tensor = disable_type_conv(torch.tensor( path.coefficients, dtype=math_dtype, device=device )) @@ -708,7 +719,7 @@ def forward(self, inputs: List[torch.Tensor]): out = self._f(x0, x1, x2) - return out # .reshape(shape + (out.shape[-1],)) + return out.reshape(shape + (out.shape[-1],)) def _permutation_module(permutation: Tuple[int, ...]): diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index a90b84a..b57dc4c 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -129,8 +129,6 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: bool): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - # if use_fallback is True: - # pytest.skip("Skipping dtype test for fallback") ds = descriptors.symmetric_contraction( cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] @@ -147,9 +145,11 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b # .to should have no effect for param in m.parameters(): assert False # no parameters - # breakpoint() + + m = m.float() m = m.to(torch.float64) + out2 = m(x0, i0, x1) assert out1.dtype == dtype From 3687da3529a38ae5f5b058c0ca3f64f75ee50817 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 17 Dec 2024 11:39:27 -0800 Subject: [PATCH 69/96] Restored assert Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/primitives/tensor_product.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index dd69c3e..e9ab2f7 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ 
b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -153,8 +153,8 @@ def _tensor_product_fx( torch.fx.Proxy(graph.placeholder(f"input_{i}"), tracer) for i in range(num_inputs) ] - # for input in inputs: - # torch._assert(input.ndim == 2, "input should have ndim=2") + for input in inputs: + torch._assert(input.ndim == 2, "input should have ndim=2") operand_subscripts = [ f"Z{operand.subscripts}" for operand in descriptor.operands ] From 687dd530c03a228429f53705650306ca2b2aa07a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 12:39:20 -0800 Subject: [PATCH 70/96] try fix test --- .../tests/primitives/tensor_product_test.py | 72 +++++++++---------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index b89510c..eb511e6 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import itertools import pytest import torch @@ -113,50 +112,49 @@ def test_primitive_tensor_product_cuda_vs_fx( "Only eager, script and export modes are supported for the fallback!" ) - for batches in itertools.product([(16,), (), (4, 1)], repeat=d.num_operands - 1): - inputs = [ - torch.randn( - batches[i] + (d.operands[i].size,), - device=device, - dtype=dtype, - requires_grad=True, - ) - for i in range(d.num_operands - 1) - ] - - m = cuet.TensorProduct( - d, + inputs = [ + torch.randn( + (12, d.operands[i].size), device=device, - math_dtype=math_dtype, - use_fallback=use_fallback, + dtype=dtype, + requires_grad=True, ) - m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) + for i in range(d.num_operands - 1) + ] - out1 = m(inputs) + m = cuet.TensorProduct( + d, + device=device, + math_dtype=math_dtype, + use_fallback=use_fallback, + ) + m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) - m = cuet.TensorProduct( - d, - device=device, - math_dtype=torch.float64, - use_fallback=True, - ) + out1 = m(inputs) + + m = cuet.TensorProduct( + d, + device=device, + math_dtype=torch.float64, + use_fallback=True, + ) - inputs_ = [inp.to(torch.float64) for inp in inputs] - out2 = m(inputs_) + inputs_ = [inp.to(torch.float64) for inp in inputs] + out2 = m(inputs_) - assert out1.shape[:-1] == torch.broadcast_shapes(*batches) - assert out1.dtype == dtype + assert out1.shape[:-1] == (12,) + assert out1.dtype == dtype - torch.testing.assert_close(out1, out2.to(dtype), atol=tol, rtol=tol) + torch.testing.assert_close(out1, out2.to(dtype), atol=tol, rtol=tol) - grad1 = torch.autograd.grad(out1.sum(), inputs, create_graph=True) - grad2 = torch.autograd.grad(out2.sum(), inputs_, create_graph=True) + grad1 = torch.autograd.grad(out1.sum(), inputs, create_graph=True) + grad2 = torch.autograd.grad(out2.sum(), inputs_, create_graph=True) - for g1, g2 in zip(grad1, grad2): - torch.testing.assert_close(g1, g2.to(dtype), atol=10 * tol, rtol=10 * tol) + for g1, g2 in zip(grad1, grad2): + torch.testing.assert_close(g1, g2.to(dtype), atol=10 * tol, rtol=10 * tol) - double_grad1 = torch.autograd.grad(sum(g.sum() for g in grad1), inputs) - double_grad2 = torch.autograd.grad(sum(g.sum() for g in grad2), inputs_) + double_grad1 = torch.autograd.grad(sum(g.sum() for g in grad1), 
inputs) + double_grad2 = torch.autograd.grad(sum(g.sum() for g in grad2), inputs_) - for g1, g2 in zip(double_grad1, double_grad2): - torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) + for g1, g2 in zip(double_grad1, double_grad2): + torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) From 0576ad04c562b333e9d3fcbbd941181cbd0255c9 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 13:01:39 -0800 Subject: [PATCH 71/96] simplify symmetric_tensor_product_test to make test run faster --- .../symmetric_tensor_product_test.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index b57dc4c..4ae1db5 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -25,13 +25,7 @@ def make_descriptors(): yield descriptors.symmetric_contraction( - cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [3] - ).ds - yield descriptors.symmetric_contraction( - cue.Irreps("O3", "0e + 1o + 2e"), cue.Irreps("O3", "0e + 1o"), [4] - ).ds - yield descriptors.symmetric_contraction( - cue.Irreps("SU2", "0 + 1/2"), cue.Irreps("SU2", "0 + 1/2"), [5] + cue.Irreps("SO3", "0 + 1 + 2"), cue.Irreps("SO3", "0"), [1, 2, 3] ).ds d1 = stp.SegmentedTensorProduct.from_subscripts(",,") @@ -60,16 +54,10 @@ def make_descriptors(): @pytest.mark.parametrize("ds", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings1) -@pytest.mark.parametrize("use_fallback", [False, True]) def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( - ds: list[stp.SegmentedTensorProduct], - dtype, - math_dtype, - tol: float, - use_fallback: bool, + ds: list[stp.SegmentedTensorProduct], dtype, math_dtype, tol: float ): - if use_fallback is False and not torch.cuda.is_available(): - pytest.skip("CUDA is not available") + use_fallback = not torch.cuda.is_available() m = cuet.IWeightedSymmetricTensorProduct( ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback @@ -87,10 +75,7 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( out1 = m(x0, i0, x1) m = cuet.IWeightedSymmetricTensorProduct( - ds, - math_dtype=torch.float64, - device=device, - use_fallback=True, + ds, math_dtype=torch.float64, device=device, use_fallback=True ) out2 = m(x0_, i0, x1_) @@ -149,7 +134,6 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b m = m.float() m = m.to(torch.float64) - out2 = m(x0, i0, x1) assert out1.dtype == dtype From 66aa108945d76b8e27c81bc1b910cea8ea91d598 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 13:48:31 -0800 Subject: [PATCH 72/96] try to fix some tests --- .../primitives/tensor_product.py | 47 +++++++++++++++++-- .../tests/operations/rotation_test.py | 4 +- .../tests/operations/tp_channel_wise_test.py | 9 ++-- 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index e9ab2f7..1054a62 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -117,17 +117,21 @@ def forward(self, inputs: List[torch.Tensor]): """ return self.f(inputs) + def 
disable_type_conv(t: torch.Tensor): original_to = t.to + def to_notypeconv(self, *args, **kwargs): new_kwargs = kwargs.copy() - new_kwargs.pop('dtype', None) - new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + new_kwargs.pop("dtype", None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args] result = original_to(self, *new_args, **new_kwargs) return result + t.to = to_notypeconv return t + def _tensor_product_fx( descriptor: stp.SegmentedTensorProduct, device: Optional[torch.device], @@ -179,9 +183,9 @@ def _tensor_product_fx( segments.append(inp.to(dtype=math_dtype)) - c_tensor = disable_type_conv(torch.tensor( - path.coefficients, dtype=math_dtype, device=device - )) + c_tensor = disable_type_conv( + torch.tensor(path.coefficients, dtype=math_dtype, device=device) + ) constants[f"c{path_idx}"] = c_tensor c = torch.fx.Proxy(graph.get_attr(f"c{path_idx}"), tracer=tracer).clone() @@ -355,6 +359,7 @@ def forward(self, args: List[torch.Tensor]): "input shape[-1] does not match operand size", ) + args = [arg.unsqueeze(0) if arg.ndim == 1 else arg for arg in args] return self.module(args) @@ -481,6 +486,14 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) + if x0.ndim == 1: + x0 = x0.unsqueeze(0) + if x1.ndim == 1: + x1 = x1.unsqueeze(0) + Z = max(x0.shape[0], x1.shape[0]) + x0 = x0.expand(Z, -1) + x1 = x1.expand(Z, -1) + return self._f(x0, x1) @@ -529,6 +542,18 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) + if x0.ndim == 1: + x0 = x0.unsqueeze(0) + if x1.ndim == 1: + x1 = x1.unsqueeze(0) + if x2.ndim == 1: + x2 = x2.unsqueeze(0) + + Z = max(x0.shape[0], x1.shape[0], x2.shape[0]) + x0 = x0.expand(Z, -1) + x1 = x1.expand(Z, -1) + x2 = x2.expand(Z, -1) + return self._f(x0, x1, x2) @@ -572,6 +597,11 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) + if x0.ndim == 1: + x0 = x0.unsqueeze(0) + if x1.ndim == 1: + x1 = x1.unsqueeze(0) + return self._f(x0, x1, x0) @@ -588,6 +618,13 @@ def forward(self, inputs: List[torch.Tensor]): f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) + if x0.ndim == 1: + x0 = x0.unsqueeze(0) + if x1.ndim == 1: + x1 = x1.unsqueeze(0) + if x2.ndim == 1: + x2 = x2.unsqueeze(0) + return self._f(x0, x1, x2) diff --git a/cuequivariance_torch/tests/operations/rotation_test.py b/cuequivariance_torch/tests/operations/rotation_test.py index 86d0230..6f04eef 100644 --- a/cuequivariance_torch/tests/operations/rotation_test.py +++ b/cuequivariance_torch/tests/operations/rotation_test.py @@ -51,7 +51,7 @@ def test_inversion(): irreps = cue.Irreps("O3", "2x1e + 1o") torch.testing.assert_close( cuet.Inversion(irreps, layout=cue.ir_mul)( - torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]) + torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]) ), - torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0]), + torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0]]), ) diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index d3e7cdd..a56e8ce 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ 
b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -34,7 +34,7 @@ @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) @pytest.mark.parametrize("batch", [1, 32]) -def test_channel_wise( +def test_channel_wise_fwd( irreps1: cue.Irreps, irreps2: cue.Irreps, irreps3: cue.Irreps, @@ -72,13 +72,14 @@ def test_channel_wise( torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) -def test_channel_wise_bwd_bwd(): +@pytest.mark.parametrize("irreps", ["32x0", "2x0 + 3x1"]) +def test_channel_wise_bwd_bwd(irreps: cue.Irreps): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") - irreps1 = cue.Irreps("SO3", "2x0 + 3x1") + irreps1 = cue.Irreps("SO3", irreps) irreps2 = cue.Irreps("SO3", "0 + 1") - irreps3 = cue.Irreps("SO3", "0 + 1") + irreps3 = cue.Irreps("SO3", irreps) x1 = torch.randn( 32, irreps1.dim, device=device, requires_grad=True, dtype=torch.float64 From 17df143c2d5150b38d122cb7850164b226759bd8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 17 Dec 2024 13:54:02 -0800 Subject: [PATCH 73/96] Fixing disable_type_conv Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 22 ++++++++++++------- .../equivariant_tensor_product_test.py | 9 +------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index e9ab2f7..db2d4d9 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -117,14 +117,20 @@ def forward(self, inputs: List[torch.Tensor]): """ return self.f(inputs) -def disable_type_conv(t: torch.Tensor): - original_to = t.to - def to_notypeconv(self, *args, **kwargs): - new_kwargs = kwargs.copy() - new_kwargs.pop('dtype', None) - new_args = [None if isinstance(a, torch.dtype) else a for a in args ] - result = original_to(self, *new_args, **new_kwargs) - return result +def to_notypeconv(self, *args, **kwargs): + new_kwargs = kwargs.copy() + new_kwargs.pop('dtype', None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + result = self.__original_to(*new_args, **new_kwargs) + return result + + +def disable_type_conv(t): + """ + This modifier can be used on Tensors or whole Modules + to prevent them from being modified during to(dtype=x) calls + """ + t.__original_to = t.to t.to = to_notypeconv return t diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 4a68747..3999964 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -190,9 +190,7 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -# export_modes = ["script", "export", "onnx", "trt" ] # , "torch_trt", "jit"] -export_modes = ["script"] # , "torch_trt", "jit"] - +export_modes = ["script", "export", "onnx", "trt", "torch_trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) @@ -211,11 +209,6 @@ def test_export( if not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and mode not in ["eager", "script"]: - pytest.skip( - "Only eager, script and export modes are 
supported for the fallback!" - ) - m = cuet.EquivariantTensorProduct( e, layout=cue.mul_ir, From 1c459aa1e5d5acfd531ad966c294942266ef0fcf Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 14:02:17 -0800 Subject: [PATCH 74/96] try fix --- .../primitives/tensor_product.py | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 1054a62..6fd332f 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -486,14 +486,8 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) - Z = max(x0.shape[0], x1.shape[0]) - x0 = x0.expand(Z, -1) - x1 = x1.expand(Z, -1) - + # ops.FusedTensorProductOp3 expects inputs + # of shape (Z, dim) or (dim,) return self._f(x0, x1) @@ -542,18 +536,8 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) - if x2.ndim == 1: - x2 = x2.unsqueeze(0) - - Z = max(x0.shape[0], x1.shape[0], x2.shape[0]) - x0 = x0.expand(Z, -1) - x1 = x1.expand(Z, -1) - x2 = x2.expand(Z, -1) - + # ops.FusedTensorProductOp4 expects inputs + # of shape (Z, dim) or (dim,) return self._f(x0, x1, x2) @@ -602,6 +586,8 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: if x1.ndim == 1: x1 = x1.unsqueeze(0) + # ops.TensorProductUniform1d expects inputs + # of shape (Z, dim) or (1, dim) return self._f(x0, x1, x0) @@ -625,6 +611,8 @@ def forward(self, inputs: List[torch.Tensor]): if x2.ndim == 1: x2 = x2.unsqueeze(0) + # ops.TensorProductUniform1d expects inputs + # of shape (Z, dim) or (1, dim) return self._f(x0, x1, x2) From df170425ab3ab8fcd40f60b9349cac34c9687bc5 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Tue, 17 Dec 2024 14:32:11 -0800 Subject: [PATCH 75/96] fix strange bug --- .../tests/primitives/symmetric_tensor_product_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 4ae1db5..1191437 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -131,7 +131,7 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b for param in m.parameters(): assert False # no parameters - m = m.float() + m = m.to(torch.float32) m = m.to(torch.float64) out2 = m(x0, i0, x1) From a8516aa098137b6676eabad0a01a23091381fb69 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 18 Dec 2024 02:50:24 -0800 Subject: [PATCH 76/96] Script fixes for uniform Signed-off-by: Boris Fomitchev --- .../primitives/tensor_product.py | 2 +- .../tests/primitives/script_test.py | 48 ++++++++++++------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 8b7aa3c..1bf21c1 100644 --- 
a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -591,7 +591,7 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: # ops.TensorProductUniform1d expects inputs # of shape (Z, dim) or (1, dim) - return self._f(x0, x1, x0) + return self._f(x0, x1) class TensorProductUniform4x1d(TensorProductUniform1d): diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py index 4706bff..5221f69 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -11,6 +11,9 @@ TensorProductUniform3x1d, TensorProductUniform4x1d, ) +from tests.utils import ( + module_with_mode, +) device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") @@ -55,8 +58,10 @@ def test_script_fused_tp_3(): assert module([x0, x1]).shape == (batch, d.operands[2].size) +export_modes = ["script", "export"] -def test_script_fused_tp_4(): +@pytest.mark.parametrize("mode", export_modes) +def test_script_fused_tp_4(mode, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -73,14 +78,16 @@ def test_script_fused_tp_4(): x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) - - module = FusedTensorProductOp4(d, (0, 1, 2), device, torch.float32) - module = torch.jit.script(module) - - assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) - - -def test_script_uniform_tp_3(): + + inputs = [x0, x1, x2] + m = FusedTensorProductOp4(d, [0, 1, 2], device, torch.float32) + module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) + out1=m(inputs) + out2=module(inputs) + torch.testing.assert_close(out1, out2) + +@pytest.mark.parametrize("mode", export_modes) +def test_script_uniform_tp_3(mode, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -95,14 +102,17 @@ def test_script_uniform_tp_3(): batch = 12 x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) + inputs = [x0, x1] - module = TensorProductUniform3x1d(d, device, torch.float32) - module = torch.jit.script(module) - - assert module([x0, x1]).shape == (batch, d.operands[2].size) + m = TensorProductUniform3x1d(d, device, torch.float32) + module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) + out1=m(inputs) + out2=module(inputs) + torch.testing.assert_close(out1, out2) -def test_script_uniform_tp_4(): +@pytest.mark.parametrize("mode", export_modes) +def test_script_uniform_tp_4(mode, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -118,8 +128,10 @@ def test_script_uniform_tp_4(): x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) + inputs = [x0, x1, x2] - module = TensorProductUniform4x1d(d, device, torch.float32) - module = torch.jit.script(module) - - assert module([x0, x1, x2]).shape == (batch, d.operands[3].size) + m= TensorProductUniform4x1d(d, device, torch.float32) + module = module_with_mode(mode, m, 
(inputs,), torch.float32, tmp_path) + out1=m(inputs) + out2=module(inputs) + torch.testing.assert_close(out1, out2) From 9a256c9f603dbc8fdd9c7a8ec80b63365cf0fd3b Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 18 Dec 2024 06:49:15 -0800 Subject: [PATCH 77/96] add test_script_tensor_product --- .../tests/primitives/tensor_product_test.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index eb511e6..9e2bf35 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -158,3 +158,22 @@ def test_primitive_tensor_product_cuda_vs_fx( for g1, g2 in zip(double_grad1, double_grad2): torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) + + +@pytest.mark.parametrize("d", make_descriptors()) +@pytest.mark.parametrize("mode", export_modes) +def test_script_tensor_product(d: cue.SegmentedTensorProduct, mode, tmp_path): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + batch = 12 + inputs = [ + torch.randn(batch, ope.size, device=device, dtype=torch.float32) + for ope in d.operands[:-1] + ] + + m = cuet.TensorProduct(d, device=device, math_dtype=torch.float32) + module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) + out1 = m(inputs) + out2 = module(inputs) + torch.testing.assert_close(out1, out2) From 0b37932d8e259c28e76ec62d900173ee0b61d038 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 18 Dec 2024 11:59:14 -0800 Subject: [PATCH 78/96] Moving all export tests, disabling torch_trt for now Signed-off-by: Boris Fomitchev --- .../equivariant_tensor_product_test.py | 2 +- .../tests/primitives/tensor_product_test.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 3999964..ebd18ba 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -190,7 +190,7 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -export_modes = ["script", "export", "onnx", "trt", "torch_trt", "jit"] +export_modes = ["script", "export", "onnx", "trt", "jit"] @pytest.mark.parametrize("e", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 9e2bf35..8343f08 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -89,13 +89,12 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] -export_modes = ["script", "eager"] # , "export", "onnx", "trt", "torch_trt", "jit"] +export_modes = ["script", "export", "onnx", "trt", "jit"] @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) @pytest.mark.parametrize("use_fallback", [True, False]) -@pytest.mark.parametrize("mode", export_modes) def test_primitive_tensor_product_cuda_vs_fx( d: cue.SegmentedTensorProduct, dtype: torch.dtype, @@ -107,10 +106,6 @@ def test_primitive_tensor_product_cuda_vs_fx( ): if use_fallback is False 
and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - if use_fallback is True and mode not in ["eager", "script", "export"]: - pytest.skip( - "Only eager, script and export modes are supported for the fallback!" - ) inputs = [ torch.randn( @@ -128,7 +123,6 @@ def test_primitive_tensor_product_cuda_vs_fx( math_dtype=math_dtype, use_fallback=use_fallback, ) - m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) out1 = m(inputs) @@ -162,7 +156,8 @@ def test_primitive_tensor_product_cuda_vs_fx( @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("mode", export_modes) -def test_script_tensor_product(d: cue.SegmentedTensorProduct, mode, tmp_path): +@pytest.mark.parametrize("use_fallback", [True, False]) +def test_script_tensor_product(d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -172,6 +167,11 @@ def test_script_tensor_product(d: cue.SegmentedTensorProduct, mode, tmp_path): for ope in d.operands[:-1] ] + # if use_fallback is True and mode not in ["eager", "script", "export"]: + # pytest.skip( + # "Only eager, script and export modes are supported for the fallback!" + # ) + m = cuet.TensorProduct(d, device=device, math_dtype=torch.float32) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) out1 = m(inputs) From a5262bdba1b9f2f7960727237d6cfcf303149076 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 18 Dec 2024 13:40:57 -0800 Subject: [PATCH 79/96] more strict input shapes --- .../primitives/tensor_product.py | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 1bf21c1..20d1702 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -105,8 +105,9 @@ def forward(self, inputs: List[torch.Tensor]): Args: inputs (list of torch.Tensor): The input tensors. The number of input tensors should match the number of operands in the descriptor minus one. - Each input tensor should have a shape of ((batch,) operand_size), where `operand_size` corresponds to the size - of each operand as defined in the tensor product descriptor. + Each input tensor should have a shape of (batch, operand_size) or (1, operand_size) + where `operand_size` corresponds to the size of each operand as defined in + the tensor product descriptor. 
Returns: torch.Tensor: @@ -117,10 +118,11 @@ def forward(self, inputs: List[torch.Tensor]): """ return self.f(inputs) + def to_notypeconv(self, *args, **kwargs): new_kwargs = kwargs.copy() - new_kwargs.pop('dtype', None) - new_args = [None if isinstance(a, torch.dtype) else a for a in args ] + new_kwargs.pop("dtype", None) + new_args = [None if isinstance(a, torch.dtype) else a for a in args] result = self.__original_to(*new_args, **new_kwargs) return result @@ -128,7 +130,7 @@ def to_notypeconv(self, *args, **kwargs): def disable_type_conv(t): """ This modifier can be used on Tensors or whole Modules - to prevent them from being modified during to(dtype=x) calls + to prevent them from being modified during to(dtype=x) calls """ t.__original_to = t.to t.to = to_notypeconv @@ -358,11 +360,14 @@ def forward(self, args: List[torch.Tensor]): if not torch.jit.is_scripting() and not torch.compiler.is_compiling(): for oid, arg in enumerate(args): torch._assert( - arg.shape[-1] == self.descriptor.operands[oid].size, - "input shape[-1] does not match operand size", + arg.ndim == 2, + f"input {oid} should have ndim=2", + ) + torch._assert( + arg.shape[1] == self.descriptor.operands[oid].size, + f"input {oid} should have shape (batch, {self.descriptor.operands[oid].size})", ) - args = [arg.unsqueeze(0) if arg.ndim == 1 else arg for arg in args] return self.module(args) @@ -489,6 +494,17 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp3: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) + torch._assert(x0.ndim == 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x1.ndim == 2, "input should be (batch, dim) or (1, dim)") + + batch = max(x0.shape[0], x1.shape[0]) + + if batch > 1: + if x0.shape[0] == 1: + x0 = x0.squeeze(0) + if x1.shape[0] == 1: + x1 = x1.squeeze(0) + # ops.FusedTensorProductOp3 expects inputs # of shape (Z, dim) or (dim,) return self._f(x0, x1) @@ -539,6 +555,20 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling FusedTensorProductOp4: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) + torch._assert(x0.ndim == 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x1.ndim == 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x2.ndim == 2, "input should be (batch, dim) or (1, dim)") + + batch = max(x0.shape[0], x1.shape[0], x2.shape[0]) + + if batch > 1: + if x0.shape[0] == 1: + x0 = x0.squeeze(0) + if x1.shape[0] == 1: + x1 = x1.squeeze(0) + if x2.shape[0] == 1: + x2 = x2.squeeze(0) + # ops.FusedTensorProductOp4 expects inputs # of shape (Z, dim) or (dim,) return self._f(x0, x1, x2) @@ -584,10 +614,8 @@ def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: f"Calling TensorProductUniform3x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}" ) - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) + torch._assert(x0.ndim == 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x1.ndim == 2, "input should be (batch, dim) or (1, dim)") # ops.TensorProductUniform1d expects inputs # of shape (Z, dim) or (1, dim) @@ -607,12 +635,9 @@ def forward(self, inputs: List[torch.Tensor]): f"Calling TensorProductUniform4x1d: {self.descriptor}, input shapes: {x0.shape}, {x1.shape}, {x2.shape}" ) - if x0.ndim == 1: - x0 = x0.unsqueeze(0) - if x1.ndim == 1: - x1 = x1.unsqueeze(0) - if x2.ndim == 1: - x2 = x2.unsqueeze(0) + torch._assert(x0.ndim == 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x1.ndim 
== 2, "input should be (batch, dim) or (1, dim)") + torch._assert(x2.ndim == 2, "input should be (batch, dim) or (1, dim)") # ops.TensorProductUniform1d expects inputs # of shape (Z, dim) or (1, dim) From 59dd354aecc2a9c33ed0c7fa22080dbe7639f2bc Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 18 Dec 2024 13:45:44 -0800 Subject: [PATCH 80/96] add back @pytest.mark.parametrize("mode", export_modes) --- cuequivariance_torch/tests/primitives/tensor_product_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 8343f08..b3ec246 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -95,6 +95,7 @@ def make_descriptors(): @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) @pytest.mark.parametrize("use_fallback", [True, False]) +@pytest.mark.parametrize("mode", export_modes) def test_primitive_tensor_product_cuda_vs_fx( d: cue.SegmentedTensorProduct, dtype: torch.dtype, @@ -157,7 +158,9 @@ def test_primitive_tensor_product_cuda_vs_fx( @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("mode", export_modes) @pytest.mark.parametrize("use_fallback", [True, False]) -def test_script_tensor_product(d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path): +def test_script_tensor_product( + d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path +): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") From 8a0f10955e9ee3e87476faf7ffa11afe2d838d2e Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Wed, 18 Dec 2024 13:49:40 -0800 Subject: [PATCH 81/96] fix --- .../primitives/equivariant_tensor_product.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py index 9fa84cc..6c974ab 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/equivariant_tensor_product.py @@ -127,7 +127,7 @@ class EquivariantTensorProduct(torch.nn.Module): >>> e = cue.descriptors.fully_connected_tensor_product( ... cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1"), cue.Irreps("SO3", "2x1") ... 
) - >>> w = torch.ones(e.inputs[0].irreps.dim, device=device) + >>> w = torch.ones(1, e.inputs[0].irreps.dim, device=device) >>> x1 = torch.ones(17, e.inputs[1].irreps.dim, device=device) >>> x2 = torch.ones(17, e.inputs[2].irreps.dim, device=device) >>> tp = cuet.EquivariantTensorProduct(e, layout=cue.ir_mul, device=device) From e01a7a95e7ec8ef35a5cb49cfbbdf792a3b12aa3 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 18 Dec 2024 14:35:52 -0800 Subject: [PATCH 82/96] Fixing noconv bug Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/primitives/tensor_product.py | 3 ++- .../tests/primitives/symmetric_tensor_product_test.py | 1 + .../tests/primitives/tensor_product_test.py | 10 +++++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 20d1702..95d9fad 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -15,6 +15,7 @@ import logging import math import warnings +from types import MethodType from typing import List, Optional, OrderedDict, Tuple import torch @@ -133,7 +134,7 @@ def disable_type_conv(t): to prevent them from being modified during to(dtype=x) calls """ t.__original_to = t.to - t.to = to_notypeconv + t.to = MethodType(to_notypeconv, t) return t diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 1191437..ffad279 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -131,6 +131,7 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b for param in m.parameters(): assert False # no parameters + m = m.to(device) m = m.to(torch.float32) m = m.to(torch.float64) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index b3ec246..71cab81 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -89,7 +89,7 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] -export_modes = ["script", "export", "onnx", "trt", "jit"] +export_modes = ["script", "export", "trt", "jit"] @pytest.mark.parametrize("d", make_descriptors()) @@ -158,7 +158,7 @@ def test_primitive_tensor_product_cuda_vs_fx( @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("mode", export_modes) @pytest.mark.parametrize("use_fallback", [True, False]) -def test_script_tensor_product( +def test_export( d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path ): if not torch.cuda.is_available(): @@ -175,7 +175,11 @@ def test_script_tensor_product( # "Only eager, script and export modes are supported for the fallback!" 
# ) - m = cuet.TensorProduct(d, device=device, math_dtype=torch.float32) + m = cuet.TensorProduct(d, + device=device, + math_dtype=torch.float32, + use_fallback=use_fallback + ) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) out1 = m(inputs) out2 = module(inputs) From 3fbadb5cfd34db2a2c70a2519d13861112382fc6 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 18 Dec 2024 16:49:58 -0800 Subject: [PATCH 83/96] Really fixing noconv Signed-off-by: Boris Fomitchev --- .../cuequivariance_torch/primitives/tensor_product.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py index 95d9fad..687ec85 100644 --- a/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py +++ b/cuequivariance_torch/cuequivariance_torch/primitives/tensor_product.py @@ -15,7 +15,7 @@ import logging import math import warnings -from types import MethodType +from functools import partial from typing import List, Optional, OrderedDict, Tuple import torch @@ -120,11 +120,11 @@ def forward(self, inputs: List[torch.Tensor]): return self.f(inputs) -def to_notypeconv(self, *args, **kwargs): +def to_notypeconv(t, *args, **kwargs): new_kwargs = kwargs.copy() new_kwargs.pop("dtype", None) new_args = [None if isinstance(a, torch.dtype) else a for a in args] - result = self.__original_to(*new_args, **new_kwargs) + result = t.__original_to(*new_args, **new_kwargs) return result @@ -134,7 +134,7 @@ def disable_type_conv(t): to prevent them from being modified during to(dtype=x) calls """ t.__original_to = t.to - t.to = MethodType(to_notypeconv, t) + t.to = partial(to_notypeconv, t) return t From 686782c5e95ede5b05bb30485664f20ebf753f1e Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 03:34:07 -0800 Subject: [PATCH 84/96] fix linear --- .../cuequivariance_torch/operations/linear.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/linear.py b/cuequivariance_torch/cuequivariance_torch/operations/linear.py index 0d41d90..e5a6a2c 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/linear.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/linear.py @@ -75,7 +75,7 @@ def __init__( if not self.shared_weights: raise ValueError("Internal weights should be shared") self.weight = torch.nn.Parameter( - torch.randn(self.weight_numel, device=device, dtype=dtype) + torch.randn(1, self.weight_numel, device=device, dtype=dtype) ) else: self.weight = None @@ -119,11 +119,7 @@ def forward( weight = self.weight - if weight is not None: - if self.shared_weights and weight.ndim != 1: - raise ValueError("Shared weights should be 1D tensor") - if not self.shared_weights and weight.ndim != 2: - raise ValueError("Weights should be 2D tensor") - else: + if weight is None: raise ValueError("Weights should not be None") + return self.f([weight, x]) From d8c1deeb7962926185555d0ea4943dcd4132f279 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 03:36:48 -0800 Subject: [PATCH 85/96] fix rotations --- .../tests/operations/rotation_test.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cuequivariance_torch/tests/operations/rotation_test.py b/cuequivariance_torch/tests/operations/rotation_test.py index 6f04eef..0e4694c 100644 --- a/cuequivariance_torch/tests/operations/rotation_test.py +++ 
b/cuequivariance_torch/tests/operations/rotation_test.py @@ -22,9 +22,9 @@ def test_rotation(): irreps = cue.Irreps("SO3", "3x0 + 1 + 0 + 4x2 + 4") - alpha = torch.tensor(0.3).to(device) - beta = torch.tensor(0.4).to(device) - gamma = torch.tensor(-0.5).to(device) + alpha = torch.tensor([0.3]).to(device) + beta = torch.tensor([0.4]).to(device) + gamma = torch.tensor([-0.5]).to(device) rot = cuet.Rotation(irreps, layout=cue.ir_mul).to(device) @@ -41,8 +41,10 @@ def test_vector_to_euler_angles(): A = torch.nn.functional.normalize(A, dim=-1) beta, alpha = cuet.vector_to_euler_angles(A) - ey = torch.tensor([0.0, 1.0, 0.0]) - B = cuet.Rotation(cue.Irreps("SO3", "1"), layout=cue.ir_mul)(0.0, beta, alpha, ey) + ey = torch.tensor([[0.0, 1.0, 0.0]]) + B = cuet.Rotation(cue.Irreps("SO3", "1"), layout=cue.ir_mul)( + torch.tensor([0.0]), beta, alpha, ey + ) assert torch.allclose(A, B) From c837298a6be416cfc14cbbe8d3b6a2b7a6301e10 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 03:38:05 -0800 Subject: [PATCH 86/96] fix tpfc --- .../cuequivariance_torch/operations/tp_fully_connected.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index 5d7ff5b..f28b08c 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -87,7 +87,7 @@ def __init__( if not self.shared_weights: raise ValueError("Internal weights should be shared") self.weight = torch.nn.Parameter( - torch.randn(self.weight_numel, device=device, dtype=dtype) + torch.randn(1, self.weight_numel, device=device, dtype=dtype) ) else: self.weight = None @@ -141,9 +141,4 @@ def forward( weight = self.weight - if self.shared_weights and weight.ndim != 1: - raise ValueError("Shared weights should be 1D tensor") - if not self.shared_weights and weight.ndim != 2: - raise ValueError("Weights should be 2D tensor") - return self.f([weight, x1, x2]) From ac2ed25046a9ddbe460aef54333b6dda40ea5b8a Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 03:43:24 -0800 Subject: [PATCH 87/96] fix tpcw --- .../operations/tp_channel_wise.py | 7 +---- .../tests/operations/tp_channel_wise_test.py | 26 ++++++++++++------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index 00f2851..39677f6 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -86,7 +86,7 @@ def __init__( if not self.shared_weights: raise ValueError("Internal weights should be shared") self.weight = torch.nn.Parameter( - torch.randn(self.weight_numel, device=device, dtype=dtype) + torch.randn(1, self.weight_numel, device=device, dtype=dtype) ) else: self.weight = None @@ -140,9 +140,4 @@ def forward( weight = self.weight - if self.shared_weights and weight.ndim != 1: - raise ValueError("Shared weights should be 1D tensor") - if not self.shared_weights and weight.ndim != 2: - raise ValueError("Weights should be 2D tensor") - return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index a56e8ce..c2d9213 100644 
--- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -21,16 +21,22 @@ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") -list_of_irreps = [ - cue.Irreps("O3", "32x0e + 32x1o"), - cue.Irreps("O3", "2x1o + 5x0e + 2e + 1e + 1o"), - cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), -] - -@pytest.mark.parametrize("irreps1", list_of_irreps) -@pytest.mark.parametrize("irreps2", [irreps.set_mul(1) for irreps in list_of_irreps]) -@pytest.mark.parametrize("irreps3", list_of_irreps) +@pytest.mark.parametrize( + "irreps1, irreps2, irreps3", + [ + ( + cue.Irreps("O3", "32x0e + 32x1o"), + cue.Irreps("O3", "0e + 1o + 2e"), + cue.Irreps("O3", "32x0e + 32x1o"), + ), + ( + cue.Irreps("O3", "2x1o + 3x0e + 4x2e + 3x1e + 2x1o"), + cue.Irreps("O3", "1o + 2e"), + cue.Irreps("O3", "2x1o + 5x0e + 1e + 1o"), + ), + ], +) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) @pytest.mark.parametrize("batch", [1, 32]) @@ -104,7 +110,7 @@ def test_channel_wise_bwd_bwd(irreps: cue.Irreps): torch.manual_seed(0) w = torch.randn( - m.weight_numel, device="cuda", requires_grad=True, dtype=torch.float64 + 1, m.weight_numel, device="cuda", requires_grad=True, dtype=torch.float64 ) (grad1, grad2, grad3) = torch.autograd.grad( From 64c0121892b3f633cea9a0bf6739efc4a5a27f5d Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 03:44:38 -0800 Subject: [PATCH 88/96] less test --- .../operations/tp_fully_connected_test.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index 832904b..8a066a1 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -21,16 +21,22 @@ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") -list_of_irreps = [ - cue.Irreps("O3", "4x0e + 4x1o"), - cue.Irreps("O3", "2x1o + 5x0e + 2e + 1e + 1o"), - cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), -] - -@pytest.mark.parametrize("irreps1", list_of_irreps) -@pytest.mark.parametrize("irreps2", list_of_irreps) -@pytest.mark.parametrize("irreps3", list_of_irreps) +@pytest.mark.parametrize( + "irreps1, irreps2, irreps3", + [ + ( + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "4x0e + 4x1o"), + ), + ( + cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), + ), + ], +) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) def test_fully_connected( From 0080283217b1e7a17029f5aa06bdf56e0ab12e92 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 08:56:46 -0800 Subject: [PATCH 89/96] remove unused mode in tensor_product_test --- .../tests/primitives/tensor_product_test.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 71cab81..a345f85 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -95,15 +95,12 @@ def make_descriptors(): 
@pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) @pytest.mark.parametrize("use_fallback", [True, False]) -@pytest.mark.parametrize("mode", export_modes) def test_primitive_tensor_product_cuda_vs_fx( d: cue.SegmentedTensorProduct, dtype: torch.dtype, math_dtype: torch.dtype, tol: float, use_fallback: bool, - mode: str, - tmp_path: str, ): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -158,9 +155,7 @@ def test_primitive_tensor_product_cuda_vs_fx( @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("mode", export_modes) @pytest.mark.parametrize("use_fallback", [True, False]) -def test_export( - d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path -): +def test_export(d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -175,11 +170,9 @@ def test_export( # "Only eager, script and export modes are supported for the fallback!" # ) - m = cuet.TensorProduct(d, - device=device, - math_dtype=torch.float32, - use_fallback=use_fallback - ) + m = cuet.TensorProduct( + d, device=device, math_dtype=torch.float32, use_fallback=use_fallback + ) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) out1 = m(inputs) out2 = module(inputs) From 9c07cd2e527004f300cb7279e58152980ff86e95 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 15:35:31 -0800 Subject: [PATCH 90/96] typo --- .../tests/primitives/equivariant_tensor_product_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index e00df1c..68adb60 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -215,8 +215,7 @@ def test_export( device=device, ) exp_inputs = [ - torch.randn((512, inp.irreps.dim), device=device, dtype=dtype) - for inp in e.inputs + torch.randn((512, inp.dim), device=device, dtype=dtype) for inp in e.inputs ] inputs = [ torch.randn((1024, inp.dim), device=device, dtype=dtype) for inp in e.inputs From c860042b0a24fb6670b39c0685dc818529818ab4 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Thu, 19 Dec 2024 15:40:51 -0800 Subject: [PATCH 91/96] disable export --- .../primitives/equivariant_tensor_product_test.py | 3 ++- cuequivariance_torch/tests/utils.py | 14 +++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 68adb60..e1cbf88 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -187,7 +187,8 @@ def test_compile( torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) -export_modes = ["script", "export", "onnx", "trt", "jit"] +export_modes = ["script", "onnx", "trt", "jit"] +# "export" does not support the change of batch size @pytest.mark.parametrize("e", make_descriptors()) diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index 3244a84..b16cf24 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -107,13 +107,13 @@ def 
verify_trt(module, onnx_module, inputs, dtype): def module_with_mode( - mode, - module, - inputs, - math_dtype, - tmp_path, - grad_modes=["eager", "compile", "jit", "export"], -): + mode: str, + module: torch.nn.Module, + inputs: list[torch.Tensor] | list[list[torch.Tensor]], + math_dtype: torch.dtype, + tmp_path: str, + grad_modes: list[str] = ["eager", "compile", "jit", "export"], +) -> torch.nn.Module: if isinstance(inputs[0], list): dtype = inputs[0][0].dtype else: From df212f58f838192e95189f1746938a231259443d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 20 Dec 2024 01:19:08 -0800 Subject: [PATCH 92/96] Reduced export test modes list Signed-off-by: Boris Fomitchev --- .../equivariant_tensor_product_test.py | 33 ++----------------- .../tests/primitives/script_test.py | 25 +++++++------- .../tests/primitives/tensor_product_test.py | 2 +- cuequivariance_torch/tests/utils.py | 6 +--- 4 files changed, 19 insertions(+), 47 deletions(-) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index e00df1c..e6fa06d 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -163,31 +163,7 @@ def test_precision_cuda_vs_fx( torch.testing.assert_close(y0, y1, atol=atol, rtol=rtol) -@pytest.mark.parametrize("e", make_descriptors()) -@pytest.mark.parametrize("dtype, math_dtype, atol, rtol", settings2) -def test_compile( - e: cue.EquivariantTensorProduct, - dtype: torch.dtype, - math_dtype: torch.dtype, - atol: float, - rtol: float, -): - if not torch.cuda.is_available(): - pytest.skip("CUDA is not available") - - m = cuet.EquivariantTensorProduct( - e, layout=cue.mul_ir, use_fallback=False, device=device, math_dtype=math_dtype - ) - inputs = [ - torch.randn((1024, inp.dim), device=device, dtype=dtype) for inp in e.inputs - ] - res = m(inputs) - m_compile = torch.compile(m, fullgraph=True) - res_script = m_compile(inputs) - torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) - - -export_modes = ["script", "export", "onnx", "trt", "jit"] +export_modes = ["compile", "script", "jit"] @pytest.mark.parametrize("e", make_descriptors()) @@ -214,14 +190,11 @@ def test_export( use_fallback=use_fallback, device=device, ) - exp_inputs = [ + inputs = [ torch.randn((512, inp.irreps.dim), device=device, dtype=dtype) for inp in e.inputs ] - inputs = [ - torch.randn((1024, inp.dim), device=device, dtype=dtype) for inp in e.inputs - ] res = m(inputs) - m_script = module_with_mode(mode, m, [exp_inputs], math_dtype, tmp_path) + m_script = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) res_script = m_script(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/script_test.py index 5221f69..5b32782 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/script_test.py @@ -1,5 +1,8 @@ import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue from cuequivariance_torch.primitives.symmetric_tensor_product import ( @@ -11,9 +14,6 @@ TensorProductUniform3x1d, TensorProductUniform4x1d, ) -from tests.utils import ( - module_with_mode, -) device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") @@ -58,8 +58,10 @@ def 
test_script_fused_tp_3(): assert module([x0, x1]).shape == (batch, d.operands[2].size) + export_modes = ["script", "export"] + @pytest.mark.parametrize("mode", export_modes) def test_script_fused_tp_4(mode, tmp_path): if not torch.cuda.is_available(): @@ -78,14 +80,15 @@ def test_script_fused_tp_4(mode, tmp_path): x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) - + inputs = [x0, x1, x2] m = FusedTensorProductOp4(d, [0, 1, 2], device, torch.float32) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) - out1=m(inputs) - out2=module(inputs) + out1 = m(inputs) + out2 = module(inputs) torch.testing.assert_close(out1, out2) + @pytest.mark.parametrize("mode", export_modes) def test_script_uniform_tp_3(mode, tmp_path): if not torch.cuda.is_available(): @@ -106,8 +109,8 @@ def test_script_uniform_tp_3(mode, tmp_path): m = TensorProductUniform3x1d(d, device, torch.float32) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) - out1=m(inputs) - out2=module(inputs) + out1 = m(inputs) + out2 = module(inputs) torch.testing.assert_close(out1, out2) @@ -130,8 +133,8 @@ def test_script_uniform_tp_4(mode, tmp_path): x2 = torch.randn(batch, d.operands[2].size, device=device, dtype=torch.float32) inputs = [x0, x1, x2] - m= TensorProductUniform4x1d(d, device, torch.float32) + m = TensorProductUniform4x1d(d, device, torch.float32) module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) - out1=m(inputs) - out2=module(inputs) + out1 = m(inputs) + out2 = module(inputs) torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index a345f85..1487286 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -89,7 +89,7 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] -export_modes = ["script", "export", "trt", "jit"] +export_modes = ["compile", "script", "jit"] @pytest.mark.parametrize("d", make_descriptors()) diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py index 3244a84..8ad7cc2 100644 --- a/cuequivariance_torch/tests/utils.py +++ b/cuequivariance_torch/tests/utils.py @@ -126,11 +126,7 @@ def module_with_mode( with torch.set_grad_enabled(mode in grad_modes): if mode == "compile": - import sys - - if sys.version_info.major == 3 and sys.version_info.minor >= 12: - pytest.skip("torch dynamo needs cpy <= 3.11") - module = torch.compile(module) + module = torch.compile(module, fullgraph=True) elif mode == "fx": module = torch.fx.symbolic_trace(module) elif mode == "script": From 25c2eef3428f361e265c712b8401ce573d76d86e Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 20 Dec 2024 17:11:16 -0800 Subject: [PATCH 93/96] Added unit tests to operations and the rest of the primitives Signed-off-by: Boris Fomitchev --- .../operations/tp_channel_wise.py | 11 ++- .../operations/tp_fully_connected.py | 11 ++- .../tests/operations/linear_test.py | 49 ++++++++++++ .../tests/operations/rotation_test.py | 25 ++++++ .../operations/spherical_harmonics_test.py | 28 +++++++ .../operations/symmetric_contraction_test.py | 2 +- .../tests/operations/tp_channel_wise_test.py | 78 +++++++++++++++---- .../operations/tp_fully_connected_test.py 
| 75 +++++++++++------- .../equivariant_tensor_product_test.py | 4 +- ...cript_test.py => primitive_export_test.py} | 32 ++++---- .../symmetric_tensor_product_test.py | 38 ++++++++- .../tests/primitives/tensor_product_test.py | 17 ++-- .../tests/primitives/transpose_test.py | 21 +++++ 13 files changed, 313 insertions(+), 78 deletions(-) rename cuequivariance_torch/tests/primitives/{script_test.py => primitive_export_test.py} (85%) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index 39677f6..d19da94 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -137,7 +137,10 @@ def forward( if self.internal_weights: if weight is not None: raise ValueError("Internal weights are used, weight should be None") - - weight = self.weight - - return self.f([weight, x1, x2]) + return self.f([self.weight, x1, x2]) + else: + if weight is None: + raise ValueError( + "Internal weights are not used, weight should not be None" + ) + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index f28b08c..4efb7c6 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -138,7 +138,10 @@ def forward( if self.internal_weights: if weight is not None: raise ValueError("Internal weights are used, weight should be None") - - weight = self.weight - - return self.f([weight, x1, x2]) + return self.f([self.weight, x1, x2]) + else: + if weight is None: + raise ValueError( + "Internal weights are not used, weight should not be None" + ) + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/tests/operations/linear_test.py b/cuequivariance_torch/tests/operations/linear_test.py index 2b78ff1..7e904d6 100644 --- a/cuequivariance_torch/tests/operations/linear_test.py +++ b/cuequivariance_torch/tests/operations/linear_test.py @@ -16,6 +16,9 @@ import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -169,3 +172,49 @@ def test_linear_copy( ).to(device) copy.deepcopy(linear) + + +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("irreps_in", list_of_irreps) +@pytest.mark.parametrize("irreps_out", list_of_irreps) +@pytest.mark.parametrize("layout", [cue.mul_ir, cue.ir_mul]) +@pytest.mark.parametrize("shared_weights", [True, False]) +@pytest.mark.parametrize("mode", export_modes) +@pytest.mark.parametrize("use_fallback", [True, False]) +def test_export( + irreps_in: cue.Irreps, + irreps_out: cue.Irreps, + layout: cue.IrrepsLayout, + shared_weights: bool, + mode: str, + use_fallback: bool, + tmp_path: str, +): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + torch.manual_seed(0) + m = cuet.Linear( + irreps_in, + irreps_out, + layout=layout, + shared_weights=shared_weights, + device=device, + dtype=torch.float32, + use_fallback=use_fallback, + ) + + x = torch.randn(10, irreps_in.dim, dtype=torch.float32).cuda() + + if shared_weights: + inputs = (x,) + else: + w = torch.randn(10, m.weight_numel, dtype=torch.float32).cuda() + inputs = (x, w) + + out1 = m(*inputs) + m = module_with_mode(mode, m, inputs, torch.float32, 
tmp_path) + out2 = m(*inputs) + torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/operations/rotation_test.py b/cuequivariance_torch/tests/operations/rotation_test.py index e307053..1fd34a7 100644 --- a/cuequivariance_torch/tests/operations/rotation_test.py +++ b/cuequivariance_torch/tests/operations/rotation_test.py @@ -14,6 +14,9 @@ # limitations under the License. import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -62,3 +65,25 @@ def test_inversion(use_fallback: bool): )(torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]], device=device)), torch.tensor([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0]], device=device), ) + + +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("mode", export_modes) +def test_export(mode: str, tmp_path: str): + irreps = cue.Irreps("SO3", "3x0 + 1 + 0 + 4x2 + 4") + dtype = torch.float32 + alpha = torch.tensor([0.3]).to(device) + beta = torch.tensor([0.4]).to(device) + gamma = torch.tensor([-0.5]).to(device) + + m = cuet.Rotation(irreps, layout=cue.ir_mul).to(device) + + x = torch.randn(10, irreps.dim).to(device) + inputs = (gamma, beta, alpha, x) + + out1 = m(*inputs) + m = module_with_mode(mode, m, inputs, dtype, tmp_path) + out2 = m(*inputs) + torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py index e5318a9..1404ef7 100644 --- a/cuequivariance_torch/tests/operations/spherical_harmonics_test.py +++ b/cuequivariance_torch/tests/operations/spherical_harmonics_test.py @@ -15,6 +15,9 @@ import numpy as np import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -69,3 +72,28 @@ def test_spherical_harmonics_full(dtype, ls: list[int], use_fallback: bool): yl = m(vec) assert yl.shape[0] == 10 assert yl.shape[1] == sum(2 * ell + 1 for ell in ls) + + +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("dtype", data_types) +@pytest.mark.parametrize("ls", [[0], [1], [2], [0, 1], [0, 1, 2]]) +@pytest.mark.parametrize("use_fallback", [False, True]) +@pytest.mark.parametrize("mode", export_modes) +def test_export(dtype, ls: list[int], use_fallback: bool, mode: str, tmp_path: str): + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + tol = 1e-5 + if dtype in [torch.float16, torch.bfloat16]: + tol = 1e-2 + m = cuet.SphericalHarmonics(ls, False, use_fallback=use_fallback, device=device) + + vec = torch.randn(10, 3, device=device, dtype=dtype) + inputs = (vec,) + out1 = m(vec) + + m = module_with_mode(mode, m, inputs, dtype, tmp_path) + out2 = m(*inputs) + torch.testing.assert_close(out1, out2, atol=tol, rtol=tol) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index 69ff71a..d9821ee 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -113,7 +113,7 @@ def test_mace_compatibility(): torch.testing.assert_close(output, expected_output, atol=1e-5, rtol=1e-5) -export_modes = ["export", "onnx", "trt", "torch_trt", "jit"] +export_modes = ["compile", "script", "jit"] @pytest.mark.parametrize( diff --git 
a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index c2d9213..e83273f 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -14,6 +14,9 @@ # limitations under the License. import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -21,22 +24,21 @@ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") - -@pytest.mark.parametrize( - "irreps1, irreps2, irreps3", - [ - ( - cue.Irreps("O3", "32x0e + 32x1o"), - cue.Irreps("O3", "0e + 1o + 2e"), - cue.Irreps("O3", "32x0e + 32x1o"), - ), - ( - cue.Irreps("O3", "2x1o + 3x0e + 4x2e + 3x1e + 2x1o"), - cue.Irreps("O3", "1o + 2e"), - cue.Irreps("O3", "2x1o + 5x0e + 1e + 1o"), - ), - ], -) +irreps = [ + ( + cue.Irreps("O3", "32x0e + 32x1o"), + cue.Irreps("O3", "0e + 1o + 2e"), + cue.Irreps("O3", "32x0e + 32x1o"), + ), + ( + cue.Irreps("O3", "2x1o + 3x0e + 4x2e + 3x1e + 2x1o"), + cue.Irreps("O3", "1o + 2e"), + cue.Irreps("O3", "2x1o + 5x0e + 1e + 1o"), + ), +] + + +@pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) @pytest.mark.parametrize("batch", [1, 32]) @@ -78,6 +80,50 @@ def test_channel_wise_fwd( torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) +@pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) +@pytest.mark.parametrize("use_fallback", [False, True]) +@pytest.mark.parametrize("batch", [1, 32]) +@pytest.mark.parametrize("mode", export_modes) +def test_export( + irreps1: cue.Irreps, + irreps2: cue.Irreps, + irreps3: cue.Irreps, + layout: cue.IrrepsLayout, + use_fallback: bool, + batch: int, + mode: str, + tmp_path: str, +): + dtype = torch.float32 + if use_fallback is False and not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + + m1 = cuet.ChannelWiseTensorProduct( + irreps1, + irreps2, + irreps3, + shared_weights=True, + internal_weights=True, + layout=layout, + device=device, + dtype=dtype, + use_fallback=use_fallback, + ) + x1 = torch.randn(batch, irreps1.dim, dtype=dtype).to(device) + x2 = torch.randn(batch, irreps2.dim, dtype=dtype).to(device) + + inputs = (x1, x2) + out1 = m1(x1, x2) + + m1 = module_with_mode(mode, m1, inputs, dtype, tmp_path) + out2 = m1(*inputs) + torch.testing.assert_close(out1, out2) + + @pytest.mark.parametrize("irreps", ["32x0", "2x0 + 3x1"]) def test_channel_wise_bwd_bwd(irreps: cue.Irreps): if not torch.cuda.is_available(): diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index 8a066a1..ffddd5a 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -14,6 +14,9 @@ # limitations under the License. 
import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance_torch as cuet @@ -21,22 +24,23 @@ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") +export_modes = ["compile", "script", "jit"] -@pytest.mark.parametrize( - "irreps1, irreps2, irreps3", - [ - ( - cue.Irreps("O3", "4x0e + 4x1o"), - cue.Irreps("O3", "4x0e + 4x1o"), - cue.Irreps("O3", "4x0e + 4x1o"), - ), - ( - cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), - cue.Irreps("O3", "4x0e + 4x1o"), - cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), - ), - ], -) +irreps = [ + ( + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "4x0e + 4x1o"), + ), + ( + cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), + cue.Irreps("O3", "4x0e + 4x1o"), + cue.Irreps("O3", "2e + 0x0e + 0o + 0x1e + 1e"), + ), +] + + +@pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) def test_fully_connected( @@ -77,21 +81,40 @@ def test_fully_connected( torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5) +@pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) +@pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) @pytest.mark.parametrize("use_fallback", [False, True]) -def test_compile(use_fallback: bool): +@pytest.mark.parametrize("mode", export_modes) +def test_export( + irreps1: cue.Irreps, + irreps2: cue.Irreps, + irreps3: cue.Irreps, + layout: cue.IrrepsLayout, + use_fallback: bool, + mode: str, + tmp_path: str, +): if use_fallback is False and not torch.cuda.is_available(): pytest.skip("CUDA is not available") - - m = cuet.FullyConnectedTensorProduct( - irreps_in1=cue.Irreps("O3", "32x0e + 32x1o"), - irreps_in2=cue.Irreps("O3", "32x0e + 32x1o"), - irreps_out=cue.Irreps("O3", "32x0e + 32x1o"), - layout=cue.mul_ir, + dtype = torch.float32 + m1 = cuet.FullyConnectedTensorProduct( + irreps1, + irreps2, + irreps3, + shared_weights=True, + internal_weights=True, + layout=layout, device=device, + dtype=dtype, use_fallback=use_fallback, ) - m_compile = torch.compile(m, fullgraph=True) - input1 = torch.randn(100, m.irreps_in1.dim, device=device) - input2 = torch.randn(100, m.irreps_in2.dim, device=device) - m_compile(input1, input2) + x1 = torch.randn(32, irreps1.dim, dtype=dtype).to(device) + x2 = torch.randn(32, irreps2.dim, dtype=dtype).to(device) + + inputs = (x1, x2) + out1 = m1(*inputs) + + m1 = module_with_mode(mode, m1, inputs, dtype, tmp_path) + out2 = m1(*inputs) + torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py index 7f57920..dd4389b 100644 --- a/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/equivariant_tensor_product_test.py @@ -195,6 +195,6 @@ def test_export( torch.randn((512, inp.dim), device=device, dtype=dtype) for inp in e.inputs ] res = m(inputs) - m_script = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) - res_script = m_script(inputs) + m = module_with_mode(mode, m, [inputs], math_dtype, tmp_path) + res_script = m(inputs) torch.testing.assert_close(res, res_script, atol=atol, rtol=rtol) diff --git a/cuequivariance_torch/tests/primitives/script_test.py b/cuequivariance_torch/tests/primitives/primitive_export_test.py similarity index 85% 
rename from cuequivariance_torch/tests/primitives/script_test.py rename to cuequivariance_torch/tests/primitives/primitive_export_test.py index 5b32782..72592a9 100644 --- a/cuequivariance_torch/tests/primitives/script_test.py +++ b/cuequivariance_torch/tests/primitives/primitive_export_test.py @@ -17,8 +17,11 @@ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") +export_modes = ["script", "export"] + -def test_script_symmetric_contraction(): +@pytest.mark.parametrize("mode", export_modes) +def test_script_symmetric_contraction(mode, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -31,13 +34,16 @@ def test_script_symmetric_contraction(): i0 = torch.zeros(batch, device=device, dtype=torch.int32) x1 = torch.randn(batch, ds[0].operands[1].size, device=device, dtype=torch.float32) - module = SymmetricTensorProduct(ds, device, torch.float32) - module = torch.jit.script(module) - - assert module(x0, i0, x1).shape == (batch, ds[0].operands[-1].size) + m = SymmetricTensorProduct(ds, device, torch.float32) + inputs = (x0, i0, x1) + module = module_with_mode(mode, m, inputs, torch.float32, tmp_path) + out1 = m(*inputs) + out2 = module(*inputs) + torch.testing.assert_close(out1, out2) -def test_script_fused_tp_3(): +@pytest.mark.parametrize("mode", export_modes) +def test_script_fused_tp_3(mode, tmp_path): if not torch.cuda.is_available(): pytest.skip("CUDA is not available") @@ -52,14 +58,12 @@ def test_script_fused_tp_3(): batch = 12 x0 = torch.randn(batch, d.operands[0].size, device=device, dtype=torch.float32) x1 = torch.randn(batch, d.operands[1].size, device=device, dtype=torch.float32) - - module = FusedTensorProductOp3(d, (0, 1), device, torch.float32) - module = torch.jit.script(module) - - assert module([x0, x1]).shape == (batch, d.operands[2].size) - - -export_modes = ["script", "export"] + inputs = [x0, x1] + m = FusedTensorProductOp3(d, (0, 1), device, torch.float32) + module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) + out1 = m(inputs) + out2 = module(inputs) + torch.testing.assert_close(out1, out2) @pytest.mark.parametrize("mode", export_modes) diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index ffad279..9f1da91 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py @@ -14,6 +14,9 @@ # limitations under the License. 
import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance as cue import cuequivariance.segmented_tensor_product as stp @@ -63,8 +66,6 @@ def test_primitive_indexed_symmetric_tensor_product_cuda_vs_fx( ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback ) - m = torch.jit.script(m) - x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) i0 = torch.tensor([0, 1, 0], dtype=torch.int32, device=device) x1 = torch.randn( @@ -140,3 +141,36 @@ def test_math_dtype(dtype: torch.dtype, math_dtype: torch.dtype, use_fallback: b assert out1.dtype == dtype assert out2.dtype == dtype assert (out1 == out2).all() + + +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("ds", make_descriptors()) +@pytest.mark.parametrize("mode", export_modes) +@pytest.mark.parametrize("use_fallback", [True, False]) +def test_export( + ds: list[stp.SegmentedTensorProduct], + mode: str, + use_fallback: bool, + tmp_path, +): + dtype = torch.float32 + math_dtype = torch.float32 + + if use_fallback is True and mode in ["trt"]: + pytest.skip(f"{mode} not supported for the fallback!") + + m = cuet.IWeightedSymmetricTensorProduct( + ds, math_dtype=math_dtype, device=device, use_fallback=use_fallback + ) + x0 = torch.randn((2, m.x0_size), device=device, dtype=dtype, requires_grad=True) + i0 = torch.tensor([0, 1, 0], dtype=torch.int32, device=device) + x1 = torch.randn( + (i0.size(0), m.x1_size), device=device, dtype=dtype, requires_grad=True + ) + inputs = (x0, i0, x1) + out1 = m(*inputs) + m = module_with_mode(mode, m, inputs, torch.float32, tmp_path) + out2 = m(*inputs) + torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/primitives/tensor_product_test.py b/cuequivariance_torch/tests/primitives/tensor_product_test.py index 1487286..a54a530 100644 --- a/cuequivariance_torch/tests/primitives/tensor_product_test.py +++ b/cuequivariance_torch/tests/primitives/tensor_product_test.py @@ -89,8 +89,6 @@ def make_descriptors(): (torch.bfloat16, torch.float32, 1.0), ] -export_modes = ["compile", "script", "jit"] - @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("dtype, math_dtype, tol", settings) @@ -152,6 +150,9 @@ def test_primitive_tensor_product_cuda_vs_fx( torch.testing.assert_close(g1, g2.to(dtype), atol=100 * tol, rtol=100 * tol) +export_modes = ["compile", "script", "jit"] + + @pytest.mark.parametrize("d", make_descriptors()) @pytest.mark.parametrize("mode", export_modes) @pytest.mark.parametrize("use_fallback", [True, False]) @@ -165,15 +166,13 @@ def test_export(d: cue.SegmentedTensorProduct, mode, use_fallback, tmp_path): for ope in d.operands[:-1] ] - # if use_fallback is True and mode not in ["eager", "script", "export"]: - # pytest.skip( - # "Only eager, script and export modes are supported for the fallback!" 
- # ) + if use_fallback is True and mode in ["trt"]: + pytest.skip(f"{mode} not supported for the fallback!") - m = cuet.TensorProduct( + module = cuet.TensorProduct( d, device=device, math_dtype=torch.float32, use_fallback=use_fallback ) - module = module_with_mode(mode, m, (inputs,), torch.float32, tmp_path) - out1 = m(inputs) + out1 = module(inputs) + module = module_with_mode(mode, module, (inputs,), torch.float32, tmp_path) out2 = module(inputs) torch.testing.assert_close(out1, out2) diff --git a/cuequivariance_torch/tests/primitives/transpose_test.py b/cuequivariance_torch/tests/primitives/transpose_test.py index 31eb271..f0e53e7 100644 --- a/cuequivariance_torch/tests/primitives/transpose_test.py +++ b/cuequivariance_torch/tests/primitives/transpose_test.py @@ -14,6 +14,9 @@ # limitations under the License. import pytest import torch +from tests.utils import ( + module_with_mode, +) import cuequivariance_torch as cuet @@ -48,3 +51,21 @@ def test_transpose(use_fallback: bool, dtype: torch.dtype): m = cuet.TransposeSegments(segments, device, use_fallback=use_fallback) torch.testing.assert_close(m(x), xt) + + +export_modes = ["compile", "script", "jit"] + + +@pytest.mark.parametrize("mode", export_modes) +@pytest.mark.parametrize("use_fallback", [True, False]) +def test_export(mode, use_fallback, tmp_path): + dtype = torch.float32 + x = torch.tensor( + [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10, 11, 12, 13]], dtype=dtype, device=device + ) + segments = [(2, 3), (2, 2)] + m = cuet.TransposeSegments(segments, device, use_fallback=use_fallback) + out1 = m(x) + m = module_with_mode(mode, m, (x,), dtype, tmp_path) + out2 = m(x) + torch.testing.assert_close(out1, out2) From de0fa42edc3f242fb08c66cd0fe69f82c91b9638 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 22 Dec 2024 23:18:10 -0800 Subject: [PATCH 94/96] Fixing script() for non-internal weights Signed-off-by: Boris Fomitchev --- .../operations/tp_channel_wise.py | 5 +++-- .../operations/tp_fully_connected.py | 5 +++-- .../tests/operations/tp_channel_wise_test.py | 13 +++++++++---- .../tests/operations/tp_fully_connected_test.py | 11 +++++++++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py index d19da94..026b666 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_channel_wise.py @@ -134,7 +134,7 @@ def forward( or if shared weights are used and weight is not a 1D tensor, or if shared weights are not used and weight is not a 2D tensor. 
""" - if self.internal_weights: + if self.weight is not None: if weight is not None: raise ValueError("Internal weights are used, weight should be None") return self.f([self.weight, x1, x2]) @@ -143,4 +143,5 @@ def forward( raise ValueError( "Internal weights are not used, weight should not be None" ) - return self.f([weight, x1, x2]) + else: + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py index 4efb7c6..e1e3122 100644 --- a/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py +++ b/cuequivariance_torch/cuequivariance_torch/operations/tp_fully_connected.py @@ -135,7 +135,7 @@ def forward( or if shared weights are used and weight is not a 1D tensor, or if shared weights are not used and weight is not a 2D tensor. """ - if self.internal_weights: + if self.weight is not None: if weight is not None: raise ValueError("Internal weights are used, weight should be None") return self.f([self.weight, x1, x2]) @@ -144,4 +144,5 @@ def forward( raise ValueError( "Internal weights are not used, weight should not be None" ) - return self.f([weight, x1, x2]) + else: + return self.f([weight, x1, x2]) diff --git a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py index e83273f..4d275f0 100644 --- a/cuequivariance_torch/tests/operations/tp_channel_wise_test.py +++ b/cuequivariance_torch/tests/operations/tp_channel_wise_test.py @@ -85,6 +85,7 @@ def test_channel_wise_fwd( @pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) +@pytest.mark.parametrize("internal_weights", [False, True]) @pytest.mark.parametrize("use_fallback", [False, True]) @pytest.mark.parametrize("batch", [1, 32]) @pytest.mark.parametrize("mode", export_modes) @@ -93,6 +94,7 @@ def test_export( irreps2: cue.Irreps, irreps3: cue.Irreps, layout: cue.IrrepsLayout, + internal_weights: bool, use_fallback: bool, batch: int, mode: str, @@ -107,7 +109,7 @@ def test_export( irreps2, irreps3, shared_weights=True, - internal_weights=True, + internal_weights=internal_weights, layout=layout, device=device, dtype=dtype, @@ -115,9 +117,12 @@ def test_export( ) x1 = torch.randn(batch, irreps1.dim, dtype=dtype).to(device) x2 = torch.randn(batch, irreps2.dim, dtype=dtype).to(device) - - inputs = (x1, x2) - out1 = m1(x1, x2) + if internal_weights: + inputs = (x1, x2) + else: + weights = torch.randn(1, m1.weight_numel, device=device, dtype=dtype) + inputs = (x1, x2, weights) + out1 = m1(*inputs) m1 = module_with_mode(mode, m1, inputs, dtype, tmp_path) out2 = m1(*inputs) diff --git a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py index ffddd5a..57a77b8 100644 --- a/cuequivariance_torch/tests/operations/tp_fully_connected_test.py +++ b/cuequivariance_torch/tests/operations/tp_fully_connected_test.py @@ -83,6 +83,7 @@ def test_fully_connected( @pytest.mark.parametrize("irreps1, irreps2, irreps3", irreps) @pytest.mark.parametrize("layout", [cue.ir_mul, cue.mul_ir]) +@pytest.mark.parametrize("internal_weights", [False, True]) @pytest.mark.parametrize("use_fallback", [False, True]) @pytest.mark.parametrize("mode", export_modes) def test_export( @@ -90,6 +91,7 @@ def test_export( irreps2: cue.Irreps, irreps3: cue.Irreps, layout: cue.IrrepsLayout, + internal_weights: bool, 
use_fallback: bool, mode: str, tmp_path: str, @@ -102,7 +104,7 @@ def test_export( irreps2, irreps3, shared_weights=True, - internal_weights=True, + internal_weights=internal_weights, layout=layout, device=device, dtype=dtype, @@ -112,7 +114,12 @@ def test_export( x1 = torch.randn(32, irreps1.dim, dtype=dtype).to(device) x2 = torch.randn(32, irreps2.dim, dtype=dtype).to(device) - inputs = (x1, x2) + if internal_weights: + inputs = (x1, x2) + else: + weights = torch.randn(1, m1.weight_numel, device=device, dtype=dtype) + inputs = (x1, x2, weights) + out1 = m1(*inputs) m1 = module_with_mode(mode, m1, inputs, dtype, tmp_path) From d297764fa3235528b8c754a7c3f20d13aef13a97 Mon Sep 17 00:00:00 2001 From: Mario Geiger Date: Mon, 6 Jan 2025 05:38:59 -0800 Subject: [PATCH 95/96] skip GPU tests when running on CPU --- .../tests/layers/tp_conv_fully_connected_test.py | 1 + cuequivariance_torch/tests/operations/rotation_test.py | 3 +++ .../tests/operations/symmetric_contraction_test.py | 4 ++++ .../tests/primitives/symmetric_tensor_product_test.py | 3 +++ cuequivariance_torch/tests/primitives/transpose_test.py | 3 +++ cuequivariance_torch/tests/utils.py | 5 +++-- 6 files changed, 17 insertions(+), 2 deletions(-) diff --git a/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py b/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py index 96602a3..1c1f72c 100644 --- a/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py +++ b/cuequivariance_torch/tests/layers/tp_conv_fully_connected_test.py @@ -51,6 +51,7 @@ def test_tensor_product_conv_equivariance( mlp_activation=mlp_activation, batch_norm=batch_norm, layout=layout, + use_fallback=not torch.cuda.is_available(), ).to(device) num_src_nodes, num_dst_nodes = 9, 7 diff --git a/cuequivariance_torch/tests/operations/rotation_test.py b/cuequivariance_torch/tests/operations/rotation_test.py index 1fd34a7..dd8721f 100644 --- a/cuequivariance_torch/tests/operations/rotation_test.py +++ b/cuequivariance_torch/tests/operations/rotation_test.py @@ -72,6 +72,9 @@ def test_inversion(use_fallback: bool): @pytest.mark.parametrize("mode", export_modes) def test_export(mode: str, tmp_path: str): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + irreps = cue.Irreps("SO3", "3x0 + 1 + 0 + 4x2 + 4") dtype = torch.float32 alpha = torch.tensor([0.3]).to(device) diff --git a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py index d9821ee..3bf8467 100644 --- a/cuequivariance_torch/tests/operations/symmetric_contraction_test.py +++ b/cuequivariance_torch/tests/operations/symmetric_contraction_test.py @@ -37,6 +37,9 @@ @pytest.mark.parametrize("original_mace", [True, False]) @pytest.mark.parametrize("batch", [1, 32]) def test_symmetric_contraction(dtype, layout, original_mace, batch): + if not torch.cuda.is_available(): + pytest.skip("CUDA is not available") + mul = 64 irreps_in = mul * cue.Irreps("O3", "0e + 1o + 2e") irreps_out = mul * cue.Irreps("O3", "0e + 1o") @@ -103,6 +106,7 @@ def test_mace_compatibility(): device=device, dtype=torch.float32, math_dtype=torch.float64, + use_fallback=not torch.cuda.is_available(), ) n_sc.weight.data = from64( (2, 164 // mul, mul), diff --git a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py index 9f1da91..9662e85 100644 --- a/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py 
+++ b/cuequivariance_torch/tests/primitives/symmetric_tensor_product_test.py
@@ -155,6 +155,9 @@ def test_export(
     use_fallback: bool,
     tmp_path,
 ):
+    if not use_fallback and not torch.cuda.is_available():
+        pytest.skip("CUDA is not available")
+
     dtype = torch.float32
     math_dtype = torch.float32
 
diff --git a/cuequivariance_torch/tests/primitives/transpose_test.py b/cuequivariance_torch/tests/primitives/transpose_test.py
index f0e53e7..f1b32d7 100644
--- a/cuequivariance_torch/tests/primitives/transpose_test.py
+++ b/cuequivariance_torch/tests/primitives/transpose_test.py
@@ -59,6 +59,9 @@ def test_transpose(use_fallback: bool, dtype: torch.dtype):
 @pytest.mark.parametrize("mode", export_modes)
 @pytest.mark.parametrize("use_fallback", [True, False])
 def test_export(mode, use_fallback, tmp_path):
+    if not use_fallback and not torch.cuda.is_available():
+        pytest.skip("CUDA is not available")
+
     dtype = torch.float32
     x = torch.tensor(
         [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10, 11, 12, 13]], dtype=dtype, device=device
diff --git a/cuequivariance_torch/tests/utils.py b/cuequivariance_torch/tests/utils.py
index 810c08c..5102e1e 100644
--- a/cuequivariance_torch/tests/utils.py
+++ b/cuequivariance_torch/tests/utils.py
@@ -193,8 +193,9 @@ def module_with_mode(
     else:
         raise ValueError(f"No such mode: {mode}")
 
-    torch.cuda.synchronize()
-    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
 
     return module

From 4af57d8512f56d6965a1f38a66d3d6951ff60943 Mon Sep 17 00:00:00 2001
From: Mario Geiger
Date: Mon, 6 Jan 2025 05:50:55 -0800
Subject: [PATCH 96/96] Fix: if use_fallback is None and cuda is not available
 => use fallback

---
 .../operations/symmetric_contraction.py          | 4 ++--
 .../primitives/symmetric_tensor_product.py       | 3 +++
 .../cuequivariance_torch/primitives/transpose.py | 7 ++++---
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py
index 1fab04c..38f81f0 100644
--- a/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py
+++ b/cuequivariance_torch/cuequivariance_torch/operations/symmetric_contraction.py
@@ -43,15 +43,15 @@ class SymmetricContraction(torch.nn.Module):
             If `True`, a PyTorch fallback method is used regardless of CUDA kernel availability.
 
     Examples:
+        >>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         >>> irreps_in = cue.Irreps("O3", "32x0e + 32x1o")
         >>> irreps_out = cue.Irreps("O3", "32x0e")
-        >>> layer = SymmetricContraction(irreps_in, irreps_out, contraction_degree=3, num_elements=5, layout=cue.ir_mul, dtype=torch.float32)
+        >>> layer = SymmetricContraction(irreps_in, irreps_out, contraction_degree=3, num_elements=5, layout=cue.ir_mul, dtype=torch.float32, device=device)
 
         Now `layer` can be used as part of a PyTorch model.
 
         The argument `original_mace` can be set to `True` to emulate the original MACE implementation.
 
-        >>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         >>> feats_irreps = cue.Irreps("O3", "32x0e + 32x1o + 32x2e")
         >>> target_irreps = cue.Irreps("O3", "32x0e + 32x1o")
         >>> # OLD FUNCTION DEFINITION:
diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py
index 08e02e6..f56eb37 100644
--- a/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py
+++ b/cuequivariance_torch/cuequivariance_torch/primitives/symmetric_tensor_product.py
@@ -224,6 +224,9 @@ def __init__(
     ):
         super().__init__()
 
+        if not torch.cuda.is_available():
+            raise NotImplementedError("CUDA is not available.")
+
         max_degree = max(d.num_operands - 2 for d in ds)
 
         if max_degree > 6:
diff --git a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py
index b23777a..bc6144e 100644
--- a/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py
+++ b/cuequivariance_torch/cuequivariance_torch/primitives/transpose.py
@@ -36,7 +36,7 @@ def __init__(
         source: cue.IrrepsLayout,
         target: cue.IrrepsLayout,
         device: Optional[torch.device] = None,
-        use_fallback: Optional[bool] = False,
+        use_fallback: Optional[bool] = None,
     ):
         super().__init__()
 
@@ -85,7 +85,7 @@ def __init__(
         self,
         segments: list[tuple[int, int]],
         device: Optional[torch.device] = None,
-        use_fallback: Optional[bool] = False,
+        use_fallback: Optional[bool] = None,
     ):
         super().__init__()
 
@@ -99,7 +99,8 @@ def __init__(
         except ImportError:
             pass
         else:
-            self.f = _transpose(info).to(device=device)
+            if torch.cuda.is_available():
+                self.f = _transpose(info).to(device=device)
 
         if use_fallback is False and self.f is None:
             raise RuntimeError("CUDA kernel not available for TransposeSegments.")
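
As the subject line of PATCH 96/96 states, `use_fallback=None` is now treated as "decide automatically": the CUDA kernel is used when it is available, and the pure-PyTorch fallback is used otherwise. The following is only an illustrative sketch of that decision rule; `resolve_use_fallback` and `cuda_kernel_available` are hypothetical names, not part of the cuequivariance_torch API.

    from typing import Optional

    import torch


    def resolve_use_fallback(use_fallback: Optional[bool], cuda_kernel_available: bool) -> bool:
        # Returns True when the pure-PyTorch fallback path should be taken.
        if use_fallback is None:
            # Automatic choice: fall back whenever CUDA or the CUDA kernel is unavailable.
            return not (cuda_kernel_available and torch.cuda.is_available())
        # Explicit True/False requests are honored as-is; asking for the CUDA
        # kernel (False) when none is available is expected to raise upstream.
        return use_fallback


    # Example: on a CPU-only machine, resolve_use_fallback(None, False) returns True,
    # so modules constructed with the default use_fallback=None keep working on CPU.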