diff --git a/openfold3/tests/conftest.py b/openfold3/tests/conftest.py index 76bd897b..361b6510 100644 --- a/openfold3/tests/conftest.py +++ b/openfold3/tests/conftest.py @@ -1,11 +1,158 @@ +from __future__ import annotations + +import json +import platform +import random +import warnings +from dataclasses import asdict, dataclass +from pathlib import Path + import biotite.setup_ccd import numpy as np import pytest +import torch from biotite.structure import AtomArray +from torch.random import fork_rng from openfold3.core.data.primitives.structure.component import BiotiteCCDWrapper from openfold3.setup_openfold import setup_biotite_ccd +# --------------------------------------------------------------------------- +# Device fixture: parametrize tests to run on both CPU and CUDA +# --------------------------------------------------------------------------- + +_DEVICES = [ + pytest.param("cpu", id="cpu"), + pytest.param( + "cuda", + id="cuda", + marks=pytest.mark.skipif( + not torch.cuda.is_available(), reason="CUDA not available" + ), + ), +] + + +@pytest.fixture(params=_DEVICES) +def device(request) -> str: + """Yield 'cpu' or 'cuda'; CUDA tests are auto-skipped when no GPU.""" + return request.param + + +# --------------------------------------------------------------------------- +# CUDA determinism: ensure reproducible results on the same hardware +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _cuda_deterministic(request): + """Enable deterministic CUDA ops for tests that use the ``device`` fixture.""" + if "device" not in request.fixturenames: + yield + return + + dev = request.getfixturevalue("device") + if dev != "cuda": + yield + return + + orig_deterministic = torch.backends.cudnn.deterministic + orig_benchmark = torch.backends.cudnn.benchmark + orig_det_algos = torch.are_deterministic_algorithms_enabled() + + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = 
False + torch.use_deterministic_algorithms(True) + yield + + torch.backends.cudnn.deterministic = orig_deterministic + torch.backends.cudnn.benchmark = orig_benchmark + torch.use_deterministic_algorithms(orig_det_algos) + + +# --------------------------------------------------------------------------- +# Snapshot environment metadata +# --------------------------------------------------------------------------- + +_SNAPSHOT_ENV_FILE = "_snapshot_env.json" + + +@dataclass(frozen=True) +class SnapshotEnv: + """Environment info relevant to snapshot reproducibility.""" + + torch_version: str + python_version: str + cuda_version: str | None = None + cudnn_version: str | None = None + gpu_name: str | None = None + + @classmethod + def current(cls) -> SnapshotEnv: + cuda_kwargs = {} + if torch.cuda.is_available(): + cuda_kwargs = dict( + cuda_version=torch.version.cuda, + cudnn_version=str(torch.backends.cudnn.version()), + gpu_name=torch.cuda.get_device_name(0), + ) + return cls( + torch_version=torch.__version__, + python_version=platform.python_version(), + **cuda_kwargs, + ) + + @classmethod + def from_json(cls, path: Path) -> SnapshotEnv: + data = json.loads(path.read_text()) + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + def to_json(self, path: Path) -> None: + path.write_text(json.dumps(asdict(self), indent=2) + "\n") + + def mismatches(self, other: SnapshotEnv) -> list[str]: + result = [] + for field in ("torch_version", "cuda_version", "cudnn_version", "gpu_name"): + stored = getattr(self, field) + current = getattr(other, field) + if stored is not None and current is not None and stored != current: + result.append(f" {field}: stored={stored}, current={current}") + return result + + +def _check_snapshot_env(snapshot_dir: Path) -> None: + """Warn if the current environment differs from the one that generated snapshots.""" + env_file = snapshot_dir / _SNAPSHOT_ENV_FILE + if not env_file.exists(): + return + + stored = 
SnapshotEnv.from_json(env_file) + mismatches = stored.mismatches(SnapshotEnv.current()) + + if mismatches: + warnings.warn( + f"Snapshot environment mismatch in {snapshot_dir.name}/:\n" + + "\n".join(mismatches) + + "\nSnapshot tests may fail. Regenerate with: pytest --force-regen", + stacklevel=2, + ) + + +def _write_snapshot_env(snapshot_dir: Path) -> None: + """Write current environment metadata alongside snapshots.""" + SnapshotEnv.current().to_json(snapshot_dir / _SNAPSHOT_ENV_FILE) + + +def pytest_sessionfinish(session, exitstatus): + """After ``--force-regen``, write environment metadata to snapshot dirs.""" + if not session.config.getoption("force_regen", default=False): + return + snapshots_root = Path(__file__).parent / "test_data" / "snapshots" + if snapshots_root.exists(): + for subdir in snapshots_root.iterdir(): + if subdir.is_dir() and any(subdir.glob("*.npz")): + _write_snapshot_env(subdir) + @pytest.fixture def dummy_atom_array(): @@ -76,3 +223,31 @@ def ensure_biotite_ccd(request): def biotite_ccd_wrapper(): """Cache CCD wrapper fixture for tests that need it.""" return BiotiteCCDWrapper() + + +@pytest.fixture(scope="module") +def original_datadir(request: pytest.FixtureRequest) -> Path: + """Redirect pytest-regressions snapshot storage to test_data/snapshots/.""" + datadir = ( + Path(__file__).parent / "test_data" / "snapshots" / Path(request.path).stem + ) + _check_snapshot_env(datadir) + return datadir + + +@pytest.fixture() +def seeded_rng(): + """Isolate all RNG state (torch, numpy, python) for the duration of a test. + + Uses torch.random.fork_rng() to save/restore torch (+CUDA) state, and + manually saves/restores numpy and python random state. 
+ """ + py_state = random.getstate() + np_state = np.random.get_state() + with fork_rng(): + torch.manual_seed(123) + random.seed(123) + np.random.seed(123) + yield + random.setstate(py_state) + np.random.set_state(np_state) diff --git a/openfold3/tests/test_data/snapshots/test_triangular_attention/_snapshot_env.json b/openfold3/tests/test_data/snapshots/test_triangular_attention/_snapshot_env.json new file mode 100644 index 00000000..429cc9a5 --- /dev/null +++ b/openfold3/tests/test_data/snapshots/test_triangular_attention/_snapshot_env.json @@ -0,0 +1,7 @@ +{ + "torch_version": "2.10.0+cu130", + "python_version": "3.13.12", + "cuda_version": "13.0", + "cudnn_version": "91501", + "gpu_name": "NVIDIA GB10" +} diff --git a/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_False_.npz b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_False_.npz new file mode 100644 index 00000000..43d432f9 Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_False_.npz differ diff --git a/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_True_.npz b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_True_.npz new file mode 100644 index 00000000..889d223d Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cpu_True_.npz differ diff --git a/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_False_.npz b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_False_.npz new file mode 100644 index 00000000..b71e3084 Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_False_.npz differ diff --git a/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_True_.npz b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_True_.npz new 
file mode 100644 index 00000000..9b1e196e Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_attention/test_shape_cuda_True_.npz differ diff --git a/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/_snapshot_env.json b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/_snapshot_env.json new file mode 100644 index 00000000..429cc9a5 --- /dev/null +++ b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/_snapshot_env.json @@ -0,0 +1,7 @@ +{ + "torch_version": "2.10.0+cu130", + "python_version": "3.13.12", + "cuda_version": "13.0", + "cudnn_version": "91501", + "gpu_name": "NVIDIA GB10" +} diff --git a/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cpu_.npz b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cpu_.npz new file mode 100644 index 00000000..0a4d9baa Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cpu_.npz differ diff --git a/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cuda_.npz b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cuda_.npz new file mode 100644 index 00000000..f63faf8b Binary files /dev/null and b/openfold3/tests/test_data/snapshots/test_triangular_multiplicative_update/test_shape_cuda_.npz differ diff --git a/openfold3/tests/test_triangular_attention.py b/openfold3/tests/test_triangular_attention.py index 68b8321d..4d8764fd 100644 --- a/openfold3/tests/test_triangular_attention.py +++ b/openfold3/tests/test_triangular_attention.py @@ -12,38 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import unittest - +import pytest import torch from openfold3.core.model.layers.triangular_attention import TriangleAttention from openfold3.tests.config import consts -class TestTriangularAttention(unittest.TestCase): - def test_shape(self): - c_z = consts.c_z - c = 12 - no_heads = 4 - starting = True +# starting=True -> "starting node" variant: rows attend to rows, +# biased by z[i, k]. False would transpose internally for the +# "ending node" variant (columns attend to columns). +@pytest.mark.parametrize("starting", [True, False]) +def test_shape(starting, device, seeded_rng, ndarrays_regression): + # c_z: pair representation channel dim (128 in production) + c_z = consts.c_z + # c: attention hidden dim (production uses 32; smaller here for speed) + c = 12 + no_heads = 4 - tan = TriangleAttention( - c_z, - c, - no_heads, - starting=starting, - ) + tan = TriangleAttention( + c_z, + c, + no_heads, + starting=starting, + ).to(device) + # AlphaFold initializes the output projection to zero (so residual blocks + # start as identity). Reinitialize all params so the test exercises the + # actual computation and produces non-trivial output. + for p in tan.parameters(): + torch.nn.init.normal_(p, std=0.01) + tan.eval() - batch_size = consts.batch_size - n_res = consts.n_res + batch_size = consts.batch_size + n_res = consts.n_res - x = torch.rand((batch_size, n_res, n_res, c_z)) - shape_before = x.shape + # Pair representation: [batch, N_residues, N_residues, C_z] + x = torch.rand((batch_size, n_res, n_res, c_z), device=device) + shape_before = x.shape + # chunk_size=None -> no memory-saving chunking, full attention in one pass + with torch.no_grad(): x = tan(x, chunk_size=None) - shape_after = x.shape + shape_after = x.shape - self.assertTrue(shape_before == shape_after) + # Shape must be preserved for the residual addition z = z + tri_att(z) + assert shape_before == shape_after + # Guard against trivial all-zero output (e.g. 
from zero-initialized weights) +    assert x.abs().max().item() > 0, ( +        "Output is all zeros — snapshot would be meaningless" +    ) -if __name__ == "__main__": -    unittest.main() + +    # Snapshot regression: output must be numerically identical across runs. +    # The tolerance absorbs small cross-hardware float differences (esp. CUDA). +    # Regenerate with: pytest --force-regen +    tolerances = dict(atol=1e-6, rtol=1e-5) +    ndarrays_regression.check( +        {"output": x.cpu().numpy()}, +        default_tolerance=tolerances, +    ) diff --git a/openfold3/tests/test_triangular_multiplicative_update.py b/openfold3/tests/test_triangular_multiplicative_update.py index 7893a3ac..458a3344 100644 --- a/openfold3/tests/test_triangular_multiplicative_update.py +++ b/openfold3/tests/test_triangular_multiplicative_update.py @@ -13,7 +13,6 @@ # limitations under the License. import re -import unittest import torch @@ -23,34 +22,59 @@ ) from openfold3.tests.config import consts +# Updates pair representation z[i,j] by projecting to two gated vectors (a, b), +# contracting along a shared dimension (outgoing vs incoming), then projecting +# back. "Outgoing" contracts over the starting node, "Incoming" over the ending +# node. Shape-preserving: [*, N, N, C_z] -> [*, N, N, C_z]. 
-class TestTriangularMultiplicativeUpdate(unittest.TestCase): - def test_shape(self): - c_z = consts.c_z - c = 11 - - if re.fullmatch("^model_[1-5]_multimer_v3$", consts.model_preset): - tm = FusedTriangleMultiplicationOutgoing( - c_z, - c, - ) - else: - tm = TriangleMultiplicationOutgoing( - c_z, - c, - ) - - n_res = consts.c_z - batch_size = consts.batch_size - - x = torch.rand((batch_size, n_res, n_res, c_z)) - mask = torch.randint(0, 2, size=(batch_size, n_res, n_res)) - shape_before = x.shape + +def _make_module(c_z, c): + """Pick fused vs non-fused variant based on model preset.""" + # Multimer v3 uses a fused variant (single projection split into a, b) + # vs separate projections for each + if re.fullmatch("^model_[1-5]_multimer_v3$", consts.model_preset): + return FusedTriangleMultiplicationOutgoing(c_z, c) + return TriangleMultiplicationOutgoing(c_z, c) + + +def test_shape(device, seeded_rng, ndarrays_regression): + # c_z: pair representation channel dim (128 in production) + c_z = consts.c_z + # c: hidden projection dim (production uses ~128; smaller here for speed) + c = 11 + + tm = _make_module(c_z, c).to(device) + # Reinitialize all params to non-trivial values (some layers may be + # zero-initialized by default for residual identity at init) + for p in tm.parameters(): + torch.nn.init.normal_(p, std=0.01) + tm.eval() + + n_res = consts.n_res + batch_size = consts.batch_size + + # Pair representation: [batch, N_residues, N_residues, C_z] + x = torch.rand((batch_size, n_res, n_res, c_z), device=device) + # Binary mask: which residue pairs are valid + mask = torch.randint(0, 2, size=(batch_size, n_res, n_res), device=device) + shape_before = x.shape + with torch.no_grad(): x = tm(x, mask) - shape_after = x.shape + shape_after = x.shape - self.assertTrue(shape_before == shape_after) + # Shape must be preserved for the residual addition z = z + tri_mul(z) + assert shape_before == shape_after + # Guard against trivial all-zero output (e.g. 
from zero-initialized weights) +    assert x.abs().max().item() > 0, ( +        "Output is all zeros — snapshot would be meaningless" +    ) -if __name__ == "__main__": -    unittest.main() + +    # Snapshot regression: output must be numerically identical across runs. +    # The tolerance absorbs small cross-hardware float differences (esp. CUDA). +    # Regenerate with: pytest --force-regen +    tolerances = dict(atol=1e-6, rtol=1e-5) +    ndarrays_regression.check( +        {"output": x.cpu().numpy()}, +        default_tolerance=tolerances, +    ) diff --git a/pyproject.toml b/pyproject.toml index c428ef6c..904d1bb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,7 @@ test = [ "pytest-xdist", "pytest-cov", "pytest-benchmark", + "pytest-regressions", "debugpy", "pytest-recording", ]