diff --git a/benchmarks/prepare_mols.py b/benchmarks/prepare_mols.py
new file mode 100644
index 0000000..56ee040
--- /dev/null
+++ b/benchmarks/prepare_mols.py
@@ -0,0 +1,172 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Precompute molecules with conformers for TFD benchmarking and profiling.
+
+Generates conformers once and saves the results as pickle files (one per
+conformer count). This avoids the expensive ETKDG embedding during
+benchmark/profile runs.
+
+Output files: <output-dir>/prepared_mols_<N>confs.pkl
+
+Usage:
+    python benchmarks/prepare_mols.py
+
+    # Custom settings:
+    python benchmarks/prepare_mols.py \
+        --smiles-file benchmarks/data/benchmark_smiles.csv \
+        --output-dir benchmarks/data \
+        --conformers 5 10 20 50 \
+        --max-mols 500 \
+        --workers 8
+"""
+
+import argparse
+import multiprocessing
+import os
+import pickle
+import sys
+import time
+from functools import partial
+
+import pandas as pd
+from rdkit import Chem, RDLogger
+from rdkit.Chem import AllChem
+
+RDLogger.DisableLog("rdApp.*")
+
+
+def _generate_one(args_tuple, num_confs):
+    """Generate conformers for a single molecule (picklable for multiprocessing)."""
+    idx, smi = args_tuple
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None or mol.GetNumAtoms() < 4:
+        return None
+
+    mol = Chem.AddHs(mol)
+    params = AllChem.ETKDGv3()
+    params.randomSeed = 42 + idx
+    params.numThreads = 1
+    params.useRandomCoords = True
+    conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, params=params)
+    if len(conf_ids) < 2:
+        return None
+    return Chem.RemoveHs(mol)
+
+
+def main():
+    default_smiles = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "benchmark_smiles.csv")
+    default_output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+    parser = argparse.ArgumentParser(
+        description="Precompute molecules with conformers for TFD benchmarking",
+    )
+    parser.add_argument(
+        "--smiles-file",
+        type=str,
+        default=default_smiles,
+        help="CSV file with SMILES in first column (default: benchmarks/data/benchmark_smiles.csv)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=default_output_dir,
+        help="Output directory for pickle files (default: benchmarks/data)",
+    )
+    parser.add_argument(
+        "--conformers",
+        type=int,
+        nargs="+",
+        default=[5, 10, 20],
+        help="Conformer counts to generate (default: 5 10 20)",
+    )
+    parser.add_argument(
+        "--max-mols",
+        type=int,
+        default=0,
+        help="Maximum number of molecules to prepare (default: 0 = all valid SMILES)",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=max(1, multiprocessing.cpu_count() // 2),
+        help=f"Number of parallel workers (default: {max(1, multiprocessing.cpu_count() // 2)})",
+    )
+    args = parser.parse_args()
+
+    print(f"Loading SMILES from: {args.smiles_file}")
+    df = pd.read_csv(args.smiles_file)
+    smiles_list = df.iloc[:, 0].tolist()
+    print(f"Loaded {len(smiles_list)} SMILES")
+
+    # Build (index, smiles) pairs, filtering small molecules
+    work_items = []
+    for i, smi in enumerate(smiles_list):
+        mol = Chem.MolFromSmiles(smi)
+        if mol is not None and mol.GetNumAtoms() >= 4:
+            work_items.append((i, smi))
+        if args.max_mols > 0 and len(work_items) >= args.max_mols:
+            break
+    print(f"Valid molecules (>=4 atoms): {len(work_items)}")
+    print(f"Workers: {args.workers}")
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    for num_confs in args.conformers:
+        out_path = os.path.join(args.output_dir, f"prepared_mols_{num_confs}confs.pkl")
+
+        if os.path.exists(out_path):
+            print(f"\n--- {num_confs} conformers: {out_path} already exists, skipping ---")
+            continue
+
+        print(f"\n--- Generating {num_confs} conformers ({len(work_items)} molecules, {args.workers} workers) ---")
+        start = time.perf_counter()
+
+        worker_fn = partial(_generate_one, num_confs=num_confs)
+
+        if args.workers > 1:
+            with multiprocessing.Pool(args.workers) as pool:
+                results = []
+                for i, result in enumerate(pool.imap(worker_fn, work_items)):
+                    results.append(result)
+                    if (i + 1) % 50 == 0 or (i + 1) == len(work_items):
+                        print(f"  Progress: {i + 1}/{len(work_items)} molecules", flush=True)
+        else:
+            results = []
+            for i, item in enumerate(work_items):
+                results.append(worker_fn(item))
+                if (i + 1) % 50 == 0 or (i + 1) == len(work_items):
+                    print(f"  Progress: {i + 1}/{len(work_items)} molecules", flush=True)
+
+        mols = [m for m in results if m is not None]
+        elapsed = time.perf_counter() - start
+
+        actual_confs = [m.GetNumConformers() for m in mols]
+        total_pairs = sum(c * (c - 1) // 2 for c in actual_confs)
+
+        with open(out_path, "wb") as f:
+            pickle.dump(mols, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+        file_size_mb = os.path.getsize(out_path) / (1024 * 1024)
+        print(f"  {len(mols)} molecules, avg {sum(actual_confs) / len(actual_confs):.1f} conformers")
+        print(f"  Total TFD pairs: {total_pairs}")
+        print(f"  Time: {elapsed:.1f}s")
+        print(f"  Saved: {out_path} ({file_size_mb:.1f} MB)")
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/tfd_bench.py b/benchmarks/tfd_bench.py
new file mode 100644
index 0000000..9023831
--- /dev/null
+++ b/benchmarks/tfd_bench.py
@@ -0,0 +1,524 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Benchmark for TFD (Torsion Fingerprint Deviation) calculation.
+
+Compares:
+- RDKit TorsionFingerprints.GetTFDMatrix (Python, single-threaded)
+- nvMolKit CPU backend (C++ with OpenMP, multi-threaded)
+- nvMolKit GPU backend (CUDA)
+
+Usage:
+    python tfd_bench.py [--smiles-file FILE] [--output FILE] [--skip-rdkit]
+    python tfd_bench.py --pkl-file data1.pkl data2.pkl [--output FILE]
+
+Example:
+    python tfd_bench.py --smiles-file data/benchmark_smiles.csv --output tfd_results.csv
+
+    # Use precomputed ChEMBL stratified pickles directly:
+    python tfd_bench.py --pkl-file ../Data/Chembl_stratified_prepared/chembl_0-20_10confs.pkl \
+        --num-mols 100 500 1000 5000 --output tfd_chembl_results.csv
+"""
+
+import argparse
+import os
+import pickle
+import sys
+import time
+from typing import List, Tuple
+
+import pandas as pd
+import torch
+from rdkit import Chem
+from rdkit.Chem import AllChem, TorsionFingerprints
+
+import nvmolkit.tfd as nvmol_tfd
+
+
+def time_it(func, runs: int = 3, warmups: int = 1) -> Tuple[float, float]:
+    """Time a function with warmup runs.
+
+    Args:
+        func: Function to time (no arguments)
+        runs: Number of timed runs
+        warmups: Number of warmup runs
+
+    Returns:
+        Tuple of (average_time_ms, std_time_ms)
+    """
+    for _ in range(warmups):
+        func()
+
+    times = []
+    for _ in range(runs):
+        start = time.perf_counter_ns()
+        func()
+        end = time.perf_counter_ns()
+        times.append(end - start)
+
+    avg_time = sum(times) / runs
+    std_time = (sum((t - avg_time) ** 2 for t in times) / runs) ** 0.5
+    return avg_time / 1.0e6, std_time / 1.0e6  # Return in milliseconds
+
+
+def generate_conformers(mol: Chem.Mol, num_confs: int, seed: int = 42) -> Chem.Mol:
+    """Generate conformers for a molecule using ETKDG.
+
+    Args:
+        mol: RDKit molecule
+        num_confs: Number of conformers to generate
+        seed: Random seed
+
+    Returns:
+        Molecule with conformers (or None if embedding failed)
+    """
+    mol = Chem.AddHs(mol)
+    params = AllChem.ETKDGv3()
+    params.randomSeed = seed
+    params.numThreads = 1  # Single-threaded for reproducibility
+    params.useRandomCoords = True
+
+    conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, params=params)
+
+    if len(conf_ids) < 2:
+        return None
+
+    mol = Chem.RemoveHs(mol)
+    return mol
+
+
+def _try_load_pickle(num_confs: int, max_mols: int, smiles_file: str = None) -> List[Chem.Mol]:
+    """Try to load precomputed molecules from pickle file."""
+    search_dirs = []
+    if smiles_file:
+        search_dirs.append(os.path.dirname(os.path.abspath(smiles_file)))
+    search_dirs.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
+
+    for d in search_dirs:
+        pkl_path = os.path.join(d, f"prepared_mols_{num_confs}confs.pkl")
+        if os.path.exists(pkl_path):
+            with open(pkl_path, "rb") as f:
+                all_mols = pickle.load(f)
+            mols = all_mols[:max_mols]
+            print(f"  Loaded {len(mols)} molecules from {pkl_path}")
+            return mols
+    return None
+
+
+def prepare_molecules(
+    smiles_list: List[str], num_confs: int, max_mols: int = 100, smiles_file: str = None
+) -> List[Chem.Mol]:
+    """Prepare molecules with conformers, using precomputed pickle if available.
+
+    Args:
+        smiles_list: List of SMILES strings (fallback if no pickle)
+        num_confs: Number of conformers per molecule
+        max_mols: Maximum number of molecules to prepare
+        smiles_file: Path to SMILES CSV (used to locate pickle files)
+
+    Returns:
+        List of molecules with conformers
+    """
+    cached = _try_load_pickle(num_confs, max_mols, smiles_file)
+    if cached is not None:
+        return cached
+
+    print(f"  No precomputed pickle found, generating from scratch...")
+    mols = []
+    for i, smi in enumerate(smiles_list):
+        if len(mols) >= max_mols:
+            break
+
+        mol = Chem.MolFromSmiles(smi)
+        if mol is None:
+            continue
+
+        if mol.GetNumAtoms() < 4:
+            continue
+
+        mol_with_confs = generate_conformers(mol, num_confs, seed=42 + i)
+        if mol_with_confs is not None and mol_with_confs.GetNumConformers() >= 2:
+            mols.append(mol_with_confs)
+
+    return mols
+
+
+def bench_rdkit_single(mol: Chem.Mol) -> None:
+    """Benchmark RDKit TFD for a single molecule."""
+    TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+
+def bench_rdkit_batch(mols: List[Chem.Mol]) -> None:
+    """Benchmark RDKit TFD for multiple molecules (sequential)."""
+    for mol in mols:
+        TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+
+def bench_nvmol_cpu_single(mol: Chem.Mol) -> None:
+    """Benchmark nvMolKit CPU TFD for a single molecule."""
+    nvmol_tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal", backend="cpu")
+
+
+def bench_nvmol_cpu_list(mols: List[Chem.Mol]) -> None:
+    """Benchmark nvMolKit CPU TFD returning Python lists."""
+    nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="cpu", return_type="list")
+
+
+def bench_nvmol_cpu_numpy(mols: List[Chem.Mol]) -> None:
+    """Benchmark nvMolKit CPU TFD returning numpy arrays."""
+    nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="cpu", return_type="numpy")
+
+
+def bench_nvmol_gpu_single(mol: Chem.Mol) -> None:
+    """Benchmark nvMolKit GPU TFD for a single molecule."""
+    nvmol_tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal", backend="gpu")
+    torch.cuda.synchronize()
+
+
+def bench_nvmol_gpu_list(mols: List[Chem.Mol]) -> None:
+    """Benchmark nvMolKit GPU TFD returning Python lists."""
+    nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="gpu", return_type="list")
+    torch.cuda.synchronize()
+
+
+def bench_nvmol_gpu_numpy(mols: List[Chem.Mol]) -> None:
+    """Benchmark nvMolKit GPU TFD returning numpy arrays."""
+    nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="gpu", return_type="numpy")
+    torch.cuda.synchronize()
+
+
+def bench_nvmol_gpu_tensor(mols: List[Chem.Mol]) -> None:
+    """Benchmark nvMolKit GPU TFD returning GPU tensors (no D2H)."""
+    nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="gpu", return_type="tensor")
+    torch.cuda.synchronize()
+
+
+def verify_correctness(mol: Chem.Mol, tolerance: float = 0.01) -> bool:
+    """Verify nvMolKit results match RDKit (within tolerance).
+
+    Multi-quartet torsions (rings and symmetric) are fully supported,
+    so results should match RDKit closely.
+    """
+    rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+    nvmol_result = nvmol_tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal", backend="gpu")
+
+    if len(rdkit_result) != len(nvmol_result):
+        return False
+
+    for rd, nv in zip(rdkit_result, nvmol_result):
+        if abs(rd - nv) > tolerance:
+            return False
+
+    return True
+
+
+def load_pkl_files(pkl_paths: List[str]) -> List[Chem.Mol]:
+    """Load and concatenate molecules from one or more pickle files.
+
+    Each pickle file must contain a list of RDKit Mol objects with conformers
+    (as produced by prepare_chembl_stratified.py).
+    """
+    all_mols = []
+    for path in pkl_paths:
+        with open(path, "rb") as f:
+            mols = pickle.load(f)
+        print(f"  Loaded {len(mols)} molecules from {path}")
+        all_mols.extend(mols)
+    return all_mols
+
+
+def run_benchmarks(
+    smiles_list: List[str] | None = None,
+    skip_rdkit: bool = False,
+    output_file: str = "tfd_results.csv",
+    smiles_file: str = None,
+    mol_counts: List[int] = None,
+    conformer_counts: List[int] = None,
+    preloaded_mols: List[Chem.Mol] | None = None,
+) -> pd.DataFrame:
+    """Run TFD benchmarks with various configurations.
+
+    Args:
+        smiles_list: List of SMILES strings (unused when preloaded_mols given)
+        skip_rdkit: If True, skip RDKit benchmarks (faster for large runs)
+        output_file: Output CSV file path
+        smiles_file: Path to SMILES CSV (used to locate pickle files)
+        mol_counts: List of molecule counts to benchmark
+        conformer_counts: List of conformer counts to benchmark
+        preloaded_mols: Pre-loaded molecules with conformers (e.g. from --pkl-file).
+            When provided, smiles_list/smiles_file/conformer_counts are ignored and
+            the actual conformer count is read from the molecules.
+
+    Returns:
+        DataFrame with benchmark results
+    """
+    if mol_counts is None:
+        mol_counts = [1, 5, 10, 25, 50, 100]
+
+    if preloaded_mols is not None:
+        actual_confs_all = [m.GetNumConformers() for m in preloaded_mols]
+        median_confs = sorted(actual_confs_all)[len(actual_confs_all) // 2]
+        conformer_counts = [median_confs]
+        print(f"  Using {len(preloaded_mols)} preloaded molecules (~{median_confs} conformers each)")
+    elif conformer_counts is None:
+        conformer_counts = [5, 10, 20]
+
+    results = []
+
+    print("=" * 70)
+    print("TFD Benchmark: RDKit vs nvMolKit (CPU) vs nvMolKit (GPU)")
+    print(f"Molecule counts: {mol_counts}")
+    print(f"Conformer counts: {conformer_counts}")
+    print("=" * 70)
+
+    for num_confs in conformer_counts:
+        if preloaded_mols is not None:
+            all_mols = preloaded_mols[: max(mol_counts)]
+        else:
+            print(f"\n--- Preparing molecules with {num_confs} conformers ---")
+            all_mols = prepare_molecules(
+                smiles_list, num_confs, max_mols=max(mol_counts) + 20, smiles_file=smiles_file
+            )
+
+        if len(all_mols) < max(mol_counts):
+            print(f"Warning: Only {len(all_mols)} molecules available")
+
+        for num_mols in mol_counts:
+            if num_mols > len(all_mols):
+                print(f"Skipping {num_mols} mols (only {len(all_mols)} available)")
+                continue
+
+            mols = all_mols[:num_mols]
+            actual_confs = [mol.GetNumConformers() for mol in mols]
+            avg_confs = sum(actual_confs) / len(actual_confs)
+
+            print(f"\nBenchmarking: {num_mols} molecules, ~{avg_confs:.1f} conformers each")
+
+            # Calculate expected TFD pairs
+            total_pairs = sum(c * (c - 1) // 2 for c in actual_confs)
+            print(f"  Total TFD pairs: {total_pairs}")
+
+            result = {
+                "num_molecules": num_mols,
+                "target_conformers": num_confs,
+                "avg_conformers": avg_confs,
+                "total_tfd_pairs": total_pairs,
+            }
+
+            # RDKit benchmark (single-threaded Python)
+            if not skip_rdkit:
+                try:
+                    rdkit_time, rdkit_std = time_it(lambda: bench_rdkit_batch(mols))
+                    result["rdkit_time_ms"] = rdkit_time
+                    result["rdkit_std_ms"] = rdkit_std
+                    print(f"  RDKit (Python):     {rdkit_time:8.2f} ms (+/- {rdkit_std:.2f})")
+                except Exception as e:
+                    print(f"  RDKit failed: {e}")
+                    result["rdkit_time_ms"] = None
+                    result["rdkit_std_ms"] = None
+            else:
+                result["rdkit_time_ms"] = None
+                result["rdkit_std_ms"] = None
+
+            # nvMolKit CPU list benchmark (return_type="list")
+            try:
+                t, s = time_it(lambda: bench_nvmol_cpu_list(mols))
+                result["nvmol_cpu_list_time_ms"] = t
+                result["nvmol_cpu_list_std_ms"] = s
+                print(f"  nvMolKit (CPU list): {t:8.2f} ms (+/- {s:.2f})")
+            except Exception as e:
+                print(f"  nvMolKit CPU list failed: {e}")
+                result["nvmol_cpu_list_time_ms"] = None
+                result["nvmol_cpu_list_std_ms"] = None
+
+            # nvMolKit CPU numpy benchmark (return_type="numpy")
+            try:
+                t, s = time_it(lambda: bench_nvmol_cpu_numpy(mols))
+                result["nvmol_cpu_numpy_time_ms"] = t
+                result["nvmol_cpu_numpy_std_ms"] = s
+                print(f"  nvMolKit (CPU numpy):{t:8.2f} ms (+/- {s:.2f})")
+            except Exception as e:
+                print(f"  nvMolKit CPU numpy failed: {e}")
+                result["nvmol_cpu_numpy_time_ms"] = None
+                result["nvmol_cpu_numpy_std_ms"] = None
+
+            # nvMolKit GPU list benchmark (return_type="list")
+            try:
+                t, s = time_it(lambda: bench_nvmol_gpu_list(mols))
+                result["nvmol_gpu_list_time_ms"] = t
+                result["nvmol_gpu_list_std_ms"] = s
+                print(f"  nvMolKit (GPU list):  {t:8.2f} ms (+/- {s:.2f})")
+            except Exception as e:
+                print(f"  nvMolKit GPU list failed: {e}")
+                result["nvmol_gpu_list_time_ms"] = None
+
+            # nvMolKit GPU numpy benchmark (return_type="numpy")
+            try:
+                t, s = time_it(lambda: bench_nvmol_gpu_numpy(mols))
+                result["nvmol_gpu_numpy_time_ms"] = t
+                result["nvmol_gpu_numpy_std_ms"] = s
+                print(f"  nvMolKit (GPU numpy): {t:8.2f} ms (+/- {s:.2f})")
+            except Exception as e:
+                print(f"  nvMolKit GPU numpy failed: {e}")
+                result["nvmol_gpu_numpy_time_ms"] = None
+
+            # nvMolKit GPU tensor benchmark (return_type="tensor", no D2H)
+            try:
+                t, s = time_it(lambda: bench_nvmol_gpu_tensor(mols))
+                result["nvmol_gpu_tensor_time_ms"] = t
+                result["nvmol_gpu_tensor_std_ms"] = s
+                print(f"  nvMolKit (GPU ten):  {t:8.2f} ms (+/- {s:.2f})")
+            except Exception as e:
+                print(f"  nvMolKit GPU tensor failed: {e}")
+                result["nvmol_gpu_tensor_time_ms"] = None
+
+            # Calculate speedups vs RDKit
+            speedups = {}
+            for key, label in [
+                ("nvmol_cpu_list_time_ms", "CPU list"),
+                ("nvmol_cpu_numpy_time_ms", "CPU numpy"),
+                ("nvmol_gpu_list_time_ms", "GPU list"),
+                ("nvmol_gpu_numpy_time_ms", "GPU numpy"),
+                ("nvmol_gpu_tensor_time_ms", "GPU tensor"),
+            ]:
+                if result.get("rdkit_time_ms") and result.get(key):
+                    speedups[label] = result["rdkit_time_ms"] / result[key]
+
+            for label, val in speedups.items():
+                print(f"  Speedup {label:>10s} vs RDKit: {val:.1f}x")
+
+            results.append(result)
+
+    # Create DataFrame and save
+    df = pd.DataFrame(results)
+    df.to_csv(output_file, index=False)
+    print(f"\n{'=' * 70}")
+    print(f"Results saved to: {output_file}")
+    print(f"{'=' * 70}")
+
+    return df
+
+
+def main():
+    _default_smiles = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "benchmark_smiles.csv")
+
+    parser = argparse.ArgumentParser(description="TFD Benchmark")
+    parser.add_argument(
+        "--smiles-file",
+        type=str,
+        default=_default_smiles,
+        help="CSV file with SMILES (default: benchmarks/data/benchmark_smiles.csv)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="tfd_results.csv",
+        help="Output CSV file for results",
+    )
+    parser.add_argument(
+        "--num-mols",
+        type=int,
+        nargs="+",
+        default=[1, 10, 50, 100, 500, 1000],
+        help="Molecule counts to benchmark (default: 1 10 50 100 500 1000)",
+    )
+    parser.add_argument(
+        "--num-confs",
+        type=int,
+        nargs="+",
+        default=[5, 10, 20, 50],
+        help="Conformer counts to benchmark (default: 5 10 20 50)",
+    )
+    parser.add_argument(
+        "--skip-rdkit",
+        action="store_true",
+        help="Skip RDKit benchmarks (faster)",
+    )
+    parser.add_argument(
+        "--pkl-file",
+        type=str,
+        nargs="+",
+        default=None,
+        help="Precomputed pickle file(s) containing molecules with conformers. "
+        "When provided, --smiles-file and --num-confs are ignored.",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify correctness before benchmarking",
+    )
+    parser.add_argument(
+        "--verify-only",
+        action="store_true",
+        help="Verify correctness and exit (skip benchmarking)",
+    )
+    args = parser.parse_args()
+
+    preloaded_mols = None
+    smiles_list = None
+
+    if args.pkl_file:
+        print("Loading precomputed molecules from pickle file(s)...")
+        preloaded_mols = load_pkl_files(args.pkl_file)
+        if not preloaded_mols:
+            print("Error: no molecules loaded from pickle files")
+            sys.exit(1)
+        print(f"Total: {len(preloaded_mols)} molecules")
+    else:
+        print(f"Loading SMILES from: {args.smiles_file}")
+        try:
+            df = pd.read_csv(args.smiles_file)
+            smiles_list = df.iloc[:, 0].tolist()
+        except Exception as e:
+            print(f"Error loading SMILES file: {e}")
+            sys.exit(1)
+        print(f"Loaded {len(smiles_list)} SMILES")
+
+    if args.verify or args.verify_only:
+        print("\nVerifying correctness...")
+        if preloaded_mols is not None:
+            test_mols = preloaded_mols[:50]
+        else:
+            test_mols = prepare_molecules(smiles_list, num_confs=5, max_mols=50, smiles_file=args.smiles_file)
+        all_correct = True
+        mismatches = 0
+        for i, mol in enumerate(test_mols):
+            if verify_correctness(mol):
+                print(f"  Molecule {i}: OK")
+            else:
+                print(f"  Molecule {i}: MISMATCH")
+                all_correct = False
+                mismatches += 1
+        if all_correct:
+            print(f"All {len(test_mols)} molecules match RDKit.")
+        else:
+            print(f"Warning: {mismatches}/{len(test_mols)} molecules did not match RDKit within tolerance")
+
+        if args.verify_only:
+            sys.exit(0 if all_correct else 1)
+
+    run_benchmarks(
+        smiles_list=smiles_list,
+        skip_rdkit=args.skip_rdkit,
+        output_file=args.output,
+        smiles_file=args.smiles_file,
+        mol_counts=args.num_mols,
+        conformer_counts=args.num_confs if not args.pkl_file else None,
+        preloaded_mols=preloaded_mols,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/tfd_profile.py b/benchmarks/tfd_profile.py
new file mode 100644
index 0000000..664de98
--- /dev/null
+++ b/benchmarks/tfd_profile.py
@@ -0,0 +1,228 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TFD profiling script for use with NVIDIA Nsight Systems (nsys).
+
+Designed to produce NVTX-annotated traces for analyzing CPU vs GPU TFD performance.
+C++ NVTX ranges (buildTFDSystem, transferToDevice, kernel launches, etc.) are
+automatically captured by nsys alongside the Python-level annotations below.
+
+Usage:
+    # Profile GPU with default sweep (multiple mol counts and conformer counts):
+    nsys profile -t nvtx,osrt,cuda --gpu-metrics-devices=all --stats=true \
+        -f true -o tfd_gpu_profile \
+        python benchmarks/tfd_profile.py gpu
+
+    # Profile GPU with specific configurations:
+    python benchmarks/tfd_profile.py gpu --num-mols 100 500 1000 --num-confs 20 50
+
+    # Profile a single configuration (legacy style):
+    python benchmarks/tfd_profile.py gpu --num-mols 50 --num-confs 20
+
+    # Profile CPU path:
+    python benchmarks/tfd_profile.py cpu
+
+    # Profile both paths:
+    python benchmarks/tfd_profile.py both
+"""
+
+import argparse
+import os
+import pickle
+import sys
+
+import nvtx
+import torch
+from rdkit import Chem
+from rdkit.Chem import AllChem
+
+import nvmolkit.tfd as nvmol_tfd
+
+
+def generate_conformers(mol, num_confs, seed=42):
+    """Generate conformers for a molecule using ETKDG."""
+    mol = Chem.AddHs(mol)
+    params = AllChem.ETKDGv3()
+    params.randomSeed = seed
+    params.numThreads = 1
+    params.useRandomCoords = True
+    conf_ids = AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, params=params)
+    if len(conf_ids) < 2:
+        return None
+    return Chem.RemoveHs(mol)
+
+
+def prepare_molecules(smiles_file, num_mols, num_confs):
+    """Load molecules with conformers, using precomputed pickle if available."""
+    data_dir = os.path.dirname(os.path.abspath(smiles_file))
+    pkl_path = os.path.join(data_dir, f"prepared_mols_{num_confs}confs.pkl")
+
+    if os.path.exists(pkl_path):
+        with open(pkl_path, "rb") as f:
+            all_mols = pickle.load(f)
+        mols = all_mols[:num_mols]
+        print(f"  Loaded {len(mols)} molecules from {pkl_path}")
+        return mols
+
+    print(f"  No precomputed file found ({pkl_path}), generating from scratch...")
+    import pandas as pd
+
+    df = pd.read_csv(smiles_file)
+    smiles_list = df.iloc[:, 0].tolist()
+
+    mols = []
+    for i, smi in enumerate(smiles_list):
+        if len(mols) >= num_mols:
+            break
+        mol = Chem.MolFromSmiles(smi)
+        if mol is None or mol.GetNumAtoms() < 4:
+            continue
+        mol_with_confs = generate_conformers(mol, num_confs, seed=42 + i)
+        if mol_with_confs is not None and mol_with_confs.GetNumConformers() >= 2:
+            mols.append(mol_with_confs)
+
+    return mols
+
+
+def run_config(mols, backend, num_runs, label):
+    """Run profiled iterations for a single configuration."""
+    actual_confs = [mol.GetNumConformers() for mol in mols]
+    total_pairs = sum(c * (c - 1) // 2 for c in actual_confs)
+    print(
+        f"\n[{label}] {len(mols)} mols, avg {sum(actual_confs) / len(actual_confs):.1f} confs, {total_pairs} TFD pairs"
+    )
+
+    for run in range(num_runs):
+        if backend in ("cpu", "both"):
+            with nvtx.annotate(f"{label} CPU run {run}", color="cyan"):
+                nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="cpu", return_type="numpy")
+
+        if backend in ("gpu", "both"):
+            with nvtx.annotate(f"{label} GPU run {run}", color="orange"):
+                nvmol_tfd.GetTFDMatrices(mols, useWeights=True, maxDev="equal", backend="gpu", return_type="tensor")
+                torch.cuda.synchronize()
+
+
+def main():
+    _default_smiles = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "benchmark_smiles.csv")
+
+    parser = argparse.ArgumentParser(
+        description="TFD profiling script for nsys",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""\
+Examples:
+  # Profile GPU with default sweep:
+  nsys profile -t nvtx,osrt,cuda --gpu-metrics-devices=all --stats=true \\
+      -f true -o tfd_gpu python benchmarks/tfd_profile.py gpu
+
+  # Profile GPU with large configurations:
+  python benchmarks/tfd_profile.py gpu --num-mols 500 1000 --num-confs 50 100
+
+  # Profile CPU with single config:
+  python benchmarks/tfd_profile.py cpu --num-mols 100 --num-confs 20
+""",
+    )
+    parser.add_argument(
+        "backend",
+        choices=["cpu", "gpu", "both"],
+        help="Backend to profile",
+    )
+    parser.add_argument(
+        "--num-mols",
+        type=int,
+        nargs="+",
+        default=[50, 100, 500, 1000],
+        help="Number of molecules to test (default: 50 100 500 1000)",
+    )
+    parser.add_argument(
+        "--num-confs",
+        type=int,
+        nargs="+",
+        default=[10, 20, 50],
+        help="Number of conformers to test (default: 10 20 50)",
+    )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=1,
+        help="Number of warmup iterations (default: 1)",
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=3,
+        help="Number of profiled runs per configuration (default: 3)",
+    )
+    parser.add_argument(
+        "--smiles-file",
+        type=str,
+        default=_default_smiles,
+        help="CSV file with SMILES (default: benchmarks/data/benchmark_smiles.csv)",
+    )
+    args = parser.parse_args()
+
+    print(f"Backend: {args.backend}")
+    print(f"Molecule counts: {args.num_mols}")
+    print(f"Conformer counts: {args.num_confs}")
+    print(f"Runs per config: {args.runs}")
+
+    # === Setup: load all needed configurations ===
+    configs = {}
+    with nvtx.annotate("Setup", color="blue"):
+        for num_confs in args.num_confs:
+            max_mols = max(args.num_mols)
+            print(f"\nLoading up to {max_mols} molecules with {num_confs} conformers...")
+            all_mols = prepare_molecules(args.smiles_file, max_mols, num_confs)
+            if not all_mols:
+                print(f"  Warning: no molecules available for {num_confs} conformers, skipping")
+                continue
+            configs[num_confs] = all_mols
+
+    if not configs:
+        print("Error: No valid molecules prepared")
+        sys.exit(1)
+
+    # === Warmup ===
+    if args.warmup > 0:
+        with nvtx.annotate("Warmup", color="red"):
+            first_mols = list(configs.values())[0]
+            warmup_mols = first_mols[: min(5, len(first_mols))]
+            print(f"\nWarmup ({args.warmup} iteration(s)) with {len(warmup_mols)} molecules...")
+            for _ in range(args.warmup):
+                if args.backend in ("cpu", "both"):
+                    nvmol_tfd.GetTFDMatrices(warmup_mols, useWeights=True, maxDev="equal", backend="cpu")
+                if args.backend in ("gpu", "both"):
+                    nvmol_tfd.GetTFDMatrices(
+                        warmup_mols, useWeights=True, maxDev="equal", backend="gpu", return_type="tensor"
+                    )
+                    torch.cuda.synchronize()
+
+    # === Profiled runs ===
+    with nvtx.annotate("Profiled runs", color="green"):
+        for num_confs in sorted(configs.keys()):
+            all_mols = configs[num_confs]
+            for num_mols in args.num_mols:
+                mols = all_mols[:num_mols]
+                if len(mols) == 0:
+                    continue
+                label = f"{len(mols)}mols_{num_confs}confs"
+                with nvtx.annotate(label, color="magenta"):
+                    run_config(mols, args.backend, args.runs, label)
+
+    print("\nProfiling complete.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/api/nvmolkit.rst b/docs/api/nvmolkit.rst
index 235dd6b..56b9677 100644
--- a/docs/api/nvmolkit.rst
+++ b/docs/api/nvmolkit.rst
@@ -86,6 +86,16 @@ Conformer RMSD
    conformerRmsd.GetConformerRMSMatrix
    conformerRmsd.GetConformerRMSMatrixBatch
 
+Torsion Fingerprint Deviation (TFD)
+-----------------------------------
+
+.. autosummary::
+   :toctree: generated/
+   :template: function_template.rst
+
+   tfd.GetTFDMatrix
+   tfd.GetTFDMatrices
+
 Types
 -----
 
diff --git a/docs/index.rst b/docs/index.rst
index 8b15bdd..eb85340 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -125,6 +125,10 @@ nvMolKit currently supports the following features:
 
 * **Conformer RMSD**: GPU-accelerated pairwise RMSD matrix computation for conformer ensembles
 
+* **Torsion Fingerprint Deviation (TFD)**: GPU-accelerated TFD computation for comparing conformer geometry
+    * Batch processing of multiple molecules with all-pairs conformer comparison
+    * CPU and GPU backends with automatic backend selection
+
 .. _async-results:
 
 Asynchronous GPU Results
diff --git a/nvmolkit/CMakeLists.txt b/nvmolkit/CMakeLists.txt
index be4069b..bc52099 100644
--- a/nvmolkit/CMakeLists.txt
+++ b/nvmolkit/CMakeLists.txt
@@ -106,3 +106,12 @@ target_link_libraries(_substructure PUBLIC ${Boost_LIBRARIES}
 target_link_libraries(_substructure PRIVATE ${RDKit_LIBS} substructure_search
                                             pinned_host_allocator)
 installpythontarget(_substructure ./)
+
+add_library(_TFD MODULE tfd.cpp)
+target_include_directories(
+  _TFD PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/utils ${Boost_INCLUDE_DIRS}
+              ${Python_INCLUDE_DIRS})
+target_link_libraries(_TFD PUBLIC ${Boost_LIBRARIES} ${PYTHON_LIBRARIES}
+                                  device_vector)
+target_link_libraries(_TFD PRIVATE ${RDKit_LIBS} tfd CUDA::cudart)
+installpythontarget(_TFD ./)
diff --git a/nvmolkit/_TFD.pyi b/nvmolkit/_TFD.pyi
new file mode 100644
index 0000000..7f18295
--- /dev/null
+++ b/nvmolkit/_TFD.pyi
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, List, Tuple
+
+import numpy as np
+from rdkit.Chem import Mol
+
+def GetTFDMatricesCpuBuffer(
+    mols: List[Mol],
+    useWeights: bool = True,
+    maxDev: str = "equal",
+    symmRadius: int = 2,
+    ignoreColinearBonds: bool = True,
+) -> List[np.ndarray]: ...
+def GetTFDMatricesGpuBuffer(
+    mols: List[Mol],
+    useWeights: bool = True,
+    maxDev: str = "equal",
+    symmRadius: int = 2,
+    ignoreColinearBonds: bool = True,
+) -> Tuple[Any, List[int]]: ...
diff --git a/nvmolkit/__init__.py b/nvmolkit/__init__.py
index 760cc84..ddd0d5d 100644
--- a/nvmolkit/__init__.py
+++ b/nvmolkit/__init__.py
@@ -25,6 +25,7 @@
 - Bulk tanimoto/cosine similarity calculations between fingerprints
 - ETKDG conformer generation for multiple molecules
 - MMFF optimization for multiple molecules and conformers
+- TFD (Torsion Fingerprint Deviation) calculation for conformer comparison
 """
 
 VERSION = "0.4.0"
diff --git a/nvmolkit/tests/test_tfd.py b/nvmolkit/tests/test_tfd.py
new file mode 100644
index 0000000..8371b5e
--- /dev/null
+++ b/nvmolkit/tests/test_tfd.py
@@ -0,0 +1,417 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for GPU-accelerated TFD calculation."""
+
+import pytest
+import torch
+from rdkit import Chem
+from rdkit.Chem import AllChem, TorsionFingerprints
+
+import nvmolkit.tfd as tfd
+
+# Tolerance for comparing GPU vs RDKit results
+TOLERANCE = 0.01
+
+
+def generate_conformers(mol, num_confs, seed=42):
+    """Generate conformers for a molecule using ETKDG."""
+    params = AllChem.ETKDGv3()
+    params.randomSeed = seed
+    params.numThreads = 1
+    AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, params=params)
+    return mol
+
+
+@pytest.fixture
+def simple_mol_with_conformers():
+    """Create a simple molecule with conformers."""
+    mol = Chem.MolFromSmiles("CCCCC")  # n-pentane
+    return generate_conformers(mol, 5)
+
+
+@pytest.fixture
+def multiple_mols_with_conformers():
+    """Create multiple molecules with conformers."""
+    smiles_list = ["CCCC", "CCCCC", "CCCCCC", "CCO", "CCCO"]
+    mols = []
+    for i, smi in enumerate(smiles_list):
+        mol = Chem.MolFromSmiles(smi)
+        if mol:
+            mol = generate_conformers(mol, 3 + i, seed=42 + i)
+            if mol.GetNumConformers() >= 2:
+                mols.append(mol)
+    return mols
+
+
+class TestGetTFDMatrix:
+    """Tests for single-molecule TFD calculation."""
+
+    def test_basic_tfd_matrix(self, simple_mol_with_conformers):
+        """Test basic TFD matrix computation."""
+        mol = simple_mol_with_conformers
+        num_conf = mol.GetNumConformers()
+
+        result = tfd.GetTFDMatrix(mol)
+
+        # Check size: C*(C-1)/2 for C conformers
+        expected_size = num_conf * (num_conf - 1) // 2
+        assert len(result) == expected_size
+
+        # All TFD values should be non-negative
+        for val in result:
+            assert val >= 0.0
+
+    def test_single_conformer_returns_empty(self):
+        """Test that single conformer returns empty matrix."""
+        mol = Chem.MolFromSmiles("CCCC")
+        generate_conformers(mol, 1)
+        assert mol.GetNumConformers() == 1
+
+        result = tfd.GetTFDMatrix(mol)
+        assert len(result) == 0
+
+    def test_cpu_backend(self, simple_mol_with_conformers):
+        """Test CPU backend gives same results as GPU."""
+        mol = simple_mol_with_conformers
+
+        gpu_result = tfd.GetTFDMatrix(mol, backend="gpu")
+        cpu_result = tfd.GetTFDMatrix(mol, backend="cpu")
+
+        assert len(gpu_result) == len(cpu_result)
+        for gpu_val, cpu_val in zip(gpu_result, cpu_result):
+            assert abs(gpu_val - cpu_val) < 1e-4
+
+    def test_no_weights(self, simple_mol_with_conformers):
+        """Test computation without weights."""
+        mol = simple_mol_with_conformers
+
+        with_weights = tfd.GetTFDMatrix(mol, useWeights=True)
+        no_weights = tfd.GetTFDMatrix(mol, useWeights=False)
+
+        # Both should have same size
+        assert len(with_weights) == len(no_weights)
+
+        # Values may differ when weights are disabled
+        # Just verify computation completes
+
+    def test_maxdev_spec(self, simple_mol_with_conformers):
+        """Test with specific max deviation mode."""
+        mol = simple_mol_with_conformers
+
+        equal_result = tfd.GetTFDMatrix(mol, maxDev="equal")
+        spec_result = tfd.GetTFDMatrix(mol, maxDev="spec")
+
+        assert len(equal_result) == len(spec_result)
+
+    def test_invalid_maxdev_raises(self, simple_mol_with_conformers):
+        """Test that invalid maxDev raises error."""
+        mol = simple_mol_with_conformers
+
+        with pytest.raises(Exception):
+            tfd.GetTFDMatrix(mol, maxDev="invalid")
+
+    def test_invalid_backend_raises(self, simple_mol_with_conformers):
+        """Test that invalid backend raises error."""
+        mol = simple_mol_with_conformers
+
+        with pytest.raises(Exception):
+            tfd.GetTFDMatrix(mol, backend="invalid")
+
+
+class TestGetTFDMatrices:
+    """Tests for batch TFD calculation."""
+
+    def test_batch_processing(self, multiple_mols_with_conformers):
+        """Test batch processing of multiple molecules."""
+        mols = multiple_mols_with_conformers
+
+        results = tfd.GetTFDMatrices(mols)
+
+        assert len(results) == len(mols)
+
+        for i, (mol, result) in enumerate(zip(mols, results)):
+            num_conf = mol.GetNumConformers()
+            expected_size = num_conf * (num_conf - 1) // 2
+            assert len(result) == expected_size, f"Molecule {i} has wrong result size"
+
+    def test_empty_input(self):
+        """Test with empty molecule list."""
+        results = tfd.GetTFDMatrices([])
+        assert len(results) == 0
+
+    def test_batch_vs_individual(self, multiple_mols_with_conformers):
+        """Test that batch results match individual processing."""
+        mols = multiple_mols_with_conformers
+
+        batch_results = tfd.GetTFDMatrices(mols)
+        individual_results = [tfd.GetTFDMatrix(mol) for mol in mols]
+
+        assert len(batch_results) == len(individual_results)
+
+        for batch, individual in zip(batch_results, individual_results):
+            assert len(batch) == len(individual)
+            for b, ind in zip(batch, individual):
+                assert abs(b - ind) < 1e-4
+
+
+class TestGpuResidentOutput:
+    """Tests for GPU-resident TFD output via return_type parameter."""
+
+    def test_tensor_return_single(self, simple_mol_with_conformers):
+        """Test return_type='tensor' for a single molecule."""
+        mol = simple_mol_with_conformers
+
+        tensor = tfd.GetTFDMatrix(mol, return_type="tensor")
+
+        assert isinstance(tensor, torch.Tensor)
+        assert tensor.device.type == "cuda"
+        num_conf = mol.GetNumConformers()
+        expected_size = num_conf * (num_conf - 1) // 2
+        assert tensor.shape[0] == expected_size
+
+    def test_tensor_return_batch(self, multiple_mols_with_conformers):
+        """Test return_type='tensor' for batch."""
+        mols = multiple_mols_with_conformers
+
+        tensors = tfd.GetTFDMatrices(mols, return_type="tensor")
+        assert len(tensors) == len(mols)
+
+        for i, (mol, tensor) in enumerate(zip(mols, tensors)):
+            assert isinstance(tensor, torch.Tensor)
+            assert tensor.device.type == "cuda"
+            num_conf = mol.GetNumConformers()
+            expected_size = num_conf * (num_conf - 1) // 2
+            assert tensor.shape[0] == expected_size, f"Molecule {i} has wrong tensor size"
+
+    def test_list_matches_tensor(self, multiple_mols_with_conformers):
+        """Test that list and tensor return types give consistent results."""
+        mols = multiple_mols_with_conformers
+
+        lists = tfd.GetTFDMatrices(mols, return_type="list")
+        tensors = tfd.GetTFDMatrices(mols, return_type="tensor")
+
+        assert len(lists) == len(tensors)
+        for gpu_list, tensor in zip(lists, tensors):
+            tensor_list = tensor.cpu().tolist()
+            assert len(gpu_list) == len(tensor_list)
+            for l, t in zip(gpu_list, tensor_list):
+                assert abs(l - t) < 1e-4
+
+    def test_return_type_numpy(self, multiple_mols_with_conformers):
+        """Test return_type='numpy' returns correct numpy arrays."""
+        import numpy as np
+
+        mols = multiple_mols_with_conformers
+        lists = tfd.GetTFDMatrices(mols, return_type="list")
+        arrays = tfd.GetTFDMatrices(mols, return_type="numpy")
+
+        assert len(arrays) == len(lists)
+        for arr, lst in zip(arrays, lists):
+            assert isinstance(arr, np.ndarray)
+            assert arr.dtype == np.float32
+            assert len(arr) == len(lst)
+            for a, l in zip(arr.tolist(), lst):
+                assert abs(a - l) < 1e-4
+
+    def test_return_type_tensor(self, multiple_mols_with_conformers):
+        """Test return_type='tensor' returns correct GPU tensors."""
+        import torch
+
+        mols = multiple_mols_with_conformers
+        lists = tfd.GetTFDMatrices(mols, return_type="list")
+        tensors = tfd.GetTFDMatrices(mols, return_type="tensor")
+
+        assert len(tensors) == len(lists)
+        for tensor, lst in zip(tensors, lists):
+            assert isinstance(tensor, torch.Tensor)
+            assert tensor.is_cuda
+            assert tensor.dtype == torch.float32
+            assert len(tensor) == len(lst)
+            for t, l in zip(tensor.cpu().tolist(), lst):
+                assert abs(t - l) < 1e-4
+
+
+class TestCompareWithRDKit:
+    """Tests comparing nvMolKit TFD with RDKit TFD."""
+
+    def test_simple_chain_molecule(self):
+        """Test simple chain molecule against RDKit."""
+        mol = Chem.MolFromSmiles("CCCCC")
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"nvMolKit={nv}, RDKit={rd}"
+
+    def test_no_weights_comparison(self):
+        """Test without weights against RDKit."""
+        mol = Chem.MolFromSmiles("CCCCCC")
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(mol, useWeights=False, maxDev="equal")
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=False, maxDev="equal")
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE
+
+    def test_ring_molecule(self):
+        """Test ring molecule (multi-quartet) against RDKit."""
+        mol = Chem.MolFromSmiles("C1CCCCC1")  # cyclohexane
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"nvMolKit={nv}, RDKit={rd}"
+
+    def test_ring_with_substituent(self):
+        """Test molecule with both ring and non-ring torsions against RDKit."""
+        mol = Chem.MolFromSmiles("c1ccccc1CC")  # ethylbenzene
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"nvMolKit={nv}, RDKit={rd}"
+
+    @pytest.mark.parametrize("backend", ["gpu", "cpu"])
+    @pytest.mark.parametrize(
+        "smiles",
+        ["CCCCC", "CC(C)CC"],
+    )
+    def test_add_hs(self, backend, smiles):
+        """Test molecules with explicit hydrogens against RDKit."""
+        mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(mol, useWeights=True, maxDev="equal", backend=backend)
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev="equal")
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"AddHs({smiles}), backend={backend}: nvMolKit={nv}, RDKit={rd}"
+
+    @pytest.mark.parametrize("backend", ["gpu", "cpu"])
+    def test_symmetric_molecule_gpu_cpu_consistency(self, backend):
+        """Test branched molecule gives consistent results across backends."""
+        mol = Chem.MolFromSmiles("CC(C)CC")  # isopentane
+        generate_conformers(mol, 4)
+
+        result = tfd.GetTFDMatrix(mol, backend=backend)
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(mol)
+
+        assert len(result) == len(rdkit_result)
+
+        for nv, rd in zip(result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"backend={backend}: nvMolKit={nv}, RDKit={rd}"
+
+    @pytest.mark.parametrize("backend", ["gpu", "cpu"])
+    @pytest.mark.parametrize("symm_radius", [0, 1, 3])
+    def test_symm_radius(self, backend, symm_radius):
+        """Test different symmRadius values against RDKit on both backends."""
+        mol = Chem.MolFromSmiles("CC(C)CC")
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(
+            mol,
+            symmRadius=symm_radius,
+            backend=backend,
+        )
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(
+            mol,
+            symmRadius=symm_radius,
+        )
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"symmRadius={symm_radius}, backend={backend}: nvMolKit={nv}, RDKit={rd}"
+
+    @pytest.mark.parametrize("backend", ["gpu", "cpu"])
+    def test_ignore_colinear_bonds_false(self, backend):
+        """Test ignoreColinearBonds=False against RDKit on both backends."""
+        mol = Chem.MolFromSmiles("CCCC#CCC")
+        generate_conformers(mol, 4)
+
+        nvmolkit_result = tfd.GetTFDMatrix(
+            mol,
+            ignoreColinearBonds=False,
+            backend=backend,
+        )
+        rdkit_result = TorsionFingerprints.GetTFDMatrix(
+            mol,
+            ignoreColinearBonds=False,
+        )
+
+        assert len(nvmolkit_result) == len(rdkit_result)
+
+        for nv, rd in zip(nvmolkit_result, rdkit_result):
+            assert abs(nv - rd) < TOLERANCE, f"ignoreColinearBonds=False, backend={backend}: nvMolKit={nv}, RDKit={rd}"
+
+
+class TestEdgeCases:
+    """Tests for edge cases."""
+
+    def test_ethane_no_torsions(self):
+        """Test ethane (terminal bonds only, no rotatable bonds)."""
+        mol = Chem.MolFromSmiles("CC")
+        generate_conformers(mol, 3)
+
+        result = tfd.GetTFDMatrix(mol)
+
+        # Should return zeros
+        for val in result:
+            assert val == 0.0
+
+    def test_large_molecule(self):
+        """Test larger molecule."""
+        # Decane
+        mol = Chem.MolFromSmiles("CCCCCCCCCC")
+        generate_conformers(mol, 5)
+
+        result = tfd.GetTFDMatrix(mol)
+
+        num_conf = mol.GetNumConformers()
+        expected_size = num_conf * (num_conf - 1) // 2
+        assert len(result) == expected_size
+
+        for val in result:
+            assert 0.0 <= val <= 1.0  # TFD should be normalized
+
+    def test_invalid_molecule_raises(self):
+        """Test that None molecule raises error."""
+        with pytest.raises(Exception):
+            tfd.GetTFDMatrix(None)
+
+    def test_invalid_molecule_in_batch_raises(self, simple_mol_with_conformers):
+        """Test that None in batch raises error."""
+        mols = [simple_mol_with_conformers, None]
+
+        with pytest.raises(Exception):
+            tfd.GetTFDMatrices(mols)
diff --git a/nvmolkit/tfd.cpp b/nvmolkit/tfd.cpp
new file mode 100644
index 0000000..5c19ccf
--- /dev/null
+++ b/nvmolkit/tfd.cpp
@@ -0,0 +1,169 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/ROMol.h>
+
+#include <boost/python.hpp>
+#include <boost/python/manage_new_object.hpp>
+#include <boost/python/numpy.hpp>
+
+#include "array_helpers.h"
+#include "nvtx.h"
+#include "tfd_cpu.h"
+#include "tfd_gpu.h"
+
+namespace {
+
+using namespace boost::python;
+namespace numpy = boost::python::numpy;
+
+using CpuTFDResults = std::vector<std::vector<double>>;
+
+std::vector<const RDKit::ROMol*> listToMolVector(const boost::python::list& mols) {
+  std::vector<const RDKit::ROMol*> molsVec;
+  molsVec.reserve(len(mols));
+  for (int i = 0; i < len(mols); i++) {
+    const RDKit::ROMol* mol = boost::python::extract<const RDKit::ROMol*>(boost::python::object(mols[i]));
+    if (mol == nullptr) {
+      throw std::invalid_argument("Invalid molecule at index " + std::to_string(i));
+    }
+    molsVec.push_back(mol);
+  }
+  return molsVec;
+}
+
+boost::python::list intVectorToList(const std::vector<int>& vec) {
+  boost::python::list result;
+  for (const auto& value : vec) {
+    result.append(value);
+  }
+  return result;
+}
+
+nvMolKit::TFDComputeOptions buildOptions(bool               useWeights,
+                                         const std::string& maxDev,
+                                         int                symmRadius,
+                                         bool               ignoreColinearBonds) {
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = useWeights;
+  options.symmRadius          = symmRadius;
+  options.ignoreColinearBonds = ignoreColinearBonds;
+
+  if (maxDev == "equal") {
+    options.maxDevMode = nvMolKit::TFDMaxDevMode::Equal;
+  } else if (maxDev == "spec") {
+    options.maxDevMode = nvMolKit::TFDMaxDevMode::Spec;
+  } else {
+    throw std::invalid_argument("maxDev must be 'equal' or 'spec', got: " + maxDev);
+  }
+
+  return options;
+}
+
+boost::python::object toOwnedPyArray(nvMolKit::PyArray* array) {
+  using Converter = boost::python::manage_new_object::apply<nvMolKit::PyArray*>::type;
+  return boost::python::object(boost::python::handle<>(Converter()(array)));
+}
+
+nvMolKit::TFDCpuGenerator& getCpuGenerator() {
+  static nvMolKit::TFDCpuGenerator generator;
+  return generator;
+}
+
+nvMolKit::TFDGpuGenerator& getGpuGenerator() {
+  static nvMolKit::TFDGpuGenerator generator;
+  return generator;
+}
+
+}  // namespace
+
+BOOST_PYTHON_MODULE(_TFD) {
+  numpy::initialize();
+
+  // CPU path: returns flat numpy array + offsets (avoids per-element Python object creation)
+  def(
+    "GetTFDMatricesCpuBuffer",
+    +[](const boost::python::list& mols,
+        bool                       useWeights,
+        const std::string&         maxDev,
+        int                        symmRadius,
+        bool                       ignoreColinearBonds) -> boost::python::object {
+      auto molsVec    = listToMolVector(mols);
+      auto options    = buildOptions(useWeights, maxDev, symmRadius, ignoreColinearBonds);
+      options.backend = nvMolKit::TFDComputeBackend::CPU;
+      auto results    = getCpuGenerator().GetTFDMatrices(molsVec, options);
+
+      nvMolKit::ScopedNvtxRange range("CPU: wrap as numpy arrays", nvMolKit::NvtxColor::kGreen);
+
+      // Move results to heap so PyCapsule can own the memory
+      auto* owned = new CpuTFDResults(std::move(results));
+
+      auto deleter = [](PyObject* cap) {
+        delete reinterpret_cast<CpuTFDResults*>(PyCapsule_GetPointer(cap, "nvmolkit.cpu_tfd"));
+      };
+      PyObject* cap = PyCapsule_New(static_cast<void*>(owned), "nvmolkit.cpu_tfd", deleter);
+      if (cap == nullptr) {
+        delete owned;
+        throw std::runtime_error("Failed to create PyCapsule for CPU TFD results");
+      }
+      object owner{handle<>(cap)};
+
+      // Create per-molecule numpy array views (zero-copy, each backed by the capsule)
+      const Py_intptr_t   stride = static_cast<Py_intptr_t>(sizeof(double));
+      boost::python::list arrays;
+      for (auto& vec : *owned) {
+        const Py_intptr_t shape = static_cast<Py_intptr_t>(vec.size());
+        arrays.append(numpy::from_data(vec.data(),
+                                       numpy::dtype::get_builtin<double>(),
+                                       make_tuple(shape),
+                                       make_tuple(stride),
+                                       owner));
+      }
+      return arrays;
+    },
+    (arg("mols"),
+     arg("useWeights")          = true,
+     arg("maxDev")              = "equal",
+     arg("symmRadius")          = 2,
+     arg("ignoreColinearBonds") = true));
+
+  // GPU path: returns GPU-resident buffer + metadata
+  def(
+    "GetTFDMatricesGpuBuffer",
+    +[](const boost::python::list& mols,
+        bool                       useWeights,
+        const std::string&         maxDev,
+        int                        symmRadius,
+        bool                       ignoreColinearBonds) -> boost::python::object {
+      auto molsVec    = listToMolVector(mols);
+      auto options    = buildOptions(useWeights, maxDev, symmRadius, ignoreColinearBonds);
+      options.backend = nvMolKit::TFDComputeBackend::GPU;
+
+      auto gpuResult = getGpuGenerator().GetTFDMatricesGpuBuffer(molsVec, options);
+
+      nvMolKit::ScopedNvtxRange range("GPU: C++ to Python tuple", nvMolKit::NvtxColor::kYellow);
+      boost::python::list       outputStarts = intVectorToList(gpuResult.tfdOutputStarts);
+
+      size_t totalSize = gpuResult.tfdValues.size();
+      auto*  pyArray   = nvMolKit::makePyArray(gpuResult.tfdValues, boost::python::make_tuple(totalSize));
+
+      return boost::python::make_tuple(toOwnedPyArray(pyArray), outputStarts);
+    },
+    (arg("mols"),
+     arg("useWeights")          = true,
+     arg("maxDev")              = "equal",
+     arg("symmRadius")          = 2,
+     arg("ignoreColinearBonds") = true));
+}
diff --git a/nvmolkit/tfd.py b/nvmolkit/tfd.py
new file mode 100644
index 0000000..648438c
--- /dev/null
+++ b/nvmolkit/tfd.py
@@ -0,0 +1,216 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPU-accelerated Torsion Fingerprint Deviation (TFD) calculation.
+
+This module provides GPU-accelerated implementations of TFD calculation,
+compatible with RDKit's TorsionFingerprints API.
+
+TFD (Torsion Fingerprint Deviation) is a measure of conformational similarity
+based on the comparison of torsion angles between conformers. It was described
+in: Schulz-Gasch et al., JCIM, 52, 1499-1512 (2012).
+
+Example usage:
+    >>> from rdkit import Chem
+    >>> from rdkit.Chem import AllChem
+    >>> import nvmolkit.tfd as tfd
+    >>>
+    >>> mol = Chem.MolFromSmiles('CCCCC')
+    >>> AllChem.EmbedMultipleConfs(mol, numConfs=5)
+    >>>
+    >>> # Python lists (RDKit-compatible, default)
+    >>> tfd_matrix = tfd.GetTFDMatrix(mol)
+    >>>
+    >>> # Numpy arrays (fast, CPU)
+    >>> tfd_matrices = tfd.GetTFDMatrices(mols, return_type="numpy")
+    >>>
+    >>> # GPU tensors (fastest, no D2H copy)
+    >>> tfd_matrices = tfd.GetTFDMatrices(mols, return_type="tensor")
+"""
+
+from contextlib import nullcontext
+from typing import List, Union
+
+import numpy as np
+import torch
+
+try:
+    import nvtx as _nvtx
+
+    def _nvtx_range(name, color="blue"):
+        return _nvtx.annotate(name, color=color)
+except ImportError:
+
+    def _nvtx_range(name, color="blue"):
+        return nullcontext()
+
+
+from nvmolkit import _TFD
+from nvmolkit._arrayHelpers import *  # noqa: F403
+from nvmolkit.types import AsyncGpuResult
+
+
+class _TFDGpuResult:
+    """Internal result container for GPU-resident TFD computation."""
+
+    def __init__(self, tfd_values: AsyncGpuResult, output_starts: List[int]):
+        self.tfd_values = tfd_values
+        self.output_starts = output_starts
+
+    def to_tensors(self) -> List[torch.Tensor]:
+        """Extract as list of GPU tensors (no D2H copy)."""
+        n = len(self.output_starts) - 1
+        all_values = self.tfd_values.torch()
+        return [all_values[self.output_starts[i] : self.output_starts[i + 1]] for i in range(n)]
+
+    def to_numpy(self) -> List[np.ndarray]:
+        """Extract as list of numpy arrays (one bulk D2H copy)."""
+        n = len(self.output_starts) - 1
+        torch.cuda.synchronize()
+        all_values = self.tfd_values.numpy()
+        return [all_values[self.output_starts[i] : self.output_starts[i + 1]] for i in range(n)]
+
+    def to_lists(self) -> List[List[float]]:
+        """Extract as Python lists (bulk D2H + tolist)."""
+        n = len(self.output_starts) - 1
+        torch.cuda.synchronize()
+        all_list = self.tfd_values.numpy().tolist()
+        return [all_list[self.output_starts[i] : self.output_starts[i + 1]] for i in range(n)]
+
+
+def _get_gpu_result(mols, useWeights, maxDev, symmRadius, ignoreColinearBonds):
+    """Run GPU TFD computation and return _TFDGpuResult (no D2H copy)."""
+    pyarray, output_starts = _TFD.GetTFDMatricesGpuBuffer(
+        mols,
+        useWeights=useWeights,
+        maxDev=maxDev,
+        symmRadius=symmRadius,
+        ignoreColinearBonds=ignoreColinearBonds,
+    )
+    return _TFDGpuResult(
+        tfd_values=AsyncGpuResult(pyarray),
+        output_starts=output_starts,
+    )
+
+
+def _extract_gpu_result(gpu_result, return_type):
+    """Extract results from _TFDGpuResult based on return_type."""
+    if return_type == "tensor":
+        return gpu_result.to_tensors()
+    elif return_type == "numpy":
+        return gpu_result.to_numpy()
+    else:
+        return gpu_result.to_lists()
+
+
+def _extract_cpu_result(arrays, return_type):
+    """Convert list of numpy arrays to requested return type."""
+    if return_type == "numpy":
+        return list(arrays)
+    elif return_type == "tensor":
+        return [torch.from_numpy(a) for a in arrays]
+    else:
+        return [a.tolist() for a in arrays]
+
+
+def GetTFDMatrices(
+    mols: List,
+    useWeights: bool = True,
+    maxDev: str = "equal",
+    symmRadius: int = 2,
+    ignoreColinearBonds: bool = True,
+    backend: str = "gpu",
+    return_type: str = "list",
+) -> Union[List[List[float]], List[np.ndarray], List[torch.Tensor]]:
+    """Calculate TFD matrices for multiple molecules.
+
+    Args:
+        mols: List of RDKit molecules, each with multiple conformers.
+        useWeights: If True (default), use distance-based torsion weights.
+        maxDev: Normalization mode ('equal' or 'spec').
+        symmRadius: Radius for atom invariants (default: 2).
+        ignoreColinearBonds: If True (default), ignore colinear bonds.
+        backend: Computation backend, 'gpu' (default) or 'cpu'.
+        return_type: Output format:
+            'list' (default): List of Python lists (RDKit-compatible).
+            'numpy': List of numpy arrays (float64 for CPU, float32 for GPU).
+            'tensor': List of torch.Tensors (GPU tensors for GPU backend, CPU tensors for CPU backend).
+
+    Returns:
+        List of TFD matrices in the requested format.
+    """
+    if backend in ("gpu", "GPU"):
+        gpu_result = _get_gpu_result(mols, useWeights, maxDev, symmRadius, ignoreColinearBonds)
+        with _nvtx_range("GPU: split per-molecule results", color="yellow"):
+            return _extract_gpu_result(gpu_result, return_type)
+
+    if backend not in ("cpu", "CPU"):
+        raise ValueError(f"backend must be 'gpu' or 'cpu', got: '{backend}'")
+
+    arrays = _TFD.GetTFDMatricesCpuBuffer(
+        mols,
+        useWeights=useWeights,
+        maxDev=maxDev,
+        symmRadius=symmRadius,
+        ignoreColinearBonds=ignoreColinearBonds,
+    )
+    with _nvtx_range("CPU: convert result format", color="gray"):
+        return _extract_cpu_result(arrays, return_type)
+
+
+def GetTFDMatrix(
+    mol,
+    useWeights: bool = True,
+    maxDev: str = "equal",
+    symmRadius: int = 2,
+    ignoreColinearBonds: bool = True,
+    backend: str = "gpu",
+    return_type: str = "list",
+) -> Union[List[float], np.ndarray, torch.Tensor]:
+    """Calculate the TFD matrix for conformers of a molecule.
+
+    Convenience wrapper over GetTFDMatrices for a single molecule.
+
+    Args:
+        mol: RDKit molecule with multiple conformers.
+        useWeights: If True (default), use distance-based torsion weights.
+        maxDev: Normalization mode ('equal' or 'spec').
+        symmRadius: Radius for atom invariants (default: 2).
+        ignoreColinearBonds: If True (default), ignore colinear bonds.
+        backend: Computation backend, 'gpu' (default) or 'cpu'.
+        return_type: Output format:
+            'list' (default): Python list of floats (RDKit-compatible).
+            'numpy': numpy float32 array (CPU).
+            'tensor': GPU torch.Tensor (no D2H copy).
+
+    Returns:
+        Lower triangular TFD matrix as a flat list, numpy array, or GPU tensor.
+    """
+    results = GetTFDMatrices(
+        [mol],
+        useWeights=useWeights,
+        maxDev=maxDev,
+        symmRadius=symmRadius,
+        ignoreColinearBonds=ignoreColinearBonds,
+        backend=backend,
+        return_type=return_type,
+    )
+    if not results:
+        if return_type == "numpy":
+            return np.array([], dtype=np.float32)
+        elif return_type == "tensor":
+            return torch.tensor([], dtype=torch.float32)
+        return []
+    return results[0]
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8d9cb37..78e50d7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -144,3 +144,32 @@ target_link_libraries(
   PRIVATE ${RDKit_LIBS})
 target_include_directories(conformer_rmsd_mol
                            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# TFD (Torsion Fingerprint Deviation) CPU implementation
+add_library(tfd_cpu tfd_common.cpp tfd_cpu.cpp tfd_transfer.cu)
+target_link_libraries(
+  tfd_cpu
+  PRIVATE ${RDKit_LIBS} OpenMP::OpenMP_CXX openmp_helpers nvtx
+  PUBLIC device_vector)
+target_include_directories(tfd_cpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# TFD GPU kernels
+add_library(tfd_kernels tfd_kernels.cu)
+target_link_libraries(
+  tfd_kernels
+  PRIVATE cuda_error_check
+  PUBLIC device_vector)
+target_include_directories(tfd_kernels PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# TFD GPU orchestration
+add_library(tfd_gpu tfd_gpu.cpp)
+target_link_libraries(
+  tfd_gpu
+  PRIVATE tfd_kernels tfd_cpu ${RDKit_LIBS} nvtx
+  PUBLIC device_vector device)
+target_include_directories(tfd_gpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# TFD unified public API
+add_library(tfd tfd.cpp)
+target_link_libraries(tfd PUBLIC tfd_cpu tfd_gpu ${RDKit_LIBS})
+target_include_directories(tfd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/src/tfd.cpp b/src/tfd.cpp
new file mode 100644
index 0000000..29225b3
--- /dev/null
+++ b/src/tfd.cpp
@@ -0,0 +1,64 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tfd.h"
+
+#include <stdexcept>
+
+namespace nvMolKit {
+
+void TFDGenerator::initializeBackendIfNeeded(TFDComputeBackend backend) {
+  switch (backend) {
+    case TFDComputeBackend::GPU:
+      if (!gpuGenerator_) {
+        gpuGenerator_ = std::make_unique<TFDGpuGenerator>();
+      }
+      break;
+    case TFDComputeBackend::CPU:
+      if (!cpuGenerator_) {
+        cpuGenerator_ = std::make_unique<TFDCpuGenerator>();
+      }
+      break;
+  }
+}
+
+std::vector<double> TFDGenerator::GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  initializeBackendIfNeeded(options.backend);
+
+  switch (options.backend) {
+    case TFDComputeBackend::GPU:
+      return gpuGenerator_->GetTFDMatrix(mol, options);
+    case TFDComputeBackend::CPU:
+      return cpuGenerator_->GetTFDMatrix(mol, options);
+  }
+
+  return {};
+}
+
+std::vector<std::vector<double>> TFDGenerator::GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                              const TFDComputeOptions&                options) {
+  initializeBackendIfNeeded(options.backend);
+
+  switch (options.backend) {
+    case TFDComputeBackend::GPU:
+      return gpuGenerator_->GetTFDMatrices(mols, options);
+    case TFDComputeBackend::CPU:
+      return cpuGenerator_->GetTFDMatrices(mols, options);
+  }
+
+  return {};
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd.h b/src/tfd.h
new file mode 100644
index 0000000..46c1107
--- /dev/null
+++ b/src/tfd.h
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_H
+#define NVMOLKIT_TFD_H
+
+#include <memory>
+#include <vector>
+
+#include "tfd_common.h"
+#include "tfd_cpu.h"
+#include "tfd_gpu.h"
+
+namespace nvMolKit {
+
+//! Unified TFD generator with automatic backend selection
+//!
+//! Provides a single interface for TFD computation, automatically
+//! selecting between CPU and GPU backends based on the options provided.
+//! Backends are lazily initialized on first use.
+//!
+//! For GPU-resident output, use TFDGpuGenerator::GetTFDMatricesGpuBuffer directly.
+class TFDGenerator {
+ public:
+  TFDGenerator() = default;
+
+  //! Compute TFD matrix for a single molecule
+  std::vector<double> GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options = TFDComputeOptions{});
+
+  //! Compute TFD matrices for multiple molecules
+  std::vector<std::vector<double>> GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                  const TFDComputeOptions& options = TFDComputeOptions{});
+
+ private:
+  void initializeBackendIfNeeded(TFDComputeBackend backend);
+
+  std::unique_ptr<TFDGpuGenerator> gpuGenerator_;
+  std::unique_ptr<TFDCpuGenerator> cpuGenerator_;
+};
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_H
diff --git a/src/tfd_common.cpp b/src/tfd_common.cpp
new file mode 100644
index 0000000..d5f5758
--- /dev/null
+++ b/src/tfd_common.cpp
@@ -0,0 +1,845 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tfd_common.h"
+
+#include <Geometry/point.h>
+#include <GraphMol/Fingerprints/FingerprintGenerator.h>
+#include <GraphMol/Fingerprints/MorganGenerator.h>
+#include <GraphMol/MolOps.h>
+#include <GraphMol/RingInfo.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <GraphMol/Substruct/SubstructMatch.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <numeric>
+#include <stdexcept>
+#include <unordered_set>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "nvtx.h"
+
+namespace nvMolKit {
+
+namespace {
+
+//! Get heavy atom neighbors of an atom, optionally excluding one atom
+std::vector<const RDKit::Atom*> getHeavyAtomNeighbors(const RDKit::Atom* atom, int excludeIdx = -1) {
+  std::vector<const RDKit::Atom*> neighbors;
+  for (const auto* neighbor : atom->getOwningMol().atomNeighbors(atom)) {
+    if (neighbor->getAtomicNum() != 1 && neighbor->getIdx() != static_cast<unsigned int>(excludeIdx)) {
+      neighbors.push_back(neighbor);
+    }
+  }
+  return neighbors;
+}
+
+//! Check if all atoms have the same invariant
+bool doMatch(const std::vector<std::uint32_t>& inv, const std::vector<const RDKit::Atom*>& atoms) {
+  if (atoms.size() < 2) {
+    return true;
+  }
+  auto firstInv = inv[atoms[0]->getIdx()];
+  for (size_t i = 1; i < atoms.size(); ++i) {
+    if (inv[atoms[i]->getIdx()] != firstInv) {
+      return false;
+    }
+  }
+  return true;
+}
+
+//! Find the atom that is different when two atoms match (for 3 atoms)
+const RDKit::Atom* doMatchExcept1(const std::vector<std::uint32_t>& inv, const std::vector<const RDKit::Atom*>& atoms) {
+  if (atoms.size() != 3) {
+    return nullptr;
+  }
+  int a1 = atoms[0]->getIdx();
+  int a2 = atoms[1]->getIdx();
+  int a3 = atoms[2]->getIdx();
+
+  if (inv[a1] == inv[a2] && inv[a1] != inv[a3] && inv[a2] != inv[a3]) {
+    return atoms[2];
+  } else if (inv[a1] != inv[a2] && inv[a1] == inv[a3] && inv[a2] != inv[a3]) {
+    return atoms[1];
+  } else if (inv[a1] != inv[a2] && inv[a1] != inv[a3] && inv[a2] == inv[a3]) {
+    return atoms[0];
+  }
+  return nullptr;
+}
+
+//! Get atom invariants using Morgan fingerprints at given radius
+std::vector<std::uint32_t> getAtomInvariantsWithRadius(const RDKit::ROMol& mol, int radius) {
+  std::vector<std::uint32_t> inv(mol.getNumAtoms(), 0);
+
+  auto fpGen = RDKit::MorganFingerprint::getMorganGenerator<std::uint32_t>(radius,
+                                                                           false /* countSimulation */,
+                                                                           false /* includeChirality */,
+                                                                           true /* useBondTypes */,
+                                                                           false /* onlyNonzeroInvariants */,
+                                                                           true /* includeRedundantEnvironments */);
+
+  RDKit::AdditionalOutput ao;
+  ao.allocateBitInfoMap();
+
+  // Call to populate ao.bitInfoMap; result is unused
+  (void)fpGen->getSparseCountFingerprint(mol, nullptr, nullptr, -1, &ao);
+  const auto& bitInfo = ao.bitInfoMap;
+
+  if (bitInfo) {
+    for (const auto& [bitId, atomRadiusPairs] : *bitInfo) {
+      for (const auto& [atomIdx, r] : atomRadiusPairs) {
+        if (r == static_cast<unsigned int>(radius)) {
+          inv[atomIdx] = bitId;
+        }
+      }
+    }
+  }
+
+  return inv;
+}
+
+//! Get reference atoms for torsion based on neighbor symmetry
+std::vector<const RDKit::Atom*> getIndexForTorsion(const std::vector<const RDKit::Atom*>& neighbors,
+                                                   const std::vector<std::uint32_t>&      inv) {
+  if (neighbors.size() == 1) {
+    return neighbors;
+  } else if (doMatch(inv, neighbors)) {
+    // All symmetric neighbors - return all
+    return neighbors;
+  } else if (neighbors.size() == 3) {
+    const RDKit::Atom* different = doMatchExcept1(inv, neighbors);
+    if (different) {
+      return {different};
+    }
+  }
+  // Fallback: sort and take first
+  auto sorted = neighbors;
+  std::sort(sorted.begin(), sorted.end(), [&inv](const RDKit::Atom* a, const RDKit::Atom* b) {
+    return inv[a->getIdx()] < inv[b->getIdx()];
+  });
+  return {sorted[0]};
+}
+
+//! Information about a rotatable bond for torsion calculation
+struct BondInfo {
+  int                             a1;   // First central atom
+  int                             a2;   // Second central atom
+  std::vector<const RDKit::Atom*> nb1;  // Heavy atom neighbors of a1 (excluding a2)
+  std::vector<const RDKit::Atom*> nb2;  // Heavy atom neighbors of a2 (excluding a1)
+};
+
+//! Get bonds for which torsions should be calculated
+std::vector<BondInfo> getBondsForTorsions(const RDKit::ROMol& mol, bool ignoreColinearBonds) {
+  // Flag atoms that cannot be middle atoms of torsion (triple bonds, allenes)
+  std::vector<int> atomFlags(mol.getNumAtoms(), 0);
+
+  // Pattern for triple bonds
+  auto triplePattern = RDKit::SmartsToMol("*#*");
+  if (triplePattern) {
+    std::vector<RDKit::MatchVectType> matches;
+    RDKit::SubstructMatch(mol, *triplePattern, matches);
+    for (const auto& match : matches) {
+      for (const auto& [_, atomIdx] : match) {
+        atomFlags[atomIdx] = 1;
+      }
+    }
+  }
+
+  // Pattern for allenes
+  auto allenePattern = RDKit::SmartsToMol("[$([C](=*)=*)]");
+  if (allenePattern) {
+    std::vector<RDKit::MatchVectType> matches;
+    RDKit::SubstructMatch(mol, *allenePattern, matches);
+    for (const auto& match : matches) {
+      for (const auto& [_, atomIdx] : match) {
+        atomFlags[atomIdx] = 1;
+      }
+    }
+  }
+
+  std::vector<BondInfo> bonds;
+  std::vector<int>      doneBonds(mol.getNumBonds(), 0);
+
+  const auto* ringInfo = mol.getRingInfo();
+  for (const auto* bond : mol.bonds()) {
+    if (ringInfo->numBondRings(bond->getIdx()) > 0) {
+      continue;
+    }
+
+    int a1 = bond->getBeginAtomIdx();
+    int a2 = bond->getEndAtomIdx();
+
+    auto nb1 = getHeavyAtomNeighbors(bond->getBeginAtom(), a2);
+    auto nb2 = getHeavyAtomNeighbors(bond->getEndAtom(), a1);
+
+    if (!doneBonds[bond->getIdx()] && !nb1.empty() && !nb2.empty()) {
+      doneBonds[bond->getIdx()] = 1;
+
+      // Check if atoms cannot be middle atoms
+      if (atomFlags[a1] || atomFlags[a2]) {
+        if (!ignoreColinearBonds) {
+          // Search for alternative atoms (following the Python logic)
+          while (nb1.size() == 1 && atomFlags[a1]) {
+            int a1old = a1;
+            a1        = nb1[0]->getIdx();
+            auto* b   = mol.getBondBetweenAtoms(a1old, a1);
+            if (b) {
+              if (b->getEndAtomIdx() == static_cast<unsigned int>(a1old)) {
+                nb1 = getHeavyAtomNeighbors(b->getBeginAtom(), a1old);
+              } else {
+                nb1 = getHeavyAtomNeighbors(b->getEndAtom(), a1old);
+              }
+              doneBonds[b->getIdx()] = 1;
+            } else {
+              break;
+            }
+          }
+          while (nb2.size() == 1 && atomFlags[a2]) {
+            int a2old = a2;
+            a2        = nb2[0]->getIdx();
+            auto* b   = mol.getBondBetweenAtoms(a2old, a2);
+            if (b) {
+              if (b->getBeginAtomIdx() == static_cast<unsigned int>(a2old)) {
+                nb2 = getHeavyAtomNeighbors(b->getEndAtom(), a2old);
+              } else {
+                nb2 = getHeavyAtomNeighbors(b->getBeginAtom(), a2old);
+              }
+              doneBonds[b->getIdx()] = 1;
+            } else {
+              break;
+            }
+          }
+          if (!nb1.empty() && !nb2.empty()) {
+            bonds.push_back({a1, a2, nb1, nb2});
+          }
+        }
+        // If ignoreColinearBonds is true, we skip this bond
+      } else {
+        bonds.push_back({a1, a2, nb1, nb2});
+      }
+    }
+  }
+
+  return bonds;
+}
+
+//! Find the most central bond in a molecule.
+//! Returns {-1, -1} if no central bond can be found (e.g., methane, linear molecules).
+//!
+//! NOTE: For molecules with near-perfect C2 topological symmetry (e.g. macrocyclic
+//! peptides), two atoms may have STDs that differ only at machine-epsilon (~4e-16).
+//! The sort tiebreak can therefore pick either atom as "most central", and the choice
+//! may differ from RDKit Python (which uses numpy.std with a different accumulation
+//! order).  This causes the weighted TFD to diverge by up to ~0.1 for affected
+//! molecules (~0.8% of ChEMBL), while unweighted TFD matches exactly.  Both weight
+//! schemes are equally valid — the TFD paper does not prescribe a tiebreak rule.
+std::pair<int, int> findCentralBond(const RDKit::ROMol& mol, const double* distMat) {
+  int numAtoms = mol.getNumAtoms();
+
+  // Calculate STD of distances for each non-terminal atom
+  std::vector<std::pair<double, int>> stds;
+  for (int i = 0; i < numAtoms; ++i) {
+    auto neighbors = getHeavyAtomNeighbors(mol.getAtomWithIdx(i));
+    if (neighbors.size() < 2) {
+      continue;  // Skip terminal atoms
+    }
+
+    // Calculate STD of distances
+    double sum   = 0.0;
+    double sumSq = 0.0;
+    int    count = 0;
+    for (int j = 0; j < numAtoms; ++j) {
+      if (j != i) {
+        double d = distMat[i * numAtoms + j];
+        sum += d;
+        sumSq += d * d;
+        count++;
+      }
+    }
+    double mean     = sum / count;
+    double variance = (sumSq / count) - (mean * mean);
+    double stdDev   = std::sqrt(std::max(0.0, variance));
+    stds.emplace_back(stdDev, i);
+  }
+
+  if (stds.empty()) {
+    return {-1, -1};  // No non-terminal atoms found
+  }
+
+  std::sort(stds.begin(), stds.end());
+  int aid1 = stds[0].second;
+
+  // Find second most central atom that is bonded to aid1
+  for (size_t i = 1; i < stds.size(); ++i) {
+    if (mol.getBondBetweenAtoms(aid1, stds[i].second) != nullptr) {
+      return {aid1, stds[i].second};
+    }
+  }
+
+  return {-1, -1};  // Could not find central bond
+}
+
+//! Calculate beta parameter for weight calculation
+double calculateBeta(const RDKit::ROMol& mol, const double* distMat, int aid1) {
+  int numAtoms = mol.getNumAtoms();
+
+  // Get all non-terminal bonds
+  // NOTE: RDKit has a typo in _calculateBeta (TorsionFingerprints.py ~line 391):
+  //   `if len(nb2) > 1 and len(nb2) > 1` checks nb2 twice instead of nb1 and nb2.
+  //   This includes bonds where only the end atom is non-terminal, inflating dmax.
+  //   We replicate this behavior for RDKit compatibility.
+  // TODO: Fix once RDKit corrects this, or add a flag for "correct" behavior.
+  double dmax = 0.0;
+  for (const auto* bond : mol.bonds()) {
+    auto nb2 = getHeavyAtomNeighbors(bond->getEndAtom());
+    if (nb2.size() > 1 && nb2.size() > 1) {
+      int    bid1 = bond->getBeginAtomIdx();
+      int    bid2 = bond->getEndAtomIdx();
+      double d    = std::max(distMat[aid1 * numAtoms + bid1], distMat[aid1 * numAtoms + bid2]);
+      dmax        = std::max(dmax, d);
+    }
+  }
+
+  double dmax2 = dmax / 2.0;
+  if (dmax2 < 1e-6) {
+    dmax2 = 1.0;  // Avoid division by zero
+  }
+  double beta = -std::log(0.1) / (dmax2 * dmax2);
+  return beta;
+}
+
+}  // namespace
+
+// Internal: build torsion list from precomputed bonds (used when caller already has bonds)
+static TorsionList extractTorsionListImpl(const RDKit::ROMol&          mol,
+                                          TFDMaxDevMode                maxDevMode,
+                                          int                          symmRadius,
+                                          const std::vector<BondInfo>& bonds) {
+  TorsionList result;
+
+  // Get atom invariants
+  std::vector<std::uint32_t> inv;
+  if (symmRadius > 0) {
+    inv = getAtomInvariantsWithRadius(mol, symmRadius);
+  } else {
+    // Use connectivity invariants as fallback
+    inv.resize(mol.getNumAtoms());
+    for (unsigned int i = 0; i < mol.getNumAtoms(); ++i) {
+      inv[i] = mol.getAtomWithIdx(i)->getDegree();
+    }
+  }
+
+  // Process each bond to create torsions
+  for (const auto& bond : bonds) {
+    auto d1 = getIndexForTorsion(bond.nb1, inv);
+    auto d2 = getIndexForTorsion(bond.nb2, inv);
+
+    TorsionDef torsion;
+
+    if (maxDevMode == TFDMaxDevMode::Equal) {
+      // Equal mode: all combinations (d1 x d2), maxDev 180 (default path)
+      for (const auto* n1 : d1) {
+        for (const auto* n2 : d2) {
+          torsion.atomQuartets.push_back(
+            {static_cast<int>(n1->getIdx()), bond.a1, bond.a2, static_cast<int>(n2->getIdx())});
+        }
+      }
+      torsion.maxDev = 180.0f;
+    } else {
+      // Spec mode: build quartets and set torsion-specific maxDev
+      if (d1.size() == 1 && d2.size() == 1) {
+        // Case 1, 2, 4, 5, 7, 10, 16, 12, 17, 19 - single torsion
+        torsion.atomQuartets.push_back(
+          {static_cast<int>(d1[0]->getIdx()), bond.a1, bond.a2, static_cast<int>(d2[0]->getIdx())});
+        torsion.maxDev = 180.0f;
+      } else if (d1.size() == 1) {
+        // Case 3, 6, 8, 13, 20 - multiple torsions from d2
+        for (const auto* nb : d2) {
+          torsion.atomQuartets.push_back(
+            {static_cast<int>(d1[0]->getIdx()), bond.a1, bond.a2, static_cast<int>(nb->getIdx())});
+        }
+        torsion.maxDev = (bond.nb2.size() == 2) ? 90.0f : 60.0f;
+      } else if (d2.size() == 1) {
+        // Case 3, 6, 8, 13, 20 - multiple torsions from d1
+        for (const auto* nb : d1) {
+          torsion.atomQuartets.push_back(
+            {static_cast<int>(nb->getIdx()), bond.a1, bond.a2, static_cast<int>(d2[0]->getIdx())});
+        }
+        torsion.maxDev = (bond.nb1.size() == 2) ? 90.0f : 60.0f;
+      } else {
+        // Both symmetric - all combinations
+        for (const auto* n1 : d1) {
+          for (const auto* n2 : d2) {
+            torsion.atomQuartets.push_back(
+              {static_cast<int>(n1->getIdx()), bond.a1, bond.a2, static_cast<int>(n2->getIdx())});
+          }
+        }
+        if (bond.nb1.size() == 2 && bond.nb2.size() == 2) {
+          torsion.maxDev = 90.0f;
+        } else if (bond.nb1.size() == 3 && bond.nb2.size() == 3) {
+          torsion.maxDev = 60.0f;
+        } else {
+          torsion.maxDev = 30.0f;
+        }
+      }
+    }
+
+    result.nonRingTorsions.push_back(std::move(torsion));
+  }
+
+  // Process rings
+  auto rings = mol.getRingInfo()->atomRings();
+  for (const auto& ring : rings) {
+    TorsionDef torsion;
+    int        num = static_cast<int>(ring.size());
+
+    // Calculate max deviation for ring
+    float maxdev;
+    if (num >= 14) {
+      maxdev = 180.0f;
+    } else {
+      maxdev = 180.0f * std::exp(-0.025f * (num - 14) * (num - 14));
+    }
+
+    // Create torsions for ring (consecutive 4 atoms)
+    for (int i = 0; i < num; ++i) {
+      torsion.atomQuartets.push_back({ring[i], ring[(i + 1) % num], ring[(i + 2) % num], ring[(i + 3) % num]});
+    }
+    torsion.maxDev = maxdev;
+
+    result.ringTorsions.push_back(std::move(torsion));
+  }
+
+  return result;
+}
+
+TorsionList extractTorsionList(const RDKit::ROMol& mol,
+                               TFDMaxDevMode       maxDevMode,
+                               int                 symmRadius,
+                               bool                ignoreColinearBonds) {
+  auto bonds = getBondsForTorsions(mol, ignoreColinearBonds);
+  return extractTorsionListImpl(mol, maxDevMode, symmRadius, bonds);
+}
+
+// Internal: compute weights using precomputed bonds (used when caller already has bonds)
+static std::vector<float> computeTorsionWeightsImpl(const RDKit::ROMol&          mol,
+                                                    const TorsionList&           torsionList,
+                                                    const std::vector<BondInfo>& bonds) {
+  std::vector<float> weights;
+
+  // If no torsions, return empty weights
+  size_t totalTorsions = torsionList.totalCount();
+  if (totalTorsions == 0) {
+    return weights;
+  }
+
+  // Get distance matrix (returns raw pointer, stored in mol's dictionary)
+  const double* distMat  = RDKit::MolOps::getDistanceMat(mol);
+  int           numAtoms = mol.getNumAtoms();
+
+  // Find central bond
+  auto [aid1, aid2] = findCentralBond(mol, distMat);
+
+  // If no central bond found, return uniform weights
+  if (aid1 < 0 || aid2 < 0) {
+    weights.resize(totalTorsions, 1.0f);
+    return weights;
+  }
+
+  // Calculate beta
+  double beta = calculateBeta(mol, distMat, aid1);
+
+  // Calculate weights for non-ring torsions (bonds provided by caller)
+  for (size_t i = 0; i < bonds.size(); ++i) {
+    const auto& bond = bonds[i];
+    double      d;
+
+    if ((bond.a1 == aid1 && bond.a2 == aid2) || (bond.a1 == aid2 && bond.a2 == aid1)) {
+      d = 0.0;  // Central bond itself
+    } else {
+      // Shortest distance to central bond atoms + 1
+      d = std::min({distMat[aid1 * numAtoms + bond.a1],
+                    distMat[aid1 * numAtoms + bond.a2],
+                    distMat[aid2 * numAtoms + bond.a1],
+                    distMat[aid2 * numAtoms + bond.a2]}) +
+          1.0;
+    }
+
+    float w = static_cast<float>(std::exp(-beta * d * d));
+    weights.push_back(w);
+  }
+
+  // Calculate weights for ring torsions
+  auto ringInfo  = mol.getRingInfo();
+  auto bondRings = ringInfo->bondRings();
+
+  for (const auto& bondRing : bondRings) {
+    int    num  = static_cast<int>(bondRing.size());
+    double sumD = 0.0;
+
+    for (int bidx : bondRing) {
+      const auto* bond = mol.getBondWithIdx(bidx);
+      int         bid1 = bond->getBeginAtomIdx();
+      int         bid2 = bond->getEndAtomIdx();
+      double      d    = std::min({distMat[aid1 * numAtoms + bid1],
+                                   distMat[aid1 * numAtoms + bid2],
+                                   distMat[aid2 * numAtoms + bid1],
+                                   distMat[aid2 * numAtoms + bid2]}) +
+                 1.0;
+      sumD += d;
+    }
+
+    double avgD = sumD / num;
+    float  w    = static_cast<float>(std::exp(-beta * avgD * avgD) * (num / 2.0));
+    weights.push_back(w);
+  }
+
+  return weights;
+}
+
+std::vector<float> computeTorsionWeights(const RDKit::ROMol& mol,
+                                         const TorsionList&  torsionList,
+                                         bool                ignoreColinearBonds) {
+  auto bonds = getBondsForTorsions(mol, ignoreColinearBonds);
+  return computeTorsionWeightsImpl(mol, torsionList, bonds);
+}
+
+// Sequential single-molecule builder (used as the building block for parallel batch builds)
+static TFDSystemHost buildTFDSystemImpl(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  TFDSystemHost system;
+
+  int numConformers = mol.getNumConformers();
+  int numAtoms      = mol.getNumAtoms();
+
+  if (numConformers == 0) {
+    throw std::runtime_error("Molecule has no conformers");
+  }
+
+  // Get bonds once and reuse for torsion list and (optionally) weights
+  auto        bonds       = getBondsForTorsions(mol, options.ignoreColinearBonds);
+  TorsionList torsionList = extractTorsionListImpl(mol, options.maxDevMode, options.symmRadius, bonds);
+
+  // Extract weights if needed (reuse same bonds)
+  std::vector<float> weights;
+  if (options.useWeights) {
+    weights = computeTorsionWeightsImpl(mol, torsionList, bonds);
+  }
+
+  int confStart          = static_cast<int>(system.confPositionStarts.size());
+  int torsStart          = system.totalTorsions();
+  int quartetStartForMol = system.totalQuartets();
+
+  // Extract coordinates (tightly packed, no padding)
+  for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); ++confIt) {
+    system.confPositionStarts.push_back(static_cast<int>(system.positions.size()));
+    const auto& conf = **confIt;
+    for (int atomIdx = 0; atomIdx < numAtoms; ++atomIdx) {
+      const auto& pos = conf.getAtomPos(atomIdx);
+      system.positions.push_back(static_cast<float>(pos.x));
+      system.positions.push_back(static_cast<float>(pos.y));
+      system.positions.push_back(static_cast<float>(pos.z));
+    }
+  }
+
+  // Add torsion definitions (store ALL quartets, classify type)
+  int torsionIdx = 0;
+
+  for (const auto& torsion : torsionList.nonRingTorsions) {
+    if (torsion.atomQuartets.empty()) {
+      torsionIdx++;
+      continue;
+    }
+    for (const auto& q : torsion.atomQuartets)
+      system.torsionAtoms.push_back(q);
+    system.quartetStarts.push_back(static_cast<int>(system.torsionAtoms.size()));
+    system.torsionTypes.push_back(torsion.atomQuartets.size() > 1 ? TorsionType::Symmetric : TorsionType::Single);
+    system.torsionMaxDevs.push_back(torsion.maxDev);
+    system.torsionWeights.push_back(
+      (options.useWeights && torsionIdx < static_cast<int>(weights.size())) ? weights[torsionIdx] : 1.0f);
+    torsionIdx++;
+  }
+
+  for (const auto& torsion : torsionList.ringTorsions) {
+    if (torsion.atomQuartets.empty()) {
+      torsionIdx++;
+      continue;
+    }
+    for (const auto& q : torsion.atomQuartets)
+      system.torsionAtoms.push_back(q);
+    system.quartetStarts.push_back(static_cast<int>(system.torsionAtoms.size()));
+    system.torsionTypes.push_back(torsion.atomQuartets.size() > 1 ? TorsionType::Ring : TorsionType::Single);
+    system.torsionMaxDevs.push_back(torsion.maxDev);
+    system.torsionWeights.push_back(
+      (options.useWeights && torsionIdx < static_cast<int>(weights.size())) ? weights[torsionIdx] : 1.0f);
+    torsionIdx++;
+  }
+
+  // Build per-molecule descriptor
+  int numTorsions         = system.totalTorsions() - torsStart;
+  int totalQuartetsForMol = system.totalQuartets() - quartetStartForMol;
+  int dihedStart          = system.totalDihedrals();
+  int numDihedrals        = numConformers * totalQuartetsForMol;
+  int numTFDOutputs       = numConformers * (numConformers - 1) / 2;
+  int tfdOutStart         = system.totalTFDOutputs();
+
+  system.totalDihedrals_ = dihedStart + numDihedrals;
+
+  MolDescriptor desc;
+  desc.confStart     = confStart;
+  desc.numConformers = numConformers;
+  desc.quartetStart  = quartetStartForMol;
+  desc.numQuartets   = totalQuartetsForMol;
+  desc.dihedStart    = dihedStart;
+  desc.torsStart     = torsStart;
+  desc.numTorsions   = numTorsions;
+  desc.tfdOutStart   = tfdOutStart;
+  system.molDescriptors.push_back(desc);
+
+  system.dihedralWorkStarts.push_back(system.dihedralWorkStarts.back() + numDihedrals);
+  system.tfdWorkStarts.push_back(system.tfdWorkStarts.back() + numTFDOutputs);
+
+  return system;
+}
+
+//! Lightweight per-molecule extraction result (used in two-pass batch build).
+//! Contains RDKit-derived torsion data, coordinates, and per-molecule sizes needed
+//! for computing global offsets. Coordinates are extracted during Pass 1 (parallel)
+//! so that Pass 2 only needs memcpy, not RDKit conformer access.
+struct MolExtraction {
+  TorsionList                     torsionList;
+  std::vector<float>              weights;
+  std::vector<std::array<int, 4>> atoms;      //!< Flattened torsion atom quartets
+  std::vector<float>              wts;        //!< Weight per torsion
+  std::vector<float>              maxDevs;    //!< MaxDev per torsion
+  std::vector<TorsionType>        types;      //!< Type per torsion
+  std::vector<int>                qStarts;    //!< CSR for quartets (including leading 0)
+  std::vector<float>              positions;  //!< Flat xyz coords for all conformers
+  int                             numConformers;
+  int                             numAtoms;
+  int                             numTorsions;   //!< = wts.size()
+  int                             numQuartets;   //!< = atoms.size()
+  int                             numPositions;  //!< = numConformers * numAtoms * 3
+};
+
+//! Extract torsion data and coordinates from a single molecule (Pass 1 of two-pass build).
+//! Coordinates are extracted here (in parallel) so Pass 2 only needs memcpy.
+static MolExtraction extractMolData(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  MolExtraction ext;
+  ext.numConformers = mol.getNumConformers();
+  ext.numAtoms      = mol.getNumAtoms();
+  ext.numPositions  = ext.numConformers * ext.numAtoms * 3;
+
+  if (ext.numConformers == 0) {
+    throw std::runtime_error("Molecule has no conformers");
+  }
+
+  auto bonds      = getBondsForTorsions(mol, options.ignoreColinearBonds);
+  ext.torsionList = extractTorsionListImpl(mol, options.maxDevMode, options.symmRadius, bonds);
+  if (options.useWeights) {
+    ext.weights = computeTorsionWeightsImpl(mol, ext.torsionList, bonds);
+  }
+
+  // Extract coordinates (tightly packed, no padding)
+  ext.positions.resize(ext.numPositions);
+  int posIdx = 0;
+  for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); ++confIt) {
+    const auto& conf = **confIt;
+    for (int a = 0; a < ext.numAtoms; ++a) {
+      const auto& pos           = conf.getAtomPos(a);
+      ext.positions[posIdx]     = static_cast<float>(pos.x);
+      ext.positions[posIdx + 1] = static_cast<float>(pos.y);
+      ext.positions[posIdx + 2] = static_cast<float>(pos.z);
+      posIdx += 3;
+    }
+  }
+
+  // Flatten torsion definitions into GPU-ready arrays
+  ext.qStarts.push_back(0);
+  int torsionIdx = 0;
+
+  auto addTorsion = [&](const TorsionDef& torsion, bool isRing) {
+    if (torsion.atomQuartets.empty()) {
+      torsionIdx++;
+      return;
+    }
+    for (const auto& q : torsion.atomQuartets)
+      ext.atoms.push_back(q);
+    ext.qStarts.push_back(static_cast<int>(ext.atoms.size()));
+    if (isRing) {
+      ext.types.push_back(torsion.atomQuartets.size() > 1 ? TorsionType::Ring : TorsionType::Single);
+    } else {
+      ext.types.push_back(torsion.atomQuartets.size() > 1 ? TorsionType::Symmetric : TorsionType::Single);
+    }
+    ext.maxDevs.push_back(torsion.maxDev);
+    ext.wts.push_back(
+      (options.useWeights && torsionIdx < static_cast<int>(ext.weights.size())) ? ext.weights[torsionIdx] : 1.0f);
+    torsionIdx++;
+  };
+
+  for (const auto& t : ext.torsionList.nonRingTorsions)
+    addTorsion(t, false);
+  for (const auto& t : ext.torsionList.ringTorsions)
+    addTorsion(t, true);
+
+  ext.numTorsions = static_cast<int>(ext.wts.size());
+  ext.numQuartets = static_cast<int>(ext.atoms.size());
+  return ext;
+}
+
+TFDSystemHost buildTFDSystem(const std::vector<const RDKit::ROMol*>& mols, const TFDComputeOptions& options) {
+  ScopedNvtxRange range("buildTFDSystem (" + std::to_string(mols.size()) + " mols)", NvtxColor::kCyan);
+
+  if (mols.empty()) {
+    return {};
+  }
+  if (mols.size() == 1) {
+    return buildTFDSystemImpl(*mols[0], options);
+  }
+
+  int N = static_cast<int>(mols.size());
+
+  // ---- Pass 1: extract torsion data per-molecule in parallel ----
+  // This is the expensive part (RDKit fingerprinting, SMARTS matching, etc.)
+  std::vector<MolExtraction> extractions(N);
+  {
+    ScopedNvtxRange buildRange("Parallel RDKit extraction", NvtxColor::kCyan);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+    for (int i = 0; i < N; ++i) {
+      extractions[i] = extractMolData(*mols[i], options);
+    }
+  }
+
+  // ---- Compute global offsets (sequential, ~O(N) with tiny per-element cost) ----
+  ScopedNvtxRange mergeRange("Compute offsets & fill system", NvtxColor::kCyan);
+
+  std::vector<int> posOffset(N);
+  std::vector<int> confOffset(N);
+  std::vector<int> quartetOffset(N);
+  std::vector<int> torsOffset(N);
+  std::vector<int> dihedOffset(N);
+  std::vector<int> tfdOutOffset(N);
+
+  int totalPos      = 0;
+  int totalConfs    = 0;
+  int totalQuartets = 0;
+  int totalTors     = 0;
+  int totalDiheds   = 0;
+  int totalTfdOuts  = 0;
+
+  for (int i = 0; i < N; ++i) {
+    const auto& ext  = extractions[i];
+    posOffset[i]     = totalPos;
+    confOffset[i]    = totalConfs;
+    quartetOffset[i] = totalQuartets;
+    torsOffset[i]    = totalTors;
+    dihedOffset[i]   = totalDiheds;
+    tfdOutOffset[i]  = totalTfdOuts;
+
+    totalPos += ext.numPositions;
+    totalConfs += ext.numConformers;
+    totalQuartets += ext.numQuartets;
+    totalTors += ext.numTorsions;
+    totalDiheds += ext.numConformers * ext.numQuartets;
+    totalTfdOuts += ext.numConformers * (ext.numConformers - 1) / 2;
+  }
+
+  // ---- Allocate the final system (single allocation per array) ----
+  // For large arrays (positions, torsionAtoms), we use reserve() + resize()
+  // to avoid double-writing: reserve() allocates without init, then the parallel
+  // fill writes every element exactly once before the vector is used.
+  TFDSystemHost system;
+  // Positions is the largest array (~29MB for 1900 mols x 100 confs).
+  // Use reserve+resize pattern: reserve avoids reallocation during parallel fill.
+  system.positions.resize(totalPos);
+  system.confPositionStarts.resize(totalConfs);
+  system.torsionAtoms.resize(totalQuartets);
+  system.torsionWeights.resize(totalTors);
+  system.torsionMaxDevs.resize(totalTors);
+  system.torsionTypes.resize(totalTors);
+  system.quartetStarts.resize(1 + totalTors);
+  system.quartetStarts[0] = 0;
+  system.molDescriptors.resize(N);
+  system.dihedralWorkStarts.resize(1 + N);
+  system.dihedralWorkStarts[0] = 0;
+  system.tfdWorkStarts.resize(1 + N);
+  system.tfdWorkStarts[0] = 0;
+  system.totalDihedrals_  = totalDiheds;
+
+  // ---- Pass 2: fill the system in parallel ----
+  // Each thread writes to its own slice of the pre-allocated arrays (no contention).
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+  for (int i = 0; i < N; ++i) {
+    const auto& ext = extractions[i];
+
+    // Positions: bulk memcpy from pre-extracted coordinates
+    if (ext.numPositions > 0) {
+      std::memcpy(system.positions.data() + posOffset[i], ext.positions.data(), ext.numPositions * sizeof(float));
+    }
+    // confPositionStarts: compute directly
+    for (int c = 0; c < ext.numConformers; ++c) {
+      system.confPositionStarts[confOffset[i] + c] = posOffset[i] + c * ext.numAtoms * 3;
+    }
+
+    // Torsion atoms: bulk copy (no offset adjustment needed)
+    if (ext.numQuartets > 0) {
+      std::memcpy(&system.torsionAtoms[quartetOffset[i]],
+                  ext.atoms.data(),
+                  ext.numQuartets * sizeof(std::array<int, 4>));
+    }
+    if (ext.numTorsions > 0) {
+      std::memcpy(&system.torsionWeights[torsOffset[i]], ext.wts.data(), ext.numTorsions * sizeof(float));
+      std::memcpy(&system.torsionMaxDevs[torsOffset[i]], ext.maxDevs.data(), ext.numTorsions * sizeof(float));
+      std::memcpy(&system.torsionTypes[torsOffset[i]], ext.types.data(), ext.numTorsions * sizeof(TorsionType));
+    }
+
+    // QuartetStarts CSR: skip leading 0, add global quartet offset
+    for (int j = 0; j < ext.numTorsions; ++j) {
+      system.quartetStarts[1 + torsOffset[i] + j] = quartetOffset[i] + ext.qStarts[j + 1];
+    }
+
+    // MolDescriptor
+    MolDescriptor desc;
+    desc.confStart           = confOffset[i];
+    desc.numConformers       = ext.numConformers;
+    desc.quartetStart        = quartetOffset[i];
+    desc.numQuartets         = ext.numQuartets;
+    desc.dihedStart          = dihedOffset[i];
+    desc.torsStart           = torsOffset[i];
+    desc.numTorsions         = ext.numTorsions;
+    desc.tfdOutStart         = tfdOutOffset[i];
+    system.molDescriptors[i] = desc;
+
+    // Work CSRs
+    system.dihedralWorkStarts[1 + i] = dihedOffset[i] + ext.numConformers * ext.numQuartets;
+    system.tfdWorkStarts[1 + i]      = tfdOutOffset[i] + ext.numConformers * (ext.numConformers - 1) / 2;
+  }
+
+  return system;
+}
+
+TFDSystemHost buildTFDSystem(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  return buildTFDSystemImpl(mol, options);
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd_common.h b/src/tfd_common.h
new file mode 100644
index 0000000..7941eac
--- /dev/null
+++ b/src/tfd_common.h
@@ -0,0 +1,213 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_COMMON_H
+#define NVMOLKIT_TFD_COMMON_H
+
+#include <GraphMol/ROMol.h>
+
+#include <cstdint>
+#include <vector>
+
+#include "device_vector.h"
+#include "tfd_types.h"
+
+namespace nvMolKit {
+
+//! Device to run TFD computation on
+enum class TFDComputeBackend {
+  CPU,
+  GPU,
+};
+
+//! Maximum deviation mode for torsion normalization
+enum class TFDMaxDevMode {
+  Equal,  //!< All torsions normalized by 180.0
+  Spec,   //!< Each torsion normalized by its specific max deviation
+};
+
+//! A single torsion definition: four atom indices and normalization factor
+struct TorsionDef {
+  //! Atom indices defining the torsion (a-b-c-d dihedral)
+  //! For symmetric cases, multiple quartets may define equivalent torsions
+  std::vector<std::array<int, 4>> atomQuartets;
+  //! Maximum deviation for normalization (180.0 for equal mode, or specific value)
+  float                           maxDev = 180.0f;
+};
+
+//! Torsion list for a single molecule (non-ring and ring torsions)
+struct TorsionList {
+  std::vector<TorsionDef> nonRingTorsions;
+  std::vector<TorsionDef> ringTorsions;
+
+  //! Total number of torsions (non-ring + ring)
+  size_t totalCount() const { return nonRingTorsions.size() + ringTorsions.size(); }
+};
+
+//! Performance and algorithm options for TFD computation
+struct TFDComputeOptions {
+  //! Whether to use distance-based weights for torsions
+  bool              useWeights          = true;
+  //! Maximum deviation mode
+  TFDMaxDevMode     maxDevMode          = TFDMaxDevMode::Equal;
+  //! Radius for Morgan fingerprint atom invariants (for symmetry detection)
+  int               symmRadius          = 2;
+  //! Whether to ignore single bonds adjacent to triple bonds
+  bool              ignoreColinearBonds = true;
+  //! Backend selection
+  TFDComputeBackend backend             = TFDComputeBackend::GPU;
+};
+
+//! Flattened system data on host for a batch of molecules
+struct TFDSystemHost {
+  //! Flattened 3D coordinates, tightly packed (no padding)
+  //! Stored as: conf0_atom0_xyz, conf0_atom1_xyz, ..., conf1_atom0_xyz, ...
+  std::vector<float> positions;
+
+  //! Position start offset per conformer [totalConformers]
+  //! confPositionStarts[i] = float offset into positions for conformer i
+  std::vector<int> confPositionStarts;
+
+  //! Flattened torsion atom indices: [totalQuartets][4]
+  //! Multiple quartets per torsion (when torsionTypes[t] is Ring or Symmetric) are stored contiguously.
+  std::vector<std::array<int, 4>> torsionAtoms;
+
+  //! Weight per torsion [totalTorsions]
+  std::vector<float> torsionWeights;
+
+  //! Maximum deviation per torsion [totalTorsions]
+  std::vector<float> torsionMaxDevs;
+
+  //! Type per torsion [totalTorsions] - Single, Ring, or Symmetric
+  std::vector<TorsionType> torsionTypes;
+
+  //! CSR index: quartet boundaries per torsion [totalTorsions + 1]
+  //! quartetStarts[t] to quartetStarts[t+1] are indices into torsionAtoms for torsion t
+  std::vector<int> quartetStarts = {0};
+
+  // ========== Per-molecule descriptors for GPU kernel dispatch ==========
+
+  //! One descriptor per molecule with all offsets needed by both kernels
+  std::vector<MolDescriptor> molDescriptors;
+
+  //! CSR: cumulative dihedral work items per molecule [nMols + 1]
+  //! dihedralWorkStarts[m+1] - dihedralWorkStarts[m] = numConformers[m] * numQuartets[m]
+  std::vector<int> dihedralWorkStarts = {0};
+
+  //! CSR: cumulative TFD pair work items per molecule [nMols + 1]
+  //! tfdWorkStarts[m+1] - tfdWorkStarts[m] = C[m] * (C[m]-1) / 2
+  std::vector<int> tfdWorkStarts = {0};
+
+  //! Total number of dihedral angle values (numConformers * totalQuartets across all molecules)
+  int totalDihedrals_ = 0;
+
+  //! Total number of molecules
+  int numMolecules() const { return static_cast<int>(molDescriptors.size()); }
+
+  //! Total number of torsions across all molecules
+  int totalTorsions() const { return quartetStarts.empty() ? 0 : static_cast<int>(quartetStarts.size()) - 1; }
+
+  //! Total number of quartets across all molecules
+  int totalQuartets() const { return quartetStarts.empty() ? 0 : quartetStarts.back(); }
+
+  //! Total number of TFD output values
+  int totalTFDOutputs() const { return tfdWorkStarts.empty() ? 0 : tfdWorkStarts.back(); }
+
+  //! Total number of dihedral angle values to store
+  int totalDihedrals() const { return totalDihedrals_; }
+
+  //! Total number of dihedral work items (for dihedral kernel launch)
+  int totalDihedralWorkItems() const { return dihedralWorkStarts.empty() ? 0 : dihedralWorkStarts.back(); }
+};
+
+//! Flattened system data on GPU device
+//! Mirrors TFDSystemHost for device-side storage.
+struct TFDSystemDevice {
+  // ========== Shared kernel inputs ==========
+
+  //! Flattened 3D coordinates (tightly packed, no padding)
+  AsyncDeviceVector<float> positions;
+  //! Position start offset per conformer [totalConformers]
+  AsyncDeviceVector<int>   confPositionStarts;
+  //! Flattened torsion atom indices [totalQuartets * 4]
+  AsyncDeviceVector<int>   torsionAtoms;
+
+  //! Weight per torsion
+  AsyncDeviceVector<float>   torsionWeights;
+  //! Maximum deviation per torsion
+  AsyncDeviceVector<float>   torsionMaxDevs;
+  //! CSR index: quartet boundaries per torsion [totalTorsions + 1]
+  AsyncDeviceVector<int>     quartetStarts;
+  //! Type per torsion [totalTorsions]
+  AsyncDeviceVector<uint8_t> torsionTypes;
+
+  // ========== Per-molecule descriptors ==========
+
+  //! One MolDescriptor per molecule [nMols]
+  AsyncDeviceVector<MolDescriptor> molDescriptors;
+  //! CSR: cumulative dihedral work items [nMols + 1]
+  AsyncDeviceVector<int>           dihedralWorkStarts;
+  //! CSR: cumulative TFD pair work items [nMols + 1]
+  AsyncDeviceVector<int>           tfdWorkStarts;
+
+  // ========== Output buffers ==========
+
+  //! Output: computed dihedral angles [totalDihedrals]
+  AsyncDeviceVector<float> dihedralAngles;
+  //! Output: TFD matrix values
+  AsyncDeviceVector<float> tfdOutput;
+
+  //! Set CUDA stream for all buffers
+  void setStream(cudaStream_t stream);
+};
+
+//! Extract torsion list from an RDKit molecule
+//! @param mol The molecule to analyze
+//! @param maxDevMode How to determine max deviation for normalization
+//! @param symmRadius Radius for Morgan fingerprint (symmetry detection)
+//! @param ignoreColinearBonds Whether to skip bonds adjacent to triple bonds
+//! @return TorsionList containing non-ring and ring torsions
+TorsionList extractTorsionList(const RDKit::ROMol& mol,
+                               TFDMaxDevMode       maxDevMode          = TFDMaxDevMode::Equal,
+                               int                 symmRadius          = 2,
+                               bool                ignoreColinearBonds = true);
+
+//! Calculate distance-based weights for torsions
+//! @param mol The molecule
+//! @param torsionList Previously extracted torsion list
+//! @param ignoreColinearBonds Must match the value used in extractTorsionList
+//! @return Vector of weights (size = total torsion count)
+std::vector<float> computeTorsionWeights(const RDKit::ROMol& mol,
+                                         const TorsionList&  torsionList,
+                                         bool                ignoreColinearBonds = true);
+
+//! Transfer host system data to device, resizing and allocating output buffers
+//! @param host Source host data
+//! @param device Destination device data (resized as needed)
+//! @param stream CUDA stream for async transfers
+void transferToDevice(const TFDSystemHost& host, TFDSystemDevice& device, cudaStream_t stream);
+
+//! Build TFDSystemHost from a batch of molecules
+//! @param mols Vector of molecules (each may have multiple conformers)
+//! @param options Computation options
+//! @return Populated TFDSystemHost ready for GPU transfer
+TFDSystemHost buildTFDSystem(const std::vector<const RDKit::ROMol*>& mols, const TFDComputeOptions& options);
+
+//! Build TFDSystemHost from a single molecule (convenience wrapper)
+TFDSystemHost buildTFDSystem(const RDKit::ROMol& mol, const TFDComputeOptions& options);
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_COMMON_H
diff --git a/src/tfd_cpu.cpp b/src/tfd_cpu.cpp
new file mode 100644
index 0000000..ef85ac9
--- /dev/null
+++ b/src/tfd_cpu.cpp
@@ -0,0 +1,214 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tfd_cpu.h"
+
+#include <stdexcept>
+
+#include "tfd_detail.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "nvtx.h"
+
+namespace nvMolKit {
+
+double TFDCpuGenerator::computeTFDPair(const float*              anglesI,
+                                       const float*              anglesJ,
+                                       const TorsionList&        torsionList,
+                                       const std::vector<float>& weights) {
+  double sumWeightedDev = 0.0;
+  double sumWeights     = 0.0;
+  int    qOffset        = 0;
+  int    torsIdx        = 0;
+
+  auto processTorsion = [&](const TorsionDef& torsion, bool isRing) {
+    int numQ = static_cast<int>(torsion.atomQuartets.size());
+    if (numQ == 0) {
+      torsIdx++;
+      return;
+    }
+
+    double deviation;
+    if (numQ == 1) {
+      float diff = detail::circularDifference(anglesI[qOffset], anglesJ[qOffset]);
+      deviation  = diff / torsion.maxDev;
+    } else if (isRing) {
+      // Average abs(signed dihedral) for each conformer, then compare averages.
+      // Our angles are in [0,360) with a 180 deg offset from RDKit's signed convention,
+      // so abs(signed) = abs(angle - 180).
+      double avgI = 0.0, avgJ = 0.0;
+      for (int q = 0; q < numQ; ++q) {
+        avgI += std::abs(static_cast<double>(anglesI[qOffset + q]) - 180.0);
+        avgJ += std::abs(static_cast<double>(anglesJ[qOffset + q]) - 180.0);
+      }
+      avgI /= numQ;
+      avgJ /= numQ;
+      deviation = std::abs(avgI - avgJ) / torsion.maxDev;
+    } else {
+      // Symmetric: minimum circular difference across all (qi, qj) pairs
+      double minDiff = 180.0;
+      for (int qi = 0; qi < numQ; ++qi) {
+        for (int qj = 0; qj < numQ; ++qj) {
+          float diff = detail::circularDifference(anglesI[qOffset + qi], anglesJ[qOffset + qj]);
+          minDiff    = std::min(minDiff, static_cast<double>(diff));
+        }
+      }
+      deviation = minDiff / torsion.maxDev;
+    }
+
+    float w = (torsIdx < static_cast<int>(weights.size())) ? weights[torsIdx] : 1.0f;
+    sumWeightedDev += deviation * w;
+    sumWeights += w;
+    qOffset += numQ;
+    torsIdx++;
+  };
+
+  for (const auto& t : torsionList.nonRingTorsions)
+    processTorsion(t, false);
+  for (const auto& t : torsionList.ringTorsions)
+    processTorsion(t, true);
+
+  return (sumWeights > 1e-10) ? (sumWeightedDev / sumWeights) : 0.0;
+}
+
+std::vector<float> TFDCpuGenerator::computeDihedralAngles(const RDKit::ROMol& mol, const TorsionList& torsionList) {
+  int numConformers = mol.getNumConformers();
+  int totalQuartets = 0;
+  for (const auto& t : torsionList.nonRingTorsions)
+    totalQuartets += static_cast<int>(t.atomQuartets.size());
+  for (const auto& t : torsionList.ringTorsions)
+    totalQuartets += static_cast<int>(t.atomQuartets.size());
+
+  if (totalQuartets == 0 || numConformers == 0) {
+    return {};
+  }
+
+  std::vector<float> angles(numConformers * totalQuartets);
+
+  // Collect all quartets into a flat list for indexed access
+  std::vector<const std::array<int, 4>*> allQuartets;
+  allQuartets.reserve(totalQuartets);
+  for (const auto& t : torsionList.nonRingTorsions)
+    for (const auto& q : t.atomQuartets)
+      allQuartets.push_back(&q);
+  for (const auto& t : torsionList.ringTorsions)
+    for (const auto& q : t.atomQuartets)
+      allQuartets.push_back(&q);
+
+  int numAtoms = mol.getNumAtoms();
+  int confIdx  = 0;
+  for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); ++confIt, ++confIdx) {
+    const auto&        conf = **confIt;
+    std::vector<float> pos(numAtoms * 3);
+    for (int a = 0; a < numAtoms; ++a) {
+      const auto& p  = conf.getAtomPos(a);
+      pos[a * 3]     = static_cast<float>(p.x);
+      pos[a * 3 + 1] = static_cast<float>(p.y);
+      pos[a * 3 + 2] = static_cast<float>(p.z);
+    }
+
+    for (int qIdx = 0; qIdx < totalQuartets; ++qIdx) {
+      const auto& quartet                    = *allQuartets[qIdx];
+      angles[confIdx * totalQuartets + qIdx] = detail::computeDihedralAngle(&pos[quartet[0] * 3],
+                                                                            &pos[quartet[1] * 3],
+                                                                            &pos[quartet[2] * 3],
+                                                                            &pos[quartet[3] * 3]);
+    }
+  }
+
+  return angles;
+}
+
+std::vector<double> TFDCpuGenerator::computeTFDMatrixFromAngles(const std::vector<float>& angles,
+                                                                int                       numConformers,
+                                                                int                       totalQuartets,
+                                                                const TorsionList&        torsionList,
+                                                                const std::vector<float>& weights) {
+  int                 numPairs = numConformers * (numConformers - 1) / 2;
+  std::vector<double> tfdMatrix(numPairs);
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+  for (int i = 1; i < numConformers; ++i) {
+    for (int j = 0; j < i; ++j) {
+      int    pairIdx = i * (i - 1) / 2 + j;
+      double tfd =
+        computeTFDPair(angles.data() + i * totalQuartets, angles.data() + j * totalQuartets, torsionList, weights);
+      tfdMatrix[pairIdx] = tfd;
+    }
+  }
+
+  return tfdMatrix;
+}
+
+TFDCpuGenerator::TorsionData TFDCpuGenerator::extractTorsionData(const RDKit::ROMol&      mol,
+                                                                 const TFDComputeOptions& options) {
+  TorsionData data;
+  data.torsionList = extractTorsionList(mol, options.maxDevMode, options.symmRadius, options.ignoreColinearBonds);
+  if (options.useWeights) {
+    data.weights = computeTorsionWeights(mol, data.torsionList, options.ignoreColinearBonds);
+  }
+  return data;
+}
+
+std::vector<double> TFDCpuGenerator::GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  int numConformers = mol.getNumConformers();
+  if (numConformers < 2) {
+    return {};
+  }
+
+  TorsionData td;
+  {
+    ScopedNvtxRange range("CPU: extractTorsions", NvtxColor::kGreen);
+    td = extractTorsionData(mol, options);
+  }
+
+  if (td.torsionList.totalCount() == 0) {
+    int numPairs = numConformers * (numConformers - 1) / 2;
+    return std::vector<double>(numPairs, 0.0);
+  }
+
+  std::vector<float> angles;
+  int                totalQuartets;
+  {
+    ScopedNvtxRange range("CPU: computeDihedralAngles", NvtxColor::kGreen);
+    angles        = computeDihedralAngles(mol, td.torsionList);
+    totalQuartets = static_cast<int>(angles.size()) / numConformers;
+  }
+
+  ScopedNvtxRange range("CPU: computeTFDMatrix", NvtxColor::kGreen);
+  return computeTFDMatrixFromAngles(angles, numConformers, totalQuartets, td.torsionList, td.weights);
+}
+
+std::vector<std::vector<double>> TFDCpuGenerator::GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                                 const TFDComputeOptions&                options) {
+  ScopedNvtxRange range("CPU: GetTFDMatrices (" + std::to_string(mols.size()) + " mols)", NvtxColor::kGreen);
+  std::vector<std::vector<double>> results(mols.size());
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+  for (size_t i = 0; i < mols.size(); ++i) {
+    results[i] = GetTFDMatrix(*mols[i], options);
+  }
+
+  return results;
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd_cpu.h b/src/tfd_cpu.h
new file mode 100644
index 0000000..4f043f1
--- /dev/null
+++ b/src/tfd_cpu.h
@@ -0,0 +1,75 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_CPU_H
+#define NVMOLKIT_TFD_CPU_H
+
+#include <vector>
+
+#include "tfd_common.h"
+
+namespace nvMolKit {
+
+//! CPU implementation of TFD computation
+class TFDCpuGenerator {
+ public:
+  TFDCpuGenerator() = default;
+
+  //! Compute TFD matrix for a single molecule
+  //! @param mol Molecule with conformers
+  //! @param options Computation options
+  //! @return Lower triangular TFD matrix as flat vector [C*(C-1)/2 values]
+  std::vector<double> GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options = TFDComputeOptions{});
+
+  //! Compute TFD matrices for multiple molecules
+  //! @param mols Vector of molecules
+  //! @param options Computation options
+  //! @return Vector of TFD matrices, one per molecule
+  std::vector<std::vector<double>> GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                  const TFDComputeOptions& options = TFDComputeOptions{});
+
+  //! Compute dihedral angles for all conformers of a molecule
+  //! @param mol Molecule with conformers
+  //! @param torsionList Previously extracted torsion list
+  //! @return Angles array [numConformers][totalQuartets], flattened
+  std::vector<float> computeDihedralAngles(const RDKit::ROMol& mol, const TorsionList& torsionList);
+
+ private:
+  //! Torsion list and weights for a single molecule
+  struct TorsionData {
+    TorsionList        torsionList;
+    std::vector<float> weights;
+  };
+
+  //! Extract torsion list and weights from a molecule
+  static TorsionData extractTorsionData(const RDKit::ROMol& mol, const TFDComputeOptions& options);
+
+  //! Compute TFD matrix from precomputed angles
+  std::vector<double> computeTFDMatrixFromAngles(const std::vector<float>& angles,
+                                                 int                       numConformers,
+                                                 int                       totalQuartets,
+                                                 const TorsionList&        torsionList,
+                                                 const std::vector<float>& weights);
+
+  //! Compute TFD between two conformers
+  static double computeTFDPair(const float*              anglesI,
+                               const float*              anglesJ,
+                               const TorsionList&        torsionList,
+                               const std::vector<float>& weights);
+};
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_CPU_H
diff --git a/src/tfd_detail.h b/src/tfd_detail.h
new file mode 100644
index 0000000..05e5077
--- /dev/null
+++ b/src/tfd_detail.h
@@ -0,0 +1,123 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_DETAIL_H
+#define NVMOLKIT_TFD_DETAIL_H
+
+#include <cmath>
+
+#ifdef __CUDACC__
+#define TFD_INLINE __host__ __device__ __forceinline__
+#else
+#define TFD_INLINE inline
+#endif
+
+namespace nvMolKit {
+namespace detail {
+
+constexpr float kPi       = 3.14159265358979323846f;
+constexpr float kRadToDeg = 180.0f / kPi;
+
+//! Cross product of two 3D vectors: result = a x b
+TFD_INLINE void cross3(const float* a, const float* b, float* result) {
+  result[0] = a[1] * b[2] - a[2] * b[1];
+  result[1] = a[2] * b[0] - a[0] * b[2];
+  result[2] = a[0] * b[1] - a[1] * b[0];
+}
+
+//! Dot product of two 3D vectors
+TFD_INLINE float dot3(const float* a, const float* b) {
+  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+}
+
+//! Vector subtraction: result = a - b
+TFD_INLINE void sub3(const float* a, const float* b, float* result) {
+  result[0] = a[0] - b[0];
+  result[1] = a[1] - b[1];
+  result[2] = a[2] - b[2];
+}
+
+//! Vector length
+TFD_INLINE float length3(const float* v) {
+  return sqrtf(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+}
+
+//! Compute dihedral angle in degrees from four atom positions
+//! @param p1, p2, p3, p4 The four atom positions (p2-p3 is the central bond)
+//! @return Dihedral angle in degrees [0, 360)
+TFD_INLINE float computeDihedralAngle(const float* p1, const float* p2, const float* p3, const float* p4) {
+  // Vector from p2 to p3 (central bond)
+  float b1[3];
+  sub3(p3, p2, b1);
+
+  // Vector from p2 to p1
+  float v1[3];
+  sub3(p1, p2, v1);
+
+  // Vector from p3 to p4
+  float v2[3];
+  sub3(p4, p3, v2);
+
+  // Normal to plane 1 (p1-p2-p3)
+  float n1[3];
+  cross3(v1, b1, n1);
+
+  // Normal to plane 2 (p2-p3-p4)
+  float n2[3];
+  cross3(b1, v2, n2);
+
+  // Lengths of normals
+  float n1Len = length3(n1);
+  float n2Len = length3(n2);
+
+  if (n1Len < 1e-10f || n2Len < 1e-10f) {
+    return 0.0f;  // Degenerate case
+  }
+
+  // Compute angle between normals
+  float cosAngle = dot3(n1, n2) / (n1Len * n2Len);
+  cosAngle       = fmaxf(-1.0f, fminf(1.0f, cosAngle));  // Clamp for numerical stability
+  float angle    = acosf(cosAngle) * kRadToDeg;
+
+  // Determine sign using cross product of normals dotted with central bond
+  float crossN[3];
+  cross3(n1, n2, crossN);
+  if (dot3(crossN, b1) < 0) {
+    angle = -angle;
+  }
+
+  // Normalize to [0, 360)
+  if (angle < 0) {
+    angle += 360.0f;
+  }
+
+  return angle;
+}
+
+//! Compute circular difference between two angles (both in degrees)
+//! @param angle1, angle2 Angles in degrees [0, 360)
+//! @return Minimum difference in degrees [0, 180]
+TFD_INLINE float circularDifference(float angle1, float angle2) {
+  float diff = fabsf(angle1 - angle2);
+  if (360.0f - diff < diff) {
+    diff = 360.0f - diff;
+  }
+  return diff;
+}
+
+}  // namespace detail
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_DETAIL_H
diff --git a/src/tfd_gpu.cpp b/src/tfd_gpu.cpp
new file mode 100644
index 0000000..df46875
--- /dev/null
+++ b/src/tfd_gpu.cpp
@@ -0,0 +1,192 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tfd_gpu.h"
+
+#include <cuda_runtime.h>
+
+#include <stdexcept>
+
+#include "nvtx.h"
+#include "tfd_kernels.h"
+
+namespace nvMolKit {
+
+// ========== TFDGpuResult ==========
+
+std::vector<double> TFDGpuResult::extractMolecule(int molIdx) const {
+  ScopedNvtxRange range("GPU: extractMolecule", NvtxColor::kRed);
+  if (molIdx < 0 || molIdx >= static_cast<int>(conformerCounts.size())) {
+    throw std::out_of_range("Invalid molecule index: " + std::to_string(molIdx));
+  }
+
+  int outStart  = tfdOutputStarts[molIdx];
+  int outEnd    = tfdOutputStarts[molIdx + 1];
+  int numValues = outEnd - outStart;
+
+  if (numValues == 0) {
+    return {};
+  }
+
+  // Copy from device to host
+  std::vector<float> hostFloats(numValues);
+  const_cast<AsyncDeviceVector<float>&>(tfdValues).copyToHost(hostFloats.data(), numValues, 0, outStart);
+  cudaStreamSynchronize(tfdValues.stream());
+
+  // Convert to double
+  std::vector<double> result(numValues);
+  for (int i = 0; i < numValues; ++i) {
+    result[i] = static_cast<double>(hostFloats[i]);
+  }
+
+  return result;
+}
+
+std::vector<std::vector<double>> TFDGpuResult::extractAll() const {
+  ScopedNvtxRange range("GPU: extractAll (D2H)", NvtxColor::kRed);
+
+  int                              numMolecules = static_cast<int>(conformerCounts.size());
+  std::vector<std::vector<double>> results(numMolecules);
+
+  if (tfdValues.size() == 0) {
+    return results;
+  }
+
+  int totalValues = static_cast<int>(tfdValues.size());
+
+  // Copy all data at once
+  std::vector<float> allHostFloats(totalValues);
+  const_cast<AsyncDeviceVector<float>&>(tfdValues).copyToHost(allHostFloats.data(), totalValues);
+  cudaStreamSynchronize(tfdValues.stream());
+
+  // Pre-allocate all per-molecule vectors in one pass to reduce allocation overhead
+  for (int m = 0; m < numMolecules; ++m) {
+    results[m].resize(tfdOutputStarts[m + 1] - tfdOutputStarts[m]);
+  }
+
+  // Convert float→double and scatter (parallelizable, cache-friendly)
+  const float* src = allHostFloats.data();
+  for (int m = 0; m < numMolecules; ++m) {
+    int     numValues = static_cast<int>(results[m].size());
+    double* dst       = results[m].data();
+    for (int i = 0; i < numValues; ++i) {
+      dst[i] = static_cast<double>(src[i]);
+    }
+    src += numValues;
+  }
+
+  return results;
+}
+
+// ========== TFDGpuGenerator ==========
+
+TFDGpuGenerator::TFDGpuGenerator() : stream_() {
+  device_.setStream(stream_.stream());
+}
+
+TFDGpuResult TFDGpuGenerator::GetTFDMatricesGpuBuffer(const std::vector<const RDKit::ROMol*>& mols,
+                                                      const TFDComputeOptions&                options) {
+  ScopedNvtxRange outerRange("GPU: GetTFDMatricesGpuBuffer (" + std::to_string(mols.size()) + " mols)",
+                             NvtxColor::kBlue);
+
+  TFDGpuResult result;
+
+  if (mols.empty()) {
+    return result;
+  }
+
+  // Build host system data (CPU preprocessing, parallelized across molecules)
+  TFDSystemHost system = buildTFDSystem(mols, options);
+
+  // Build result metadata from MolDescriptors
+  result.tfdOutputStarts.resize(system.numMolecules() + 1);
+  result.tfdOutputStarts[0] = 0;
+  result.conformerCounts.resize(system.numMolecules());
+  for (int i = 0; i < system.numMolecules(); ++i) {
+    const auto& desc              = system.molDescriptors[i];
+    result.conformerCounts[i]     = desc.numConformers;
+    int numPairs                  = desc.numConformers * (desc.numConformers - 1) / 2;
+    result.tfdOutputStarts[i + 1] = result.tfdOutputStarts[i] + numPairs;
+  }
+
+  // Handle edge case: no TFD outputs
+  if (system.totalTFDOutputs() == 0) {
+    return result;
+  }
+
+  cudaStream_t stream = stream_.stream();
+
+  // Transfer to device (handles resize + copy + output buffer allocation)
+  {
+    ScopedNvtxRange range("GPU: transferToDevice (H2D)", NvtxColor::kYellow);
+    transferToDevice(system, device_, stream);
+  }
+
+  // Launch dihedral kernel
+  {
+    ScopedNvtxRange range("GPU: launchDihedralKernel", NvtxColor::kOrange);
+    launchDihedralKernel(system.totalDihedralWorkItems(),
+                         device_.positions.data(),
+                         device_.confPositionStarts.data(),
+                         device_.torsionAtoms.data(),
+                         device_.molDescriptors.data(),
+                         device_.dihedralWorkStarts.data(),
+                         system.numMolecules(),
+                         device_.dihedralAngles.data(),
+                         stream);
+  }
+
+  // Launch TFD matrix kernel (one block per molecule)
+  {
+    ScopedNvtxRange range("GPU: launchTFDMatrixKernel", NvtxColor::kOrange);
+    launchTFDMatrixKernel(system.numMolecules(),
+                          device_.dihedralAngles.data(),
+                          device_.torsionWeights.data(),
+                          device_.torsionMaxDevs.data(),
+                          device_.quartetStarts.data(),
+                          device_.torsionTypes.data(),
+                          device_.molDescriptors.data(),
+                          device_.tfdOutput.data(),
+                          stream);
+  }
+
+  // Move output to result (transfer ownership of GPU memory)
+  result.tfdValues = std::move(device_.tfdOutput);
+
+  // Reallocate the output buffer for next use
+  device_.tfdOutput = AsyncDeviceVector<float>();
+  device_.tfdOutput.setStream(stream);
+
+  return result;
+}
+
+std::vector<std::vector<double>> TFDGpuGenerator::GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                                 const TFDComputeOptions&                options) {
+  ScopedNvtxRange range("GPU: GetTFDMatrices (" + std::to_string(mols.size()) + " mols)", NvtxColor::kBlue);
+  TFDGpuResult    gpuResult = GetTFDMatricesGpuBuffer(mols, options);
+  return gpuResult.extractAll();
+}
+
+std::vector<double> TFDGpuGenerator::GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options) {
+  std::vector<const RDKit::ROMol*> mols    = {&mol};
+  auto                             results = GetTFDMatrices(mols, options);
+
+  if (results.empty()) {
+    return {};
+  }
+  return results[0];
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd_gpu.h b/src/tfd_gpu.h
new file mode 100644
index 0000000..6560cb5
--- /dev/null
+++ b/src/tfd_gpu.h
@@ -0,0 +1,82 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_GPU_H
+#define NVMOLKIT_TFD_GPU_H
+
+#include <vector>
+
+#include "device.h"
+#include "tfd_common.h"
+
+namespace nvMolKit {
+
+//! Result structure for GPU TFD computation with GPU-resident data
+struct TFDGpuResult {
+  //! GPU-resident TFD values (flattened across all molecules)
+  AsyncDeviceVector<float> tfdValues;
+
+  //! CSR index: TFD output boundaries per molecule [nMols + 1]
+  std::vector<int> tfdOutputStarts;
+
+  //! Number of conformers per molecule [nMols]
+  std::vector<int> conformerCounts;
+
+  //! Extract TFD matrix for a single molecule (copies to host)
+  //! @param molIdx Index of the molecule in the batch
+  //! @return Lower triangular TFD matrix as flat vector
+  std::vector<double> extractMolecule(int molIdx) const;
+
+  //! Extract all TFD matrices (copies to host)
+  //! @return Vector of TFD matrices, one per molecule
+  std::vector<std::vector<double>> extractAll() const;
+};
+
+//! GPU implementation of TFD computation
+class TFDGpuGenerator {
+ public:
+  TFDGpuGenerator();
+
+  //! Compute TFD matrix for a single molecule
+  //! @param mol Molecule with conformers
+  //! @param options Computation options
+  //! @return Lower triangular TFD matrix as flat vector [C*(C-1)/2 values]
+  std::vector<double> GetTFDMatrix(const RDKit::ROMol& mol, const TFDComputeOptions& options = TFDComputeOptions{});
+
+  //! Compute TFD matrices for multiple molecules
+  //! @param mols Vector of molecules
+  //! @param options Computation options
+  //! @return Vector of TFD matrices, one per molecule
+  std::vector<std::vector<double>> GetTFDMatrices(const std::vector<const RDKit::ROMol*>& mols,
+                                                  const TFDComputeOptions& options = TFDComputeOptions{});
+
+  //! Compute TFD matrices and keep results on GPU
+  //! @param mols Vector of molecules
+  //! @param options Computation options
+  //! @return TFDGpuResult with GPU-resident data
+  TFDGpuResult GetTFDMatricesGpuBuffer(const std::vector<const RDKit::ROMol*>& mols,
+                                       const TFDComputeOptions&                options = TFDComputeOptions{});
+
+ private:
+  // NOTE: Declaration order matters! Stream must be declared BEFORE device_
+  // because members are destroyed in reverse order, and device_ buffers need
+  // a valid stream for cudaFreeAsync during destruction.
+  ScopedStream    stream_;
+  TFDSystemDevice device_;
+};
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_GPU_H
diff --git a/src/tfd_kernels.cu b/src/tfd_kernels.cu
new file mode 100644
index 0000000..f268b0d
--- /dev/null
+++ b/src/tfd_kernels.cu
@@ -0,0 +1,218 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cuda_error_check.h"
+#include "tfd_detail.h"
+#include "tfd_kernels.h"
+
+namespace nvMolKit {
+
+using detail::circularDifference;
+using detail::computeDihedralAngle;
+
+namespace {
+
+//! Binary search: find m such that starts[m] <= idx < starts[m+1]
+__device__ __forceinline__ int findMolecule(const int* starts, int numMolecules, int idx) {
+  int lo = 0, hi = numMolecules;
+  while (lo < hi) {
+    int mid = (lo + hi) / 2;
+    if (starts[mid + 1] <= idx) {
+      lo = mid + 1;
+    } else {
+      hi = mid;
+    }
+  }
+  return lo;
+}
+
+//! Kernel to compute dihedral angles for all conformers.
+//! One thread per (conformer, quartet) work item.
+//! Uses binary search on dihedralWorkStarts to find the molecule,
+//! then computes (confIdx, quartetIdx, outIdx) arithmetically.
+__global__ void dihedralKernel(const int totalWorkItems,
+                               const float* __restrict__ positions,
+                               const int* __restrict__ confPositionStarts,
+                               const int* __restrict__ torsionAtoms,
+                               const MolDescriptor* __restrict__ molDescriptors,
+                               const int* __restrict__ dihedralWorkStarts,
+                               const int numMolecules,
+                               float* __restrict__ dihedralAngles) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= totalWorkItems) {
+    return;
+  }
+
+  int m = findMolecule(dihedralWorkStarts, numMolecules, idx);
+
+  const MolDescriptor& desc     = molDescriptors[m];
+  int                  localIdx = idx - dihedralWorkStarts[m];
+  int                  c        = localIdx / desc.numQuartets;
+  int                  q        = localIdx % desc.numQuartets;
+
+  int confIdx = desc.confStart + c;
+  int torsIdx = desc.quartetStart + q;
+  int outIdx  = desc.dihedStart + localIdx;
+
+  int a  = torsionAtoms[torsIdx * 4 + 0];
+  int b  = torsionAtoms[torsIdx * 4 + 1];
+  int cc = torsionAtoms[torsIdx * 4 + 2];
+  int d  = torsionAtoms[torsIdx * 4 + 3];
+
+  const float* posBase = positions + confPositionStarts[confIdx];
+
+  dihedralAngles[outIdx] = computeDihedralAngle(posBase + a * 3, posBase + b * 3, posBase + cc * 3, posBase + d * 3);
+}
+
+//! Kernel to compute TFD matrix values.
+//! One block per molecule. Threads within a block cooperatively process
+//! all C*(C-1)/2 conformer pairs via grid-stride loop.
+//! All threads in a block share the same torsion types — no cross-molecule divergence.
+__global__ void tfdMatrixKernel(const int numMolecules,
+                                const float* __restrict__ dihedralAngles,
+                                const float* __restrict__ torsionWeights,
+                                const float* __restrict__ torsionMaxDevs,
+                                const int* __restrict__ quartetStarts,
+                                const uint8_t* __restrict__ torsionTypes,
+                                const MolDescriptor* __restrict__ molDescriptors,
+                                float* __restrict__ tfdOutput) {
+  int m = blockIdx.x;
+  if (m >= numMolecules) {
+    return;
+  }
+
+  const MolDescriptor& desc     = molDescriptors[m];
+  int                  numConf  = desc.numConformers;
+  int                  numPairs = numConf * (numConf - 1) / 2;
+  int                  numTors  = desc.numTorsions;
+  int                  qBase    = quartetStarts[desc.torsStart];
+
+  if (numPairs == 0 || numTors == 0) {
+    return;
+  }
+
+  // Each thread processes conformer pairs via grid-stride within this block
+  for (int pairIdx = threadIdx.x; pairIdx < numPairs; pairIdx += blockDim.x) {
+    // Decode lower-triangular (i, j) from flat pairIdx
+    // i*(i-1)/2 + j = pairIdx, i > j >= 0
+    // i = floor((1 + sqrt(1 + 8*pairIdx)) / 2)
+    int i = static_cast<int>((1.0f + sqrtf(1.0f + 8.0f * pairIdx)) * 0.5f);
+    if (i * (i - 1) / 2 > pairIdx)
+      i--;
+    int j = pairIdx - i * (i - 1) / 2;
+
+    int aI = desc.dihedStart + i * desc.numQuartets;
+    int aJ = desc.dihedStart + j * desc.numQuartets;
+
+    float sumWeightedDev = 0.0f;
+    float sumWeights     = 0.0f;
+
+    for (int t = 0; t < numTors; ++t) {
+      int     globalT     = desc.torsStart + t;
+      int     qLocalStart = quartetStarts[globalT] - qBase;
+      int     numQ        = quartetStarts[globalT + 1] - quartetStarts[globalT];
+      uint8_t type        = torsionTypes[globalT];
+
+      float deviation;
+      if (type == 0) {  // Single
+        deviation = circularDifference(dihedralAngles[aI + qLocalStart], dihedralAngles[aJ + qLocalStart]) /
+                    torsionMaxDevs[globalT];
+      } else if (type == 1) {  // Ring
+        float avgI = 0.0f;
+        float avgJ = 0.0f;
+        for (int qq = 0; qq < numQ; ++qq) {
+          avgI += fabsf(dihedralAngles[aI + qLocalStart + qq] - 180.0f);
+          avgJ += fabsf(dihedralAngles[aJ + qLocalStart + qq] - 180.0f);
+        }
+        avgI /= numQ;
+        avgJ /= numQ;
+        deviation = fabsf(avgI - avgJ) / torsionMaxDevs[globalT];
+      } else {  // Symmetric
+        float minDiff = 180.0f;
+        for (int qi = 0; qi < numQ; ++qi) {
+          for (int qj = 0; qj < numQ; ++qj) {
+            float diff =
+              circularDifference(dihedralAngles[aI + qLocalStart + qi], dihedralAngles[aJ + qLocalStart + qj]);
+            minDiff = fminf(minDiff, diff);
+          }
+        }
+        deviation = minDiff / torsionMaxDevs[globalT];
+      }
+
+      float weight = torsionWeights[globalT];
+      sumWeightedDev += deviation * weight;
+      sumWeights += weight;
+    }
+
+    tfdOutput[desc.tfdOutStart + pairIdx] = (sumWeights > 1e-10f) ? (sumWeightedDev / sumWeights) : 0.0f;
+  }
+}
+
+}  // namespace
+
+void launchDihedralKernel(int                  totalWorkItems,
+                          const float*         positions,
+                          const int*           confPositionStarts,
+                          const int*           torsionAtoms,
+                          const MolDescriptor* molDescriptors,
+                          const int*           dihedralWorkStarts,
+                          int                  numMolecules,
+                          float*               dihedralAngles,
+                          cudaStream_t         stream) {
+  if (totalWorkItems == 0) {
+    return;
+  }
+
+  int gridSize = (totalWorkItems + kTFDBlockSize - 1) / kTFDBlockSize;
+
+  dihedralKernel<<<gridSize, kTFDBlockSize, 0, stream>>>(totalWorkItems,
+                                                         positions,
+                                                         confPositionStarts,
+                                                         torsionAtoms,
+                                                         molDescriptors,
+                                                         dihedralWorkStarts,
+                                                         numMolecules,
+                                                         dihedralAngles);
+
+  cudaCheckError(cudaGetLastError());
+}
+
+void launchTFDMatrixKernel(int                  numMolecules,
+                           const float*         dihedralAngles,
+                           const float*         torsionWeights,
+                           const float*         torsionMaxDevs,
+                           const int*           quartetStarts,
+                           const uint8_t*       torsionTypes,
+                           const MolDescriptor* molDescriptors,
+                           float*               tfdOutput,
+                           cudaStream_t         stream) {
+  if (numMolecules == 0) {
+    return;
+  }
+
+  // One block per molecule; threads within block process pairs via grid-stride
+  tfdMatrixKernel<<<numMolecules, kTFDBlockSize, 0, stream>>>(numMolecules,
+                                                              dihedralAngles,
+                                                              torsionWeights,
+                                                              torsionMaxDevs,
+                                                              quartetStarts,
+                                                              torsionTypes,
+                                                              molDescriptors,
+                                                              tfdOutput);
+
+  cudaCheckError(cudaGetLastError());
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd_kernels.h b/src/tfd_kernels.h
new file mode 100644
index 0000000..190a503
--- /dev/null
+++ b/src/tfd_kernels.h
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_KERNELS_H
+#define NVMOLKIT_TFD_KERNELS_H
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+#include "tfd_types.h"
+
+namespace nvMolKit {
+
+//! Block size for TFD kernels
+constexpr int kTFDBlockSize = 256;
+
+//! Launch kernel to compute dihedral angles for all conformers.
+//! One thread per (conformer, quartet) work item; uses binary search on
+//! dihedralWorkStarts to find the molecule, then computes indices arithmetically.
+void launchDihedralKernel(int                  totalWorkItems,
+                          const float*         positions,
+                          const int*           confPositionStarts,
+                          const int*           torsionAtoms,
+                          const MolDescriptor* molDescriptors,
+                          const int*           dihedralWorkStarts,
+                          int                  numMolecules,
+                          float*               dihedralAngles,
+                          cudaStream_t         stream);
+
+//! Launch kernel to compute TFD matrix for all conformer pairs.
+//! One block per molecule; threads within a block cooperatively process pairs.
+//! Uses shared memory for torsion metadata to reduce global reads.
+void launchTFDMatrixKernel(int                  numMolecules,
+                           const float*         dihedralAngles,
+                           const float*         torsionWeights,
+                           const float*         torsionMaxDevs,
+                           const int*           quartetStarts,
+                           const uint8_t*       torsionTypes,
+                           const MolDescriptor* molDescriptors,
+                           float*               tfdOutput,
+                           cudaStream_t         stream);
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_KERNELS_H
diff --git a/src/tfd_transfer.cu b/src/tfd_transfer.cu
new file mode 100644
index 0000000..530d5bc
--- /dev/null
+++ b/src/tfd_transfer.cu
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tfd_common.h"
+
+namespace nvMolKit {
+
+void TFDSystemDevice::setStream(cudaStream_t stream) {
+  positions.setStream(stream);
+  confPositionStarts.setStream(stream);
+  torsionAtoms.setStream(stream);
+  torsionWeights.setStream(stream);
+  torsionMaxDevs.setStream(stream);
+  quartetStarts.setStream(stream);
+  torsionTypes.setStream(stream);
+  molDescriptors.setStream(stream);
+  dihedralWorkStarts.setStream(stream);
+  tfdWorkStarts.setStream(stream);
+  dihedralAngles.setStream(stream);
+  tfdOutput.setStream(stream);
+}
+
+void transferToDevice(const TFDSystemHost& host, TFDSystemDevice& device, cudaStream_t stream) {
+  device.setStream(stream);
+
+  // Positions and conformer offsets
+  device.positions.setFromVector(host.positions);
+  device.confPositionStarts.setFromVector(host.confPositionStarts);
+
+  // std::array<int,4> is layout-compatible with int[4]; copy directly
+  device.torsionAtoms.setFromArray(reinterpret_cast<const int*>(host.torsionAtoms.data()),
+                                   host.torsionAtoms.size() * 4);
+
+  // Torsion metadata
+  device.torsionWeights.setFromVector(host.torsionWeights);
+  device.torsionMaxDevs.setFromVector(host.torsionMaxDevs);
+  device.quartetStarts.setFromVector(host.quartetStarts);
+
+  // TorsionType is enum class : uint8_t; copy directly
+  device.torsionTypes.setFromArray(reinterpret_cast<const uint8_t*>(host.torsionTypes.data()),
+                                   host.torsionTypes.size());
+
+  // Per-molecule descriptors (compact: ~32 bytes per molecule)
+  device.molDescriptors.setFromArray(host.molDescriptors.data(), host.molDescriptors.size());
+  device.dihedralWorkStarts.setFromVector(host.dihedralWorkStarts);
+  device.tfdWorkStarts.setFromVector(host.tfdWorkStarts);
+
+  // Allocate and zero output buffers
+  int totalDihedrals = host.totalDihedrals();
+  if (static_cast<int>(device.dihedralAngles.size()) < totalDihedrals) {
+    device.dihedralAngles.resize(totalDihedrals);
+  }
+  cudaMemsetAsync(device.dihedralAngles.data(), 0, totalDihedrals * sizeof(float), stream);
+
+  int totalTFDOutputs = host.totalTFDOutputs();
+  if (static_cast<int>(device.tfdOutput.size()) < totalTFDOutputs) {
+    device.tfdOutput.resize(totalTFDOutputs);
+  }
+  cudaMemsetAsync(device.tfdOutput.data(), 0, totalTFDOutputs * sizeof(float), stream);
+}
+
+}  // namespace nvMolKit
diff --git a/src/tfd_types.h b/src/tfd_types.h
new file mode 100644
index 0000000..7c1d8c5
--- /dev/null
+++ b/src/tfd_types.h
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_TFD_TYPES_H
+#define NVMOLKIT_TFD_TYPES_H
+
+#include <cstdint>
+
+namespace nvMolKit {
+
+//! Torsion type for multi-quartet handling
+enum class TorsionType : uint8_t {
+  Single,     //!< 1 quartet: direct circularDifference
+  Ring,       //!< N quartets: average abs(signed), compare averages
+  Symmetric,  //!< N quartets: min circularDiff across all (qi,qj) pairs
+};
+
+//! Per-molecule descriptor for GPU kernel dispatch.
+//! Replaces per-work-item index arrays with compact per-molecule metadata.
+//! Kernels use binary search on cumulative work-item counts to find the molecule,
+//! then compute local indices arithmetically.
+struct MolDescriptor {
+  int confStart;      //!< First conformer index (into confPositionStarts)
+  int numConformers;  //!< Number of conformers for this molecule
+  int quartetStart;   //!< First quartet index (into torsionAtoms)
+  int numQuartets;    //!< Total quartets for this molecule
+  int dihedStart;     //!< Offset into dihedralAngles output
+  int torsStart;      //!< First torsion index (into weights/maxDevs/types/quartetStarts)
+  int numTorsions;    //!< Number of torsions
+  int tfdOutStart;    //!< Offset into tfdOutput
+};
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_TFD_TYPES_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 03945da..c5bce6a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -280,10 +280,30 @@ target_link_libraries(
   test_substruct_label_integration PRIVATE substructure_search molecules device
                                            test_utils molData ${RDKit_LIBS})
 
+add_executable(test_tfd_cpu test_tfd_cpu.cpp)
+target_link_libraries(test_tfd_cpu PRIVATE ${RDKit_LIBS} tfd_cpu test_utils)
+
+add_executable(test_tfd_kernels test_tfd_kernels.cu)
+target_link_libraries(
+  test_tfd_kernels PRIVATE ${RDKit_LIBS} tfd_cpu tfd_kernels device_vector
+                           test_utils)
+
+add_executable(test_tfd_gpu test_tfd_gpu.cpp)
+target_link_libraries(test_tfd_gpu PRIVATE ${RDKit_LIBS} tfd_gpu device_vector
+                                           test_utils)
+
+add_executable(test_tfd test_tfd.cpp)
+target_link_libraries(test_tfd PRIVATE ${RDKit_LIBS} tfd device_vector
+                                       test_utils)
+
 set(TEST_LIST
     test_boolean_tree
     test_butina
     test_bfgs_hessian
+    test_tfd_cpu
+    test_tfd_kernels
+    test_tfd_gpu
+    test_tfd
     test_openmp_helpers
     test_conformer_checkers
     test_coordgen
diff --git a/tests/test_tfd.cpp b/tests/test_tfd.cpp
new file mode 100644
index 0000000..ec0609f
--- /dev/null
+++ b/tests/test_tfd.cpp
@@ -0,0 +1,163 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/DistGeomHelpers/Embedder.h>
+#include <GraphMol/ROMol.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "tfd.h"
+#include "tfd_gpu.h"
+
+namespace {
+
+constexpr double kTolerance = 1e-3;
+
+void generateConformers(RDKit::ROMol& mol, int numConformers, int seed = 42) {
+  RDKit::DGeomHelpers::EmbedParameters params = RDKit::DGeomHelpers::ETKDGv3;
+  params.randomSeed                           = seed;
+  params.numThreads                           = 1;
+  RDKit::DGeomHelpers::EmbedMultipleConfs(mol, numConformers, params);
+}
+
+}  // namespace
+
+// Global check for CUDA availability (same pattern as test_tfd_gpu.cpp)
+static bool gCudaAvailable = false;
+static bool gCudaChecked   = false;
+
+static bool checkCudaAvailable() {
+  if (!gCudaChecked) {
+    gCudaChecked = true;
+    try {
+      int         deviceCount = 0;
+      cudaError_t err         = cudaGetDeviceCount(&deviceCount);
+      if (err == cudaSuccess && deviceCount > 0) {
+        err = cudaSetDevice(0);
+        if (err == cudaSuccess) {
+          gCudaAvailable = true;
+        } else {
+          cudaGetLastError();
+        }
+      } else {
+        cudaGetLastError();
+      }
+    } catch (...) {
+      gCudaAvailable = false;
+    }
+  }
+  return gCudaAvailable;
+}
+
+// ========== Unified TFDGenerator API Tests ==========
+
+class TFDGeneratorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    if (!checkCudaAvailable()) {
+      GTEST_SKIP() << "No CUDA devices available, skipping GPU tests";
+    }
+  }
+};
+
+TEST_F(TFDGeneratorTest, UnifiedAPIBackendSelection) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 4);
+
+  nvMolKit::TFDGenerator generator;
+
+  nvMolKit::TFDComputeOptions gpuOptions;
+  gpuOptions.backend = nvMolKit::TFDComputeBackend::GPU;
+  auto gpuResult     = generator.GetTFDMatrix(*mol, gpuOptions);
+
+  nvMolKit::TFDComputeOptions cpuOptions;
+  cpuOptions.backend = nvMolKit::TFDComputeBackend::CPU;
+  auto cpuResult     = generator.GetTFDMatrix(*mol, cpuOptions);
+
+  ASSERT_EQ(gpuResult.size(), cpuResult.size());
+  for (size_t i = 0; i < gpuResult.size(); ++i) {
+    EXPECT_NEAR(gpuResult[i], cpuResult[i], kTolerance) << "Mismatch at index " << i;
+  }
+}
+
+TEST_F(TFDGeneratorTest, UnifiedAPIBatchProcessing) {
+  std::vector<std::unique_ptr<RDKit::RWMol>> mols;
+  std::vector<const RDKit::ROMol*>           molPtrs;
+
+  for (int i = 0; i < 3; ++i) {
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+    generateConformers(*mol, 3 + i);
+    if (mol->getNumConformers() >= 2) {
+      molPtrs.push_back(mol.get());
+      mols.push_back(std::move(mol));
+    }
+  }
+
+  ASSERT_GE(mols.size(), 2u);
+
+  nvMolKit::TFDGenerator generator;
+
+  nvMolKit::TFDComputeOptions gpuOptions;
+  gpuOptions.backend = nvMolKit::TFDComputeBackend::GPU;
+  auto gpuResults    = generator.GetTFDMatrices(molPtrs, gpuOptions);
+
+  nvMolKit::TFDComputeOptions cpuOptions;
+  cpuOptions.backend = nvMolKit::TFDComputeBackend::CPU;
+  auto cpuResults    = generator.GetTFDMatrices(molPtrs, cpuOptions);
+
+  ASSERT_EQ(gpuResults.size(), cpuResults.size());
+  for (size_t m = 0; m < gpuResults.size(); ++m) {
+    ASSERT_EQ(gpuResults[m].size(), cpuResults[m].size());
+    for (size_t i = 0; i < gpuResults[m].size(); ++i) {
+      EXPECT_NEAR(gpuResults[m][i], cpuResults[m][i], kTolerance) << "Mismatch at molecule " << m << " index " << i;
+    }
+  }
+}
+
+TEST_F(TFDGeneratorTest, GpuBufferMethodWorks) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+  generateConformers(*mol, 4);
+
+  std::vector<const RDKit::ROMol*> molPtrs = {mol.get()};
+
+  nvMolKit::TFDGpuGenerator   gpuGenerator;
+  nvMolKit::TFDComputeOptions options;
+
+  auto gpuResult = gpuGenerator.GetTFDMatricesGpuBuffer(molPtrs, options);
+
+  auto extracted = gpuResult.extractAll();
+  ASSERT_EQ(extracted.size(), 1u);
+  EXPECT_EQ(extracted[0].size(), 6u);  // 4*(4-1)/2 = 6
+}
+
+TEST_F(TFDGeneratorTest, EmptyInput) {
+  nvMolKit::TFDGenerator           generator;
+  std::vector<const RDKit::ROMol*> emptyMols;
+  nvMolKit::TFDComputeOptions      options;
+
+  options.backend = nvMolKit::TFDComputeBackend::GPU;
+  auto gpuResults = generator.GetTFDMatrices(emptyMols, options);
+  EXPECT_TRUE(gpuResults.empty());
+
+  options.backend = nvMolKit::TFDComputeBackend::CPU;
+  auto cpuResults = generator.GetTFDMatrices(emptyMols, options);
+  EXPECT_TRUE(cpuResults.empty());
+}
diff --git a/tests/test_tfd_cpu.cpp b/tests/test_tfd_cpu.cpp
new file mode 100644
index 0000000..aacf09a
--- /dev/null
+++ b/tests/test_tfd_cpu.cpp
@@ -0,0 +1,828 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/DistGeomHelpers/Embedder.h>
+#include <GraphMol/MolOps.h>
+#include <GraphMol/ROMol.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <gtest/gtest.h>
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tfd_common.h"
+#include "tfd_cpu.h"
+
+namespace {
+
+constexpr double kTolerance = 1e-4;
+
+//! Generate conformers for a molecule using RDKit
+void generateConformers(RDKit::ROMol& mol, int numConformers, int seed = 42) {
+  RDKit::DGeomHelpers::EmbedParameters params = RDKit::DGeomHelpers::ETKDGv3;
+  params.randomSeed                           = seed;
+  params.numThreads                           = 1;
+  RDKit::DGeomHelpers::EmbedMultipleConfs(mol, numConformers, params);
+}
+
+}  // namespace
+
+class TFDCpuTest : public ::testing::Test {
+ protected:
+  nvMolKit::TFDCpuGenerator generator_;
+};
+
+// =============================================================================
+// extractTorsionList
+// =============================================================================
+
+TEST_F(TFDCpuTest, ExtractTorsionListRDKitReference) {
+  // Compare extractTorsionList output against RDKit CalculateTorsionLists.
+  // Parameters: maxDev='equal', symmRadius=2, ignoreColinearBonds=True
+
+  // --- CCCC: 1 non-ring torsion, 0 ring ---
+  {
+    SCOPED_TRACE("CCCC");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    ASSERT_EQ(tl.nonRingTorsions.size(), 1u);
+    EXPECT_EQ(tl.ringTorsions.size(), 0u);
+
+    ASSERT_EQ(tl.nonRingTorsions[0].atomQuartets.size(), 1u);
+    EXPECT_EQ(tl.nonRingTorsions[0].atomQuartets[0], (std::array<int, 4>{0, 1, 2, 3}));
+    EXPECT_NEAR(tl.nonRingTorsions[0].maxDev, 180.0f, 0.01f);
+  }
+
+  // --- CCCCC: 2 non-ring torsions, 0 ring ---
+  {
+    SCOPED_TRACE("CCCCC");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    ASSERT_EQ(tl.nonRingTorsions.size(), 2u);
+    EXPECT_EQ(tl.ringTorsions.size(), 0u);
+
+    ASSERT_EQ(tl.nonRingTorsions[0].atomQuartets.size(), 1u);
+    EXPECT_EQ(tl.nonRingTorsions[0].atomQuartets[0], (std::array<int, 4>{0, 1, 2, 3}));
+
+    ASSERT_EQ(tl.nonRingTorsions[1].atomQuartets.size(), 1u);
+    EXPECT_EQ(tl.nonRingTorsions[1].atomQuartets[0], (std::array<int, 4>{1, 2, 3, 4}));
+  }
+
+  // --- CCCCCC: 3 non-ring torsions, 0 ring ---
+  {
+    SCOPED_TRACE("CCCCCC");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCCC"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    ASSERT_EQ(tl.nonRingTorsions.size(), 3u);
+    EXPECT_EQ(tl.ringTorsions.size(), 0u);
+
+    EXPECT_EQ(tl.nonRingTorsions[0].atomQuartets[0], (std::array<int, 4>{0, 1, 2, 3}));
+    EXPECT_EQ(tl.nonRingTorsions[1].atomQuartets[0], (std::array<int, 4>{1, 2, 3, 4}));
+    EXPECT_EQ(tl.nonRingTorsions[2].atomQuartets[0], (std::array<int, 4>{2, 3, 4, 5}));
+  }
+
+  // --- c1ccccc1 (benzene): 0 non-ring, 1 ring torsion with 6 quartets ---
+  {
+    SCOPED_TRACE("c1ccccc1");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("c1ccccc1"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    EXPECT_EQ(tl.nonRingTorsions.size(), 0u);
+    ASSERT_EQ(tl.ringTorsions.size(), 1u);
+
+    const auto& ringTorsion = tl.ringTorsions[0];
+    ASSERT_EQ(ringTorsion.atomQuartets.size(), 6u);
+
+    // Ring torsion maxDev = 180 * exp(-0.025 * (6-14)^2) ≈ 36.34
+    EXPECT_NEAR(ringTorsion.maxDev, 36.34f, 0.1f);
+
+    // Quartets from RDKit (ring order: 0, 5, 4, 3, 2, 1)
+    EXPECT_EQ(ringTorsion.atomQuartets[0], (std::array<int, 4>{0, 5, 4, 3}));
+    EXPECT_EQ(ringTorsion.atomQuartets[1], (std::array<int, 4>{5, 4, 3, 2}));
+    EXPECT_EQ(ringTorsion.atomQuartets[2], (std::array<int, 4>{4, 3, 2, 1}));
+    EXPECT_EQ(ringTorsion.atomQuartets[3], (std::array<int, 4>{3, 2, 1, 0}));
+    EXPECT_EQ(ringTorsion.atomQuartets[4], (std::array<int, 4>{2, 1, 0, 5}));
+    EXPECT_EQ(ringTorsion.atomQuartets[5], (std::array<int, 4>{1, 0, 5, 4}));
+  }
+
+  // --- CC(=O)[C@H]1CCCC[C@@H]1CN: 2 non-ring + 1 ring (cyclohexane with substituents) ---
+  {
+    SCOPED_TRACE("CC(=O)[C@H]1CCCC[C@@H]1CN");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CC(=O)[C@H]1CCCC[C@@H]1CN"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    ASSERT_EQ(tl.nonRingTorsions.size(), 2u);
+    ASSERT_EQ(tl.ringTorsions.size(), 1u);
+
+    EXPECT_EQ(tl.nonRingTorsions[0].atomQuartets[0], (std::array<int, 4>{0, 1, 3, 4}));
+    EXPECT_EQ(tl.nonRingTorsions[1].atomQuartets[0], (std::array<int, 4>{3, 8, 9, 10}));
+
+    ASSERT_EQ(tl.ringTorsions[0].atomQuartets.size(), 6u);
+    EXPECT_NEAR(tl.ringTorsions[0].maxDev, 36.34f, 0.1f);
+    EXPECT_EQ(tl.ringTorsions[0].atomQuartets[0], (std::array<int, 4>{3, 8, 7, 6}));
+    EXPECT_EQ(tl.ringTorsions[0].atomQuartets[5], (std::array<int, 4>{4, 3, 8, 7}));
+  }
+
+  // --- C[C@@H](CN(C)C(=O)C#CCN)C1CC1: 4 non-ring + 1 ring (cyclopropane + triple bond) ---
+  {
+    SCOPED_TRACE("C[C@@H](CN(C)C(=O)C#CCN)C1CC1");
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("C[C@@H](CN(C)C(=O)C#CCN)C1CC1"));
+    ASSERT_NE(mol, nullptr);
+    auto tl = nvMolKit::extractTorsionList(*mol);
+
+    ASSERT_EQ(tl.nonRingTorsions.size(), 4u);
+    ASSERT_EQ(tl.ringTorsions.size(), 1u);
+
+    EXPECT_EQ(tl.nonRingTorsions[0].atomQuartets[0], (std::array<int, 4>{0, 1, 2, 3}));
+    EXPECT_EQ(tl.nonRingTorsions[1].atomQuartets[0], (std::array<int, 4>{1, 2, 3, 4}));
+    EXPECT_EQ(tl.nonRingTorsions[2].atomQuartets[0], (std::array<int, 4>{4, 3, 5, 7}));
+    ASSERT_EQ(tl.nonRingTorsions[3].atomQuartets.size(), 2u);
+    EXPECT_EQ(tl.nonRingTorsions[3].atomQuartets[0], (std::array<int, 4>{0, 1, 11, 12}));
+    EXPECT_EQ(tl.nonRingTorsions[3].atomQuartets[1], (std::array<int, 4>{0, 1, 11, 13}));
+
+    ASSERT_EQ(tl.ringTorsions[0].atomQuartets.size(), 3u);
+    EXPECT_NEAR(tl.ringTorsions[0].maxDev, 8.74f, 0.1f);
+  }
+}
+
+// =============================================================================
+// computeTorsionWeights
+// =============================================================================
+
+TEST_F(TFDCpuTest, ComputeTorsionWeightsRDKitReference) {
+  // Compare computeTorsionWeights output against RDKit CalculateTorsionWeights.
+  // Reference generated with: TorsionFingerprints.CalculateTorsionWeights(mol, ignoreColinearBonds=True)
+
+  struct TestCase {
+    const char*        smiles;
+    std::vector<float> expectedWeights;
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCC",   {1.0f}},                                             // 1 torsion: central bond
+    {"CCCCC",  {1.0f, 0.1f}},                                      // 2 torsions: central + terminal
+    {"CCCCCC", {0.1f, 1.0f, 0.1f}},                                // 3 torsions: symmetric around center
+    {"CC(=O)[C@H]1CCCC[C@@H]1CN", {0.3593813664f, 0.3593813664f, 0.1748047999f}},  // cyclohexane + substituents
+    {"C[C@@H](CN(C)C(=O)C#CCN)C1CC1", {0.5623413252f, 1.0f, 0.5623413252f, 0.1f, 0.0025021508f}},  // cyclopropane + triple bond
+  };
+  // clang-format on
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(tc.smiles);
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    auto tl      = nvMolKit::extractTorsionList(*mol);
+    auto weights = nvMolKit::computeTorsionWeights(*mol, tl);
+
+    ASSERT_EQ(weights.size(), tc.expectedWeights.size());
+    for (size_t i = 0; i < weights.size(); ++i) {
+      EXPECT_NEAR(weights[i], tc.expectedWeights[i], 5e-4f) << "Weight[" << i << "] mismatch";
+    }
+  }
+}
+
+// =============================================================================
+// buildTFDSystem
+// =============================================================================
+
+TEST_F(TFDCpuTest, BuildTFDSystem) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 5);
+  ASSERT_EQ(mol->getNumConformers(), 5);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        system = nvMolKit::buildTFDSystem(*mol, options);
+
+  EXPECT_EQ(system.numMolecules(), 1);
+  EXPECT_EQ(system.molDescriptors[0].numConformers, 5);
+  EXPECT_GT(system.totalTorsions(), 0);
+
+  // TFD output size: 5 conformers = 5*4/2 = 10 pairs
+  EXPECT_EQ(system.totalTFDOutputs(), 10);
+
+  // Single-quartet molecule: totalQuartets == totalTorsions
+  EXPECT_EQ(system.totalQuartets(), system.totalTorsions());
+}
+
+TEST_F(TFDCpuTest, BuildTFDSystemMultiQuartet) {
+  // Verify multi-quartet fields for CC(C)CC (isopentane)
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CC(C)CC"));
+  ASSERT_NE(mol, nullptr);
+  generateConformers(*mol, 3);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        system = nvMolKit::buildTFDSystem(*mol, options);
+
+  EXPECT_EQ(system.numMolecules(), 1);
+  EXPECT_EQ(system.totalTorsions(), 1);  // 1 torsion
+  EXPECT_EQ(system.totalQuartets(), 2);  // 2 quartets
+
+  // Verify torsion type
+  ASSERT_EQ(system.torsionTypes.size(), 1u);
+  EXPECT_EQ(system.torsionTypes[0], nvMolKit::TorsionType::Symmetric);
+
+  // Verify quartetStarts CSR
+  ASSERT_EQ(system.quartetStarts.size(), 2u);  // 1 torsion + 1
+  EXPECT_EQ(system.quartetStarts[0], 0);
+  EXPECT_EQ(system.quartetStarts[1], 2);
+
+  // Verify 2 quartets in torsionAtoms
+  ASSERT_EQ(system.torsionAtoms.size(), 2u);
+}
+
+TEST_F(TFDCpuTest, BuildTFDSystemRingTorsion) {
+  // Verify multi-quartet fields for C1CCCCC1 (cyclohexane)
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("C1CCCCC1"));
+  ASSERT_NE(mol, nullptr);
+  generateConformers(*mol, 3);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        system = nvMolKit::buildTFDSystem(*mol, options);
+
+  EXPECT_EQ(system.totalTorsions(), 1);  // 1 ring torsion
+  EXPECT_EQ(system.totalQuartets(), 6);  // 6 quartets in the ring
+
+  ASSERT_EQ(system.torsionTypes.size(), 1u);
+  EXPECT_EQ(system.torsionTypes[0], nvMolKit::TorsionType::Ring);
+}
+
+// =============================================================================
+// computeDihedralAngles
+// =============================================================================
+
+TEST_F(TFDCpuTest, KnownDihedralAngle) {
+  // Test dihedral angle computation with hand-crafted geometry (RDKit-independent).
+  // Create n-butane and manually set conformer coordinates to known dihedral angles.
+  //
+  // Dihedral angle definition: Looking down the C1-C2 bond (central bond),
+  // the angle between C0 and C3 measured clockwise from C0.
+  // - Trans (180°): C0 and C3 on opposite sides
+  // - Gauche (60°): C0 and C3 at 60° apart
+
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+  ASSERT_EQ(mol->getNumAtoms(), 4);
+
+  // Trans conformer (180°): C0-C1 and C2-C3 vectors are antiparallel
+  {
+    auto* conf = new RDKit::Conformer(4);
+    conf->setId(0);
+    conf->setAtomPos(0, RDGeom::Point3D(1.0, 0.0, 0.0));
+    conf->setAtomPos(1, RDGeom::Point3D(0.0, 0.0, 0.0));
+    conf->setAtomPos(2, RDGeom::Point3D(0.0, 1.5, 0.0));
+    conf->setAtomPos(3, RDGeom::Point3D(1.0, 1.5, 0.0));
+    mol->addConformer(conf, true);
+  }
+
+  // Gauche conformer (60°): C3 rotated to give 60° dihedral
+  {
+    auto* conf = new RDKit::Conformer(4);
+    conf->setId(1);
+    conf->setAtomPos(0, RDGeom::Point3D(1.0, 0.0, 0.0));
+    conf->setAtomPos(1, RDGeom::Point3D(0.0, 0.0, 0.0));
+    conf->setAtomPos(2, RDGeom::Point3D(0.0, 1.5, 0.0));
+    conf->setAtomPos(3, RDGeom::Point3D(-0.5, 1.5, 0.866));
+    mol->addConformer(conf, true);
+  }
+
+  ASSERT_EQ(mol->getNumConformers(), 2);
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights = false;
+  options.maxDevMode = nvMolKit::TFDMaxDevMode::Equal;
+
+  auto tl     = nvMolKit::extractTorsionList(*mol, options.maxDevMode, options.symmRadius, options.ignoreColinearBonds);
+  auto angles = generator_.computeDihedralAngles(*mol, tl);
+
+  int totalQuartets = static_cast<int>(tl.totalCount());
+  ASSERT_GE(totalQuartets, 1);
+
+  EXPECT_NEAR(angles[0], 180.0f, 5.0f) << "Trans conformer should have ~180° dihedral";
+  EXPECT_NEAR(angles[totalQuartets], 60.0f, 5.0f) << "Gauche conformer should have ~60° dihedral";
+
+  // TFD: difference = |180 - 60| = 120°, normalized by 180° ≈ 0.667
+  auto tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+  ASSERT_EQ(tfdMatrix.size(), 1u);
+  EXPECT_NEAR(tfdMatrix[0], 0.667, 0.05);
+}
+
+TEST_F(TFDCpuTest, ComputeDihedralAnglesRDKitReference) {
+  // Compare computeDihedralAngles output against RDKit rdMolTransforms.GetDihedralDeg.
+  // 4 conformers per molecule, seed=42, ETKDGv3.
+  //
+  // NOTE: Our dihedral convention is offset by 180° from RDKit's GetDihedralDeg.
+  // This does NOT affect TFD (circularDifference is invariant to shared offset).
+  // Reference values below are RDKit values + 180° (mod 360).
+  //
+  // Original RDKit values generated with:
+  //   angle = rdMolTransforms.GetDihedralDeg(conf, a, b, c, d)
+  //   our_angle = (angle + 180) % 360
+
+  struct TestCase {
+    const char*        smiles;
+    int                numTorsions;
+    std::vector<float> expectedAngles;  // [numConf * numTors], our convention
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCC", 1, {
+      120.0130f,  // conf[0] tors[0]  (RDKit: 300.013)
+        0.0000f,  // conf[1] tors[0]  (RDKit: 180.000)
+      120.0000f,  // conf[2] tors[0]  (RDKit: 300.000)
+        0.0000f   // conf[3] tors[0]  (RDKit: 180.000)
+    }},
+    {"CCCCC", 2, {
+      239.9983f, 240.0002f,   // conf[0]  (RDKit: 60.0, 60.0)
+      119.9983f, 240.0001f,   // conf[1]  (RDKit: 300.0, 60.0)
+      359.9974f, 240.0025f,   // conf[2]  (RDKit: 180.0, 60.0)
+        0.0037f, 120.0134f    // conf[3]  (RDKit: 180.0, 300.0)
+    }},
+    {"CCCCCC", 3, {
+      240.0055f, 359.9997f, 119.9928f,  // conf[0]  (RDKit: 60, 180, 300)
+      359.9961f, 239.9959f, 119.9996f,  // conf[1]  (RDKit: 180, 60, 300)
+      239.9980f, 359.9978f, 239.9987f,  // conf[2]  (RDKit: 60, 180, 60)
+      239.9980f, 119.9980f, 119.9976f   // conf[3]  (RDKit: 60, 300, 300)
+    }},
+  };
+  // clang-format on
+
+  // Use circular distance to handle 0°/360° wrap-around
+  constexpr float kAngleTolerance = 0.05f;  // degrees; generous for float vs double
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = true;
+  options.maxDevMode          = nvMolKit::TFDMaxDevMode::Equal;
+  options.symmRadius          = 2;
+  options.ignoreColinearBonds = true;
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(tc.smiles);
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    generateConformers(*mol, 4, 42);
+    ASSERT_EQ(mol->getNumConformers(), 4);
+
+    auto tl = nvMolKit::extractTorsionList(*mol, options.maxDevMode, options.symmRadius, options.ignoreColinearBonds);
+    auto angles = generator_.computeDihedralAngles(*mol, tl);
+
+    int numConf = 4;
+    int numTors = tc.numTorsions;
+    ASSERT_EQ(static_cast<int>(angles.size()), numConf * numTors);
+    ASSERT_EQ(static_cast<int>(tc.expectedAngles.size()), numConf * numTors);
+
+    for (size_t i = 0; i < angles.size(); ++i) {
+      // Circular distance handles 0°/360° boundary
+      float diff = std::abs(angles[i] - tc.expectedAngles[i]);
+      if (diff > 180.0f) {
+        diff = 360.0f - diff;
+      }
+      EXPECT_LE(diff, kAngleTolerance) << "Angle[" << i << "] (conf=" << i / numTors << " tors=" << i % numTors
+                                       << "): got " << angles[i] << ", expected " << tc.expectedAngles[i];
+    }
+  }
+}
+
+// =============================================================================
+// GetTFDMatrix (pipeline)
+// =============================================================================
+
+TEST_F(TFDCpuTest, ComputeTFDMatrixSelfComparison) {
+  // Identical conformers should produce TFD = 0
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 1);
+  ASSERT_EQ(mol->getNumConformers(), 1);
+
+  // Duplicate the conformer
+  RDKit::Conformer conf = mol->getConformer(0);
+  conf.setId(1);
+  mol->addConformer(new RDKit::Conformer(conf), true);
+  ASSERT_EQ(mol->getNumConformers(), 2);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+
+  ASSERT_EQ(tfdMatrix.size(), 1u);
+  EXPECT_NEAR(tfdMatrix[0], 0.0, kTolerance);
+}
+
+TEST_F(TFDCpuTest, NoTorsionsMolecule) {
+  // Methane has no rotatable bonds — TFD should be zero
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("C"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 3);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+
+  ASSERT_EQ(tfdMatrix.size(), 3u);  // 3 conformers = 3 pairs
+  for (double tfd : tfdMatrix) {
+    EXPECT_NEAR(tfd, 0.0, kTolerance);
+  }
+}
+
+TEST_F(TFDCpuTest, SingleConformer) {
+  // Single conformer should return empty matrix (no pairs)
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 1);
+  ASSERT_EQ(mol->getNumConformers(), 1);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+
+  EXPECT_TRUE(tfdMatrix.empty());
+}
+
+TEST_F(TFDCpuTest, UseWeightsOption) {
+  // Weighted and unweighted TFD should differ for multi-torsion molecules
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCCCCC"));  // n-octane
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 4);
+  ASSERT_GE(mol->getNumConformers(), 2);
+
+  nvMolKit::TFDComputeOptions optionsWithWeights;
+  optionsWithWeights.useWeights = true;
+
+  nvMolKit::TFDComputeOptions optionsNoWeights;
+  optionsNoWeights.useWeights = false;
+
+  auto tfdWithWeights = generator_.GetTFDMatrix(*mol, optionsWithWeights);
+  auto tfdNoWeights   = generator_.GetTFDMatrix(*mol, optionsNoWeights);
+
+  ASSERT_EQ(tfdWithWeights.size(), tfdNoWeights.size());
+  ASSERT_GT(tfdWithWeights.size(), 0u);
+
+  for (size_t i = 0; i < tfdWithWeights.size(); ++i) {
+    EXPECT_GE(tfdWithWeights[i], 0.0);
+    EXPECT_GE(tfdNoWeights[i], 0.0);
+  }
+
+  bool anyDifferent = false;
+  for (size_t i = 0; i < tfdWithWeights.size(); ++i) {
+    if (std::abs(tfdWithWeights[i] - tfdNoWeights[i]) > 1e-6) {
+      anyDifferent = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(anyDifferent) << "Weighted and unweighted TFD should differ for n-octane";
+}
+
+TEST_F(TFDCpuTest, MaxDevModes) {
+  // Both Equal and Spec modes should produce valid finite results
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 3);
+
+  nvMolKit::TFDComputeOptions optionsEqual;
+  optionsEqual.maxDevMode = nvMolKit::TFDMaxDevMode::Equal;
+
+  nvMolKit::TFDComputeOptions optionsSpec;
+  optionsSpec.maxDevMode = nvMolKit::TFDMaxDevMode::Spec;
+
+  auto tfdEqual = generator_.GetTFDMatrix(*mol, optionsEqual);
+  auto tfdSpec  = generator_.GetTFDMatrix(*mol, optionsSpec);
+
+  ASSERT_EQ(tfdEqual.size(), tfdSpec.size());
+
+  for (size_t i = 0; i < tfdEqual.size(); ++i) {
+    EXPECT_TRUE(std::isfinite(tfdEqual[i]));
+    EXPECT_TRUE(std::isfinite(tfdSpec[i]));
+  }
+}
+
+TEST_F(TFDCpuTest, MaxDevModesMultiQuartet) {
+  // Spec maxDev mode uses torsion-specific normalization for non-ring torsions:
+  //   - CC(C)CC symmetric torsion: maxDev=90 (Spec) vs 180 (Equal)
+  // Ring torsions always use the ring-specific formula (matching RDKit),
+  // so Equal vs Spec produces identical results for ring-only molecules.
+
+  // --- CC(C)CC: Spec and Equal should differ ---
+  {
+    SCOPED_TRACE("CC(C)CC");
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CC(C)CC"));
+    ASSERT_NE(mol, nullptr);
+
+    generateConformers(*mol, 4);
+    ASSERT_GE(mol->getNumConformers(), 2);
+
+    nvMolKit::TFDComputeOptions optionsEqual;
+    optionsEqual.maxDevMode = nvMolKit::TFDMaxDevMode::Equal;
+
+    nvMolKit::TFDComputeOptions optionsSpec;
+    optionsSpec.maxDevMode = nvMolKit::TFDMaxDevMode::Spec;
+
+    auto tfdEqual = generator_.GetTFDMatrix(*mol, optionsEqual);
+    auto tfdSpec  = generator_.GetTFDMatrix(*mol, optionsSpec);
+
+    ASSERT_EQ(tfdEqual.size(), tfdSpec.size());
+    ASSERT_GT(tfdEqual.size(), 0u);
+
+    for (size_t i = 0; i < tfdEqual.size(); ++i) {
+      EXPECT_TRUE(std::isfinite(tfdEqual[i]));
+      EXPECT_TRUE(std::isfinite(tfdSpec[i]));
+      EXPECT_GE(tfdEqual[i], 0.0);
+      EXPECT_GE(tfdSpec[i], 0.0);
+    }
+
+    // Symmetric non-ring torsion: maxDev=90 (Spec) vs 180 (Equal)
+    bool anyDifferent = false;
+    for (size_t i = 0; i < tfdEqual.size(); ++i) {
+      if (std::abs(tfdEqual[i] - tfdSpec[i]) > 1e-6) {
+        anyDifferent = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(anyDifferent) << "Spec and Equal should differ for symmetric non-ring torsion";
+  }
+
+  // --- C1CCCCC1: Spec and Equal should be identical (ring maxDev is always formula-based) ---
+  {
+    SCOPED_TRACE("C1CCCCC1");
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("C1CCCCC1"));
+    ASSERT_NE(mol, nullptr);
+
+    generateConformers(*mol, 4);
+    ASSERT_GE(mol->getNumConformers(), 2);
+
+    nvMolKit::TFDComputeOptions optionsEqual;
+    optionsEqual.maxDevMode = nvMolKit::TFDMaxDevMode::Equal;
+
+    nvMolKit::TFDComputeOptions optionsSpec;
+    optionsSpec.maxDevMode = nvMolKit::TFDMaxDevMode::Spec;
+
+    auto tfdEqual = generator_.GetTFDMatrix(*mol, optionsEqual);
+    auto tfdSpec  = generator_.GetTFDMatrix(*mol, optionsSpec);
+
+    ASSERT_EQ(tfdEqual.size(), tfdSpec.size());
+    ASSERT_GT(tfdEqual.size(), 0u);
+
+    for (size_t i = 0; i < tfdEqual.size(); ++i) {
+      EXPECT_TRUE(std::isfinite(tfdEqual[i]));
+      EXPECT_GE(tfdEqual[i], 0.0);
+      // Ring-only molecule: maxDev mode has no effect
+      EXPECT_NEAR(tfdEqual[i], tfdSpec[i], 1e-10) << "Ring torsion maxDev should be identical for Equal and Spec modes";
+    }
+  }
+}
+
+TEST_F(TFDCpuTest, IgnoreColinearBondsFalse) {
+  // Test ignoreColinearBonds=false with a molecule containing a triple bond.
+  // With ignoreColinearBonds=true (default), bonds adjacent to triple bonds are skipped.
+  // With false, alternative atoms are found and torsions are included.
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC#CCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 4);
+  ASSERT_GE(mol->getNumConformers(), 2);
+
+  nvMolKit::TFDComputeOptions optionsIgnore;
+  optionsIgnore.ignoreColinearBonds = true;
+
+  nvMolKit::TFDComputeOptions optionsKeep;
+  optionsKeep.ignoreColinearBonds = false;
+
+  auto torsionsIgnore = nvMolKit::extractTorsionList(*mol, nvMolKit::TFDMaxDevMode::Equal, 2, true);
+  auto torsionsKeep   = nvMolKit::extractTorsionList(*mol, nvMolKit::TFDMaxDevMode::Equal, 2, false);
+
+  // With ignoreColinearBonds=false, we should get more (or at least as many) torsions
+  EXPECT_GE(torsionsKeep.totalCount(), torsionsIgnore.totalCount());
+
+  auto tfdIgnore = generator_.GetTFDMatrix(*mol, optionsIgnore);
+  auto tfdKeep   = generator_.GetTFDMatrix(*mol, optionsKeep);
+
+  // Both should produce valid results
+  for (double val : tfdIgnore) {
+    EXPECT_TRUE(std::isfinite(val));
+    EXPECT_GE(val, 0.0);
+  }
+  for (double val : tfdKeep) {
+    EXPECT_TRUE(std::isfinite(val));
+    EXPECT_GE(val, 0.0);
+  }
+}
+
+TEST_F(TFDCpuTest, CompareWithRDKitReference) {
+  // Compare full TFD pipeline against pre-computed RDKit reference values.
+  // Includes single-quartet, multi-quartet (ring, symmetric), ring+substituent,
+  // and triple-bond molecules.
+  //
+  // Reference values generated with RDKit Python:
+  //   mol = Chem.MolFromSmiles(smiles)
+  //   params = AllChem.ETKDGv3()
+  //   params.randomSeed = <seed>
+  //   AllChem.EmbedMultipleConfs(mol, <numConfs>, params)
+  //   tfd = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev='equal', symmRadius=2)
+
+  struct TestCase {
+    const char*         smiles;
+    int                 numConfs;
+    int                 seed;
+    std::vector<double> reference;
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCC", 4, 42, {              // n-butane, 1 torsion
+      0.6667389132, 0.0000726610, 0.6666662521,
+      0.6667387931, 0.0000001200, 0.6666661321
+    }},
+    {"CCCCC", 4, 42, {             // n-pentane, 2 torsions
+      0.6060606631, 0.6060573299, 0.6060662252,
+      0.6666872992, 0.6666326206, 0.0606323184
+    }},
+    {"CCCCCC", 4, 42, {            // n-hexane, 3 torsions
+      0.6111276139, 0.0555704226, 0.6666744357,
+      0.5555532106, 0.6111014381, 0.6111123144
+    }},
+    {"CC(C)CC", 4, 42, {           // isopentane, 1 symmetric torsion (2 quartets)
+      0.0000045777, 0.0433253929, 0.0433299706,
+      0.0000027031, 0.0000072808, 0.0433226898
+    }},
+    {"C1CCCCC1", 4, 42, {          // cyclohexane, 1 ring torsion (6 quartets)
+      0.1343662730, 0.3653621455, 0.2309958724,
+      0.1017138867, 0.2360801597, 0.4670760321
+    }},
+    {"c1ccccc1CC", 4, 42, {        // ethylbenzene, 1 symmetric non-ring + 1 ring torsion
+      0.0000114982, 0.0000104863, 0.0000020789,
+      0.0000100311, 0.0000018047, 0.0000013199
+    }},
+    {"CC(=O)[C@H]1CCCC[C@@H]1CN", 5, 44, {  // cyclohexane + ketone/amine substituents
+      0.3538817018, 0.5924305446, 0.2977012405, 0.5450659700, 0.6307899400,
+      0.3331152938, 0.5590067080, 0.3311160076, 0.3015480245, 0.5678678322
+    }},
+    {"C[C@@H](CN(C)C(=O)C#CCN)C1CC1", 5, 45, {  // cyclopropane + triple bond
+      0.6173250123, 0.2578501411, 0.8622535690, 0.4331903310, 0.7154140061,
+      0.1978909084, 0.1683354525, 0.4489992971, 0.4239704017, 0.2760081465
+    }},
+  };
+  // clang-format on
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = true;
+  options.maxDevMode          = nvMolKit::TFDMaxDevMode::Equal;
+  options.symmRadius          = 2;
+  options.ignoreColinearBonds = true;
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(tc.smiles);
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    generateConformers(*mol, tc.numConfs, tc.seed);
+    auto tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+
+    ASSERT_EQ(tfdMatrix.size(), tc.reference.size());
+    constexpr double kRDKitTolerance = 5e-4;
+    for (size_t i = 0; i < tfdMatrix.size(); ++i) {
+      EXPECT_NEAR(tfdMatrix[i], tc.reference[i], kRDKitTolerance) << "TFD[" << i << "] mismatch with RDKit reference";
+    }
+  }
+}
+
+TEST_F(TFDCpuTest, CompareWithRDKitReferenceAddHs) {
+  // Compare TFD with explicit hydrogens (AddHs) against RDKit reference values.
+  // AddHs produces more quartets per torsion, exercising multi-quartet handling
+  // on molecules that would otherwise be single-quartet.
+  //
+  // Reference values generated with RDKit Python:
+  //   mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
+  //   params = AllChem.ETKDGv3()
+  //   params.randomSeed = 42
+  //   AllChem.EmbedMultipleConfs(mol, 4, params)
+  //   tfd = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev='equal', symmRadius=2)
+
+  struct TestCase {
+    const char*         smiles;
+    std::vector<double> reference;
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCCC", {                   // n-pentane with AddHs (symmetric torsions from H)
+      0.6666346588,
+      0.0606342183,
+      0.6666024736,
+      0.6666717502,
+      0.6666935910,
+      0.6061117436
+    }},
+    {"CC(C)CC", {                 // isopentane with AddHs
+      0.0155798214,
+      0.0118375755,
+      0.0180868258,
+      0.0000140721,
+      0.0091266798,
+      0.0336807192
+    }},
+  };
+  // clang-format on
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = true;
+  options.maxDevMode          = nvMolKit::TFDMaxDevMode::Equal;
+  options.symmRadius          = 2;
+  options.ignoreColinearBonds = true;
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(std::string(tc.smiles) + " (AddHs)");
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    RDKit::MolOps::addHs(*mol);
+
+    generateConformers(*mol, 4, 42);
+    ASSERT_EQ(mol->getNumConformers(), 4);
+
+    auto tfdMatrix = generator_.GetTFDMatrix(*mol, options);
+
+    ASSERT_EQ(tfdMatrix.size(), tc.reference.size());
+    constexpr double kRDKitTolerance = 5e-4;
+    for (size_t i = 0; i < tfdMatrix.size(); ++i) {
+      EXPECT_NEAR(tfdMatrix[i], tc.reference[i], kRDKitTolerance) << "TFD[" << i << "] mismatch with RDKit reference";
+    }
+  }
+}
+
+// =============================================================================
+// GetTFDMatrices (batch)
+// =============================================================================
+
+TEST_F(TFDCpuTest, BatchProcessing) {
+  // Test batch processing of multiple molecules via GetTFDMatrices
+  // clang-format off
+  const std::vector<std::string> testSmiles = {
+    "CCCC", "CC(C)C", "c1ccccc1", "CCO", "CCCCC",
+    "CC(=O)O", "c1ccc(cc1)O", "CCCCCC", "CC(C)(C)C", "c1ccc2ccccc2c1",
+  };
+  // clang-format on
+
+  std::vector<std::unique_ptr<RDKit::RWMol>> mols;
+  std::vector<const RDKit::ROMol*>           molPtrs;
+
+  for (const auto& smiles : testSmiles) {
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(smiles));
+    if (mol) {
+      generateConformers(*mol, 3);
+      if (mol->getNumConformers() >= 2) {
+        molPtrs.push_back(mol.get());
+        mols.push_back(std::move(mol));
+      }
+    }
+  }
+
+  ASSERT_GE(mols.size(), 3u);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        results = generator_.GetTFDMatrices(molPtrs, options);
+
+  ASSERT_EQ(results.size(), molPtrs.size());
+
+  for (size_t i = 0; i < results.size(); ++i) {
+    int numConf       = molPtrs[i]->getNumConformers();
+    int expectedPairs = numConf * (numConf - 1) / 2;
+
+    EXPECT_EQ(results[i].size(), static_cast<size_t>(expectedPairs)) << "Mismatch for molecule " << i;
+
+    for (double tfd : results[i]) {
+      EXPECT_TRUE(std::isfinite(tfd));
+      EXPECT_GE(tfd, 0.0);
+    }
+  }
+}
diff --git a/tests/test_tfd_gpu.cpp b/tests/test_tfd_gpu.cpp
new file mode 100644
index 0000000..4bcad16
--- /dev/null
+++ b/tests/test_tfd_gpu.cpp
@@ -0,0 +1,327 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/DistGeomHelpers/Embedder.h>
+#include <GraphMol/MolOps.h>
+#include <GraphMol/ROMol.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "tfd_cpu.h"
+#include "tfd_gpu.h"
+
+namespace {
+
+constexpr double kTolerance = 1e-3;  // GPU has lower precision due to float
+
+//! Generate conformers for a molecule using RDKit
+void generateConformers(RDKit::ROMol& mol, int numConformers, int seed = 42) {
+  RDKit::DGeomHelpers::EmbedParameters params = RDKit::DGeomHelpers::ETKDGv3;
+  params.randomSeed                           = seed;
+  params.numThreads                           = 1;
+  RDKit::DGeomHelpers::EmbedMultipleConfs(mol, numConformers, params);
+}
+
+}  // namespace
+
+// Global check for CUDA availability
+static bool gCudaAvailable = false;
+static bool gCudaChecked   = false;
+
+static bool checkCudaAvailable() {
+  if (!gCudaChecked) {
+    gCudaChecked = true;
+    try {
+      int         deviceCount = 0;
+      cudaError_t err         = cudaGetDeviceCount(&deviceCount);
+      if (err == cudaSuccess && deviceCount > 0) {
+        err = cudaSetDevice(0);
+        if (err == cudaSuccess) {
+          gCudaAvailable = true;
+        } else {
+          cudaGetLastError();
+        }
+      } else {
+        cudaGetLastError();
+      }
+    } catch (...) {
+      gCudaAvailable = false;
+    }
+  }
+  return gCudaAvailable;
+}
+
+// ========== GPU Generator Tests ==========
+
+class TFDGpuTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    if (!checkCudaAvailable()) {
+      GTEST_SKIP() << "No CUDA devices available, skipping GPU tests";
+    }
+    gpuGenerator_ = std::make_unique<nvMolKit::TFDGpuGenerator>();
+  }
+
+  nvMolKit::TFDCpuGenerator                  cpuGenerator_;
+  std::unique_ptr<nvMolKit::TFDGpuGenerator> gpuGenerator_;
+};
+
+TEST_F(TFDGpuTest, MatchesCPUReference) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 5);
+  ASSERT_EQ(mol->getNumConformers(), 5);
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+  ASSERT_FALSE(cpuTFD.empty());
+
+  auto gpuTFD = gpuGenerator_->GetTFDMatrix(*mol, options);
+
+  ASSERT_EQ(gpuTFD.size(), cpuTFD.size());
+  for (size_t i = 0; i < cpuTFD.size(); ++i) {
+    EXPECT_NEAR(gpuTFD[i], cpuTFD[i], kTolerance) << "Mismatch at TFD index " << i;
+  }
+}
+
+TEST_F(TFDGpuTest, MatchesCPUReferenceUnweighted) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 5);
+  ASSERT_EQ(mol->getNumConformers(), 5);
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights = false;
+
+  auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+  ASSERT_FALSE(cpuTFD.empty());
+
+  auto gpuTFD = gpuGenerator_->GetTFDMatrix(*mol, options);
+
+  ASSERT_EQ(gpuTFD.size(), cpuTFD.size());
+  for (size_t i = 0; i < cpuTFD.size(); ++i) {
+    EXPECT_NEAR(gpuTFD[i], cpuTFD[i], kTolerance) << "Mismatch at TFD index " << i;
+  }
+}
+
+TEST_F(TFDGpuTest, BatchMultipleMolecules) {
+  // Mix of single-quartet and multi-quartet molecules in a single batch
+  std::vector<std::string> smilesList = {
+    "CCCC",                           // n-butane (single-quartet)
+    "CCCCC",                          // n-pentane (single-quartet)
+    "CC(C)CC",                        // isopentane (symmetric, 2 quartets)
+    "C1CCCCC1",                       // cyclohexane (ring, 6 quartets)
+    "c1ccccc1CC",                     // ethylbenzene (ring + symmetric)
+    "CCO",                            // ethanol
+    "CC(=O)O",                        // acetic acid
+    "CC(=O)[C@H]1CCCC[C@@H]1CN",      // cyclohexane + ketone/amine substituents
+    "C[C@@H](CN(C)C(=O)C#CCN)C1CC1",  // cyclopropane + triple bond
+  };
+
+  std::vector<std::unique_ptr<RDKit::RWMol>> mols;
+  std::vector<const RDKit::ROMol*>           molPtrs;
+
+  for (const auto& smiles : smilesList) {
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(smiles));
+    if (mol) {
+      generateConformers(*mol, 4);
+      if (mol->getNumConformers() >= 2) {
+        molPtrs.push_back(mol.get());
+        mols.push_back(std::move(mol));
+      }
+    }
+  }
+
+  ASSERT_GE(mols.size(), 3u);
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto cpuResults = cpuGenerator_.GetTFDMatrices(molPtrs, options);
+  auto gpuResults = gpuGenerator_->GetTFDMatrices(molPtrs, options);
+
+  ASSERT_EQ(gpuResults.size(), cpuResults.size());
+  for (size_t m = 0; m < cpuResults.size(); ++m) {
+    ASSERT_EQ(gpuResults[m].size(), cpuResults[m].size()) << "Size mismatch for molecule " << m;
+
+    for (size_t i = 0; i < cpuResults[m].size(); ++i) {
+      EXPECT_NEAR(gpuResults[m][i], cpuResults[m][i], kTolerance) << "Mismatch at molecule " << m << " TFD index " << i;
+    }
+  }
+}
+
+TEST_F(TFDGpuTest, SingleConformer) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 1);
+  ASSERT_EQ(mol->getNumConformers(), 1);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        tfdMatrix = gpuGenerator_->GetTFDMatrix(*mol, options);
+
+  EXPECT_TRUE(tfdMatrix.empty());
+}
+
+TEST_F(TFDGpuTest, NoTorsionsMolecule) {
+  // Methane has no rotatable bonds
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("C"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 3);
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+  auto gpuTFD = gpuGenerator_->GetTFDMatrix(*mol, options);
+
+  ASSERT_EQ(gpuTFD.size(), cpuTFD.size());
+  for (size_t i = 0; i < gpuTFD.size(); ++i) {
+    EXPECT_NEAR(gpuTFD[i], 0.0, kTolerance);
+    EXPECT_NEAR(cpuTFD[i], 0.0, kTolerance);
+  }
+}
+
+TEST_F(TFDGpuTest, GeneratorReuse) {
+  // Verify that calling GetTFDMatrix twice on the same generator works.
+  // The first call moves device_.tfdOutput into the result; the second call
+  // must work correctly with the reallocated output buffer.
+  auto mol1 = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol1, nullptr);
+  generateConformers(*mol1, 3);
+
+  auto mol2 = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol2, nullptr);
+  generateConformers(*mol2, 4);
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto gpuTFD1 = gpuGenerator_->GetTFDMatrix(*mol1, options);
+  auto gpuTFD2 = gpuGenerator_->GetTFDMatrix(*mol2, options);
+
+  auto cpuTFD1 = cpuGenerator_.GetTFDMatrix(*mol1, options);
+  auto cpuTFD2 = cpuGenerator_.GetTFDMatrix(*mol2, options);
+
+  ASSERT_EQ(gpuTFD1.size(), cpuTFD1.size());
+  for (size_t i = 0; i < cpuTFD1.size(); ++i) {
+    EXPECT_NEAR(gpuTFD1[i], cpuTFD1[i], kTolerance) << "First call mismatch at index " << i;
+  }
+
+  ASSERT_EQ(gpuTFD2.size(), cpuTFD2.size());
+  for (size_t i = 0; i < cpuTFD2.size(); ++i) {
+    EXPECT_NEAR(gpuTFD2[i], cpuTFD2[i], kTolerance) << "Second call mismatch at index " << i;
+  }
+}
+
+// ========== GPU Result Extraction Tests ==========
+
+TEST_F(TFDGpuTest, GpuResultExtraction) {
+  std::vector<std::unique_ptr<RDKit::RWMol>> mols;
+  std::vector<const RDKit::ROMol*>           molPtrs;
+
+  auto mol1 = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  generateConformers(*mol1, 3);
+  molPtrs.push_back(mol1.get());
+  mols.push_back(std::move(mol1));
+
+  auto mol2 = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  generateConformers(*mol2, 4);
+  molPtrs.push_back(mol2.get());
+  mols.push_back(std::move(mol2));
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto gpuResult = gpuGenerator_->GetTFDMatricesGpuBuffer(molPtrs, options);
+
+  // Test extractMolecule
+  auto mol0Extracted = gpuResult.extractMolecule(0);
+  auto mol1Extracted = gpuResult.extractMolecule(1);
+
+  // Verify sizes: 3 conformers = 3 pairs, 4 conformers = 6 pairs
+  EXPECT_EQ(mol0Extracted.size(), 3u);  // 3*(3-1)/2 = 3
+  EXPECT_EQ(mol1Extracted.size(), 6u);  // 4*(4-1)/2 = 6
+
+  // Test extractAll
+  auto allExtracted = gpuResult.extractAll();
+  ASSERT_EQ(allExtracted.size(), 2u);
+  EXPECT_EQ(allExtracted[0].size(), mol0Extracted.size());
+  EXPECT_EQ(allExtracted[1].size(), mol1Extracted.size());
+
+  // Values should match
+  for (size_t i = 0; i < mol0Extracted.size(); ++i) {
+    EXPECT_DOUBLE_EQ(allExtracted[0][i], mol0Extracted[i]);
+  }
+  for (size_t i = 0; i < mol1Extracted.size(); ++i) {
+    EXPECT_DOUBLE_EQ(allExtracted[1][i], mol1Extracted[i]);
+  }
+
+  // Compare with CPU reference
+  auto cpuResults = cpuGenerator_.GetTFDMatrices(molPtrs, options);
+  for (size_t m = 0; m < allExtracted.size(); ++m) {
+    ASSERT_EQ(allExtracted[m].size(), cpuResults[m].size());
+    for (size_t i = 0; i < allExtracted[m].size(); ++i) {
+      EXPECT_NEAR(allExtracted[m][i], cpuResults[m][i], kTolerance) << "Mismatch at molecule " << m << " index " << i;
+    }
+  }
+}
+
+TEST_F(TFDGpuTest, GpuResultExtractionOutOfRange) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+  generateConformers(*mol, 3);
+
+  std::vector<const RDKit::ROMol*> molPtrs = {mol.get()};
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        gpuResult = gpuGenerator_->GetTFDMatricesGpuBuffer(molPtrs, options);
+
+  EXPECT_NO_THROW(gpuResult.extractMolecule(0));
+  EXPECT_THROW(gpuResult.extractMolecule(-1), std::out_of_range);
+  EXPECT_THROW(gpuResult.extractMolecule(1), std::out_of_range);
+}
+
+TEST_F(TFDGpuTest, AddHsMatchesCPU) {
+  // Test with explicit hydrogens (AddHs) — produces more multi-quartet torsions.
+  // Verifies GPU generator matches CPU for molecules with hydrogen atoms.
+  const std::vector<std::string> testSmiles = {"CCCCC", "CC(C)CC"};
+
+  for (const auto& smiles : testSmiles) {
+    SCOPED_TRACE(std::string(smiles) + " (AddHs)");
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(smiles));
+    ASSERT_NE(mol, nullptr);
+
+    RDKit::MolOps::addHs(*mol);
+    generateConformers(*mol, 4, 42);
+    ASSERT_EQ(mol->getNumConformers(), 4);
+
+    nvMolKit::TFDComputeOptions options;
+
+    auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+    ASSERT_FALSE(cpuTFD.empty());
+
+    auto gpuTFD = gpuGenerator_->GetTFDMatrix(*mol, options);
+
+    ASSERT_EQ(gpuTFD.size(), cpuTFD.size());
+    for (size_t i = 0; i < cpuTFD.size(); ++i) {
+      EXPECT_NEAR(gpuTFD[i], cpuTFD[i], kTolerance) << "Mismatch at TFD index " << i;
+    }
+  }
+}
diff --git a/tests/test_tfd_kernels.cu b/tests/test_tfd_kernels.cu
new file mode 100644
index 0000000..4fe3c3a
--- /dev/null
+++ b/tests/test_tfd_kernels.cu
@@ -0,0 +1,666 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/DistGeomHelpers/Embedder.h>
+#include <GraphMol/MolOps.h>
+#include <GraphMol/ROMol.h>
+#include <GraphMol/SmilesParse/SmilesParse.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "tfd_common.h"
+#include "tfd_cpu.h"
+#include "tfd_kernels.h"
+
+namespace {
+
+constexpr double kTolerance = 1e-3;  // GPU has lower precision due to float
+
+//! Generate conformers for a molecule using RDKit
+void generateConformers(RDKit::ROMol& mol, int numConformers, int seed = 42) {
+  RDKit::DGeomHelpers::EmbedParameters params = RDKit::DGeomHelpers::ETKDGv3;
+  params.randomSeed                           = seed;
+  params.numThreads                           = 1;
+  RDKit::DGeomHelpers::EmbedMultipleConfs(mol, numConformers, params);
+}
+
+}  // namespace
+
+// Global check for CUDA availability
+static bool gCudaAvailable = false;
+static bool gCudaChecked   = false;
+
+static bool checkCudaAvailable() {
+  if (!gCudaChecked) {
+    gCudaChecked = true;
+    try {
+      int         deviceCount = 0;
+      cudaError_t err         = cudaGetDeviceCount(&deviceCount);
+      if (err == cudaSuccess && deviceCount > 0) {
+        // Try to actually use the device
+        err = cudaSetDevice(0);
+        if (err == cudaSuccess) {
+          gCudaAvailable = true;
+        } else {
+          cudaGetLastError();  // Reset error state
+        }
+      } else {
+        cudaGetLastError();  // Reset error state
+      }
+    } catch (...) {
+      gCudaAvailable = false;
+    }
+  }
+  return gCudaAvailable;
+}
+
+class TFDKernelsTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Check if CUDA is available
+    if (!checkCudaAvailable()) {
+      GTEST_SKIP() << "No CUDA devices available, skipping GPU tests";
+    }
+  }
+
+  nvMolKit::TFDCpuGenerator cpuGenerator_;
+};
+
+// =============================================================================
+// Dihedral kernel
+// =============================================================================
+
+TEST_F(TFDKernelsTest, DihedralKernelBasic) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 3);
+  ASSERT_EQ(mol->getNumConformers(), 3);
+
+  nvMolKit::TFDComputeOptions options;
+  auto                        system = nvMolKit::buildTFDSystem(*mol, options);
+
+  ASSERT_EQ(system.numMolecules(), 1);
+  ASSERT_EQ(system.molDescriptors[0].numConformers, 3);
+  ASSERT_GT(system.totalTorsions(), 0);
+  ASSERT_GT(system.totalDihedralWorkItems(), 0);
+
+  // Transfer to device
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  std::vector<float> gpuAngles;
+
+  {
+    nvMolKit::TFDSystemDevice device;
+    nvMolKit::transferToDevice(system, device, stream);
+
+    nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                   device.positions.data(),
+                                   device.confPositionStarts.data(),
+                                   device.torsionAtoms.data(),
+                                   device.molDescriptors.data(),
+                                   device.dihedralWorkStarts.data(),
+                                   system.numMolecules(),
+                                   device.dihedralAngles.data(),
+                                   stream);
+
+    gpuAngles.resize(device.dihedralAngles.size());
+    device.dihedralAngles.copyToHost(gpuAngles.data(), gpuAngles.size());
+    cudaStreamSynchronize(stream);
+  }
+
+  // Compute CPU reference
+  auto tl = nvMolKit::extractTorsionList(*mol, options.maxDevMode, options.symmRadius, options.ignoreColinearBonds);
+  auto cpuAngles = cpuGenerator_.computeDihedralAngles(*mol, tl);
+
+  // Compare
+  ASSERT_EQ(gpuAngles.size(), cpuAngles.size());
+  for (size_t i = 0; i < cpuAngles.size(); ++i) {
+    EXPECT_NEAR(gpuAngles[i], cpuAngles[i], kTolerance) << "Mismatch at angle " << i;
+  }
+
+  cudaStreamDestroy(stream);
+}
+
+// =============================================================================
+// TFD pipeline (GPU vs CPU)
+// =============================================================================
+
+TEST_F(TFDKernelsTest, TFDMatrixKernelMatchesCPU) {
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 5);
+  ASSERT_EQ(mol->getNumConformers(), 5);
+
+  nvMolKit::TFDComputeOptions options;
+
+  // Compute CPU reference
+  auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+  ASSERT_FALSE(cpuTFD.empty());
+
+  // Build system
+  auto system = nvMolKit::buildTFDSystem(*mol, options);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  std::vector<float> gpuTFD;
+
+  {
+    nvMolKit::TFDSystemDevice device;
+    nvMolKit::transferToDevice(system, device, stream);
+
+    // Compute dihedral angles first
+    nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                   device.positions.data(),
+                                   device.confPositionStarts.data(),
+                                   device.torsionAtoms.data(),
+                                   device.molDescriptors.data(),
+                                   device.dihedralWorkStarts.data(),
+                                   system.numMolecules(),
+                                   device.dihedralAngles.data(),
+                                   stream);
+
+    // Compute TFD matrix
+    nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                    device.dihedralAngles.data(),
+                                    device.torsionWeights.data(),
+                                    device.torsionMaxDevs.data(),
+                                    device.quartetStarts.data(),
+                                    device.torsionTypes.data(),
+                                    device.molDescriptors.data(),
+                                    device.tfdOutput.data(),
+                                    stream);
+
+    gpuTFD.resize(device.tfdOutput.size());
+    device.tfdOutput.copyToHost(gpuTFD.data(), gpuTFD.size());
+    cudaStreamSynchronize(stream);
+  }
+
+  // Compare
+  ASSERT_EQ(gpuTFD.size(), cpuTFD.size());
+  for (size_t i = 0; i < cpuTFD.size(); ++i) {
+    EXPECT_NEAR(gpuTFD[i], cpuTFD[i], kTolerance) << "Mismatch at TFD index " << i;
+  }
+
+  cudaStreamDestroy(stream);
+}
+
+TEST_F(TFDKernelsTest, BatchMultipleMolecules) {
+  // Mix of single-quartet and multi-quartet molecules in a single batch
+  const std::vector<std::string> testSmiles = {
+    "CCCC",         // n-butane (single-quartet)
+    "CCCCC",        // n-pentane (single-quartet)
+    "CC(C)CC",      // isopentane (symmetric, 2 quartets)
+    "c1ccccc1",     // benzene (ring, 6 quartets)
+    "c1ccccc1CC",   // ethylbenzene (ring + symmetric)
+    "CCO",          // ethanol
+    "c1ccc(cc1)O",  // phenol
+  };
+
+  std::vector<std::unique_ptr<RDKit::RWMol>> mols;
+  std::vector<const RDKit::ROMol*>           molPtrs;
+
+  for (const auto& smiles : testSmiles) {
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(smiles));
+    if (mol) {
+      generateConformers(*mol, 3);
+      if (mol->getNumConformers() >= 2) {
+        molPtrs.push_back(mol.get());
+        mols.push_back(std::move(mol));
+      }
+    }
+  }
+
+  ASSERT_GE(mols.size(), 3u);
+
+  nvMolKit::TFDComputeOptions options;
+
+  // Compute CPU reference
+  auto cpuResults = cpuGenerator_.GetTFDMatrices(molPtrs, options);
+
+  // Build batch system
+  auto system = nvMolKit::buildTFDSystem(molPtrs, options);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  std::vector<float> gpuTFDFlat;
+
+  {
+    nvMolKit::TFDSystemDevice device;
+    nvMolKit::transferToDevice(system, device, stream);
+
+    // Launch dihedral kernel
+    nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                   device.positions.data(),
+                                   device.confPositionStarts.data(),
+                                   device.torsionAtoms.data(),
+                                   device.molDescriptors.data(),
+                                   device.dihedralWorkStarts.data(),
+                                   system.numMolecules(),
+                                   device.dihedralAngles.data(),
+                                   stream);
+
+    // Launch TFD matrix kernel
+    nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                    device.dihedralAngles.data(),
+                                    device.torsionWeights.data(),
+                                    device.torsionMaxDevs.data(),
+                                    device.quartetStarts.data(),
+                                    device.torsionTypes.data(),
+                                    device.molDescriptors.data(),
+                                    device.tfdOutput.data(),
+                                    stream);
+
+    gpuTFDFlat.resize(device.tfdOutput.size());
+    if (!gpuTFDFlat.empty()) {
+      device.tfdOutput.copyToHost(gpuTFDFlat.data(), gpuTFDFlat.size());
+    }
+    cudaStreamSynchronize(stream);
+  }
+
+  // Compare per molecule
+  for (size_t m = 0; m < molPtrs.size(); ++m) {
+    int outStart = system.molDescriptors[m].tfdOutStart;
+    int outEnd   = (static_cast<int>(m) + 1 < system.numMolecules()) ? system.molDescriptors[m + 1].tfdOutStart :
+                                                                       system.totalTFDOutputs();
+
+    ASSERT_EQ(static_cast<size_t>(outEnd - outStart), cpuResults[m].size()) << "Size mismatch for molecule " << m;
+
+    for (int i = outStart; i < outEnd; ++i) {
+      EXPECT_NEAR(gpuTFDFlat[i], cpuResults[m][i - outStart], kTolerance)
+        << "Mismatch at molecule " << m << " TFD index " << (i - outStart);
+    }
+  }
+
+  cudaStreamDestroy(stream);
+}
+
+// =============================================================================
+// Direct RDKit reference
+// =============================================================================
+
+TEST_F(TFDKernelsTest, CompareWithRDKitReference) {
+  // Compare GPU TFD output directly against pre-computed RDKit reference values.
+  // Includes both single-quartet and multi-quartet (ring, symmetric) molecules.
+  //
+  // Reference generated with RDKit Python:
+  //   mol = Chem.MolFromSmiles(smiles)
+  //   params = AllChem.ETKDGv3()
+  //   params.randomSeed = 42
+  //   AllChem.EmbedMultipleConfs(mol, 4, params)
+  //   tfd = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev='equal', symmRadius=2)
+
+  struct TestCase {
+    const char*         smiles;
+    std::vector<double> reference;
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCC", {                    // n-butane, 1 torsion
+      0.6667389132,  // TFD(1,0)
+      0.0000726610,  // TFD(2,0)
+      0.6666662521,  // TFD(2,1)
+      0.6667387931,  // TFD(3,0)
+      0.0000001200,  // TFD(3,1)
+      0.6666661321   // TFD(3,2)
+    }},
+    {"CCCCC", {                   // n-pentane, 2 torsions
+      0.6060606631,  // TFD[0]
+      0.6060573299,  // TFD[1]
+      0.6060662252,  // TFD[2]
+      0.6666872992,  // TFD[3]
+      0.6666326206,  // TFD[4]
+      0.0606323184   // TFD[5]
+    }},
+    {"CCCCCC", {                  // n-hexane, 3 torsions
+      0.6111276139,  // TFD[0]
+      0.0555704226,  // TFD[1]
+      0.6666744357,  // TFD[2]
+      0.5555532106,  // TFD[3]
+      0.6111014381,  // TFD[4]
+      0.6111123144   // TFD[5]
+    }},
+    {"CC(C)CC", {                 // isopentane, 1 symmetric torsion (2 quartets)
+      0.0000045777,
+      0.0433253929,
+      0.0433299706,
+      0.0000027031,
+      0.0000072808,
+      0.0433226898
+    }},
+    {"C1CCCCC1", {                // cyclohexane, 1 ring torsion (6 quartets)
+      0.1343662730,
+      0.3653621455,
+      0.2309958724,
+      0.1017138867,
+      0.2360801597,
+      0.4670760321
+    }},
+    {"c1ccccc1CC", {              // ethylbenzene, 1 symmetric non-ring + 1 ring
+      0.0000114982,
+      0.0000104863,
+      0.0000020789,
+      0.0000100311,
+      0.0000018047,
+      0.0000013199
+    }},
+  };
+  // clang-format on
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = true;
+  options.maxDevMode          = nvMolKit::TFDMaxDevMode::Equal;
+  options.symmRadius          = 2;
+  options.ignoreColinearBonds = true;
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(tc.smiles);
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    generateConformers(*mol, 4, 42);
+    ASSERT_EQ(mol->getNumConformers(), 4);
+
+    auto system = nvMolKit::buildTFDSystem(*mol, options);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    std::vector<float> gpuTFD;
+
+    {
+      nvMolKit::TFDSystemDevice device;
+      nvMolKit::transferToDevice(system, device, stream);
+
+      nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                     device.positions.data(),
+                                     device.confPositionStarts.data(),
+                                     device.torsionAtoms.data(),
+                                     device.molDescriptors.data(),
+                                     device.dihedralWorkStarts.data(),
+                                     system.numMolecules(),
+                                     device.dihedralAngles.data(),
+                                     stream);
+
+      nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                      device.dihedralAngles.data(),
+                                      device.torsionWeights.data(),
+                                      device.torsionMaxDevs.data(),
+                                      device.quartetStarts.data(),
+                                      device.torsionTypes.data(),
+                                      device.molDescriptors.data(),
+                                      device.tfdOutput.data(),
+                                      stream);
+
+      gpuTFD.resize(device.tfdOutput.size());
+      device.tfdOutput.copyToHost(gpuTFD.data(), gpuTFD.size());
+      cudaStreamSynchronize(stream);
+    }
+
+    ASSERT_EQ(gpuTFD.size(), tc.reference.size());
+    for (size_t i = 0; i < gpuTFD.size(); ++i) {
+      EXPECT_NEAR(gpuTFD[i], tc.reference[i], kTolerance) << "TFD[" << i << "] mismatch with RDKit reference";
+    }
+
+    cudaStreamDestroy(stream);
+  }
+}
+
+// =============================================================================
+// Edge cases
+// =============================================================================
+
+TEST_F(TFDKernelsTest, TwoConformers) {
+  // Minimum pair case: 2 conformers produce exactly 1 TFD pair
+  auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(mol, nullptr);
+
+  generateConformers(*mol, 2);
+  ASSERT_EQ(mol->getNumConformers(), 2);
+
+  nvMolKit::TFDComputeOptions options;
+
+  auto cpuTFD = cpuGenerator_.GetTFDMatrix(*mol, options);
+  ASSERT_EQ(cpuTFD.size(), 1u);
+
+  auto system = nvMolKit::buildTFDSystem(*mol, options);
+  ASSERT_EQ(system.totalTFDOutputs(), 1);
+  ASSERT_EQ(system.totalTFDOutputs(), 1);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  std::vector<float> gpuTFD;
+
+  {
+    nvMolKit::TFDSystemDevice device;
+    nvMolKit::transferToDevice(system, device, stream);
+
+    nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                   device.positions.data(),
+                                   device.confPositionStarts.data(),
+                                   device.torsionAtoms.data(),
+                                   device.molDescriptors.data(),
+                                   device.dihedralWorkStarts.data(),
+                                   system.numMolecules(),
+                                   device.dihedralAngles.data(),
+                                   stream);
+
+    nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                    device.dihedralAngles.data(),
+                                    device.torsionWeights.data(),
+                                    device.torsionMaxDevs.data(),
+                                    device.quartetStarts.data(),
+                                    device.torsionTypes.data(),
+                                    device.molDescriptors.data(),
+                                    device.tfdOutput.data(),
+                                    stream);
+
+    gpuTFD.resize(device.tfdOutput.size());
+    device.tfdOutput.copyToHost(gpuTFD.data(), gpuTFD.size());
+    cudaStreamSynchronize(stream);
+  }
+
+  ASSERT_EQ(gpuTFD.size(), 1u);
+  EXPECT_NEAR(gpuTFD[0], cpuTFD[0], kTolerance);
+
+  cudaStreamDestroy(stream);
+}
+
+TEST_F(TFDKernelsTest, BatchWithZeroTorsionMolecule) {
+  // A molecule with no torsions (ethane) should produce TFD=0
+  // and not corrupt results of neighboring molecules in the batch
+  auto ethane = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CC"));
+  ASSERT_NE(ethane, nullptr);
+  generateConformers(*ethane, 3);
+  ASSERT_GE(ethane->getNumConformers(), 2);
+
+  auto butane = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol("CCCC"));
+  ASSERT_NE(butane, nullptr);
+  generateConformers(*butane, 3);
+  ASSERT_GE(butane->getNumConformers(), 2);
+
+  // CPU reference for butane alone
+  nvMolKit::TFDComputeOptions options;
+  auto                        cpuButane = cpuGenerator_.GetTFDMatrix(*butane, options);
+
+  // GPU batch: ethane first, then butane
+  std::vector<const RDKit::ROMol*> molPtrs = {ethane.get(), butane.get()};
+  auto                             system  = nvMolKit::buildTFDSystem(molPtrs, options);
+  ASSERT_EQ(system.numMolecules(), 2);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+
+  std::vector<float> gpuTFD;
+
+  {
+    nvMolKit::TFDSystemDevice device;
+    nvMolKit::transferToDevice(system, device, stream);
+
+    nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                   device.positions.data(),
+                                   device.confPositionStarts.data(),
+                                   device.torsionAtoms.data(),
+                                   device.molDescriptors.data(),
+                                   device.dihedralWorkStarts.data(),
+                                   system.numMolecules(),
+                                   device.dihedralAngles.data(),
+                                   stream);
+
+    nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                    device.dihedralAngles.data(),
+                                    device.torsionWeights.data(),
+                                    device.torsionMaxDevs.data(),
+                                    device.quartetStarts.data(),
+                                    device.torsionTypes.data(),
+                                    device.molDescriptors.data(),
+                                    device.tfdOutput.data(),
+                                    stream);
+
+    gpuTFD.resize(device.tfdOutput.size());
+    if (!gpuTFD.empty()) {
+      device.tfdOutput.copyToHost(gpuTFD.data(), gpuTFD.size());
+    }
+    cudaStreamSynchronize(stream);
+  }
+
+  // Check ethane: all TFD values must be exactly 0 (no torsions).
+  // The kernel skips molecules with 0 torsions, so output buffer must be pre-zeroed.
+  int ethaneStart = system.molDescriptors[0].tfdOutStart;
+  int ethaneEnd   = system.molDescriptors[1].tfdOutStart;
+  for (int i = ethaneStart; i < ethaneEnd; ++i) {
+    EXPECT_EQ(gpuTFD[i], 0.0f) << "Ethane (no torsions) should have TFD=0 at index " << i;
+  }
+
+  // Check butane: should match CPU reference (not corrupted by ethane)
+  int butaneStart = system.molDescriptors[1].tfdOutStart;
+  int butaneEnd   = system.totalTFDOutputs();
+  ASSERT_EQ(static_cast<size_t>(butaneEnd - butaneStart), cpuButane.size());
+  for (int i = butaneStart; i < butaneEnd; ++i) {
+    EXPECT_NEAR(gpuTFD[i], cpuButane[i - butaneStart], kTolerance)
+      << "Butane TFD mismatch at index " << (i - butaneStart);
+  }
+
+  cudaStreamDestroy(stream);
+}
+
+TEST_F(TFDKernelsTest, CompareWithRDKitReferenceAddHs) {
+  // Compare GPU TFD output with AddHs against RDKit reference values.
+  // AddHs produces more quartets per torsion, exercising multi-quartet handling
+  // on molecules that would otherwise be single-quartet.
+  //
+  // Reference generated with RDKit Python:
+  //   mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
+  //   params = AllChem.ETKDGv3()
+  //   params.randomSeed = 42
+  //   AllChem.EmbedMultipleConfs(mol, 4, params)
+  //   tfd = TorsionFingerprints.GetTFDMatrix(mol, useWeights=True, maxDev='equal', symmRadius=2)
+
+  struct TestCase {
+    const char*         smiles;
+    std::vector<double> reference;
+  };
+
+  // clang-format off
+  std::vector<TestCase> cases = {
+    {"CCCCC", {
+      0.6666346588,
+      0.0606342183,
+      0.6666024736,
+      0.6666717502,
+      0.6666935910,
+      0.6061117436
+    }},
+    {"CC(C)CC", {
+      0.0155798214,
+      0.0118375755,
+      0.0180868258,
+      0.0000140721,
+      0.0091266798,
+      0.0336807192
+    }},
+  };
+  // clang-format on
+
+  nvMolKit::TFDComputeOptions options;
+  options.useWeights          = true;
+  options.maxDevMode          = nvMolKit::TFDMaxDevMode::Equal;
+  options.symmRadius          = 2;
+  options.ignoreColinearBonds = true;
+
+  for (const auto& tc : cases) {
+    SCOPED_TRACE(std::string(tc.smiles) + " (AddHs)");
+
+    auto mol = std::unique_ptr<RDKit::RWMol>(RDKit::SmilesToMol(tc.smiles));
+    ASSERT_NE(mol, nullptr);
+
+    RDKit::MolOps::addHs(*mol);
+
+    generateConformers(*mol, 4, 42);
+    ASSERT_EQ(mol->getNumConformers(), 4);
+
+    auto system = nvMolKit::buildTFDSystem(*mol, options);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    std::vector<float> gpuTFD;
+
+    {
+      nvMolKit::TFDSystemDevice device;
+      nvMolKit::transferToDevice(system, device, stream);
+
+      nvMolKit::launchDihedralKernel(system.totalDihedralWorkItems(),
+                                     device.positions.data(),
+                                     device.confPositionStarts.data(),
+                                     device.torsionAtoms.data(),
+                                     device.molDescriptors.data(),
+                                     device.dihedralWorkStarts.data(),
+                                     system.numMolecules(),
+                                     device.dihedralAngles.data(),
+                                     stream);
+
+      nvMolKit::launchTFDMatrixKernel(system.numMolecules(),
+                                      device.dihedralAngles.data(),
+                                      device.torsionWeights.data(),
+                                      device.torsionMaxDevs.data(),
+                                      device.quartetStarts.data(),
+                                      device.torsionTypes.data(),
+                                      device.molDescriptors.data(),
+                                      device.tfdOutput.data(),
+                                      stream);
+
+      gpuTFD.resize(device.tfdOutput.size());
+      device.tfdOutput.copyToHost(gpuTFD.data(), gpuTFD.size());
+      cudaStreamSynchronize(stream);
+    }
+
+    ASSERT_EQ(gpuTFD.size(), tc.reference.size());
+    for (size_t i = 0; i < gpuTFD.size(); ++i) {
+      EXPECT_NEAR(gpuTFD[i], tc.reference[i], kTolerance) << "TFD[" << i << "] mismatch with RDKit reference";
+    }
+
+    cudaStreamDestroy(stream);
+  }
+}