Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1aa2ff2
Add TFD (Torsion Fingerprint Deviation) CPU implementation
evasnow1992 Feb 9, 2026
3818dc5
Add TFD GPU CUDA kernels for dihedral and pairwise TFD computation
evasnow1992 Feb 9, 2026
d099729
Add TFD GPU orchestration layer and unified public API
evasnow1992 Feb 9, 2026
f0b170e
Add TFD Python bindings and test suite
evasnow1992 Feb 10, 2026
690c8d8
Add multi-quartet support for ring and symmetric torsions in TFD
evasnow1992 Feb 10, 2026
7861600
Parallelize RDKit preprocessing in TFD batch builds
evasnow1992 Feb 10, 2026
48effd9
Add TFD benchmark script comparing RDKit, CPU, and GPU backends
evasnow1992 Feb 10, 2026
e78c8c7
Add NVTX instrumentation to TFD and nsys profiling script
evasnow1992 Feb 11, 2026
76bf762
Refactor TFDSystemHost: remove redundant fields and optimize bond reuse
evasnow1992 Mar 4, 2026
5db7095
Fix Morgan fingerprint invariants and ring torsion deviation to match…
evasnow1992 Mar 5, 2026
ba97e15
Add skipGpuWorkItems to buildTFDSystem for CPU path optimization
evasnow1992 Mar 6, 2026
e9380df
Add molecule preprocessing script and pickle-based caching for benchm…
evasnow1992 Mar 6, 2026
85fed05
Refactor GPU kernels: MolDescriptor dispatch, block-per-molecule TFD
evasnow1992 Mar 10, 2026
daec8d9
Add return_type option to Python TFD API and optimize GPU result extr…
evasnow1992 Mar 11, 2026
3f34a4c
Add Python tests for numpy and tensor return types in TFD API
evasnow1992 Mar 11, 2026
dc5f338
Update TFD reference values for RDKit 2025.09.5
evasnow1992 Mar 11, 2026
6083fb5
Simplify TFD API layering: direct CPU/GPU generator routing
evasnow1992 Mar 12, 2026
6820e43
Move TFDGenerator tests to dedicated test_tfd.cpp, add backend valida…
evasnow1992 Mar 12, 2026
e618a93
Remove redundant list copies in Python TFD result handling
evasnow1992 Mar 12, 2026
bcd3889
Document floating-point tiebreak limitation in findCentralBond
evasnow1992 Apr 2, 2026
777fb6b
Add --pkl-file and --verify-only flags to tfd_bench.py
evasnow1992 Apr 2, 2026
27a084e
Commit message:
evasnow1992 Apr 2, 2026
6e12693
Optimize CPU TFD result path: zero-copy numpy arrays, add NVTX instru…
evasnow1992 Apr 2, 2026
f0981dc
Optimize TFD batch build and simplify C++ API surface
evasnow1992 Apr 3, 2026
e15075b
Fix copyright years and ruff formatting for CI
evasnow1992 Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions benchmarks/prepare_mols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Precompute molecules with conformers for TFD benchmarking and profiling.

Generates conformers once and saves the results as pickle files (one per
conformer count). This avoids the expensive ETKDG embedding during
benchmark/profile runs.

Output files: <output-dir>/prepared_mols_<N>confs.pkl

Usage:
python benchmarks/prepare_mols.py

# Custom settings:
python benchmarks/prepare_mols.py \
--smiles-file benchmarks/data/benchmark_smiles.csv \
--output-dir benchmarks/data \
--conformers 5 10 20 50 \
--max-mols 500 \
--workers 8
"""

import argparse
import multiprocessing
import os
import pickle
import sys
import time
from functools import partial

import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem

# Disable every RDKit log channel matching "rdApp.*" at import time.
# NOTE(review): presumably this keeps per-molecule parse/embedding messages
# from drowning out the progress output below -- confirm that is the intent.
RDLogger.DisableLog("rdApp.*")


def _generate_one(args_tuple, num_confs):
    """Embed conformers for one SMILES entry and return the resulting molecule.

    Kept at module level so it can be pickled and dispatched to
    multiprocessing workers.

    Args:
        args_tuple: ``(index, smiles)`` pair; the index is added to the base
            random seed so each entry gets its own seed.
        num_confs: Number of conformers requested from the embedder.

    Returns:
        The molecule with hydrogens removed again, or ``None`` when the SMILES
        does not parse, the parsed molecule has fewer than 4 atoms, or fewer
        than 2 conformers were embedded.
    """
    index, smiles = args_tuple

    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    if parsed.GetNumAtoms() < 4:
        return None

    # Hydrogens are added for the embedding step and stripped before returning.
    with_hs = Chem.AddHs(parsed)

    embed_params = AllChem.ETKDGv3()
    embed_params.randomSeed = 42 + index
    # One thread per call; main() runs this function across a process pool.
    embed_params.numThreads = 1
    embed_params.useRandomCoords = True

    embedded_ids = AllChem.EmbedMultipleConfs(with_hs, numConfs=num_confs, params=embed_params)
    return Chem.RemoveHs(with_hs) if len(embedded_ids) >= 2 else None


def _collect_with_progress(result_iter, total):
    """Drain *result_iter* into a list, printing progress every 50 items and at the end."""
    collected = []
    for done, result in enumerate(result_iter, start=1):
        collected.append(result)
        if done % 50 == 0 or done == total:
            print(f" Progress: {done}/{total} molecules", flush=True)
    return collected


def main():
    """Command-line entry point: embed conformers and cache them as pickle files.

    Reads SMILES from the first column of ``--smiles-file``, keeps entries that
    parse to molecules with at least 4 atoms (up to ``--max-mols`` when > 0),
    and for every value in ``--conformers`` writes
    ``<output-dir>/prepared_mols_<N>confs.pkl`` containing the molecules for
    which ``_generate_one`` returned a result. Existing output files are left
    untouched and their conformer count is skipped.

    If no molecule survives for a given conformer count, nothing is written
    for that count (an empty cache would otherwise be picked up as "already
    exists" on the next run).
    """
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    default_smiles = os.path.join(data_dir, "benchmark_smiles.csv")
    default_output_dir = data_dir
    default_workers = max(1, multiprocessing.cpu_count() // 2)

    parser = argparse.ArgumentParser(
        description="Precompute molecules with conformers for TFD benchmarking",
    )
    parser.add_argument(
        "--smiles-file",
        type=str,
        default=default_smiles,
        help="CSV file with SMILES in first column (default: benchmarks/data/benchmark_smiles.csv)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=default_output_dir,
        help="Output directory for pickle files (default: benchmarks/data)",
    )
    parser.add_argument(
        "--conformers",
        type=int,
        nargs="+",
        default=[5, 10, 20],
        help="Conformer counts to generate (default: 5 10 20)",
    )
    parser.add_argument(
        "--max-mols",
        type=int,
        default=0,
        help="Maximum number of molecules to prepare (default: 0 = all valid SMILES)",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=default_workers,
        help=f"Number of parallel workers (default: {default_workers})",
    )
    args = parser.parse_args()

    print(f"Loading SMILES from: {args.smiles_file}")
    df = pd.read_csv(args.smiles_file)
    smiles_list = df.iloc[:, 0].tolist()
    print(f"Loaded {len(smiles_list)} SMILES")

    # Build (index, smiles) pairs, filtering small molecules. The index is the
    # row position in the CSV so per-molecule seeds stay stable when rows are
    # skipped.
    work_items = []
    for i, smi in enumerate(smiles_list):
        # Missing cells are read by pandas as float NaN; the RDKit parser only
        # accepts strings, so skip anything that is not one.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is not None and mol.GetNumAtoms() >= 4:
            work_items.append((i, smi))
            if args.max_mols > 0 and len(work_items) >= args.max_mols:
                break
    total = len(work_items)
    print(f"Valid molecules (>=4 atoms): {total}")
    print(f"Workers: {args.workers}")

    os.makedirs(args.output_dir, exist_ok=True)

    for num_confs in args.conformers:
        out_path = os.path.join(args.output_dir, f"prepared_mols_{num_confs}confs.pkl")

        if os.path.exists(out_path):
            print(f"\n--- {num_confs} conformers: {out_path} already exists, skipping ---")
            continue

        print(f"\n--- Generating {num_confs} conformers ({total} molecules, {args.workers} workers) ---")
        start = time.perf_counter()

        worker_fn = partial(_generate_one, num_confs=num_confs)

        if args.workers > 1:
            with multiprocessing.Pool(args.workers) as pool:
                # imap preserves input order, so results line up with work_items.
                results = _collect_with_progress(pool.imap(worker_fn, work_items), total)
        else:
            results = _collect_with_progress(map(worker_fn, work_items), total)

        mols = [m for m in results if m is not None]
        elapsed = time.perf_counter() - start

        if not mols:
            # Avoids a division by zero in the summary and avoids caching an
            # empty result that later runs would treat as complete.
            print(f" No molecules with >=2 conformers for {num_confs} requested; not writing {out_path}")
            print(f" Time: {elapsed:.1f}s")
            continue

        actual_confs = [m.GetNumConformers() for m in mols]
        total_pairs = sum(c * (c - 1) // 2 for c in actual_confs)

        # Write to a temporary name and move it into place so an interrupted
        # run never leaves a truncated file that the exists-check would skip.
        tmp_path = f"{out_path}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(mols, f, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp_path, out_path)

        file_size_mb = os.path.getsize(out_path) / (1024 * 1024)
        print(f" {len(mols)} molecules, avg {sum(actual_confs) / len(actual_confs):.1f} conformers")
        print(f" Total TFD pairs: {total_pairs}")
        print(f" Time: {elapsed:.1f}s")
        print(f" Saved: {out_path} ({file_size_mb:.1f} MB)")

    print("\nDone.")


# Run the CLI only when executed as a script. The guard also matters for
# multiprocessing: worker processes started with the "spawn" method re-import
# this module and must not re-enter main().
if __name__ == "__main__":
    main()
Loading
Loading