diff --git a/nvmolkit/CMakeLists.txt b/nvmolkit/CMakeLists.txt
index be4069b..1e6aa04 100644
--- a/nvmolkit/CMakeLists.txt
+++ b/nvmolkit/CMakeLists.txt
@@ -55,6 +55,16 @@ target_include_directories(_mmffOptimization SYSTEM
                            PUBLIC ${Boost_INCLUDE_DIRS})
 installpythontarget(_mmffOptimization ./)
 
+add_library(_uffOptimization MODULE uffOptimization.cpp)
+target_link_libraries(_uffOptimization PUBLIC ${Boost_LIBRARIES}
+                                              ${PYTHON_LIBRARIES})
+target_link_libraries(_uffOptimization PRIVATE ${RDKit_LIBS} bfgs_uff ff_utils)
+target_include_directories(
+  _uffOptimization PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/utils
+                          ${Python_INCLUDE_DIRS})
+target_include_directories(_uffOptimization SYSTEM PUBLIC ${Boost_INCLUDE_DIRS})
+installpythontarget(_uffOptimization ./)
+
 add_library(_batchedForcefield MODULE batchedForcefield.cpp)
 target_link_libraries(_batchedForcefield PUBLIC ${Boost_LIBRARIES}
                                                 ${PYTHON_LIBRARIES})
diff --git a/nvmolkit/__init__.py b/nvmolkit/__init__.py
index 760cc84..96f694c 100644
--- a/nvmolkit/__init__.py
+++ b/nvmolkit/__init__.py
@@ -25,6 +25,7 @@
 - Bulk tanimoto/cosine similarity calculations between fingerprints
 - ETKDG conformer generation for multiple molecules
 - MMFF optimization for multiple molecules and conformers
+- UFF optimization for multiple molecules and conformers
 """
 
 VERSION = "0.4.0"
diff --git a/nvmolkit/_uffOptimization.pyi b/nvmolkit/_uffOptimization.pyi
new file mode 100644
index 0000000..6e660fa
--- /dev/null
+++ b/nvmolkit/_uffOptimization.pyi
@@ -0,0 +1,10 @@
+from typing import Any, List
+from rdkit.Chem import Mol
+
+def UFFOptimizeMoleculesConfs(
+    molecules: List[Mol],
+    maxIters: int = 1000,
+    vdwThresholds: Any = ...,
+    ignoreInterfragInteractions: Any = ...,
+    hardwareOptions: Any = ...,
+) -> List[List[float]]: ...
diff --git a/nvmolkit/tests/test_uff_optimization.py b/nvmolkit/tests/test_uff_optimization.py
new file mode 100644
index 0000000..6fabdb7
--- /dev/null
+++ b/nvmolkit/tests/test_uff_optimization.py
@@ -0,0 +1,198 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+from rdkit import Chem
+from rdkit.Chem import rdDistGeom, rdForceFieldHelpers
+from rdkit.ForceField import rdForceField as _rdForceField  # noqa: F401
+from rdkit.Geometry import Point3D
+
+import nvmolkit.uffOptimization as nvmolkit_uff
+from nvmolkit.types import HardwareOptions
+
+
+@pytest.fixture
+def uff_test_mols(num_mols=5):
+    """Load a handful of UFF-valid molecules from the shared validation set."""
+    sdf_path = os.path.join(
+        os.path.dirname(__file__),
+        "..",
+        "..",
+        "tests",
+        "test_data",
+        "MMFF94_dative.sdf",
+    )
+
+    if not os.path.exists(sdf_path):
+        pytest.skip(f"Test data file not found: {sdf_path}")
+
+    supplier = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=True)
+    molecules = []
+    for mol in supplier:
+        if mol is None:
+            continue
+        if not rdForceFieldHelpers.UFFHasAllMoleculeParams(mol):
+            continue
+        molecules.append(mol)
+        if len(molecules) >= num_mols:
+            break
+
+    if len(molecules) < num_mols:
+        pytest.skip(f"Expected {num_mols} UFF-valid molecules, found {len(molecules)}")
+
+    return molecules
+
+
+def create_hard_copy_mols(molecules):
+    return [Chem.Mol(mol) for mol in molecules]
+
+
+def make_fragmented_mol():
+    mol = Chem.AddHs(Chem.MolFromSmiles("CC.CC"))
+    params = rdDistGeom.ETKDGv3()
+    params.useRandomCoords = True
+    params.randomSeed = 42
+    rdDistGeom.EmbedMultipleConfs(mol, numConfs=1, params=params)
+    conf = mol.GetConformer()
+    fragments = Chem.GetMolFrags(mol)
+    if len(fragments) != 2:
+        raise AssertionError("Expected two fragments for interfragment interaction test")
+    anchor = conf.GetAtomPosition(fragments[0][0])
+    moved = conf.GetAtomPosition(fragments[1][0])
+    shift = Point3D(anchor.x - moved.x + 2.0, anchor.y - moved.y, anchor.z - moved.z)
+    for atom_idx in fragments[1]:
+        pos = conf.GetAtomPosition(atom_idx)
+        conf.SetAtomPosition(atom_idx, Point3D(pos.x + shift.x, pos.y + shift.y, pos.z + shift.z))
+    return mol
+
+
+def calculate_rdkit_uff_energies(
+    molecules,
+    maxIters=1000,
+    vdwThreshold: float = 10.0,
+    ignoreInterfragInteractions: bool = True,
+):
+    all_energies = []
+    for mol in molecules:
+        mol_energies = []
+        for conf_id in range(mol.GetNumConformers()):
+            ff = rdForceFieldHelpers.UFFGetMoleculeForceField(
+                mol,
+                vdwThresh=vdwThreshold,
+                confId=conf_id,
+                ignoreInterfragInteractions=ignoreInterfragInteractions,
+            )
+            ff.Initialize()
+            ff.Minimize(maxIts=maxIters)
+            mol_energies.append(ff.CalcEnergy())
+        all_energies.append(mol_energies)
+    return all_energies
+
+
+def test_uff_optimization_serial_vs_rdkit(uff_test_mols):
+    rdkit_mols = create_hard_copy_mols(uff_test_mols)
+    nvmolkit_mols = create_hard_copy_mols(uff_test_mols)
+
+    rdkit_energies = calculate_rdkit_uff_energies(rdkit_mols, maxIters=200)
+
+    nvmolkit_energies = []
+    for mol in nvmolkit_mols:
+        mol_energies = nvmolkit_uff.UFFOptimizeMoleculesConfs([mol], maxIters=200)
+        nvmolkit_energies.extend(mol_energies)
+
+    assert len(rdkit_energies) == len(nvmolkit_energies)
+    for mol_idx, (rdkit_mol_energies, nvmolkit_mol_energies) in enumerate(zip(rdkit_energies, nvmolkit_energies)):
+        assert len(rdkit_mol_energies) == len(nvmolkit_mol_energies)
+        for conf_idx, (rdkit_energy, nvmolkit_energy) in enumerate(zip(rdkit_mol_energies, nvmolkit_mol_energies)):
+            diff = abs(rdkit_energy - nvmolkit_energy)
+            rel = diff / abs(rdkit_energy) if abs(rdkit_energy) > 1e-10 else diff
+            assert rel < 1e-3, (
+                f"Molecule {mol_idx}, conformer {conf_idx}: "
+                f"RDKit={rdkit_energy:.6f} nvMolKit={nvmolkit_energy:.6f} rel={rel:.6f}"
+            )
+
+
+def test_uff_optimization_batch_vs_rdkit(uff_test_mols):
+    rdkit_mols = create_hard_copy_mols(uff_test_mols)
+    nvmolkit_mols = create_hard_copy_mols(uff_test_mols)
+
+    rdkit_energies = calculate_rdkit_uff_energies(rdkit_mols, maxIters=200)
+    hardware_options = HardwareOptions(batchSize=2, batchesPerGpu=1)
+    nvmolkit_energies = nvmolkit_uff.UFFOptimizeMoleculesConfs(
+        nvmolkit_mols,
+        maxIters=200,
+        hardwareOptions=hardware_options,
+    )
+
+    assert len(rdkit_energies) == len(nvmolkit_energies)
+    for mol_idx, (rdkit_mol_energies, nvmolkit_mol_energies) in enumerate(zip(rdkit_energies, nvmolkit_energies)):
+        assert len(rdkit_mol_energies) == len(nvmolkit_mol_energies)
+        for conf_idx, (rdkit_energy, nvmolkit_energy) in enumerate(zip(rdkit_mol_energies, nvmolkit_mol_energies)):
+            diff = abs(rdkit_energy - nvmolkit_energy)
+            rel = diff / abs(rdkit_energy) if abs(rdkit_energy) > 1e-10 else diff
+            assert rel < 1e-3, (
+                f"Molecule {mol_idx}, conformer {conf_idx}: "
+                f"RDKit={rdkit_energy:.6f} nvMolKit={nvmolkit_energy:.6f} rel={rel:.6f}"
+            )
+
+
+def test_uff_optimization_empty_input():
+    assert nvmolkit_uff.UFFOptimizeMoleculesConfs([]) == []
+
+
+def test_uff_optimization_invalid_input():
+    unsupported = Chem.MolFromSmiles("*")
+    with pytest.raises(ValueError) as exc_info:
+        nvmolkit_uff.UFFOptimizeMoleculesConfs([None, unsupported])
+    assert exc_info.value.args[1]["none"] == [0]
+    assert exc_info.value.args[1]["no_params"] == [1]
+
+
+def test_uff_optimization_threshold_and_interfrag_vs_rdkit():
+    mols = [make_fragmented_mol(), make_fragmented_mol()]
+    rdkit_mols = create_hard_copy_mols(mols)
+    nvmolkit_mols = create_hard_copy_mols(mols)
+
+    thresholds = [25.0, 100.0]
+    ignore_interfrag = [False, True]
+
+    rdkit_energies = [
+        calculate_rdkit_uff_energies(
+            [mol],
+            maxIters=1000,
+            vdwThreshold=threshold,
+            ignoreInterfragInteractions=ignore,
+        )[0]
+        for mol, threshold, ignore in zip(rdkit_mols, thresholds, ignore_interfrag)
+    ]
+    nvmolkit_energies = nvmolkit_uff.UFFOptimizeMoleculesConfs(
+        nvmolkit_mols,
+        maxIters=1000,
+        vdwThreshold=thresholds,
+        ignoreInterfragInteractions=ignore_interfrag,
+    )
+
+    assert len(rdkit_energies) == len(nvmolkit_energies)
+    for mol_idx, (rdkit_mol_energies, nvmolkit_mol_energies) in enumerate(zip(rdkit_energies, nvmolkit_energies)):
+        assert len(rdkit_mol_energies) == len(nvmolkit_mol_energies)
+        for conf_idx, (rdkit_energy, nvmolkit_energy) in enumerate(zip(rdkit_mol_energies, nvmolkit_mol_energies)):
+            diff = abs(rdkit_energy - nvmolkit_energy)
+            rel = diff / abs(rdkit_energy) if abs(rdkit_energy) > 1e-10 else diff
+            assert rel < 1e-3, (
+                f"Molecule {mol_idx}, conformer {conf_idx}: "
+                f"RDKit={rdkit_energy:.6f} nvMolKit={nvmolkit_energy:.6f} rel={rel:.6f}"
+            )
diff --git a/nvmolkit/uffOptimization.cpp b/nvmolkit/uffOptimization.cpp
new file mode 100644
index 0000000..bbb79c8
--- /dev/null
+++ b/nvmolkit/uffOptimization.cpp
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <GraphMol/ROMol.h>
+
+#include <boost/python.hpp>
+#include <stdexcept>
+#include <vector>
+
+#include "bfgs_uff.h"
+
+namespace bp = boost::python;
+
+template <typename T> bp::list vectorToList(const std::vector<T>& vec) {
+  bp::list list;
+  for (const auto& value : vec) {
+    list.append(value);
+  }
+  return list;
+}
+
+template <typename T> bp::list vectorOfVectorsToList(const std::vector<std::vector<T>>& vecOfVecs) {
+  bp::list outerList;
+  for (const auto& innerVec : vecOfVecs) {
+    outerList.append(vectorToList(innerVec));
+  }
+  return outerList;
+}
+
+static std::vector<RDKit::ROMol*> extractMolecules(const bp::list& molecules) {
+  const int                  n = bp::len(molecules);
+  std::vector<RDKit::ROMol*> mols;
+  mols.reserve(n);
+  for (int i = 0; i < n; ++i) {
+    auto* mol = bp::extract<RDKit::ROMol*>(bp::object(molecules[i]))();
+    if (mol == nullptr) {
+      throw std::invalid_argument("Invalid molecule at index " + std::to_string(i));
+    }
+    mols.push_back(mol);
+  }
+  return mols;
+}
+
+static std::vector<double> extractDoubleList(const bp::list& values, const int expectedSize, const std::string& name) {
+  if (bp::len(values) != expectedSize) {
+    throw std::invalid_argument("Expected " + std::to_string(expectedSize) + " values for " + name + ", got " +
+                                std::to_string(bp::len(values)));
+  }
+  std::vector<double> result;
+  result.reserve(expectedSize);
+  for (int i = 0; i < expectedSize; ++i) {
+    result.push_back(bp::extract<double>(values[i]));
+  }
+  return result;
+}
+
+static std::vector<bool> extractBoolList(const bp::list& values, const int expectedSize, const std::string& name) {
+  if (bp::len(values) != expectedSize) {
+    throw std::invalid_argument("Expected " + std::to_string(expectedSize) + " values for " + name + ", got " +
+                                std::to_string(bp::len(values)));
+  }
+  std::vector<bool> result;
+  result.reserve(expectedSize);
+  for (int i = 0; i < expectedSize; ++i) {
+    result.push_back(bp::extract<bool>(values[i]));
+  }
+  return result;
+}
+
+BOOST_PYTHON_MODULE(_uffOptimization) {
+  bp::def(
+    "UFFOptimizeMoleculesConfs",
+    +[](const bp::list&                       molecules,
+        int                                   maxIters,
+        const bp::list&                       vdwThresholds,
+        const bp::list&                       ignoreInterfragInteractions,
+        const nvMolKit::BatchHardwareOptions& hardwareOptions) -> bp::list {
+      auto       molsVec      = extractMolecules(molecules);
+      const int  numMols      = static_cast<int>(molsVec.size());
+      const auto thresholdVec = extractDoubleList(vdwThresholds, numMols, "vdwThreshold");
+      const auto ignoreVec    = extractBoolList(ignoreInterfragInteractions, numMols, "ignoreInterfragInteractions");
+      const auto result =
+        nvMolKit::UFF::UFFOptimizeMoleculesConfsBfgs(molsVec, maxIters, thresholdVec, ignoreVec, hardwareOptions);
+      return vectorOfVectorsToList(result);
+    },
+    (bp::arg("molecules"),
+     bp::arg("maxIters") = 1000,
+     bp::arg("vdwThresholds"),
+     bp::arg("ignoreInterfragInteractions"),
+     bp::arg("hardwareOptions") = nvMolKit::BatchHardwareOptions()),
+    "Optimize conformers for multiple molecules using UFF force field.\n"
+    "\n"
+    "Args:\n"
+    "    molecules: List of RDKit molecules to optimize\n"
+    "    maxIters: Maximum number of optimization iterations (default: 1000)\n"
+    "    vdwThresholds: Per-molecule van der Waals thresholds\n"
+    "    ignoreInterfragInteractions: Per-molecule interfragment interaction flags\n"
+    "    hardwareOptions: BatchHardwareOptions object with hardware settings\n"
+    "\n"
+    "Returns:\n"
+    "    List of lists of energies, where each inner list contains energies for conformers of one molecule");
+}
diff --git a/nvmolkit/uffOptimization.py b/nvmolkit/uffOptimization.py
new file mode 100644
index 0000000..791bf57
--- /dev/null
+++ b/nvmolkit/uffOptimization.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+"""GPU-accelerated UFF optimization for molecular conformers."""
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+
+from rdkit.Chem import rdForceFieldHelpers
+
+if TYPE_CHECKING:
+    from rdkit.Chem import Mol
+
+from nvmolkit import _uffOptimization
+from nvmolkit.types import HardwareOptions
+
+
+def UFFOptimizeMoleculesConfs(
+    molecules: list["Mol"],
+    maxIters: int = 1000,
+    vdwThreshold: float | Sequence[float] = 10.0,
+    ignoreInterfragInteractions: bool | Sequence[bool] = True,
+    hardwareOptions: HardwareOptions | None = None,
+) -> list[list[float]]:
+    """Optimize conformers for multiple molecules using the UFF force field.
+
+    Args:
+        molecules: List of RDKit molecules to optimize. Each molecule should already
+            contain conformers.
+        maxIters: Maximum number of UFF minimization iterations.
+        vdwThreshold: Van der Waals threshold used when constructing the UFF force
+            field. May be provided as a scalar or per-molecule sequence.
+        ignoreInterfragInteractions: If ``True``, omit non-bonded terms between
+            fragments. May be provided as a scalar or per-molecule sequence.
+        hardwareOptions: Configures CPU and GPU batching, threading, and device
+            selection. Defaults are chosen automatically when omitted.
+
+    Returns:
+        List of lists of energies, where each inner list contains optimized
+        conformer energies for the corresponding molecule.
+
+    Raises:
+        ValueError: If molecules contains ``None`` entries or molecules lacking UFF
+            atom types. ``e.args[1]`` contains keys ``"none"`` and ``"no_params"``.
+    """
+    if not molecules:
+        return []
+
+    none_indices = []
+    no_params_indices = []
+    for i, mol in enumerate(molecules):
+        if mol is None:
+            none_indices.append(i)
+        elif not rdForceFieldHelpers.UFFHasAllMoleculeParams(mol):
+            no_params_indices.append(i)
+
+    if none_indices or no_params_indices:
+        parts = []
+        if none_indices:
+            parts.append(f"None at indices {none_indices}")
+        if no_params_indices:
+            parts.append(f"lacking UFF atom types at indices {no_params_indices}")
+        raise ValueError(
+            "; ".join(parts),
+            {"none": none_indices, "no_params": no_params_indices},
+        )
+
+    def _normalize_scalar_or_list(value, name: str):
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
+            if len(value) != len(molecules):
+                raise ValueError(f"Expected {len(molecules)} values for {name}, got {len(value)}")
+            return list(value)
+        return [value for _ in molecules]
+
+    thresholds = [float(v) for v in _normalize_scalar_or_list(vdwThreshold, "vdwThreshold")]
+    interfrag_flags = [
+        bool(v) for v in _normalize_scalar_or_list(ignoreInterfragInteractions, "ignoreInterfragInteractions")
+    ]
+
+    if hardwareOptions is None:
+        hardwareOptions = HardwareOptions()
+    return _uffOptimization.UFFOptimizeMoleculesConfs(
+        molecules,
+        int(maxIters),
+        thresholds,
+        interfrag_flags,
+        hardwareOptions._as_native(),
+    )
diff --git a/src/minimizer/CMakeLists.txt b/src/minimizer/CMakeLists.txt
index d301269..e51e7b8 100644
--- a/src/minimizer/CMakeLists.txt
+++ b/src/minimizer/CMakeLists.txt
@@ -22,17 +22,39 @@ target_link_libraries(
           cub_helpers cccl_interface)
 target_include_directories(bfgs PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
+add_library(bfgs_common bfgs_common.cpp)
+target_link_libraries(
+  bfgs_common
+  PUBLIC host_vector device
+  PRIVATE ${RDKit_LIBS} OpenMP::OpenMP_CXX)
+target_include_directories(bfgs_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
 add_library(bfgs_mmff bfgs_mmff.cpp)
 target_link_libraries(
   bfgs_mmff
   PUBLIC bfgs
   PRIVATE ${RDKit_LIBS}
+          bfgs_common
           mmff_batched_forcefield
           rdkit_mmff_flattened
-          device
           device_vector
           OpenMP::OpenMP_CXX
           openmp_helpers
           cccl_interface)
 target_include_directories(bfgs_mmff PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
                                             ${CMAKE_SOURCE_DIR}/src/forcefields)
+
+add_library(bfgs_uff bfgs_uff.cpp)
+target_link_libraries(
+  bfgs_uff
+  PUBLIC bfgs
+  PRIVATE ${RDKit_LIBS}
+          bfgs_common
+          uff_batched_forcefield
+          rdkit_uff_flattened
+          device_vector
+          OpenMP::OpenMP_CXX
+          openmp_helpers
+          cccl_interface)
+target_include_directories(bfgs_uff PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
+                                           ${CMAKE_SOURCE_DIR}/src/forcefields)
diff --git a/src/minimizer/bfgs_common.cpp b/src/minimizer/bfgs_common.cpp
new file mode 100644
index 0000000..718b6cc
--- /dev/null
+++ b/src/minimizer/bfgs_common.cpp
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "bfgs_common.h"
+
+#include <GraphMol/Conformer.h>
+#include <GraphMol/ROMol.h>
+#include <omp.h>
+
+#include <numeric>
+#include <stdexcept>
+#include <string>
+
+namespace nvMolKit {
+
+void ThreadLocalBuffers::ensureCapacity(const size_t positionsSize, const size_t energiesSize) {
+  constexpr double extraCapacityFactor = 1.3;
+  const auto       newSize             = static_cast<size_t>(static_cast<double>(positionsSize) * extraCapacityFactor);
+  if (positions.size() < positionsSize) {
+    positions.resize(newSize);
+  }
+  if (energies.size() < energiesSize) {
+    energies.resize(static_cast<size_t>(static_cast<double>(energiesSize) * extraCapacityFactor));
+  }
+  if (initialPositions.size() < positionsSize) {
+    initialPositions.resize(newSize);
+  }
+}
+
+BatchExecutionContext setupBatchExecution(const BatchHardwareOptions& perfOptions) {
+  BatchExecutionContext ctx;
+  ctx.batchSize = perfOptions.batchSize == -1 ? 500 : static_cast<size_t>(perfOptions.batchSize);
+
+  std::vector<int> gpuIds = perfOptions.gpuIds;
+  if (gpuIds.empty()) {
+    const int numDevices = countCudaDevices();
+    if (numDevices == 0) {
+      throw std::runtime_error("No CUDA devices found");
+    }
+    gpuIds.resize(numDevices);
+    std::iota(gpuIds.begin(), gpuIds.end(), 0);
+  }
+  const int batchesPerGpu = perfOptions.batchesPerGpu == -1 ? 4 : perfOptions.batchesPerGpu;
+  ctx.numThreads =
+    perfOptions.batchesPerGpu > 0 ? batchesPerGpu * static_cast<int>(gpuIds.size()) : omp_get_max_threads();
+
+  ctx.streamPool.reserve(ctx.numThreads);
+  ctx.devicesPerThread.resize(ctx.numThreads);
+  for (int i = 0; i < ctx.numThreads; ++i) {
+    const int        gpuId = gpuIds[i % gpuIds.size()];
+    const WithDevice dev(gpuId);
+    ctx.streamPool.emplace_back();
+    ctx.devicesPerThread[i] = gpuId;
+  }
+
+  return ctx;
+}
+
+std::vector<ConformerInfo> flattenConformers(const std::vector<RDKit::ROMol*>& mols,
+                                             std::vector<std::vector<double>>& moleculeEnergies) {
+  moleculeEnergies.resize(mols.size());
+  std::vector<ConformerInfo> allConformers;
+  for (size_t molIdx = 0; molIdx < mols.size(); ++molIdx) {
+    auto* mol = mols[molIdx];
+    if (mol == nullptr) {
+      throw std::invalid_argument("Invalid molecule pointer at index " + std::to_string(molIdx));
+    }
+    moleculeEnergies[molIdx].resize(mol->getNumConformers());
+    size_t confIdx = 0;
+    for (auto confIter = mol->beginConformers(); confIter != mol->endConformers(); ++confIter, ++confIdx) {
+      allConformers.push_back({mol, molIdx, &(**confIter), static_cast<int>((*confIter)->getId()), confIdx});
+    }
+  }
+  return allConformers;
+}
+
+void writeBackResults(const std::vector<ConformerInfo>& batchConformers,
+                      const std::vector<uint32_t>&      conformerAtomStarts,
+                      const ThreadLocalBuffers&         buffers,
+                      std::vector<std::vector<double>>& moleculeEnergies) {
+  for (size_t i = 0; i < batchConformers.size(); ++i) {
+    const auto&    confInfo     = batchConformers[i];
+    const uint32_t numAtoms     = confInfo.mol->getNumAtoms();
+    const uint32_t atomStartIdx = conformerAtomStarts[i];
+    for (uint32_t j = 0; j < numAtoms; ++j) {
+      confInfo.conformer->setAtomPos(j,
+                                     RDGeom::Point3D(buffers.positions[3 * (atomStartIdx + j) + 0],
+                                                     buffers.positions[3 * (atomStartIdx + j) + 1],
+                                                     buffers.positions[3 * (atomStartIdx + j) + 2]));
+    }
+    moleculeEnergies[confInfo.molIdx][confInfo.confIdx] = buffers.energies[i];
+  }
+}
+
+}  // namespace nvMolKit
diff --git a/src/minimizer/bfgs_common.h b/src/minimizer/bfgs_common.h
new file mode 100644
index 0000000..61b15bd
--- /dev/null
+++ b/src/minimizer/bfgs_common.h
@@ -0,0 +1,73 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_BFGS_COMMON_H
+#define NVMOLKIT_BFGS_COMMON_H
+
+#include <cstdint>
+#include <vector>
+
+#include "../hardware_options.h"
+#include "device.h"
+#include "host_vector.h"
+
+namespace RDKit {
+class ROMol;
+class Conformer;
+}  // namespace RDKit
+
+namespace nvMolKit {
+
+//! Thread-local pinned memory buffers for async host-device transfers during BFGS minimization.
+struct ThreadLocalBuffers {
+  PinnedHostVector<double> positions;
+  PinnedHostVector<double> energies;
+  PinnedHostVector<double> initialPositions;
+
+  void ensureCapacity(size_t positionsSize, size_t energiesSize);
+};
+
+struct ConformerInfo {
+  RDKit::ROMol*     mol;
+  size_t            molIdx;
+  RDKit::Conformer* conformer;
+  int               conformerId;
+  size_t            confIdx;
+};
+
+struct BatchExecutionContext {
+  size_t                    batchSize;
+  int                       numThreads;
+  std::vector<ScopedStream> streamPool;
+  std::vector<int>          devicesPerThread;
+};
+
+/// Set up GPU streams, thread counts, and batch sizing from hardware options.
+BatchExecutionContext setupBatchExecution(const BatchHardwareOptions& perfOptions);
+
+/// Flatten all conformers from all molecules into a single list, validating molecule pointers
+/// and initializing the per-molecule energy output vectors.
+std::vector<ConformerInfo> flattenConformers(const std::vector<RDKit::ROMol*>& mols,
+                                             std::vector<std::vector<double>>& moleculeEnergies);
+
+/// Write optimized positions and energies back from host buffers into the RDKit conformers.
+void writeBackResults(const std::vector<ConformerInfo>& batchConformers,
+                      const std::vector<uint32_t>&      conformerAtomStarts,
+                      const ThreadLocalBuffers&         buffers,
+                      std::vector<std::vector<double>>& moleculeEnergies);
+
+}  // namespace nvMolKit
+
+#endif  // NVMOLKIT_BFGS_COMMON_H
diff --git a/src/minimizer/bfgs_mmff.cpp b/src/minimizer/bfgs_mmff.cpp
index 7e167f7..bf5da84 100644
--- a/src/minimizer/bfgs_mmff.cpp
+++ b/src/minimizer/bfgs_mmff.cpp
@@ -20,10 +20,9 @@
 
 #include <unordered_map>
 
+#include "bfgs_common.h"
 #include "bfgs_minimize.h"
-#include "device.h"
 #include "ff_utils.h"
-#include "host_vector.h"
 #include "mmff_batched_forcefield.h"
 #include "mmff_flattened_builder.h"
 #include "nvtx.h"
@@ -36,27 +35,6 @@ struct CachedMoleculeData {
   EnergyForceContribsHost ffParams;
 };
 
-//! Thread-local pinned memory buffers for async transfers
-struct ThreadLocalBuffers {
-  PinnedHostVector<double> positions;
-  PinnedHostVector<double> energies;
-  PinnedHostVector<double> initialPositions;
-
-  void ensureCapacity(const size_t positionsSize, const size_t energiesSize) {
-    constexpr double extraCapacityFactor = 1.3;
-    const auto       newSize = static_cast<size_t>(static_cast<double>(positionsSize) * extraCapacityFactor);
-    if (positions.size() < positionsSize) {
-      positions.resize(newSize);
-    }
-    if (energies.size() < energiesSize) {
-      energies.resize(newSize);
-    }
-    if (initialPositions.size() < positionsSize) {
-      initialPositions.resize(newSize);
-    }
-  }
-};
-
 std::vector<std::vector<double>> MMFFOptimizeMoleculesConfsBfgs(std::vector<RDKit::ROMol*>& mols,
                                                                 const int                   maxIters,
                                                                 const MMFFProperties&       properties,
@@ -77,103 +55,47 @@ std::vector<std::vector<double>> MMFFOptimizeMoleculesConfsBfgs(std::vector<RDKi
   ScopedNvtxRange fullMinimizeRange("BFGS MMFF Optimize Molecules Confs");
   ScopedNvtxRange setupRange("BFGS MMFF Optimize Molecules Confs");
 
-  // Extract values from performance options
-  const size_t batchSize = perfOptions.batchSize == -1 ? 500 : perfOptions.batchSize;
-
   if (properties.size() != mols.size()) {
     throw std::invalid_argument("Expected one MMFFProperties entry per molecule");
   }
 
-  for (size_t i = 0; i < mols.size(); ++i) {
-    const auto* mol = mols[i];
-    if (mol == nullptr) {
-      throw std::invalid_argument("Invalid molecule pointer at index " + std::to_string(i));
-    }
-  }
-
-  std::vector<int> gpuIds = perfOptions.gpuIds;
-  if (gpuIds.empty()) {
-    const int numDevices = countCudaDevices();
-    if (numDevices == 0) {
-      throw std::runtime_error("No CUDA devices found for MMFF relaxation");
-    }
-    gpuIds.resize(numDevices);
-    std::iota(gpuIds.begin(), gpuIds.end(), 0);  // Fill with device IDs 0, 1, ..., numDevices-1
-  }
-  const int batchesPerGpu = perfOptions.batchesPerGpu == -1 ? 4 : perfOptions.batchesPerGpu;
-  const int numThreads =
-    perfOptions.batchesPerGpu > 0 ? batchesPerGpu * static_cast<int>(gpuIds.size()) : omp_get_max_threads();
+  auto                             ctx = setupBatchExecution(perfOptions);
+  std::vector<std::vector<double>> moleculeEnergies;
+  const auto                       allConformers = flattenConformers(mols, moleculeEnergies);
 
-  // Initialize result structure
-  std::vector<std::vector<double>> moleculeEnergies(mols.size());
-
-  // Flatten all conformers from all molecules for better load balancing
-  struct ConformerInfo {
-    RDKit::ROMol*     mol;
-    size_t            molIdx;
-    RDKit::Conformer* conformer;
-    size_t            confIdx;
-  };
-
-  std::vector<ConformerInfo> allConformers;
-  for (size_t molIdx = 0; molIdx < mols.size(); ++molIdx) {
-    auto* mol = mols[molIdx];
-    moleculeEnergies[molIdx].resize(mol->getNumConformers());
-
-    size_t confIdx = 0;
-    for (auto confIter = mol->beginConformers(); confIter != mol->endConformers(); ++confIter, ++confIdx) {
-      allConformers.push_back({mol, molIdx, &(**confIter), confIdx});
-    }
-  }
-
-  // Calculate batch parameters for conformers
   const size_t totalConformers    = allConformers.size();
-  const size_t effectiveBatchSize = (batchSize == 0) ? totalConformers : batchSize;
+  const size_t effectiveBatchSize = (ctx.batchSize == 0) ? totalConformers : ctx.batchSize;
 
   if (totalConformers == 0) {
-    return moleculeEnergies;  // Early return for empty input
+    return moleculeEnergies;
   }
 
-  // Create stream pool for better performance and profiling
-  std::vector<nvMolKit::ScopedStream> streamPool;
-  streamPool.reserve(numThreads);
-  std::vector<int> devicesPerThread(numThreads);
-  for (int i = 0; i < numThreads; ++i) {
-    const int        gpuId = gpuIds[i % gpuIds.size()];
-    const WithDevice dev(gpuId);
-    streamPool.emplace_back();
-    devicesPerThread[i] = gpuId;  // Round-robin assignment of devices
-  }
-
-  // Create thread-local pinned memory buffers for async transfers
-  std::vector<ThreadLocalBuffers> threadBuffers(numThreads);
+  std::vector<ThreadLocalBuffers> threadBuffers(ctx.numThreads);
   detail::OpenMPExceptionRegistry exceptionHandler;
   setupRange.pop();
-#pragma omp parallel for num_threads(numThreads) schedule(dynamic) default(none) shared(allConformers,        \
-                                                                                          moleculeEnergies,   \
-                                                                                          totalConformers,    \
-                                                                                          effectiveBatchSize, \
-                                                                                          maxIters,           \
-                                                                                          properties,         \
-                                                                                          streamPool,         \
-                                                                                          devicesPerThread,   \
-                                                                                          threadBuffers,      \
-                                                                                          backend,            \
-                                                                                          exceptionHandler)
+#pragma omp parallel for num_threads(ctx.numThreads) schedule(dynamic) default(none) shared(allConformers,        \
+                                                                                              moleculeEnergies,   \
+                                                                                              totalConformers,    \
+                                                                                              effectiveBatchSize, \
+                                                                                              maxIters,           \
+                                                                                              properties,         \
+                                                                                              ctx,                \
+                                                                                              threadBuffers,      \
+                                                                                              backend,            \
+                                                                                              exceptionHandler)
   for (size_t batchStart = 0; batchStart < totalConformers; batchStart += effectiveBatchSize) {
     try {
       std::unordered_map<RDKit::ROMol*, CachedMoleculeData> moleculeCache;
       ScopedNvtxRange                                       singleBatchRange("OpenMP loop thread");
       ScopedNvtxRange                                       setupBatchRange("OpenMP loop preprocessing");
       const int                                             threadId = omp_get_thread_num();
-      const WithDevice                                      dev(devicesPerThread[threadId]);
+      const WithDevice                                      dev(ctx.devicesPerThread[threadId]);
       const size_t batchEnd = std::min(batchStart + effectiveBatchSize, totalConformers);
 
-      // Create batch subset of conformers
-      std::vector<ConformerInfo> batchConformers(allConformers.begin() + batchStart, allConformers.begin() + batchEnd);
+      std::vector<nvMolKit::ConformerInfo> batchConformers(allConformers.begin() + batchStart,
+                                                           allConformers.begin() + batchEnd);
 
-      // Get thread-local stream from pool
-      cudaStream_t streamPtr = streamPool[threadId].stream();
+      cudaStream_t streamPtr = ctx.streamPool[threadId].stream();
 
       // Process this batch
       BatchedMolecularSystemHost    systemHost;
@@ -258,23 +180,7 @@ std::vector<std::vector<double>> MMFFOptimizeMoleculesConfsBfgs(std::vector<RDKi
         cudaStreamSynchronize(streamPtr);
       }
 
-      // Update conformer positions and store energies
-      for (size_t i = 0; i < batchConformers.size(); ++i) {
-        const auto&    confInfo     = batchConformers[i];
-        const uint32_t numAtoms     = confInfo.mol->getNumAtoms();
-        const uint32_t atomStartIdx = conformerAtomStarts[i];
-
-        // Update conformer positions
-        for (uint32_t j = 0; j < numAtoms; ++j) {
-          confInfo.conformer->setAtomPos(j,
-                                         RDGeom::Point3D(buffers.positions[3 * (atomStartIdx + j) + 0],
-                                                         buffers.positions[3 * (atomStartIdx + j) + 1],
-                                                         buffers.positions[3 * (atomStartIdx + j) + 2]));
-        }
-
-        // Store energy result - thread-safe since each thread writes to different indices
-        moleculeEnergies[confInfo.molIdx][confInfo.confIdx] = buffers.energies[i];
-      }
+      writeBackResults(batchConformers, conformerAtomStarts, buffers, moleculeEnergies);
     } catch (...) {
       exceptionHandler.store(std::current_exception());
     }
diff --git a/src/minimizer/bfgs_uff.cpp b/src/minimizer/bfgs_uff.cpp
new file mode 100644
index 0000000..6982e6b
--- /dev/null
+++ b/src/minimizer/bfgs_uff.cpp
@@ -0,0 +1,146 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "bfgs_uff.h"
+
+#include <GraphMol/ROMol.h>
+#include <omp.h>
+
+#include <vector>
+
+#include "bfgs_common.h"
+#include "bfgs_minimize.h"
+#include "ff_utils.h"
+#include "nvtx.h"
+#include "openmp_helpers.h"
+#include "uff_batched_forcefield.h"
+#include "uff_flattened_builder.h"
+
+namespace nvMolKit::UFF {
+
+std::vector<std::vector<double>> UFFOptimizeMoleculesConfsBfgs(std::vector<RDKit::ROMol*>& mols,
+                                                               const int                   maxIters,
+                                                               const std::vector<double>&  vdwThresholds,
+                                                               const std::vector<bool>&    ignoreInterfragInteractions,
+                                                               const BatchHardwareOptions& perfOptions) {
+  ScopedNvtxRange fullMinimizeRange("BFGS UFF Optimize Molecules Confs");
+  ScopedNvtxRange setupRange("BFGS UFF Optimize Molecules Confs");
+
+  if (vdwThresholds.size() != mols.size()) {
+    throw std::invalid_argument("Expected one vdw threshold per molecule");
+  }
+  if (ignoreInterfragInteractions.size() != mols.size()) {
+    throw std::invalid_argument("Expected one interfragment interaction flag per molecule");
+  }
+
+  auto                             ctx = setupBatchExecution(perfOptions);
+  std::vector<std::vector<double>> moleculeEnergies;
+  const auto                       allConformers = flattenConformers(mols, moleculeEnergies);
+
+  const size_t totalConformers    = allConformers.size();
+  const size_t effectiveBatchSize = ctx.batchSize == 0 ? totalConformers : ctx.batchSize;
+  if (totalConformers == 0) {
+    return moleculeEnergies;
+  }
+
+  std::vector<ThreadLocalBuffers> threadBuffers(ctx.numThreads);
+  detail::OpenMPExceptionRegistry exceptionHandler;
+  setupRange.pop();
+#pragma omp parallel for num_threads(ctx.numThreads) schedule(dynamic) default(none) \
+  shared(allConformers,                                                              \
+           moleculeEnergies,                                                         \
+           totalConformers,                                                          \
+           effectiveBatchSize,                                                       \
+           maxIters,                                                                 \
+           vdwThresholds,                                                            \
+           ignoreInterfragInteractions,                                              \
+           ctx,                                                                      \
+           threadBuffers,                                                            \
+           exceptionHandler)
+  for (size_t batchStart = 0; batchStart < totalConformers; batchStart += effectiveBatchSize) {
+    try {
+      ScopedNvtxRange singleBatchRange("OpenMP loop thread");
+      ScopedNvtxRange setupBatchRange("OpenMP loop preprocessing");
+
+      const int                            threadId = omp_get_thread_num();
+      const WithDevice                     dev(ctx.devicesPerThread[threadId]);
+      const size_t                         batchEnd = std::min(batchStart + effectiveBatchSize, totalConformers);
+      std::vector<nvMolKit::ConformerInfo> batchConformers(allConformers.begin() + batchStart,
+                                                           allConformers.begin() + batchEnd);
+
+      cudaStream_t streamPtr = ctx.streamPool[threadId].stream();
+
+      BatchedMolecularSystemHost systemHost;
+      BatchedForcefieldMetadata  metadata;
+      std::vector<uint32_t>      conformerAtomStarts;
+      uint32_t                   currentAtomOffset = 0;
+      std::vector<double>        pos;
+
+      for (const auto& confInfo : batchConformers) {
+        const uint32_t numAtoms = confInfo.mol->getNumAtoms();
+        conformerAtomStarts.push_back(currentAtomOffset);
+        currentAtomOffset += numAtoms;
+
+        nvMolKit::confPosToVect(*confInfo.conformer, pos);
+        auto ffParams = constructForcefieldContribs(*confInfo.mol,
+                                                    vdwThresholds[confInfo.molIdx],
+                                                    confInfo.conformerId,
+                                                    ignoreInterfragInteractions[confInfo.molIdx]);
+        addMoleculeToBatch(ffParams, pos, systemHost, metadata, confInfo.molIdx, static_cast<int>(confInfo.confIdx));
+      }
+
+      auto& buffers = threadBuffers[threadId];
+      buffers.ensureCapacity(systemHost.positions.size(), batchConformers.size());
+      std::copy(systemHost.positions.begin(), systemHost.positions.end(), buffers.initialPositions.begin());
+
+      UFFBatchedForcefield      forcefield(systemHost, metadata, streamPtr);
+      AsyncDeviceVector<double> positionsDevice;
+      AsyncDeviceVector<double> gradDevice;
+      AsyncDeviceVector<double> energyOutsDevice;
+      positionsDevice.setStream(streamPtr);
+      gradDevice.setStream(streamPtr);
+      energyOutsDevice.setStream(streamPtr);
+      positionsDevice.resize(systemHost.positions.size());
+      positionsDevice.copyFromHost(buffers.initialPositions.data(), systemHost.positions.size());
+      gradDevice.resize(systemHost.positions.size());
+      gradDevice.zero();
+      energyOutsDevice.resize(batchConformers.size());
+      energyOutsDevice.zero();
+
+      nvMolKit::BfgsBatchMinimizer bfgsMinimizer(
+        /*dataDim=*/3,
+        nvMolKit::DebugLevel::NONE,
+        true,
+        streamPtr,
+        nvMolKit::BfgsBackend::BATCHED);
+      constexpr double gradTol = 1e-4;
+      setupBatchRange.pop();
+      bfgsMinimizer.minimize(maxIters, gradTol, forcefield, positionsDevice, gradDevice, energyOutsDevice);
+
+      ScopedNvtxRange finalizeBatchRange("OpenMP loop finalizing batch");
+      positionsDevice.copyToHost(buffers.positions.data(), positionsDevice.size());
+      energyOutsDevice.copyToHost(buffers.energies.data(), energyOutsDevice.size());
+      cudaStreamSynchronize(streamPtr);
+
+      writeBackResults(batchConformers, conformerAtomStarts, buffers, moleculeEnergies);
+    } catch (...) {
+      exceptionHandler.store(std::current_exception());
+    }
+  }
+  exceptionHandler.rethrow();
+  return moleculeEnergies;
+}
+
+}  // namespace nvMolKit::UFF
diff --git a/src/minimizer/bfgs_uff.h b/src/minimizer/bfgs_uff.h
new file mode 100644
index 0000000..1511e1f
--- /dev/null
+++ b/src/minimizer/bfgs_uff.h
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMOLKIT_BFGS_UFF_H
+#define NVMOLKIT_BFGS_UFF_H
+
+#include <vector>
+
+#include "../hardware_options.h"
+
+namespace RDKit {
+class ROMol;
+}
+
+namespace nvMolKit::UFF {
+
+std::vector<std::vector<double>> UFFOptimizeMoleculesConfsBfgs(std::vector<RDKit::ROMol*>& mols,
+                                                               int                         maxIters,
+                                                               const std::vector<double>&  vdwThresholds,
+                                                               const std::vector<bool>&    ignoreInterfragInteractions,
+                                                               const BatchHardwareOptions& perfOptions = {});
+
+}  // namespace nvMolKit::UFF
+
+#endif  // NVMOLKIT_BFGS_UFF_H