Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions dpdata/plugins/fennol.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot Please also support `to_multi_systems`, so that different systems can be written into the same file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to_multi_systems support that writes different systems into the same FeNNol file. MultiSystems can now be exported using multi_systems.to("fennol", "combined_data.pkl") which combines all systems into a single pickle file with the proper FeNNol structure. Includes comprehensive tests and maintains backward compatibility. (c2e7011)

Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from __future__ import annotations

import pickle

from dpdata.format import Format
from dpdata.unit import EnergyConversion


@Format.register("fennol")
class FeNNolFormat(Format):
    """The FeNNol format plugin for dpdata.

    FeNNol (https://github.com/thomasple/FeNNol/) uses a pickle format
    for training machine learning models. This plugin supports exporting
    dpdata LabeledSystem to FeNNol format.

    The output file is a pickled dictionary with three keys:

    - 'training': list of structures used for training
    - 'validation': list of structures used for validation
    - 'description': short text summary of the frame counts

    Each structure is a dictionary with:

    - 'species': atomic species/elements
    - 'coordinates': atomic positions in Angstroms
    - 'formation_energy': energy in kcal/mol
    - 'shifted_energy': energy in kcal/mol (same as formation_energy in this implementation)
    - 'forces': atomic forces in kcal/mol/Angstrom

    Examples
    --------
    Export a LabeledSystem to FeNNol format:

    >>> import dpdata
    >>> ls = dpdata.LabeledSystem("OUTCAR", fmt="vasp/outcar")
    >>> ls.to("fennol", "data.pkl")
    """

    def to_labeled_system(self, data, file_name, train_size=0.8, **kwargs):
        """Convert dpdata LabeledSystem to FeNNol format.

        The first ``int(nframes * train_size)`` frames are written to the
        'training' list and the remaining frames to the 'validation' list;
        frames are not shuffled.

        Parameters
        ----------
        data : dict
            LabeledSystem data
        file_name : str
            Output pickle file name
        train_size : float, optional
            Fraction of data to use for training, between 0 and 1
            inclusive (default: 0.8)
        **kwargs : dict
            Other parameters

        Raises
        ------
        ValueError
            If ``train_size`` is not within [0, 1].
        """
        # A negative fraction would silently slice from the end of the frame
        # list and a fraction above 1 would report a negative validation
        # count, so reject both before any work is done.
        if not 0.0 <= train_size <= 1.0:
            raise ValueError(
                f"train_size must be between 0 and 1, got {train_size!r}"
            )

        # Lengths stay in Angstrom, so the same eV -> kcal/mol factor converts
        # both energies and forces (eV/Angstrom -> kcal/mol/Angstrom).
        ev_to_kcal_mol = EnergyConversion("eV", "kcal_mol").value()

        # Extract data
        atom_names = data["atom_names"]
        atom_types = data["atom_types"]
        coords = data["coords"]  # indexed as (frame, atom, xyz)
        energies = data["energies"]  # indexed per frame
        forces = data["forces"]  # indexed as (frame, atom, xyz)

        nframes = coords.shape[0]
        natoms = coords.shape[1]

        # Map every atom's type index to its element name.
        species = [atom_names[atom_types[i]] for i in range(natoms)]

        structures = []
        for i in range(nframes):
            energy = energies[i] * ev_to_kcal_mol
            structures.append(
                {
                    # Each structure gets its own list so that editing one
                    # entry after loading cannot affect the others.
                    "species": list(species),
                    "coordinates": coords[i].copy(),  # Already in Angstroms
                    "formation_energy": energy,
                    "shifted_energy": energy,  # Same as formation_energy
                    "forces": forces[i] * ev_to_kcal_mol,
                }
            )

        # Split into training and validation sets (fractional frame counts
        # are truncated, so a single frame with train_size < 1 is validation).
        n_train = int(nframes * train_size)
        training_data = structures[:n_train]
        validation_data = structures[n_train:]

        # Create FeNNol format dictionary
        fennol_data = {
            "training": training_data,
            "validation": validation_data,
            "description": f"Generated from dpdata with {nframes} frames, {n_train} training, {nframes - n_train} validation",
        }

        # Save to pickle file
        with open(file_name, "wb") as f:
            pickle.dump(fennol_data, f)
193 changes: 193 additions & 0 deletions tests/test_fennol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
from __future__ import annotations

import os
import pickle
import tempfile
import unittest

import numpy as np
from context import dpdata


class TestFeNNolFormat(unittest.TestCase):
    """Checks for exporting a dpdata LabeledSystem through the "fennol" format."""

    def setUp(self):
        """Set up test fixtures with a simple water molecule system."""
        # Two frames of a three-atom system ordered H, O, H.
        frame_one_coords = [[0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]]
        frame_two_coords = [[0.1, 0.0, 0.0], [0.0, 0.1, 1.0], [0.0, 1.1, 0.0]]
        frame_one_cell = [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]]
        frame_two_cell = [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]]
        frame_one_forces = [[0.1, 0.0, 0.0], [0.0, 0.1, 0.0], [-0.1, -0.1, 0.0]]
        frame_two_forces = [[0.2, 0.0, 0.0], [0.0, 0.2, 0.0], [-0.2, -0.2, 0.0]]

        self.test_data = {
            "atom_names": ["H", "O"],
            "atom_numbs": [2, 1],
            "atom_types": np.array([0, 1, 0]),
            "coords": np.array([frame_one_coords, frame_two_coords]),
            "cells": np.array([frame_one_cell, frame_two_cell]),
            "energies": np.array([-1.0, -1.1]),  # eV
            "forces": np.array([frame_one_forces, frame_two_forces]),  # eV/Angstrom
            "orig": np.array([0.0, 0.0, 0.0]),
            "nopbc": False,
        }

        self.system = dpdata.LabeledSystem(data=self.test_data)

    def _new_pickle_path(self):
        """Create an empty temporary ``.pkl`` file, removed after the test."""
        with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as handle:
            path = handle.name

        def _remove():
            if os.path.exists(path):
                os.unlink(path)

        # Cleanup runs whether the test passes or fails.
        self.addCleanup(_remove)
        return path

    @staticmethod
    def _read_pickle(path):
        """Load and return the object pickled at *path*."""
        with open(path, "rb") as stream:
            return pickle.load(stream)

    def test_fennol_export(self):
        """Test basic FeNNol format export functionality."""
        path = self._new_pickle_path()

        self.system.to("fennol", path)
        self.assertTrue(os.path.exists(path))

        payload = self._read_pickle(path)

        # Top-level layout of the exported dictionary.
        for top_level_key in ("training", "validation", "description"):
            self.assertIn(top_level_key, payload)

        # Default train_size=0.8 on 2 frames: int(1.6) = 1 training frame.
        train_part = payload["training"]
        valid_part = payload["validation"]
        self.assertEqual(len(train_part), 1)
        self.assertEqual(len(valid_part), 1)

        first = train_part[0]
        self.assertEqual(
            set(first.keys()),
            {
                "species",
                "coordinates",
                "formation_energy",
                "shifted_energy",
                "forces",
            },
        )

        self.assertEqual(first["species"], ["H", "O", "H"])

        # Coordinates are exported unchanged.
        np.testing.assert_array_almost_equal(
            first["coordinates"], self.test_data["coords"][0]
        )

        # Energies: eV -> kcal/mol (1 eV ≈ 23.06 kcal/mol).
        energy_ref = self.test_data["energies"][0] * 23.06054783061903
        self.assertAlmostEqual(first["formation_energy"], energy_ref, places=5)
        self.assertAlmostEqual(first["shifted_energy"], energy_ref, places=5)

        # Forces use the same factor.
        forces_ref = self.test_data["forces"][0] * 23.06054783061903
        np.testing.assert_array_almost_equal(first["forces"], forces_ref, decimal=5)

    def test_fennol_export_custom_train_size(self):
        """Test FeNNol export with custom training size."""
        path = self._new_pickle_path()

        # train_size=0.5 on 2 frames gives 1 training and 1 validation frame.
        self.system.to("fennol", path, train_size=0.5)
        payload = self._read_pickle(path)

        self.assertEqual(len(payload["training"]), 1)
        self.assertEqual(len(payload["validation"]), 1)

    def test_fennol_export_all_training(self):
        """Test FeNNol export with all data as training."""
        path = self._new_pickle_path()

        # train_size=1.0 puts both frames in training, none in validation.
        self.system.to("fennol", path, train_size=1.0)
        payload = self._read_pickle(path)

        self.assertEqual(len(payload["training"]), 2)
        self.assertEqual(len(payload["validation"]), 0)

    def test_fennol_single_frame(self):
        """Test FeNNol export with single frame."""
        # Keep only the first frame of every per-frame array.
        per_frame_keys = ("coords", "cells", "energies", "forces")
        single_frame_data = {
            key: (value[:1] if key in per_frame_keys else value)
            for key, value in self.test_data.items()
        }
        single_system = dpdata.LabeledSystem(data=single_frame_data)

        path = self._new_pickle_path()
        single_system.to("fennol", path)
        payload = self._read_pickle(path)

        # int(1 * 0.8) = 0, so the only frame lands in validation.
        self.assertEqual(len(payload["training"]), 0)
        self.assertEqual(len(payload["validation"]), 1)


# Run the test suite only when this file is executed directly, not on import.
if __name__ == "__main__":
    unittest.main()