diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 0e770dc20..fff8e930e 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -54,60 +54,80 @@ def to_system_data(folder, type_map=None, labels=True): if os.path.isfile(os.path.join(folder, "nopbc")): data["nopbc"] = True sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - assert len(sets) == 1, "Mixed type must have only one set!" - cells, coords, eners, forces, virs, real_atom_types = _load_set( - sets[0], data.get("nopbc", False) - ) - nframes = np.reshape(cells, [-1, 3, 3]).shape[0] - cells = np.reshape(cells, [nframes, 3, 3]) - coords = np.reshape(coords, [nframes, -1, 3]) - real_atom_types = np.reshape(real_atom_types, [nframes, -1]) - natom = real_atom_types.shape[1] - if labels: - if eners is not None and eners.size > 0: + all_cells = [] + all_coords = [] + all_eners = [] + all_forces = [] + all_virs = [] + all_real_atom_types = [] + for ii in sets: + cells, coords, eners, forces, virs, real_atom_types = _load_set( + ii, data.get("nopbc", False) + ) + nframes = np.reshape(cells, [-1, 3, 3]).shape[0] + all_cells.append(np.reshape(cells, [nframes, 3, 3])) + all_coords.append(np.reshape(coords, [nframes, -1, 3])) + all_real_atom_types.append(np.reshape(real_atom_types, [nframes, -1])) + if eners is not None: eners = np.reshape(eners, [nframes]) - if forces is not None and forces.size > 0: - forces = np.reshape(forces, [nframes, -1, 3]) - if virs is not None and virs.size > 0: - virs = np.reshape(virs, [nframes, 3, 3]) + if labels: + if eners is not None and eners.size > 0: + all_eners.append(np.reshape(eners, [nframes])) + if forces is not None and forces.size > 0: + all_forces.append(np.reshape(forces, [nframes, -1, 3])) + if virs is not None and virs.size > 0: + all_virs.append(np.reshape(virs, [nframes, 3, 3])) + all_cells_concat = np.concatenate(all_cells, axis=0) + all_coords_concat = np.concatenate(all_coords, axis=0) + all_real_atom_types_concat = 
np.concatenate(all_real_atom_types, axis=0) + all_eners_concat = None + all_forces_concat = None + all_virs_concat = None + if len(all_eners) > 0: + all_eners_concat = np.concatenate(all_eners, axis=0) + if len(all_forces) > 0: + all_forces_concat = np.concatenate(all_forces, axis=0) + if len(all_virs) > 0: + all_virs_concat = np.concatenate(all_virs, axis=0) data_list = [] while True: - if real_atom_types.size == 0: + if all_real_atom_types_concat.size == 0: break temp_atom_numbs = [ - np.count_nonzero(real_atom_types[0] == i) + np.count_nonzero(all_real_atom_types_concat[0] == i) for i in range(len(data["atom_names"])) ] # temp_formula = formula(data['atom_names'], temp_atom_numbs) - temp_idx = np.arange(real_atom_types.shape[0])[ - (real_atom_types == real_atom_types[0]).all(-1) + temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) ] - rest_idx = np.arange(real_atom_types.shape[0])[ - (real_atom_types != real_atom_types[0]).any(-1) + rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ + (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) ] temp_data = data.copy() + temp_data["atom_names"] = data["atom_names"].copy() temp_data["atom_numbs"] = temp_atom_numbs - temp_data["atom_types"] = real_atom_types[0] - real_atom_types = real_atom_types[rest_idx] - temp_data["cells"] = cells[temp_idx] - cells = cells[rest_idx] - temp_data["coords"] = coords[temp_idx] - coords = coords[rest_idx] + temp_data["atom_types"] = all_real_atom_types_concat[0] + all_real_atom_types_concat = all_real_atom_types_concat[rest_idx] + temp_data["cells"] = all_cells_concat[temp_idx] + all_cells_concat = all_cells_concat[rest_idx] + temp_data["coords"] = all_coords_concat[temp_idx] + all_coords_concat = all_coords_concat[rest_idx] if labels: - if eners is not None and eners.size > 0: - temp_data["energies"] = eners[temp_idx] - eners = eners[rest_idx] - if forces is not None and forces.size > 
0: - temp_data["forces"] = forces[temp_idx] - forces = forces[rest_idx] - if virs is not None and virs.size > 0: - temp_data["virials"] = virs[temp_idx] - virs = virs[rest_idx] + if all_eners_concat is not None and all_eners_concat.size > 0: + temp_data["energies"] = all_eners_concat[temp_idx] + all_eners_concat = all_eners_concat[rest_idx] + if all_forces_concat is not None and all_forces_concat.size > 0: + temp_data["forces"] = all_forces_concat[temp_idx] + all_forces_concat = all_forces_concat[rest_idx] + if all_virs_concat is not None and all_virs_concat.size > 0: + temp_data["virials"] = all_virs_concat[temp_idx] + all_virs_concat = all_virs_concat[rest_idx] data_list.append(temp_data) return data_list -def dump(folder, data, comp_prec=np.float32, remove_sets=True): +def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True): os.makedirs(folder, exist_ok=True) sets = sorted(glob.glob(os.path.join(folder, "set.*"))) if len(sets) > 0: @@ -164,20 +184,29 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True): np.int64 ) # dump frame properties: cell, coord, energy, force and virial - set_folder = os.path.join(folder, "set.%03d" % 0) - os.makedirs(set_folder) - np.save(os.path.join(set_folder, "box"), cells) - np.save(os.path.join(set_folder, "coord"), coords) - if eners is not None: - np.save(os.path.join(set_folder, "energy"), eners) - if forces is not None: - np.save(os.path.join(set_folder, "force"), forces) - if virials is not None: - np.save(os.path.join(set_folder, "virial"), virials) - if real_atom_types is not None: - np.save(os.path.join(set_folder, "real_atom_types"), real_atom_types) - if "atom_pref" in data: - np.save(os.path.join(set_folder, "atom_pref"), atom_pref) + nsets = nframes // set_size + if set_size * nsets < nframes: + nsets += 1 + for ii in range(nsets): + set_stt = ii * set_size + set_end = (ii + 1) * set_size + set_folder = os.path.join(folder, "set.%06d" % ii) + os.makedirs(set_folder) + 
np.save(os.path.join(set_folder, "box"), cells[set_stt:set_end]) + np.save(os.path.join(set_folder, "coord"), coords[set_stt:set_end]) + if eners is not None: + np.save(os.path.join(set_folder, "energy"), eners[set_stt:set_end]) + if forces is not None: + np.save(os.path.join(set_folder, "force"), forces[set_stt:set_end]) + if virials is not None: + np.save(os.path.join(set_folder, "virial"), virials[set_stt:set_end]) + if real_atom_types is not None: + np.save( + os.path.join(set_folder, "real_atom_types"), + real_atom_types[set_stt:set_end], + ) + if "atom_pref" in data: + np.save(os.path.join(set_folder, "atom_pref"), atom_pref[set_stt:set_end]) try: os.remove(os.path.join(folder, "nopbc")) except OSError: @@ -187,8 +216,8 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True): pass -def mix_system(*system, type_map, split_num=200, **kwargs): - """Mix the systems into mixed_type ones +def mix_system(*system, type_map, **kwargs): + """Mix the systems into mixed_type ones according to the unified given type_map. 
Parameters ---------- @@ -196,24 +225,19 @@ def mix_system(*system, type_map, split_num=200, **kwargs): The systems to mix type_map : list of str Maps atom type to name - split_num : int - Number of frames in each system Returns ------- mixed_systems: dict - dict of mixed system with key '{atom_numbs}/sys.xxx' + dict of mixed system with key 'atom_numbs' """ mixed_systems = {} temp_systems = {} - atom_numbs_sys_index = {} # index of sys atom_numbs_frame_index = {} # index of frames in cur sys for sys in system: tmp_sys = sys.copy() natom = tmp_sys.get_natoms() tmp_sys.convert_to_mixed_type(type_map=type_map) - if str(natom) not in atom_numbs_sys_index: - atom_numbs_sys_index[str(natom)] = 0 if str(natom) not in atom_numbs_frame_index: atom_numbs_frame_index[str(natom)] = 0 atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes() @@ -221,27 +245,14 @@ def mix_system(*system, type_map, split_num=200, **kwargs): temp_systems[str(natom)] = tmp_sys else: temp_systems[str(natom)].append(tmp_sys) - if atom_numbs_frame_index[str(natom)] >= split_num: - while True: - sys_split, temp_systems[str(natom)], rest_num = split_system( - temp_systems[str(natom)], split_num=split_num - ) - sys_name = ( - f"{str(natom)}/sys." + "%.6d" % atom_numbs_sys_index[str(natom)] - ) - mixed_systems[sys_name] = sys_split - atom_numbs_sys_index[str(natom)] += 1 - if rest_num < split_num: - atom_numbs_frame_index[str(natom)] = rest_num - break for natom in temp_systems: if atom_numbs_frame_index[natom] > 0: - sys_name = f"{natom}/sys." 
+ "%.6d" % atom_numbs_sys_index[natom] + sys_name = f"{natom}" mixed_systems[sys_name] = temp_systems[natom] return mixed_systems -def split_system(sys, split_num=100): +def split_system(sys, split_num=10000): rest = sys.get_nframes() - split_num if rest <= 0: return sys, None, 0 diff --git a/dpdata/format.py b/dpdata/format.py index b4fc5a8e5..c6ba91b7b 100644 --- a/dpdata/format.py +++ b/dpdata/format.py @@ -132,7 +132,7 @@ def to_multi_systems(self, formulas, directory, **kwargs): "%s doesn't support MultiSystems.to" % (self.__class__.__name__) ) - def mix_system(self, *system, type_map, split_num=200, **kwargs): + def mix_system(self, *system, type_map, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. Parameters @@ -141,13 +141,11 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs): The systems to mix type_map : list of str Maps atom type to name - split_num : int - Number of frames in each system Returns ------- mixed_systems: dict - dict of mixed system with key '{atom_numbs}/sys.xxx' + dict of mixed system with key 'atom_numbs' """ raise NotImplementedError( "%s doesn't support System.from" % (self.__class__.__name__) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index dcb9d810c..499e23b21 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -117,7 +117,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): file_name, type_map=type_map, labels=True ) - def mix_system(self, *system, type_map, split_num=200, **kwargs): + def mix_system(self, *system, type_map, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. 
Parameters @@ -126,49 +126,22 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs): The systems to mix type_map : list of str Maps atom type to name - split_num : int - Number of frames in each system Returns ------- mixed_systems: dict - dict of mixed system with key '{atom_numbs}/sys.xxx' + dict of mixed system with key 'atom_numbs' """ - return dpdata.deepmd.mixed.mix_system( - *system, type_map=type_map, split_num=split_num, **kwargs - ) + return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs) def from_multi_systems(self, directory, **kwargs): - """MultiSystems.from - - Parameters - ---------- - directory : str - directory of system - - Returns - ------- - filenames: list[str] - list of filenames - """ - if self.MultiMode == self.MultiModes.Directory: - level_1_dir = [ - os.path.join(directory, name) - for name in os.listdir(directory) - if os.path.isdir(os.path.join(directory, name)) - and os.path.isfile(os.path.join(directory, name, "type_map.raw")) - ] - level_2_dir = [ - os.path.join(directory, name1, name2) - for name1 in os.listdir(directory) - for name2 in os.listdir(os.path.join(directory, name1)) - if os.path.isdir(os.path.join(directory, name1)) - and os.path.isdir(os.path.join(directory, name1, name2)) - and os.path.isfile( - os.path.join(directory, name1, name2, "type_map.raw") - ) - ] - return level_1_dir + level_2_dir + sys_dir = [] + for root, dirs, files in os.walk(directory): + if ( + "type_map.raw" in files + ): # mixed_type format systems must have type_map.raw + sys_dir.append(root) + return sys_dir MultiMode = Format.MultiModes.Directory diff --git a/dpdata/system.py b/dpdata/system.py index 887aba15f..802b352c5 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1307,15 +1307,13 @@ def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs): if labeled: data_list = fmtobj.from_labeled_system_mix(dd, **kwargs) for data_item in data_list: - system_list.append(LabeledSystem(data=data_item)) + 
system_list.append(LabeledSystem(data=data_item, **kwargs)) else: data_list = fmtobj.from_system_mix(dd, **kwargs) for data_item in data_list: - system_list.append(System(data=data_item)) - return self.__class__( - *system_list, - type_map=kwargs["type_map"] if "type_map" in kwargs else None, - ) + system_list.append(System(data=data_item, **kwargs)) + self.append(*system_list) + return self def to_fmt_obj(self, fmtobj, directory, *args, **kwargs): if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index 9e6ee9dd1..19c46e727 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -1,6 +1,7 @@ import os import shutil import unittest +from glob import glob from itertools import permutations import numpy as np @@ -8,7 +9,9 @@ from context import dpdata -class TestMixedMultiSystems(unittest.TestCase, CompLabeledSys, MultiSystems, IsNoPBC): +class TestMixedMultiSystemsDumpLoad( + unittest.TestCase, CompLabeledSys, MultiSystems, IsNoPBC +): def setUp(self): self.places = 6 self.e_places = 6 @@ -54,17 +57,18 @@ def setUp(self): system_1_modified_type_3, ) self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed") - self.place_holder_ms = dpdata.MultiSystems().load_systems_from_file( - "tmp.deepmd.mixed/5", fmt="deepmd/npy" - ) - self.place_holder_ms += dpdata.MultiSystems().load_systems_from_file( - "tmp.deepmd.mixed/4", fmt="deepmd/npy" - ) - self.systems = dpdata.MultiSystems().load_systems_from_file( - "tmp.deepmd.mixed", fmt="deepmd/npy/mixed" - ) + self.place_holder_ms = dpdata.MultiSystems() + self.place_holder_ms.from_deepmd_npy("tmp.deepmd.mixed", fmt="deepmd/npy") + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed("tmp.deepmd.mixed", fmt="deepmd/npy/mixed") self.system_1 = self.ms["C1H4A0B0D0"] self.system_2 = self.systems["C1H4A0B0D0"] + mixed_sets = glob("tmp.deepmd.mixed/*/set.*") + self.assertEqual(len(mixed_sets), 2) + for i in mixed_sets: 
+ self.assertEqual( + os.path.exists(os.path.join(i, "real_atom_types.npy")), True + ) self.system_names = [ "C1H4A0B0D0", @@ -106,5 +110,107 @@ def test_str(self): ) +class TestMixedMultiSystemsTypeChange( + unittest.TestCase, CompLabeledSys, MultiSystems, IsNoPBC +): + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + + # C1H3 + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) + # C1H1A1B2 + system_1_modified_type_1 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 2, 1] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 2, 3]) + # C1H1A2B1 + system_1_modified_type_2 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "D"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) + # C1H1A1D2 + system_1_modified_type_3 = dpdata.LabeledSystem(data=tmp_data) + + self.ms = dpdata.MultiSystems( + system_1, + system_2, + system_1_modified_type_1, + system_1_modified_type_2, + system_1_modified_type_3, + type_map=["TOKEN"], + ) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed") + self.place_holder_ms = dpdata.MultiSystems() + self.place_holder_ms.from_deepmd_npy("tmp.deepmd.mixed", fmt="deepmd/npy") + self.systems = dpdata.MultiSystems(type_map=["TOKEN"]) + self.systems.from_deepmd_npy_mixed("tmp.deepmd.mixed", fmt="deepmd/npy/mixed") + self.system_1 = self.ms["TOKEN0C1H4A0B0D0"] + self.system_2 = self.systems["TOKEN0C1H4A0B0D0"] + mixed_sets = glob("tmp.deepmd.mixed/*/set.*") +
self.assertEqual(len(mixed_sets), 2) + for i in mixed_sets: + self.assertEqual( + os.path.exists(os.path.join(i, "real_atom_types.npy")), True + ) + + self.system_names = [ + "TOKEN0C1H4A0B0D0", + "TOKEN0C1H3A0B0D0", + "TOKEN0C1H1A1B2D0", + "TOKEN0C1H1A2B1D0", + "TOKEN0C1H1A1B0D2", + ] + self.system_sizes = { + "TOKEN0C1H4A0B0D0": 1, + "TOKEN0C1H3A0B0D0": 1, + "TOKEN0C1H1A1B2D0": 1, + "TOKEN0C1H1A2B1D0": 1, + "TOKEN0C1H1A1B0D2": 1, + } + self.atom_names = ["C", "H", "A", "B", "D"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed"): + shutil.rmtree("tmp.deepmd.mixed") + + def test_len(self): + self.assertEqual(len(self.ms), 5) + self.assertEqual(len(self.place_holder_ms), 2) + self.assertEqual(len(self.systems), 5) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 5) + self.assertEqual(self.place_holder_ms.get_nframes(), 5) + self.assertEqual(self.systems.get_nframes(), 5) + + def test_str(self): + self.assertEqual(str(self.ms), "MultiSystems (5 systems containing 5 frames)") + self.assertEqual( + str(self.place_holder_ms), "MultiSystems (2 systems containing 5 frames)" + ) + self.assertEqual( + str(self.systems), "MultiSystems (5 systems containing 5 frames)" + ) + + if __name__ == "__main__": unittest.main()