Skip to content

Commit

Permalink
fix: standardize the deepmd/npy/mixed format (#425)
Browse files Browse the repository at this point in the history
This PR combines two commits:

1. Update the dpdata.MultiSystems() instance itself when the
from_deepmd_npy_mixed method is called;

Previously, dpdata.MultiSystems().from_deepmd_npy_mixed only returned the
result and did not modify the instance itself. This commit fixes that, making
it consistent with the other `from` methods.
(Another bug is also fixed: not calling .copy() on data["atom_names"] could
cause errors when manually changing the type_map of a system. UTs are
added in the next commit.)


2. Allow multiple sets in mixed-type format;

The limits are now a maximum of 50000 frames per system and 2000 frames per set.
I chose 2000 rather than 5000 frames per set because I expect the maximum
set size to be reached much more often in the mixed-type format than in other
formats; 2000 frames is enough for large batches and is friendlier on
memory.

Add UTs for changing the type_map and for the mixed_type directory check.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
iProzd and pre-commit-ci[bot] committed Feb 25, 2023
1 parent aec7747 commit 06c21b6
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 131 deletions.
159 changes: 85 additions & 74 deletions dpdata/deepmd/mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,60 +54,80 @@ def to_system_data(folder, type_map=None, labels=True):
if os.path.isfile(os.path.join(folder, "nopbc")):
data["nopbc"] = True
sets = sorted(glob.glob(os.path.join(folder, "set.*")))
assert len(sets) == 1, "Mixed type must have only one set!"
cells, coords, eners, forces, virs, real_atom_types = _load_set(
sets[0], data.get("nopbc", False)
)
nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
cells = np.reshape(cells, [nframes, 3, 3])
coords = np.reshape(coords, [nframes, -1, 3])
real_atom_types = np.reshape(real_atom_types, [nframes, -1])
natom = real_atom_types.shape[1]
if labels:
if eners is not None and eners.size > 0:
all_cells = []
all_coords = []
all_eners = []
all_forces = []
all_virs = []
all_real_atom_types = []
for ii in sets:
cells, coords, eners, forces, virs, real_atom_types = _load_set(
ii, data.get("nopbc", False)
)
nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
all_cells.append(np.reshape(cells, [nframes, 3, 3]))
all_coords.append(np.reshape(coords, [nframes, -1, 3]))
all_real_atom_types.append(np.reshape(real_atom_types, [nframes, -1]))
if eners is not None:
eners = np.reshape(eners, [nframes])
if forces is not None and forces.size > 0:
forces = np.reshape(forces, [nframes, -1, 3])
if virs is not None and virs.size > 0:
virs = np.reshape(virs, [nframes, 3, 3])
if labels:
if eners is not None and eners.size > 0:
all_eners.append(np.reshape(eners, [nframes]))
if forces is not None and forces.size > 0:
all_forces.append(np.reshape(forces, [nframes, -1, 3]))
if virs is not None and virs.size > 0:
all_virs.append(np.reshape(virs, [nframes, 3, 3]))
all_cells_concat = np.concatenate(all_cells, axis=0)
all_coords_concat = np.concatenate(all_coords, axis=0)
all_real_atom_types_concat = np.concatenate(all_real_atom_types, axis=0)
all_eners_concat = None
all_forces_concat = None
all_virs_concat = None
if len(all_eners) > 0:
all_eners_concat = np.concatenate(all_eners, axis=0)
if len(all_forces) > 0:
all_forces_concat = np.concatenate(all_forces, axis=0)
if len(all_virs) > 0:
all_virs_concat = np.concatenate(all_virs, axis=0)
data_list = []
while True:
if real_atom_types.size == 0:
if all_real_atom_types_concat.size == 0:
break
temp_atom_numbs = [
np.count_nonzero(real_atom_types[0] == i)
np.count_nonzero(all_real_atom_types_concat[0] == i)
for i in range(len(data["atom_names"]))
]
# temp_formula = formula(data['atom_names'], temp_atom_numbs)
temp_idx = np.arange(real_atom_types.shape[0])[
(real_atom_types == real_atom_types[0]).all(-1)
temp_idx = np.arange(all_real_atom_types_concat.shape[0])[
(all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1)
]
rest_idx = np.arange(real_atom_types.shape[0])[
(real_atom_types != real_atom_types[0]).any(-1)
rest_idx = np.arange(all_real_atom_types_concat.shape[0])[
(all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1)
]
temp_data = data.copy()
temp_data["atom_names"] = data["atom_names"].copy()
temp_data["atom_numbs"] = temp_atom_numbs
temp_data["atom_types"] = real_atom_types[0]
real_atom_types = real_atom_types[rest_idx]
temp_data["cells"] = cells[temp_idx]
cells = cells[rest_idx]
temp_data["coords"] = coords[temp_idx]
coords = coords[rest_idx]
temp_data["atom_types"] = all_real_atom_types_concat[0]
all_real_atom_types_concat = all_real_atom_types_concat[rest_idx]
temp_data["cells"] = all_cells_concat[temp_idx]
all_cells_concat = all_cells_concat[rest_idx]
temp_data["coords"] = all_coords_concat[temp_idx]
all_coords_concat = all_coords_concat[rest_idx]
if labels:
if eners is not None and eners.size > 0:
temp_data["energies"] = eners[temp_idx]
eners = eners[rest_idx]
if forces is not None and forces.size > 0:
temp_data["forces"] = forces[temp_idx]
forces = forces[rest_idx]
if virs is not None and virs.size > 0:
temp_data["virials"] = virs[temp_idx]
virs = virs[rest_idx]
if all_eners_concat is not None and all_eners_concat.size > 0:
temp_data["energies"] = all_eners_concat[temp_idx]
all_eners_concat = all_eners_concat[rest_idx]
if all_forces_concat is not None and all_forces_concat.size > 0:
temp_data["forces"] = all_forces_concat[temp_idx]
all_forces_concat = all_forces_concat[rest_idx]
if all_virs_concat is not None and all_virs_concat.size > 0:
temp_data["virials"] = all_virs_concat[temp_idx]
all_virs_concat = all_virs_concat[rest_idx]
data_list.append(temp_data)
return data_list


def dump(folder, data, comp_prec=np.float32, remove_sets=True):
def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True):
os.makedirs(folder, exist_ok=True)
sets = sorted(glob.glob(os.path.join(folder, "set.*")))
if len(sets) > 0:
Expand Down Expand Up @@ -164,20 +184,29 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True):
np.int64
)
# dump frame properties: cell, coord, energy, force and virial
set_folder = os.path.join(folder, "set.%03d" % 0)
os.makedirs(set_folder)
np.save(os.path.join(set_folder, "box"), cells)
np.save(os.path.join(set_folder, "coord"), coords)
if eners is not None:
np.save(os.path.join(set_folder, "energy"), eners)
if forces is not None:
np.save(os.path.join(set_folder, "force"), forces)
if virials is not None:
np.save(os.path.join(set_folder, "virial"), virials)
if real_atom_types is not None:
np.save(os.path.join(set_folder, "real_atom_types"), real_atom_types)
if "atom_pref" in data:
np.save(os.path.join(set_folder, "atom_pref"), atom_pref)
nsets = nframes // set_size
if set_size * nsets < nframes:
nsets += 1
for ii in range(nsets):
set_stt = ii * set_size
set_end = (ii + 1) * set_size
set_folder = os.path.join(folder, "set.%06d" % ii)
os.makedirs(set_folder)
np.save(os.path.join(set_folder, "box"), cells[set_stt:set_end])
np.save(os.path.join(set_folder, "coord"), coords[set_stt:set_end])
if eners is not None:
np.save(os.path.join(set_folder, "energy"), eners[set_stt:set_end])
if forces is not None:
np.save(os.path.join(set_folder, "force"), forces[set_stt:set_end])
if virials is not None:
np.save(os.path.join(set_folder, "virial"), virials[set_stt:set_end])
if real_atom_types is not None:
np.save(
os.path.join(set_folder, "real_atom_types"),
real_atom_types[set_stt:set_end],
)
if "atom_pref" in data:
np.save(os.path.join(set_folder, "atom_pref"), atom_pref[set_stt:set_end])
try:
os.remove(os.path.join(folder, "nopbc"))
except OSError:
Expand All @@ -187,61 +216,43 @@ def dump(folder, data, comp_prec=np.float32, remove_sets=True):
pass


def mix_system(*system, type_map, split_num=200, **kwargs):
"""Mix the systems into mixed_type ones
def mix_system(*system, type_map, **kwargs):
"""Mix the systems into mixed_type ones according to the unified given type_map.
Parameters
----------
*system : System
The systems to mix
type_map : list of str
Maps atom type to name
split_num : int
Number of frames in each system
Returns
-------
mixed_systems: dict
dict of mixed system with key '{atom_numbs}/sys.xxx'
dict of mixed system with key 'atom_numbs'
"""
mixed_systems = {}
temp_systems = {}
atom_numbs_sys_index = {} # index of sys
atom_numbs_frame_index = {} # index of frames in cur sys
for sys in system:
tmp_sys = sys.copy()
natom = tmp_sys.get_natoms()
tmp_sys.convert_to_mixed_type(type_map=type_map)
if str(natom) not in atom_numbs_sys_index:
atom_numbs_sys_index[str(natom)] = 0
if str(natom) not in atom_numbs_frame_index:
atom_numbs_frame_index[str(natom)] = 0
atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes()
if str(natom) not in temp_systems or not temp_systems[str(natom)]:
temp_systems[str(natom)] = tmp_sys
else:
temp_systems[str(natom)].append(tmp_sys)
if atom_numbs_frame_index[str(natom)] >= split_num:
while True:
sys_split, temp_systems[str(natom)], rest_num = split_system(
temp_systems[str(natom)], split_num=split_num
)
sys_name = (
f"{str(natom)}/sys." + "%.6d" % atom_numbs_sys_index[str(natom)]
)
mixed_systems[sys_name] = sys_split
atom_numbs_sys_index[str(natom)] += 1
if rest_num < split_num:
atom_numbs_frame_index[str(natom)] = rest_num
break
for natom in temp_systems:
if atom_numbs_frame_index[natom] > 0:
sys_name = f"{natom}/sys." + "%.6d" % atom_numbs_sys_index[natom]
sys_name = f"{natom}"
mixed_systems[sys_name] = temp_systems[natom]
return mixed_systems


def split_system(sys, split_num=100):
def split_system(sys, split_num=10000):
rest = sys.get_nframes() - split_num
if rest <= 0:
return sys, None, 0
Expand Down
6 changes: 2 additions & 4 deletions dpdata/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def to_multi_systems(self, formulas, directory, **kwargs):
"%s doesn't support MultiSystems.to" % (self.__class__.__name__)
)

def mix_system(self, *system, type_map, split_num=200, **kwargs):
def mix_system(self, *system, type_map, **kwargs):
"""Mix the systems into mixed_type ones according to the unified given type_map.
Parameters
Expand All @@ -141,13 +141,11 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs):
The systems to mix
type_map : list of str
Maps atom type to name
split_num : int
Number of frames in each system
Returns
-------
mixed_systems: dict
dict of mixed system with key '{atom_numbs}/sys.xxx'
dict of mixed system with key 'atom_numbs'
"""
raise NotImplementedError(
"%s doesn't support System.from" % (self.__class__.__name__)
Expand Down
47 changes: 10 additions & 37 deletions dpdata/plugins/deepmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs):
file_name, type_map=type_map, labels=True
)

def mix_system(self, *system, type_map, split_num=200, **kwargs):
def mix_system(self, *system, type_map, **kwargs):
"""Mix the systems into mixed_type ones according to the unified given type_map.
Parameters
Expand All @@ -126,49 +126,22 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs):
The systems to mix
type_map : list of str
Maps atom type to name
split_num : int
Number of frames in each system
Returns
-------
mixed_systems: dict
dict of mixed system with key '{atom_numbs}/sys.xxx'
dict of mixed system with key 'atom_numbs'
"""
return dpdata.deepmd.mixed.mix_system(
*system, type_map=type_map, split_num=split_num, **kwargs
)
return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs)

def from_multi_systems(self, directory, **kwargs):
"""MultiSystems.from
Parameters
----------
directory : str
directory of system
Returns
-------
filenames: list[str]
list of filenames
"""
if self.MultiMode == self.MultiModes.Directory:
level_1_dir = [
os.path.join(directory, name)
for name in os.listdir(directory)
if os.path.isdir(os.path.join(directory, name))
and os.path.isfile(os.path.join(directory, name, "type_map.raw"))
]
level_2_dir = [
os.path.join(directory, name1, name2)
for name1 in os.listdir(directory)
for name2 in os.listdir(os.path.join(directory, name1))
if os.path.isdir(os.path.join(directory, name1))
and os.path.isdir(os.path.join(directory, name1, name2))
and os.path.isfile(
os.path.join(directory, name1, name2, "type_map.raw")
)
]
return level_1_dir + level_2_dir
sys_dir = []
for root, dirs, files in os.walk(directory):
if (
"type_map.raw" in files
): # mixed_type format systems must have type_map.raw
sys_dir.append(root)
return sys_dir

MultiMode = Format.MultiModes.Directory

Expand Down
10 changes: 4 additions & 6 deletions dpdata/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,15 +1307,13 @@ def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs):
if labeled:
data_list = fmtobj.from_labeled_system_mix(dd, **kwargs)
for data_item in data_list:
system_list.append(LabeledSystem(data=data_item))
system_list.append(LabeledSystem(data=data_item, **kwargs))
else:
data_list = fmtobj.from_system_mix(dd, **kwargs)
for data_item in data_list:
system_list.append(System(data=data_item))
return self.__class__(
*system_list,
type_map=kwargs["type_map"] if "type_map" in kwargs else None,
)
system_list.append(System(data=data_item, **kwargs))
self.append(*system_list)
return self

def to_fmt_obj(self, fmtobj, directory, *args, **kwargs):
if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat):
Expand Down
Loading

0 comments on commit 06c21b6

Please sign in to comment.