-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 796fa1e
Showing
7 changed files
with
751 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# LnQM dataset | ||
|
||
[![Python](https://img.shields.io/badge/python-3.11.4-blue.svg)](https://www.python.org) | ||
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) | ||
|
||
This is the official repository of the `LnQM` dataset developed by the Grimme group in Bonn. | ||
|
||
<div align="center"> | ||
<img src="./assets/logo.png" alt="LnQM Dataset" width="400"> | ||
</div> | ||
|
||
|
||
# Data | ||
|
||
The data is available under: [https://zenodo.org/records/10222384](https://zenodo.org/records/10222384) | ||
|
||
|
||
# Setup | ||
|
||
We provide a dataset class and a sample representation based on `pytorch`. | ||
For this purpose please install the package requirements: | ||
```bash | ||
conda env create --file requirements.yml | ||
``` | ||
|
||
Basic usage of the dataset: | ||
```python | ||
from lnqm import LnQM_Dataset | ||
|
||
# load LnQM from disk | ||
dataset = LnQM_Dataset(path_to_hdf5="/path/to/lnqm.h5") | ||
|
||
# loop over each sample in the dataset | ||
for data in dataset: | ||
# ... | ||
# do sth. with each sample | ||
# ... | ||
|
||
# e.g. access properties | ||
print(data.uid) | ||
``` | ||
|
||
For a more detailed description of the included features, see the data model in `LnQM_Sample`. | ||
If required, a `Data` object can be converted via: | ||
|
||
```python | ||
from lnqm import LnQM_Sample | ||
sample = LnQM_Sample(**data) | ||
``` | ||
|
||
# Citations | ||
|
||
When using or referring to the `LnQM` dataset, please cite: | ||
- C. Hölzer, I. Gordiy, S. Grimme, M. Bursch | ||
... tbd ... | ||
|
||
# License | ||
|
||
[![CC BY NC 4.0][cc-by-nc-image]][cc-by-nc] | ||
|
||
This work is licensed under a | ||
[Creative Commons Attribution-NonCommercial 4.0 International License][cc-by-nc]. | ||
|
||
[cc-by-nc]: http://creativecommons.org/licenses/by-nc/4.0/ | ||
[cc-by-nc-image]: https://i.creativecommons.org/l/by-nc/4.0/88x31.png |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .dataset import LnQM_Dataset | ||
from .sample import LnQM_Sample |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
from collections import defaultdict | ||
from pathlib import Path | ||
|
||
import h5py | ||
import numpy as np | ||
import torch | ||
from torch_geometric.data import Data, InMemoryDataset | ||
|
||
|
||
class LnQM_Dataset(InMemoryDataset):
    """In-memory dataset wrapping the LnQM data stored in an HDF5 file.

    Example:
        >>> dataset = LnQM_Dataset(path_to_hdf5="lnqm.h5")
    """

    def __init__(self, path_to_hdf5: str | Path, transform=None):
        """Set up the dataset, reading ``path_to_hdf5`` when it is truthy.

        A falsy ``path_to_hdf5`` yields an empty dataset instead.
        """
        super().__init__("./", transform, pre_transform=None, pre_filter=None)
        self.path_to_hdf5 = path_to_hdf5

        if not self.path_to_hdf5:
            # nothing to load: start from an empty dataset
            self.data, self.slices = Data(), defaultdict(dict, {})
            return

        self.data, self.slices = LnQM_Dataset.from_hdf5(self.path_to_hdf5)

    @staticmethod
    def from_hdf5(fp: str | Path) -> tuple[Data, defaultdict]:
        """Read the ``data`` and ``slices`` groups of an HDF5 file.

        Returns the ``Data`` object built from the ``data`` group together
        with the slice tensors (wrapped in a ``defaultdict``), one entry per
        key found in the ``data`` group.
        """

        def to_signed(array):
            # uint64 arrays are cast to int64 before any further handling
            return array.astype(np.int64) if array.dtype == np.uint64 else array

        collated = {}
        boundaries = {}
        with h5py.File(fp, "r") as h5:
            for name in h5["data"].keys():
                raw_values = h5["data"][name][:]
                raw_bounds = h5["slices"][name][:]
                values, bounds = to_signed(raw_values), to_signed(raw_bounds)

                if name == "uid":
                    # uid holds strings rather than numbers, so it is handled
                    # separately: decode into a plain list, no tensor
                    collated[name] = [raw.decode("utf-8") for raw in values.tolist()]
                else:
                    collated[name] = torch.from_numpy(values)
                boundaries[name] = torch.from_numpy(bounds)
        return Data.from_dict(collated), defaultdict(dict, boundaries)

    def to_hdf5(self, fp: str | Path):
        """Write the dataset's data and slices to an HDF5 file at ``fp``.

        The file is opened in write mode and receives two groups, ``data``
        and ``slices``, each holding one dataset per key.
        """

        def to_unsigned(array):
            # int64 arrays are stored as uint64
            return array.astype(np.uint64) if array.dtype == np.int64 else array

        with h5py.File(fp, "w") as h5:
            values_group = h5.create_group("data")
            bounds_group = h5.create_group("slices")

            for name, payload in self._data.items():
                # lists (strings such as uid) are passed to h5py unchanged
                if not isinstance(payload, list):
                    if isinstance(payload, torch.Tensor):
                        payload = payload.numpy()
                    payload = to_unsigned(payload)
                values_group.create_dataset(name, data=payload)

                # the matching slice tensor goes through the same int64 cast
                bounds = to_unsigned(self.slices[name].numpy())
                bounds_group.create_dataset(name, data=bounds)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
from pydantic import BaseModel | ||
from torch import Tensor | ||
|
||
|
||
class LnQM_Sample(BaseModel):
    """
    Data model describing a single sample of the `LnQM` dataset.

    The model aims at a comprehensive representation, so some attributes
    carry redundant information. For instance:
    - The total electron count (`nel`) follows from the alpha (`nel_alpha`)
      and beta (`nel_beta`) electron counts.
    - The combined exchange-correlation energy (`exc`) is the sum of the
      exchange (`ex`) and correlation (`ec`) energies.
    - The Mulliken charges are also contained in the Mayer population.

    Units:
        Unless a field's description states otherwise (e.g. timings in
        seconds, rotational constants in MHz), numerical attributes are
        expressed in atomic units (Hartree, Bohr, etc.).
    """

    class Config:
        # the fields below are annotated with torch's Tensor type
        arbitrary_types_allowed = True

    # ---- General properties ----
    uid: str
    """Unique identifier of the calculation."""

    time_singlepoint: Tensor
    """Duration of the singlepoint ORCA calculation in seconds."""

    time_geoopt: Tensor
    """Duration of the geometry optimization ORCA calculation in seconds."""

    # ---- Geometric properties ----
    numbers: Tensor
    """Atomic numbers."""

    coord: Tensor
    """Optimized geometry in Bohr."""

    gradient: Tensor
    """Gradient at the optimized geometry in Eh/Bohr."""

    trajectory: Tensor
    """Atomic positions along the optimization trajectory in Bohr."""

    trajectory_energies: Tensor
    """Energies along the optimization trajectory in Eh."""

    trajectory_etot: Tensor
    """Energies without dispersion correction along the optimization
    trajectory in Eh."""

    trajectory_gradients: Tensor
    """Gradients along the optimization trajectory in Eh/Bohr."""

    cn: Tensor
    """Coordination numbers from the D4 calculation."""

    # ---- Energetic properties ----
    energy: Tensor
    """Total single point energy from the ORCA calculation in Eh."""

    energy_geoopt: Tensor
    """Energy after geometry optimization in Eh.
    Important for comparability with the other trajectory features."""

    etot_geoopt: Tensor
    """Energy after geometry optimization without dispersion correction in Eh.
    Important for comparability with the other trajectory features."""

    ex: Tensor
    """Exchange energy in Eh."""

    ec: Tensor
    """Correlation energy in Eh."""

    exc: Tensor
    """Exchange-correlation energy in Eh."""

    ecnl: Tensor
    """Non-local correlation energy in Eh."""

    eemb: Tensor
    """Embedding correction energy in Eh."""

    homo_spin_up: Tensor
    """HOMO energy of the spin-up electrons in Eh."""
    # NOTE: SOMO for alpha spin channel in open shell systems

    lumo_spin_up: Tensor
    """LUMO energy of the spin-up electrons in Eh."""

    homo_spin_down: Tensor
    """HOMO energy of the spin-down electrons in Eh."""
    # NOTE: SOMO for beta spin channel in open shell systems

    lumo_spin_down: Tensor
    """LUMO energy of the spin-down electrons in Eh."""

    orbital_energies_spin_up: Tensor
    """Orbital energies of the spin-up electrons in Eh."""

    orbital_energies_spin_down: Tensor
    """Orbital energies of the spin-down electrons in Eh."""

    # ---- Electronic properties ----
    charge: Tensor
    """Total charge of the molecule in e."""

    unpaired_e: Tensor
    """Number of unpaired electrons."""

    nel_alpha: Tensor
    """Number of alpha electrons."""

    nel_beta: Tensor
    """Number of beta electrons."""

    nel: Tensor
    """Total number of electrons."""

    polarizabilities: Tensor
    """Polarizabilities from the D4 calculation."""

    eeq: Tensor
    """EEQ charges from the tblite calculation in e."""

    ceh: Tensor
    """CEH charges from the tblite calculation in e."""

    q_gfn2: Tensor
    """Charges from the GFN2 calculation in e."""

    # ---- Population and bond properties ----
    mayer_pop: Tensor
    """Values of the Mayer population analysis.

    Sorted by:
        NA  - Mulliken gross atomic population
        ZA  - Total nuclear charge
        QA  - Mulliken gross atomic charge
        VA  - Mayer's total valence
        BVA - Mayer's bonded valence
        FA  - Mayer's free valence
    """

    mayer_bo: Tensor
    """Mayer bond order values with tuple indices.

    Sorted by:
        Atom idx A - idx of atom A
        Atom idx B - idx of atom B
        Bond order - value of bond order
    """

    loewdin_charges: Tensor
    """Loewdin atomic charges."""

    loewdin_spins: Tensor
    """Loewdin spin populations."""

    mulliken_charges: Tensor
    """Mulliken atomic charges."""

    mulliken_spins: Tensor
    """Mulliken spin populations."""

    hirshfeld_charges: Tensor
    """Hirshfeld atomic charges."""

    hirshfeld_spins: Tensor
    """Hirshfeld spin populations."""

    hirshfeld_alpha: Tensor
    """Hirshfeld total integrated alpha density."""

    hirshfeld_beta: Tensor
    """Hirshfeld total integrated beta density."""

    # ---- Molecular properties ----
    rot_const: Tensor
    """Rotational constants in MHz."""

    rot_dipole: Tensor
    """Dipole components along the rotational axes in a.u."""

    dipole: Tensor
    """Magnitude of the dipole moment in a.u."""

    dipole_ele: Tensor
    """Electronic xyz-contribution to the dipole moment."""

    dipole_nuc: Tensor
    """Nuclear xyz-contribution to the dipole moment."""

    dipole_tot: Tensor
    """Total xyz-dipole moment."""

    def __str__(self):
        """Return a short label of the form ``Sample(<uid>)``."""
        return "Sample({})".format(self.uid)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: lnqm | ||
channels: | ||
- conda-forge | ||
- defaults | ||
- pyg | ||
dependencies: | ||
- python=3.11.4 | ||
- h5py | ||
- pytorch | ||
- pydantic | ||
- pip | ||
- pip: | ||
- torch_geometric |