Skip to content

Commit

Permalink
Add v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
hoelzerC committed Nov 29, 2023
0 parents commit 796fa1e
Show file tree
Hide file tree
Showing 7 changed files with 751 additions and 0 deletions.
407 changes: 407 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# LnQM dataset

[![Python](https://img.shields.io/badge/python-3.11.4-blue.svg)](https://www.python.org)
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

This is the official repository of the `LnQM` dataset developed by the Grimme group in Bonn.

<div align="center">
<img src="./assets/logo.png" alt="LnQM Dataset" width="400">
</div>


# Data

The data is available under: [https://zenodo.org/records/10222384](https://zenodo.org/records/10222384)


# Setup

We provide a dataset class and a sample representation based on `pytorch`.
For this purpose please install the package requirements:
```bash
conda env create --file requirements.yml
```

Basic usage of the dataset:
```python
from lnqm import LnQM_Dataset

# load LnQM from disk
dataset = LnQM_Dataset(path_to_hdf5="/path/to/lnqm.h5")

# loop over each sample in the dataset
for data in dataset:
# ...
# do something with each sample
# ...

# e.g. access properties
print(data.uid)
```

For a more detailed description of the features included see the datamodel in `LnQM_Sample`.
If required, a `Data` object can be converted via:

```python
from lnqm import LnQM_Sample
sample = LnQM_Sample(**data)
```

# Citations

When using or referring to the `LnQM` dataset, please cite:
- C. Hölzer, I. Gordiy, S. Grimme, M. Bursch
... tbd ...

# License

[![CC BY NC 4.0][cc-by-nc-image]][cc-by-nc]

This work is licensed under a
[Creative Commons Attribution-NonCommercial 4.0 International License][cc-by-nc].

[cc-by-nc]: http://creativecommons.org/licenses/by-nc/4.0/
[cc-by-nc-image]: https://i.creativecommons.org/l/by-nc/4.0/88x31.png
Binary file added assets/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions lnqm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dataset import LnQM_Dataset
from .sample import LnQM_Sample
68 changes: 68 additions & 0 deletions lnqm/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from collections import defaultdict
from pathlib import Path

import h5py
import numpy as np
import torch
from torch_geometric.data import Data, InMemoryDataset


class LnQM_Dataset(InMemoryDataset):
    """In-memory dataset holding LnQM data.

    The backing HDF5 file contains two groups, ``data`` and ``slices``, each
    holding one HDF5 dataset per property key. This is the layout written by
    :meth:`to_hdf5` and read by :meth:`from_hdf5`.

    Usage:
    >>> dataset = LnQM_Dataset(path_to_hdf5="lnqm.h5")
    """

    def __init__(self, path_to_hdf5: str | Path | None, transform=None):
        """Load the dataset from ``path_to_hdf5``.

        Args:
            path_to_hdf5: Location of the HDF5 file. A falsy value (``None``
                or an empty string) produces an empty dataset instead.
            transform: Passed through unchanged to ``InMemoryDataset``.
        """
        super().__init__("./", transform, pre_transform=None, pre_filter=None)
        self.path_to_hdf5 = path_to_hdf5

        if self.path_to_hdf5:
            self.data, self.slices = LnQM_Dataset.from_hdf5(self.path_to_hdf5)
        else:
            # empty dataset
            self.data, self.slices = Data(), defaultdict(dict, {})

    @staticmethod
    def _as_signed(array: np.ndarray) -> np.ndarray:
        """Return ``array`` as int64 if it is uint64, otherwise unchanged."""
        if array.dtype == np.uint64:
            return array.astype(np.int64)
        return array

    @staticmethod
    def _as_unsigned(value):
        """Return ``value`` as uint64 if it is int64, otherwise unchanged.

        Objects without a ``dtype`` attribute are passed through untouched
        instead of raising ``AttributeError``.
        """
        if getattr(value, "dtype", None) == np.int64:
            return value.astype(np.uint64)
        return value

    @staticmethod
    def from_hdf5(fp: str | Path) -> tuple[Data, defaultdict]:
        """Load data and slices from HDF5 file.

        Args:
            fp: Path of the HDF5 file to read.

        Returns:
            A ``(data, slices)`` tuple: a ``Data`` object built from the
            entries of the ``data`` group, and a ``defaultdict`` mapping each
            key to its slice tensor from the ``slices`` group.
        """
        data = {}
        slices = {}
        with h5py.File(fp, "r") as f:
            for key in f["data"].keys():
                # uint64 arrays are cast to int64 before the tensor conversion
                values = LnQM_Dataset._as_signed(f["data"][key][:])
                offsets = LnQM_Dataset._as_signed(f["slices"][key][:])
                slices[key] = torch.from_numpy(offsets)

                # uid is of dtype string, so we have to handle it separately
                if key == "uid":
                    # entries are decoded when they arrive as bytes; values
                    # that are already str are kept as they are
                    data[key] = [
                        s.decode("utf-8") if isinstance(s, bytes) else s
                        for s in values.tolist()
                    ]
                else:
                    data[key] = torch.from_numpy(values)
        return Data.from_dict(data), defaultdict(dict, slices)

    def to_hdf5(self, fp: str | Path):
        """Save the data and slices of the dataset to an HDF5 file.

        Args:
            fp: Path of the HDF5 file to write. The file is opened in ``"w"``
                mode.
        """
        with h5py.File(fp, "w") as f:
            data_group = f.create_group("data")
            slices_group = f.create_group("slices")

            for key, value in self._data.items():
                if isinstance(value, list):  # strings such as uid
                    if value and all(isinstance(v, str) for v in value):
                        # NumPy would turn a list of str into a fixed-width
                        # unicode array, which h5py refuses to store; an
                        # explicit variable-length UTF-8 dtype avoids that.
                        data_group.create_dataset(
                            key,
                            data=value,
                            dtype=h5py.string_dtype(encoding="utf-8"),
                        )
                    else:
                        data_group.create_dataset(key, data=value)
                else:
                    if isinstance(value, torch.Tensor):
                        # detach/cpu so tensors that track gradients or live
                        # on another device can be converted as well
                        value = value.detach().cpu().numpy()
                    # int64 is stored as uint64; from_hdf5 casts it back
                    data_group.create_dataset(
                        key, data=LnQM_Dataset._as_unsigned(value)
                    )

                # save slices
                slice_value = self.slices[key].detach().cpu().numpy()
                slices_group.create_dataset(
                    key, data=LnQM_Dataset._as_unsigned(slice_value)
                )
196 changes: 196 additions & 0 deletions lnqm/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
from pydantic import BaseModel
from torch import Tensor


class LnQM_Sample(BaseModel):
    """
    Data model describing a single sample of the `LnQM` dataset.

    The model aims at a comprehensive representation, so several attributes
    carry redundant information. For instance:
        - The total electron count (`nel`) follows from the alpha
          (`nel_alpha`) and beta (`nel_beta`) electron counts.
        - The combined exchange-correlation energy (`exc`) is the sum of the
          exchange (`ex`) and correlation (`ec`) energies.
        - The Mulliken charges are also contained in the Mayer population.

    Units:
        Numerical attributes are expressed in atomic units (Hartree, Bohr,
        etc.); some attributes state a different unit (e.g. seconds, MHz) in
        their own description.
    """

    class Config:
        arbitrary_types_allowed = True

    # ------------------------------------------------------------------
    # General properties
    # ------------------------------------------------------------------
    uid: str
    """Unique identifier of the calculation."""

    time_singlepoint: Tensor
    """Time taken by the ORCA singlepoint calculation in seconds."""

    time_geoopt: Tensor
    """Time taken by the ORCA geometry optimization in seconds."""

    # ------------------------------------------------------------------
    # Geometric properties
    # ------------------------------------------------------------------
    numbers: Tensor
    """Atomic numbers."""

    coord: Tensor
    """Optimized geometry in Bohr."""

    gradient: Tensor
    """Gradient at the optimized geometry in Eh/Bohr."""

    trajectory: Tensor
    """Atomic positions along the optimization trajectory in Bohr."""

    trajectory_energies: Tensor
    """Energies at the points of the optimization trajectory in Eh."""

    trajectory_etot: Tensor
    """Energies without dispersion correction at the points of the optimization trajectory in Eh."""

    trajectory_gradients: Tensor
    """Gradients at the points of the optimization trajectory in Eh/Bohr."""

    cn: Tensor
    """Coordination numbers from the D4 calculation."""

    # ------------------------------------------------------------------
    # Energetic properties
    # ------------------------------------------------------------------
    energy: Tensor
    """Total single point energy from the ORCA calculation in Eh."""

    energy_geoopt: Tensor
    """Energy after the geometry optimization in Eh. Important for comparability with the other trajectory features."""

    etot_geoopt: Tensor
    """Energy after the geometry optimization without dispersion correction in Eh. Important for comparability with the other trajectory features."""

    ex: Tensor
    """Exchange energy in Eh."""

    ec: Tensor
    """Correlation energy in Eh."""

    exc: Tensor
    """Exchange-correlation energy in Eh."""

    ecnl: Tensor
    """Non-local correlation energy in Eh."""

    eemb: Tensor
    """Embedding correction energy in Eh."""

    homo_spin_up: Tensor
    """HOMO energy of the spin-up electrons in Eh."""
    # NOTE: SOMO for alpha spin channel in open shell systems

    lumo_spin_up: Tensor
    """LUMO energy of the spin-up electrons in Eh."""

    homo_spin_down: Tensor
    """HOMO energy of the spin-down electrons in Eh."""
    # NOTE: SOMO for beta spin channel in open shell systems

    lumo_spin_down: Tensor
    """LUMO energy of the spin-down electrons in Eh."""

    orbital_energies_spin_up: Tensor
    """Orbital energies of the spin-up electrons in Eh."""

    orbital_energies_spin_down: Tensor
    """Orbital energies of the spin-down electrons in Eh."""

    # ------------------------------------------------------------------
    # Electronic properties
    # ------------------------------------------------------------------
    charge: Tensor
    """Total charge of the molecule in e."""

    unpaired_e: Tensor
    """Number of unpaired electrons."""

    nel_alpha: Tensor
    """Number of alpha electrons."""

    nel_beta: Tensor
    """Number of beta electrons."""

    nel: Tensor
    """Total number of electrons."""

    polarizabilities: Tensor
    """Polarizabilities from the D4 calculation."""

    eeq: Tensor
    """EEQ charges from the tblite calculation in e."""

    ceh: Tensor
    """CEH charges from the tblite calculation in e."""

    q_gfn2: Tensor
    """Charges from the GFN2 calculation in e."""

    # ------------------------------------------------------------------
    # Population and bond properties
    # ------------------------------------------------------------------
    mayer_pop: Tensor
    """Values of the Mayer population analysis.
    Sorted by:
        NA   - Mulliken gross atomic population
        ZA   - Total nuclear charge
        QA   - Mulliken gross atomic charge
        VA   - Mayer's total valence
        BVA  - Mayer's bonded valence
        FA   - Mayer's free valence
    """

    mayer_bo: Tensor
    """Mayer bond order values with tuple indices.
    Sorted by:
        Atom idx A  - idx of atom A
        Atom idx B  - idx of atom B
        Bond order  - value of bond order
    """

    loewdin_charges: Tensor
    """Loewdin atomic charges."""

    loewdin_spins: Tensor
    """Loewdin spin populations."""

    mulliken_charges: Tensor
    """Mulliken atomic charges."""

    mulliken_spins: Tensor
    """Mulliken spin populations."""

    hirshfeld_charges: Tensor
    """Hirshfeld atomic charges."""

    hirshfeld_spins: Tensor
    """Hirshfeld spin populations."""

    hirshfeld_alpha: Tensor
    """Hirshfeld total integrated alpha density."""

    hirshfeld_beta: Tensor
    """Hirshfeld total integrated beta density."""

    # ------------------------------------------------------------------
    # Molecular properties
    # ------------------------------------------------------------------
    rot_const: Tensor
    """Rotational constants in MHz."""

    rot_dipole: Tensor
    """Dipole components along the rotational axes in a.u."""

    dipole: Tensor
    """Magnitude of the dipole moment in a.u."""

    dipole_ele: Tensor
    """Electronic xyz-contribution to the dipole moment."""

    dipole_nuc: Tensor
    """Nuclear xyz-contribution to the dipole moment."""

    dipole_tot: Tensor
    """Total xyz-dipole moment."""

    def __str__(self):
        """Return a short label of the form ``Sample(<uid>)``."""
        return f"Sample({self.uid})"
13 changes: 13 additions & 0 deletions requirements.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: lnqm
channels:
- conda-forge
- defaults
- pyg
dependencies:
- python=3.11.4
- h5py
- pytorch
- pydantic
- pip
- pip:
- torch_geometric

0 comments on commit 796fa1e

Please sign in to comment.