Add v1.0

grimme-lab · Nov 29, 2023 · 796fa1e · 796fa1e
commit 796fa1e
Show file tree

Hide file tree

Showing 7 changed files with 751 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -0,0 +1,65 @@
+# LnQM dataset
+
+[![Python](https://img.shields.io/badge/python-3.11.4-blue.svg)](https://www.python.org)
+[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
+This is the official repository of the `LnQM` dataset developed by the Grimme group in Bonn.
+
+<div align="center">
+<img src="./assets/logo.png" alt="LnQM Dataset" width="400">
+</div>
+
+
+# Data
+
+The data is available under: [https://zenodo.org/records/10222384](https://zenodo.org/records/10222384)
+
+
+# Setup
+
+We provide a dataset class and a sample representation based on `pytorch`.
+For this purpose please install the package requirements:
+```bash
+conda env create --file requirements.yml
+```
+
+Basic usage of the dataset:
+```python
+from lnqm import LnQM_Dataset
+
+# load LnQM from disk
+dataset = LnQM_Dataset(path_to_hdf5="/path/to/lnqm.h5")
+
+# loop over each sample in the dataset
+for data in dataset:
+    # ...
+    # do sth. with each sample
+    # ...
+
+    # e.g. access properties
+    print(data.uid)
+```
+
+For a more detailed description of the included features, see the data model in `LnQM_Sample`.
+If required, a `Data` object can be converted via:
+
+```python
+from lnqm import LnQM_Sample
+sample = LnQM_Sample(**data)
+```
+
+# Citations
+
+When using or referring to the `LnQM` dataset, please cite:
+- C. Hölzer, I. Gordiy, S. Grimme, M. Bursch
+  ... tbd ...
+
+# License
+
+[![CC BY NC 4.0][cc-by-nc-image]][cc-by-nc]
+
+This work is licensed under a
+[Creative Commons Attribution-NonCommercial 4.0 International License][cc-by-nc].
+
+[cc-by-nc]: http://creativecommons.org/licenses/by-nc/4.0/
+[cc-by-nc-image]: https://i.creativecommons.org/l/by-nc/4.0/88x31.png
diff --git a/assets/logo.png b/assets/logo.png
diff --git a/lnqm/__init__.py b/lnqm/__init__.py
@@ -0,0 +1,2 @@
+from .dataset import LnQM_Dataset
+from .sample import LnQM_Sample
diff --git a/lnqm/dataset.py b/lnqm/dataset.py
@@ -0,0 +1,68 @@
+from collections import defaultdict
+from pathlib import Path
+
+import h5py
+import numpy as np
+import torch
+from torch_geometric.data import Data, InMemoryDataset
+
+
+class LnQM_Dataset(InMemoryDataset):
+    """Dataset holding LnQM data.
+
+    Usage:
+    >>> dataset = LnQM_Dataset(path_to_hdf5="lnqm.h5")
+    """
+
+    def __init__(self, path_to_hdf5: str | Path, transform=None):
+        super().__init__("./", transform, pre_transform=None, pre_filter=None)
+        self.path_to_hdf5 = path_to_hdf5
+
+        if self.path_to_hdf5:
+            self.data, self.slices = LnQM_Dataset.from_hdf5(self.path_to_hdf5)
+        else:
+            # empty dataset
+            self.data, self.slices = Data(), defaultdict(dict, {})
+
+    @staticmethod
+    def from_hdf5(fp: str | Path) -> tuple[Data, defaultdict]:
+        """Load data and slices from HDF5 file."""
+        data = {}
+        slices = {}
+        with h5py.File(fp, "r") as f:
+            for key in f["data"].keys():
+                np_arrays = {"data": f["data"][key][:], "slices": f["slices"][key][:]}
+                # some casting
+                for prop, val in np_arrays.items():
+                    if val.dtype == np.uint64:
+                        np_arrays[prop] = val.astype(np.int64)
+                # uid is of dtype string, so we got to handle it seperately
+                if key == "uid":
+                    uids = [s.decode("utf-8") for s in np_arrays["data"].tolist()]
+                    data[key] = uids
+                    slices[key] = torch.from_numpy(np_arrays["slices"])
+                else:
+                    data[key] = torch.from_numpy(np_arrays["data"])
+                    slices[key] = torch.from_numpy(np_arrays["slices"])
+        return Data.from_dict(data), defaultdict(dict, slices)
+
+    def to_hdf5(self, fp: str | Path):
+        """Save the data and slices of the dataset to an HDF5 file."""
+        with h5py.File(fp, "w") as f:
+            data_group = f.create_group("data")
+            slices_group = f.create_group("slices")
+
+            for key, value in self._data.items():
+                if not isinstance(value, list):  # strings such as uid
+                    if isinstance(value, torch.Tensor):
+                        value = value.numpy()
+                    if value.dtype == np.int64:
+                        value = value.astype(np.uint64)
+
+                data_group.create_dataset(key, data=value)
+
+                # save slices
+                slice_value = self.slices[key].numpy()
+                if slice_value.dtype == np.int64:
+                    slice_value = slice_value.astype(np.uint64)
+                slices_group.create_dataset(key, data=slice_value)
diff --git a/lnqm/sample.py b/lnqm/sample.py
@@ -0,0 +1,196 @@
+from pydantic import BaseModel
+from torch import Tensor
+
+
+class LnQM_Sample(BaseModel):
+    """
+    Data model representation for samples in the `LnQM` dataset.
+
+    Every field except `uid` (a string) is annotated as a torch `Tensor`.
+    Each field is followed by a short string describing it.
+
+    While designed for comprehensive representation, some attributes
+    may contain redundant information. For instance:
+    - Total electron count (`nel`) can be deduced from alpha (`nel_alpha`)
+      and beta (`nel_beta`) electron counts.
+    - The combined exchange-correlation energy (`exc`) is the sum of
+      exchange (`ex`) and correlation (`ec`) energies.
+    - The Mulliken charges are encompassed within Mayer population.
+
+    Units:
+    Numerical attributes are expressed in atomic units (Hartree, Bohr, etc.)
+    unless the description of the attribute states otherwise (e.g. the
+    timings are given in seconds and the rotational constants in MHz).
+    """
+
+    # Permit non-pydantic types as field annotations; needed for the
+    # torch `Tensor` fields declared below.
+    class Config:
+        arbitrary_types_allowed = True
+
+    # General properties
+    uid: str
+    """Unique identifier for the calculation."""
+
+    time_singlepoint: Tensor
+    """Time taken for the singlepoint ORCA calculation in seconds."""
+
+    time_geoopt: Tensor
+    """Time taken for the geometry optimization ORCA calculation in seconds."""
+
+    # Geometric properties
+    numbers: Tensor
+    """Atomic numbers."""
+
+    coord: Tensor
+    """Optimized geometry in Bohr."""
+
+    gradient: Tensor
+    """Gradient of the optimized geometry in Eh/Bohr."""
+
+    trajectory: Tensor
+    """Trajectory of atomic positions during optimization in Bohr."""
+
+    trajectory_energies: Tensor
+    """Energies at different points in the optimization trajectory in Eh."""
+
+    trajectory_etot: Tensor
+    """Energies without dispersion correction at different points in the optimization trajectory in Eh."""
+
+    trajectory_gradients: Tensor
+    """Gradients at different points in the optimization trajectory in Eh/Bohr."""
+
+    cn: Tensor
+    """Coordination numbers from D4 calculation."""
+
+    # Energetic properties
+    energy: Tensor
+    """Total single point energy from the ORCA calculation in Eh."""
+
+    energy_geoopt: Tensor
+    """Energy after geometric optimization in Eh. This is important for comparability with other trajectory features."""
+
+    etot_geoopt: Tensor
+    """Energy after geometric optimization without dispersion correction in Eh. This is important for comparability with other trajectory features."""
+
+    ex: Tensor
+    """Exchange energy in Eh."""
+
+    ec: Tensor
+    """Correlation energy in Eh."""
+
+    exc: Tensor
+    """Exchange-correlation energy in Eh."""
+
+    ecnl: Tensor
+    """Non-local correlation energy in Eh."""
+
+    eemb: Tensor
+    """Embedding correction energy in Eh."""
+
+    homo_spin_up: Tensor
+    """HOMO energy for spin-up electrons in Eh."""
+    # NOTE: SOMO for alpha spin channel in open shell systems
+
+    lumo_spin_up: Tensor
+    """LUMO energy for spin-up electrons in Eh."""
+
+    homo_spin_down: Tensor
+    """HOMO energy for spin-down electrons in Eh."""
+    # NOTE: SOMO for beta spin channel in open shell systems
+
+    lumo_spin_down: Tensor
+    """LUMO energy for spin-down electrons in Eh."""
+
+    orbital_energies_spin_up: Tensor
+    """Energies of orbitals for spin-up electrons in Eh."""
+
+    orbital_energies_spin_down: Tensor
+    """Energies of orbitals for spin-down electrons in Eh."""
+
+    # Electronic properties
+    charge: Tensor
+    """Total charge of the molecule in e."""
+
+    unpaired_e: Tensor
+    """Number of unpaired electrons."""
+
+    nel_alpha: Tensor
+    """Number of alpha electrons."""
+
+    nel_beta: Tensor
+    """Number of beta electrons."""
+
+    nel: Tensor
+    """Total number of electrons."""
+
+    polarizabilities: Tensor
+    """Polarizabilities from D4 calculation."""
+
+    eeq: Tensor
+    """EEQ charges from tblite calculation in e."""
+
+    ceh: Tensor
+    """CEH charges from tblite calculation in e."""
+
+    q_gfn2: Tensor
+    """Charges from GFN2 calculation in e."""
+
+    # Population and bond properties
+    mayer_pop: Tensor
+    """Mayer population analysis values.
+        Sorted by:
+            NA   - Mulliken gross atomic population
+            ZA   - Total nuclear charge
+            QA   - Mulliken gross atomic charge
+            VA   - Mayer's total valence
+            BVA  - Mayer's bonded valence
+            FA   - Mayer's free valence
+    """
+
+    mayer_bo: Tensor
+    """Mayer bond order values with tuple indices.
+        Sorted by:
+            Atom idx A - idx of atom A
+            Atom idx B - idx of atom B
+            Bond order - value of bond order
+    """
+
+    loewdin_charges: Tensor
+    """Loewdin atomic charges."""
+
+    loewdin_spins: Tensor
+    """Loewdin spin populations."""
+
+    mulliken_charges: Tensor
+    """Mulliken atomic charges."""
+
+    mulliken_spins: Tensor
+    """Mulliken spin populations."""
+
+    hirshfeld_charges: Tensor
+    """Hirshfeld atomic charges."""
+
+    hirshfeld_spins: Tensor
+    """Hirshfeld spin populations."""
+
+    hirshfeld_alpha: Tensor
+    """Hirshfeld total integrated alpha density."""
+
+    hirshfeld_beta: Tensor
+    """Hirshfeld total integrated beta density."""
+
+    # Molecular properties
+    rot_const: Tensor
+    """Rotational constants in MHz."""
+
+    rot_dipole: Tensor
+    """Dipole components along the rotational axes in A.U.."""
+
+    dipole: Tensor
+    """Magnitude of the dipole moment in A.U.."""
+
+    dipole_ele: Tensor
+    """Electronic xyz-contribution to the dipole moment."""
+
+    dipole_nuc: Tensor
+    """Nuclear xyz-contribution to the dipole moment."""
+
+    dipole_tot: Tensor
+    """Total xyz-dipole moment."""
+
+    # Compact string form: only the uid is shown, none of the tensor fields.
+    def __str__(self):
+        return f"Sample({self.uid})"
diff --git a/requirements.yml b/requirements.yml
@@ -0,0 +1,13 @@
+name: lnqm
+channels:
+  - conda-forge
+  - defaults
+  - pyg
+dependencies:
+  - python=3.11.4
+  - h5py
+  - pytorch
+  - pydantic
+  - pip
+  - pip:
+    - torch_geometric
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .dataset import LnQM_Dataset
		from .sample import LnQM_Sample