Ep/qmml #666

Status: Closed (wants to merge 57 commits)

Commits (57)
fb188e1
initial commit
epens94 Feb 7, 2024
384f778
commit includes all files for
epens94 Feb 7, 2024
09fb039
electron embed for painn
epens94 Feb 12, 2024
bb27705
add several evaluation scripts but later refactor and remove if neces…
epens94 Feb 13, 2024
c1cfd98
cleanup
epens94 Feb 13, 2024
6c524c9
clean up and comments added
epens94 Feb 19, 2024
4266da0
add docstring to electron configuration py
epens94 Mar 6, 2024
b433573
clean up gitignore
epens94 Mar 6, 2024
4d4cc5c
fixing docstring in electronic embedding
epens94 Mar 6, 2024
f8494fd
adding further description to electron configuration
epens94 Mar 6, 2024
2d23890
add docstring to electronic embedding fix unclear naming
epens94 Mar 6, 2024
cd06b83
revert Z back to 100
epens94 Mar 6, 2024
800c3b0
fix docstring nuclear embedding
epens94 Mar 6, 2024
aff25bf
fix naming in nuclear embedding
epens94 Mar 6, 2024
f4ca4ee
move ssp to activations module and add docstring
epens94 Mar 6, 2024
c465ce4
change order to be equal in args in nn embedding
epens94 Mar 6, 2024
2156399
clear naming of vars and remove redundant code
epens94 Mar 6, 2024
c86b404
move all embedding classes into one module and delete not needed modules
epens94 Mar 6, 2024
1ebad7a
fix of init
epens94 Mar 6, 2024
f99a432
activation ssp trainable implement,pass nuclear embedding directly
epens94 Mar 6, 2024
3a399fa
bugfix nuclear embedding
epens94 Mar 6, 2024
64b5d2e
missed one replace string activation function
epens94 Mar 7, 2024
9517bd2
missed one replace string activation function in elec embedding
epens94 Mar 7, 2024
c503f6b
fix docstring, problem with NaN in activation fn, write docstring mor…
epens94 Mar 7, 2024
ff969cf
update save model fn to work with wandb
epens94 Mar 7, 2024
68dcf26
add electronic embedding to so3 net and bugfix painn and schnet rep
epens94 Mar 12, 2024
16ea5ca
Merge pull request #6 from epens94/ep/electronicEmbeeding
epens94 Mar 23, 2024
ba6883c
added sampler.py file
jnsLs May 3, 2023
eb70cb8
stratified sampler works technically
jnsLs May 11, 2023
fcd5e2d
sampler is adapted to MOMONANO data
jnsLs May 11, 2023
916ccdd
updating qmml branch with master branch stuff
epens94 Sep 2, 2024
34c1fc1
Merge branch 'ep/qmml'
epens94 Sep 2, 2024
77571cb
add bernstein rbf to qmml branch
epens94 Sep 2, 2024
88c2064
adding adaptive loss fn module needed for qcml dataset
epens94 Sep 2, 2024
9706526
taken from branch https://github.com/atomistic-machine-learning/schne…
epens94 Sep 2, 2024
024d27d
just added for qcml dataset, because i wrote the dipole moment in wit…
epens94 Sep 2, 2024
9eafe7b
to use pretrained model weights
epens94 Sep 5, 2024
cea9927
simple callback to write out embeds
epens94 Sep 5, 2024
5097565
bugfix
epens94 Sep 5, 2024
0e51fbc
added for embedding analysis
epens94 Sep 6, 2024
5a57b74
add total charge and spin as key to rmd17 dataset, should be done for…
epens94 Sep 15, 2024
1bf6c56
adding needed stuff
epens94 Sep 20, 2024
f8f294e
splitting strategy logic implemented to filter out atomtype, visually…
epens94 Oct 7, 2024
92a795e
adding how many occurrences of atomtype in splitting to keep percentag…
epens94 Oct 7, 2024
edb1491
bugfix
epens94 Oct 7, 2024
5289b03
bugfix to correctly account for groups of atomtypes. Right now logic …
epens94 Oct 7, 2024
23e4285
clean up of adaptive loss fn
epens94 Oct 14, 2024
9f65452
rewrite of AtomTypeSplit
epens94 Oct 14, 2024
4732a44
updated configs to work out of the box with cli
epens94 Oct 14, 2024
99911e1
bugfix
epens94 Oct 15, 2024
be62272
include package data
epens94 Oct 15, 2024
74f9388
add package data
epens94 Oct 15, 2024
242c883
add first draft qcml dataclass
epens94 Oct 24, 2024
1e1f2bf
add qcml config
epens94 Oct 24, 2024
0f6fc8b
changes which came up during PR
epens94 Oct 24, 2024
ca4acd7
changes after PR review
epens94 Oct 24, 2024
fe4cdb1
misc
epens94 Oct 24, 2024
4 changes: 3 additions & 1 deletion .gitignore
@@ -125,4 +125,6 @@ interfaces/lammps/examples/*/*.dat
interfaces/lammps/examples/*/deployed_model

# batchwise optimizer examples
examples/howtos/howto_batchwise_relaxations_outputs/*
examples/howtos/howto_batchwise_relaxations_outputs/*
.vscode/launch.json
.vscode/*
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include src/schnetpack/train/ressources/partition_spline_for_robust_loss.npz
Collaborator: What is this?

Collaborator (Author): From this file, values are loaded that are used to approximate the partition function for the adaptive loss function.
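
For context, a minimal sketch of the kind of loss this file supports. This assumes the module follows J. Barron, "A General and Adaptive Robust Loss Function" (CVPR 2019), whose negative log-likelihood requires the partition function Z(alpha); the spline stored in the .npz would approximate it so that alpha can be learned:

import torch

def general_robust_loss(x: torch.Tensor, alpha: float, c: float) -> torch.Tensor:
    # valid for alpha not in {0, 2}; those cases need separate limit forms
    b = abs(alpha - 2.0)
    return (b / alpha) * (((x / c) ** 2 / b + 1.0) ** (alpha / 2.0) - 1.0)

residuals = torch.randn(8)
loss = general_robust_loss(residuals, alpha=1.0, c=0.5).mean()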

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -57,5 +57,8 @@ script-files = [
"src/scripts/spkdeploy",
]

# Ensure package data such as resources are included
package-data = { "schnetpack.train" = ["ressources/partition_spline_for_robust_loss.npz"] }
Collaborator: same here

Collaborator (Author): This is needed to include the file when building the package.


[tool.setuptools.dynamic]
version = {attr = "schnetpack.__version__"}
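
As a side note, a quick post-install check (not part of this PR) that the spline file actually ships with the built package, using only the standard library and assuming Python >= 3.9:

from importlib.resources import files

res = files("schnetpack.train") / "ressources" / "partition_spline_for_robust_loss.npz"  # the repo spells it "ressources"
print(res.is_file())  # True if MANIFEST.in and package-data are set up correctly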
12 changes: 12 additions & 0 deletions src/schnetpack/configs/data/qcml.yaml
@@ -0,0 +1,12 @@
defaults:
  - custom

_target_: schnetpack.datasets.QCML

datapath: ${run.data_dir}/qcml.db # data_dir is specified in train.yaml
batch_size: 50
num_train: 0.90
num_val: 0.05
load_properties: [formation_energy,forces,charge,multiplicity]
Collaborator: I don't think it is possible to pass a list like this. If I remember correctly, it should work like

load_properties:
    - formation_energy
    - forces
    - ...

Collaborator (Author): It is possible to pass a list like this.
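
Both notations are indeed equivalent YAML; a quick check with PyYAML, comparing the flow style used in the config against the block style suggested above:

import yaml

flow = yaml.safe_load("load_properties: [formation_energy,forces,charge,multiplicity]")
block = yaml.safe_load(
    "load_properties:\n"
    "  - formation_energy\n"
    "  - forces\n"
    "  - charge\n"
    "  - multiplicity\n"
)
assert flow == block  # flow style and block style yield identical data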

version: 0.0.3

101 changes: 101 additions & 0 deletions src/schnetpack/configs/experiment/qcml.yaml
@@ -0,0 +1,101 @@
# @package _global_

defaults:
  - override /data: qcml
  - override /model/representation: painn
  - override /model/representation/radial_basis: bernstein ### NEWLY ADDED FEATURE
  - override /task/scheduler: multistep ### NEWLY ADDED CONFIG

run:
  experiment: qcml

seed: 0

globals:
  cutoff: 10.
  lr: 1e-3
  energy_key: formation_energy
  forces_key: forces
  total_charge_key: charge ### NEWLY ADDED FEATURE
  spin_key: multiplicity ### NEWLY ADDED FEATURE

data:
  datapath: ???
  load_properties: [formation_energy,forces,charge,multiplicity]
  batch_size: 50
  num_train: 0.90
  num_val: 0.05
  num_workers: 4
  num_val_workers: 4
  distance_unit: Bohr
  property_units:
    energy: Hartree
    forces: Hartree/Bohr
  transforms:
    - _target_: schnetpack.transform.SubtractCenterOfMass
    - _target_: schnetpack.transform.RemoveOffsets ### NEWLY ADDED FEATURE
      property: ${globals.energy_key}
      remove_mean: True
    - _target_: schnetpack.transform.MatScipyNeighborList
      cutoff: ${globals.cutoff}
    - _target_: schnetpack.transform.CastTo32

model:
  representation:
    nuclear_embedding:
      _target_: schnetpack.nn.embedding.NuclearEmbedding
      max_z: 101
      num_features: ${globals.representation_features} # same as n_atom_basis
    electronic_embeddings: ### NEWLY ADDED FEATURE
      - _target_: schnetpack.nn.embedding.ElectronicEmbedding
        property_key: ${globals.total_charge_key}
        num_features: ${model.representation.n_atom_basis}
        is_charged: true
        num_residual: 1
      - _target_: schnetpack.nn.embedding.ElectronicEmbedding ### NEWLY ADDED FEATURE
        property_key: ${globals.spin_key}
        num_features: ${model.representation.n_atom_basis}
        is_charged: false
        num_residual: 1
  output_modules:
    - _target_: schnetpack.atomistic.Atomwise
      output_key: ${globals.energy_key}
      n_in: ${model.representation.n_atom_basis}
      aggregation_mode: sum
    - _target_: schnetpack.atomistic.Forces
      energy_key: ${globals.energy_key}
      force_key: ${globals.forces_key}
  postprocessors:
    - _target_: schnetpack.transform.CastTo64
    - _target_: schnetpack.transform.AddOffsets
      property: ${globals.energy_key}
      add_mean: True

task:
  scheduler_args:
    milestones: [3,9,15,18,24,30,36]
  outputs:
    - _target_: schnetpack.task.ModelOutput
      name: ${globals.energy_key}
      loss_fn:
        _target_: schnetpack.train.AdaptiveLossFunction ### NEWLY ADDED FEATURE
        num_dims: 1
      metrics:
        mae:
          _target_: torchmetrics.regression.MeanAbsoluteError
        rmse:
          _target_: torchmetrics.regression.MeanSquaredError
          squared: False
      loss_weight: 0.05
    - _target_: schnetpack.task.ModelOutput
      name: ${globals.forces_key}
      loss_fn:
        _target_: schnetpack.train.AdaptiveLossFunction ### NEWLY ADDED FEATURE
        num_dims: 3
      metrics:
        mae:
          _target_: torchmetrics.regression.MeanAbsoluteError
        rmse:
          _target_: torchmetrics.regression.MeanSquaredError
          squared: False
      loss_weight: 0.95
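
For readers of this config: a hedged sketch of what Hydra instantiates from the model.representation section above, assuming the constructor keywords match the config keys (the n_atom_basis value is illustrative):

from schnetpack.nn.embedding import NuclearEmbedding, ElectronicEmbedding

n_atom_basis = 128  # illustrative; resolved from model.representation.n_atom_basis

nuclear_embedding = NuclearEmbedding(max_z=101, num_features=n_atom_basis)
electronic_embeddings = [
    # embedding conditioned on the total molecular charge
    ElectronicEmbedding(
        property_key="charge",
        num_features=n_atom_basis,
        is_charged=True,
        num_residual=1,
    ),
    # embedding conditioned on the spin multiplicity
    ElectronicEmbedding(
        property_key="multiplicity",
        num_features=n_atom_basis,
        is_charged=False,
        num_residual=1,
    ),
]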
3 changes: 2 additions & 1 deletion src/schnetpack/configs/model/representation/painn.yaml
@@ -8,4 +8,5 @@ shared_interactions: False
shared_filters: False
cutoff_fn:
  _target_: schnetpack.nn.cutoff.CosineCutoff
  cutoff: ${globals.cutoff}
  cutoff: ${globals.cutoff}
nuclear_embedding: null
4 changes: 4 additions & 0 deletions src/schnetpack/configs/model/representation/radial_basis/bernstein.yaml
@@ -0,0 +1,4 @@
_target_: schnetpack.nn.radial.BernsteinRBF
n_rbf: 32
cutoff: ${globals.cutoff}
init_alpha: 0.95
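
Since BernsteinRBF is a new feature here, a hedged sketch of an exponential Bernstein basis in the spirit of PhysNet/SpookyNet; the actual schnetpack.nn.radial.BernsteinRBF may differ in detail, and init_alpha is assumed to set the exponential decay rate:

import numpy as np
from scipy.special import binom

def exp_bernstein_basis(r, n_rbf=32, alpha=0.95):
    # expand distances r (shape [...]) into n_rbf features (shape [..., n_rbf])
    k = np.arange(n_rbf)
    x = np.clip(np.exp(-alpha * np.asarray(r))[..., None], 1e-10, 1.0 - 1e-10)
    # b_k(r) = C(n-1, k) * x^k * (1 - x)^(n-1-k), evaluated in log space for stability
    logb = np.log(binom(n_rbf - 1, k)) + k * np.log(x) + (n_rbf - 1 - k) * np.log(1.0 - x)
    return np.exp(logb)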
7 changes: 7 additions & 0 deletions src/schnetpack/configs/task/scheduler/multistep.yaml
@@ -0,0 +1,7 @@
# @package task
scheduler_cls: torch.optim.lr_scheduler.MultiStepLR
scheduler_monitor: val_loss
scheduler_args:
  milestones: ???
  gamma: 0.5
  last_epoch: -1
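
For illustration (not part of the diff), the schedule this config produces: MultiStepLR multiplies the learning rate by gamma=0.5 at every milestone epoch.

import torch

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[3, 9, 15], gamma=0.5)
for epoch in range(18):
    opt.step()
    sched.step()
# lr: 1e-3 for epochs 0-2, 5e-4 for 3-8, 2.5e-4 for 9-14, 1.25e-4 afterwards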
2 changes: 1 addition & 1 deletion src/schnetpack/data/atoms.py
@@ -345,7 +345,7 @@ def _get_properties(
properties[structure.idx] = torch.tensor([idx])
for pname in load_properties:
properties[pname] = (
torch.tensor(row.data[pname].copy()) * self.conversions[pname]
torch.tensor(row.data[pname].copy()) * self.conversions[pname]
)

Z = row["numbers"].copy()
18 changes: 17 additions & 1 deletion src/schnetpack/data/datamodule.py
@@ -16,6 +16,7 @@
BaseAtomsData,
AtomsLoader,
calculate_stats,
estimate_atomrefs,
SplittingStrategy,
RandomSplit,
)
@@ -127,6 +128,7 @@ def __init__(
self.property_units = property_units
self.distance_unit = distance_unit
self._stats = {}
self._atomrefs = {}
self._is_setup = False
self.data_workdir = data_workdir
self.cleanup_workdir_stage = cleanup_workdir_stage
@@ -359,6 +361,20 @@ def get_stats(
self._stats[key] = stats
return stats

def get_atomrefs(
    self, property: str, is_extensive: bool
) -> Dict[str, torch.Tensor]:
    """Estimate atomic reference values (atomrefs) for a property from the
    training data; results are cached per (property, is_extensive) key."""
    key = (property, is_extensive)
    if key in self._atomrefs:
        return {property: self._atomrefs[key]}

    atomrefs = estimate_atomrefs(
        self.train_dataloader(),
        is_extensive={property: is_extensive},
    )[property]
    self._atomrefs[key] = atomrefs
    return {property: atomrefs}

@property
def train_dataset(self) -> BaseAtomsData:
return self._train_dataset
@@ -408,4 +424,4 @@ def test_dataloader(self) -> AtomsLoader:
num_workers=self.num_test_workers,
pin_memory=self._pin_memory,
)
return self._test_dataloader
return self._test_dataloader
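
A hedged usage sketch of the new get_atomrefs API (dm stands for an already set-up AtomsDataModule; the property name is taken from the qcml experiment config):

atomrefs = dm.get_atomrefs(property="formation_energy", is_extensive=True)
# -> {"formation_energy": per-element reference values estimated from the
#     training dataloader; repeated calls hit the (property, is_extensive) cache}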
87 changes: 86 additions & 1 deletion src/schnetpack/data/splitting.py
@@ -2,8 +2,9 @@
import math
import torch
import numpy as np
import scipy.sparse as sp

__all__ = ["SplittingStrategy", "RandomSplit", "SubsamplePartitions"]
__all__ = ["SplittingStrategy", "RandomSplit", "SubsamplePartitions","AtomTypeSplit"]


def absolute_split_sizes(dsize: int, split_sizes: List[int]) -> List[int]:
@@ -96,6 +97,90 @@ def split(self, dataset, *split_sizes) -> List[torch.tensor]:
return partition_sizes_idx


class AtomTypeSplit(SplittingStrategy):
    """
    Strategy that filters out one or more atom types from the database and then
    performs the split on the filtered dataset. The remaining dataset contains
    all molecules except those that include the atom type(s) to be filtered out.

    The atom-type counts are read from the dataset metadata. They should be
    stored as a sparse CSR array whose data, indices, indptr, and shape are
    provided under metadata keys of the form "atom_type_count_{data,indices,indptr,shape}".
    The derived filter array is binary: 1 means the atom type is present in a
    molecule, 0 means it is not.
    """

    def __init__(
        self,
        atomtypes: List[int],
        num_keep: Union[int, float] = None,
    ):
        """
        Args:
            atomtypes: list of atom types to be filtered out.
            num_keep: fraction (or absolute number) of the filtered-out
                molecules to keep anyway; for now the same value is applied to
                all atom types. Values below 1 are interpreted as a fraction,
                values of 1 and above as an absolute number; the conversion is
                done automatically.
        """
        self.atomtypes = atomtypes
        self.num_keep = num_keep

    def split(self, dataset, *split_sizes):
        # Binary array of shape N x Z, where N is the number of molecules and
        # Z is the number of atom types: 1 means the atom type is present in
        # the molecule, 0 means it is not. The counts can be computed with the
        # estimate_atomrefs code.
        atom_type_count = sp.csr_matrix(
            (
                dataset.conn.metadata["atom_type_count_data"],
                dataset.conn.metadata["atom_type_count_indices"],
                dataset.conn.metadata["atom_type_count_indptr"],
            ),
            shape=dataset.conn.metadata["atom_type_count_shape"],
        ).toarray()

        # mask that keeps all molecules without the requested atom types
        keep = (atom_type_count[:, self.atomtypes] == 0).all(axis=1)
        indices = np.where(keep)[0]
        # mask that excludes all molecules with the requested atom types
        exclude = ~keep
        exclude_indices = np.where(exclude)[0]
        # random order in which excluded molecules may be added back
        random_iter_indices = np.random.permutation(len(exclude_indices)).tolist()

        # Add the requested fraction or absolute number of excluded molecules
        # back to the kept set. Keeping is cumulative: e.g. with 3% num_keep in
        # a first run and 5% in a second run, the 3% of the first run are
        # contained in the 5% of the second run (given the same permutation).
        if self.num_keep:
            if self.num_keep < 1:
                num_keep = int(math.floor(self.num_keep * exclude_indices.shape[0]))
            else:
                num_keep = self.num_keep
            indices = np.concatenate(
                [indices, exclude_indices[random_iter_indices[:num_keep]]]
            )

        # split the filtered dataset
        partition_sizes_idx = self.random_split(np.array(indices), *split_sizes)
        return partition_sizes_idx

    def random_split(self, indices, *split_sizes: Union[int, float]) -> List[torch.tensor]:
        """
        Randomly split the given indices.

        Args:
            indices: indices of the filtered dataset to split.
            split_sizes: sizes for each split. One can be set to -1 to assign
                all remaining data. Values in [0, 1] can be used to give
                relative partition sizes.
        """
        dsize = len(indices)
        split_sizes = absolute_split_sizes(dsize, split_sizes)
        offsets = torch.cumsum(torch.tensor(split_sizes), dim=0)
        indices = indices[torch.randperm(len(indices)).tolist()].tolist()
        partition_sizes_idx = [
            indices[offset - length : offset]
            for offset, length in zip(offsets, split_sizes)
        ]
        return partition_sizes_idx
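
A hedged usage sketch (values are illustrative): hold out all fluorine-containing molecules (Z = 9), add 3% of them back, then split 90/5/5. Here dataset is assumed to be a BaseAtomsData whose metadata carries the atom_type_count_* arrays:

import numpy as np

np.random.seed(0)  # makes the "cumulative keeping" across runs reproducible
strategy = AtomTypeSplit(atomtypes=[9], num_keep=0.03)
train_idx, val_idx, test_idx = strategy.split(dataset, 0.9, 0.05, 0.05)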


class SubsamplePartitions(SplittingStrategy):
"""
Strategy that splits the atoms dataset into predefined partitions as defined in the