From 5a06309a888771cd37f9ad03023c6b6f38509d66 Mon Sep 17 00:00:00 2001
From: Nathan Painchaud <nathan.painchaud@usherbrooke.ca>
Date: Fri, 22 Jul 2022 17:30:03 +0200
Subject: [PATCH 1/3] Support `submitit` Hydra launcher, to launch runs on
 SLURM clusters

This launcher is meant to eventually replace the `vital.utils.jobs` package completely.
---
 .env.example                                 |  3 +++
 pyproject.toml                               |  1 +
 vital/config/hydra/launcher/alliancecan.yaml | 17 +++++++++++++++++
 vital/config/hydra/launcher/beluga.yaml      |  6 ++++++
 vital/config/launcher/beluga.yaml            | 15 +++++++++++++++
 vital/config/vital_default.yaml              |  1 +
 vital/runner.py                              | 15 +++++++++++++++
 7 files changed, 58 insertions(+)
 create mode 100644 vital/config/hydra/launcher/alliancecan.yaml
 create mode 100644 vital/config/hydra/launcher/beluga.yaml
 create mode 100644 vital/config/launcher/beluga.yaml

diff --git a/.env.example b/.env.example
index 2eba6e13..3f5eaca4 100644
--- a/.env.example
+++ b/.env.example
@@ -10,5 +10,8 @@ CAMUS_DATA_PATH="path/to/camus"
 ### API keys ###
 COMET_API_KEY="<your-comet-api-key>"
 
+### SLURM config ###
+SLURM_MAIL_USER="<mail-address-to-notify>"
+
 ### Error Flags ###
 # HYDRA_FULL_ERROR=1
diff --git a/pyproject.toml b/pyproject.toml
index c1f2f30f..5b88ca36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ torch = "~1.12.0"
 torchvision = "~0.13.0"
 pytorch-lightning = "~1.8.0"
 hydra-core = "~1.2.0"
+hydra-submitit-launcher = "*"
 torchmetrics = "*"
 torchinfo = "*"
 pathos = "*"
diff --git a/vital/config/hydra/launcher/alliancecan.yaml b/vital/config/hydra/launcher/alliancecan.yaml
new file mode 100644
index 00000000..46755290
--- /dev/null
+++ b/vital/config/hydra/launcher/alliancecan.yaml
@@ -0,0 +1,17 @@
+defaults:
+  - submitit_slurm
+
+timeout_min: ${oc.select:run_time_min,60}
+setup:
+  - "module load httpproxy"  # load module allowing to connect to whitelisted domains
+  - "source ${hydra.launcher.additional_parameters.venv}/bin/activate" # activate the pre-installed virtual environment
+  - "rsync -a ${hydra.launcher.additional_parameters.dir_to_copy} $SLURM_TMPDIR" # copy the dataset to the compute node
+additional_parameters:
+  mail-user: ${oc.env:SLURM_MAIL_USER,null}
+  mail-type: ALL
+  # We are forced to specify new required options in the additional parameters instead of at the group level
+  # because the config is typed-checked against th submitit plugin's structured configs
+  # However, `additional_parameters` is a dict, and therefore does not check if required values are set.
+  # It is thus up to us to check it manually using our own validation logic at the start of a run
+  venv: ???
+  dir_to_copy: ???
diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml
new file mode 100644
index 00000000..b06fb75d
--- /dev/null
+++ b/vital/config/hydra/launcher/beluga.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - alliancecan
+
+gpus_per_node: 1
+cpus_per_gpu: 10
+mem_per_gpu: "47750M"
diff --git a/vital/config/launcher/beluga.yaml b/vital/config/launcher/beluga.yaml
new file mode 100644
index 00000000..8a53056e
--- /dev/null
+++ b/vital/config/launcher/beluga.yaml
@@ -0,0 +1,15 @@
+# @package _global_
+
+# NOTE: This custom launcher (separate from the built-in 'hydra/launcher' node) is meant to override global options
+# that depend on the hardware resources available and for which the default options are for local runs
+# (e.g. GPU devices, number of dataloader workers, progress bar display, etc.)
+
+defaults:
+  - override /hydra/launcher: beluga
+
+trainer:
+  devices: 1
+  enable_progress_bar: False
+
+data:
+  num_workers: 9
diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml
index d8a593b5..491ce8ca 100644
--- a/vital/config/vital_default.yaml
+++ b/vital/config/vital_default.yaml
@@ -6,6 +6,7 @@ defaults:
       - model_checkpoint
   - logger: comet/online
   - _self_
+  - optional launcher: null # List custom launcher after trainer/task/data, so that it can override their configs
 
 seed: null
 
diff --git a/vital/runner.py b/vital/runner.py
index c0f3585b..3dbe7dec 100644
--- a/vital/runner.py
+++ b/vital/runner.py
@@ -10,6 +10,8 @@
 import hydra
 import torch
 from dotenv import load_dotenv
+from hydra.core.hydra_config import HydraConfig
+from hydra_plugins.hydra_submitit_launcher.config import LocalQueueConf, SlurmQueueConf
 from omegaconf import DictConfig, open_dict
 from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.loggers import CometLogger, LightningLoggerBase
@@ -155,6 +157,19 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
             with open_dict(cfg):
                 cfg.trainer.default_root_dir = os.getcwd()
 
+        # If using the submitit launcher
+        if (hydra_launcher_cfg := HydraConfig.get().launcher)._target_ in [
+            SlurmQueueConf._target_,
+            LocalQueueConf._target_,
+        ]:
+            # Test if custom additional required parameters are set
+            for required_option in ["venv", "dir_to_copy"]:
+                if required_option not in hydra_launcher_cfg.additional_parameters:
+                    raise ValueError(
+                        f"You must specify 'hydra.launcher.additional_parameters.{required_option}', "
+                        f"e.g, hydra.launcher.additional_parameters.{required_option}=<OPTION>"
+                    )
+
         return cfg
 
     @staticmethod

From 7d27772f07105dc98a7b6c430d76cb9ca6dcabf1 Mon Sep 17 00:00:00 2001
From: Nathan Painchaud <nathan.painchaud@usherbrooke.ca>
Date: Thu, 23 Mar 2023 23:04:26 +0100
Subject: [PATCH 2/3] Move unsupported parameters from `hydra/launcher` config
 group to custom `launcher`

This allows us to revert the custom checks on the config that made sure required launcher config fields exist (because now we can rely on Hydra's built-in parsing)
---
 vital/config/hydra/launcher/alliancecan.yaml | 13 ++++---------
 vital/config/launcher/alliancecan.yaml       | 17 +++++++++++++++++
 vital/config/launcher/beluga.yaml            |  6 +-----
 vital/runner.py                              | 15 ---------------
 4 files changed, 22 insertions(+), 29 deletions(-)
 create mode 100644 vital/config/launcher/alliancecan.yaml

diff --git a/vital/config/hydra/launcher/alliancecan.yaml b/vital/config/hydra/launcher/alliancecan.yaml
index 46755290..016271d8 100644
--- a/vital/config/hydra/launcher/alliancecan.yaml
+++ b/vital/config/hydra/launcher/alliancecan.yaml
@@ -3,15 +3,10 @@ defaults:
 
 timeout_min: ${oc.select:run_time_min,60}
 setup:
-  - "module load httpproxy"  # load module allowing to connect to whitelisted domains
-  - "source ${hydra.launcher.additional_parameters.venv}/bin/activate" # activate the pre-installed virtual environment
-  - "rsync -a ${hydra.launcher.additional_parameters.dir_to_copy} $SLURM_TMPDIR" # copy the dataset to the compute node
+  - "module load httpproxy" # load the module that allows connecting to whitelisted domains
+  - "source ${venv}/bin/activate" # activate the pre-installed virtual environment
+  - "rsync -a ${data_dir_to_copy} $SLURM_TMPDIR" # copy the dataset to the compute node
+
 additional_parameters:
   mail-user: ${oc.env:SLURM_MAIL_USER,null}
   mail-type: ALL
-  # We are forced to specify new required options in the additional parameters instead of at the group level
-  # because the config is typed-checked against th submitit plugin's structured configs
-  # However, `additional_parameters` is a dict, and therefore does not check if required values are set.
-  # It is thus up to us to check it manually using our own validation logic at the start of a run
-  venv: ???
-  dir_to_copy: ???
diff --git a/vital/config/launcher/alliancecan.yaml b/vital/config/launcher/alliancecan.yaml
new file mode 100644
index 00000000..8582b9b3
--- /dev/null
+++ b/vital/config/launcher/alliancecan.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+
+# NOTE: This custom launcher (separate from the built-in 'hydra/launcher' node) is meant to override global options
+# that depend on the hardware resources available and for which the default options are for local runs
+# (e.g. GPU devices, number of dataloader workers, progress bar display, etc.)
+
+defaults:
+  - override /hydra/launcher: alliancecan
+
+trainer:
+  enable_progress_bar: False
+
+# Path to the root of the virtualenv to activate for the jobs
+venv: ???
+
+# Path of the data to copy from the shared filesystem to the compute node
+data_dir_to_copy: ???
diff --git a/vital/config/launcher/beluga.yaml b/vital/config/launcher/beluga.yaml
index 8a53056e..9bb29a5b 100644
--- a/vital/config/launcher/beluga.yaml
+++ b/vital/config/launcher/beluga.yaml
@@ -1,15 +1,11 @@
 # @package _global_
 
-# NOTE: This custom launcher (separate from the built-in 'hydra/launcher' node) is meant to override global options
-# that depend on the hardware resources available and for which the default options are for local runs
-# (e.g. GPU devices, number of dataloader workers, progress bar display, etc.)
-
 defaults:
+  - alliancecan
   - override /hydra/launcher: beluga
 
 trainer:
   devices: 1
-  enable_progress_bar: False
 
 data:
   num_workers: 9
diff --git a/vital/runner.py b/vital/runner.py
index 3dbe7dec..c0f3585b 100644
--- a/vital/runner.py
+++ b/vital/runner.py
@@ -10,8 +10,6 @@
 import hydra
 import torch
 from dotenv import load_dotenv
-from hydra.core.hydra_config import HydraConfig
-from hydra_plugins.hydra_submitit_launcher.config import LocalQueueConf, SlurmQueueConf
 from omegaconf import DictConfig, open_dict
 from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.loggers import CometLogger, LightningLoggerBase
@@ -157,19 +155,6 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
             with open_dict(cfg):
                 cfg.trainer.default_root_dir = os.getcwd()
 
-        # If using the submitit launcher
-        if (hydra_launcher_cfg := HydraConfig.get().launcher)._target_ in [
-            SlurmQueueConf._target_,
-            LocalQueueConf._target_,
-        ]:
-            # Test if custom additional required parameters are set
-            for required_option in ["venv", "dir_to_copy"]:
-                if required_option not in hydra_launcher_cfg.additional_parameters:
-                    raise ValueError(
-                        f"You must specify 'hydra.launcher.additional_parameters.{required_option}', "
-                        f"e.g, hydra.launcher.additional_parameters.{required_option}=<OPTION>"
-                    )
-
         return cfg
 
     @staticmethod

From f3479dabab8e242a55e6c8e40f8dfffdea66d754 Mon Sep 17 00:00:00 2001
From: Nathan Painchaud <nathan.painchaud@usherbrooke.ca>
Date: Thu, 23 Mar 2023 23:29:38 +0100
Subject: [PATCH 3/3] Update default Beluga config to favor generic options
 where possible

---
 vital/config/hydra/launcher/beluga.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml
index b06fb75d..0dc21665 100644
--- a/vital/config/hydra/launcher/beluga.yaml
+++ b/vital/config/hydra/launcher/beluga.yaml
@@ -1,6 +1,6 @@
 defaults:
   - alliancecan
 
+cpus_per_task: 10
 gpus_per_node: 1
-cpus_per_gpu: 10
-mem_per_gpu: "47750M"
+mem_gb: 46