Support submitit Hydra launcher, to launch runs on SLURM clusters

This launcher is meant to eventually replace the `vital.utils.jobs` package entirely
vitalab · Sep 5, 2022 · 6378a22 · 6378a22
1 parent c777491
commit 6378a22
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 1 deletion.
diff --git a/.env.example b/.env.example
@@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus"
 ### API keys ###
 COMET_API_KEY="<your-comet-api-key>"
 
+### AllianceCan clusters config ###
+ALLIANCECAN_VENV_PATH="path/to/project/virtualenv"
+
+### SLURM config ###
+SLURM_MAIL_USER="<mail-address-to-notify>"
+
 ### Error Flags ###
 # HYDRA_FULL_ERROR=1
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -23,4 +23,5 @@ natsort
 nibabel
 albumentations
 hydra-core~=1.2.0
+hydra-submitit-launcher
 python-dotenv
diff --git a/vital/config/hydra/launcher/alliancecan.yaml b/vital/config/hydra/launcher/alliancecan.yaml
@@ -0,0 +1,19 @@
+# @package _global_
+
+defaults:
+  - submitit_slurm@_group_
+
+hydra:
+  launcher:
+    timeout_min: ${oc.select:run_time_min,60}
+    setup:
+      - "module load httpproxy"  # load module allowing to connect to whitelisted domains
+      - "source $ALLIANCECAN_VENV_PATH/bin/activate" # activate the pre-installed virtual environment
+      - "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset to the compute node
+    additional_parameters:
+      mail-user: ${oc.env:SLURM_MAIL_USER,null}
+      mail-type: ALL
+
+# NOTE: Options meant to override train/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`)
+# and not in a launcher config that is meant to be included in a defaults list. Otherwise, the order in which the
+# configs are composed might not give priority to the launcher config.
diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+
+defaults:
+  - alliancecan
+
+hydra:
+  launcher:
+    gpus_per_node: 1
+    cpus_per_gpu: 10
+    mem_per_gpu: "47750M"
+
+trainer:
+  devices: 1
+  enable_progress_bar: False
+
+data:
+  num_workers: 9
diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml
@@ -5,7 +5,7 @@ defaults:
   - callbacks:
       - model_checkpoint
   - logger: comet/online
-  - _self_
+  - override hydra/launcher: basic # List launcher after trainer/task/data, so that it can override their configs
 
 seed: null
 

diff --git a/vital/runner.py b/vital/runner.py
@@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
             with open_dict(cfg):
                 cfg.trainer.default_root_dir = os.getcwd()
 
+        # When running on a SLURM cluster, we will look to see if the dataset was copied on the compute node,
+        # and update the path if it was. Otherwise, we will use the path as-is.
+        if compute_node_dir := os.environ.get("SLURM_TMPDIR"):
+            dataset_name = Path(cfg.data.dataset_path).name
+            slurm_dataset_path = Path(compute_node_dir) / dataset_name
+            if slurm_dataset_path.exists():
+                cfg.data.dataset_path = str(slurm_dataset_path)
+            else:
+                logger.warning(
+                    f"Running in a distributed computing environment, but we could not locate the dataset on the node "
+                    f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). "
+                    f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so "
+                    f"that we can detect it and use the local data for improved performance."
+                )
+
         return cfg
 
     @staticmethod