From 5a06309a888771cd37f9ad03023c6b6f38509d66 Mon Sep 17 00:00:00 2001 From: Nathan Painchaud Date: Fri, 22 Jul 2022 17:30:03 +0200 Subject: [PATCH 1/3] Support `submitit` Hydra launcher, to launch runs on SLURM clusters The use of this launcher is meant in time to completely replace the `vital.utils.jobs` package --- .env.example | 3 +++ pyproject.toml | 1 + vital/config/hydra/launcher/alliancecan.yaml | 17 +++++++++++++++++ vital/config/hydra/launcher/beluga.yaml | 6 ++++++ vital/config/launcher/beluga.yaml | 15 +++++++++++++++ vital/config/vital_default.yaml | 1 + vital/runner.py | 15 +++++++++++++++ 7 files changed, 58 insertions(+) create mode 100644 vital/config/hydra/launcher/alliancecan.yaml create mode 100644 vital/config/hydra/launcher/beluga.yaml create mode 100644 vital/config/launcher/beluga.yaml diff --git a/.env.example b/.env.example index 2eba6e13..3f5eaca4 100644 --- a/.env.example +++ b/.env.example @@ -10,5 +10,8 @@ CAMUS_DATA_PATH="path/to/camus" ### API keys ### COMET_API_KEY="" +### SLURM config ### +SLURM_MAIL_USER="" + ### Error Flags ### # HYDRA_FULL_ERROR=1 diff --git a/pyproject.toml b/pyproject.toml index c1f2f30f..5b88ca36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ torch = "~1.12.0" torchvision = "~0.13.0" pytorch-lightning = "~1.8.0" hydra-core = "~1.2.0" +hydra-submitit-launcher = "*" torchmetrics = "*" torchinfo = "*" pathos = "*" diff --git a/vital/config/hydra/launcher/alliancecan.yaml b/vital/config/hydra/launcher/alliancecan.yaml new file mode 100644 index 00000000..46755290 --- /dev/null +++ b/vital/config/hydra/launcher/alliancecan.yaml @@ -0,0 +1,17 @@ +defaults: + - submitit_slurm + +timeout_min: ${oc.select:run_time_min,60} +setup: + - "module load httpproxy" # load module allowing to connect to whitelisted domains + - "source ${hydra.launcher.additional_parameters.venv}/bin/activate" # activate the pre-installed virtual environment + - "rsync -a ${hydra.launcher.additional_parameters.dir_to_copy} 
$SLURM_TMPDIR" # copy the dataset to the compute node +additional_parameters: + mail-user: ${oc.env:SLURM_MAIL_USER,null} + mail-type: ALL + # We are forced to specify new required options in the additional parameters instead of at the group level + # because the config is type-checked against the submitit plugin's structured configs + # However, `additional_parameters` is a dict, and therefore does not check if required values are set. + # It is thus up to us to check it manually using our own validation logic at the start of a run + venv: ??? + dir_to_copy: ??? diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml new file mode 100644 index 00000000..b06fb75d --- /dev/null +++ b/vital/config/hydra/launcher/beluga.yaml @@ -0,0 +1,6 @@ +defaults: + - alliancecan + +gpus_per_node: 1 +cpus_per_gpu: 10 +mem_per_gpu: "47750M" diff --git a/vital/config/launcher/beluga.yaml b/vital/config/launcher/beluga.yaml new file mode 100644 index 00000000..8a53056e --- /dev/null +++ b/vital/config/launcher/beluga.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +# NOTE: This custom launcher (separate from the built-in 'hydra/launcher' node) is meant to override global options +# that depend on the hardware resources available and for which the default options are for local runs +# (e.g. GPU devices, number of dataloader workers, progress bar display, etc.) 
+ +defaults: + - override /hydra/launcher: beluga + +trainer: + devices: 1 + enable_progress_bar: False + +data: + num_workers: 9 diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml index d8a593b5..491ce8ca 100644 --- a/vital/config/vital_default.yaml +++ b/vital/config/vital_default.yaml @@ -6,6 +6,7 @@ defaults: - model_checkpoint - logger: comet/online - _self_ + - optional launcher: null # List custom launcher after trainer/task/data, so that it can override their configs seed: null diff --git a/vital/runner.py b/vital/runner.py index c0f3585b..3dbe7dec 100644 --- a/vital/runner.py +++ b/vital/runner.py @@ -10,6 +10,8 @@ import hydra import torch from dotenv import load_dotenv +from hydra.core.hydra_config import HydraConfig +from hydra_plugins.hydra_submitit_launcher.config import LocalQueueConf, SlurmQueueConf from omegaconf import DictConfig, open_dict from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.loggers import CometLogger, LightningLoggerBase @@ -155,6 +157,19 @@ def _check_cfg(cfg: DictConfig) -> DictConfig: with open_dict(cfg): cfg.trainer.default_root_dir = os.getcwd() + # If using the submitit launcher + if (hydra_launcher_cfg := HydraConfig.get().launcher)._target_ in [ + SlurmQueueConf._target_, + LocalQueueConf._target_, + ]: + # Test if custom additional required parameters are set + for required_option in ["venv", "dir_to_copy"]: + if required_option not in hydra_launcher_cfg.additional_parameters: + raise ValueError( + f"You must specify 'hydra.launcher.additional_parameters.{required_option}', " + f"e.g, hydra.launcher.additional_parameters.{required_option}=