diff --git a/.env.example b/.env.example
index 2eba6e13..3f5eaca4 100644
--- a/.env.example
+++ b/.env.example
@@ -10,5 +10,8 @@ CAMUS_DATA_PATH="path/to/camus"
 ### API keys ###
 COMET_API_KEY=""
 
+### SLURM config ###
+SLURM_MAIL_USER=""
+
 ### Error Flags ###
 # HYDRA_FULL_ERROR=1
diff --git a/pyproject.toml b/pyproject.toml
index c1f2f30f..5b88ca36 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ torch = "~1.12.0"
 torchvision = "~0.13.0"
 pytorch-lightning = "~1.8.0"
 hydra-core = "~1.2.0"
+hydra-submitit-launcher = "*"
 torchmetrics = "*"
 torchinfo = "*"
 pathos = "*"
diff --git a/vital/config/hydra/launcher/alliancecan.yaml b/vital/config/hydra/launcher/alliancecan.yaml
new file mode 100644
index 00000000..016271d8
--- /dev/null
+++ b/vital/config/hydra/launcher/alliancecan.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - submitit_slurm
+
+timeout_min: ${oc.select:run_time_min,60}  # use `run_time_min` when it is set, otherwise fall back to 60
+setup:
+  - "module load httpproxy"  # load the module that allows connecting to whitelisted domains
+  - "source ${venv}/bin/activate"  # activate the pre-installed virtual environment
+  - "rsync -a ${data_dir_to_copy} $SLURM_TMPDIR"  # copy the dataset to the compute node
+
+additional_parameters:
+  mail-user: ${oc.env:SLURM_MAIL_USER,null}  # NOTE(review): null when the env var is unset; confirm sbatch output
+  mail-type: ALL
diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml
new file mode 100644
index 00000000..0dc21665
--- /dev/null
+++ b/vital/config/hydra/launcher/beluga.yaml
@@ -0,0 +1,6 @@
+defaults:
+  - alliancecan
+
+cpus_per_task: 10
+gpus_per_node: 1
+mem_gb: 46
diff --git a/vital/config/launcher/alliancecan.yaml b/vital/config/launcher/alliancecan.yaml
new file mode 100644
index 00000000..8582b9b3
--- /dev/null
+++ b/vital/config/launcher/alliancecan.yaml
@@ -0,0 +1,17 @@
+# @package _global_
+
+# NOTE: This custom launcher (separate from the built-in 'hydra/launcher' node) is meant to override global options
+# that depend on the hardware resources available and for which the default options are for local runs
+# (e.g. GPU devices, number of dataloader workers, progress bar display, etc.)
+
+defaults:
+  - override /hydra/launcher: alliancecan
+
+trainer:
+  enable_progress_bar: false
+
+# Path to the root of the virtualenv to activate for the jobs
+venv: ???
+
+# Path of the data to copy from the shared filesystem to the compute node
+data_dir_to_copy: ???
diff --git a/vital/config/launcher/beluga.yaml b/vital/config/launcher/beluga.yaml
new file mode 100644
index 00000000..9bb29a5b
--- /dev/null
+++ b/vital/config/launcher/beluga.yaml
@@ -0,0 +1,11 @@
+# @package _global_
+
+defaults:
+  - alliancecan
+  - override /hydra/launcher: beluga
+
+trainer:
+  devices: 1
+
+data:
+  num_workers: 9
diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml
index d8a593b5..491ce8ca 100644
--- a/vital/config/vital_default.yaml
+++ b/vital/config/vital_default.yaml
@@ -6,6 +6,7 @@ defaults:
   - model_checkpoint
   - logger: comet/online
   - _self_
+  - optional launcher: null  # List custom launcher after trainer/task/data, so that it can override their configs
 
 seed: null
 