Skip to content

Commit

Permalink
Support submitit Hydra launcher, to launch runs on SLURM clusters
Browse files Browse the repository at this point in the history
This launcher is meant, in time, to completely replace the `vital.utils.jobs` package
  • Loading branch information
nathanpainchaud committed Sep 5, 2022
1 parent c777491 commit 6378a22
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 1 deletion.
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus"
### API keys ###
COMET_API_KEY="<your-comet-api-key>"

### AllianceCan clusters config ###
ALLIANCECAN_VENV_PATH="path/to/project/virtualenv"

### SLURM config ###
SLURM_MAIL_USER="<mail-address-to-notify>"

### Error Flags ###
# HYDRA_FULL_ERROR=1
1 change: 1 addition & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ natsort
nibabel
albumentations
hydra-core~=1.2.0
hydra-submitit-launcher
python-dotenv
19 changes: 19 additions & 0 deletions vital/config/hydra/launcher/alliancecan.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# @package _global_

# Base launcher config for AllianceCan clusters, built on top of the `submitit_slurm` launcher config.
defaults:
- submitit_slurm@_group_

hydra:
launcher:
# Uses `run_time_min` from the config when it is defined, and falls back to 60 otherwise
timeout_min: ${oc.select:run_time_min,60}
setup:
- "module load httpproxy" # load the module that allows connecting to whitelisted domains
- "source $ALLIANCECAN_VENV_PATH/bin/activate" # activate the pre-installed virtual environment
- "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset to the compute node
additional_parameters:
# Address read from the `SLURM_MAIL_USER` environment variable (null when the variable is not set)
mail-user: ${oc.env:SLURM_MAIL_USER,null}
mail-type: ALL

# NOTE: Options meant to override train/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`)
# and not in a launcher config meant to be used in the defaults list. Otherwise, the order of the composition of the
# configs might not give priority to the launcher config.
17 changes: 17 additions & 0 deletions vital/config/hydra/launcher/beluga.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# @package _global_

# "Final" launcher config (`beluga.yaml`): builds on the shared `alliancecan` launcher config and, because it is
# packaged as `_global_`, also overrides `trainer` and `data` options.
defaults:
- alliancecan

# Resources requested through the launcher
hydra:
launcher:
gpus_per_node: 1
cpus_per_gpu: 10
mem_per_gpu: "47750M"

# Trainer overrides applied when this launcher is selected
trainer:
# presumably kept in sync with `gpus_per_node` above -- TODO confirm
devices: 1
enable_progress_bar: False

# Data overrides applied when this launcher is selected
data:
# presumably chosen relative to `cpus_per_gpu` above (10 CPUs, 9 workers) -- TODO confirm
num_workers: 9
2 changes: 1 addition & 1 deletion vital/config/vital_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defaults:
- callbacks:
- model_checkpoint
- logger: comet/online
- _self_
- override hydra/launcher: basic # List launcher after trainer/task/data, so that it can override their configs

seed: null

Expand Down
15 changes: 15 additions & 0 deletions vital/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
with open_dict(cfg):
cfg.trainer.default_root_dir = os.getcwd()

# When running on a SLURM cluster, we check whether the dataset was copied to the compute node,
# and update the path if it was. Otherwise, we use the path as-is.
if compute_node_dir := os.environ.get("SLURM_TMPDIR"):
dataset_name = Path(cfg.data.dataset_path).name
slurm_dataset_path = Path(compute_node_dir) / dataset_name
if slurm_dataset_path.exists():
cfg.data.dataset_path = str(slurm_dataset_path)
else:
logger.warning(
f"Running in a distributed computing environment, but we could not locate the dataset on the node "
f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). "
f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so "
f"that we can detect it and use the local data for improved performance."
)

return cfg

@staticmethod
Expand Down

0 comments on commit 6378a22

Please sign in to comment.