diff --git a/.env.example b/.env.example index 2eba6e135..ac8fbab98 100644 --- a/.env.example +++ b/.env.example @@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus" ### API keys ### COMET_API_KEY="" +### Alliance clusters config ### +ALLIANCE_VENV_PATH="path/to/project/virtualenv" + +### SLURM config ### +SLURM_MAIL_USER="" + ### Error Flags ### # HYDRA_FULL_ERROR=1 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e507fa78e..f8e8905c7 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -23,4 +23,5 @@ natsort nibabel albumentations hydra-core~=1.2.0 +hydra-submitit-launcher python-dotenv diff --git a/vital/config/hydra/launcher/alliance.yaml b/vital/config/hydra/launcher/alliance.yaml new file mode 100644 index 000000000..19c083ccc --- /dev/null +++ b/vital/config/hydra/launcher/alliance.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - submitit_slurm@_group_ + +hydra: + launcher: + timeout_min: ${oc.select:run_time_min,60} + setup: + - "module load httpproxy" # load the module that allows connecting to whitelisted domains + - "source $ALLIANCE_VENV_PATH/bin/activate" # activate the pre-installed virtual environment + - "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset to the compute node + additional_parameters: + mail-user: ${oc.env:SLURM_MAIL_USER,null} + mail-type: ALL + +# NOTE: Options meant to override train/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`) +# and not in a launcher config meant to be used in a defaults list. Otherwise, the order of the composition of the +# configs might not give priority to the launcher config. 
diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml new file mode 100644 index 000000000..6f9a603e7 --- /dev/null +++ b/vital/config/hydra/launcher/beluga.yaml @@ -0,0 +1,16 @@ +# @package _global_ + +defaults: + - alliance + +hydra: + launcher: + gpus_per_node: 1 + cpus_per_gpu: 10 + mem_per_gpu: "47750M" + +trainer: + enable_progress_bar: False + +data: + num_workers: 9 diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml index 68ef56529..43740ff4d 100644 --- a/vital/config/vital_default.yaml +++ b/vital/config/vital_default.yaml @@ -5,7 +5,7 @@ defaults: - callbacks: - model_checkpoint - logger: comet/online - - _self_ + - override hydra/launcher: basic # List launcher after trainer/task/data, so that it can override their configs seed: null diff --git a/vital/runner.py b/vital/runner.py index 8bd1b7ba3..418da7a35 100644 --- a/vital/runner.py +++ b/vital/runner.py @@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig: with open_dict(cfg): cfg.trainer.default_root_dir = os.getcwd() + # When running on a SLURM cluster, we will look to see if the dataset was copied on the compute node, + # and update the path if it was. Otherwise, we will use the path as-is. + if compute_node_dir := os.environ.get("SLURM_TMPDIR"): + dataset_name = Path(cfg.data.dataset_path).name + slurm_dataset_path = Path(compute_node_dir) / dataset_name + if slurm_dataset_path.exists(): + cfg.data.dataset_path = str(slurm_dataset_path) + else: + logger.warning( + f"Running in a distributed computing environment, but we could not locate the dataset on the node " + f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). " + f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so " + f"that we can detect it and use the local data for improved performance." + ) + return cfg @staticmethod