From 37efd7380b6148ddb71d2efda3fd4fc1667b2d3d Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Wed, 21 Jun 2023 15:43:28 -0700 Subject: [PATCH] scr: improve comments --- megatron/initialize.py | 27 +++++++++++++++++++++------ megatron/training.py | 2 ++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 529314682..5556e1f9e 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -66,30 +66,45 @@ def finish_mpu_init(): args = get_args() - # SCR: point SCR_PREFIX to checkpoint path + # SCR: configure and initialize SCR based on user options if args.scr: - # SCR only supports a single directory to both read previous checkpoints and to write new checkpoints + # SCR only supports a single directory for save/load. if args.save != args.load: raise ValueError(f"--save {args.save} must match --load {args.load} when using SCR") - # SCR will default to use the current working dir if args.save not specified + # Configure SCR to use the save/load dir specified by the user. + # If not specified, SCR defaults to use the current working dir + # at the time scr.init() is called. if args.save is not None: scr.config(f"SCR_PREFIX={args.save}") + elif args.load is not None: + scr.config(f"SCR_PREFIX={args.load}") - # DeepSpeed expects files to be on global file system - # This will flush any cached checkpoint to the file system on restart + # DeepSpeed expects files to be on the global file system during restart. + # Configure SCR to flush any cached checkpoint to the file system during scr.init(). scr.config("SCR_GLOBAL_RESTART=1") - # Allow user to name a specific checkpoint to load. + # Attempt to load a specific checkpoint if the user requested one. # This should match the name that was given to SCR, which is the checkpoint tag. 
+ # For example, to restart from global_step200: + # --scr-current=global_step200 if args.scr_current is not None: scr.config(f"SCR_CURRENT={args.scr_current}") # Configure seconds between checkpoints if user provided a limit. + # If enabled, SCR will advise the application to save a checkpoint after + # this time via scr.need_checkpoint(). + # For example, to write a checkpoint every 5 minutes: + # --scr-seconds=300 if args.scr_seconds is not None: scr.config(f"SCR_CHECKPOINT_SECONDS={args.scr_seconds}") # Configure max percentage of runtime for checkpointing if user provided a limit. + # If enabled, SCR will advise the application to save a checkpoint as often + # as possible with the constraint that the total percentage of runtime spent + # doing checkpointing remains below this limit. + # For example, to limit time spent checkpointing to 5% of runtime: + # --scr-overhead=5.0 if args.scr_overhead is not None: scr.config(f"SCR_CHECKPOINT_OVERHEAD={args.scr_overhead}") diff --git a/megatron/training.py b/megatron/training.py index 35763e176..f516a78e3 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1144,6 +1144,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # conditions. For supported resource managers, like SLURM, SCR # can detect the time remaining within a job allocation and indicate # to the application that it should exit when it is close to its time limit. + # One should also set SCR_HALT_SECONDS to inform SCR how much time is required + # to flush any cached checkpoint to the file system before the allocation ends. # This can also react to external commands from the user, like an scr_halt command. # if args.save and args.scr and scr.should_exit():