From 37efd7380b6148ddb71d2efda3fd4fc1667b2d3d Mon Sep 17 00:00:00 2001
From: Adam Moody <moody20@llnl.gov>
Date: Wed, 21 Jun 2023 15:43:28 -0700
Subject: [PATCH] scr: improve comments

---
 megatron/initialize.py | 27 +++++++++++++++++++++------
 megatron/training.py   |  2 ++
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/megatron/initialize.py b/megatron/initialize.py
index 529314682..5556e1f9e 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -66,30 +66,45 @@ def finish_mpu_init():
 
     args = get_args()
 
-    # SCR: point SCR_PREFIX to checkpoint path
+    # SCR: configure and initialize SCR based on user options
     if args.scr:
-        # SCR only supports a single directory to both read previous checkpoints and to write new checkpoints
+        # SCR only supports a single directory for save/load.
         if args.save != args.load:
             raise ValueError(f"--save {args.save} must match --load {args.load} when using SCR")
 
-        # SCR will default to use the current working dir if args.save not specified
+        # Configure SCR to use the save/load dir specified by the user.
+        # If not specified, SCR defaults to using the current working dir
+        # at the time scr.init() is called.
         if args.save is not None:
             scr.config(f"SCR_PREFIX={args.save}")
+        elif args.load is not None:
+            scr.config(f"SCR_PREFIX={args.load}")
 
-        # DeepSpeed expects files to be on global file system
-        # This will flush any cached checkpoint to the file system on restart
+        # DeepSpeed expects files to be on the global file system during restart.
+        # Configure SCR to flush any cached checkpoint to the file system during scr.init().
         scr.config("SCR_GLOBAL_RESTART=1")
 
-        # Allow user to name a specific checkpoint to load.
+        # Attempt to load a specific checkpoint if the user requested one.
         # This should match the name that was given to SCR, which is the checkpoint tag.
+        # For example, to restart from global_step200:
+        #   --scr-current=global_step200
         if args.scr_current is not None:
             scr.config(f"SCR_CURRENT={args.scr_current}")
 
         # Configure seconds between checkpoints if user provided a limit.
+        # If enabled, SCR will advise the application, via scr.need_checkpoint(),
+        # to save a checkpoint once this many seconds have passed since the last one.
+        # For example, to write a checkpoint every 5 minutes:
+        #   --scr-seconds=300
         if args.scr_seconds is not None:
             scr.config(f"SCR_CHECKPOINT_SECONDS={args.scr_seconds}")
 
         # Configure max percentage of runtime for checkpointing if user provided a limit.
+        # If enabled, SCR will advise the application to save a checkpoint as often
+        # as possible with the constraint that the total percent of runtime spent
+        # doing checkpointing remains below this limit.
+        # For example, to limit time spent checkpointing to 5% of runtime:
+        #   --scr-overhead=5.0
         if args.scr_overhead is not None:
             scr.config(f"SCR_CHECKPOINT_OVERHEAD={args.scr_overhead}")
 
diff --git a/megatron/training.py b/megatron/training.py
index 35763e176..f516a78e3 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -1144,6 +1144,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
         # conditions.  For supported resource managers, like SLURM, SCR
         # can detect the time remaining within a job allocation and indicate
         # to the application that it should exit when it is close to its time limit.
+        # One should also set SCR_HALT_SECONDS to inform SCR how much time is required
+        # to flush any cached checkpoint to the file system before the allocation ends.
         # This can also react to external commands from the user, like an scr_halt command.
         #
         if args.save and args.scr and scr.should_exit():