From b9182b3993b2310acf47d1586782e566868f4232 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 28 Jan 2025 23:06:35 -0800 Subject: [PATCH] Add docs on env vars Signed-off-by: Sangkug Lym --- examples/llm/pretrain/default_executor.py | 16 ++++++---------- scripts/llm/performance/utils.py | 14 ++++++-------- scripts/llm/pretraining.py | 18 +++++++----------- tests/collections/llm/hf/peft_nemorun.py | 6 ++---- tests/collections/llm/hf/pretrain_nemorun.py | 8 +++----- tests/collections/llm/hf/sft_nemorun.py | 6 ++---- tests/collections/llm/hf/sft_nemorun_fsdp2.py | 8 +++----- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 4 ---- .../llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 4 ---- 9 files changed, 29 insertions(+), 55 deletions(-) diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py index 6ebc874b2e39..89b2db21ed08 100644 --- a/examples/llm/pretrain/default_executor.py +++ b/examples/llm/pretrain/default_executor.py @@ -21,11 +21,9 @@ def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor: env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) @@ -57,11 +55,9 @@ def slurm_executor( mounts.extend(custom_mounts) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer 
memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } if custom_env_vars: env_vars |= custom_env_vars diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 68f4883451b2..f89b1df14837 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -57,14 +57,12 @@ def slurm_executor( sys.exit(1) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TOKENIZERS_PARALLELISM": "False", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "1", - "NVTE_FLASH_ATTN": "1", - "NEMO_LOG_MEMORY_USAGE": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TOKENIZERS_PARALLELISM": "False", # Disable tokenizer parallelism to avoid fork-related warnings + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FLASH_ATTN": "1", # Enable Flash Attention, which is needed to enable cuDNN fused attention + "NVTE_FUSED_ATTN": "1", # Enable cuDNN fused attention + "NEMO_LOG_MEMORY_USAGE": "1", # Print memory allocation "NEMORUN_HOME": log_dir, } mounts = [] diff --git a/scripts/llm/pretraining.py b/scripts/llm/pretraining.py index 3b1a2f140b4c..60954edeb270 100644 --- a/scripts/llm/pretraining.py +++ b/scripts/llm/pretraining.py @@ -81,11 +81,9 @@ def slurm_executor( mounts.extend(custom_mounts) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } if custom_env_vars: env_vars |= custom_env_vars @@ -118,12 +116,10 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: env_vars = { - "TRANSFORMERS_OFFLINE": "1", - 
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py index 3a135b2346be..debbbf3b9c3b 100644 --- a/tests/collections/llm/hf/peft_nemorun.py +++ b/tests/collections/llm/hf/peft_nemorun.py @@ -24,10 +24,8 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/pretrain_nemorun.py b/tests/collections/llm/hf/pretrain_nemorun.py index 331a0652e21a..a68b5f17446d 100644 --- a/tests/collections/llm/hf/pretrain_nemorun.py +++ b/tests/collections/llm/hf/pretrain_nemorun.py @@ -25,11 +25,9 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer 
memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py index b559c04f6cbd..bca653c6de64 100644 --- a/tests/collections/llm/hf/sft_nemorun.py +++ b/tests/collections/llm/hf/sft_nemorun.py @@ -25,10 +25,8 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun_fsdp2.py b/tests/collections/llm/hf/sft_nemorun_fsdp2.py index 53dd863cb185..81d7ac8550ba 100644 --- a/tests/collections/llm/hf/sft_nemorun_fsdp2.py +++ b/tests/collections/llm/hf/sft_nemorun_fsdp2.py @@ -27,11 +27,9 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb 
b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index c983b277e72a..54a571b9bc45 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -339,8 +339,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -454,8 +452,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index 0bb4367d50e9..7dbd4b904ad9 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -480,8 +480,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -562,8 +560,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",