NVIDIA · erhoo82 · Jan 29, 2025
diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py
@@ -21,11 +21,9 @@
 
 def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Disable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
@@ -57,11 +55,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Disable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars

diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py
@@ -57,14 +57,12 @@ def slurm_executor(
         sys.exit(1)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TOKENIZERS_PARALLELISM": "False",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "1",
-        "NVTE_FLASH_ATTN": "1",
-        "NEMO_LOG_MEMORY_USAGE": "1",
+        "TRANSFORMERS_OFFLINE": "1",        # Disable online downloads from HuggingFace
+        "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
+        "NCCL_NVLS_ENABLE": "0",            # Disable NVLink SHARP to save memory
+        "NVTE_FLASH_ATTN": "1",             # Enable Flash Attention, which is needed to enable cuDNN fused attention
+        "NVTE_FUSED_ATTN": "1",             # Enable cuDNN fused attention
+        "NEMO_LOG_MEMORY_USAGE": "1",       # Print memory allocation
         "NEMORUN_HOME": log_dir,
     }
     mounts = []

diff --git a/scripts/llm/pretraining.py b/scripts/llm/pretraining.py
@@ -81,11 +81,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Disable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -118,12 +116,10 @@ def slurm_executor(
 
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TRANSFORMERS_OFFLINE": "1",            # Disable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py
@@ -24,10 +24,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

diff --git a/tests/collections/llm/hf/pretrain_nemorun.py b/tests/collections/llm/hf/pretrain_nemorun.py
@@ -25,11 +25,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py
@@ -25,10 +25,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

diff --git a/tests/collections/llm/hf/sft_nemorun_fsdp2.py b/tests/collections/llm/hf/sft_nemorun_fsdp2.py
@@ -27,11 +27,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
@@ -339,8 +339,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -454,8 +452,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",

diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
@@ -480,8 +480,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -562,8 +560,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",