From b9182b3993b2310acf47d1586782e566868f4232 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 28 Jan 2025 23:06:35 -0800 Subject: [PATCH] Add docs on env vars Signed-off-by: Sangkug Lym --- examples/llm/pretrain/default_executor.py | 16 ++++++---------- scripts/llm/performance/utils.py | 14 ++++++-------- scripts/llm/pretraining.py | 18 +++++++----------- tests/collections/llm/hf/peft_nemorun.py | 6 ++---- tests/collections/llm/hf/pretrain_nemorun.py | 8 +++----- tests/collections/llm/hf/sft_nemorun.py | 6 ++---- tests/collections/llm/hf/sft_nemorun_fsdp2.py | 8 +++----- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 4 ---- .../llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 4 ---- 9 files changed, 29 insertions(+), 55 deletions(-) diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py index 6ebc874b2e39..89b2db21ed08 100644 --- a/examples/llm/pretrain/default_executor.py +++ b/examples/llm/pretrain/default_executor.py @@ -21,11 +21,9 @@ def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor: env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) @@ -57,11 +55,9 @@ def slurm_executor( mounts.extend(custom_mounts) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer 
memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } if custom_env_vars: env_vars |= custom_env_vars diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 68f4883451b2..f89b1df14837 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -57,14 +57,12 @@ def slurm_executor( sys.exit(1) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TOKENIZERS_PARALLELISM": "False", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "1", - "NVTE_FLASH_ATTN": "1", - "NEMO_LOG_MEMORY_USAGE": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TOKENIZERS_PARALLELISM": "False", # Disable tokenizer parallelism to avoid fork-related warnings + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FLASH_ATTN": "1", # Enable Flash Attention, which is needed to enable cuDNN fused attention + "NVTE_FUSED_ATTN": "1", # Enable cuDNN fused attention + "NEMO_LOG_MEMORY_USAGE": "1", # Print memory allocation "NEMORUN_HOME": log_dir, } mounts = [] diff --git a/scripts/llm/pretraining.py b/scripts/llm/pretraining.py index 3b1a2f140b4c..60954edeb270 100644 --- a/scripts/llm/pretraining.py +++ b/scripts/llm/pretraining.py @@ -81,11 +81,9 @@ def slurm_executor( mounts.extend(custom_mounts) env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } if custom_env_vars: env_vars |= custom_env_vars @@ -118,12 +116,10 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: env_vars = { - "TRANSFORMERS_OFFLINE": "1", - 
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TRANSFORMERS_OFFLINE": "1", # Disable online downloads from HuggingFace; use cached artifacts only + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py index 3a135b2346be..debbbf3b9c3b 100644 --- a/tests/collections/llm/hf/peft_nemorun.py +++ b/tests/collections/llm/hf/peft_nemorun.py @@ -24,10 +24,8 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/pretrain_nemorun.py b/tests/collections/llm/hf/pretrain_nemorun.py index 331a0652e21a..a68b5f17446d 100644 --- a/tests/collections/llm/hf/pretrain_nemorun.py +++ b/tests/collections/llm/hf/pretrain_nemorun.py @@ -25,11 +25,9 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer 
memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py index b559c04f6cbd..bca653c6de64 100644 --- a/tests/collections/llm/hf/sft_nemorun.py +++ b/tests/collections/llm/hf/sft_nemorun.py @@ -25,10 +25,8 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun_fsdp2.py b/tests/collections/llm/hf/sft_nemorun_fsdp2.py index 53dd863cb185..81d7ac8550ba 100644 --- a/tests/collections/llm/hf/sft_nemorun_fsdp2.py +++ b/tests/collections/llm/hf/sft_nemorun_fsdp2.py @@ -27,11 +27,9 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor: # Env vars for jobs are configured here env_vars = { - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory + "NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory + "NVTE_FUSED_ATTN": "0", # Disable cuDNN fused attention } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb 
b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index c983b277e72a..54a571b9bc45 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -339,8 +339,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -454,8 +452,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index 0bb4367d50e9..7dbd4b904ad9 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -480,8 +480,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -562,8 +560,6 @@ " env_vars = {\n", " \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n", " \"NCCL_NVLS_ENABLE\": \"0\",\n", - " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", - " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",