
Commit

[polish] polish train doc and code (#257)
* update

* bench

* polish scripts
oahzxl authored Dec 10, 2024
1 parent aa9f1f6 commit 9e69bff
Showing 17 changed files with 66 additions and 120 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -118,7 +118,7 @@ You can also find easy demo with HuggingFace Space <a href="https://huggingface.
<p align="center">
<img src="./assets/figures/dcp_overview.png" alt="method" height="300">
</p>
Data-Centric Parallel (DCP) is a simple but effective approach to accelerate distributed training on videos of any size. Unlike previous methods that fix training settings, DCP dynamically adjusts parallelism and other configurations at runtime, driven by the incoming data. This significantly reduces communication overhead and computational inefficiency, achieving up to 2.1x speedup. As an easy-to-use method, DCP can empower any video model and parallel method with minimal code changes.
Data-Centric Parallel (DCP) is a simple but effective approach to accelerate distributed training on variable-length sequences. Unlike previous methods that fix training settings, DCP dynamically adjusts parallelism and other configurations at runtime, driven by the incoming data, achieving up to 2.1x speedup. As an easy-to-use method, DCP can empower any video model and parallel method with minimal code changes.

See its details [here](./docs/dcp.md).
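
To make the data-driven adjustment concrete, here is a minimal, hypothetical sketch: `choose_sp_size` and the `tokens_per_gpu` budget are made up for illustration and are not VideoSys's actual API. The point is only that the parallel size is chosen per batch from the data itself rather than fixed up front.

```python
# Illustrative sketch only, not VideoSys's real DCP interface.
def choose_sp_size(num_tokens: int, world_size: int, tokens_per_gpu: int = 65536) -> int:
    """Grow sequence parallelism only when a sample exceeds one GPU's token budget."""
    sp = 1
    while num_tokens > sp * tokens_per_gpu and sp < world_size:
        sp *= 2
    return sp

# A short low-res clip stays data-parallel; a long high-res clip spans all 8 GPUs.
assert choose_sp_size(30_000, world_size=8) == 1
assert choose_sp_size(400_000, world_size=8) == 8
```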

Binary file removed assets/example_data/art-museum.mp4
54 changes: 0 additions & 54 deletions assets/example_data/demo.csv
@@ -17,57 +17,3 @@ path,text,num_frames,height,width
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,1,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,51,720,1280
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,102,480,854
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,144,256
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,240,426
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,360,640
./assets/example_data/suv-in-the-dust.mp4,suv in the dust,204,480,854
19 changes: 0 additions & 19 deletions assets/example_data/demo_preprocess.csv

This file was deleted.

Binary file removed assets/example_data/lagos.mp4
Binary file removed assets/example_data/man-on-the-cloud.mp4
3 changes: 3 additions & 0 deletions docs/dcp.md
@@ -20,6 +20,9 @@ Quick start:
```bash
# run benchmark
bash examples/training/open_sora/benchmark.sh

# train
bash examples/training/open_sora/train.sh
```

## Motivation
10 changes: 5 additions & 5 deletions examples/training/open_sora/benchmark.sh
@@ -5,20 +5,20 @@ export TOKENIZERS_PARALLELISM=false

# =============== benchmark commands ================
# If you run this for the first time, you need to run the program twice.
# The first time is to profile the model, and the second time is to run the benchmark.
# The first time is to profile the model and save results, and the second time is to run the benchmark.

# baseline
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/train.py \
examples/training/open_sora/configs/benchmarks/baseline.yaml > baseline.log 2>& 1
examples/training/open_sora/configs/benchmarks/baseline.yaml

# DCP intra
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/train.py \
examples/training/open_sora/configs/benchmarks/dcp_intra.yaml > dcp_intra.log 2>& 1
examples/training/open_sora/configs/benchmarks/dcp_intra.yaml

# DCP inter
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/train.py \
examples/training/open_sora/configs/benchmarks/dcp_inter.yaml > dcp_inter.log 2>& 1
examples/training/open_sora/configs/benchmarks/dcp_inter.yaml

# DCP inter + ckpt
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/train.py \
examples/training/open_sora/configs/benchmarks/dcp_inter_ckpt.yaml > dcp_inter_ckpt.log 2>& 1
examples/training/open_sora/configs/benchmarks/dcp_inter_ckpt.yaml
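
The "run the program twice" convention noted at the top of this script can be automated. A minimal sketch using Python's `subprocess`, assuming the DCP profiler persists its results to disk on the first launch and reuses them on the second (the command is copied from the baseline entry above; the pass labels are illustrative only):

```python
# Hypothetical two-pass driver for the benchmark commands above.
import subprocess

cmd = [
    "torchrun", "--standalone", "--nproc_per_node", "8",
    "examples/training/open_sora/train.py",
    "examples/training/open_sora/configs/benchmarks/baseline.yaml",
]
subprocess.run(cmd, check=True)  # pass 1: profile the model and save results
subprocess.run(cmd, check=True)  # pass 2: the actual benchmark run
```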
2 changes: 1 addition & 1 deletion examples/training/open_sora/preprocess.py
@@ -107,7 +107,7 @@ def main(args):
y = batch["text"]
model_args = encode_prompt(text_encoder, tokenizer, y)
for i in range(len(y)):
cur_model_args = {k: v[i].cpu() for k, v in model_args.items()}
cur_model_args = {k: v[i].cpu() if isinstance(v, torch.Tensor) else v for k, v in model_args.items()}
emb_path = os.path.join(
args.output_emb_path, os.path.basename(batch["path"][i]) + f"_{int(batch['index'][i])}_text.pt"
)
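The added `isinstance` guard matters because `model_args` can mix per-sample tensors with plain Python values. A minimal illustration, with made-up values; only the guard pattern itself matches the change above:

```python
import torch

model_args = {"input_ids": torch.ones(2, 4), "metadata": ["a", "b"]}
i = 0
# Without the isinstance check, v[i].cpu() raises AttributeError
# ('str' object has no attribute 'cpu') on the non-tensor entry.
cur_model_args = {k: v[i].cpu() if isinstance(v, torch.Tensor) else v
                  for k, v in model_args.items()}
```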
5 changes: 2 additions & 3 deletions examples/training/open_sora/train.py
@@ -1,7 +1,6 @@
import argparse
import logging
import os
import time
from copy import deepcopy
from datetime import timedelta
from pprint import pformat
@@ -279,7 +278,7 @@ def main(args):
else:
cfg_epochs = args.epochs
running_loss = 0.0
logging.info(f"Training for {args.epochs} epochs{' with profiling' if profiler.need_profile() else ''}.")
logging.info(f"Training for {cfg_epochs} epochs{' with profiling' if profiler.need_profile() else ''}.")

# =======================================================
# 5. training loop
@@ -426,7 +425,7 @@ def main(args):
token_counter.fill_(local_token_counter)
dist.all_reduce(token_counter)
if rank == 0 and not disable:
elapsed_time = pbar.format_dict['elapsed']
elapsed_time = pbar.format_dict["elapsed"]
logging.info(
f"Epoch {epoch}: steps: {num_steps_per_epoch} elapsed time: {elapsed_time:.2f} s"
f", effective samples: {sampler.effective_samples}"
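The token-counter hunk above sums per-rank counts with an all-reduce before logging. A minimal sketch of that accounting, assuming `torch.distributed` is already initialized (e.g. via `torchrun`); the function name is illustrative:

```python
import torch
import torch.distributed as dist

def global_token_count(local_token_counter: int) -> int:
    token_counter = torch.zeros(1, dtype=torch.long, device="cuda")
    token_counter.fill_(local_token_counter)  # each rank writes its own count
    dist.all_reduce(token_counter)            # default op=SUM -> global total
    return int(token_counter.item())
```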
11 changes: 11 additions & 0 deletions examples/training/open_sora/train.sh
@@ -0,0 +1,11 @@
# =============== program params ================
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== preprocess ===============
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/preprocess.py examples/training/open_sora/configs/preprocess.yaml

# =============== train ===============
# If you run this for the first time, you need to run the program twice.
# The first time is to profile the model and save results, and the second time is to run the training.
torchrun --standalone --nproc_per_node 8 examples/training/open_sora/train.py examples/training/open_sora/configs/train.yaml
2 changes: 1 addition & 1 deletion requirements.txt
@@ -23,4 +23,4 @@ sentencepiece
timm
torch>=1.13
tqdm
transformers==4.39.3
transformers==4.39.3
11 changes: 3 additions & 8 deletions tests/test_sampler.py
@@ -24,12 +24,7 @@
from videosys.training.datasets.open_sora.utils import MaskGenerator
from videosys.training.lr_schedulers.linear_warmup_open_sora import LinearWarmupLR
from videosys.utils.logging import init_logger
from videosys.utils.training import (
define_experiment_workspace,
format_numel_str,
get_model_numel,
requires_grad,
)
from videosys.utils.training import define_experiment_workspace, format_numel_str, get_model_numel, requires_grad
from videosys.utils.utils import merge_args, set_seed, str_to_dtype


@@ -95,7 +90,7 @@ def main(args):
# == build text-encoder and vae ==
if not preprocessed_data:
text_encoder = T5EncoderModel.from_pretrained("DeepFloyd/t5-v1_1-xxl", torch_dtype=dtype).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
vae = (
OpenSoraVAE_V1_2(
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
@@ -146,7 +141,7 @@ def main(args):
model.enable_parallel(parallel_mgr=parallel_mgr)

if args.mask_ratios is not None:
mask_generator = MaskGenerator(args.mask_ratios)
MaskGenerator(args.mask_ratios)

# ======================================================
# 3. build dataset and dataloader
14 changes: 7 additions & 7 deletions videosys/core/dcp/profiler.py
@@ -636,7 +636,7 @@ def profile(self, batch, model, gas):
self.dp_results.append(result_row)
else:
self.latest_raw_result = result_row

self.detail_results.append(result_row)

if self.logger:
@@ -656,18 +656,18 @@ def profile(self, batch, model, gas):
else:
assert self.latest_raw_result is not None
self.dp_results.append(self.latest_raw_result)

if sp_size < self.max_sp:
self.next_sp_size = sp_size * 2
# bs // 2 is the previous successful bs
self.next_bs = 1 if self.dynamic_recompute else max(1, bs//4)
self.next_bs = 1 if self.dynamic_recompute else max(1, bs // 4)
self.next_warmup_iter = not self.auto_grad_acc
elif len(self.dp_results) == 0:
if sp_size < self.world_size:
self.next_sp_size = sp_size * 2
self.next_bs = 1 if self.dynamic_recompute else max(1, bs//4)
self.next_bs = 1 if self.dynamic_recompute else max(1, bs // 4)
self.next_warmup_iter = not self.auto_grad_acc

if self.logger:
self.logger.info(
f">>> [Profiling] bucket {ar_name} {num_frame} cross nodes, increase sp size to {self.next_sp_size}"
@@ -734,7 +734,7 @@ def profile(self, batch, model, gas):
sp_size = self.latest_raw_result.sp_size

pred_full_time, pred_full_mem = self.estimate_overhead(self.latest_raw_result)
cur_throughput = bs/sp_size/pred_full_time
cur_throughput = bs / sp_size / pred_full_time
if len(self.dp_results) > 0:
prev_row = self.dp_results[-2]
prev_time, prev_mem = self.estimate_overhead(prev_row)
@@ -757,7 +757,7 @@
memory_consumed=pred_full_mem / GB,
),
)

self.latest_raw_result = None
self.dp_results = []

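For orientation, the profiler hunks above implement a search over (sequence-parallel size, batch size): when a bucket fails at the current setting, the profiler doubles `sp_size` and backs the batch size off to a quarter, and candidate settings are compared by throughput, `bs / sp_size / pred_full_time` (samples per GPU per second). A condensed, hypothetical restatement; names are simplified and this is not the profiler's real interface:

```python
def next_setting(sp_size: int, bs: int, max_sp: int,
                 dynamic_recompute: bool, auto_grad_acc: bool):
    """After a failed bucket: double sequence parallelism, back off batch size."""
    if sp_size < max_sp:
        next_bs = 1 if dynamic_recompute else max(1, bs // 4)
        return sp_size * 2, next_bs, not auto_grad_acc
    return sp_size, bs, False  # no larger sp available; keep current setting

def throughput(bs: int, sp_size: int, pred_full_time: float) -> float:
    return bs / sp_size / pred_full_time  # samples per GPU per second
```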
4 changes: 2 additions & 2 deletions videosys/models/modules/attentions.py
@@ -127,7 +127,7 @@ def native_attention(self, q, k, v):
def reset_cross_attn_mask():
# print(f"rank {torch.distributed.get_rank()} reset cross attention mask")
global SPATIAL_Q_SEQINFO, SPATIAL_K_SEQINFO, TEMPORAL_Q_SEQINFO, TEMPORAL_K_SEQINFO

SPATIAL_Q_SEQINFO, SPATIAL_K_SEQINFO = None, None
TEMPORAL_Q_SEQINFO, TEMPORAL_K_SEQINFO = None, None

@@ -187,7 +187,7 @@ def forward(self, x, cond, mask=None):
def get_qk_mask(self, B, N, mask):
global TEMPORAL_Q_SEQINFO, TEMPORAL_K_SEQINFO, SPATIAL_Q_SEQINFO, SPATIAL_K_SEQINFO
device = torch.cuda.current_device()
if B*N > 200000 and self.training:
if B * N > 200000 and self.training:
if self.temporal:
if TEMPORAL_Q_SEQINFO is None:
TEMPORAL_Q_SEQINFO = [_SeqLenInfo.from_seqlens([N] * 1) for _ in range(B)]
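The attention hunks rely on module-level caches (e.g. `SPATIAL_Q_SEQINFO`) that are built lazily for large batches and cleared by `reset_cross_attn_mask`. A self-contained sketch of that pattern, with a plain-list stand-in instead of the real `_SeqLenInfo` payload:

```python
# Stand-in sketch of the lazy global cache above; the real code stores
# _SeqLenInfo objects, replaced here by plain lists for self-containment.
SPATIAL_Q_SEQINFO = None

def get_spatial_q_seqinfo(B: int, N: int):
    global SPATIAL_Q_SEQINFO
    if SPATIAL_Q_SEQINFO is None:              # build once per sequence shape
        SPATIAL_Q_SEQINFO = [[N] for _ in range(B)]
    return SPATIAL_Q_SEQINFO

def reset_cross_attn_mask():
    global SPATIAL_Q_SEQINFO
    SPATIAL_Q_SEQINFO = None                   # force a rebuild when shapes change
```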
4 changes: 3 additions & 1 deletion videosys/models/transformers/open_sora_transformer_3d.py
@@ -123,7 +123,9 @@ def __init__(
rope=rope,
enable_flash_attn=enable_flash_attn,
)
self.cross_attn = OpenSoraMultiHeadCrossAttention(hidden_size, num_heads, enable_flash_attn=enable_flash_attn, temporal=temporal)
self.cross_attn = OpenSoraMultiHeadCrossAttention(
hidden_size, num_heads, enable_flash_attn=enable_flash_attn, temporal=temporal
)
self.norm2 = nn.LayerNorm(hidden_size, eps=1e-6, elementwise_affine=False)
self.mlp = Mlp(
in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, drop=0