Commit 24c6901

fix ci
1 parent 88cf02d commit 24c6901

4 files changed: +26 -5 lines changed

ci_scripts/train/load_ckpt.sh (+1 -1)

@@ -8,7 +8,7 @@ source ./ci_scripts/common/variables.sh
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
 readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
-expected_num=22
+expected_num=23
 exit_code=0

 source ./ci_scripts/common/basic_func.sh

internlm/checkpoint/checkpoint_manager.py (+1 -1)

@@ -631,7 +631,7 @@ def save_checkpoint(
     save_optimizer_checkpoint(optim=optimizer, state_path=folder)
     timer("save-optimizer").stop()

-    if gpc.get_global_rank() == 0:
+    if gpc.get_global_rank() == 0 and gpc.config.ckpt.need_metadata:
         assert self.meta_data is not None
         llm_save(os.path.join(folder, "metadata.pt"), saved_obj=self.meta_data)
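The new need_metadata flag is read from gpc.config.ckpt, so it is controlled from the checkpoint section of the training config. A minimal sketch of how it might be toggled there, assuming the usual dict-style ckpt section; every field name other than need_metadata is an illustrative assumption, not taken from this commit:

    # Illustrative config excerpt; only "need_metadata" comes from this commit.
    ckpt = dict(
        enable_save_ckpt=True,                # assumed field name
        save_ckpt_folder="local:llm_ckpts",   # assumed field name
        checkpoint_every=20,                  # assumed field name
        need_metadata=True,                   # new: rank 0 also writes metadata.pt when True
    )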

tests/test_training/train_CI.py (+6 -2)

@@ -60,7 +60,7 @@


 def fuse_wqkv(key, state_dict) -> None: # pylint: disable=W0613
-    prefix = key.rstrip("wqkv.weight")
+    prefix = key.rstrip("Wqkv.weight")
     wq_name, wk_name, wv_name = (
         f"{prefix}wq.weight",
         f"{prefix}wk.weight",
@@ -78,8 +78,12 @@ def check_model_weights(model, ckpt_path, total_equal=False):
     copy_of_ordered_dict = model2_dict.copy()

     for key in copy_of_ordered_dict.keys():
+        if "wqkv" in key:
+            model2_dict[key.replace("wqkv", "Wqkv")] = model2_dict.pop(key)
+            key = key.replace("wqkv", "Wqkv")
+
         if key not in model1_dict:
-            if "wqkv" in key:
+            if "Wqkv" in key:
                 fuse_wqkv(key, model1_dict)
             else:
                 assert False, f"Error: The key {key} for current model dose not exist in standard ckpt!"
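
For context, the test falls back to fuse_wqkv when the reference checkpoint stores separate wq/wk/wv tensors while the current model exposes a single fused Wqkv weight. A minimal sketch of that fusion idea, assuming a plain row-wise concatenation; the real fuse_wqkv in train_CI.py may lay the heads out differently, and the key name below is only illustrative:

    import torch

    def fuse_wqkv_sketch(key: str, state_dict: dict) -> None:
        # key is e.g. "layers.0.attention.Wqkv.weight" (illustrative name);
        # strip the suffix to recover the module prefix "layers.0.attention.".
        prefix = key[: -len("Wqkv.weight")]
        wq = state_dict.pop(f"{prefix}wq.weight")
        wk = state_dict.pop(f"{prefix}wk.weight")
        wv = state_dict.pop(f"{prefix}wv.weight")
        # Assumption: simple concatenation along the output dimension.
        state_dict[key] = torch.cat([wq, wk, wv], dim=0)

After fusing, both state dicts expose the same Wqkv.weight key, so the later equality check in check_model_weights can compare them directly.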

tools/convert_ckpt_parallel.py (+18 -1)

@@ -1,3 +1,16 @@
+"""
+Usage:
+python tools/convert_ckpt_parallel.py \
+    <origin_ckpt_path> <target_ckpt_path> \
+    (optional) [--origin_meta_path <origin_meta_path>] [--target_meta_path <target_meta_path>] \
+    (optional) [--copy_file <True/False>] [--convert_optimizer <True/False>]
+
+When meta_path is not specified, it will automatically search and load meta in the ckpt path.
+Default to convert optimizer state and copy files.
+Example:
+srun -p llm_s python tools/convert_ckpt_parallel.py \
+    /llm_ckpt/100 /target_ckpt/converted
+"""
 import argparse
 import os
 import shutil
@@ -530,7 +543,6 @@ def convert_optimizer_ckpt(
         base_state["base_optim_states"]["state"][group_id] = state
         base_state["flat_fp32_weights"][group_id] = flat_fp32_weights

-        # print(f"optimizer tp{new_tp_rank}_pp{new_pp_rank}_zo{new_zero1_rank}: {base_state}")
         torch.save(base_state, os.path.join(saved_folder, file_name))

     print("Finish optimizer convert", flush=True)
@@ -559,6 +571,7 @@ def convert_optimizer_ckpt(
     new_meta_path
 ), "new meta file does not exist, plese generate it before converting checkpoint."

+# read and process metaData for original ckpt
 old_meta = torch.load(old_meta_path, map_location="cpu")
 old_pp_size = old_meta["parallel_setting"]["pp_size"]
 old_zero1_size = old_meta["parallel_setting"]["zero1_size"]
@@ -570,16 +583,19 @@ def convert_optimizer_ckpt(
     assert False, "tp or wp should be in parallel setting."
 old_tp_size = old_meta["parallel_setting"][f"{old_tp_mode}_size"]

+# To facilitate key query, summarize meta_data.
 old_meta_data = {}
 for pp_rank in range(old_pp_size):
     for zero_rank in range(old_zero1_size):
         for states in old_meta["metaData"][0][pp_rank][zero_rank].values():
             old_meta_data.update(states)

+# map local fqn to global fqn
 old_map_local_to_global = [{} for _ in range(old_pp_size)]
 for global_fqn, states in old_meta_data.items():
     old_map_local_to_global[states["pp"]][states["fqn"]] = global_fqn

+# read and process metaData for target ckpt
 new_meta = torch.load(new_meta_path, map_location="cpu")
 new_pp_size = new_meta["parallel_setting"]["pp_size"]
 new_zero1_size = new_meta["parallel_setting"]["zero1_size"]
@@ -597,6 +613,7 @@ def convert_optimizer_ckpt(
 ), "Error: old meta and new meta have diffent group_id lists."
 group_id_list = list(new_meta["metaData"][0][0][0].keys())

+# To facilitate key query, summarize meta_data.
 new_meta_data = {}
 for pp_rank in range(new_pp_size):
     for zero_rank in range(new_zero1_size):
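
As a reading aid for the metaData handling above: once old_meta_data and old_map_local_to_global are built, locating a parameter's shard reduces to dictionary lookups on the "pp" and "fqn" fields used in those loops. A self-contained toy sketch of that query; the key names and the idea that local fqns are re-numbered per pipeline stage are assumptions for illustration only:

    # Toy stand-in for the structures built above: one parameter that lives on
    # pipeline stage 1 but is stored locally under a re-numbered layer index.
    old_meta_data = {
        "layers.8.attention.Wqkv.weight": {"pp": 1, "fqn": "layers.0.attention.Wqkv.weight"},
    }
    old_map_local_to_global = [{} for _ in range(2)]
    for global_fqn, states in old_meta_data.items():
        old_map_local_to_global[states["pp"]][states["fqn"]] = global_fqn

    # Query: which stage owns the parameter, and what is it called there?
    states = old_meta_data["layers.8.attention.Wqkv.weight"]
    print(states["pp"], states["fqn"])  # -> 1 layers.0.attention.Wqkv.weight
    # And back from (stage, local name) to the global name:
    print(old_map_local_to_global[1]["layers.0.attention.Wqkv.weight"])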
