
Commit 364c594

support save cache and load broadcast
1 parent 774d32f commit 364c594

9 files changed: +59 -37 lines changed

internlm/checkpoint/checkpoint_manager.py  (+4 -4)

@@ -87,7 +87,7 @@ def try_load_internevo_ckpt(ckpt_mm, load_info, train_state: TrainState = None,
    if universal_ckpt:
        from internlm.checkpoint.vescale.api import load as vescale_load
        checkpoint_state = {"model": ckpt_mm.model, "optimizer": ckpt_mm.optimizer}
-       vescale_load(load_ckpt_folder, checkpoint_state, broadcast_checkpoint=False)
+       vescale_load(load_ckpt_folder, checkpoint_state, broadcast_checkpoint=gpc.config.ckpt.universal_ckpt.broadcast_load)

    if not universal_ckpt and load_content.need_load(CheckpointLoadContent.MODEL):
        load_model_checkpoint(folder=load_ckpt_folder, model=ckpt_mm.model)

@@ -448,7 +448,7 @@ def try_save_checkpoint(self, train_state, force=False):
                train_state=train_state,
                model_config=self.model_config,
                model_config_file=self.model_config_file,
-               universal_ckpt=gpc.config.ckpt.universal_ckpt,
+               universal_ckpt=gpc.config.ckpt.universal_ckpt.enable,
            )

            if (

@@ -591,7 +591,7 @@ def try_resume_training(self, train_state: TrainState, current_time=""):
        load_path = self.load_ckpt_info["path"]
        load_content = self.load_ckpt_info["content"]
        load_type = self.load_ckpt_info["ckpt_type"]
-       universal_ckpt = gpc.config.ckpt.universal_ckpt
+       universal_ckpt = gpc.config.ckpt.universal_ckpt.enable
        kwargs = {}

        if universal_ckpt:

@@ -656,7 +656,7 @@ def save_checkpoint(
        vescale_save(
            path=folder,
            checkpoint_state={"model": model, "optimizer": optimizer},
-           async_checkpoint=False,
+           async_checkpoint=gpc.config.ckpt.universal_ckpt.aysnc_save,
        )

internlm/checkpoint/vescale/common.py  (+1 -1)

@@ -59,7 +59,7 @@ def sort_rank_ranges(process_list: List[Tuple]) -> List[Tuple]:
    return sorted_process_list


-_MAX_CACHE_SIZE = 8
+_MAX_CACHE_SIZE = 2 # model ckpt + optm ckpt


class PlanLRUCache:

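The cache size drops from 8 to 2 because, with the plan caching enabled in save_state_dict.py below, only two plans are looked up per run: one for the model checkpoint and one for the optimizer checkpoint. As a rough illustration of the bounded-LRU pattern (a sketch only; the repository's actual PlanLRUCache implementation is not part of this diff), a capacity-limited cache keyed by plan key could look like:

from collections import OrderedDict
from typing import Any, Hashable, Optional, Tuple

_MAX_CACHE_SIZE = 2  # model ckpt + optm ckpt

class TinyPlanCache:
    """Illustrative bounded LRU cache keeping at most _MAX_CACHE_SIZE (plan, metadata) entries."""

    def __init__(self) -> None:
        self._cache: "OrderedDict[Hashable, Tuple[Any, Any]]" = OrderedDict()

    def get(self, key: Hashable) -> Optional[Tuple[Any, Any]]:
        if key not in self._cache:
            return None
        self._cache.move_to_end(key)  # mark as most recently used
        return self._cache[key]

    def put(self, key: Hashable, plan: Any, metadata: Any) -> None:
        self._cache[key] = (plan, metadata)
        self._cache.move_to_end(key)
        if len(self._cache) > _MAX_CACHE_SIZE:
            self._cache.popitem(last=False)  # evict least recently used entry

    def clear(self) -> None:
        self._cache.clear()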
internlm/checkpoint/vescale/filesystem.py  (+12 -8)

@@ -27,6 +27,7 @@
from internlm.core.context import global_context as gpc
from internlm.core.context import ParallelMode
from internlm.train.pipeline import map_fqn_global_to_local, map_layer_attr
+from internlm.utils.common import get_current_device


from torch.distributed.checkpoint.metadata import (

@@ -880,8 +881,9 @@ def read_from_files(self, per_file: Dict[str, List[ReadItem]], planner: LoadPlan
                    bytes.seek(0)
                    planner.load_bytes(req, bytes)
                else:
-                   tensor = cast(Tensor, torch.load(file_slice, map_location="cpu"))
+                   tensor = cast(Tensor, torch.load(file_slice, map_location="cpu")) #att
                    tensor = narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths)
+                   print(f"req: {req.dest_index.fqn}, {req}", flush=True)
                    target_tensor = planner.resolve_tensor(req).detach()

                    assert (

@@ -892,18 +894,20 @@ def read_from_files(self, per_file: Dict[str, List[ReadItem]], planner: LoadPlan

    def read_data_with_broadcast(self, per_file: Dict[str, List[ReadItem]], planner: LoadPlanner):
        for relative_path, reqs in per_file.items():
-           if dist.get_rank(self.data_parallel_process_group) == 0:
+           # if dist.get_rank(self.data_parallel_process_group) == 0:
+           if gpc.get_local_rank(ParallelMode.DATA) == 0:
                file_path = self._get_file_path(relative_path)
                file = open(file_path, "rb")
            dist.barrier(self.data_parallel_process_group)
            reqs = sorted(reqs, key=lambda req: self.storage_data[req.storage_index].offset)
            for req in reqs:
-               if dist.get_rank(self.data_parallel_process_group) == 0:
+               if gpc.get_local_rank(ParallelMode.DATA)== 0:
                    item_md = self.storage_data[req.storage_index]
                    file_slice = self._slice_file(file, item_md)

                if req.type == LoadItemType.BYTE_IO:
-                   if dist.get_rank(self.data_parallel_process_group) == 0:
+                   assert False
+                   if gpc.get_local_rank(ParallelMode.DATA) == 0:
                        object_list = [io.BytesIO(file_slice.read(item_md.length))]
                    else:
                        object_list = [None]

@@ -912,23 +916,23 @@ def read_data_with_broadcast(self, per_file: Dict[str, List[ReadItem]], planner:
                        object_list,
                        src=dist.get_global_rank(self.data_parallel_process_group, 0),
                        group=self.data_parallel_process_group,
-                       device=f"cuda:{torch.cuda.current_device()}",
+                       device=get_current_device(),
                    )
                    bytes = object_list[0]
                    bytes.seek(0)
                    planner.load_bytes(req, bytes)
                else:
-                   if dist.get_rank(self.data_parallel_process_group) == 0:
+                   if gpc.get_local_rank(ParallelMode.DATA) == 0:
                        object_list = [cast(Tensor, torch.load(file_slice, map_location="cuda"))]
                    else:
                        object_list = [None]
                    dist.broadcast_object_list(
                        object_list,
                        src=dist.get_global_rank(self.data_parallel_process_group, 0),
                        group=self.data_parallel_process_group,
-                       device=f"cuda:{torch.cuda.current_device()}",
+                       device=get_current_device(),
                    )
-                   tensor = object_list[0].cpu()
+                   tensor = object_list[0].cpu() #att
                    tensor = narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths)
                    target_tensor = planner.resolve_tensor(req).detach()

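The read_data_with_broadcast changes make only data-parallel rank 0 open and read the checkpoint file; every other rank in the data-parallel group then receives the loaded object via dist.broadcast_object_list. A minimal sketch of that pattern, assuming an already initialized torch.distributed process group (the function name and arguments below are illustrative, not InternEvo API):

import torch
import torch.distributed as dist

def load_with_broadcast(path: str, group: dist.ProcessGroup):
    """Rank 0 of `group` reads the object from disk; the other ranks receive it
    via broadcast_object_list, so only one rank per group touches storage."""
    if dist.get_rank(group) == 0:
        object_list = [torch.load(path, map_location="cpu")]
    else:
        object_list = [None]
    dist.broadcast_object_list(
        object_list,
        src=dist.get_global_rank(group, 0),
        group=group,
    )
    return object_list[0]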
internlm/checkpoint/vescale/save_state_dict.py  (+16 -15)

@@ -53,7 +53,6 @@ def save_state_dict(
    [veScale version] Saves a distributed model in SPMD style. Fix sub-group storage.
    Args and usage is the same as `torch.distributed.checkpoint.save_state_dict`.
    """
-
    # Step 0: create distributed world based on process group and coordinator rank
    distW = _DistWrapper(process_group, not no_dist, coordinator_rank)
    if process_group:

@@ -132,6 +131,7 @@ def finish_checkpoint(all_results):

    # Wait for last write futures to finish.
    if last_write_futures:
+       print(f"last_write_futures: {last_write_futures}", flush=True)
        logger.info("Start waiting for last write events.")
        last_write_start_time = time.time()
        for fut in last_write_futures:

@@ -145,22 +145,23 @@ def finish_checkpoint(all_results):
    plan_start_time = time.time()
    cached_data = None

+   # if isinstance(planner, VeScaleSavePlanner):
+   #     central_plan = distW.reduce_scatter("plan", local_step, global_step)
+   # else:
+   #     raise AssertionError("Unsupported planner for saving checkpoint")
+
    if isinstance(planner, VeScaleSavePlanner):
-       central_plan = distW.reduce_scatter("plan", local_step, global_step)
+       cached_data = planner.lookup_plan_meta()
+       if cached_data:
+           logger.info("Plan cache hit. Reuse existing plan")
+           central_plan, _ = cached_data
+           # _ = local_step() #attn
+       else:
+           logger.info("Plan cache miss. The model/optimizer appears for the first time.")
+
+           central_plan = distW.reduce_scatter("plan", local_step, global_step)
    else:
        raise AssertionError("Unsupported planner for saving checkpoint")
-   # if isinstance(planner, VeScaleSavePlanner): #attn
-   #     cached_data = planner.lookup_plan_meta()
-   #     if cached_data:
-   #         logger.debug("Plan cache hit. Reuse existing plan")
-   #         central_plan, _ = cached_data
-   #         _ = local_step()
-   #     else:
-   #         logger.debug("Plan cache miss. The model/optimizer appears for the first time.")
-
-   #         central_plan = distW.reduce_scatter("plan", local_step, global_step)
-   # else:
-   #     raise AssertionError("Unsupported planner for saving checkpoint")

@@ -194,7 +195,7 @@ def finish_checkpoint(all_results):
        final_storage_metadata = distW.all_reduce("write", write_data, finish_checkpoint)
        assert central_plan is not None
        assert final_storage_metadata is not None
-       # planner.cache_plan_meta(central_plan, final_storage_metadata) #attn
+       planner.cache_plan_meta(central_plan, final_storage_metadata) #attn
    else:
        raise AssertionError("Unsupported planner for writing data and metadata")
    store_local_cost_time = time.time() - store_local_start_time

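Taken together, the save path now checks the planner's plan cache first and only runs the collective planning step on a miss; after the data and metadata are written, the plan is cached for the next save. A simplified sketch of that control flow (names such as planner, distW, local_step, and global_step follow the diff above; this is a sketch, not the full save_state_dict implementation):

def plan_with_cache(planner, distW, local_step, global_step):
    cached = planner.lookup_plan_meta()  # (central_plan, metadata) or None
    if cached is not None:
        central_plan, _ = cached  # cache hit: skip the collective planning step
    else:
        # cache miss: run the usual reduce_scatter to agree on a central plan
        central_plan = distW.reduce_scatter("plan", local_step, global_step)
    return central_plan

# After data and metadata are written, the plan is cached for the next save:
#     planner.cache_plan_meta(central_plan, final_storage_metadata)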
internlm/checkpoint/vescale/vescale_checkpointer.py  (+2 -2)

@@ -225,8 +225,8 @@ def load(
        # print(f"model_state {gpc.get_global_rank()} {gpc.get_local_rank(ParallelMode.PIPELINE)}: {p})", flush=True)
        # Set process group
        if broadcast_checkpoint:
-           assert False
-           model_load_process_group = VESCALE_DEVICE_MESH.get_data_parallel_dim_groups()
+           # model_load_process_group = VESCALE_DEVICE_MESH.get_data_parallel_dim_groups()
+           model_load_process_group = gpc.get_group(ParallelMode.DATA)
        else:
            model_load_process_group = None
        # Load model

internlm/checkpoint/vescale/vescale_planner.py  (+20 -4)

@@ -135,17 +135,33 @@ def lookup_object(self, index: MetadataIndex, fqn=None) -> Any:
        return find_state_dict_object(self.state_dict, index, fqn)

    def lookup_plan_meta(self) -> Optional[Tuple[SavePlan, Metadata]]:
+       # if not hasattr(self, STATE_DICT_STR):
+       #     return None
+       # else:
+       #     device_mesh = VESCALE_DEVICE_MESH.get()
+       #     plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator, device_mesh))
+       #     return self._plan_cache.get(plan_key)
+
        if not hasattr(self, STATE_DICT_STR):
            return None
        else:
-           device_mesh = VESCALE_DEVICE_MESH.get()
-           plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator, device_mesh))
+           plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator))
            return self._plan_cache.get(plan_key)

    def cache_plan_meta(self, new_plan: SavePlan, new_metadata: Metadata) -> None:
-       device_mesh = VESCALE_DEVICE_MESH.get()
-       plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator, device_mesh))
+       # device_mesh = VESCALE_DEVICE_MESH.get()
+       # plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator, device_mesh))
+       # self._plan_cache.put(plan_key, new_plan, new_metadata)
+
+       print(f"new_plan {gpc.get_global_rank()}: {new_plan}", flush=True)
+       print(f"new_metadata {gpc.get_global_rank()}: {new_metadata}", flush=True)
+
+       plan_key = hash((frozenset(self.state_dict.keys()), self.is_coordinator))
+       print(f"Before GPU Memory Allocated {gpc.get_global_rank()}: {torch.cuda.memory_allocated() /1024/1024} bytes", flush=True)
+       print(f"Before GPU Memory Cached {gpc.get_global_rank()}: {torch.cuda.memory_reserved() /1024/1024} bytes", flush=True)
        self._plan_cache.put(plan_key, new_plan, new_metadata)
+       print(f"After GPU Memory Allocated {gpc.get_global_rank()}: {torch.cuda.memory_allocated() /1024/1024} bytes", flush=True)
+       print(f"After GPU Memory Cached {gpc.get_global_rank()}: {torch.cuda.memory_reserved() /1024/1024} bytes", flush=True)

    def clear_cache(self) -> None:
        self._plan_cache.clear()

internlm/checkpoint/vescale/vescale_planner_helpers.py  (+1)

@@ -298,6 +298,7 @@ def find_state_dict_object(state_dict: STATE_DICT_TYPE, index: MetadataIndex, fq
    # if isinstance(obj, torch.Tensor): #att
    #     return find_tensor_shard(obj, index)
    if isinstance(obj, OptimizerStateSpec):
+       assert False
        return obj.local_tensor
    # elif index.offset is not None:
    #     raise ValueError(

internlm/initialize/launch.py  (+1 -1)

@@ -261,7 +261,7 @@ def args_sanity_check():
        ckpt._add_item("auto_resume", True)

    if "universal_ckpt" not in ckpt:
-       ckpt._add_item("universal_ckpt", False)
+       ckpt._add_item("universal_ckpt", dict(enable=False, aysnc_save=False, broadcast_load=False))

    if gpc.is_rank_for_log():
        logger.info("+" * 15 + " Ckpt Info " + "+" * 15)  # pylint: disable=W1201

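With this default in place, ckpt.universal_ckpt is a dict-like sub-config rather than a bare boolean, and its three flags map onto the call sites changed above: enable gates the universal-checkpoint path, aysnc_save is forwarded to vescale_save, and broadcast_load to vescale_load (the aysnc_save spelling matches the identifier used in the commit). A user config enabling it might look like this (illustrative values only):

# Excerpt of a training config (illustrative): enable universal checkpointing,
# save synchronously, and broadcast shards from data-parallel rank 0 on load.
ckpt = dict(
    universal_ckpt=dict(
        enable=True,
        aysnc_save=False,
        broadcast_load=True,
    ),
    # other existing ckpt fields (folder, checkpoint_every, auto_resume, ...) stay unchanged
)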
internlm/solver/optimizer/hybrid_zero_optim.py  (+2 -2)

@@ -1000,7 +1000,7 @@ def state_dict(self):
        optim_states = self.optim.state_dict()
        grad_scaler = self.grad_scaler.state_dict()
        states["grad_scaler"] = grad_scaler
-       if not gpc.config.ckpt.universal_ckpt:
+       if not gpc.config.ckpt.universal_ckpt.enable:
            states["base_optim_states"] = optim_states
        flat_fp32_weights = {}
        for group_id, param in self._fp32_flat_param_groups_of_current_rank.items():

@@ -1217,7 +1217,7 @@ def state_dict(self):


    def load_state_dict(self, states, global_optimizer_state=None):
-       if not gpc.config.ckpt.universal_ckpt:
+       if not gpc.config.ckpt.universal_ckpt.enable:
            # TODO: Need to take into account the change in the number of DP.
            assert "grad_scaler" in states, "Not found grad_scaler state!"
            grad_scaler = states["grad_scaler"]
