Skip to content

Commit

Permalink
Allow remote data load during live recovery
Browse files Browse the repository at this point in the history
  • Loading branch information
samsja committed Nov 14, 2024
1 parent a1adfa2 commit f9106d9
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 7 deletions.
17 changes: 10 additions & 7 deletions src/zeroband/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,20 +461,23 @@ def load(

if not skip_dataloader:
if self.config.remote_data_load:
remote_data_path = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest")
id_ = uuid.uuid4()
dest = f"/tmp/zeroband/data_{id_}"
rsync_fsspec(remote_data_path, os.path.join(dest, "data"))
data_path = dest
self.remote_data_load()
else:
data_path = resume_ckpt_path if data_path is None else data_path

self._load_data(data_path)
self._load_data(data_path)

self._init_state()

self._logger.info(f"Loaded checkpoint from {resume_ckpt_path} in {time.perf_counter() - time_start} seconds")

def remote_data_load(self):
    """Fetch this rank's latest remote dataloader state and load it.

    Copies ``<config.remote_data_path>/data_<data_rank>/latest`` via
    ``rsync_fsspec`` into the ``data`` subdirectory of a fresh, uuid-named
    directory under ``/tmp/zeroband``, then hands that directory to
    ``self._load_data``.
    """
    source = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest")
    # A random uuid suffix gives every call its own local destination directory.
    local_root = f"/tmp/zeroband/data_{uuid.uuid4()}"
    rsync_fsspec(source, os.path.join(local_root, "data"))
    self._load_data(local_root)

@torch.no_grad()
def recv_ckpt_from_peer(self, global_pg: dist.ProcessGroup):
assert self.diloco_offloaded_param_list is not None, "recv_ckpt_from_peers is only supported with diloco"
Expand Down
4 changes: 4 additions & 0 deletions src/zeroband/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ def train(config: Config):
logger.info(f"inner optimizer hash: {get_optimizer_signature(inner_optimizer)}")

need_live_recovery = False

if config.ckpt.remote_data_load:
ckpt_manager.remote_data_load()

logger.info("live recovery done in %f", time.perf_counter() - time_start_live_recovery)

# at the beginning of the inner steps we allow joiner to arrive.
Expand Down

0 comments on commit f9106d9

Please sign in to comment.