Skip to content

Commit

Permalink
Need a matching barrier wait in reinit (maybe_reinit_global_pg)
Browse files: browse the repository at this point in the history
  • Loading branch information
Jackmin801 authored and samsja committed Nov 13, 2024
1 parent 14e0501 commit a1adfa2
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion src/zeroband/comms.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def __init__(

# Logging
self._optimize_ring_ranks()

if self.live_recovery_rank_src is not None:
self.live_recovery.ask_for_live_ckpt(self.live_recovery_rank_src)
self.global_pg.barrier().wait()
Expand Down Expand Up @@ -423,6 +422,7 @@ def maybe_reinit_global_pg(self, admit_joiners: bool = False) -> bool:
try:
self._create_global_pg()
self._optimize_ring_ranks()
self.global_pg.barrier().wait()
except Exception as e:
self._logger.error(f"Error recreating process group: {e}. Retrying...")
return self.maybe_reinit_global_pg(admit_joiners=admit_joiners)
Expand Down

0 comments on commit a1adfa2

Please sign in to comment.