Skip to content

Commit

Permalink
return early from save_checkpoint
Browse files: browse the repository at this point in the history
  • Loading branch information
adammoody committed Oct 4, 2021
1 parent c6eee50 commit 1ac9895
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions deepspeed/runtime/engine.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2396,6 +2396,10 @@ def _save_checkpoint(self, save_dir, tag, client_state={}):
module = self.module_state_dict()
self._curr_ckpt_path = None

# Only a subset of procs may need to save the general model params
if not self.save_non_zero_checkpoint:
return

state = dict(module=module,
buffer_names=self._get_buffer_names(),
optimizer=self.optimizer.state_dict()
Expand All @@ -2412,10 +2416,9 @@ def _save_checkpoint(self, save_dir, tag, client_state={}):
ds_version=version)
state.update(client_state)

if self.save_non_zero_checkpoint:
log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0, 1])
#logger.info('Saving model checkpoint: {}'.format(save_path))
torch.save(state, save_path)
log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0, 1])
#logger.info('Saving model checkpoint: {}'.format(save_path))
torch.save(state, save_path)

def _get_buffer_names(self):
buffer_names = []
Expand Down

0 comments on commit 1ac9895

Please sign in to comment.