Skip to content

Commit

Permalink
remove bug in parallel.py for resumption
Browse files Browse the repository at this point in the history
  • Loading branch information
HamidrezaKmK committed Jun 5, 2023
1 parent 96ad8fe commit 64277fc
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
2 changes: 1 addition & 1 deletion dysweep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .parallel import dysweep_run_resume, ResumableSweepConfig
from .wandbX import hierarchical_config

__version__ = "0.0.6"
__version__ = "0.0.7"
21 changes: 18 additions & 3 deletions dysweep/parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def modified_function():

# create a new checkpoint directory for the inner function
new_checkpoint_dir = checkpoint_dir / new_dir_name

experiment_id = resume_id
else:
# if the run_id doesn't exist, then create a new run
# and create the subdirectory
Expand Down Expand Up @@ -168,8 +170,13 @@ def modified_function():
if len(params) != 3 or list(params.keys())[0] != "config" or list(params.keys())[1] != "logger" or list(params.keys())[2] != "checkpoint_dir":
raise ValueError(
"the run function should have the exact following parameters in order: (config, logger, checkpoint_dir)")

ret = function(sweep_config, logger, new_checkpoint_dir)
try:
ret = function(sweep_config, logger, new_checkpoint_dir)
except Exception as e:
# write the full traceback to err-log.txt inside new_checkpoint_dir, then re-raise
with open(new_checkpoint_dir / "err-log.txt", "w") as f:
f.write(traceback.format_exc())
raise e
else:
# check the function signature matches
# the one we expect.
Expand All @@ -184,7 +191,13 @@ def modified_function():
if len(params) != 2 or list(params.keys())[0] != "config" or list(params.keys())[1] != "checkpoint_dir":
raise ValueError(
"the run function should have the exact following parameters in order: (config, checkpoint_dir)")
ret = function(sweep_config, new_checkpoint_dir)
try:
ret = function(sweep_config, new_checkpoint_dir)
except Exception as e:
# write the full traceback to err-log.txt inside new_checkpoint_dir, then re-raise
with open(new_checkpoint_dir / "err-log.txt", "w") as f:
f.write(traceback.format_exc())
raise e

# remove the entire new_checkpoint_dir if the function has finished
# running.
Expand All @@ -193,6 +206,8 @@ def modified_function():
shutil.rmtree(new_checkpoint_dir)

return ret


if conf.resume:
for _ in range(conf.count):
if check_non_empty(checkpoint_dir):
Expand Down

0 comments on commit 64277fc

Please sign in to comment.