Skip to content

Commit

Permalink
👌 PwBaseWorkChain: Always do full restart for ERROR_OUT_OF_WALLTIME
Browse files Browse the repository at this point in the history
The current error handler for the `ERROR_OUT_OF_WALLTIME` exit code of the
`PwCalculation` restarts from scratch if the structure has changed during the
`pw.x` run, as is typically the case for `relax`/`vc-relax` calculations. For larger
structures and more complex calculations - such as those using Hubbard corrections -
this can be quite inefficient, since the electronic ground state has to be obtained
again, which is often challenging and hence expensive.

Here we adapt the error handler to always do a full restart from the previous
calculation. If the structure has changed, we still set the output structure of the
previous calculation as the input structure of the restart calculation. Even though
Quantum ESPRESSO will restart from the `.save` directory and hence ignore the structure
information in the input file, this at least makes the structure in the provenance
consistent with the one run by Quantum ESPRESSO.
  • Loading branch information
mbercx authored Jan 17, 2025
1 parent 74a18bc commit fcb8da9
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 32 deletions.
16 changes: 5 additions & 11 deletions src/aiida_quantumespresso/workflows/pw/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,19 +444,13 @@ def handle_diagonalization_errors(self, calculation):
def handle_out_of_walltime(self, calculation):
"""Handle `ERROR_OUT_OF_WALLTIME` exit code.
In this case the calculation shut down neatly and we can simply restart. We consider two cases:
1. If the structure is unchanged, we do a full restart.
2. If the structure has changed during the calculation, we restart from scratch.
In this case the calculation shut down cleanly and we can do a full restart.
"""
try:
if 'output_structure' in calculation.outputs:
self.ctx.inputs.structure = calculation.outputs.output_structure
except exceptions.NotExistent:
self.set_restart_type(RestartType.FULL, calculation.outputs.remote_folder)
self.report_error_handled(calculation, 'simply restart from the last calculation')
else:
self.set_restart_type(RestartType.FROM_SCRATCH)
self.report_error_handled(calculation, 'out of walltime: structure changed so restarting from scratch')

self.set_restart_type(RestartType.FULL, calculation.outputs.remote_folder)
self.report_error_handled(calculation, "restarting in full with `CONTROL.restart_mode` = 'restart'")

return ProcessHandlerReport(True)

Expand Down
41 changes: 20 additions & 21 deletions tests/workflows/pw/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,25 @@ def test_handle_unrecoverable_failure(generate_workchain_pw):
assert result == PwBaseWorkChain.exit_codes.ERROR_UNRECOVERABLE_FAILURE


def test_handle_out_of_walltime(generate_workchain_pw, fixture_localhost, generate_remote_data):
@pytest.mark.parametrize('structure_changed', (
True,
False,
))
def test_handle_out_of_walltime(
generate_workchain_pw, fixture_localhost, generate_remote_data, generate_structure, structure_changed
):
"""Test `PwBaseWorkChain.handle_out_of_walltime`."""
remote_data = generate_remote_data(computer=fixture_localhost, remote_path='/path/to/remote')
process = generate_workchain_pw(
exit_code=PwCalculation.exit_codes.ERROR_OUT_OF_WALLTIME, pw_outputs={'remote_folder': remote_data}
)
generate_inputs = {
'exit_code': PwCalculation.exit_codes.ERROR_OUT_OF_WALLTIME,
'pw_outputs': {
'remote_folder': generate_remote_data(computer=fixture_localhost, remote_path='/path/to/remote')
}
}
if structure_changed:
output_structure = generate_structure()
generate_inputs['pw_outputs']['output_structure'] = output_structure

process = generate_workchain_pw(**generate_inputs)
process.setup()

result = process.handle_electronic_convergence_not_reached(process.ctx.children[-1])
Expand All @@ -49,22 +62,8 @@ def test_handle_out_of_walltime(generate_workchain_pw, fixture_localhost, genera
result = process.inspect_process()
assert result.status == 0


def test_handle_out_of_walltime_structure_changed(generate_workchain_pw, generate_structure):
"""Test `PwBaseWorkChain.handle_out_of_walltime`."""
structure = generate_structure()
process = generate_workchain_pw(
exit_code=PwCalculation.exit_codes.ERROR_OUT_OF_WALLTIME, pw_outputs={'output_structure': structure}
)
process.setup()

result = process.handle_out_of_walltime(process.ctx.children[-1])
assert isinstance(result, ProcessHandlerReport)
assert process.ctx.inputs.parameters['CONTROL']['restart_mode'] == 'from_scratch'
assert result.do_break

result = process.inspect_process()
assert result.status == 0
if structure_changed:
assert process.ctx.inputs.structure == output_structure


def test_handle_electronic_convergence_not_reached(generate_workchain_pw, fixture_localhost, generate_remote_data):
Expand Down

0 comments on commit fcb8da9

Please sign in to comment.