Skip to content

Commit

Permalink
Revise the user resource selection. (#96)
Browse files Browse the repository at this point in the history
- The user can only specify the total number of MPI tasks.
- The number of nodes is automatically determined from the selected
code.
- The total number of MPI tasks is automatically adjusted
based on the selected structure.
- Warn if the user would run with more than one MPI task on the
localhost.
- Emit resource warnings based on the code and structure selection.
  • Loading branch information
csadorf authored Jul 17, 2021
1 parent 8c05ee9 commit 5324334
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 99 deletions.
2 changes: 1 addition & 1 deletion aiidalab_qe/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def _generate_report_dict(qeapp_wc):

# Properties
run_relax = builder_parameters["relax_type"] != "none"
run_bands = builder_parameters.get("run_bands", False)
run_bands = builder_parameters["run_bands"]
run_pdos = builder_parameters.get("run_pdos", False)

yield "relaxed", run_relax
Expand Down
114 changes: 74 additions & 40 deletions aiidalab_qe/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Carl Simon Adorf <[email protected]>
"""
from math import ceil
from pprint import pformat

import ipywidgets as ipw
Expand Down Expand Up @@ -251,9 +252,13 @@ def __init__(self, **kwargs):
class SubmitQeAppWorkChainStep(ipw.VBox, WizardAppWidgetStep):
"""Step for submission of a bands workchain."""

# The app will issue a warning to the user if the ratio between the total
# number of sites and the total number of CPUs is larger than this value:
MAX_NUM_SITES_PER_CPU_WARN_THRESHOLD = 6
# This number provides a rough estimate for how many MPI tasks are needed
# for a given structure.
NUM_SITES_PER_MPI_TASK_DEFAULT = 6

# Warn the user if they are trying to run calculations for a large
# structure on localhost.
RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD = 10

input_structure = traitlets.Instance(StructureData, allow_none=True)
process = traitlets.Instance(WorkChainNode, allow_none=True)
Expand All @@ -273,7 +278,9 @@ def __init__(self, **kwargs):
self._setup_builder_parameters_update()

self.codes_selector.pw.observe(self._update_state, "selected_code")
self.codes_selector.pw.observe(self._update_cpus_per_node, "selected_code")
self.codes_selector.pw.observe(
self._set_num_mpi_tasks_to_default, "selected_code"
)

self.tab = ipw.Tab(
children=[
Expand Down Expand Up @@ -361,7 +368,7 @@ def _update_state(self, _=None):
<div class="alert alert-{alert_class} alert-dismissible">
<a href="#" class="close" data-dismiss="alert" aria-label="close">&times;</a>
<span class="closebtn" onclick="this.parentElement.style.display='none';">&times;</span>
{message}
<strong>{message}</strong>
</div>"""

def _show_alert_message(self, message, alert_class="info"):
Expand All @@ -372,42 +379,72 @@ def _show_alert_message(self, message, alert_class="info"):
)
)

# NOTE(review): two `def` headers and two docstrings follow each other here and the
# indentation is gone, so this span reads like both sides of a diff rendered without
# +/- markers. Presumably `_get_default_cpus_per_node` is the outgoing helper and
# `_get_default_num_mpi_tasks` the incoming one — confirm against the repository.
def _get_default_cpus_per_node(self):
"""Determine the default number of cpus per node based on the code configuration."""
def _get_default_num_mpi_tasks(self):
"""Determine a reasonable value for the number of MPI tasks for the selected structure."""
# Both variants only compute a value when a code is currently selected.
if self.codes_selector.pw.selected_code:
selected_code = self.codes_selector.pw.selected_code
return selected_code.computer.metadata["default_mpiprocs_per_machine"]
# A missing input structure is counted as a single site.
num_sites = len(self.input_structure.sites) if self.input_structure else 1
# Sites divided by NUM_SITES_PER_MPI_TASK_DEFAULT, rounded up, never below one task.
num_mpi_tasks = max(
1, ceil(num_sites / self.NUM_SITES_PER_MPI_TASK_DEFAULT)
)
return num_mpi_tasks

# Fallback value; presumably reached when no code is selected — nesting is not
# recoverable from this rendering, TODO confirm.
return 1

# NOTE(review): indentation was lost in this rendering, so which statements belong to
# which `if` cannot be read off the text (in particular the final `_check_resources()`
# call) — confirm the nesting against the repository before reusing.
def _update_cpus_per_node(self, change):
"""Update the configured cpus per node based on the current code selection."""
# `change["new"]` is the new value carried by the change notification; a falsy
# value skips at least the first statement below.
if change["new"]:
current_value = self.resources_config.cpus_per_node.value
new_value = self._get_default_cpus_per_node()
# The widget's upper bound is moved to the code's default before the value is touched.
self.resources_config.cpus_per_node.max = new_value
if current_value != new_value:
self.resources_config.cpus_per_node.value = new_value
# Tell the user that their input was overridden.
self._show_alert_message(
"The number cpus per node was automatically adjusted to "
f"the number of cores per node for the selected code ({new_value})."
)
self._check_resources()
def _set_num_mpi_tasks_to_default(self, _=None):
    """Reset the MPI task count to its computed default and re-run the resource checks.

    The single positional argument is ignored, so the method can be called
    directly or registered as an observer callback that receives a change
    notification.
    """
    default_tasks = self._get_default_num_mpi_tasks()
    self.resources_config.num_mpi_tasks.value = default_tasks
    # Re-evaluate the warnings against the freshly assigned value.
    self._check_resources()

def _check_resources(self):
    """Check whether the currently selected resources will be sufficient and warn if not.

    Nothing is checked while no code is selected. For a code whose computer
    reports the hostname ``"localhost"``, a warning is shown and
    ``expert_mode`` is switched on when either

    * more than one MPI task is requested, or
    * the input structure has more sites than
      ``RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD``.

    Only the first matching warning is emitted.
    """
    selected_code = self.codes_selector.pw.selected_code
    if not selected_code:
        # Resolve the code before touching `.computer`: this method also runs
        # when only the structure changed, i.e. possibly before any code has
        # been chosen, and dereferencing `None` would raise AttributeError.
        return

    if selected_code.computer.get_hostname() != "localhost":
        # Both warnings below concern runs on the local host only.
        return

    num_mpi_tasks = self.resources_config.num_mpi_tasks.value
    if num_mpi_tasks > 1:
        self._show_alert_message(
            "The selected code would be executed on the local host, but "
            "the number of MPI tasks is larger than one. Please review "
            "the configuration and consider to select a code that runs "
            'on a larger system if necessary (see the "Codes & '
            'Resources" tab).',
            alert_class="warning",
        )
        self.expert_mode = True
    elif (
        self.input_structure
        and len(self.input_structure.sites)
        > self.RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD
    ):
        self._show_alert_message(
            "The selected code would be executed on the local host, but the "
            "number of sites of the selected structure is relatively large. "
            "Consider to select a code that runs on a larger system if "
            'necessary (see the "Codes & Resources" tab).',
            alert_class="warning",
        )
        self.expert_mode = True

def _get_cpus_per_node(self):
    """Return the default number of MPI processes per machine for the selected code.

    The value is read from the ``default_mpiprocs_per_machine`` entry of the
    metadata of the selected code's computer. Without a selected code the
    result is 1.
    """
    code = self.codes_selector.pw.selected_code
    if not code:
        return 1
    return code.computer.metadata["default_mpiprocs_per_machine"]

def _determine_resources(self):
    """Derive the machine count and the tasks per machine from the requested MPI tasks.

    Returns:
        A dict with the keys ``num_machines`` and ``num_mpiprocs_per_machine``.
    """
    node_capacity = self._get_cpus_per_node()
    requested_tasks = self.resources_config.num_mpi_tasks.value

    # As many machines as are needed to host every task, but at least one.
    machines = max(1, ceil(requested_tasks / node_capacity))
    # Spread the tasks evenly over the machines, rounding up so none is dropped.
    tasks_per_machine = ceil(requested_tasks / machines)

    return dict(
        num_machines=machines,
        num_mpiprocs_per_machine=tasks_per_machine,
    )

@traitlets.observe("state")
def _observe_state(self, change):
Expand All @@ -422,7 +459,7 @@ def _observe_state(self, change):
def _observe_input_structure(self, change):
self.set_trait("builder_parameters", self._default_builder_parameters())
self._update_state()
self._check_resources()
self._set_num_mpi_tasks_to_default()

@traitlets.observe("process")
def _observe_process(self, change):
Expand Down Expand Up @@ -550,10 +587,7 @@ def submit(self, _=None):
if not run_bands:
builder.pop("bands")

resources = {
"num_machines": self.resources_config.number_of_nodes.value,
"num_mpiprocs_per_machine": self.resources_config.cpus_per_node.value,
}
resources = self._determine_resources()
update_resources(builder, resources)

self.process = submit(builder)
Expand Down
73 changes: 15 additions & 58 deletions aiidalab_qe/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,82 +231,39 @@ def _observe_node(self, change):
display(viewer(change["new"]))


# NOTE(review): this span carries two `class ResourceSelectionWidget` headers (an
# `ipw.HBox` base and an `ipw.VBox` base) and two sets of attribute names, with no
# indentation, so it reads like both sides of a diff rendered without +/- markers.
# Presumably the VBox variant with the single `num_mpi_tasks` field is the incoming
# one — confirm against the repository before reusing any of it.
class ResourceSelectionWidget(ipw.HBox):
"""Widget for the selection of compute (CPU) resources."""
class ResourceSelectionWidget(ipw.VBox):
"""Widget for the selection of compute resources."""

resource_selection_title = ipw.HTML(
title = ipw.HTML(
"""<div style="padding-top: 0px; padding-bottom: 0px">
<h4>Resources</h4>
</div>"""
)
resource_selection_prompt = ipw.HTML(
"Select the compute resources for this calculation."
)
resource_selection_help = ipw.HTML(
prompt = ipw.HTML(
"""<div style="line-height:120%; padding-top:0px">
<p style="padding-bottom:10px">
Select the amount of resources you want to use for the calculations.
Although specifying the optimal configuration of resources is a complex issue, in general:
</p>
<ul>
<li>Increase the number of nodes if you run out of memory for larger structures.</li>
<li>Increase the number of nodes and cores if you want to reduce the total runtime.</li>
</ul>
<p>Note that the amount of resources is limited by the computer on
which you are running the calculations on. The default computer
(localhost) is only suitable for single core operations which will be
insufficient for larger calculations. Make sure to setup a code for
these by clicking on "Setup new code".</p></div>"""
Specify the number of MPI tasks for this calculation.
In general, larger structures will require a larger number of tasks.
</p></div>"""
)

# Builds the integer input widgets (all starting at 1 with a lower bound of 1) and
# passes the child layout to the parent box constructor.
def __init__(self, **kwargs):
extra = {
"style": {"description_width": "150px"},
"layout": {"max_width": "200px"},
# "layout": {"max_width": "200px"},
"layout": {"min_width": "310px"},
}
self.number_of_nodes = ipw.BoundedIntText(
value=1, step=1, min=1, description="# nodes", disabled=False, **extra
)
self.cpus_per_node = ipw.BoundedIntText(
value=1, step=1, min=1, max=1, description="# cpus per node", **extra
)
self.total_num_cpus = ipw.BoundedIntText(
value=1, step=1, min=1, description="# total cpus", disabled=True, **extra
)

# Update the total # of CPUs int text:
self.number_of_nodes.observe(self._update_total_num_cpus, "value")
self.cpus_per_node.observe(self._update_total_num_cpus, "value")
self.num_mpi_tasks = ipw.BoundedIntText(
value=1, step=1, min=1, description="# MPI tasks", **extra
)

super().__init__(
children=[
ipw.VBox(
children=[
self.resource_selection_title,
ipw.HBox(
children=[
self.resource_selection_help,
ipw.VBox(
children=[
self.number_of_nodes,
self.cpus_per_node,
self.total_num_cpus,
],
layout=ipw.Layout(min_width="310px"),
),
]
),
]
)
self.title,
ipw.HBox(children=[self.prompt, self.num_mpi_tasks]),
]
)

# Observer callback registered on the node and cpus-per-node inputs: keeps the
# disabled total field equal to their product.
def _update_total_num_cpus(self, change):
self.total_num_cpus.value = (
self.number_of_nodes.value * self.cpus_per_node.value
)

# Sets the numeric inputs back to 1 while trait notifications are held.
def reset(self):
with self.hold_trait_notifications():
self.number_of_nodes.value = 1
self.cpus_per_node.value = 1
self.num_mpi_tasks.value = 1

0 comments on commit 5324334

Please sign in to comment.