Skip to content

Commit

Permalink
Add support for DRAC clusters to the mila code command [MT-71] (#85)
Browse files Browse the repository at this point in the history
* Begin working on `mila code` on DRAC

Signed-off-by: Fabrice Normandin <[email protected]>

* `mila code` works on narval (not perfect though)

Signed-off-by: Fabrice Normandin <[email protected]>

* Add `cluster` argument to `qualified` function

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix `Literal` import

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix import order and typing error

Signed-off-by: Fabrice Normandin <[email protected]>

* Add a check for --account alloc when cluster"mila"

Signed-off-by: Fabrice Normandin <[email protected]>

* cd to $SCRATCH before running salloc

Signed-off-by: Fabrice Normandin <[email protected]>

* [wip] Try to move to $SCRATCH before sbatch

Signed-off-by: Fabrice Normandin <[email protected]>

* Show hostname, use `module load` after salloc

Signed-off-by: Fabrice Normandin <[email protected]>

* Simplify reconnecting to same node, no code-server

Signed-off-by: Fabrice Normandin <[email protected]>

* "Synchronize" VsCode extensions in subprocess

Signed-off-by: Fabrice Normandin <[email protected]>

* Only transfer missing extensions

Signed-off-by: Fabrice Normandin <[email protected]>

* A bit of code cleanup

Signed-off-by: Fabrice Normandin <[email protected]>

* Skip transfering extensions if already done.

Signed-off-by: Fabrice Normandin <[email protected]>

* Improve comments

Signed-off-by: Fabrice Normandin <[email protected]>

* Re-make the syncing happen in the background

Signed-off-by: Fabrice Normandin <[email protected]>

* Parametrize the paths a bit more

Signed-off-by: Fabrice Normandin <[email protected]>

* Improve the printed messages a bit

Signed-off-by: Fabrice Normandin <[email protected]>

* Reduce duplication of hard-coded cluster names

Signed-off-by: Fabrice Normandin <[email protected]>

* Minor (mostly typing-related) improvements

Signed-off-by: Fabrice Normandin <[email protected]>

* Move fixtures from test_remote.py to conftest

Signed-off-by: Fabrice Normandin <[email protected]>

* Change signature and improve copying of extensions

Signed-off-by: Fabrice Normandin <[email protected]>

* Add tests for the copying of VsCode extensions

Signed-off-by: Fabrice Normandin <[email protected]>

* Add test for packing of missing vscode extension

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix tests failing due to change in help str

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix tests failing due to added `cd $SCRATCH`

Signed-off-by: Fabrice Normandin <[email protected]>

* Make change to `hide` arg consistent in remote.py

Signed-off-by: Fabrice Normandin <[email protected]>

* Add test for `get_qualified_computenode_hostname`

Signed-off-by: Fabrice Normandin <[email protected]>

* Add missing `niagara` cluster

Signed-off-by: Fabrice Normandin <[email protected]>

* Remove niagara cluster

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix bug with shutil.copytree in python=3.7

Signed-off-by: Fabrice Normandin <[email protected]>

* Slightly simplify construction path based on home

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix compute node hostname issue on mila cluster

Signed-off-by: Fabrice Normandin <[email protected]>

* Apply suggestions from code review

Co-authored-by: satyaog <[email protected]>

* Remove hard-coded value for DRAC clusters

Signed-off-by: Fabrice Normandin <[email protected]>

* Remove commented lines and change to ensure_alloc

Signed-off-by: Fabrice Normandin <[email protected]>

* Use `filecmp` module to compare dirs

Signed-off-by: Fabrice Normandin <[email protected]>

* Make test_remote.py checks less strict

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix bug with mock_calls in py3.7

Signed-off-by: Fabrice Normandin <[email protected]>

* Add more colour to logs and mention the background

Signed-off-by: Fabrice Normandin <[email protected]>

* Add small note to comment

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix duplicate imports and such from rebase

Signed-off-by: Fabrice Normandin <[email protected]>

* Fix a bug in test_get_output for py3.7

Signed-off-by: Fabrice Normandin <[email protected]>

---------

Signed-off-by: Fabrice Normandin <[email protected]>
Signed-off-by: Fabrice Normandin <[email protected]>
Co-authored-by: satyaog <[email protected]>
  • Loading branch information
lebrice and satyaog authored Jan 23, 2024
1 parent aa953ab commit 6363e05
Show file tree
Hide file tree
Showing 15 changed files with 1,116 additions and 171 deletions.
119 changes: 95 additions & 24 deletions milatools/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import time
import traceback
import typing
import warnings
import webbrowser
from argparse import ArgumentParser, _HelpAction
from contextlib import ExitStack
Expand All @@ -26,6 +27,8 @@
import questionary as qn
from typing_extensions import TypedDict

from milatools.cli.vscode_utils import copy_vscode_extensions_to_remote

from ..version import version as mversion
from .init_command import (
print_welcome_message,
Expand All @@ -39,12 +42,16 @@
from .profile import ensure_program, setup_profile
from .remote import Remote, SlurmRemote
from .utils import (
CLUSTERS,
Cluster,
CommandNotFoundError,
MilatoolsUserError,
SSHConnectionError,
T,
get_fully_qualified_hostname_of_compute_node,
get_fully_qualified_name,
qualified,
make_process,
no_internet_on_compute_nodes,
randname,
running_inside_WSL,
with_control_file,
Expand All @@ -53,6 +60,7 @@
if typing.TYPE_CHECKING:
from typing_extensions import Unpack


logger = get_logger(__name__)


Expand Down Expand Up @@ -116,7 +124,6 @@ def mila():
version=f"milatools v{mversion}",
help="Milatools version",
)

subparsers = parser.add_subparsers(required=True, dest="<command>")

# ----- mila docs ------
Expand Down Expand Up @@ -181,6 +188,12 @@ def mila():
code_parser.add_argument(
"PATH", help="Path to open on the remote machine", type=str
)
code_parser.add_argument(
"--cluster",
choices=CLUSTERS,
default="mila",
help="Which cluster to connect to.",
)
code_parser.add_argument(
"--alloc",
nargs=argparse.REMAINDER,
Expand Down Expand Up @@ -448,6 +461,7 @@ def code(
job: str | None,
node: str | None,
alloc: Sequence[str],
cluster: Cluster = "mila",
):
"""Open a remote VSCode session on a compute node.
Expand All @@ -461,36 +475,89 @@ def code(
alloc: Extra options to pass to slurm
"""
here = Local()
remote = Remote("mila")
remote = Remote(cluster)

if cluster != "mila" and job is None and node is None:
if not any("--account" in flag for flag in alloc):
warnings.warn(
T.orange(
"Warning: When using the DRAC clusters, you usually need to "
"specify the account to use when submitting a job. You can specify "
"this in the job resources with `--alloc`, like so: "
"`--alloc --account=<account_to_use>`, for example:\n"
f"mila code {path} --cluster {cluster} --alloc "
f"--account=your-account-here"
)
)

if command is None:
command = os.environ.get("MILATOOLS_CODE_COMMAND", "code")

try:
check_disk_quota(remote)
except MilatoolsUserError:
raise
except Exception as exc:
logger.warning(f"Unable to check the disk-quota on the cluster: {exc}")

cnode = _find_allocation(
remote, job_name="mila-code", job=job, node=node, alloc=alloc
)
if persist:
cnode = cnode.persist()
data, proc = cnode.ensure_allocation()
if remote.hostname != "graham": # graham doesn't use lustre for $HOME
try:
check_disk_quota(remote)
except MilatoolsUserError:
raise
except Exception as exc:
logger.warning(f"Unable to check the disk-quota on the cluster: {exc}")

vscode_extensions_folder = Path.home() / ".vscode/extensions"
if vscode_extensions_folder.exists() and no_internet_on_compute_nodes(cluster):
# Sync the VsCode extensions from the local machine over to the target cluster.
print(
T.bold_cyan(
f"Copying VSCode extensions from local machine to {cluster} in the "
f"background..."
)
)
# Async:
copy_vscode_extensions_process = make_process(
copy_vscode_extensions_to_remote,
cluster,
vscode_extensions_folder,
)
copy_vscode_extensions_process.start()

# Sync:
# copy_vscode_extensions_to_remote(
# cluster,
# local_vscode_extensions_dir=vscode_extensions_folder,
# remote=remote,
# )

if node is None:
cnode = _find_allocation(
remote,
job_name="mila-code",
job=job,
node=node,
alloc=alloc,
cluster=cluster,
)
if persist:
cnode = cnode.persist()

node_name = data["node_name"]
data, proc = cnode.ensure_allocation()

node_name = data["node_name"]
else:
node_name = node
proc = None
data = None

if not path.startswith("/"):
# Get $HOME because we have to give the full path to code
home = remote.home()
path = "/".join([home, path])
path = home if path == "." else f"{home}/{path}"

command_path = shutil.which(command)
if not command_path:
raise CommandNotFoundError(command)
qualified_node_name = qualified(node_name)

# NOTE: Since we have the config entries for the DRAC compute nodes, there is no
# need to use the fully qualified hostname here.
if cluster == "mila":
node_name = get_fully_qualified_hostname_of_compute_node(node_name)

# Try to detect if this is being run from within the Windows Subsystem for Linux.
# If so, then we run `code` through a powershell.exe command to open VSCode without
Expand All @@ -504,15 +571,15 @@ def code(
"code",
"-nw",
"--remote",
f"ssh-remote+{qualified_node_name}",
f"ssh-remote+{node_name}",
path,
)
else:
here.run(
command_path,
"-nw",
"--remote",
f"ssh-remote+{qualified_node_name}",
f"ssh-remote+{node_name}",
path,
)
print(
Expand All @@ -532,6 +599,7 @@ def code(
print("To reconnect to this node:")
print(T.bold(f" mila code {path} --node {node_name}"))
print("To kill this allocation:")
assert data is not None
assert "jobid" in data
print(T.bold(f" ssh mila scancel {data['jobid']}"))

Expand Down Expand Up @@ -881,6 +949,7 @@ def _standard_server(
node=node,
job=job,
alloc=alloc,
cluster="mila",
)

patterns = {
Expand Down Expand Up @@ -949,7 +1018,7 @@ def _standard_server(

local_proc, local_port = _forward(
local=Local(),
node=qualified(node_name),
node=get_fully_qualified_hostname_of_compute_node(node_name, cluster="mila"),
to_forward=to_forward,
options=options,
port=port,
Expand Down Expand Up @@ -991,7 +1060,7 @@ def _get_disk_quota_usage(
# uid 1471600598 is using default file quota setting
#
home_disk_quota_output = remote.get_output(
"lfs quota -u $USER /home/mila", hide=not print_command_output
"lfs quota -u $USER $HOME", hide=not print_command_output
)
lines = home_disk_quota_output.splitlines()
(
Expand Down Expand Up @@ -1072,13 +1141,14 @@ def _find_allocation(
node: str | None,
job: str | None,
alloc: Sequence[str],
cluster: Cluster = "mila",
job_name: str = "mila-tools",
):
if (node is not None) + (job is not None) + bool(alloc) > 1:
exit("ERROR: --node, --job and --alloc are mutually exclusive")

if node is not None:
node_name = qualified(node)
node_name = get_fully_qualified_hostname_of_compute_node(node, cluster=cluster)
return Remote(node_name)

elif job is not None:
Expand All @@ -1090,6 +1160,7 @@ def _find_allocation(
return SlurmRemote(
connection=remote.connection,
alloc=alloc,
hostname=remote.hostname,
)


Expand Down
Loading

0 comments on commit 6363e05

Please sign in to comment.