Skip to content

Commit

Permalink
isoCluster: Ensure IPU netdevs on host are up
Browse files Browse the repository at this point in the history
Currently, we need to cold boot the IPU host anytime the IMC reboots to
ensure the netdevs are available.

Ensure the netdevs are available when isocluster deployment completes.

Signed-off-by: Salvatore Daniele <[email protected]>
  • Loading branch information
SalDaniele committed Sep 6, 2024
1 parent da2f9a2 commit 922ce5f
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions isoCluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,36 @@ def enable_acc_connectivity(node: NodeConfig) -> None:
ipu_acc.run("nmcli con mod enp0s1f0 ipv4.route-metric 0")
ipu_acc.run("ip route delete default via 192.168.0.1") # remove imc default route to avoid conflict
logger.info(f"{node.name} connectivity established")
ensure_ipu_netdevs_available(node)


# TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
def host_from_imc(imc: str) -> str:
ipu_host = imc.split('-intel-ipu-imc')[0]
return ipu_host


# TODO: Remove this workaround once rebooting the IMC no longer causes the netdevs on the IPU host to be removed
def ensure_ipu_netdevs_available(node: NodeConfig) -> None:
# This is a hack, iso_cluster deployments in general should not need to know about the x86 host they are connected to.
# However, since we need to cold boot the corresponding host, for the time being, infer this from the IMC address
# rather than requiring the user to provide this information.
ipu_host_name = host_from_imc(node.bmc)
ipu_host_bmc = host.BMC.from_bmc(ipu_host_name + "-drac.anl.eng.bos2.dc.redhat.com", "root", "calvin")
ipu_host = host.Host(host_from_imc(node.bmc), ipu_host_bmc)
ipu_host.ssh_connect("core")
ret = ipu_host.run("test -d /sys/class/net/ens2f0")
retries = 3
while ret.returncode != 0:
logger.error(f"{ipu_host.hostname()} does not have a network device ens2f0 cold booting node to try to recover")
ipu_host.cold_boot()
logger.info("Cold boot triggered, waiting for host to reboot")
time.sleep(60)
ipu_host.ssh_connect("core")
retries = retries - 1
if retries == 0:
logger.error_and_exit(f"Failed to bring up IPU net device on {ipu_host.hostname()}")
ret = ipu_host.run("test -d /sys/class/net/ens2f0")


def is_http_url(url: str) -> bool:
Expand Down

0 comments on commit 922ce5f

Please sign in to comment.