Skip to content

Commit

Permalink
install RHEL 9.4 kernel
Browse files Browse the repository at this point in the history
Signed-off-by: Salvatore Daniele <[email protected]>
  • Loading branch information
SalDaniele committed Jul 2, 2024
1 parent b5700da commit ad42d16
Showing 1 changed file with 48 additions and 3 deletions.
51 changes: 48 additions & 3 deletions extraConfigDpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,51 @@
MICROSHIFT_KUBECONFIG = "/var/lib/microshift/resources/kubeadmin/kubeconfig"
OSE_DOCKERFILE = "https://pkgs.devel.redhat.com/cgit/containers/dpu-operator/tree/Dockerfile?h=rhaos-4.17-rhel-9"
REPO_DIR = "/root/dpu-operator"
SRIOV_NUM_VFS = 8

KERNEL_RPMS = [
"https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-5.14.0-427.2.1.el9_4.x86_64.rpm",
"https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-core-5.14.0-427.2.1.el9_4.x86_64.rpm",
"https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-5.14.0-427.2.1.el9_4.x86_64.rpm",
"https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-core-5.14.0-427.2.1.el9_4.x86_64.rpm",
"https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-extra-5.14.0-427.2.1.el9_4.x86_64.rpm",
]


def ensure_rhel_9_4_kernel_is_installed(h: host.Host) -> None:
ret = h.run("uname -r")
if "el9_4" in ret.out:
return

logger.info(f"Installing RHEL 9.4 kernel on {h.hostname()}")

wd = "working_dir"
h.run(f"rm -rf {wd}")
h.run(f"mkdir -p {wd}")
logger.info(KERNEL_RPMS)

for e in KERNEL_RPMS:
fn = e.split("/")[-1]
cmd = f"curl -k {e} --create-dirs > {wd}/{fn}"
h.run(cmd)

logger.info("result: %s", h.run("sudo rpm-ostree").out)

cmd = f"sudo rpm-ostree override replace {wd}/*.rpm"
logger.info(cmd)
while True:
ret = h.run(cmd).out.strip().split("\n")
if len(ret) != 0 and ret[-1] == 'Run "systemctl reboot" to start a reboot':
break
else:
logger.info(ret)
logger.info("Output was something unexpected")

h.run("sudo systemctl reboot")
time.sleep(10)
h.ssh_connect("core")
ret = h.run("uname -r")
if "el9_4" not in ret.out:
logger.error_and_exit(f"Failed to install rhel 9.4 kernel on host {h.hostname()}")


def _update_dockerfile(image: str, path: str) -> None:
Expand Down Expand Up @@ -280,12 +324,13 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")

def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
logger.info(f"Manually creating vfs for host {h.hostname()}")
h.ssh_connect("core")
# Temporary workaround, remove once 4.16 installations are working
ensure_rhel_9_4_kernel_is_installed(h)
# There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
# As a result, we will need to trigger cold boots of the node until the device is available
# TODO: Remove when no longer needed
retries = 3
h.ssh_connect("core")
ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
while ret.returncode != 0:
logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
Expand Down

0 comments on commit ad42d16

Please sign in to comment.