From ad42d16b200099c637fd305a490b64937996cb9d Mon Sep 17 00:00:00 2001 From: Salvatore Daniele Date: Mon, 1 Jul 2024 17:30:21 -0400 Subject: [PATCH] install RHEL 9.4 kernel Signed-off-by: Salvatore Daniele --- extraConfigDpu.py | 51 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/extraConfigDpu.py b/extraConfigDpu.py index cd418722..bcc8d5c9 100644 --- a/extraConfigDpu.py +++ b/extraConfigDpu.py @@ -18,7 +18,51 @@ MICROSHIFT_KUBECONFIG = "/var/lib/microshift/resources/kubeadmin/kubeconfig" OSE_DOCKERFILE = "https://pkgs.devel.redhat.com/cgit/containers/dpu-operator/tree/Dockerfile?h=rhaos-4.17-rhel-9" REPO_DIR = "/root/dpu-operator" -SRIOV_NUM_VFS = 8 + +KERNEL_RPMS = [ + "https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-5.14.0-427.2.1.el9_4.x86_64.rpm", + "https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-core-5.14.0-427.2.1.el9_4.x86_64.rpm", + "https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-5.14.0-427.2.1.el9_4.x86_64.rpm", + "https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-core-5.14.0-427.2.1.el9_4.x86_64.rpm", + "https://download.devel.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-modules-extra-5.14.0-427.2.1.el9_4.x86_64.rpm", +] + + +def ensure_rhel_9_4_kernel_is_installed(h: host.Host) -> None: + ret = h.run("uname -r") + if "el9_4" in ret.out: + return + + logger.info(f"Installing RHEL 9.4 kernel on {h.hostname()}") + + wd = "working_dir" + h.run(f"rm -rf {wd}") + h.run(f"mkdir -p {wd}") + logger.info(KERNEL_RPMS) + + for e in KERNEL_RPMS: + fn = e.split("/")[-1] + cmd = f"curl -k {e} --create-dirs > {wd}/{fn}" + h.run(cmd) + + logger.info("result: %s", h.run("sudo rpm-ostree").out) + + cmd = f"sudo rpm-ostree override replace {wd}/*.rpm" + logger.info(cmd) + while True: + ret = h.run(cmd).out.strip().split("\n") + if len(ret) != 0 and ret[-1] == 'Run "systemctl reboot" to start a reboot': + break + else: + logger.info(ret) + logger.info("Output was something unexpected") + + h.run("sudo systemctl reboot") + time.sleep(10) + h.ssh_connect("core") + ret = h.run("uname -r") + if "el9_4" not in ret.out: + logger.error_and_exit(f"Failed to install rhel 9.4 kernel on host {h.hostname()}") def _update_dockerfile(image: str, path: str) -> None: @@ -280,12 +324,13 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m") def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]: - logger.info(f"Manually creating vfs for host {h.hostname()}") + h.ssh_connect("core") + # Temporary workaround, remove once 4.16 installations are working + ensure_rhel_9_4_kernel_is_installed(h) # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot # As a result, we will need to trigger cold boots of the node until the device is available # TODO: Remove when no longer needed retries = 3 - h.ssh_connect("core") ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}") while ret.returncode != 0: logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")