test

Signed-off-by: Salvatore Daniele <[email protected]>
SalDaniele · Jul 8, 2024 · 68d6b84 · 68d6b84
1 parent 3a73ff2
commit 68d6b84
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 69 deletions.
diff --git a/extraConfigDpu.py b/extraConfigDpu.py
@@ -310,75 +310,88 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
     lh = host.LocalHost()
     client = K8sClient(cc.kubeconfig)
 
-    if cfg.rebuild_dpu_operators_images:
-        registry = build_dpu_operator_images()
-    else:
-        logger.info("Will not rebuild dpu-operator images")
-        registry = _ensure_local_registry_running(lh, delete_all=False)
-    operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
-    daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"
-
-    # Need to trust the registry in OCP / Microshift
-    logger.info("Ensuring local registry is trusted in OCP")
-    reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
-
-    h = host.Host(cc.workers[0].node)
-    vendor_plugin = init_vendor_plugin(h)
-    vendor_plugin.build_and_start(lh, client, registry)
-
-    start_dpu_operator(lh, client, operator_image, daemon_image)
-    client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")
-
-    def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
-        # Temporary workaround, remove once 4.16 installations are working
-        logger.info("Ensuring Rhel 9.4 kernel is installed")
-        ensure_rhel_9_4_kernel_is_installed(h)
-        # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
-        # As a result, we will need to trigger cold boots of the node until the device is available
-        # TODO: Remove when no longer needed
-        retries = 3
-        h.ssh_connect("core")
-        ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-        while ret.returncode != 0:
-            logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
-            h.cold_boot()
-            logger.info("Cold boot triggered, waiting for host to reboot")
-            time.sleep(60)
-            h.ssh_connect("core")
-            retries = retries - 1
-            if retries == 0:
-                logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
-            ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
-
-        # Label the node
-        logger.info(f"labeling node {h.hostname()} dpu=true")
-        client.oc_run_or_die(f"label no {e.name} dpu=true")
-        return None
-
-    executor = ThreadPoolExecutor(max_workers=len(cc.workers))
-    f = []
-    # Assuming that all workers have a DPU
-    for e in cc.workers:
-        logger.info(f"Calling helper function for node {e.node}")
-        bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
-        h = host.Host(e.node, bmc)
-        f.append(executor.submit(helper, h, e))
-
-    for thread in f:
-        logger.info(thread.result())
-
-    logger.info("Verified idpf is providing net-devs on DPU worker nodes")
-
-    # Create host nad
-    # TODO: Remove when this is automatically created by the dpu operator
-    logger.info("Creating dpu NAD")
-    client.oc("delete -f manifests/dpu/dpu_nad.yaml")
-    client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
-    # Deploy dpu daemon and wait for dpu pods to come up
-    logger.info("Creating dpu operator config")
-    client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
-    time.sleep(30)
-    client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
+    # if cfg.rebuild_dpu_operators_images:
+    #     registry = build_dpu_operator_images()
+    # else:
+    #     logger.info("Will not rebuild dpu-operator images")
+    #     registry = _ensure_local_registry_running(lh, delete_all=False)
+    # operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
+    # daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"
+
+    # # Need to trust the registry in OCP / Microshift
+    # logger.info("Ensuring local registry is trusted in OCP")
+    # reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
+
+
+    logger.info("creating test container")
+    image = "alpine:latest"
+    name = "ipu_host_test"
+    cmd = f"podman pull {image}"
+    lh.run_or_die(cmd)
+    cmd = f"podman run -d --name {name} {image} sh -c 'while true; do sleep 1; done'"
+    lh.run_or_die(cmd)
+
+    CONTAINER_NAME = "local-container-registry"
+    cmd = f"podman run -d --name {CONTAINER_NAME} -p 5000:5000 -v /root/.local-container-registry/data:/var/lib/registry:z -v /root/.local-container-registry/auth:/auth:z -v /root/.local-container-registry/certs:/certs:z -e REGISTRY_HTTP_TLS_CERTIFICATE=/certs/domain.crt -e REGISTRY_HTTP_TLS_KEY=/certs/domain.key -e REGISTRY_COMPATIBILITY_SCHEMA1_ENABLED=true --annotation=LOCAL_CONTAINER_REGISTRY_HOSTNAME=wsfd-advnetlab217.anl.eng.bos2.dc.redhat.com docker.io/library/registry:latest"
+    lh.run_or_die(cmd)
+
+    # h = host.Host(cc.workers[0].node)
+    # vendor_plugin = init_vendor_plugin(h)
+    # vendor_plugin.build_and_start(lh, client, registry)
+
+    # start_dpu_operator(lh, client, operator_image, daemon_image)
+    # client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")
+
+    # def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
+    #     # Temporary workaround, remove once 4.16 installations are working
+    #     logger.info("Ensuring Rhel 9.4 kernel is installed")
+    #     ensure_rhel_9_4_kernel_is_installed(h)
+    #     # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
+    #     # As a result, we will need to trigger cold boots of the node until the device is available
+    #     # TODO: Remove when no longer needed
+    #     retries = 3
+    #     h.ssh_connect("core")
+    #     ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
+    #     while ret.returncode != 0:
+    #         logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
+    #         h.cold_boot()
+    #         logger.info("Cold boot triggered, waiting for host to reboot")
+    #         time.sleep(60)
+    #         h.ssh_connect("core")
+    #         retries = retries - 1
+    #         if retries == 0:
+    #             logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
+    #         ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
+
+    #     # Label the node
+    #     logger.info(f"labeling node {h.hostname()} dpu=true")
+    #     client.oc_run_or_die(f"label no {e.name} dpu=true")
+    #     return None
+
+    # executor = ThreadPoolExecutor(max_workers=len(cc.workers))
+    # f = []
+    # # Assuming that all workers have a DPU
+    # for e in cc.workers:
+    #     logger.info(f"Calling helper function for node {e.node}")
+    #     bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
+    #     h = host.Host(e.node, bmc)
+    #     f.append(executor.submit(helper, h, e))
+
+    # for thread in f:
+    #     logger.info(thread.result())
+
+    # logger.info("Verified idpf is providing net-devs on DPU worker nodes")
+
+    # # Create host nad
+    # # TODO: Remove when this is automatically created by the dpu operator
+    # logger.info("Creating dpu NAD")
+    # client.oc("delete -f manifests/dpu/dpu_nad.yaml")
+    # client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
+    # # Deploy dpu daemon and wait for dpu pods to come up
+    # logger.info("Creating dpu operator config")
+    # client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
+    # time.sleep(30)
+    # client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
     logger.info("Finished setting up dpu operator on host")
 
 

diff --git a/reglocal.py b/reglocal.py
@@ -110,6 +110,11 @@ def ensure_running(rsh: host.Host, *, delete_all: bool = False, listen_port: int
         )
     )
 
+    # Log the `podman ps -a` entry for CONTAINER_NAME so the container's current state shows up in the logs
+    logger.info("Checking if the container is still running")
+    status_ret = rsh.run(shlex.join(["podman", "ps", "-a", "--filter", f"name={CONTAINER_NAME}"]))
+    logger.info(f"Container status: {status_ret.out}")
+
     return dir_name, hostname, listen_port, ret.out.strip()