Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
Signed-off-by: Salvatore Daniele <[email protected]>
  • Loading branch information
SalDaniele committed Jul 8, 2024
1 parent 3a73ff2 commit 68d6b84
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 69 deletions.
151 changes: 82 additions & 69 deletions extraConfigDpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,75 +310,88 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
lh = host.LocalHost()
client = K8sClient(cc.kubeconfig)

if cfg.rebuild_dpu_operators_images:
registry = build_dpu_operator_images()
else:
logger.info("Will not rebuild dpu-operator images")
registry = _ensure_local_registry_running(lh, delete_all=False)
operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"

# Need to trust the registry in OCP / Microshift
logger.info("Ensuring local registry is trusted in OCP")
reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)

h = host.Host(cc.workers[0].node)
vendor_plugin = init_vendor_plugin(h)
vendor_plugin.build_and_start(lh, client, registry)

start_dpu_operator(lh, client, operator_image, daemon_image)
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")

def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
    """Make sure the DPU net device exists on one worker, then label that node.

    Runs once per worker on a thread pool. Reads ``cfg``, ``client`` and
    ``logger`` from the enclosing scope.

    Args:
        h: Host handle for the worker; used for SSH commands and cold boots.
        node: Config entry for the same worker; ``node.name`` is the node
            name passed to ``oc label``.

    Returns:
        Always ``None``.
    """
    # Temporary workaround, remove once 4.16 installations are working
    logger.info("Ensuring Rhel 9.4 kernel is installed")
    ensure_rhel_9_4_kernel_is_installed(h)

    # There is a bug with the idpf driver that causes the IPU to fail to be
    # enumerated over PCIe on boot. As a result, we need to trigger cold boots
    # of the node until the device is available.
    # TODO: Remove when no longer needed
    retries = 3
    probe_cmd = f"test -d /sys/class/net/{cfg.dpu_net_interface}"
    h.ssh_connect("core")
    ret = h.run(probe_cmd)
    while ret.returncode != 0:
        # Give up only after the result of the most recent cold boot has been
        # checked, so that every cold boot gets a chance to fix the problem.
        if retries == 0:
            logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
        logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
        h.cold_boot()
        logger.info("Cold boot triggered, waiting for host to reboot")
        time.sleep(60)
        h.ssh_connect("core")
        retries -= 1
        ret = h.run(probe_cmd)

    # Label the node. Use the `node` argument rather than the enclosing loop
    # variable: this function runs on a worker thread, and the loop variable
    # may already refer to a different worker by the time we get here.
    logger.info(f"labeling node {h.hostname()} dpu=true")
    client.oc_run_or_die(f"label no {node.name} dpu=true")
    return None

executor = ThreadPoolExecutor(max_workers=len(cc.workers))
f = []
# Assuming that all workers have a DPU
for e in cc.workers:
logger.info(f"Calling helper function for node {e.node}")
bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
h = host.Host(e.node, bmc)
f.append(executor.submit(helper, h, e))

for thread in f:
logger.info(thread.result())

logger.info("Verified idpf is providing net-devs on DPU worker nodes")

# Create host NAD
# TODO: Remove when this is automatically created by the dpu operator
logger.info("Creating dpu NAD")
client.oc("delete -f manifests/dpu/dpu_nad.yaml")
client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
# Deploy dpu daemon and wait for dpu pods to come up
logger.info("Creating dpu operator config")
client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
time.sleep(30)
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
# if cfg.rebuild_dpu_operators_images:
# registry = build_dpu_operator_images()
# else:
# logger.info("Will not rebuild dpu-operator images")
# registry = _ensure_local_registry_running(lh, delete_all=False)
# operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
# daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"

# # Need to trust the registry in OCP / Microshift
# logger.info("Ensuring local registry is trusted in OCP")
# reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)


logger.info("creating test container")
image = "alpine:latest"
name = "ipu_host_test"
cmd = f"podman pull {image}"
lh.run_or_die(cmd)
cmd = f"podman run -d --name {name} {image} sh -c 'while true; do sleep 1; done'"
lh.run_or_die(cmd)

CONTAINER_NAME = "local-container-registry"
cmd = f"podman run -d --name {CONTAINER_NAME} -p 5000:5000 -v /root/.local-container-registry/data:/var/lib/registry:z -v /root/.local-container-registry/auth:/auth:z -v /root/.local-container-registry/certs:/certs:z -e REGISTRY_HTTP_TLS_CERTIFICATE=/certs/domain.crt -e REGISTRY_HTTP_TLS_KEY=/certs/domain.key -e REGISTRY_COMPATIBILITY_SCHEMA1_ENABLED=true --annotation=LOCAL_CONTAINER_REGISTRY_HOSTNAME=wsfd-advnetlab217.anl.eng.bos2.dc.redhat.com docker.io/library/registry:latest"
lh.run_or_die(cmd)

# h = host.Host(cc.workers[0].node)
# vendor_plugin = init_vendor_plugin(h)
# vendor_plugin.build_and_start(lh, client, registry)

# start_dpu_operator(lh, client, operator_image, daemon_image)
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")

# def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
# # Temporary workaround, remove once 4.16 installations are working
# logger.info("Ensuring Rhel 9.4 kernel is installed")
# ensure_rhel_9_4_kernel_is_installed(h)
# # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
# # As a result, we will need to trigger cold boots of the node until the device is available
# # TODO: Remove when no longer needed
# retries = 3
# h.ssh_connect("core")
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
# while ret.returncode != 0:
# logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
# h.cold_boot()
# logger.info("Cold boot triggered, waiting for host to reboot")
# time.sleep(60)
# h.ssh_connect("core")
# retries = retries - 1
# if retries == 0:
# logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")

# # Label the node
# logger.info(f"labeling node {h.hostname()} dpu=true")
# client.oc_run_or_die(f"label no {e.name} dpu=true")
# return None

# executor = ThreadPoolExecutor(max_workers=len(cc.workers))
# f = []
# # Assuming that all workers have a DPU
# for e in cc.workers:
# logger.info(f"Calling helper function for node {e.node}")
# bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
# h = host.Host(e.node, bmc)
# f.append(executor.submit(helper, h, e))

# for thread in f:
# logger.info(thread.result())

# logger.info("Verified idpf is providing net-devs on DPU worker nodes")

# # Create host nad
# # TODO: Remove when this is automatically created by the dpu operator
# logger.info("Creating dpu NAD")
# client.oc("delete -f manifests/dpu/dpu_nad.yaml")
# client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
# # Deploy dpu daemon and wait for dpu pods to come up
# logger.info("Creating dpu operator config")
# client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
# time.sleep(30)
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
logger.info("Finished setting up dpu operator on host")


Expand Down
5 changes: 5 additions & 0 deletions reglocal.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ def ensure_running(rsh: host.Host, *, delete_all: bool = False, listen_port: int
)
)

# Add logging to check the status of the container
logger.info("Checking if the container is still running")
status_ret = rsh.run(shlex.join(["podman", "ps", "-a", "--filter", f"name={CONTAINER_NAME}"]))
logger.info(f"Container status: {status_ret.out}")

return dir_name, hostname, listen_port, ret.out.strip()


Expand Down

0 comments on commit 68d6b84

Please sign in to comment.