testing 03

Cray-HPE · Nov 21, 2024 · 75e50a0 · 75e50a0
1 parent d401691
commit 75e50a0
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 16 deletions.
diff --git a/goss-testing/scripts/check_iuf_abort.sh b/goss-testing/scripts/check_iuf_abort.sh
@@ -23,6 +23,6 @@
 # OTHER DEALINGS IN THE SOFTWARE.
 #
 
-python3 /opt/cray/tests/install/ncn/scripts/python/iuf_run.py /opt/cray/tests/install/ncn/scripts/python/iuf_run_setup test-activity & \
+python3 /opt/cray/tests/install/ncn/scripts/python/iuf_run /opt/cray/tests/install/ncn/scripts/iuf_run_setup test-activity & \
 sleep 10 && \
 iuf -a test-activity abort -f
diff --git a/goss-testing/tests/ncn/goss-iuf-activity-resume.yaml b/goss-testing/tests/ncn/goss-iuf-activity-resume.yaml
@@ -43,15 +43,15 @@ command:
           "{{$iuf_pre_checks}}" "{{ index $pre_checks 0 }}" "{{ index $pre_checks 1 }}" "{{ index $pre_checks 2 }}" "{{ index $pre_checks 3 }}" "{{ index $pre_checks 4 }}" "{{ index $pre_checks 5 }}" && \
 
           # Run an IUF session and abort it
-          "{{$logrun}}" -l "{{$testlabel}}"  "{{$iuf_abort}}" && \
+          "{{$iuf_abort}}" && \
           sleep 30 && \
-          "{{$logrun}}" -l "{{$testlabel}}" /usr/bin/iuf -a test-activity resume
+          /usr/bin/iuf -a test-activity resume
 
           # Capture the exit code of the previous step
           run_exit_code=$?
 
           # Clean up the workflows, log and media dir
-          "{{$logrun}}" -l "{{$testlabel}}" "{{$iuf_cleanup}}"
+          "{{$iuf_cleanup}}"
 
           # Exit with code 1 if the iuf_run command failed with exit code 1
           if [ $run_exit_code -eq 1 ]; then

diff --git a/src/csm_testing/tests/check_workflow_template/__main__.py b/src/csm_testing/tests/check_workflow_template/__main__.py
@@ -25,9 +25,12 @@
 This script checks workflow templates.
 """
 
+import json
+import subprocess
 import sys
 import time
 from kubernetes import client, config
+from kubernetes.client.rest import ApiException
 import yaml
 
 def is_main_container_finished(pod_name, namespace):
@@ -57,6 +60,7 @@ def check_and_kill_pod(workflow_name, namespace):
             print(f"INFO: Main container finished in pod {pod_name}. Deleting pod.")
             v1.delete_namespaced_pod(name=pod_name, namespace=namespace)
             print(f"INFO: Pod {pod_name} deleted.")
+            return True
         else:
             print(f"INFO: Main container still running in pod {pod_name}.")
 
@@ -68,17 +72,98 @@ def update_image_version_in_template(workflow_template_str,old_image_prefix,old_
     )
     return updated_workflow_template_str
 
+def wait_for_workflow_to_succeed(namespace, workflow_name, timeout=6000, interval=20):
+    """
+    Poll a workflow's phase until it finishes or the timeout is reached.
+
+    The phase is read with `kubectl get workflow ... -o jsonpath={.status.phase}`.
+    While the phase is "Running", check_and_kill_pod() is consulted; when it
+    returns a truthy value (it deleted a pod whose main container had already
+    finished) the workflow is treated as done.
+
+    Args:
+        namespace: Namespace that contains the workflow.
+        workflow_name: Name of the workflow to poll.
+        timeout: Maximum number of seconds to keep polling.
+        interval: Number of seconds to sleep between polls.
+
+    Returns:
+        True if the phase became "Succeeded", or if the workflow was "Running"
+        and check_and_kill_pod() reported the main container as finished.
+        False if the phase became "Failed" or "Error", if kubectl could not be
+        started, if the Kubernetes API raised an ApiException, or if the
+        timeout expired.
+    """
+    command = [
+        "kubectl", "get", "workflow", workflow_name, "-n", namespace,
+        "-o", "jsonpath={.status.phase}",
+    ]
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+        except OSError as e:
+            # kubectl is missing or not executable; polling again cannot help.
+            print(f"ERROR: Unable to run kubectl: {e}")
+            return False
+
+        if result.returncode != 0:
+            # Report why the status is empty instead of silently polling until
+            # the timeout. Keep polling because the failure may be transient.
+            print(f"WARNING: kubectl exited with code {result.returncode}: {result.stderr.strip()}")
+        status = result.stdout.strip()
+
+        # Check the status of the workflow
+        print(f"Workflow {workflow_name} status: {status}")
+
+        if status == "Succeeded":
+            print(f"INFO: Workflow {workflow_name} succeeded.")
+            return True
+        if status in ("Failed", "Error"):
+            print(f"ERROR: Workflow {workflow_name} failed with status {status}.")
+            return False
+        if status == "Running":
+            print("INFO: Check if main is done")
+            try:
+                # Only this call goes through the Kubernetes client, so it is
+                # the only place an ApiException can originate.
+                main_done = check_and_kill_pod(workflow_name, namespace)
+            except ApiException as e:
+                print(f"ERROR: Exception when fetching workflow status: {e}")
+                return False
+            if main_done:
+                print(f"WARNING: Workflow {workflow_name} is running but main container is done.")
+                return True
+
+        # Wait before polling again
+        time.sleep(interval)
+
+    print(f"ERROR: Workflow {workflow_name} did not complete within {timeout} seconds.")
+    return False
+
+def delete_resources(namespace, workflow_name, workflow_template_name):
+    """
+    Delete the workflow, workflow template, and associated pods.
+
+    The workflow and workflow template are deleted first with `kubectl delete`.
+    Pods in the namespace whose ownerReferences name the workflow are then
+    listed and deleted one by one.
+
+    Args:
+        namespace: Namespace that contains the resources.
+        workflow_name: Name of the workflow to delete; also used to find pods.
+        workflow_template_name: Name of the workflow template to delete.
+
+    Exits the process with status 1 if kubectl cannot be started, if any
+    kubectl command fails, or if the pod list cannot be parsed as JSON.
+    """
+    try:
+        # Delete the workflow and workflow template
+        for resource, name in [("workflow", workflow_name), ("workflowtemplate", workflow_template_name)]:
+            print(f"INFO: Deleting {resource} {name}...")
+            subprocess.run(
+                ["kubectl", "delete", resource, name, "-n", namespace],
+                check=True
+            )
+            print(f"INFO: {resource.capitalize()} {name} deleted successfully.")
+
+        # Find pods owned by the workflow
+        print(f"INFO: Finding pods owned by workflow {workflow_name}...")
+        pods_json = subprocess.run(
+            ["kubectl", "get", "pods", "-n", namespace, "-o", "json"],
+            check=True, stdout=subprocess.PIPE, universal_newlines=True
+        ).stdout
+        pod_names = [
+            pod["metadata"]["name"]
+            for pod in json.loads(pods_json).get("items") or []
+            # "or" guards tolerate keys that are present but null.
+            if any(
+                owner.get("name") == workflow_name
+                for owner in (pod.get("metadata") or {}).get("ownerReferences") or []
+            )
+        ]
+
+        # Delete the found pods
+        for pod_name in pod_names:
+            print(f"INFO: Deleting pod {pod_name}...")
+            # The owning workflow was deleted above, so Kubernetes garbage
+            # collection may remove the pod before this command runs.
+            # --ignore-not-found keeps that race from failing the cleanup.
+            subprocess.run(
+                ["kubectl", "delete", "pod", pod_name, "-n", namespace, "--ignore-not-found"],
+                check=True
+            )
+            print(f"INFO: Pod {pod_name} deleted successfully.")
+
+        if not pod_names:
+            print(f"INFO: No pods found for workflow {workflow_name}.")
+    except subprocess.CalledProcessError as e:
+        print(f"ERROR: Failed to delete resources: {e}")
+        sys.exit(1)
+    except OSError as e:
+        # kubectl is missing or not executable.
+        print(f"ERROR: Failed to delete resources: {e}")
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"ERROR: Failed to parse pod list: {e}")
+        sys.exit(1)
+
+
 def main():
-    if len(sys.argv) != 3:
+    if len(sys.argv) != 4:
         print("Usage: python script.py <workflow_template_file_path> <workflow_file_path> <new_image_version>")
         sys.exit(1)
-
+    
     workflow_template_file = sys.argv[1]
     workflow_file = sys.argv[2]
     new_version= sys.argv[3]
     # Load kubeconfig (adjust if running inside a cluster)
     config.load_kube_config()
-
+    
     with open(workflow_template_file, 'r') as stream:
         workflow_template_str = stream.read()
 
@@ -119,7 +204,7 @@ def main():
             plural=plural,
             body=workflow_template_dict
         )
-        print(f"INFO: WorkflowTemplate created. Status: {api_response}")
+        print(f"INFO: WorkflowTemplate created.")
     except client.exceptions.ApiException as e:
         if e.status == 409:
             print("INFO: WorkflowTemplate already exists, updating it...")
@@ -131,7 +216,7 @@ def main():
                 name=workflow_template_dict['metadata']['name'],
                 body=workflow_template_dict
             )
-            print(f"INFO: WorkflowTemplate updated. Status: {api_response}")
+            print(f"INFO: WorkflowTemplate updated.")
         else:
             print(f"ERROR: Exception when creating WorkflowTemplate: {e}")
             sys.exit(1)
@@ -149,12 +234,26 @@ def main():
     except client.exceptions.ApiException as e:
         print(f"ERROR: Exception when submitting workflow: {e}")
         sys.exit(1)
+    workflow_name = api_response.get("metadata", {}).get("name")
 
-    # Check if the workflow template is iuf-base-template and handle pod cleanup
-    if workflow_template_dict['metadata']['name'] == "iuf-base-template":
-        workflow_name = workflow_dict['metadata']['name']
-        time.sleep(10)  # Wait for the pod to be created
-        check_and_kill_pod(workflow_name, namespace)
+    if not workflow_name:
+        print("ERROR: Workflow name not found in the response.")
+        sys.exit(1)
+
+    workflow_template_name = workflow_template_dict['metadata']['name']
+
+    print(f"INFO: Waiting for workflow {workflow_name} to succeed...")
+    # Wait for the workflow to succeed
+    time.sleep(15)
+    workflow_status = wait_for_workflow_to_succeed( namespace, workflow_name)
+
+    if workflow_status:
+        print("INFO: Workflow completed successfully, proceeding to next step.")
+    else:
+        print("ERROR: Workflow did not succeed, aborting.")
+        sys.exit(1)
+
+    delete_resources(namespace,workflow_name,workflow_template_name)
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/src/csm_testing/tests/iuf_cleanup/__main__.py b/src/csm_testing/tests/iuf_cleanup/__main__.py
@@ -93,7 +93,7 @@ def cleanup(activity_name = "test-activity"):
         if isinstance(configmaps, list) and all(isinstance(cm, str) for cm in configmaps) :
             print(f"INFO: configmaps found for {activity_name} :{configmaps}")
             for configmap in configmaps:
-                command_delete_configmap = f"kubectl delete workflow {configmap} -n argo"
+                command_delete_configmap = f"kubectl delete configmap {configmap} -n argo"
                 try : 
                     result = subprocess.run(command_delete_configmap, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
                     print("INFO: Command output:", result.stdout)

diff --git a/src/csm_testing/tests/iuf_stages/__main__.py b/src/csm_testing/tests/iuf_stages/__main__.py
@@ -119,6 +119,8 @@ def check_product_data(dummy_product):
     return True
 
 def run_iuf_script(*args):
+    tar_dir = sys.argv[1]
+    ACTIVITY_NAME = sys.argv[2]
     try:
         subprocess.run(['python3', '/opt/cray/tests/install/ncn/scripts/python/iuf_run', tar_dir,ACTIVITY_NAME], check=True)
     except subprocess.CalledProcessError as e:
@@ -128,6 +130,8 @@ def run_iuf_script(*args):
 
 def process_media(*args):
     global test_cases
+    tar_dir = sys.argv[1]
+    ACTIVITY_NAME = sys.argv[2]
     # Run process-media using iuf_run.py and execute tests
     if run_iuf_script(tar_dir,ACTIVITY_NAME):
         folder_to_check = "/etc/cray/upgrade/csm/automation-tests/dummy-1.0.0"