feat: Refactor GPUValidator to improve GPU quota validation testing with modular methods

Current error: the test logs and some GPU validator tests don't accept the ValueError raised when no GPU is requested.
ucsd-ets · Jul 25, 2024 · ed6e623 · ed6e623
1 parent 8c98722
commit ed6e623
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 32 deletions.
diff --git a/src/dsmlp/app/gpu_validator.py b/src/dsmlp/app/gpu_validator.py
@@ -17,35 +17,11 @@
 class GPUValidator(ComponentValidator):
 
     def __init__(self, awsed: AwsedClient, kube: KubeClient, logger: Logger) -> None:
+        self.awsed = awsed
         self.kube = kube
         self.logger = logger
-        self.awsed = awsed
-
-    def validate_pod(self, request: Request):
-        """
-        Validate pods for namespaces with the 'k8s-sync' label
-        """
-
-        # Low priority pods pass through
-        priority = request.object.spec.priorityClassName
-        if priority is not None and priority == LOW_PRIORITY_CLASS:
-            return
 
-        namespace = self.kube.get_namespace(request.namespace)
-        curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
-        awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace)
-        """
-        Use AWSED GPU quota if it is not None and greater than 0 
-        else use namespace GPU quota if it is not None and greater than 0 
-        else use 1 as default
-        """
-
-        gpu_quota = 1
-        if awsed_gpu_quota is not None and awsed_gpu_quota > 0:
-            gpu_quota = awsed_gpu_quota
-        elif namespace.gpu_quota is not None and namespace.gpu_quota > 0:
-            gpu_quota = namespace.gpu_quota
-
+    def get_ultilized_gpu(self, request: Request):
         # Calculate the number of GPUs requested for kube client
         utilized_gpus = 0
         for container in request.object.spec.containers:
@@ -58,14 +34,51 @@ def validate_pod(self, request: Request):
                 limit = int(container.resources.limits[GPU_LABEL])
             except (KeyError, AttributeError, TypeError):
                 pass
-
+            
             utilized_gpus += max(requested, limit)
-
-        # Short circuit if no GPUs requested (permits overcap)
+        
+        # Short circuit if no GPUs requested (permits overcap) or return
         if utilized_gpus == 0:
-            return
+            raise ValueError("Error: No GPUs requested.")
+        return utilized_gpus
+
+    def get_gpu_quota(self, awsed_quota, kube_client_quota):
+        """
+        Use AWSED GPU quota if it is not None and greater than 0 
+        else use namespace GPU quota if it is not None and greater than 0 
+        else use 1 as default
+        """
+
+        default_gpu_quota = 1
+        if awsed_quota is not None and awsed_quota > 0:
+            default_gpu_quota = awsed_quota
+        elif kube_client_quota is not None and kube_client_quota > 0:
+            default_gpu_quota = kube_client_quota
+        return default_gpu_quota
+
+    def validate_pod(self, request: Request):
+        """
+        Validate pods for namespaces with the 'k8s-sync' label
+        """
 
+        # Low priority pods pass through
+        priority = request.object.spec.priorityClassName
+        if priority is not None and priority == LOW_PRIORITY_CLASS:
+            return
+
+        # initialize namespace, gpu_quota from awsed, and curr_gpus
+        namespace = self.kube.get_namespace(request.namespace)
+        curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
+        awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace)
+
+        # check utilized gpu
+        utilized_gpus = self.get_ultilized_gpu(request=request)
+
+        # request gpu_quota from method
+        gpu_quota = self.get_gpu_quota(awsed_gpu_quota, namespace.gpu_quota)
+
         # Check if the total number of utilized GPUs exceeds the GPU quota
         if utilized_gpus + curr_gpus > gpu_quota:
             raise ValidationFailure(
-                f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, the quota of {gpu_quota} would be exceeded.")
+                f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, "
+                f"the quota of {gpu_quota} would be exceeded.")
diff --git a/tests/app/test_gpu_validator.py b/tests/app/test_gpu_validator.py
@@ -135,4 +135,7 @@ def test_gpu_quota_client_priority2(self):
 
         self.try_validate(
             gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 18 would be exceeded."
-        )
+        )
+
+    # --- Modular / Unit Testing for Validate Pod ---
+    def test