Skip to content

Commit

Permalink
feat: Refactor GPUValidator to improve GPU quota validation testing w…
Browse files Browse the repository at this point in the history
…ith modular methods

Current error: the test logs and some GPU validator tests do not accept the ValueError that is raised when no GPU is requested.
  • Loading branch information
trn024 committed Jul 25, 2024
1 parent 8c98722 commit ed6e623
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 32 deletions.
75 changes: 44 additions & 31 deletions src/dsmlp/app/gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,35 +17,11 @@
class GPUValidator(ComponentValidator):

def __init__(self, awsed: AwsedClient, kube: KubeClient, logger: Logger) -> None:
    """
    Store the collaborators used by the validator.

    Args:
        awsed: client queried for the user's GPU quota.
        kube: client queried for the namespace and the GPUs in the namespace.
        logger: logger kept on the instance.
    """
    # Each collaborator is assigned exactly once (the duplicate
    # ``self.awsed`` assignment was redundant).
    self.awsed = awsed
    self.kube = kube
    self.logger = logger

def validate_pod(self, request: Request):
"""
Validate pods for namespaces with the 'k8s-sync' label
"""

# Low priority pods pass through
priority = request.object.spec.priorityClassName
if priority is not None and priority == LOW_PRIORITY_CLASS:
return

namespace = self.kube.get_namespace(request.namespace)
curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace)
"""
Use AWSED GPU quota if it is not None and greater than 0
else use namespace GPU quota if it is not None and greater than 0
else use 1 as default
"""

gpu_quota = 1
if awsed_gpu_quota is not None and awsed_gpu_quota > 0:
gpu_quota = awsed_gpu_quota
elif namespace.gpu_quota is not None and namespace.gpu_quota > 0:
gpu_quota = namespace.gpu_quota

def get_ultilized_gpu(self, request: Request):
# Calculate the number of GPUs requested for kube client
utilized_gpus = 0
for container in request.object.spec.containers:
Expand All @@ -58,14 +34,51 @@ def validate_pod(self, request: Request):
limit = int(container.resources.limits[GPU_LABEL])
except (KeyError, AttributeError, TypeError):
pass

utilized_gpus += max(requested, limit)

# Short circuit if no GPUs requested (permits overcap)
# Short circuit if no GPUs requested (permits overcap) or return
if utilized_gpus == 0:
return
raise ValueError("Error: No GPUs requested.")
return utilized_gpus

def get_gpu_quota(self, awsed_quota, kube_client_quota):
    """
    Pick the effective GPU quota from the two candidate values.

    Candidates are checked in order: awsed_quota first, then
    kube_client_quota. The first one that is not None and greater
    than 0 is returned; if neither qualifies the result is 1.
    """
    for candidate in (awsed_quota, kube_client_quota):
        if candidate is not None and candidate > 0:
            return candidate

    # Neither source supplied a usable quota: fall back to the default.
    return 1

def validate_pod(self, request: Request):
    """
    Validate pods for namespaces with the 'k8s-sync' label

    Low priority pods and pods that request no GPUs pass through
    (returning None). Otherwise the GPUs requested by the pod plus the
    GPUs reported for the namespace must not exceed the quota selected
    by ``get_gpu_quota``.

    Raises:
        ValidationFailure: if admitting the pod would exceed the GPU quota.
    """

    # Low priority pods pass through
    priority = request.object.spec.priorityClassName
    if priority is not None and priority == LOW_PRIORITY_CLASS:
        return

    # Look up the namespace, the GPUs currently counted for it, and the
    # AWSED quota for the user
    namespace = self.kube.get_namespace(request.namespace)
    curr_gpus = self.kube.get_gpus_in_namespace(request.namespace)
    awsed_gpu_quota = self.awsed.get_user_gpu_quota(request.namespace)

    # Number of GPUs this pod asks for. get_ultilized_gpu signals
    # "no GPUs requested" by raising ValueError; that case must
    # short-circuit (permits overcap) instead of failing validation.
    try:
        utilized_gpus = self.get_ultilized_gpu(request=request)
    except ValueError as err:
        # Only swallow the "no GPUs requested" signal. Any other
        # ValueError (e.g. a non-integer GPU quantity) keeps propagating
        # so a malformed request is not silently admitted.
        # NOTE(review): matching on the message is brittle; a dedicated
        # exception type (or returning 0 from the helper) would be cleaner.
        if str(err) != "Error: No GPUs requested.":
            raise
        return

    # Resolve the effective quota from the AWSED and namespace values
    gpu_quota = self.get_gpu_quota(awsed_gpu_quota, namespace.gpu_quota)

    # Check if the total number of utilized GPUs exceeds the GPU quota
    if utilized_gpus + curr_gpus > gpu_quota:
        raise ValidationFailure(
            f"GPU quota exceeded. Wanted {utilized_gpus} but with {curr_gpus} already in use, "
            f"the quota of {gpu_quota} would be exceeded.")
5 changes: 4 additions & 1 deletion tests/app/test_gpu_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,4 +135,7 @@ def test_gpu_quota_client_priority2(self):

self.try_validate(
gen_request(gpu_req=6, username='user11'), expected=True, message="GPU quota exceeded. Wanted 6 but with 5 already in use, the quota of 18 would be exceeded."
)
)

# --- Modular / Unit Testing for Validate Pod ---
def test

0 comments on commit ed6e623

Please sign in to comment.