From d8a86f4cf55394c323acccf029d46152f5d8551f Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Tue, 3 Sep 2024 16:55:16 -0700 Subject: [PATCH] increase vllm pod startup to 30 min (#173) fixes #172 --- internal/modelcontroller/model_controller.go | 31 +++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/internal/modelcontroller/model_controller.go b/internal/modelcontroller/model_controller.go index a2c11565..781d8d1d 100644 --- a/internal/modelcontroller/model_controller.go +++ b/internal/modelcontroller/model_controller.go @@ -264,12 +264,24 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev Name: "http", }, }, + StartupProbe: &corev1.Probe{ + // Give the model 30 minutes to start up. + FailureThreshold: 900, + PeriodSeconds: 2, + TimeoutSeconds: 2, + SuccessThreshold: 1, + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/health", + Port: intstr.FromString("http"), + }, + }, + }, ReadinessProbe: &corev1.Probe{ - FailureThreshold: 3, - InitialDelaySeconds: 20, - PeriodSeconds: 10, - TimeoutSeconds: 2, - SuccessThreshold: 1, + FailureThreshold: 3, + PeriodSeconds: 10, + TimeoutSeconds: 2, + SuccessThreshold: 1, ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ Path: "/health", @@ -278,11 +290,10 @@ func (r *ModelReconciler) vLLMPodForModel(m *kubeaiv1.Model, index int32) *corev }, }, LivenessProbe: &corev1.Probe{ - FailureThreshold: 3, - InitialDelaySeconds: 900, - PeriodSeconds: 30, - TimeoutSeconds: 3, - SuccessThreshold: 1, + FailureThreshold: 3, + PeriodSeconds: 30, + TimeoutSeconds: 3, + SuccessThreshold: 1, ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ Path: "/health",