wip: [kserve] Add granite-3-0-8b-instruct vLLM single-model gating

openshift-psap · Nov 13, 2024 · 1c9b044 · 1c9b044
1 parent 6617357
commit 1c9b044
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 0 deletions.
diff --git a/projects/kserve/testing/config.yaml b/projects/kserve/testing/config.yaml
@@ -353,6 +353,11 @@ ci_presets:
       testing:
         size: small
         max_concurrency: 512
+    - name: granite-3-0-8b-instruct  # entry name; uses dashes where the model id below has dots
+      model: granite-3.0-8b-instruct  # same identifier as the vllm/models/ directory added in this commit
+      testing:
+        size: small  # same size class as the preceding entry
+        max_concurrency: 512  # equals the top value of the concurrency list that follows
     tests.e2e.llm_load_test.args.concurrency: [1, 2, 4, 8, 16, 32, 64, 96, 128, 192, 256, 384, 512]
 
   # ---

diff --git a/projects/kserve/toolbox/kserve_deploy_model/files/vllm/models/granite-3.0-8b-instruct/kustomization.yaml b/projects/kserve/toolbox/kserve_deploy_model/files/vllm/models/granite-3.0-8b-instruct/kustomization.yaml
@@ -0,0 +1,14 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # overlay for the granite-3.0-8b-instruct vLLM model
+
+namePrefix: granite-3-0-8b-instruct-  # prepended to the names of the resources built below
+
+resources:
+- ../../base  # shared base two directories up (files/vllm/base)
+
+patches:
+- path: patch.yaml  # model-specific overrides, kept next to this file
+  target:
+    kind: InferenceService  # apply the patch to InferenceService objects only
+  options:
+    allowNameChange: true  # permit the patch's metadata.name to replace the target's name
diff --git a/projects/kserve/toolbox/kserve_deploy_model/files/vllm/models/granite-3.0-8b-instruct/patch.yaml b/projects/kserve/toolbox/kserve_deploy_model/files/vllm/models/granite-3.0-8b-instruct/patch.yaml
@@ -0,0 +1,16 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService  # patch document referenced by the sibling kustomization.yaml
+metadata:
+  name: isvc  # NOTE(review): final name presumably gets the overlay's namePrefix; confirm against ../../base
+spec:
+  predictor:
+    minReplicas: 1  # minimum number of predictor replicas
+    model:
+      storageUri: s3://psap-hf-models/ibm-granite/granite-3.0-8b-instruct/  # S3 location the model is loaded from
+      resources:
+        requests:
+          cpu: "2"
+          memory: "16Gi"
+          nvidia.com/gpu: "1"  # one GPU requested
+        limits:
+          nvidia.com/gpu: "1"  # GPU limit equals the request; no CPU/memory limits are set in this patch