From 8729cfb85bd22d26047dcc42523c15e42fafed7b Mon Sep 17 00:00:00 2001
From: Yushi Homma
Date: Fri, 9 Aug 2024 15:03:49 -0700
Subject: [PATCH] feat: add kv-cache-dtype to list of environment variables

---
 endpoints-entrypoint.sh | 4 +++-
 examples/inference.py   | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/endpoints-entrypoint.sh b/endpoints-entrypoint.sh
index b24eb5c..29d4b1b 100644
--- a/endpoints-entrypoint.sh
+++ b/endpoints-entrypoint.sh
@@ -8,9 +8,11 @@ QUANTIZATION=${QUANTIZATION:-}
 DTYPE=${DTYPE:-"auto"}
 TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-false}
 GUIDED_DECODING_BACKEND=${GUIDED_DECODING_BACKEND:-"outlines"}
+KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-"auto"}
 
 # Entrypoint for the OpenAI API server
-CMD="vllm serve $MODEL_PATH --host '0.0.0.0' --port 80 --tensor-parallel-size '$NUM_SHARD' --dtype $DTYPE --guided-decoding-backend $GUIDED_DECODING_BACKEND"
+CMD="vllm serve $MODEL_PATH --host '0.0.0.0' --port 80 --tensor-parallel-size '$NUM_SHARD'"
+CMD="$CMD --dtype $DTYPE --guided-decoding-backend $GUIDED_DECODING_BACKEND --kv-cache-dtype $KV_CACHE_DTYPE"
 
 # Append --max-model-len if its value is not -1
 if [ "$MAX_MODEL_LEN" -ne -1 ]; then
diff --git a/examples/inference.py b/examples/inference.py
index b84440a..9ecdfe4 100644
--- a/examples/inference.py
+++ b/examples/inference.py
@@ -30,7 +30,7 @@
     ],
     max_tokens=500,
     temperature=0.0,
-    stream=STREAM
+    stream=STREAM,
 )
 
 if STREAM: