From 8729cfb85bd22d26047dcc42523c15e42fafed7b Mon Sep 17 00:00:00 2001
From: Yushi Homma
Date: Fri, 9 Aug 2024 15:03:49 -0700
Subject: [PATCH] feat: add kv-cache-dtype to list of environment variables

---
 endpoints-entrypoint.sh | 4 +++-
 examples/inference.py   | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/endpoints-entrypoint.sh b/endpoints-entrypoint.sh
index b24eb5c..29d4b1b 100644
--- a/endpoints-entrypoint.sh
+++ b/endpoints-entrypoint.sh
@@ -8,9 +8,11 @@ QUANTIZATION=${QUANTIZATION:-}
 DTYPE=${DTYPE:-"auto"}
 TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-false}
 GUIDED_DECODING_BACKEND=${GUIDED_DECODING_BACKEND:-"outlines"}
+KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-"auto"}
 
 # Entrypoint for the OpenAI API server
-CMD="vllm serve $MODEL_PATH --host '0.0.0.0' --port 80 --tensor-parallel-size '$NUM_SHARD' --dtype $DTYPE --guided-decoding-backend $GUIDED_DECODING_BACKEND"
+CMD="vllm serve $MODEL_PATH --host '0.0.0.0' --port 80 --tensor-parallel-size '$NUM_SHARD'"
+CMD="$CMD --dtype $DTYPE --guided-decoding-backend $GUIDED_DECODING_BACKEND --kv-cache-dtype $KV_CACHE_DTYPE"
 
 # Append --max-model-len if its value is not -1
 if [ "$MAX_MODEL_LEN" -ne -1 ]; then
diff --git a/examples/inference.py b/examples/inference.py
index b84440a..9ecdfe4 100644
--- a/examples/inference.py
+++ b/examples/inference.py
@@ -30,7 +30,7 @@
     ],
     max_tokens=500,
     temperature=0.0,
-    stream=STREAM
+    stream=STREAM,
 )
 
 if STREAM: