add disagg prefill example
Signed-off-by: YaoJiayi <[email protected]>
YaoJiayi committed Feb 5, 2025
1 parent eb16b1f commit 4a748b5
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions examples/offline_inference/cpu_offload_lmcache.py
@@ -16,7 +16,7 @@
 # LMCache-related environment variables
 # Use experimental features in LMCache
 os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-#
+# LMCache is set to use 256 tokens per chunk
 os.environ["LMCACHE_CHUNK_SIZE"] = "256"
 # Enable local CPU backend in LMCache
 os.environ["LMCACHE_LOCAL_CPU"] = "True"
@@ -37,7 +37,7 @@

 ktc = KVTransferConfig.from_cli(
     '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
-# Example: Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
 # memory. Reduce the value if your GPU has less memory.
 # Note that LMCache is not compatible with chunked prefill for now.
 llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
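For context, a minimal sketch of how the LMCache settings touched by this hunk typically fit together in the surrounding CPU-offload example. The environment variables, connector config, GPU memory comment, and model name come from the diff above; the prompt, sampling parameters, and the explicit enable_chunked_prefill flag are illustrative assumptions, not part of the commit.

import os

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# LMCache-related environment variables (as in the diff above)
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"   # use experimental LMCache features
os.environ["LMCACHE_CHUNK_SIZE"] = "256"          # 256 tokens per KV-cache chunk
os.environ["LMCACHE_LOCAL_CPU"] = "True"          # offload KV cache to local CPU memory

# Route KV cache through the LMCache connector for both saving and loading.
ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')

# gpu_memory_utilization=0.8 mirrors the comment in the diff; lower it on smaller GPUs.
# The diff also notes LMCache is not compatible with chunked prefill for now, hence the
# explicit flag here (an assumption for illustration).
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          kv_transfer_config=ktc,
          gpu_memory_utilization=0.8,
          enable_chunked_prefill=False)

prompts = ["Explain KV cache offloading in one sentence."]
outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=64))
for out in outputs:
    print(out.outputs[0].text)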
4 changes: 3 additions & 1 deletion examples/offline_inference/disaggregated_prefill_lmcache.py
@@ -2,7 +2,9 @@
 """
 This file demonstrates the example usage of disaggregated prefilling
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
-and then transfer the KV cache between them.
+and launch an additional LMCache server.
+KV cache is transferred in the following manner:
+VLLM prefill node -> LMCache server -> VLLM decode node.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """
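A rough sketch of how the two vLLM instances described in this docstring might each be configured. The producer/consumer roles, connector name, chunk size, and model follow the pattern shown in the diffs above; the build_llm helper, the CUDA_VISIBLE_DEVICES handling, the LMCACHE_REMOTE_URL variable, and the server address are illustrative assumptions, and the actual example file in the commit is the authoritative version.

import os

from vllm import LLM
from vllm.config import KVTransferConfig

def build_llm(role: str, gpu_id: str) -> LLM:
    """Configure one vLLM instance for disaggregated prefill via LMCache.

    role is "kv_producer" for the prefill node (GPU 0) or "kv_consumer"
    for the decode node (GPU 1). Each instance would normally run in its
    own process so these environment variables take effect before CUDA init.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Assumed setting: point LMCache at the separately launched LMCache server,
    # so the KV cache flows prefill node -> LMCache server -> decode node.
    os.environ["LMCACHE_REMOTE_URL"] = "lm://localhost:8100"

    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"%s"}' % role)
    return LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
               kv_transfer_config=ktc,
               gpu_memory_utilization=0.8)

# Prefill node: runs the prompt once and pushes KV cache to the LMCache server.
# Decode node: pulls the KV cache and continues generation without re-prefilling.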
