[Cache] Add cachedPrefixes for caching repeated system prompts #664

Status: Open · wants to merge 3 commits into `main`
14 changes: 14 additions & 0 deletions examples/prefix-caching/README.md
@@ -0,0 +1,14 @@
# WebLLM App for Prefix Caching Demo

This example demonstrates the use of `cachedPrefixes` in WebLLM.
To try it out, run the following commands in this folder:

```bash
npm install
npm start
```

Note: if you would like to hack on the WebLLM core package, change the `@mlc-ai/web-llm`
dependency to `"file:../.."` and follow the build-from-source instructions in the project
to build WebLLM locally. This option is only recommended if you intend to modify the core package.
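
The core of the demo is the `cachedPrefixes` engine option. Below is a condensed sketch of what `src/prefix-caching.ts` does (model id and prompt are taken from the example); see that file for the full comparison with and without the cached prefix:

```ts
import * as webllm from "@mlc-ai/web-llm";

const SYSTEM_PROMPT_PREFIX =
  "You are a helpful assistant running in the user's browser, responsible for answering questions.";

async function createEngineWithPrefix(): Promise<webllm.MLCEngineInterface> {
  // The listed prefixes are prefilled once when the engine loads; their KV
  // cache is reused by later completions whose messages start with the same
  // prefix, and persists until the engine is reloaded.
  return webllm.CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC", {
    cachedPrefixes: [[{ role: "system", content: SYSTEM_PROMPT_PREFIX }]],
  });
}
```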
20 changes: 20 additions & 0 deletions examples/prefix-caching/package.json
@@ -0,0 +1,20 @@
{
"name": "prefix-caching-example",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/prefix-caching.html --port 8888",
"build": "parcel build src/prefix-caching.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "^0.2.78"
}
}
23 changes: 23 additions & 0 deletions examples/prefix-caching/src/prefix-caching.html
@@ -0,0 +1,23 @@
<!doctype html>
<html>
<script>
webLLMGlobal = {};
</script>
<body>
<h2>WebLLM Prefix Caching Test Page</h2>
Open console to see output
<br />
<br />
<label id="init-label"> </label>

<h3>Prompt</h3>
<label id="prompt-label"> </label>

<h3>Response</h3>
<label id="generate-label"> </label>
<br />
<label id="stats-label"> </label>

<script type="module" src="./prefix-caching.ts"></script>
</body>
</html>
142 changes: 142 additions & 0 deletions examples/prefix-caching/src/prefix-caching.ts
@@ -0,0 +1,142 @@
import * as webllm from "@mlc-ai/web-llm";

const SYSTEM_PROMPT_PREFIX =
"You are a helpful assistant running in the user's browser, responsible for answering questions.";

function setLabel(id: string, text: string) {
const label = document.getElementById(id);
if (label == null) {
throw Error("Cannot find label " + id);
}
label.innerText = text;
}

async function testPrefix() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};

const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO",
// Prefilling KV cache for efficiency
cachedPrefixes: [[{ role: "system", content: SYSTEM_PROMPT_PREFIX }]],
},
{
context_window_size: 2048,
},
);

const reply_using_prefix = await engine.chat.completions.create({
messages: [
{ role: "system", content: SYSTEM_PROMPT_PREFIX },
{ role: "user", content: "List three US states." },
],
// below configurations are all optional
n: 1,
temperature: 1.5,
max_tokens: 64,
logprobs: true,
top_logprobs: 2,
});
console.log(reply_using_prefix);
console.log(reply_using_prefix.usage);
}

async function testWithoutPrefix() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};

const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
// Engine Initialization without cachedPrefixes
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO",
},
{
context_window_size: 2048,
},
);

const reply_without_prefix = await engine.chat.completions.create({
messages: [
{ role: "system", content: SYSTEM_PROMPT_PREFIX },
{ role: "user", content: "List three US states." },
],
// below configurations are all optional
n: 1,
temperature: 1.5,
max_tokens: 64,
logprobs: true,
top_logprobs: 2,
});
console.log(reply_without_prefix);
console.log(reply_without_prefix.usage);
}

async function testMultiRound() {
const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};

const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO",
cachedPrefixes: [[{ role: "system", content: SYSTEM_PROMPT_PREFIX }]], // Prefilling KV cache for efficiency
},
{
context_window_size: 2048,
},
);

// First Completion with cachedPrefixes
const reply0 = await engine.chat.completions.create({
messages: [
{ role: "system", content: SYSTEM_PROMPT_PREFIX },
{ role: "user", content: "List three US states." },
],
// below configurations are all optional
n: 1,
temperature: 1.5,
max_tokens: 64,
logprobs: true,
top_logprobs: 2,
});
console.log(reply0);
console.log(reply0.usage);

// Second Completion with cachedPrefixes
const reply1 = await engine.chat.completions.create({
messages: [
{ role: "system", content: SYSTEM_PROMPT_PREFIX },
{ role: "user", content: "Where is the US capital?" },
],
// below configurations are all optional
n: 1,
temperature: 1.5,
max_tokens: 64,
logprobs: true,
top_logprobs: 2,
});
console.log(reply1);
console.log(reply1.usage);
}

async function main() {
await testPrefix();

await testWithoutPrefix();

await testMultiRound();
}

main();
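
One way to observe the benefit is to time the first completion of `testPrefix()` against `testWithoutPrefix()`. The helper below is a minimal, illustrative sketch and is not part of the example source; the WebLLM calls mirror the example above:

```ts
import * as webllm from "@mlc-ai/web-llm";

// Illustrative only: wall-clock time of one completion. With a cached prefix,
// the shared system prompt does not need to be prefilled again, so requests
// that reuse the prefix should show a shorter time to first response.
async function timeCompletion(
  engine: webllm.MLCEngineInterface,
  messages: webllm.ChatCompletionMessageParam[],
): Promise<number> {
  const tstart = performance.now();
  await engine.chat.completions.create({ messages, max_tokens: 64 });
  return performance.now() - tstart;
}
```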
6 changes: 6 additions & 0 deletions src/config.ts
@@ -9,6 +9,7 @@ import {
NonNegativeError,
RangeError,
} from "./error";
import { ChatCompletionMessageParam } from "./openai_api_protocols/chat_completion";

/**
* Conversation template config
@@ -105,15 +106,20 @@ export interface ChatOptions extends Partial<ChatConfig> {}
* appConfig: Configure the app, including the list of models and whether to use IndexedDB cache.
* initProgressCallback: A callback for showing the progress of loading the model.
* logitProcessorRegistry: A register for stateful logit processors, see `webllm.LogitProcessor`.
cachedPrefixes: Specifies a list of prefixes (e.g. system prompts) that will be prefilled when
the engine loads, creating their corresponding KV caches and storing them for reuse. These
cached KV entries persist until the engine is reloaded.
*
* @note All fields are optional, and `logitProcessorRegistry` is only used for `MLCEngine` and not
* other `MLCEngine`s.
* @note cachedPrefixes is experimental. It may change in future versions.
*/
export interface MLCEngineConfig {
appConfig?: AppConfig;
initProgressCallback?: InitProgressCallback;
logitProcessorRegistry?: Map<string, LogitProcessor>;
logLevel?: LogLevel;
cachedPrefixes?: ChatCompletionMessageParam[][];
Contributor review comment:

Let's add docs to MLCEngineConfig specifying the behavior of cachedPrefixes (e.g. it will prefill when loading the engine to create the prefixes' KV cache, and will only dispose of that KV cache when the engine is reloaded). Perhaps we can also mark this as experimental to signify potential future API/behavior changes.

Contributor review comment:

Could you also add an examples/cached_prefixes example where we can demonstrate the prefill-time difference between using cachedPrefixes and not using it? We should also test whether the behavior is as expected in multi-turn conversations.
}

/**
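
Since `cachedPrefixes` is typed as `ChatCompletionMessageParam[][]`, more than one prefix can be cached. A hedged sketch of such a config (the prompts are placeholders invented for illustration):

```ts
import * as webllm from "@mlc-ai/web-llm";

// Placeholder prompts; any message sequence can serve as a prefix.
const SUPPORT_PROMPT = "You are a customer-support assistant.";
const CODING_PROMPT = "You are a coding assistant.";

const engineConfig: webllm.MLCEngineConfig = {
  logLevel: "INFO",
  // Each inner array is one prefix: each is prefilled when the engine loads,
  // and its KV cache is kept until the engine is reloaded.
  cachedPrefixes: [
    [{ role: "system", content: SUPPORT_PROMPT }],
    [{ role: "system", content: CODING_PROMPT }],
  ],
};
```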
12 changes: 12 additions & 0 deletions src/engine.ts
@@ -131,6 +131,7 @@ export class MLCEngine implements MLCEngineInterface {
private logitProcessorRegistry?: Map<string, LogitProcessor>;
private initProgressCallback?: InitProgressCallback;
private appConfig: AppConfig;
private cachedPrefixes: ChatCompletionMessageParam[][];

// Signals and flags
private interruptSignal = false;
@@ -149,6 +150,7 @@ export class MLCEngine implements MLCEngineInterface {
this.setLogLevel(engineConfig?.logLevel || DefaultLogLevel);
this.setInitProgressCallback(engineConfig?.initProgressCallback);
this.setLogitProcessorRegistry(engineConfig?.logitProcessorRegistry);
this.cachedPrefixes = engineConfig?.cachedPrefixes || [];

this.chat = new API.Chat(this);
this.completions = new API.Completions(this);
@@ -392,6 +394,16 @@ export class MLCEngine implements MLCEngineInterface {
this.loadedModelIdToPipeline.set(modelId, newPipeline);
this.loadedModelIdToLock.set(modelId, new CustomLock());

// Call prefillConvSequence() if cachedPrefixes is specified
if (
newPipeline instanceof LLMChatPipeline &&
this.cachedPrefixes.length > 0
) {
for (let i = 0; i < this.cachedPrefixes.length; i++) {
await newPipeline.prefillConvSequence(this.cachedPrefixes[i]);
}
}

// Clean up
const tend = performance.now();
if (this.initProgressCallback !== undefined) {