From 397d274248035a1ef540eabfedf7155971701e1f Mon Sep 17 00:00:00 2001 From: Ye Cao Date: Thu, 16 May 2024 11:20:40 +0800 Subject: [PATCH] Add some scripts to test fio and dfs llm cache on kubernetes cluster. (#1883) Fixes #1879 Signed-off-by: Ye Cao --- .../tests/k8s-test/Dockerfile.master | 9 + .../tests/k8s-test/Dockerfile.worker | 17 ++ modules/llm-cache/tests/k8s-test/Makefile | 7 + modules/llm-cache/tests/k8s-test/README.md | 91 ++++++++++ modules/llm-cache/tests/k8s-test/master.py | 51 ++++++ .../tests/k8s-test/prompt-samples.txt | 167 ++++++++++++++++++ .../llm-cache/tests/k8s-test/show_result.py | 83 +++++++++ .../tests/k8s-test/tokenize_prompt.py | 97 ++++++++++ modules/llm-cache/tests/k8s-test/worker.py | 121 +++++++++++++ .../tests/k8s-test/yamls/master.yaml | 36 ++++ .../tests/k8s-test/yamls/worker.yaml | 66 +++++++ modules/llm-cache/tests/scripts/README.md | 39 ++++ modules/llm-cache/tests/scripts/dfs-fio.sh | 80 +++++++++ 13 files changed, 864 insertions(+) create mode 100644 modules/llm-cache/tests/k8s-test/Dockerfile.master create mode 100644 modules/llm-cache/tests/k8s-test/Dockerfile.worker create mode 100644 modules/llm-cache/tests/k8s-test/Makefile create mode 100644 modules/llm-cache/tests/k8s-test/README.md create mode 100644 modules/llm-cache/tests/k8s-test/master.py create mode 100644 modules/llm-cache/tests/k8s-test/prompt-samples.txt create mode 100644 modules/llm-cache/tests/k8s-test/show_result.py create mode 100644 modules/llm-cache/tests/k8s-test/tokenize_prompt.py create mode 100644 modules/llm-cache/tests/k8s-test/worker.py create mode 100644 modules/llm-cache/tests/k8s-test/yamls/master.yaml create mode 100644 modules/llm-cache/tests/k8s-test/yamls/worker.yaml create mode 100644 modules/llm-cache/tests/scripts/README.md create mode 100644 modules/llm-cache/tests/scripts/dfs-fio.sh diff --git a/modules/llm-cache/tests/k8s-test/Dockerfile.master b/modules/llm-cache/tests/k8s-test/Dockerfile.master new file mode 100644 index 
000000000..157af0503 --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/Dockerfile.master @@ -0,0 +1,9 @@ +FROM python:3.10 + +WORKDIR / + +COPY master.py /master.py + +RUN pip3 install kubernetes + +CMD ["python3", "master.py"] \ No newline at end of file diff --git a/modules/llm-cache/tests/k8s-test/Dockerfile.worker b/modules/llm-cache/tests/k8s-test/Dockerfile.worker new file mode 100644 index 000000000..3359ff640 --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/Dockerfile.worker @@ -0,0 +1,17 @@ +FROM ghcr.io/v6d-io/v6d/vineyard-python-dev:latest_x86_64 as builder + +FROM python:3.10 + +WORKDIR / + +COPY worker.py /worker.py +COPY --from=builder /tmp/vineyard_llm-0.22.1-py3-none-any.whl vineyard_llm-0.22.1-py3-none-any.whl + +RUN apt update && \ + apt install fio -y + +RUN pip3 install vineyard /vineyard_llm-0.22.1-py3-none-any.whl && \ + pip3 install networkx==3.1 && \ + pip3 install numpy + +CMD ["python3", "worker.py"] diff --git a/modules/llm-cache/tests/k8s-test/Makefile b/modules/llm-cache/tests/k8s-test/Makefile new file mode 100644 index 000000000..3beebaeb0 --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/Makefile @@ -0,0 +1,7 @@ +registry = registry.cn-wulanchabu.aliyuncs.com/vineyard +build-images: + docker build -t ${registry}/fs-llm-master:latest -f ./Dockerfile.master . + docker build -t ${registry}/fs-llm-worker:latest -f ./Dockerfile.worker . +push-images: + docker push ${registry}/fs-llm-master:latest + docker push ${registry}/fs-llm-worker:latest \ No newline at end of file diff --git a/modules/llm-cache/tests/k8s-test/README.md b/modules/llm-cache/tests/k8s-test/README.md new file mode 100644 index 000000000..76bc35c2d --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/README.md @@ -0,0 +1,91 @@ +## Run llm test on k8s + +This document describes how to run the llm test on a Kubernetes cluster. 
+ +### Tokenize the prompt file + +Suppose you have a [prompt file](./prompt-samples.txt) that contains the conversation info between the user and the chatbot. You can tokenize the prompt file by running the following command: + +```bash +$ python tokenize_prompt.py --prompt-file prompt-samples.txt --file-num 1 +``` + +After running the command, you will get a tokenized prompt file named `tokens_0` under the `small_files` directory. + +```bash +$ ls small_files +prompts_0.txt tokens_0 +``` + +Also, you can set `--file-num` to the number of files you want to split the prompt file into. If the prompt file is too large, you can split it into multiple files. Each file will be processed in parallel. + +``` +$ python tokenize_prompt.py --prompt-file prompt-samples.txt --file-num 2 +$ ls small_files +prompts_0.txt prompts_1.txt tokens_0 tokens_1 +``` + +At this point, you can upload these token files to the OSS bucket or NAS; refer to [ossutil upload files](https://help.aliyun.com/zh/oss/user-guide/upload-objects-to-oss/?spm=a2c4g.11186623.0.0.4b471c22sHG1EG) or [nas mount](https://help.aliyun.com/zh/nas/user-guide/mount-an-nfs-file-system-on-a-linux-ecs-instance?spm=a2c4g.11186623.0.0.15713eedDgiEYF). + +### Build the master and worker images + +Before building the master and worker images, you need to build the vineyard-python-dev image first, as we need the llm-cache PyPI package. + +```bash +$ cd v6d && make -C docker vineyard-python-dev +``` + +Then, you can build the master and worker images by running the following command: + +> Make sure the image registry is set correctly. 
+ +```bash +$ cd modules/llm-cache/tests/k8s-test +$ make build-images +``` + +Next, push the images to the registry: + +```bash +$ make push-images +``` + +### Deploy on the k8s cluster + +#### Create the OSS volume + +Assuming we have put the token files into the OSS bucket, we need to [create the oss secret and oss volume](https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/mount-statically-provisioned-oss-volumes#title-hos-c75-12q) first. + +#### Create the Distributed FileSystem Volume + +The DFS could be NAS or CPFS; you can refer to the [Mount Nas Volume on ACK](https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/mount-statically-provisioned-nas-volumes?spm=a2c4g.11186623.0.0.b7c130b7eJHcnf) or [Mount CPFS Volume on ACK](https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/statically-provisioned-cpfs-2-0-volumes-1?spm=a2c4g.11186623.0.0.399a22dbapWWsP) to create the volume. + +#### Deploy the worker + +After preparing the OSS volume and the DFS volume, you need to change the NFS volume name `nas-csi-pvc` to the DFS volume you created before. + +> **Note:** The CPU resources are important for the performance of the worker; you can adjust the `resources.requests.cpu` to get better performance. + +Then deploy the worker by running the following command: + +```bash +$ kubectl apply -f yamls/worker.yaml +``` + +#### Deploy the master + +After deploying the worker, you need to change the `TOKENS_FILE_NUM` environment variable in the `yamls/master.yaml` file to the number of token files you put in the OSS bucket. Also, the OSS VolumeClaimName `oss-pvc` should be set to the OSS volume you created. 
+ +Then deploy the master by running the following command: + +```bash +$ kubectl apply -f yamls/master.yaml +``` + +### Show the result + +After running the llm test, you can check the result by running the following command: + +```bash +$ python show_result.py --kubeconfig-path /your/kubeconfig --label-selector your_label_key=your_label_value +``` diff --git a/modules/llm-cache/tests/k8s-test/master.py b/modules/llm-cache/tests/k8s-test/master.py new file mode 100644 index 000000000..1b11235fc --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/master.py @@ -0,0 +1,51 @@ +import socket +import random +import os +import time +from multiprocessing import Pool +from kubernetes import client, config + +def get_pod_ips(label_selector): + config.load_incluster_config() + api = client.CoreV1Api() + pods = api.list_pod_for_all_namespaces(label_selector=label_selector) + pod_ip_list = [] + for pod in pods.items: + pod_ip_list.append(pod.status.pod_ip) + return pod_ip_list + +def distribute_prompts(args): + file_name, server_ips = args + token_list = [] + with open(f'{file_name}', 'r', encoding='utf-8') as f: + while True: + line = f.readline() + if not line: + break + token_list.append(line) + + for token in token_list: + server_ip = random.choice(server_ips) + #time.sleep(random.randint(1, 200)/1000000) + while True: + try: + send_tokens_to_server(server_ip, 8888, token) + break + except Exception as e: + print(f"Error: {e}") + time.sleep(1) + continue + +def send_tokens_to_server(server_address, server_port, tokens): + clientsocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + clientsocket.connect((server_address, server_port)) + clientsocket.send(tokens.encode('utf-8')) + clientsocket.close() + +if __name__ == "__main__": + file_num = int(os.environ.get('TOKENS_FILE_NUM', 16)) + file_names = [f'/tokens/tokens_{i}' for i in range(file_num)] + pod_selector = os.environ.get('POD_SELECTOR', 'app=fs-llm-test-worker') + server_ips = get_pod_ips(pod_selector) + with 
Pool(file_num) as p: + p.map(distribute_prompts, [(file_name, server_ips) for file_name in file_names]) diff --git a/modules/llm-cache/tests/k8s-test/prompt-samples.txt b/modules/llm-cache/tests/k8s-test/prompt-samples.txt new file mode 100644 index 000000000..ff5503d26 --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/prompt-samples.txt @@ -0,0 +1,167 @@ +[ + { + "id": "QWJhYvA_0", + "conversations": [ + { + "from": "human", + "value": "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." + }, + { + "from": "gpt", + "value": "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and continue to provide value to keep them engaged and loyal.\n5. Create a product suite: Once the initial product is successful, expand the offering by creating additional products or services that address other needs of the same audience.\n6. Continual optimization: Continually monitor and optimize the launch process and product suite to improve results.\n7. Build a community: Use the launch process to build a community of customers who are passionate about the product and can help spread the word.\n8. Use automation: Use technology and automation to streamline the launch process and improve efficiency." 
+ }, + { + "from": "human", + "value": "Summarize the main ideas of Brendon Burchard's Experts Academy into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." + }, + { + "from": "gpt", + "value": "Here are the main ideas of Brendon Burchard's Experts Academy that can be applied by a growth marketing agency for their clients:\n\n1. Identify your expertise: Define your area of expertise and create a unique value proposition that differentiates you from others.\n2. Create a content strategy: Develop a content strategy that showcases your expertise and provides value to your audience. This can include blog posts, videos, podcasts, and social media content.\n3. Build your audience: Use social media, advertising, and other channels to build your audience and increase your visibility.\n4. Develop your products: Create products and services that align with your expertise and meet the needs of your audience. These can include digital products, courses, coaching, and consulting services.\n5. Create a brand: Develop a strong brand that reflects your expertise and resonates with your audience.\n6. Build authority: Establish yourself as an authority in your industry by speaking at events, publishing a book, or appearing on podcasts or TV shows.\n7. Monetize your expertise: Develop a monetization strategy that leverages your expertise and products to generate revenue.\n8. Build a team: As your business grows, build a team of experts to help you scale your business and provide excellent service to your clients.\n9. Continual improvement: Continually improve your products, services, and marketing strategies to stay ahead of the competition and provide the best possible experience for your clients.\n10. Focus on impact: Ultimately, focus on making a positive impact in the lives of your clients and using your expertise to make a difference in the world." 
+ }, + { + "from": "human", + "value": "What are the mental triggers in Jeff Walker's Product Launch Formula and \"Launch\" book?" + }, + { + "from": "gpt", + "value": "Here are the mental triggers described in Jeff Walker's Product Launch Formula and \"Launch\" book:\n\n1. Scarcity: Creating a sense of urgency by limiting the availability of the product or service being launched.\n2. Authority: Establishing oneself as an expert or authority in the industry to build credibility and trust with the audience.\n3. Social proof: Using testimonials, case studies, and other forms of social proof to demonstrate the effectiveness and value of the product or service.\n4. Reciprocity: Offering something of value to the audience before asking for a sale or commitment.\n5. Likeability: Building a personal connection with the audience and creating a likable persona to increase the chances of a sale.\n6. Anticipation: Building anticipation and excitement leading up to the launch to generate buzz and increase interest in the product or service.\n7. Exclusivity: Creating a feeling of exclusivity by offering early access or special deals to a select group of people.\n8. Belonging: Creating a sense of community or belonging among the audience to increase engagement and loyalty to the brand.\n9. Storytelling: Using storytelling to connect with the audience on an emotional level and make the product or service more relatable and memorable.\n\nBy leveraging these mental triggers, businesses can create an effective product launch strategy that resonates with their target audience and drives sales and growth." + }, + { + "from": "human", + "value": "Write a summary of why scarcity and urgency are the strongest mental triggers and have been the driving force behind many of our best performing campaigns over the last 8 years." 
+ }, + { + "from": "gpt", + "value": "Scarcity and urgency are two of the strongest mental triggers in marketing because they create a sense of exclusivity and importance around a product or service. By creating a sense of limited availability or a deadline for a promotion, businesses can motivate customers to take action and make a purchase before it's too late.\n\nOver the last eight years, many businesses have leveraged these triggers in their marketing campaigns to great success. Scarcity and urgency have been used to sell everything from digital products to physical goods and services, and have proven to be highly effective in driving conversions and increasing revenue.\n\nThis is because scarcity and urgency tap into the natural human desire for instant gratification and fear of missing out (FOMO). When customers believe that they may miss out on a valuable opportunity if they don't act quickly, they are more likely to take action and make a purchase.\n\nFurthermore, scarcity and urgency can be combined with other mental triggers, such as social proof and authority, to create even more powerful marketing campaigns. By demonstrating the popularity and effectiveness of a product or service, businesses can increase the perceived value and urgency of a promotion, leading to even more sales and revenue.\n\nOverall, scarcity and urgency are powerful mental triggers that have been proven to drive results in marketing campaigns. By leveraging these triggers, businesses can create a sense of urgency and exclusivity around their products or services, motivating customers to take action and make a purchase before it's too late." + }, + { + "from": "human", + "value": "Summarize Russell Brunson's Perfect Webinar Script..." + }, + { + "from": "gpt", + "value": "Russell Brunson's Perfect Webinar Script is a framework for delivering webinars that are designed to sell a product or service. Here are the key components of the Perfect Webinar Script:\n\n1. 
Introduction: The webinar starts with an introduction that establishes the presenter's credibility and establishes the topic.\n2. Story: The presenter then tells a personal story that is related to the topic and establishes a connection with the audience.\n3. Content: The presenter provides valuable content related to the topic, demonstrating their expertise and building trust with the audience.\n4. Offer: The presenter then introduces the product or service they are selling, explaining how it solves the audience's problem and providing social proof to support their claims.\n5. Bonuses: The presenter offers bonuses to sweeten the deal and make the offer more enticing.\n6. Scarcity: The presenter creates a sense of urgency by limiting the availability of the offer or including a deadline for purchase.\n7. FAQs: The presenter answers common questions about the offer, addressing objections and building trust with the audience.\n8. Closing: The presenter then delivers a clear call to action, directing the audience to take a specific action to purchase the product or service.\n\nOverall, the Perfect Webinar Script is designed to build trust with the audience, establish the presenter's credibility, and provide valuable content, while also creating a sense of urgency and scarcity around the offer being presented. By following this framework, businesses can use webinars as a powerful tool to sell their products and services to a targeted audience." + }, + { + "from": "human", + "value": "Summarize the 6 human needs as Tony Robbins explains..." + }, + { + "from": "gpt", + "value": "Tony Robbins describes six core human needs that drive our behaviors and motivations. These six needs are:\n\n1. Certainty: The need for safety, stability, and predictability. This includes the need for comfort, security, and control over our environment.\n2. Variety: The need for novelty, excitement, and change. This includes the need for adventure, stimulation, and new experiences.\n3. 
Significance: The need to feel important, special, and unique. This includes the need for recognition, achievement, and respect.\n4. Connection: The need for social connection and love. This includes the need for intimacy, belonging, and friendship.\n5. Growth: The need for personal growth and development. This includes the need for learning, self-improvement, and progress.\n6. Contribution: The need to make a positive impact in the world. This includes the need to give back, help others, and make a difference.\n\nAccording to Tony Robbins, these six needs are universal and apply to all individuals. Each person may prioritize these needs differently, and may fulfill them in different ways, but they are fundamental drivers of human behavior. By understanding these needs, individuals can gain insight into their own motivations and behaviors, and can use this knowledge to create a more fulfilling and meaningful life." + } + ] + }, + { + "id": "i6IyJda_0", + "conversations": [ + { + "from": "human", + "value": "How to tell if a customer segment is well segmented? In 3 bullet points." + }, + { + "from": "gpt", + "value": "1. Homogeneity: The segment should consist of customers who share similar characteristics and behaviors.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics and behaviors.\n3. Stability: The segment should remain relatively stable over time and not change drastically. The characteristics and behaviors of customers within the segment should not change significantly." + } + ] + }, + { + "id": "A5AbcES_0", + "conversations": [ + { + "from": "human", + "value": "In Java, I want to replace string like \"This is a new {object} at {place}\" with a Map, {object: \"student\", \"point 3, 4\"}, and get a result \"This is a new student at point 3, 4\". How can I do?" 
+ }, + { + "from": "gpt", + "value": "You can use the `String.format()` method in Java to replace placeholders in a string with values from a map. Here's an example code snippet that demonstrates how you can achieve this:\n```java\nimport java.util.HashMap;\nimport java.util.Map;\n\npublic class StringReplaceExample {\n public static void main(String[] args) {\n String input = \"This is a new {object} at {place}\";\n Map replacements = new HashMap<>();\n replacements.put(\"object\", \"student\");\n replacements.put(\"place\", \"point 3, 4\");\n\n String output = replacePlaceholders(input, replacements);\n System.out.println(output);\n }\n\n public static String replacePlaceholders(String input, Map replacements) {\n for (Map.Entry entry : replacements.entrySet()) {\n String placeholder = \"{\" + entry.getKey() + \"}\";\n String replacement = entry.getValue();\n input = input.replace(placeholder, replacement);\n }\n return input;\n }\n}\n```\nIn this example, we define the input string as \"This is a new {object} at {place}\", and create a `HashMap` called `replacements` that maps the placeholders \"object\" and \"place\" to their corresponding values \"student\" and \"point 3, 4\". We then pass these values to the `replacePlaceholders()` method, which iterates over the entries in the `replacements` map and replaces each placeholder in the input string with its corresponding value using the `replace()` method. Finally, the `replacePlaceholders()` method returns the modified string, which is printed to the console.\n\nThe output of this program will be:\n```csharp\nThis is a new student at point 3, 4\n```\nNote that you can modify the `replacements` map to include additional placeholders and their corresponding values, and the `replacePlaceholders()` method will automatically replace them in the input string." 
+ } + ] + }, + { + "id": "hRPPgZT_0", + "conversations": [ + { + "from": "human", + "value": "Metaphorical language is also used to describe the various addressing modes of the instructions. Grandiose language to express their excitement and admiration for the functionality of the instructions being described. Now, rewrite this with more perplexity:\n\nJMP ABCD\nMOV AX, [BX+SI]\nMOV AX, [100]\nMOV AX, [BX]\nMOV AX, [BX\\*2+SI]\nMOV AX, BX\nMOV AX, 7" + }, + { + "from": "gpt", + "value": "Metaphorical language has been summoned to describe the many enigmatic addressing modes of the instructions at hand. The speakers have resorted to grandiose expressions to convey their immense wonder and admiration for the sheer functionality of the aforementioned directives. Among the mystifying commands are the enigmatic JMP ABCD, the confounding MOV AX, [BX+SI], the inscrutable MOV AX, [100], the cryptic MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the unfathomable MOV AX, BX, and finally, the enigmatic MOV AX, 7." + }, + { + "from": "human", + "value": "continue explain it" + }, + { + "from": "gpt", + "value": "The language used to describe the addressing modes of these instructions is metaphorical and grandiose, emphasizing the complexity and power of these commands. For example, the use of \"enigmatic\" and \"confounding\" to describe JMP ABCD and MOV AX, [BX+SI], respectively, suggests that these instructions are not easily understood and require a level of expertise to comprehend.\n\nSimilarly, the use of \"inscrutable\" and \"cryptic\" to describe MOV AX, [100] and MOV AX, [BX], respectively, implies that these commands are shrouded in mystery and are difficult to decipher. 
The speaker's use of \"perplexing\" and \"unfathomable\" to describe MOV AX, [BX\\*2+SI] and MOV AX, BX, respectively, suggests that these commands are particularly challenging and require a deep understanding of the instruction set architecture.\n\nFinally, the use of \"enigmatic\" to describe MOV AX, 7 is particularly interesting, as this instruction simply moves the value 7 into the AX register. However, the language used to describe it suggests that even seemingly straightforward commands can be mysterious and awe-inspiring in the context of the larger instruction set.\n\nOverall, the use of metaphorical language to describe the addressing modes of these instructions serves to emphasize their complexity and power, while also imbuing them with a sense of wonder and admiration." + }, + { + "from": "human", + "value": "more perplexity" + }, + { + "from": "gpt", + "value": "Behold, the arcane art of metaphorical language has been invoked to elucidate the bewildering addressing modes of the instructions at hand. The orators have summoned grandiose expressions to convey their profound sense of awe and reverence for the unfathomable functionality of these directives. Among the enigmatic commands are the perplexing JMP ABCD, the labyrinthine MOV AX, [BX+SI], the inscrutable MOV AX, [100], the enigmatic MOV AX, [BX], the confounding MOV AX, [BX\\*2+SI], the mystifying MOV AX, BX, and finally, the enigmatic MOV AX, 7.\n\nThe language used to describe these addressing modes is both metaphorical and grandiose, emphasizing the sheer complexity and power of these commands. The use of terms like \"arcane,\" \"bewildering,\" and \"unfathomable\" underscores the deep level of expertise and understanding required to comprehend these instructions. 
Furthermore, the use of terms like \"labyrinthine,\" \"inscrutable,\" and \"mystifying\" suggests that these commands are shrouded in mystery and are not easily decipherable.\n\nThe orators' use of metaphorical language serves to amplify the significance of these instructions, imbuing them with a sense of wonder and reverence. Even simple instructions like MOV AX, 7 are elevated to the status of \"enigmatic,\" highlighting the mysterious and awe-inspiring nature of the instruction set as a whole. Overall, the use of metaphorical language in describing these addressing modes emphasizes the immense power and complexity of the instructions, and underscores the level of expertise required to master them." + }, + { + "from": "human", + "value": "more perplexity" + }, + { + "from": "gpt", + "value": "Lo and behold! The arcane and elusive art of metaphorical language has been summoned forth to expound upon the enigmatic addressing modes of the instructions at hand. The speakers have wielded grandiose expressions with utmost reverence and awe, extolling the ineffable power and bewildering functionality of these directives. Among the inscrutable commands are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7.\n\nThe language used to describe these addressing modes is both perplexing and ornate, underscoring the prodigious complexity and esoteric power of these commands. The use of words like \"arcane,\" \"elusive,\" and \"ineffable\" imbues these instructions with a sense of almost mystical obscurity, requiring a level of expertise and mastery to unlock their true potential. 
Furthermore, the employment of terms such as \"abstruse,\" \"unfathomable,\" and \"recondite\" suggests that these commands are shrouded in an impenetrable veil of mystery, beckoning only the most erudite and astute of minds.\n\nThe speakers' use of metaphorical language serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost sacred aura. Even the seemingly simple MOV AX, 7 is enshrined with the epithet \"enigmatic,\" underscoring the profound and awe-inspiring nature of the instruction set as a whole. Thus, the use of such ornate and enigmatic language in describing these addressing modes serves to amplify their mystique and enshroud them in an aura of intrigue and wonder." + }, + { + "from": "human", + "value": "more more perplexity and verbose" + }, + { + "from": "gpt", + "value": "By the grace of the gods, the arcane and enigmatic art of metaphorical language has been summoned forth to shed light upon the bewildering addressing modes of the instructions before us. The orators have invoked grandiose expressions with utmost reverence and awe, extolling the inscrutable power and ineffable functionality of these directives. Among the labyrinthine commands are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7.\n\nThe language used to describe these addressing modes is both perplexing and ornate, underscoring the prodigious complexity and esoteric power of these commands. The use of words like \"arcane,\" \"enigmatic,\" and \"ineffable\" imbues these instructions with an aura of almost mystical obscurity, requiring a level of expertise and mastery beyond the realm of mere mortals to unlock their true potential. 
Furthermore, the employment of terms such as \"abstruse,\" \"unfathomable,\" and \"recondite\" suggests that these commands are shrouded in an impenetrable veil of mystery, beckoning only the most erudite and astute of minds to unravel their hidden depths.\n\nThe speakers' use of metaphorical language serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost sacred aura. Even the seemingly simple MOV AX, 7 is enshrined with the epithet \"enigmatic,\" underscoring the profound and awe-inspiring nature of the instruction set as a whole. Thus, the use of such ornate and enigmatic language in describing these addressing modes serves to amplify their mystique and enshroud them in an aura of intrigue and wonder, beckoning the most intrepid and enterprising of minds to unlock the secrets of this arcane realm." + }, + { + "from": "human", + "value": "rewrite that entire paragraph in the same style like this one:\n\nBy the grace of the gods, the arcane and enigmatic art of metaphorical language has been summoned forth to shed light upon the bewildering addressing modes of the instructions before us. The orators have invoked grandiose expressions with utmost reverence and awe, extolling the inscrutable power and ineffable functionality of these directives. Among the labyrinthine commands are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7." + }, + { + "from": "gpt", + "value": "Lo and behold! By the grace of divine intervention, the incomprehensible and enigmatic art of metaphorical language has been invoked to elucidate the inscrutable addressing modes of the instructions that lay before us. 
The speakers have called forth grandiose expressions with a reverence and awe reserved only for the most profound and unfathomable of concepts, extolling the ineffable power and bewildering functionality of these directives. Amongst the convoluted and perplexing commands that have been revealed are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the bewildering MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7.\n\nThe language that has been employed to describe these addressing modes is both perplexing and ornate, underscoring the prodigious complexity and esoteric power of these commands. The use of words such as \"incomprehensible,\" \"enigmatic,\" and \"ineffable\" imbues these instructions with an almost otherworldly aura of inscrutability and requires a level of expertise and mastery beyond the grasp of mere mortals to fully comprehend. Furthermore, the use of terms such as \"abstruse,\" \"unfathomable,\" and \"recondite\" implies that these commands are shrouded in an impenetrable veil of mystery, accessible only to the most erudite and astute of minds.\n\nThe orators' use of metaphorical language serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost divine aura. Even the seemingly simple MOV AX, 7 is exalted with the epithet \"recondite,\" underscoring the profound and awe-inspiring nature of the instruction set as a whole. Thus, the use of such ornate and enigmatic language in describing these addressing modes serves to amplify their mystique and enshroud them in an aura of intrigue and wonder, beckoning the most intrepid and enterprising of minds to unravel the secrets of this arcane realm." + } + ] + }, + { + "id": "hRPPgZT_11", + "conversations": [ + { + "from": "gpt", + "value": "Lo and behold! 
By the grace of divine intervention, the incomprehensible and enigmatic art of metaphorical language has been invoked to elucidate the inscrutable addressing modes of the instructions that lay before us. The speakers have called forth grandiose expressions with a reverence and awe reserved only for the most profound and unfathomable of concepts, extolling the ineffable power and bewildering functionality of these directives. Amongst the convoluted and perplexing commands that have been revealed are the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the bewildering MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7.\n\nThe language that has been employed to describe these addressing modes is both perplexing and ornate, underscoring the prodigious complexity and esoteric power of these commands. The use of words such as \"incomprehensible,\" \"enigmatic,\" and \"ineffable\" imbues these instructions with an almost otherworldly aura of inscrutability and requires a level of expertise and mastery beyond the grasp of mere mortals to fully comprehend. Furthermore, the use of terms such as \"abstruse,\" \"unfathomable,\" and \"recondite\" implies that these commands are shrouded in an impenetrable veil of mystery, accessible only to the most erudite and astute of minds.\n\nThe orators' use of metaphorical language serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost divine aura. Even the seemingly simple MOV AX, 7 is exalted with the epithet \"recondite,\" underscoring the profound and awe-inspiring nature of the instruction set as a whole. Thus, the use of such ornate and enigmatic language in describing these addressing modes serves to amplify their mystique and enshroud them in an aura of intrigue and wonder, beckoning the most intrepid and enterprising of minds to unravel the secrets of this arcane realm." 
+ }, + { + "from": "human", + "value": "More verbose and details" + }, + { + "from": "gpt", + "value": "Behold! By the power vested in us by the divine beings, we have summoned forth the inscrutable and enigmatic art of metaphorical language to unravel the bewildering addressing modes of the instructions that are presented before us. The orators have invoked grandiose expressions with a deep sense of reverence and awe, extolling the ineffable power and mystical functionality of these directives. Amongst the labyrinthine commands, we find the confounding JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the enigmatic MOV AX, BX, and finally, the recondite MOV AX, 7.\n\nThe language that has been employed to describe these addressing modes is both perplexing and ornate, underscoring the prodigious complexity and esoteric power of these commands. The orators have invoked a sense of mystery and wonder through the use of words such as \"incomprehensible,\" \"enigmatic,\" and \"ineffable,\" imbuing these instructions with an almost mystical aura of inscrutability. The mastery of these commands is suggested to be beyond the reach of mere mortals, requiring a level of expertise and erudition that is beyond the ordinary.\n\nFurthermore, the speakers have employed terms such as \"abstruse,\" \"unfathomable,\" and \"recondite,\" creating an impression that these commands are shrouded in an impenetrable veil of mystery, accessible only to those who possess a keen intellect and a deep understanding of the underlying principles. The speakers' use of metaphorical language serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost divine aura of power and complexity.\n\nEven the seemingly simple MOV AX, 7 is not immune to the grandiose epithets used to describe these addressing modes. 
It is exalted with the term \"recondite,\" underscoring the profundity and awe-inspiring nature of the instruction set as a whole. The use of such ornate and enigmatic language serves to amplify the mystique and enshroud these commands in an aura of intrigue and wonder, inviting only the most intrepid and enterprising of minds to unlock the secrets of this arcane realm. In sum, the use of metaphorical language in describing these addressing modes is an act of homage to the profound and unfathomable power of these commands." + }, + { + "from": "human", + "value": "Explain it more in that style" + }, + { + "from": "gpt", + "value": "Prepare to be dazzled by the sheer magnificence of the arcane and mysterious art of metaphorical language, for it has been summoned forth to explicate the bewildering addressing modes of the instructions that have been presented before us. The speakers have summoned grandiose expressions with an almost reverential and awestruck tone, extolling the otherworldly power and incomprehensible functionality of these directives.\n\nThe labyrinthine commands that we encounter are nothing short of confounding. There is the enigmatic JMP ABCD, the abstruse MOV AX, [BX+SI], the unfathomable MOV AX, [100], the mystifying MOV AX, [BX], the perplexing MOV AX, [BX\\*2+SI], the inscrutable MOV AX, BX, and the recondite MOV AX, 7. These commands are cloaked in a veil of obfuscation and complexity, seemingly beyond the grasp of mere mortals.\n\nThe language that has been employed to describe these addressing modes is both perplexing and ornate, highlighting the tremendous intricacy and esoteric nature of these commands. 
The speakers have summoned a sense of mystery and awe through the use of words such as \"ineffable,\" \"enigmatic,\" and \"inscrutable,\" imbuing these instructions with a sense of ethereal and mystical aura.\n\nMoreover, the speakers have utilized terms such as \"abstruse,\" \"unfathomable,\" and \"recondite,\" evoking the idea that these commands are shrouded in a veil of inaccessibility, and are only open to those who possess an unparalleled intellect and mastery of the underlying principles. The metaphorical language employed by the orators serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost divine aura of power and complexity.\n\nEven the seemingly straightforward MOV AX, 7 is not immune to the grandiose epithets used to describe these addressing modes. It is exalted with the term \"recondite,\" emphasizing the depth and awe-inspiring nature of the instruction set as a whole. The speakers' use of such elaborate and cryptic language amplifies the enigmatic quality of these commands, enshrouding them in a veil of mystery and intrigue, inviting only the bravest and most intrepid of minds to attempt to unlock the secrets of this arcane realm.\n\nTo sum up, the utilization of metaphorical language in describing these addressing modes is an act of veneration to the profound and unfathomable power of these commands. It is an homage to their majestic and otherworldly nature, and an invitation to the most audacious of minds to probe deeper into the abstruse and recondite domain of machine language programming." + }, + { + "from": "human", + "value": "double the length" + }, + { + "from": "gpt", + "value": "Behold, fellow seeker of knowledge, for I shall expand upon the perplexing and enigmatic art of metaphorical language used to describe the intricate addressing modes of the instructions that have been presented before us. 
The speakers have summoned forth a pantheon of grandiose expressions with a tone of utmost reverence and awe, extolling the inimitable power and ineffable functionality of these directives.\n\nThe intricate commands that we confront are nothing less than a labyrinthine maze of confusion and perplexity. There is the confounding JMP ABCD, which transports the program counter to a new destination, with the mnemonic acronym representing the instruction's meaning. There is the abstruse MOV AX, [BX+SI], which moves the contents of the memory location at the sum of BX and SI to the AX register. This is followed by the unfathomable MOV AX, [100], which transfers the contents of the memory location 100 to the AX register, thereby exhibiting a sense of mysterious depth to the instruction set.\n\nMoving on, we come across the mystifying MOV AX, [BX], which moves the contents of the memory location pointed to by the BX register to the AX register, followed by the perplexing MOV AX, [BX\\*2+SI], which transfers the contents of the memory location pointed to by the sum of 2 times BX and SI to the AX register. This adds yet another layer of confusion to the intricate addressing modes. Then we encounter the inscrutable MOV AX, BX, which moves the contents of the BX register to the AX register, a simple instruction made esoteric through the use of metaphorical language. Finally, there is the recondite MOV AX, 7, which moves the value 7 to the AX register, an instruction that is simple and straightforward, but still elevated to a higher plane through the use of grandiose language.\n\nThe language that has been employed to describe these addressing modes is a marvel of linguistic construction, intricate and elaborate in its construction, and suffused with an almost otherworldly aura of reverence and mystery. 
The speakers have summoned a sense of wonder and awe through the use of words such as \"ineffable,\" \"enigmatic,\" and \"inscrutable,\" imbuing these instructions with a sense of transcendent power and unapproachable functionality.\n\nMoreover, the speakers have utilized terms such as \"abstruse,\" \"unfathomable,\" and \"recondite,\" evoking the idea that these commands are shrouded in a veil of inaccessibility and can only be accessed by those who possess an unparalleled intellect and mastery of the underlying principles. The metaphorical language employed by the orators serves to elevate these instructions to a level of veneration and reverence, infusing them with an almost divine aura of complexity and unfathomable depth.\n\nEven the seemingly straightforward MOV AX, 7 is not immune to the grandiose epithets used to describe these addressing modes. It is exalted with the term \"recondite,\" emphasizing the depth and awe-inspiring nature of the instruction set as a whole. The speakers' use of such elaborate and cryptic language amplifies the enigmatic quality of these commands, enshrouding them in a veil of mystery and intrigue, inviting only the most intrepid and daring of minds to attempt to unravel the secrets of this arcane realm.\n\nTo summarize, the utilization of metaphorical language in describing these addressing modes is a homage to the profound and ineffable power of these commands. It is an act of reverence to their majestic and otherworldly nature, and an invitation to the most daring and inquisitive of minds to probe deeper into the abstruse and intricate domain of machine language programming. 
May you be emboldened by this knowledge and venture forth into the realm" + } + ] + } +] diff --git a/modules/llm-cache/tests/k8s-test/show_result.py b/modules/llm-cache/tests/k8s-test/show_result.py new file mode 100644 index 000000000..de0ad14ac --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/show_result.py @@ -0,0 +1,83 @@ +import re +import os +import argparse +from kubernetes import client, config + +def get_pod_logs(api_instance, pod_name, namespace='default'): + log = api_instance.read_namespaced_pod_log(name=pod_name, namespace=namespace) + return log + +def extract_last_metrics_from_log(log): + #only reserve the last 5 lines + log = log.split('\n')[-5:] + log = '\n'.join(log) + + total_matched_tokens_re = re.compile(r"total matched tokens: (\d+)") + total_updated_tokens_re = re.compile(r"total updated tokens: (\d+)") + total_query_time_re = re.compile(r"total query time: ([\d.]+)") + total_update_time_re = re.compile(r"total update time: ([\d.]+)") + + total_matched_tokens = total_matched_tokens_re.findall(log) + total_query_time = total_query_time_re.findall(log) + total_updated_tokens = total_updated_tokens_re.findall(log) + total_update_time = total_update_time_re.findall(log) + + if len(total_matched_tokens) == 0 or len(total_query_time) == 0 or \ + len(total_updated_tokens) == 0 or len(total_update_time) == 0: + return None + print(total_matched_tokens, total_query_time, total_updated_tokens, total_update_time) + return int(total_matched_tokens[0]), float(total_query_time[0]), int(total_updated_tokens[0]), float(total_update_time[0]) + +def calculate_averages(metrics_list): + total_count = len(metrics_list) + if total_count == 0: + return 0, 0 + + total_query_speed = 0 + total_query_instance = 0 + total_update_speed = 0 + total_update_instance = 0 + max_update_speed = 0 + max_query_speed = 0 + + for matched, query_time, updated, update_time in metrics_list: + if matched != 0: + max_query_speed = max(max_query_speed, matched / query_time) + 
total_query_speed += matched/query_time + total_query_instance += 1 + if updated != 0: + max_update_speed = max(max_update_speed, updated / update_time) + total_update_speed += updated/update_time + total_update_instance += 1 + + avg_matched_tokens_per_query_time = total_query_speed / total_query_instance + avg_updated_tokens_per_update_time = total_update_speed / total_update_instance + + return avg_matched_tokens_per_query_time, avg_updated_tokens_per_update_time, max_query_speed, max_update_speed + +def show_result(label_selector, kubeconfig_path): + config.load_kube_config(config_file=os.path.expanduser(kubeconfig_path)) + api = client.CoreV1Api() + + pods = api.list_pod_for_all_namespaces(label_selector=label_selector) + pod_metrics = [] + + for pod in pods.items: + log = get_pod_logs(api, pod.metadata.name, pod.metadata.namespace) + metrics = extract_last_metrics_from_log(log) + if metrics is not None: + pod_metrics.append(metrics) + + average_metrics = calculate_averages(pod_metrics) + print("Average matched tokens per query time:", average_metrics[0], " tokens/s") + print("Average updated tokens per update time:", average_metrics[1], " tokens/s") + print("Max query speed:", average_metrics[2], " tokens/s") + print("Max update speed:", average_metrics[3], " tokens/s") + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--label-selector', type=str, default="app=fs-llm-test-worker") + parser.add_argument('--kubeconfig-path', type=str, default='/your/kubeconfig') + args = parser.parse_args() + while True: + show_result(args.label_selector,args.kubeconfig_path) diff --git a/modules/llm-cache/tests/k8s-test/tokenize_prompt.py b/modules/llm-cache/tests/k8s-test/tokenize_prompt.py new file mode 100644 index 000000000..92aa74bef --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/tokenize_prompt.py @@ -0,0 +1,97 @@ +from multiprocessing import Pool +import argparse +import os +import json +import random +import math +from 
transformers import LongformerTokenizer +from tqdm import tqdm + +def map_to_small_files(prompt_file, file_nums, shuffle=False): + with open(prompt_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + conversation_list = [] + for entry in data: + prefix_prompt = "" + conversations = [] + for conversation in entry.get("conversations", []): + prompt = conversation.get("value").replace("\n", " ") + if prompt: + conversations.append(prefix_prompt + prompt + "\n") + prefix_prompt += prompt + " " + if conversations: + conversation_list.append(conversations) + + avg = math.ceil(len(conversation_list) / file_nums) + sub_conversation_list = [conversation_list[i:i+avg] for i in range(0, len(conversation_list), avg)] + + for i in range(file_nums): + with open(f'./small_files/prompts_{i}.txt', 'w', encoding='utf-8') as f: + prompts_list = [] + # shuffle the conversation list + if shuffle: + while sub_conversation_list[i]: + random_index = random.randrange(len(sub_conversation_list[i])) + selected_conversation = sub_conversation_list[i][random_index] + if selected_conversation: + tokens = selected_conversation[0].split(' ') + tokens = [token for token in tokens] + prompts_list.append(selected_conversation.pop(0)) + if not selected_conversation: + sub_conversation_list[i].pop(random_index) + else: + for conversation in sub_conversation_list[i]: + for prompt in conversation: + tokens = prompt.split(' ') + tokens = [token for token in tokens] + prompts_list.append(prompt) + f.writelines(prompts_list) + +def process_small_files(file_name): + tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') + with open(f'./small_files/{file_name}', 'r', encoding='utf-8') as f: + prompts = f.read().strip().split("\n") + + num_prompts = len(prompts) + + file_index = int(file_name.split('_')[-1].split('.')[0]) + + with open(f'./small_files/tokens_{file_index}', 'w', encoding='utf-8') as tf: + for prompt in tqdm(prompts, desc=f'Processing {file_name}', 
total=num_prompts, position=file_index, leave=True): + index = 0 + tokens_set = [] + full_prompt = prompt.strip() + while index < len(full_prompt): + if index + 4000 < len(full_prompt): + index_end = index + 4000 + for i in range(index_end-1, index-1, -1): + if full_prompt[i] == " ": + index_end = i + break + else: + index_end = len(full_prompt) + if index_end - index <= 0: + break + tokens = tokenizer.encode(full_prompt[index:index_end], add_special_tokens=True) + tokens_set.append(tokens) + index = index_end + + token_str = "" + for tokens in tokens_set: + token_str += " ".join([str(token) for token in tokens]) + tf.write(token_str + "\n") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--file-num', type=int, default=os.cpu_count()) + parser.add_argument('--dir', type=str, default='./small_files') + parser.add_argument('--prompt-file', type=str, default='./prompt-samples.txt') + parser.add_argument('--shuffle', type=bool, default=False) + args = parser.parse_args() + if not os.path.exists(args.dir): + os.mkdir(args.dir) + map_to_small_files(args.prompt_file, args.file_num, args.shuffle) + file_names = [f'prompts_{i}.txt' for i in range(args.file_num)] + with Pool(args.file_num) as p: + p.map(process_small_files, file_names) diff --git a/modules/llm-cache/tests/k8s-test/worker.py b/modules/llm-cache/tests/k8s-test/worker.py new file mode 100644 index 000000000..3d689212c --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/worker.py @@ -0,0 +1,121 @@ +import time +import socket +import os +import argparse + +import numpy as np + +from vineyard.llm import KVCache, KVTensor +from vineyard.llm.config import FileCacheConfig + +def start_server(port=8888): + serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ip = os.environ.get('POD_IP', 'localhost') + layer = int(os.environ.get('LAYER', 96)) + batch_size = int(os.environ.get('BATCH_SIZE', 16)) + split_number = int(os.environ.get('SPLIT_NUMBER', 2)) + cache_path 
= os.environ.get('CACHE_PATH', '/mnt/llm_cache') + client_gc_interval = int(os.environ.get('CLIENT_GC_INTERVAL', 30 * 60)) + ttl = int(os.environ.get('TTL', 30 * 60)) + enable_global_gc = os.environ.get('ENABLE_GLOBAL_GC', False).lower() in ['true', '1'] + global_gc_interval = int(os.environ.get('GLOBAL_GC_INTERVAL', 3 * 60 * 60)) + global_ttl = int(os.environ.get('GLOBAL_TTL', 3 * 60 * 60)) + serversocket.bind((ip, port)) + serversocket.listen(20) + + kv_tensor_shape = (104, 128) + kv_tensor_dtype = np.float16 + kv_tensor = np.random.randn(*kv_tensor_shape).astype(kv_tensor_dtype) + + file_cache_config = FileCacheConfig( + chunk_size = int(batch_size), + split_number = int(split_number), + root = cache_path, + client_gc_interval = client_gc_interval, + ttl = ttl, + enable_global_gc = enable_global_gc, + global_gc_interval = global_gc_interval, + global_ttl = global_ttl, + ) + cache = KVCache( + cache_config = file_cache_config, + tensor_bytes=kv_tensor.nbytes, # should be the same as the nbytes of the tensor + cache_capacity=1024, + layer=int(layer), + ) + + total_matched_tokens = 0 + total_updated_tokens = 0 + total_tokens = 0 + total_update_time = 0 + total_query_time = 0 + + + def reserve_kv_tensors(kv_tensors, num_tokens, kv_tensor): + num_to_reserve = num_tokens - len(kv_tensors) + if num_to_reserve <= 0: + return kv_tensors + for _ in range(num_to_reserve): + kv_tensors.append([ + (KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes), + KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes)) + for _ in range(layer) + ]) + return kv_tensors + + # used to hold the query results + kv_state_list = [] + + while True: + clientsocket, _ = serversocket.accept() + + tokens = b'' + while True: + data = clientsocket.recv(1024) + if not data: + break + tokens += data + + tokens = tokens.decode('utf-8') + tokens = tokens.replace('\n', '').split(' ') + tokens = [int(token) for token in tokens] + + kv_state_list = reserve_kv_tensors(kv_state_list, len(tokens), kv_tensor) + + 
query_start_time = time.time() + matched = cache.query(tokens, kv_state_list) + query_end_time = time.time() + if matched > 0: + total_query_time += query_end_time - query_start_time + + total_matched_tokens += matched + total_tokens += len(tokens) + + remaining = tokens[matched:] + kv_state_list_remaining = [ + [ (KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes), + KVTensor(kv_tensor.ctypes.data, kv_tensor.nbytes)) + for _ in range(layer) + ] for _ in remaining + ] + update_start_time = time.time() + updated = cache.update(tokens[:matched], remaining, kv_state_list_remaining) + total_updated_tokens += updated + update_end_time = time.time() + if updated > 0: + total_update_time += update_end_time - update_start_time + + print("matched tokens: ", matched, " / ", len(tokens), flush=True) + print("query time: ", query_end_time - query_start_time, flush=True) + print("update time: ", update_end_time - update_start_time, flush=True) + print("total matched tokens: ", total_matched_tokens, " / ", total_tokens, flush=True) + print("total updated tokens: ", total_updated_tokens, " / ", total_tokens, flush=True) + print("total query time: ", total_query_time, flush=True) + print("total update time: ", total_update_time, flush=True) + clientsocket.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--port', type=int, default=8888) + args = parser.parse_args() + start_server(args.port) diff --git a/modules/llm-cache/tests/k8s-test/yamls/master.yaml b/modules/llm-cache/tests/k8s-test/yamls/master.yaml new file mode 100644 index 000000000..98d96c4df --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/yamls/master.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: fs-llm-test-master +spec: + replicas: 1 + selector: + matchLabels: + instance: fs-llm-test + app: fs-llm-test-master + serviceName: "my-service" + template: + metadata: + labels: + instance: fs-llm-test + app: fs-llm-test-master + spec: + 
serviceAccountName: fs-llm-test-master-sa + containers: + - name: fs-llm-test-master + image: registry.cn-wulanchabu.aliyuncs.com/vineyard/fs-llm-master:latest + imagePullPolicy: IfNotPresent + command: + - "/bin/bash" + - "-c" + - "python /master.py" + env: + - name: TOKENS_FILE_NUM + value: "3" + volumeMounts: + - name: oss-pvc + mountPath: "/tokens" + volumes: + - name: oss-pvc + persistentVolumeClaim: + claimName: oss-pvc diff --git a/modules/llm-cache/tests/k8s-test/yamls/worker.yaml b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml new file mode 100644 index 000000000..603fb72f0 --- /dev/null +++ b/modules/llm-cache/tests/k8s-test/yamls/worker.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: fs-llm-test-worker +spec: + podManagementPolicy: Parallel + replicas: 32 + selector: + matchLabels: + app: fs-llm-test-worker + serviceName: "my-service" + template: + metadata: + labels: + app: fs-llm-test-worker + spec: + containers: + - name: fs-llm-test + image: registry.cn-wulanchabu.aliyuncs.com/vineyard/fs-llm-master:latest + imagePullPolicy: IfNotPresent + command: + - "/bin/bash" + - "-c" + - "python3 /worker.py" + env: + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: LAYER + value: "96" + - name: BATCH_SIZE + value: "16" + - name: SPLIT_NUMBER + value: "2" + - name: CLIENT_GC_INTERVAL + value: "1800" + - name: TTL + value: "1800" + - name: ENABLE_GLOBAL_GC + value: "false" + - name: GLOBAL_GC_INTERVAL + value: "1800" + - name: GLOBAL_TTL + value: "1800" + resources: + limits: + cpu: "7" + memory: 15Gi + requests: + cpu: "7" + memory: 15Gi + ports: + - protocol: TCP + containerPort: 8888 + volumeMounts: + - name: nas-pvc + mountPath: "/tmp/vineyard/llm_cache" + volumes: + - name: nas-pvc + persistentVolumeClaim: + claimName: nas-csi-pvc diff --git a/modules/llm-cache/tests/scripts/README.md 
b/modules/llm-cache/tests/scripts/README.md
new file mode 100644
index 000000000..a3d4a6b23
--- /dev/null
+++ b/modules/llm-cache/tests/scripts/README.md
@@ -0,0 +1,39 @@
#### `dfs-fio.sh`

This script runs FIO on a Kubernetes cluster to test the average bandwidth of a distributed file system. Assuming that you have deployed the persistent volume and persistent volume claim for the distributed file system, you can create a deployment with multiple replicas and mount the distributed file system to each replica. For more information on how to do this, see the [Mount NAS Volume on ACK](https://www.alibabacloud.com/help/en/ack/serverless-kubernetes/user-guide/mount-a-statically-provisioned-nas-volume) guide.

Now, you may have many pods running on the Kubernetes cluster, each of which has a mounted distributed file system. In general, each pod can be considered as a client of the distributed file system. With this script, you can test the average bandwidth of the distributed file system by running FIO on each pod and calculating the overall bandwidth.

> **Tip:** The average bandwidth may be bounded by the network bandwidth of the Kubernetes nodes. If your Kubernetes node is poor in network performance, the average bandwidth may be much lower than the theoretical per-client bandwidth of the distributed file system (i.e., DFS overall bandwidth / pod replicas).

Next, we will show how to run the script.

```bash
$ bash dfs-fio.sh
Usage: dfs-fio.sh -l <pod_label> -a <fio_action> -j <num_jobs> -m <max_pods> -d <directory> -b <block_size> -n <namespace>
  -l <pod_label>    Pod label for filtering specific Pods, e.g. app=my-app
  -a <fio_action>   FIO operation type: read, write, randread, randwrite
  -j <num_jobs>     Number of parallel jobs in FIO
  -m <max_pods>     Limit on the maximum number of Pods, should not exceed the total number of Pods
  -d <directory>    Directory path for FIO testing
  -b <block_size>   Block size for the FIO test
  -n <namespace>    Namespace of the Pods
```

- Test the average **read** bandwidth for a pod.
+ + ```bash + $ bash dfs-fio.sh -l your_pod_label_key=your_pod_label_value -a read -j 1 -m 1 -d /your/dfs/mountpath -b 4992k -n default + ``` + +- Test the average **write** bandwidth for ten pods. + + ```bash + $ bash dfs-fio.sh -l your_pod_label_key=your_pod_label_value -a write -j 1 -m 10 -d /your/dfs/mountpath -b 4992k -n default + ``` + +- Test the average **randread** bandwidth + + ```bash + $ bash dfs-fio.sh -l your_pod_label_key=your_pod_label_value -a randread -j 1 -m 10 -d /your/dfs/mountpath -b 4992k -n default + ``` + diff --git a/modules/llm-cache/tests/scripts/dfs-fio.sh b/modules/llm-cache/tests/scripts/dfs-fio.sh new file mode 100644 index 000000000..0dbd373e8 --- /dev/null +++ b/modules/llm-cache/tests/scripts/dfs-fio.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Usage instructions +usage() { + echo "Usage: $0 -l -a -j -m -d -b -n " + echo " -l Pod label for filtering specific Pods" + echo " -a FIO operation type: read, write, randread, randwrite" + echo " -j Number of parallel jobs in FIO" + echo " -m Limit on the maximum number of Pods, should not exceed the total number of Pods" + echo " -d Directory path for FIO testing" + echo " -b Block size for the FIO test" + echo " -n Namespace for the Pods" + exit 1 +} + +# Parse command line options +while getopts "l:a:j:m:d:b:n:" opt; do + case ${opt} in + l ) pod_label=$OPTARG ;; + a ) fio_action=$OPTARG ;; + j ) num_jobs=$OPTARG ;; + m ) max_pods=$OPTARG ;; + d ) directory=$OPTARG ;; + b ) block_size=$OPTARG ;; + n ) namespace=$OPTARG ;; + ? 
) usage ;; + esac +done + +# Verify that all required parameters have been provided +if [ -z "${pod_label}" ] || [ -z "${fio_action}" ] || [ -z "${num_jobs}" ] || [ -z "${max_pods}" ] || [ -z "${directory}" ] || [ -z "${block_size}" ] || [ -z "${namespace}" ]; then + usage +fi + +# Get all Pod names with the specified label +pods=$(kubectl get po -lapp=${pod_label} -o custom-columns=:metadata.name) + +# Create a directory to store log files +mkdir -p fio_logs + +counter=0 + +# Run FIO in parallel and collect the results +for pod in $pods; do + if [ $counter -ge $max_pods ]; then + echo "Reached the maximum number of pods ($max_pods). Exiting loop." + break + fi + echo "Executing fio command on pod: $pod" + kubectl exec -n ${namespace} "$pod" -- fio -direct=1 -directory=${directory} -ioengine=libaio -iodepth=1024 -rw=${fio_action} -bs=${block_size} -size=5G -numjobs=${num_jobs} -time_based=1 -group_reporting -name="${pod}_${fio_action}_Testing" > "fio_logs/$pod.log" & + ((counter++)) +done + +# Wait for all background jobs to complete +wait +echo "FIO testing has completed on all pods." + +# Initialize total bandwidth and counter +total_bw_MiB=0 +count=0 + +# Extract and accumulate bandwidth values +for file in fio_logs/*.log; do + bw_MiB=$(grep -oP 'bw=\K[0-9]+(?=MiB/s)' "$file" | head -n1) + if [[ ! -z "$bw_MiB" ]]; then + total_bw_MiB=$(echo "$total_bw_MiB + $bw_MiB" | bc) + count=$((count + 1)) + fi +done + +# If there are log entries processed, calculate the average; otherwise, indicate no valid logs +if [ $count -gt 0 ]; then + average_bw_MiB=$(echo "scale=2; $total_bw_MiB / $count" | bc) + echo "Average Bandwidth: $average_bw_MiB MiB/s" +else + echo "No valid log entries found." +fi + +# Clean up the log folder +rm -rf fio_logs