From 1edb37cd14461bd0411fef4b26d54cd8a9fc16ae Mon Sep 17 00:00:00 2001
From: Ye Cao <caoye.cao@alibaba-inc.com>
Date: Mon, 4 Mar 2024 17:59:46 +0800
Subject: [PATCH] Improve the query API of llm cache and use vector<uint8_t> as
 payload object.

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 modules/basic/ds/dataframe.cc                 |   3 +
 modules/llm-cache/ds/kv_state_cache.cc        |  41 ++++---
 modules/llm-cache/ds/kv_state_cache.h         |  16 +--
 modules/llm-cache/ds/kv_state_cache_block.cc  | 111 +++++++++---------
 modules/llm-cache/ds/kv_state_cache_block.h   |  50 ++++----
 .../llm-cache/ds/kv_state_cache_manager.cc    |  41 ++++---
 modules/llm-cache/ds/kv_state_cache_manager.h |  25 ++--
 modules/llm-cache/radix-tree/radix-tree.cc    |   2 +-
 .../tests/kv_state_cache_benchmark_test.cc    |  12 +-
 .../llm-cache/tests/kv_state_cache_test.cc    |  95 +++++++--------
 10 files changed, 198 insertions(+), 198 deletions(-)
diff --git a/modules/basic/ds/dataframe.cc b/modules/basic/ds/dataframe.cc
index ebfdbdbf..baa2cb34 100644
--- a/modules/basic/ds/dataframe.cc
+++ b/modules/basic/ds/dataframe.cc
@@ -85,6 +85,9 @@ const std::shared_ptr<arrow::RecordBatch> DataFrame::AsBatch(bool copy) const {
     } else if (auto tensor =
                    std::dynamic_pointer_cast<Tensor<std::string>>(df_col)) {
       num_rows = tensor->shape()[0];
+    } else if (auto tensor =
+                   std::dynamic_pointer_cast<Tensor<uint8_t>>(df_col)) {
+      num_rows = tensor->shape()[0];
     }
 
     std::vector<std::shared_ptr<arrow::Buffer>> buffer{
diff --git a/modules/llm-cache/ds/kv_state_cache.cc b/modules/llm-cache/ds/kv_state_cache.cc
index 4c1615f8..37ecc8e1 100644
--- a/modules/llm-cache/ds/kv_state_cache.cc
+++ b/modules/llm-cache/ds/kv_state_cache.cc
@@ -57,24 +57,24 @@ void KVStateCache::Resolve() {
   }
 
   // 3. construct the member field
-  this->dimension = this->meta_.GetKeyValue<int>("dimension");
+  this->tensorBytes = this->meta_.GetKeyValue<int>("tensorBytes");
   this->version = this->meta_.GetKeyValue<uint64_t>("version");
   this->layer = this->meta_.GetKeyValue<int>("layer");
-  VLOG(100) << "construct the member field success, with dimension:"
-            << this->dimension << " version:" << this->version
+  VLOG(100) << "construct the member field success, with tensorBytes:"
+            << this->tensorBytes << " version:" << this->version
             << " layer:" << this->layer;
 }
 
 KVStateCache::~KVStateCache() {}
 
-KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int dimension,
+KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int tensorBytes,
                                          int cacheCapacity, int layer,
                                          int blockSize) {
-  this->dimension = dimension;
+  this->tensorBytes = tensorBytes;
   this->version = 0;
   this->layer = layer;
   KVStateCacheBlockBuilder* builder =
-      new KVStateCacheBlockBuilder(client, this->dimension, layer, blockSize);
+      new KVStateCacheBlockBuilder(client, this->tensorBytes, layer, blockSize);
 
   this->rootTree = std::make_shared<RadixTree>(cacheCapacity);
 
@@ -90,7 +90,7 @@ KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int dimension,
 
 KVStateCacheBuilder::KVStateCacheBuilder(Client& client,
                                          std::shared_ptr<KVStateCache> cache) {
-  this->dimension = cache->GetDimension();
+  this->tensorBytes = cache->GetTensorBytes();
   this->version = cache->GetVersion();
   this->layer = cache->GetLayer();
   // 1. create block builder from block
@@ -118,7 +118,7 @@ KVStateCacheBlockBuilder* KVStateCacheBuilder::Split(
   // Split the tree if the list of kvState is full.
   VINEYARD_ASSERT(nodeDataList.size() > 0);
   KVStateCacheBlockBuilder* childKVStateCacheBlockBuilder =
-      new KVStateCacheBlockBuilder(client, this->dimension, this->layer,
+      new KVStateCacheBlockBuilder(client, this->tensorBytes, this->layer,
                                    kvStateCacheBlockBuilder->GetBlockSize());
   for (size_t i = 0; i < nodeDataList.size(); i++) {
     OffsetData* data =
@@ -138,10 +138,9 @@ KVStateCacheBlockBuilder* KVStateCacheBuilder::Split(
   return childKVStateCacheBlockBuilder;
 }
 
-void KVStateCacheBuilder::Update(Client& client,
-                                 const std::vector<int>& tokenList,
-                                 int nextToken,
-                                 const KV_STATE_WITH_LAYER& kvState) {
+void KVStateCacheBuilder::Update(
+    Client& client, const std::vector<int>& tokenList, int nextToken,
+    const std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   std::vector<int> tokenListCopy = tokenList;
   tokenListCopy.push_back(nextToken);
 
@@ -199,9 +198,9 @@ void KVStateCacheBuilder::Update(Client& client,
             << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr();
 }
 
-int KVStateCacheBuilder::Query(Client& client,
-                               const std::vector<int>& tokenList, int token,
-                               KV_STATE_WITH_LAYER& kvState) {
+int KVStateCacheBuilder::Query(
+    Client& client, const std::vector<int>& tokenList, int token,
+    std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   std::vector<int> tokenListCopy = tokenList;
   tokenListCopy.push_back(token);
 
@@ -275,14 +274,14 @@ void KVStateCacheBuilder::Merge(Client& client,
   for (auto it = insertTokenList.begin(); it != insertTokenList.end(); ++it) {
     std::vector<int> tokenList =
         std::vector<int>((*it).begin(), (*it).end() - 1);
-    KV_STATE_WITH_LAYER kvState;
+    std::map<int, std::pair<K_STATE, V_STATE>> kvState;
     for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
       K_STATE key_state;
       V_STATE value_state;
-      key_state.data = malloc(this->dimension * sizeof(double));
-      key_state.length = this->dimension * sizeof(double);
-      value_state.data = malloc(this->dimension * sizeof(double));
-      value_state.length = this->dimension * sizeof(double);
+      key_state.data = malloc(this->tensorBytes);
+      key_state.length = this->tensorBytes;
+      value_state.data = malloc(this->tensorBytes);
+      value_state.length = this->tensorBytes;
 
       kvState.insert(
           std::make_pair(currentLayer, std::make_pair(key_state, value_state)));
@@ -309,7 +308,7 @@ std::shared_ptr<Object> KVStateCacheBuilder::_Seal(Client& client) {
   std::shared_ptr<KVStateCache> kvStateCache = std::make_shared<KVStateCache>();
 
   // 1. store the member variables to cache object meta
-  kvStateCache->meta_.AddKeyValue("dimension", this->dimension);
+  kvStateCache->meta_.AddKeyValue("tensorBytes", this->tensorBytes);
   kvStateCache->meta_.AddKeyValue("version", this->version);
   kvStateCache->meta_.AddKeyValue("layer", this->layer);
 
diff --git a/modules/llm-cache/ds/kv_state_cache.h b/modules/llm-cache/ds/kv_state_cache.h
index 82e6a76c..20e37cd7 100644
--- a/modules/llm-cache/ds/kv_state_cache.h
+++ b/modules/llm-cache/ds/kv_state_cache.h
@@ -16,6 +16,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <vector>
+#include <utility>
 
 #include "client/client.h"
 #include "common/util/logging.h"
@@ -40,7 +41,7 @@ class KVStateCache : public vineyard::Registered<KVStateCache> {
  private:
   std::vector<std::shared_ptr<KVStateCacheBlock>> kvStateCacheBlockList;
   std::shared_ptr<RadixTree> rootTree;
-  int dimension;
+  int tensorBytes;
   int cacheCapacity;
   int layer;
   uint64_t version;
@@ -60,7 +61,7 @@ class KVStateCache : public vineyard::Registered<KVStateCache> {
     return this->kvStateCacheBlockList;
   }
 
-  int GetDimension() { return this->dimension; }
+  int GetTensorBytes() { return this->tensorBytes; }
 
   int GetCacheCapacity() { return this->cacheCapacity; }
 
@@ -77,12 +78,12 @@ class KVStateCache : public vineyard::Registered<KVStateCache> {
 
 class KVStateCacheBuilder : public vineyard::ObjectBuilder {
   std::shared_ptr<RadixTree> rootTree;
-  int dimension;
+  int tensorBytes;
   int layer;
   uint64_t version;
 
  public:
-  KVStateCacheBuilder(Client& client, int dimension, int cacheCapacity,
+  KVStateCacheBuilder(Client& client, int tensorBytes, int cacheCapacity,
                       int layer, int blockSize = DEFAULT_BLOCK_SIZE);
 
   KVStateCacheBuilder(Client& client, std::shared_ptr<KVStateCache> cache);
@@ -92,10 +93,11 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder {
       std::vector<std::shared_ptr<NodeData>> nodeDataList);
 
   void Update(Client& client, const std::vector<int>& token_list,
-              int next_token, const KV_STATE_WITH_LAYER& kv_state);
+              int next_token,
+              const std::map<int, std::pair<K_STATE, V_STATE>>& kv_state);
 
   int Query(Client& client, const std::vector<int>& token_list, int token,
-            KV_STATE_WITH_LAYER& kv_state);
+            std::map<int, std::pair<K_STATE, V_STATE>>& kv_state);
 
   void Delete(std::shared_ptr<NodeData> evicted_node);
 
@@ -109,7 +111,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder {
 
   std::shared_ptr<Object> _Seal(Client& client) override;
 
-  uint64_t GetDimension() { return this->dimension; }
+  uint64_t GetTensorBytes() { return this->tensorBytes; }
 
   std::shared_ptr<RadixTree> GetRootTree() { return this->rootTree; }
 
diff --git a/modules/llm-cache/ds/kv_state_cache_block.cc b/modules/llm-cache/ds/kv_state_cache_block.cc
index 17477143..59eab1de 100644
--- a/modules/llm-cache/ds/kv_state_cache_block.cc
+++ b/modules/llm-cache/ds/kv_state_cache_block.cc
@@ -60,10 +60,10 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) {
   this->layer = this->meta_.GetKeyValue<int>("layer");
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
     this->keyStateTensorList.push_back(
-        std::dynamic_pointer_cast<Tensor<double>>(this->meta_.GetMember(
+        std::dynamic_pointer_cast<Tensor<uint8_t>>(this->meta_.GetMember(
             "keyStateTensorBuilder_" + std::to_string(currentLayer))));
     this->valueStateTensorList.push_back(
-        std::dynamic_pointer_cast<Tensor<double>>(this->meta_.GetMember(
+        std::dynamic_pointer_cast<Tensor<uint8_t>>(this->meta_.GetMember(
             "valueStateTensorBuilder_" + std::to_string(currentLayer))));
   }
   // 2. construct the member field
@@ -74,27 +74,27 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) {
     this->bitmap[i] =
         this->meta_.GetKeyValue<uint64_t>("bitmap_" + std::to_string(i));
   }
-  this->dimension = this->meta_.GetKeyValue<int>("dimension");
+  this->tensorBytes = this->meta_.GetKeyValue<int>("tensorBytes");
   this->blockSize = this->meta_.GetKeyValue<int>("block_size");
 }
 
 KVStateCacheBlock::~KVStateCacheBlock() { delete this->bitmap; }
 
 KVStateCacheBlockBuilder::KVStateCacheBlockBuilder(Client& client,
-                                                   int dimension, int layer,
+                                                   int tensorBytes, int layer,
                                                    int blockSize) {
   this->blockSize = blockSize;
   this->bitmapSize = (blockSize + 63) / 64;
   this->bitmap = new uint64_t[this->bitmapSize];
   memset(this->bitmap, UINT8_MAX, this->bitmapSize * sizeof(uint64_t));
-  std::vector<int64_t> shape = {(int64_t)(blockSize), dimension};
+  std::vector<int64_t> shape = {(int64_t)(blockSize), tensorBytes};
   for (int i = 0; i < layer; i++) {
     this->keyStateTensorBuilderList.push_back(
-        std::make_shared<TensorBuilder<double>>(client, shape));
+        std::make_shared<TensorBuilder<uint8_t>>(client, shape));
     this->valueStateTensorBuilderList.push_back(
-        std::make_shared<TensorBuilder<double>>(client, shape));
+        std::make_shared<TensorBuilder<uint8_t>>(client, shape));
   }
-  this->dimension = dimension;
+  this->tensorBytes = tensorBytes;
   this->layer = layer;
 }
 
@@ -108,37 +108,47 @@ KVStateCacheBlockBuilder::KVStateCacheBlockBuilder(
   for (int i = 0; i < this->bitmapSize; i++) {
     this->bitmap[i] = kvStateCacheBlock->bitmap[i];
   }
-  this->dimension = kvStateCacheBlock->dimension;
+  this->tensorBytes = kvStateCacheBlock->tensorBytes;
   this->layer = kvStateCacheBlock->layer;
-  std::vector<int64_t> shape = {(int64_t)(blockSize), dimension};
+  std::vector<int64_t> shape = {(int64_t)(blockSize), tensorBytes};
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
     this->keyStateTensorBuilderList.push_back(
-        std::make_shared<TensorBuilder<double>>(client, shape));
+        std::make_shared<TensorBuilder<uint8_t>>(client, shape));
     this->valueStateTensorBuilderList.push_back(
-        std::make_shared<TensorBuilder<double>>(client, shape));
+        std::make_shared<TensorBuilder<uint8_t>>(client, shape));
   }
 
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
     memcpy(this->keyStateTensorBuilderList[currentLayer]->data(),
            kvStateCacheBlock->keyStateTensorList[currentLayer]->data(),
-           (int64_t)(blockSize) * this->dimension * sizeof(double));
+           (int64_t)(blockSize) * this->tensorBytes);
     memcpy(this->valueStateTensorBuilderList[currentLayer]->data(),
            kvStateCacheBlock->valueStateTensorList[currentLayer]->data(),
-           (int64_t)(blockSize) * this->dimension * sizeof(double));
+           (int64_t)(blockSize) * this->tensorBytes);
   }
 }
 
 // current we do not consider the layer.
-int KVStateCacheBlockBuilder::Query(Client& client, int index,
-                                    KV_STATE_WITH_LAYER& kvState) {
+int KVStateCacheBlockBuilder::Query(
+    Client& client, int index,
+    std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
-    memcpy((kvState.find(currentLayer)->second).first.data,
-           keyStateTensorBuilderList[currentLayer]->data() + index * dimension,
-           dimension * sizeof(double));
-    memcpy(
-        (kvState.find(currentLayer)->second).second.data,
-        valueStateTensorBuilderList[currentLayer]->data() + index * dimension,
-        dimension * sizeof(double));
+    // Each slot is tensorBytes wide, so slot `index` starts at this byte
+    // offset inside every per-layer buffer.
+    size_t offset = static_cast<size_t>(index) * this->tensorBytes;
+    // Start from empty states: looking the layer up in the caller's map would
+    // dereference end() whenever the map has no entry for it yet.
+    K_STATE keyState{};
+    V_STATE valueState{};
+    keyState.data = keyStateTensorBuilderList[currentLayer]->data() + offset;
+    keyState.length = tensorBytes;
+    valueState.data =
+        valueStateTensorBuilderList[currentLayer]->data() + offset;
+    valueState.length = tensorBytes;
+    // emplace() leaves an entry the caller already supplied untouched.
+    // NOTE(review): a pre-populated map therefore never receives these
+    // pointers -- confirm that is intended.
+    kvState.emplace(currentLayer, std::make_pair(keyState, valueState));
   }
   return 0;
 }
@@ -164,23 +165,23 @@ bool KVStateCacheBlockBuilder::IsFull() {
   return true;
 }
 
-void KVStateCacheBlockBuilder::Update(const KV_STATE_WITH_LAYER& kvState,
-                                      OffsetData* data) {
+void KVStateCacheBlockBuilder::Update(
+    const std::map<int, std::pair<K_STATE, V_STATE>>& kvState,
+    OffsetData* data) {
   int index = this->FindEmptySlot();
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
     K_STATE keyState = (kvState.find(currentLayer)->second).first;
     V_STATE valueState = (kvState.find(currentLayer)->second).second;
-    VINEYARD_ASSERT(keyState.length ==
-                    (size_t) this->dimension * sizeof(double));
-    VINEYARD_ASSERT(valueState.length ==
-                    (size_t) this->dimension * sizeof(double));
-
-    double* keyData = keyStateTensorBuilderList[currentLayer]->data();
-    double* valueData = valueStateTensorBuilderList[currentLayer]->data();
-    memcpy(keyData + index * this->dimension, keyState.data,
-           this->dimension * sizeof(double));
-    memcpy(valueData + index * this->dimension, valueState.data,
-           this->dimension * sizeof(double));
+    VINEYARD_ASSERT(keyState.length == (size_t) this->tensorBytes);
+    VINEYARD_ASSERT(valueState.length == (size_t) this->tensorBytes);
+
+    // Slot `index` starts index * tensorBytes bytes into each layer's buffer;
+    // a bare `+ index` would make neighbouring slots overlap.
+    size_t offset = static_cast<size_t>(index) * this->tensorBytes;
+    uint8_t* keyData = keyStateTensorBuilderList[currentLayer]->data();
+    uint8_t* valueData = valueStateTensorBuilderList[currentLayer]->data();
+    memcpy(keyData + offset, keyState.data, this->tensorBytes);
+    memcpy(valueData + offset, valueState.data, this->tensorBytes);
   }
   data->offset = index;
 
@@ -193,25 +191,26 @@ int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child,
   VINEYARD_ASSERT(this->layer == child->layer);
   int childIndex = child->FindEmptySlot();
   for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) {
-    std::shared_ptr<TensorBuilder<double>> keyStateTensorBuilder =
+    std::shared_ptr<TensorBuilder<uint8_t>> keyStateTensorBuilder =
         keyStateTensorBuilderList[currentLayer];
-    std::shared_ptr<TensorBuilder<double>> valueStateTensorBuilder =
+    std::shared_ptr<TensorBuilder<uint8_t>> valueStateTensorBuilder =
         valueStateTensorBuilderList[currentLayer];
-    std::shared_ptr<TensorBuilder<double>> childKeyStateTensorBuilder =
+    std::shared_ptr<TensorBuilder<uint8_t>> childKeyStateTensorBuilder =
         child->keyStateTensorBuilderList[currentLayer];
-    std::shared_ptr<TensorBuilder<double>> childValueStateTensorBuilder =
+    std::shared_ptr<TensorBuilder<uint8_t>> childValueStateTensorBuilder =
         child->valueStateTensorBuilderList[currentLayer];
 
-    double* keyState = keyStateTensorBuilder->data() + index * this->dimension;
-    double* valueState =
-        valueStateTensorBuilder->data() + index * this->dimension;
-    double* childKeyState =
-        childKeyStateTensorBuilder->data() + childIndex * this->dimension;
-    double* childValueState =
-        childValueStateTensorBuilder->data() + childIndex * this->dimension;
+    // Slots are tensorBytes wide: scale both slot indices to byte offsets.
+    size_t srcOffset = static_cast<size_t>(index) * this->tensorBytes;
+    size_t dstOffset = static_cast<size_t>(childIndex) * this->tensorBytes;
+    uint8_t* keyState = keyStateTensorBuilder->data() + srcOffset;
+    uint8_t* valueState = valueStateTensorBuilder->data() + srcOffset;
+    uint8_t* childKeyState = childKeyStateTensorBuilder->data() + dstOffset;
+    uint8_t* childValueState =
+        childValueStateTensorBuilder->data() + dstOffset;
 
-    memcpy(childKeyState, keyState, this->dimension * sizeof(double));
-    memcpy(childValueState, valueState, this->dimension * sizeof(double));
+    memcpy(childKeyState, keyState, this->tensorBytes);
+    memcpy(childValueState, valueState, this->tensorBytes);
   }
   ACQUIRE_BIT_RESOURCE(child->bitmap[childIndex / 64], childIndex % 64);
   FREE_BIT_RESOURCE(this->bitmap[index / 64], index % 64);
@@ -244,7 +240,7 @@ std::shared_ptr<Object> KVStateCacheBlockBuilder::_Seal(Client& client) {
   }
 
   kvStateCacheBlock->meta_.AddKeyValue("block_size", this->blockSize);
-  kvStateCacheBlock->meta_.AddKeyValue("dimension", this->dimension);
+  kvStateCacheBlock->meta_.AddKeyValue("tensorBytes", this->tensorBytes);
   kvStateCacheBlock->meta_.AddKeyValue("layer", this->layer);
   // 3. set the object type to meta
   kvStateCacheBlock->meta_.SetTypeName(type_name<KVStateCacheBlock>());
@@ -264,15 +260,16 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() {
     LOG(INFO) << "layer:" << currentLayer;
     for (int i = 0; i < this->blockSize; i++) {
       LOG(INFO) << "index:" << i;
+      uint8_t* key_state_data = keyStateTensorBuilderList[currentLayer]->data();
+      uint8_t* value_state_data =
+          valueStateTensorBuilderList[currentLayer]->data();
+      // print the first tensorBytes bytes
       std::string keyState = "";
       std::string valueState = "";
-      for (int j = 0; j < this->dimension; j++) {
-        keyState += std::to_string((keyStateTensorBuilderList[currentLayer]
-                                        ->data())[i * dimension + j]) +
-                    " ";
-        valueState += std::to_string((valueStateTensorBuilderList[currentLayer]
-                                          ->data())[i * dimension + j]) +
-                      " ";
+      for (int j = 0; j < this->tensorBytes; j++) {
+        keyState += std::to_string(key_state_data[i * tensorBytes + j]) + " ";
+        valueState +=
+            std::to_string(value_state_data[i * tensorBytes + j]) + " ";
       }
       LOG(INFO) << "keyState:" << keyState;
       LOG(INFO) << "valueState:" << valueState;
diff --git a/modules/llm-cache/ds/kv_state_cache_block.h b/modules/llm-cache/ds/kv_state_cache_block.h
index 5e0a7262..ac9522ad 100644
--- a/modules/llm-cache/ds/kv_state_cache_block.h
+++ b/modules/llm-cache/ds/kv_state_cache_block.h
@@ -37,12 +37,6 @@ struct State {
 using K_STATE = State;
 using V_STATE = State;
 
-using KV_STATE_WITH_LAYER = std::map<int, std::pair<K_STATE, V_STATE>>;
-using LIST_KV_STATE_WITH_LAYER =
-    std::vector<std::map<int, std::pair<K_STATE, V_STATE>>>;
-using KV_STATE = std::vector<std::pair<K_STATE, V_STATE>>;
-using LIST_KV_STATE = std::vector<std::pair<K_STATE, V_STATE>>;
-
 // Set the bit to 1, which means the resource is not being used
 #define FREE_BIT_RESOURCE(value, bit) ((value) |= (((uint64_t) 1) << (bit)))
 
@@ -72,14 +66,14 @@ namespace vineyard {
 
 class KVStateCacheBlock : public vineyard::Registered<KVStateCacheBlock> {
  private:
-  std::vector<std::shared_ptr<Tensor<double>>> keyStateTensorList;
-  std::vector<std::shared_ptr<Tensor<double>>> valueStateTensorList;
+  std::vector<std::shared_ptr<Tensor<uint8_t>>> keyStateTensorList;
+  std::vector<std::shared_ptr<Tensor<uint8_t>>> valueStateTensorList;
   uint64_t* bitmap;
   int blockSize;
   int bitmapSize;
   ObjectID id;
   int layer;
-  int dimension;
+  int tensorBytes;
 
  public:
   static std::unique_ptr<Object> Create() __attribute__((used)) {
@@ -91,25 +85,25 @@ class KVStateCacheBlock : public vineyard::Registered<KVStateCacheBlock> {
 
   std::string GetBitmapStr();
 
-  uint64_t GetDimension() { return this->dimension; }
+  uint64_t GetTensorBytes() { return this->tensorBytes; }
 
   uint64_t* GetBitmap() { return this->bitmap; }
 
   int GetBlockSize() { return this->blockSize; }
 
-  std::shared_ptr<const Tensor<double>> GetKeyTensor(int layer) {
+  std::shared_ptr<const Tensor<uint8_t>> GetKeyTensor(int layer) {
     return this->keyStateTensorList[layer];
   }
 
-  std::shared_ptr<const Tensor<double>> GetValueTensor(int layer) {
+  std::shared_ptr<const Tensor<uint8_t>> GetValueTensor(int layer) {
     return this->valueStateTensorList[layer];
   }
 
-  std::vector<std::shared_ptr<Tensor<double>>> GetKeyTensorList() {
+  std::vector<std::shared_ptr<Tensor<uint8_t>>> GetKeyTensorList() {
     return this->keyStateTensorList;
   }
 
-  std::vector<std::shared_ptr<Tensor<double>>> GetValueTensorList() {
+  std::vector<std::shared_ptr<Tensor<uint8_t>>> GetValueTensorList() {
     return this->valueStateTensorList;
   }
 
@@ -120,21 +114,22 @@ class KVStateCacheBlock : public vineyard::Registered<KVStateCacheBlock> {
 
 class KVStateCacheBlockBuilder : public ObjectBuilder {
  private:
-  std::vector<std::shared_ptr<TensorBuilder<double>>> keyStateTensorBuilderList;
-  std::vector<std::shared_ptr<TensorBuilder<double>>>
+  std::vector<std::shared_ptr<TensorBuilder<uint8_t>>>
+      keyStateTensorBuilderList;
+  std::vector<std::shared_ptr<TensorBuilder<uint8_t>>>
       valueStateTensorBuilderList;
   // TBD
   // support more than 64 kv-state cache slots
   uint64_t* bitmap;
   int blockSize;
   int bitmapSize;
-  int dimension;
+  int tensorBytes;
   int layer;
 
   int FindEmptySlot();
 
  public:
-  KVStateCacheBlockBuilder(Client& client, int dimension, int layer,
+  KVStateCacheBlockBuilder(Client& client, int tensorBytes, int layer,
                            int blockSize);
 
   KVStateCacheBlockBuilder(
@@ -147,9 +142,10 @@ class KVStateCacheBlockBuilder : public ObjectBuilder {
    * @param kv_state The kv-state of the prompt. A LLM inference can contain
    * multiple kv-states for each layer.
    */
-  void Update(const KV_STATE_WITH_LAYER& kv_state, OffsetData* data);
+  void Update(const std::map<int, std::pair<K_STATE, V_STATE>>& kv_state,
+              OffsetData* data);
 
-  void Update(double* keyState, double* valueState, uint64_t dataLength,
+  void Update(char* keyState, char* valueState, uint64_t dataLength,
               OffsetData* data);
 
   /**
@@ -160,7 +156,8 @@ class KVStateCacheBlockBuilder : public ObjectBuilder {
    * @param kv_state The kv-state of the prompt returned by radix-tree. If the
    * kv-state is not found, the data of kv-state is invalid.
    */
-  int Query(Client& client, int index, KV_STATE_WITH_LAYER& kv_state);
+  int Query(Client& client, int index,
+            std::map<int, std::pair<K_STATE, V_STATE>>& kv_state);
 
   bool IsFull();
 
@@ -170,20 +167,21 @@ class KVStateCacheBlockBuilder : public ObjectBuilder {
 
   int16_t Split(KVStateCacheBlockBuilder* child, int index);
 
-  const std::shared_ptr<TensorBuilder<double>> GetKeyStateBuilder(int layer) {
+  const std::shared_ptr<TensorBuilder<uint8_t>> GetKeyStateBuilder(int layer) {
     return keyStateTensorBuilderList[layer];
   }
 
-  const std::shared_ptr<TensorBuilder<double>> GetValueStateBuilder(int layer) {
+  const std::shared_ptr<TensorBuilder<uint8_t>> GetValueStateBuilder(
+      int layer) {
     return valueStateTensorBuilderList[layer];
   }
 
-  const std::vector<std::shared_ptr<TensorBuilder<double>>>
+  const std::vector<std::shared_ptr<TensorBuilder<uint8_t>>>
   GetKeyStateBuilderList() {
     return keyStateTensorBuilderList;
   }
 
-  const std::vector<std::shared_ptr<TensorBuilder<double>>>
+  const std::vector<std::shared_ptr<TensorBuilder<uint8_t>>>
   GetValueStateBuilderList() {
     return valueStateTensorBuilderList;
   }
@@ -196,7 +194,7 @@ class KVStateCacheBlockBuilder : public ObjectBuilder {
 
   uint64_t* GetBitmap() { return this->bitmap; }
 
-  uint64_t GetDimension() { return this->dimension; }
+  uint64_t GetTensorBytes() { return this->tensorBytes; }
 
   int GetBlockSize() { return this->blockSize; }
 
diff --git a/modules/llm-cache/ds/kv_state_cache_manager.cc b/modules/llm-cache/ds/kv_state_cache_manager.cc
index b13770df..c6db447d 100644
--- a/modules/llm-cache/ds/kv_state_cache_manager.cc
+++ b/modules/llm-cache/ds/kv_state_cache_manager.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace vineyard {
 
-KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity,
+KVStateCacheManager::KVStateCacheManager(int tensorBytes, int cacheCapacity,
                                          int layer, int blockSize,
                                          int syncInterval, std::string socket) {
   this->syncInterval = syncInterval;
@@ -61,7 +61,7 @@ KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity,
     // if failed, create a new cache object
     VLOG(100) << "failed to get the cache object, create a new one.";
     kvStateCacheBuilder = std::make_shared<KVStateCacheBuilder>(
-        client, dimension, cacheCapacity, layer, blockSize);
+        client, tensorBytes, cacheCapacity, layer, blockSize);
   }
 
   // release the lock
@@ -75,21 +75,21 @@ KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity,
   // use lease to prevent the deadlock if the client is down
 }
 
-void KVStateCacheManager::UpdateInternal(const std::vector<int>& tokenList,
-                                         int nextToken,
-                                         const KV_STATE_WITH_LAYER& kvState) {
+void KVStateCacheManager::UpdateInternal(
+    const std::vector<int>& tokenList, int nextToken,
+    const std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   kvStateCacheBuilder->Update(client, tokenList, nextToken, kvState);
 }
 
-int KVStateCacheManager::QueryInternal(const std::vector<int>& tokenList,
-                                       int token,
-                                       KV_STATE_WITH_LAYER& kvState) {
+int KVStateCacheManager::QueryInternal(
+    const std::vector<int>& tokenList, int token,
+    std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   return kvStateCacheBuilder->Query(client, tokenList, token, kvState);
 }
 
-void KVStateCacheManager::Update(const std::vector<int>& tokenList,
-                                 int nextToken,
-                                 const KV_STATE_WITH_LAYER& kvState) {
+void KVStateCacheManager::Update(
+    const std::vector<int>& tokenList, int nextToken,
+    const std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   if (!syncMutex.try_lock()) {
     return;
   }
@@ -99,8 +99,9 @@ void KVStateCacheManager::Update(const std::vector<int>& tokenList,
   syncMutex.unlock();
 }
 
-void KVStateCacheManager::Update(const std::vector<int>& tokenList,
-                                 const LIST_KV_STATE_WITH_LAYER& kvState) {
+void KVStateCacheManager::Update(
+    const std::vector<int>& tokenList,
+    const std::vector<std::map<int, std::pair<K_STATE, V_STATE>>>& kvState) {
   if (!syncMutex.try_lock()) {
     return;
   }
@@ -114,8 +115,9 @@ void KVStateCacheManager::Update(const std::vector<int>& tokenList,
   syncMutex.unlock();
 }
 
-int KVStateCacheManager::Query(const std::vector<int>& tokenList, int token,
-                               KV_STATE_WITH_LAYER& kvState) {
+int KVStateCacheManager::Query(
+    const std::vector<int>& tokenList, int token,
+    std::map<int, std::pair<K_STATE, V_STATE>>& kvState) {
   int result = -1;
 
   if (!syncMutex.try_lock()) {
@@ -128,8 +130,9 @@ int KVStateCacheManager::Query(const std::vector<int>& tokenList, int token,
   return result;
 }
 
-int KVStateCacheManager::Query(const std::vector<int>& tokenList,
-                               LIST_KV_STATE_WITH_LAYER& listKVState) {
+int KVStateCacheManager::Query(
+    const std::vector<int>& tokenList,
+    std::vector<std::map<int, std::pair<K_STATE, V_STATE>>>& listKVState) {
   int result = -1;
   if (!syncMutex.try_lock()) {
     return result;
@@ -138,6 +141,10 @@ int KVStateCacheManager::Query(const std::vector<int>& tokenList,
   std::vector<int> tokenListCopy;
   for (size_t i = 0; i < tokenList.size(); i++) {
     result = QueryInternal(tokenListCopy, tokenList[i], listKVState[i]);
+    // if the result is -1, it means the token is not in the cache
+    if (result == -1) {
+      break;
+    }
     tokenListCopy.push_back(tokenList[i]);
   }
 
diff --git a/modules/llm-cache/ds/kv_state_cache_manager.h b/modules/llm-cache/ds/kv_state_cache_manager.h
index 408cac8a..312ebb6e 100644
--- a/modules/llm-cache/ds/kv_state_cache_manager.h
+++ b/modules/llm-cache/ds/kv_state_cache_manager.h
@@ -14,10 +14,12 @@ limitations under the License.
 */
 
 #include <condition_variable>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <vector>
+#include <utility>
 
 #include "llm-cache/ds/kv_state_cache.h"
 
@@ -41,30 +43,33 @@ class KVStateCacheManager {
 
  public:
   KVStateCacheManager(
-      int dimension = 10, int cacheCapacity = 10, int layer = 1,
+      int tensorBytes = 80, int cacheCapacity = 10, int layer = 1,
       int blockSize = 5, int syncInterval = 3,
       std::string socket = std::string(getenv("VINEYARD_IPC_SOCKET")));
 
   void Update(const std::vector<int>& tokenList, int nextToken,
-              const KV_STATE_WITH_LAYER& kvState);
+              const std::map<int, std::pair<K_STATE, V_STATE>>& kvState);
 
-  void Update(const std::vector<int>& tokenList,
-              const LIST_KV_STATE_WITH_LAYER& kvState);
+  void Update(
+      const std::vector<int>& tokenList,
+      const std::vector<std::map<int, std::pair<K_STATE, V_STATE>>>& kvState);
 
   int Query(const std::vector<int>& tokenList, int token,
-            KV_STATE_WITH_LAYER& kvState);
+            std::map<int, std::pair<K_STATE, V_STATE>>& kvState);
 
-  int Query(const std::vector<int>& tokenList,
-            LIST_KV_STATE_WITH_LAYER& listKVState);
+  int Query(
+      const std::vector<int>& tokenList,
+      std::vector<std::map<int, std::pair<K_STATE, V_STATE>>>& listKVState);
 
   ~KVStateCacheManager();
 
  private:
-  void UpdateInternal(const std::vector<int>& tokenList, int nextToken,
-                      const KV_STATE_WITH_LAYER& kvState);
+  void UpdateInternal(
+      const std::vector<int>& tokenList, int nextToken,
+      const std::map<int, std::pair<K_STATE, V_STATE>>& kvState);
 
   int QueryInternal(const std::vector<int>& tokenList, int token,
-                    KV_STATE_WITH_LAYER& kvState);
+                    std::map<int, std::pair<K_STATE, V_STATE>>& kvState);
 
   void Delete(std::vector<int> token);
 
diff --git a/modules/llm-cache/radix-tree/radix-tree.cc b/modules/llm-cache/radix-tree/radix-tree.cc
index 93e37a79..89478e67 100644
--- a/modules/llm-cache/radix-tree/radix-tree.cc
+++ b/modules/llm-cache/radix-tree/radix-tree.cc
@@ -101,7 +101,7 @@ std::shared_ptr<NodeData> RadixTree::InsertInternal(
       this->tree, insertTokensArray, insertTokensArrayLen, dummyData,
       reinterpret_cast<void**>(&dataNode), reinterpret_cast<void**>(&oldData));
   if (dataNode == NULL) {
-    throw std::runtime_error("Insert token list failed");
+    LOG(ERROR) << "Insert token list failed";
     return NULL;
   }
   if (retval == 1) {
diff --git a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc b/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc
index feb1166b..7cc9f9c5 100644
--- a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc
+++ b/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc
@@ -27,7 +27,7 @@ limitations under the License.
 
 using namespace vineyard;  //  NOLINT(build/namespaces)
 
-#define DIMENSION 100
+#define TENSORBYTES 800
 #define CAPACITY 1000
 #define LAYER 64
 #define BLOCK_SIZE 100
@@ -36,7 +36,7 @@ KVStateCacheManager* manager;
 
 void init() {
   manager =
-      new KVStateCacheManager(DIMENSION, CAPACITY, LAYER, DEFAULT_BLOCK_SIZE);
+      new KVStateCacheManager(TENSORBYTES, CAPACITY, LAYER, DEFAULT_BLOCK_SIZE);
 }
 
 std::vector<int> generate_random_tokens(size_t max_length) {
@@ -57,10 +57,10 @@ std::map<int, std::pair<K_STATE, V_STATE>> generate_kv_state(int token) {
   for (int currentLayer = 0; currentLayer < LAYER; currentLayer++) {
     K_STATE key_state;
     V_STATE value_state;
-    key_state.data = malloc(DIMENSION * sizeof(double));
-    key_state.length = DIMENSION * sizeof(double);
-    value_state.data = malloc(DIMENSION * sizeof(double));
-    value_state.length = DIMENSION * sizeof(double);
+    key_state.data = malloc(TENSORBYTES);
+    key_state.length = TENSORBYTES;
+    value_state.data = malloc(TENSORBYTES);
+    value_state.length = TENSORBYTES;
 
     kv_state.insert(
         std::make_pair(currentLayer, std::make_pair(key_state, value_state)));
diff --git a/modules/llm-cache/tests/kv_state_cache_test.cc b/modules/llm-cache/tests/kv_state_cache_test.cc
index e2d1e98a..6a13e3e9 100644
--- a/modules/llm-cache/tests/kv_state_cache_test.cc
+++ b/modules/llm-cache/tests/kv_state_cache_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
 using namespace vineyard;  // NOLINT(build/namespaces)
 
-int dimension = 10;
+int tensorBytes = 80;
 int capacity = 20;
 int layer = 3;
 int block_size = 5;
@@ -42,9 +42,9 @@ std::vector<std::vector<int>> tokens_list;
 
 KVStateCacheManager* kv_state_cache_manager;
 
-void init(int dimension, int capacity, int layer, int block_size,
+void init(int tensorBytes, int capacity, int layer, int block_size,
           std::string socket) {
-  kv_state_cache_manager = new KVStateCacheManager(dimension, capacity, layer,
+  kv_state_cache_manager = new KVStateCacheManager(tensorBytes, capacity, layer,
                                                    block_size, 3, socket);
 }
 
@@ -61,17 +61,16 @@ void print_kv_state(
     const std::map<int, std::pair<K_STATE, V_STATE>>& kv_state) {
   LOG(INFO) << "kv_state: ";
   for (auto iter = kv_state.begin(); iter != kv_state.end(); ++iter) {
+    uint8_t* key_state_data =
+        reinterpret_cast<uint8_t*>(iter->second.first.data);
+    uint8_t* value_state_data =
+        reinterpret_cast<uint8_t*>(iter->second.second.data);
+    // print the first tensorBytes bytes
     std::string key_state_str = "";
     std::string value_state_str = "";
-    for (int i = 0; i < dimension; ++i) {
-      key_state_str +=
-          std::to_string(
-              (reinterpret_cast<double*>(iter->second.first.data))[i]) +
-          " ";
-      value_state_str +=
-          std::to_string(
-              (reinterpret_cast<double*>(iter->second.second.data))[i]) +
-          " ";
+    for (int j = 0; j < tensorBytes; j++) {
+      key_state_str += std::to_string(key_state_data[j]) + " ";
+      value_state_str += std::to_string(value_state_data[j]) + " ";
     }
     LOG(INFO) << "layer " << iter->first << ":";
     LOG(INFO) << "key_state: " << key_state_str;
@@ -81,69 +80,59 @@ void print_kv_state(
 }
 
 // we do not consider the layer.
-std::map<int, std::pair<K_STATE, V_STATE>> generate_kv_state() {
+std::map<int, std::pair<K_STATE, V_STATE>> generate_kv_state(int token) {
   std::map<int, std::pair<K_STATE, V_STATE>> kv_state;
   for (int currentLayer = 0; currentLayer < layer; currentLayer++) {
     K_STATE key_state;
     V_STATE value_state;
-    key_state.data = malloc(dimension * sizeof(double));
-    value_state.data = malloc(dimension * sizeof(double));
+    key_state.data = malloc(tensorBytes);
+    value_state.data = malloc(tensorBytes);
 
-    key_state.length = dimension * sizeof(double);
-    value_state.length = dimension * sizeof(double);
+    key_state.length = tensorBytes;
+    value_state.length = tensorBytes;
 
-    kv_state.insert(
-        std::make_pair(currentLayer, std::make_pair(key_state, value_state)));
-  }
-  return kv_state;
-}
-
-void update_kv_state(std::map<int, std::pair<K_STATE, V_STATE>>& kvState,
-                     int token) {
-  for (int currentLayer = 0; currentLayer < layer; currentLayer++) {
-    K_STATE key_state = kvState[currentLayer].first;
-    V_STATE value_state = kvState[currentLayer].second;
-    for (int i = 0; i < dimension; ++i) {
-      (reinterpret_cast<double*>(key_state.data))[i] =
-          (static_cast<double>(token)) / dimension * (i + 1) +
+    // Scale before dividing so that each token produces distinct bytes.
+    for (int i = 0; i < tensorBytes; ++i) {
+      (reinterpret_cast<uint8_t*>(key_state.data))[i] =
+          static_cast<uint8_t>(token) * (i + 1) / tensorBytes +
           currentLayer * 10;
-      (reinterpret_cast<double*>(value_state.data))[i] =
-          (static_cast<double>(token)) / dimension * (i + 1) * 2 +
+      (reinterpret_cast<uint8_t*>(value_state.data))[i] =
+          static_cast<uint8_t>(token) * (i + 1) / tensorBytes * 2 +
           currentLayer * 10;
     }
+    kv_state.insert(
+        std::make_pair(currentLayer, std::make_pair(key_state, value_state)));
   }
+  return kv_state;
 }
 
 void check_kv_state(const std::map<int, std::pair<K_STATE, V_STATE>>& kv_state,
                     int& token) {
   VINEYARD_ASSERT(kv_state.size() == (size_t) layer);
   for (auto iter = kv_state.begin(); iter != kv_state.end(); ++iter) {
-    VINEYARD_ASSERT(iter->second.first.length ==
-                    (size_t) dimension * sizeof(double));
-    VINEYARD_ASSERT(iter->second.second.length ==
-                    (size_t) dimension * sizeof(double));
-    for (int i = 0; i < dimension; ++i) {
-      if ((reinterpret_cast<double*>(iter->second.first.data))[i] !=
-          (static_cast<double>(token)) / dimension * (i + 1) +
-              iter->first * 10) {
-        LOG(INFO) << "token:" << token << " dimension" << dimension
+    VINEYARD_ASSERT(iter->second.first.length == (size_t) tensorBytes);
+    VINEYARD_ASSERT(iter->second.second.length == (size_t) tensorBytes);
+    for (int i = 0; i < tensorBytes; ++i) {
+      // Expected bytes, truncated to uint8_t the same way they were stored.
+      const int scaled = static_cast<uint8_t>(token) * (i + 1) / tensorBytes;
+      const int key_byte =
+          (reinterpret_cast<uint8_t*>(iter->second.first.data))[i];
+      const int value_byte =
+          (reinterpret_cast<uint8_t*>(iter->second.second.data))[i];
+      const int expected_key = static_cast<uint8_t>(scaled + iter->first * 10);
+      const int expected_value =
+          static_cast<uint8_t>(scaled * 2 + iter->first * 10);
+      if (key_byte != expected_key) {
+        LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes
                   << " layer:" << iter->first;
-        LOG(INFO) << "key_state[" << i << "]: "
-                  << (reinterpret_cast<double*>(iter->second.first.data))[i]
-                  << ". But is should be "
-                  << (static_cast<double>(token)) / dimension * (i + 1) +
-                         iter->first * 10;
+        LOG(INFO) << "key_state[" << i << "]: " << key_byte
+                  << ". But is should be " << expected_key;
         throw std::runtime_error("key_state error!");
       }
-      if ((reinterpret_cast<double*>(iter->second.second.data))[i] !=
-          (static_cast<double>(token)) / dimension * (i + 1) * 2 +
-              iter->first * 10) {
-        LOG(INFO) << "token:" << token << " dimension" << dimension
+      if (value_byte != expected_value) {
+        LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes
                   << " layer:" << iter->first;
-        LOG(INFO) << "value_state[" << i << "]: "
-                  << (reinterpret_cast<double*>(iter->second.second.data))[i]
-                  << ". But is should be "
-                  << (static_cast<double>(token)) / dimension * (i + 1) * 2 +
-                         iter->first * 10;
+        LOG(INFO) << "value_state[" << i << "]: " << value_byte
+                  << ". But is should be " << expected_value;
         throw std::runtime_error("value_state error!");
       }
@@ -154,15 +143,15 @@ void check_kv_state(const std::map<int, std::pair<K_STATE, V_STATE>>& kv_state,
 void inference(std::vector<int> tokens, bool block = false) {
   std::vector<int> inference_tokens;
   std::map<int, std::pair<K_STATE, V_STATE>> kv_state;
-  kv_state = generate_kv_state();
   for (size_t i = 0; i < tokens.size(); ++i) {
+    kv_state.clear();
     int result =
         kv_state_cache_manager->Query(inference_tokens, tokens[i], kv_state);
     if (result != 0) {
       LOG(INFO) << "Can not find the kv_state from cache:";
       print_current_tokens(inference_tokens, tokens[i]);
       LOG(INFO) << "Generate the kv_state and update the cache.";
-      update_kv_state(kv_state, tokens[i]);
+      kv_state = generate_kv_state(tokens[i]);
       print_kv_state(kv_state);
       kv_state_cache_manager->Update(inference_tokens, tokens[i], kv_state);
     } else {
@@ -184,7 +173,7 @@ int main(int argc, char** argv) {
 
   for (int i = 2; i < argc; i++) {
     if (strcmp(argv[i], "-d") == 0) {
-      dimension = atoi(argv[i + 1]);
+      tensorBytes = atoi(argv[i + 1]);
     } else if (strcmp(argv[i], "-c") == 0) {
       capacity = atoi(argv[i + 1]);
     } else if (strcmp(argv[i], "-l") == 0) {
@@ -209,11 +198,11 @@ int main(int argc, char** argv) {
     }
   }
 
-  LOG(INFO) << "Test KVStateCache with dimension: " << dimension
+  LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes
             << ", capacity: " << capacity << ", layer: " << layer
             << ", block_size: " << block_size << ".";
 
-  init(dimension, capacity, layer, block_size, ipc_socket);
+  init(tensorBytes, capacity, layer, block_size, ipc_socket);
 
   for (size_t i = 0; i < tokens_list.size(); i++) {
     inference(tokens_list[i]);