From 1edb37cd14461bd0411fef4b26d54cd8a9fc16ae Mon Sep 17 00:00:00 2001 From: Ye Cao Date: Mon, 4 Mar 2024 17:59:46 +0800 Subject: [PATCH] Improve the query API of llm cache and use vector as payload object. Signed-off-by: Ye Cao --- modules/basic/ds/dataframe.cc | 3 + modules/llm-cache/ds/kv_state_cache.cc | 41 ++++--- modules/llm-cache/ds/kv_state_cache.h | 16 +-- modules/llm-cache/ds/kv_state_cache_block.cc | 111 +++++++++--------- modules/llm-cache/ds/kv_state_cache_block.h | 50 ++++---- .../llm-cache/ds/kv_state_cache_manager.cc | 41 ++++--- modules/llm-cache/ds/kv_state_cache_manager.h | 25 ++-- modules/llm-cache/radix-tree/radix-tree.cc | 2 +- .../tests/kv_state_cache_benchmark_test.cc | 12 +- .../llm-cache/tests/kv_state_cache_test.cc | 95 +++++++-------- 10 files changed, 198 insertions(+), 198 deletions(-) diff --git a/modules/basic/ds/dataframe.cc b/modules/basic/ds/dataframe.cc index ebfdbdbf..baa2cb34 100644 --- a/modules/basic/ds/dataframe.cc +++ b/modules/basic/ds/dataframe.cc @@ -85,6 +85,9 @@ const std::shared_ptr DataFrame::AsBatch(bool copy) const { } else if (auto tensor = std::dynamic_pointer_cast>(df_col)) { num_rows = tensor->shape()[0]; + } else if (auto tensor = + std::dynamic_pointer_cast>(df_col)) { + num_rows = tensor->shape()[0]; } std::vector> buffer{ diff --git a/modules/llm-cache/ds/kv_state_cache.cc b/modules/llm-cache/ds/kv_state_cache.cc index 4c1615f8..37ecc8e1 100644 --- a/modules/llm-cache/ds/kv_state_cache.cc +++ b/modules/llm-cache/ds/kv_state_cache.cc @@ -57,24 +57,24 @@ void KVStateCache::Resolve() { } // 3. construct the member field - this->dimension = this->meta_.GetKeyValue("dimension"); + this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); this->version = this->meta_.GetKeyValue("version"); this->layer = this->meta_.GetKeyValue("layer"); - VLOG(100) << "construct the member field success, with dimension:" - << this->dimension << " version:" << this->version + VLOG(100) << "construct the member field success, with tensorBytes:" + << this->tensorBytes << " version:" << this->version << " layer:" << this->layer; } KVStateCache::~KVStateCache() {} -KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int dimension, +KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int tensorBytes, int cacheCapacity, int layer, int blockSize) { - this->dimension = dimension; + this->tensorBytes = tensorBytes; this->version = 0; this->layer = layer; KVStateCacheBlockBuilder* builder = - new KVStateCacheBlockBuilder(client, this->dimension, layer, blockSize); + new KVStateCacheBlockBuilder(client, this->tensorBytes, layer, blockSize); this->rootTree = std::make_shared(cacheCapacity); @@ -90,7 +90,7 @@ KVStateCacheBuilder::KVStateCacheBuilder(Client& client, int dimension, KVStateCacheBuilder::KVStateCacheBuilder(Client& client, std::shared_ptr cache) { - this->dimension = cache->GetDimension(); + this->tensorBytes = cache->GetTensorBytes(); this->version = cache->GetVersion(); this->layer = cache->GetLayer(); // 1. create block builder from block @@ -118,7 +118,7 @@ KVStateCacheBlockBuilder* KVStateCacheBuilder::Split( // Split the tree if the list of kvState is full. VINEYARD_ASSERT(nodeDataList.size() > 0); KVStateCacheBlockBuilder* childKVStateCacheBlockBuilder = - new KVStateCacheBlockBuilder(client, this->dimension, this->layer, + new KVStateCacheBlockBuilder(client, this->tensorBytes, this->layer, kvStateCacheBlockBuilder->GetBlockSize()); for (size_t i = 0; i < nodeDataList.size(); i++) { OffsetData* data = @@ -138,10 +138,9 @@ KVStateCacheBlockBuilder* KVStateCacheBuilder::Split( return childKVStateCacheBlockBuilder; } -void KVStateCacheBuilder::Update(Client& client, - const std::vector& tokenList, - int nextToken, - const KV_STATE_WITH_LAYER& kvState) { +void KVStateCacheBuilder::Update( + Client& client, const std::vector& tokenList, int nextToken, + const std::map>& kvState) { std::vector tokenListCopy = tokenList; tokenListCopy.push_back(nextToken); @@ -199,9 +198,9 @@ void KVStateCacheBuilder::Update(Client& client, << " bitmap:" << kvStateCacheBlockBuilder->GetBitmapStr(); } -int KVStateCacheBuilder::Query(Client& client, - const std::vector& tokenList, int token, - KV_STATE_WITH_LAYER& kvState) { +int KVStateCacheBuilder::Query( + Client& client, const std::vector& tokenList, int token, + std::map>& kvState) { std::vector tokenListCopy = tokenList; tokenListCopy.push_back(token); @@ -275,14 +274,14 @@ void KVStateCacheBuilder::Merge(Client& client, for (auto it = insertTokenList.begin(); it != insertTokenList.end(); ++it) { std::vector tokenList = std::vector((*it).begin(), (*it).end() - 1); - KV_STATE_WITH_LAYER kvState; + std::map> kvState; for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { K_STATE key_state; V_STATE value_state; - key_state.data = malloc(this->dimension * sizeof(double)); - key_state.length = this->dimension * sizeof(double); - value_state.data = malloc(this->dimension * sizeof(double)); - value_state.length = this->dimension * sizeof(double); + key_state.data = malloc(this->tensorBytes); + key_state.length = this->tensorBytes; + value_state.data = malloc(this->tensorBytes); + value_state.length = this->tensorBytes; kvState.insert( std::make_pair(currentLayer, std::make_pair(key_state, value_state))); @@ -309,7 +308,7 @@ std::shared_ptr KVStateCacheBuilder::_Seal(Client& client) { std::shared_ptr kvStateCache = std::make_shared(); // 1. store the member variables to cache object meta - kvStateCache->meta_.AddKeyValue("dimension", this->dimension); + kvStateCache->meta_.AddKeyValue("tensorBytes", this->tensorBytes); kvStateCache->meta_.AddKeyValue("version", this->version); kvStateCache->meta_.AddKeyValue("layer", this->layer); diff --git a/modules/llm-cache/ds/kv_state_cache.h b/modules/llm-cache/ds/kv_state_cache.h index 82e6a76c..20e37cd7 100644 --- a/modules/llm-cache/ds/kv_state_cache.h +++ b/modules/llm-cache/ds/kv_state_cache.h @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "client/client.h" #include "common/util/logging.h" @@ -40,7 +41,7 @@ class KVStateCache : public vineyard::Registered { private: std::vector> kvStateCacheBlockList; std::shared_ptr rootTree; - int dimension; + int tensorBytes; int cacheCapacity; int layer; uint64_t version; @@ -60,7 +61,7 @@ class KVStateCache : public vineyard::Registered { return this->kvStateCacheBlockList; } - int GetDimension() { return this->dimension; } + int GetTensorBytes() { return this->tensorBytes; } int GetCacheCapacity() { return this->cacheCapacity; } @@ -77,12 +78,12 @@ class KVStateCache : public vineyard::Registered { class KVStateCacheBuilder : public vineyard::ObjectBuilder { std::shared_ptr rootTree; - int dimension; + int tensorBytes; int layer; uint64_t version; public: - KVStateCacheBuilder(Client& client, int dimension, int cacheCapacity, + KVStateCacheBuilder(Client& client, int tensorBytes, int cacheCapacity, int layer, int blockSize = DEFAULT_BLOCK_SIZE); KVStateCacheBuilder(Client& client, std::shared_ptr cache); @@ -92,10 +93,11 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { std::vector> nodeDataList); void Update(Client& client, const std::vector& token_list, - int next_token, const KV_STATE_WITH_LAYER& kv_state); + int next_token, + const std::map>& kv_state); int Query(Client& client, const std::vector& token_list, int token, - KV_STATE_WITH_LAYER& kv_state); + std::map>& kv_state); void Delete(std::shared_ptr evicted_node); @@ -109,7 +111,7 @@ class KVStateCacheBuilder : public vineyard::ObjectBuilder { std::shared_ptr _Seal(Client& client) override; - uint64_t GetDimension() { return this->dimension; } + uint64_t GetTensorBytes() { return this->tensorBytes; } std::shared_ptr GetRootTree() { return this->rootTree; } diff --git a/modules/llm-cache/ds/kv_state_cache_block.cc b/modules/llm-cache/ds/kv_state_cache_block.cc index 17477143..59eab1de 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.cc +++ b/modules/llm-cache/ds/kv_state_cache_block.cc @@ -60,10 +60,10 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) { this->layer = this->meta_.GetKeyValue("layer"); for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { this->keyStateTensorList.push_back( - std::dynamic_pointer_cast>(this->meta_.GetMember( + std::dynamic_pointer_cast>(this->meta_.GetMember( "keyStateTensorBuilder_" + std::to_string(currentLayer)))); this->valueStateTensorList.push_back( - std::dynamic_pointer_cast>(this->meta_.GetMember( + std::dynamic_pointer_cast>(this->meta_.GetMember( "valueStateTensorBuilder_" + std::to_string(currentLayer)))); } // 2. construct the member field @@ -74,27 +74,27 @@ void KVStateCacheBlock::Construct(const ObjectMeta& meta) { this->bitmap[i] = this->meta_.GetKeyValue("bitmap_" + std::to_string(i)); } - this->dimension = this->meta_.GetKeyValue("dimension"); + this->tensorBytes = this->meta_.GetKeyValue("tensorBytes"); this->blockSize = this->meta_.GetKeyValue("block_size"); } KVStateCacheBlock::~KVStateCacheBlock() { delete this->bitmap; } KVStateCacheBlockBuilder::KVStateCacheBlockBuilder(Client& client, - int dimension, int layer, + int tensorBytes, int layer, int blockSize) { this->blockSize = blockSize; this->bitmapSize = (blockSize + 63) / 64; this->bitmap = new uint64_t[this->bitmapSize]; memset(this->bitmap, UINT8_MAX, this->bitmapSize * sizeof(uint64_t)); - std::vector shape = {(int64_t)(blockSize), dimension}; + std::vector shape = {(int64_t)(blockSize), tensorBytes}; for (int i = 0; i < layer; i++) { this->keyStateTensorBuilderList.push_back( - std::make_shared>(client, shape)); + std::make_shared>(client, shape)); this->valueStateTensorBuilderList.push_back( - std::make_shared>(client, shape)); + std::make_shared>(client, shape)); } - this->dimension = dimension; + this->tensorBytes = tensorBytes; this->layer = layer; } @@ -108,37 +108,38 @@ KVStateCacheBlockBuilder::KVStateCacheBlockBuilder( for (int i = 0; i < this->bitmapSize; i++) { this->bitmap[i] = kvStateCacheBlock->bitmap[i]; } - this->dimension = kvStateCacheBlock->dimension; + this->tensorBytes = kvStateCacheBlock->tensorBytes; this->layer = kvStateCacheBlock->layer; - std::vector shape = {(int64_t)(blockSize), dimension}; + std::vector shape = {(int64_t)(blockSize), tensorBytes}; for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { this->keyStateTensorBuilderList.push_back( - std::make_shared>(client, shape)); + std::make_shared>(client, shape)); this->valueStateTensorBuilderList.push_back( - std::make_shared>(client, shape)); + std::make_shared>(client, shape)); } for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { memcpy(this->keyStateTensorBuilderList[currentLayer]->data(), kvStateCacheBlock->keyStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->dimension * sizeof(double)); + (int64_t)(blockSize) * this->tensorBytes); memcpy(this->valueStateTensorBuilderList[currentLayer]->data(), kvStateCacheBlock->valueStateTensorList[currentLayer]->data(), - (int64_t)(blockSize) * this->dimension * sizeof(double)); + (int64_t)(blockSize) * this->tensorBytes); } } // current we do not consider the layer. -int KVStateCacheBlockBuilder::Query(Client& client, int index, - KV_STATE_WITH_LAYER& kvState) { +int KVStateCacheBlockBuilder::Query( + Client& client, int index, + std::map>& kvState) { for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { - memcpy((kvState.find(currentLayer)->second).first.data, - keyStateTensorBuilderList[currentLayer]->data() + index * dimension, - dimension * sizeof(double)); - memcpy( - (kvState.find(currentLayer)->second).second.data, - valueStateTensorBuilderList[currentLayer]->data() + index * dimension, - dimension * sizeof(double)); + K_STATE keyState = (kvState.find(currentLayer)->second).first; + V_STATE valueState = (kvState.find(currentLayer)->second).second; + keyState.data = keyStateTensorBuilderList[currentLayer]->data() + index; + keyState.length = tensorBytes; + valueState.data = valueStateTensorBuilderList[currentLayer]->data() + index; + valueState.length = tensorBytes; + kvState.emplace(currentLayer, std::make_pair(keyState, valueState)); } return 0; } @@ -164,23 +165,20 @@ bool KVStateCacheBlockBuilder::IsFull() { return true; } -void KVStateCacheBlockBuilder::Update(const KV_STATE_WITH_LAYER& kvState, - OffsetData* data) { +void KVStateCacheBlockBuilder::Update( + const std::map>& kvState, + OffsetData* data) { int index = this->FindEmptySlot(); for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { K_STATE keyState = (kvState.find(currentLayer)->second).first; V_STATE valueState = (kvState.find(currentLayer)->second).second; - VINEYARD_ASSERT(keyState.length == - (size_t) this->dimension * sizeof(double)); - VINEYARD_ASSERT(valueState.length == - (size_t) this->dimension * sizeof(double)); - - double* keyData = keyStateTensorBuilderList[currentLayer]->data(); - double* valueData = valueStateTensorBuilderList[currentLayer]->data(); - memcpy(keyData + index * this->dimension, keyState.data, - this->dimension * sizeof(double)); - memcpy(valueData + index * this->dimension, valueState.data, - this->dimension * sizeof(double)); + VINEYARD_ASSERT(keyState.length == (size_t) this->tensorBytes); + VINEYARD_ASSERT(valueState.length == (size_t) this->tensorBytes); + + uint8_t* keyData = keyStateTensorBuilderList[currentLayer]->data(); + uint8_t* valueData = valueStateTensorBuilderList[currentLayer]->data(); + memcpy(keyData + index, keyState.data, this->tensorBytes); + memcpy(valueData + index, valueState.data, this->tensorBytes); } data->offset = index; @@ -193,25 +191,23 @@ int16_t KVStateCacheBlockBuilder::Split(KVStateCacheBlockBuilder* child, VINEYARD_ASSERT(this->layer == child->layer); int childIndex = child->FindEmptySlot(); for (int currentLayer = 0; currentLayer < this->layer; currentLayer++) { - std::shared_ptr> keyStateTensorBuilder = + std::shared_ptr> keyStateTensorBuilder = keyStateTensorBuilderList[currentLayer]; - std::shared_ptr> valueStateTensorBuilder = + std::shared_ptr> valueStateTensorBuilder = valueStateTensorBuilderList[currentLayer]; - std::shared_ptr> childKeyStateTensorBuilder = + std::shared_ptr> childKeyStateTensorBuilder = child->keyStateTensorBuilderList[currentLayer]; - std::shared_ptr> childValueStateTensorBuilder = + std::shared_ptr> childValueStateTensorBuilder = child->valueStateTensorBuilderList[currentLayer]; - double* keyState = keyStateTensorBuilder->data() + index * this->dimension; - double* valueState = - valueStateTensorBuilder->data() + index * this->dimension; - double* childKeyState = - childKeyStateTensorBuilder->data() + childIndex * this->dimension; - double* childValueState = - childValueStateTensorBuilder->data() + childIndex * this->dimension; + uint8_t* keyState = keyStateTensorBuilder->data() + index; + uint8_t* valueState = valueStateTensorBuilder->data() + index; + uint8_t* childKeyState = childKeyStateTensorBuilder->data() + childIndex; + uint8_t* childValueState = + childValueStateTensorBuilder->data() + childIndex; - memcpy(childKeyState, keyState, this->dimension * sizeof(double)); - memcpy(childValueState, valueState, this->dimension * sizeof(double)); + memcpy(childKeyState, keyState, this->tensorBytes); + memcpy(childValueState, valueState, this->tensorBytes); } ACQUIRE_BIT_RESOURCE(child->bitmap[childIndex / 64], childIndex % 64); FREE_BIT_RESOURCE(this->bitmap[index / 64], index % 64); @@ -244,7 +240,7 @@ std::shared_ptr KVStateCacheBlockBuilder::_Seal(Client& client) { } kvStateCacheBlock->meta_.AddKeyValue("block_size", this->blockSize); - kvStateCacheBlock->meta_.AddKeyValue("dimension", this->dimension); + kvStateCacheBlock->meta_.AddKeyValue("tensorBytes", this->tensorBytes); kvStateCacheBlock->meta_.AddKeyValue("layer", this->layer); // 3. set the object type to meta kvStateCacheBlock->meta_.SetTypeName(type_name()); @@ -264,15 +260,16 @@ void KVStateCacheBlockBuilder::PrintKVStateCacheBlock() { LOG(INFO) << "layer:" << currentLayer; for (int i = 0; i < this->blockSize; i++) { LOG(INFO) << "index:" << i; + uint8_t* key_state_data = keyStateTensorBuilderList[currentLayer]->data(); + uint8_t* value_state_data = + valueStateTensorBuilderList[currentLayer]->data(); + // print the first tensorBytes bytes std::string keyState = ""; std::string valueState = ""; - for (int j = 0; j < this->dimension; j++) { - keyState += std::to_string((keyStateTensorBuilderList[currentLayer] - ->data())[i * dimension + j]) + - " "; - valueState += std::to_string((valueStateTensorBuilderList[currentLayer] - ->data())[i * dimension + j]) + - " "; + for (int j = 0; j < this->tensorBytes; j++) { + keyState += std::to_string(key_state_data[i * tensorBytes + j]) + " "; + valueState += + std::to_string(value_state_data[i * tensorBytes + j]) + " "; } LOG(INFO) << "keyState:" << keyState; LOG(INFO) << "valueState:" << valueState; diff --git a/modules/llm-cache/ds/kv_state_cache_block.h b/modules/llm-cache/ds/kv_state_cache_block.h index 5e0a7262..ac9522ad 100644 --- a/modules/llm-cache/ds/kv_state_cache_block.h +++ b/modules/llm-cache/ds/kv_state_cache_block.h @@ -37,12 +37,6 @@ struct State { using K_STATE = State; using V_STATE = State; -using KV_STATE_WITH_LAYER = std::map>; -using LIST_KV_STATE_WITH_LAYER = - std::vector>>; -using KV_STATE = std::vector>; -using LIST_KV_STATE = std::vector>; - // Set the bit to 1, which means the resource is not being used #define FREE_BIT_RESOURCE(value, bit) ((value) |= (((uint64_t) 1) << (bit))) @@ -72,14 +66,14 @@ namespace vineyard { class KVStateCacheBlock : public vineyard::Registered { private: - std::vector>> keyStateTensorList; - std::vector>> valueStateTensorList; + std::vector>> keyStateTensorList; + std::vector>> valueStateTensorList; uint64_t* bitmap; int blockSize; int bitmapSize; ObjectID id; int layer; - int dimension; + int tensorBytes; public: static std::unique_ptr Create() __attribute__((used)) { @@ -91,25 +85,25 @@ class KVStateCacheBlock : public vineyard::Registered { std::string GetBitmapStr(); - uint64_t GetDimension() { return this->dimension; } + uint64_t GetTensorBytes() { return this->tensorBytes; } uint64_t* GetBitmap() { return this->bitmap; } int GetBlockSize() { return this->blockSize; } - std::shared_ptr> GetKeyTensor(int layer) { + std::shared_ptr> GetKeyTensor(int layer) { return this->keyStateTensorList[layer]; } - std::shared_ptr> GetValueTensor(int layer) { + std::shared_ptr> GetValueTensor(int layer) { return this->valueStateTensorList[layer]; } - std::vector>> GetKeyTensorList() { + std::vector>> GetKeyTensorList() { return this->keyStateTensorList; } - std::vector>> GetValueTensorList() { + std::vector>> GetValueTensorList() { return this->valueStateTensorList; } @@ -120,21 +114,22 @@ class KVStateCacheBlock : public vineyard::Registered { class KVStateCacheBlockBuilder : public ObjectBuilder { private: - std::vector>> keyStateTensorBuilderList; - std::vector>> + std::vector>> + keyStateTensorBuilderList; + std::vector>> valueStateTensorBuilderList; // TBD // support more than 64 kv-state cache slots uint64_t* bitmap; int blockSize; int bitmapSize; - int dimension; + int tensorBytes; int layer; int FindEmptySlot(); public: - KVStateCacheBlockBuilder(Client& client, int dimension, int layer, + KVStateCacheBlockBuilder(Client& client, int tensorBytes, int layer, int blockSize); KVStateCacheBlockBuilder( @@ -147,9 +142,10 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { * @param kv_state The kv-state of the prompt. A LLM inference can contain * multiple kv-states for each layer. */ - void Update(const KV_STATE_WITH_LAYER& kv_state, OffsetData* data); + void Update(const std::map>& kv_state, + OffsetData* data); - void Update(double* keyState, double* valueState, uint64_t dataLength, + void Update(char* keyState, char* valueState, uint64_t dataLength, OffsetData* data); /** @@ -160,7 +156,8 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { * @param kv_state The kv-state of the prompt returned by radix-tree. If the * kv-state is not found, the data of kv-state is invalid. */ - int Query(Client& client, int index, KV_STATE_WITH_LAYER& kv_state); + int Query(Client& client, int index, + std::map>& kv_state); bool IsFull(); @@ -170,20 +167,21 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { int16_t Split(KVStateCacheBlockBuilder* child, int index); - const std::shared_ptr> GetKeyStateBuilder(int layer) { + const std::shared_ptr> GetKeyStateBuilder(int layer) { return keyStateTensorBuilderList[layer]; } - const std::shared_ptr> GetValueStateBuilder(int layer) { + const std::shared_ptr> GetValueStateBuilder( + int layer) { return valueStateTensorBuilderList[layer]; } - const std::vector>> + const std::vector>> GetKeyStateBuilderList() { return keyStateTensorBuilderList; } - const std::vector>> + const std::vector>> GetValueStateBuilderList() { return valueStateTensorBuilderList; } @@ -196,7 +194,7 @@ class KVStateCacheBlockBuilder : public ObjectBuilder { uint64_t* GetBitmap() { return this->bitmap; } - uint64_t GetDimension() { return this->dimension; } + uint64_t GetTensorBytes() { return this->tensorBytes; } int GetBlockSize() { return this->blockSize; } diff --git a/modules/llm-cache/ds/kv_state_cache_manager.cc b/modules/llm-cache/ds/kv_state_cache_manager.cc index b13770df..c6db447d 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.cc +++ b/modules/llm-cache/ds/kv_state_cache_manager.cc @@ -25,7 +25,7 @@ limitations under the License. namespace vineyard { -KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity, +KVStateCacheManager::KVStateCacheManager(int tensorBytes, int cacheCapacity, int layer, int blockSize, int syncInterval, std::string socket) { this->syncInterval = syncInterval; @@ -61,7 +61,7 @@ KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity, // if failed, create a new cache object VLOG(100) << "failed to get the cache object, create a new one."; kvStateCacheBuilder = std::make_shared( - client, dimension, cacheCapacity, layer, blockSize); + client, tensorBytes, cacheCapacity, layer, blockSize); } // release the lock @@ -75,21 +75,21 @@ KVStateCacheManager::KVStateCacheManager(int dimension, int cacheCapacity, // use lease to prevent the deadlock if the client is down } -void KVStateCacheManager::UpdateInternal(const std::vector& tokenList, - int nextToken, - const KV_STATE_WITH_LAYER& kvState) { +void KVStateCacheManager::UpdateInternal( + const std::vector& tokenList, int nextToken, + const std::map>& kvState) { kvStateCacheBuilder->Update(client, tokenList, nextToken, kvState); } -int KVStateCacheManager::QueryInternal(const std::vector& tokenList, - int token, - KV_STATE_WITH_LAYER& kvState) { +int KVStateCacheManager::QueryInternal( + const std::vector& tokenList, int token, + std::map>& kvState) { return kvStateCacheBuilder->Query(client, tokenList, token, kvState); } -void KVStateCacheManager::Update(const std::vector& tokenList, - int nextToken, - const KV_STATE_WITH_LAYER& kvState) { +void KVStateCacheManager::Update( + const std::vector& tokenList, int nextToken, + const std::map>& kvState) { if (!syncMutex.try_lock()) { return; } @@ -99,8 +99,9 @@ void KVStateCacheManager::Update(const std::vector& tokenList, syncMutex.unlock(); } -void KVStateCacheManager::Update(const std::vector& tokenList, - const LIST_KV_STATE_WITH_LAYER& kvState) { +void KVStateCacheManager::Update( + const std::vector& tokenList, + const std::vector>>& kvState) { if (!syncMutex.try_lock()) { return; } @@ -114,8 +115,9 @@ void KVStateCacheManager::Update(const std::vector& tokenList, syncMutex.unlock(); } -int KVStateCacheManager::Query(const std::vector& tokenList, int token, - KV_STATE_WITH_LAYER& kvState) { +int KVStateCacheManager::Query( + const std::vector& tokenList, int token, + std::map>& kvState) { int result = -1; if (!syncMutex.try_lock()) { @@ -128,8 +130,9 @@ int KVStateCacheManager::Query(const std::vector& tokenList, int token, return result; } -int KVStateCacheManager::Query(const std::vector& tokenList, - LIST_KV_STATE_WITH_LAYER& listKVState) { +int KVStateCacheManager::Query( + const std::vector& tokenList, + std::vector>>& listKVState) { int result = -1; if (!syncMutex.try_lock()) { return result; @@ -138,6 +141,10 @@ int KVStateCacheManager::Query(const std::vector& tokenList, std::vector tokenListCopy; for (size_t i = 0; i < tokenList.size(); i++) { result = QueryInternal(tokenListCopy, tokenList[i], listKVState[i]); + // if the result is -1, it means the token is not in the cache + if (result == -1) { + break; + } tokenListCopy.push_back(tokenList[i]); } diff --git a/modules/llm-cache/ds/kv_state_cache_manager.h b/modules/llm-cache/ds/kv_state_cache_manager.h index 408cac8a..312ebb6e 100644 --- a/modules/llm-cache/ds/kv_state_cache_manager.h +++ b/modules/llm-cache/ds/kv_state_cache_manager.h @@ -14,10 +14,12 @@ limitations under the License. */ #include +#include #include #include #include #include +#include #include "llm-cache/ds/kv_state_cache.h" @@ -41,30 +43,33 @@ class KVStateCacheManager { public: KVStateCacheManager( - int dimension = 10, int cacheCapacity = 10, int layer = 1, + int tensorBytes = 80, int cacheCapacity = 10, int layer = 1, int blockSize = 5, int syncInterval = 3, std::string socket = std::string(getenv("VINEYARD_IPC_SOCKET"))); void Update(const std::vector& tokenList, int nextToken, - const KV_STATE_WITH_LAYER& kvState); + const std::map>& kvState); - void Update(const std::vector& tokenList, - const LIST_KV_STATE_WITH_LAYER& kvState); + void Update( + const std::vector& tokenList, + const std::vector>>& kvState); int Query(const std::vector& tokenList, int token, - KV_STATE_WITH_LAYER& kvState); + std::map>& kvState); - int Query(const std::vector& tokenList, - LIST_KV_STATE_WITH_LAYER& listKVState); + int Query( + const std::vector& tokenList, + std::vector>>& listKVState); ~KVStateCacheManager(); private: - void UpdateInternal(const std::vector& tokenList, int nextToken, - const KV_STATE_WITH_LAYER& kvState); + void UpdateInternal( + const std::vector& tokenList, int nextToken, + const std::map>& kvState); int QueryInternal(const std::vector& tokenList, int token, - KV_STATE_WITH_LAYER& kvState); + std::map>& kvState); void Delete(std::vector token); diff --git a/modules/llm-cache/radix-tree/radix-tree.cc b/modules/llm-cache/radix-tree/radix-tree.cc index 93e37a79..89478e67 100644 --- a/modules/llm-cache/radix-tree/radix-tree.cc +++ b/modules/llm-cache/radix-tree/radix-tree.cc @@ -101,7 +101,7 @@ std::shared_ptr RadixTree::InsertInternal( this->tree, insertTokensArray, insertTokensArrayLen, dummyData, reinterpret_cast(&dataNode), reinterpret_cast(&oldData)); if (dataNode == NULL) { - throw std::runtime_error("Insert token list failed"); + LOG(ERROR) << "Insert token list failed"; return NULL; } if (retval == 1) { diff --git a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc b/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc index feb1166b..7cc9f9c5 100644 --- a/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc +++ b/modules/llm-cache/tests/kv_state_cache_benchmark_test.cc @@ -27,7 +27,7 @@ limitations under the License. using namespace vineyard; // NOLINT(build/namespaces) -#define DIMENSION 100 +#define TENSORBYTES 800 #define CAPACITY 1000 #define LAYER 64 #define BLOCK_SIZE 100 @@ -36,7 +36,7 @@ KVStateCacheManager* manager; void init() { manager = - new KVStateCacheManager(DIMENSION, CAPACITY, LAYER, DEFAULT_BLOCK_SIZE); + new KVStateCacheManager(TENSORBYTES, CAPACITY, LAYER, DEFAULT_BLOCK_SIZE); } std::vector generate_random_tokens(size_t max_length) { @@ -57,10 +57,10 @@ std::map> generate_kv_state(int token) { for (int currentLayer = 0; currentLayer < LAYER; currentLayer++) { K_STATE key_state; V_STATE value_state; - key_state.data = malloc(DIMENSION * sizeof(double)); - key_state.length = DIMENSION * sizeof(double); - value_state.data = malloc(DIMENSION * sizeof(double)); - value_state.length = DIMENSION * sizeof(double); + key_state.data = malloc(TENSORBYTES); + key_state.length = TENSORBYTES; + value_state.data = malloc(TENSORBYTES); + value_state.length = TENSORBYTES; kv_state.insert( std::make_pair(currentLayer, std::make_pair(key_state, value_state))); diff --git a/modules/llm-cache/tests/kv_state_cache_test.cc b/modules/llm-cache/tests/kv_state_cache_test.cc index e2d1e98a..6a13e3e9 100644 --- a/modules/llm-cache/tests/kv_state_cache_test.cc +++ b/modules/llm-cache/tests/kv_state_cache_test.cc @@ -24,7 +24,7 @@ limitations under the License. using namespace vineyard; // NOLINT(build/namespaces) -int dimension = 10; +int tensorBytes = 80; int capacity = 20; int layer = 3; int block_size = 5; @@ -42,9 +42,9 @@ std::vector> tokens_list; KVStateCacheManager* kv_state_cache_manager; -void init(int dimension, int capacity, int layer, int block_size, +void init(int tensorBytes, int capacity, int layer, int block_size, std::string socket) { - kv_state_cache_manager = new KVStateCacheManager(dimension, capacity, layer, + kv_state_cache_manager = new KVStateCacheManager(tensorBytes, capacity, layer, block_size, 3, socket); } @@ -61,17 +61,16 @@ void print_kv_state( const std::map>& kv_state) { LOG(INFO) << "kv_state: "; for (auto iter = kv_state.begin(); iter != kv_state.end(); ++iter) { + uint8_t* key_state_data = + reinterpret_cast(iter->second.first.data); + uint8_t* value_state_data = + reinterpret_cast(iter->second.second.data); + // print the first tensorBytes bytes std::string key_state_str = ""; std::string value_state_str = ""; - for (int i = 0; i < dimension; ++i) { - key_state_str += - std::to_string( - (reinterpret_cast(iter->second.first.data))[i]) + - " "; - value_state_str += - std::to_string( - (reinterpret_cast(iter->second.second.data))[i]) + - " "; + for (int j = 0; j < tensorBytes; j++) { + key_state_str += std::to_string(key_state_data[j]) + " "; + value_state_str += std::to_string(value_state_data[j]) + " "; } LOG(INFO) << "layer " << iter->first << ":"; LOG(INFO) << "key_state: " << key_state_str; @@ -81,69 +80,59 @@ void print_kv_state( } // we do not consider the layer. -std::map> generate_kv_state() { +std::map> generate_kv_state(int token) { std::map> kv_state; for (int currentLayer = 0; currentLayer < layer; currentLayer++) { K_STATE key_state; V_STATE value_state; - key_state.data = malloc(dimension * sizeof(double)); - value_state.data = malloc(dimension * sizeof(double)); + key_state.data = malloc(tensorBytes); + value_state.data = malloc(tensorBytes); - key_state.length = dimension * sizeof(double); - value_state.length = dimension * sizeof(double); + key_state.length = tensorBytes; + value_state.length = tensorBytes; - kv_state.insert( - std::make_pair(currentLayer, std::make_pair(key_state, value_state))); - } - return kv_state; -} - -void update_kv_state(std::map>& kvState, - int token) { - for (int currentLayer = 0; currentLayer < layer; currentLayer++) { - K_STATE key_state = kvState[currentLayer].first; - V_STATE value_state = kvState[currentLayer].second; - for (int i = 0; i < dimension; ++i) { - (reinterpret_cast(key_state.data))[i] = - (static_cast(token)) / dimension * (i + 1) + + for (int i = 0; i < tensorBytes; ++i) { + (reinterpret_cast(key_state.data))[i] = + (static_cast(token)) / tensorBytes * (i + 1) + currentLayer * 10; - (reinterpret_cast(value_state.data))[i] = - (static_cast(token)) / dimension * (i + 1) * 2 + + (reinterpret_cast(value_state.data))[i] = + (static_cast(token)) / tensorBytes * (i + 1) * 2 + currentLayer * 10; } + kv_state.insert( + std::make_pair(currentLayer, std::make_pair(key_state, value_state))); } + return kv_state; } void check_kv_state(const std::map>& kv_state, int& token) { VINEYARD_ASSERT(kv_state.size() == (size_t) layer); for (auto iter = kv_state.begin(); iter != kv_state.end(); ++iter) { - VINEYARD_ASSERT(iter->second.first.length == - (size_t) dimension * sizeof(double)); - VINEYARD_ASSERT(iter->second.second.length == - (size_t) dimension * sizeof(double)); - for (int i = 0; i < dimension; ++i) { - if ((reinterpret_cast(iter->second.first.data))[i] != - (static_cast(token)) / dimension * (i + 1) + + VINEYARD_ASSERT(iter->second.first.length == (size_t) tensorBytes); + VINEYARD_ASSERT(iter->second.second.length == (size_t) tensorBytes); + for (int i = 0; i < tensorBytes; ++i) { + if ((reinterpret_cast(iter->second.first.data))[i] != + (static_cast(token)) / tensorBytes * (i + 1) + iter->first * 10) { - LOG(INFO) << "token:" << token << " dimension" << dimension + LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes << " layer:" << iter->first; LOG(INFO) << "key_state[" << i << "]: " - << (reinterpret_cast(iter->second.first.data))[i] + << (reinterpret_cast(iter->second.first.data))[i] << ". But is should be " - << (static_cast(token)) / dimension * (i + 1) + + << (static_cast(token)) / tensorBytes * (i + 1) + iter->first * 10; throw std::runtime_error("key_state error!"); } - if ((reinterpret_cast(iter->second.second.data))[i] != - (static_cast(token)) / dimension * (i + 1) * 2 + + if (reinterpret_cast(iter->second.second.data)[i] != + (static_cast(token)) / tensorBytes * (i + 1) * 2 + iter->first * 10) { - LOG(INFO) << "token:" << token << " dimension" << dimension + LOG(INFO) << "token:" << token << " tensorBytes" << tensorBytes << " layer:" << iter->first; LOG(INFO) << "value_state[" << i << "]: " - << (reinterpret_cast(iter->second.second.data))[i] + << (reinterpret_cast(iter->second.second.data))[i] << ". But is should be " - << (static_cast(token)) / dimension * (i + 1) * 2 + + << (static_cast(token)) / tensorBytes * (i + 1) * 2 + iter->first * 10; throw std::runtime_error("value_state error!"); } @@ -154,15 +143,15 @@ void check_kv_state(const std::map>& kv_state, void inference(std::vector tokens, bool block = false) { std::vector inference_tokens; std::map> kv_state; - kv_state = generate_kv_state(); for (size_t i = 0; i < tokens.size(); ++i) { + kv_state.clear(); int result = kv_state_cache_manager->Query(inference_tokens, tokens[i], kv_state); if (result != 0) { LOG(INFO) << "Can not find the kv_state from cache:"; print_current_tokens(inference_tokens, tokens[i]); LOG(INFO) << "Generate the kv_state and update the cache."; - update_kv_state(kv_state, tokens[i]); + kv_state = generate_kv_state(tokens[i]); print_kv_state(kv_state); kv_state_cache_manager->Update(inference_tokens, tokens[i], kv_state); } else { @@ -184,7 +173,7 @@ int main(int argc, char** argv) { for (int i = 2; i < argc; i++) { if (strcmp(argv[i], "-d") == 0) { - dimension = atoi(argv[i + 1]); + tensorBytes = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-c") == 0) { capacity = atoi(argv[i + 1]); } else if (strcmp(argv[i], "-l") == 0) { @@ -209,11 +198,11 @@ int main(int argc, char** argv) { } } - LOG(INFO) << "Test KVStateCache with dimension: " << dimension + LOG(INFO) << "Test KVStateCache with tensorBytes: " << tensorBytes << ", capacity: " << capacity << ", layer: " << layer << ", block_size: " << block_size << "."; - init(dimension, capacity, layer, block_size, ipc_socket); + init(tensorBytes, capacity, layer, block_size, ipc_socket); for (size_t i = 0; i < tokens_list.size(); i++) { inference(tokens_list[i]);