diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 62a72b1cbd..9ae7d0c69b 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -36,13 +36,13 @@ std::pair<EncodedResults, int32_t> beam_search(
 class StatefulLLMPipeline final : public LLMPipelineImplBase {
 public:
     ov::InferRequest m_model_runner;
 
-    bool is_chat_conversation = false;
-    bool m_is_cache_empty = true;
+    bool m_trust_encoded_history = true;
     std::optional<int32_t> m_selected_beam = std::nullopt;
     ChatHistory m_history;
     std::string m_templated_chat_history = {};
-    TokenizedInputs m_tokenized_chat_history;
+    std::vector<int64_t> m_tokenized_chat_history;
+    ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
 
     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -94,6 +94,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         OptionalGenerationConfig generation_config,
         StreamerVariant streamer
     ) override {
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;
+
+        if (is_chat_conversation)
+            OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
+                            "Chat doesn't support switching between input types. Please continue using EncodedInputs or restart the chat.");
+
         auto start_time = std::chrono::steady_clock::now();
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
         TokenizedInputs encoded_input;
@@ -119,14 +126,30 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
             // Do not add special tokens in chat scenario to be aligned with HF.
             auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
-            if (m_is_cache_empty) {
-                encoded_input = new_chat_tokens;
-            } else {
-                auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+            auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+
+            // some symbol combinations can be encoded by the tokenizer in different ways
+            // if we encounter a sequence with such a combination of symbols, we cannot correctly subtract the new history from the old one
+            // and find the difference as a prompt, so check for this case and use the whole history instead
+            if (!m_tokenized_chat_history.empty() && m_trust_encoded_history)
+                m_trust_encoded_history = ov::genai::utils::is_tokenized_history_same(prev_chat_tokens.input_ids, m_tokenized_chat_history);
+
+            if (!m_trust_encoded_history) {
+                reset_kv_state();
+                m_selected_beam = std::nullopt;
+            }
+
+            if (!m_tokenized_chat_history.empty() && m_trust_encoded_history) {
                 encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+            } else {
+                encoded_input = new_chat_tokens;
             }
             m_templated_chat_history = new_templated_chat_history;
-            m_tokenized_chat_history = new_chat_tokens;
+            m_tokenized_chat_history.clear();
+            m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
+            std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
+                        std::back_inserter(m_tokenized_chat_history));
+
             // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
         } else {
             encoded_input = m_tokenizer.encode(prompt);
@@ -180,6 +203,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         OptionalGenerationConfig generation_config,
         StreamerVariant streamer
     ) override {
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
+            m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;
+
+        if (is_chat_conversation)
+            // if the chat was run in StringInputs mode but generate() was then called with EncodedInputs, the last m_history entry will have the assistant role
+            OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
+                            "Chat doesn't support switching between input types. Please continue using StringInputs or restart the chat.");
+
         auto start_time = std::chrono::steady_clock::now();
         ov::Tensor input_ids;
         ov::Tensor attention_mask;
@@ -191,6 +222,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             attention_mask = data->attention_mask;
         }
 
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+            std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
+
         GenerationConfig config = (generation_config.has_value()) ?
             *generation_config : m_generation_config;
         // If eos_token_id was not provided, take value from default m_generation_config
@@ -222,24 +256,29 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                         "(input_ids, attention_mask, position_ids, beam_idx) "
                         "but you have '" + std::to_string(num_inputs) + "' inputs");
-
+        ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
         size_t kv_cache_len = 0;
         ov::Tensor concatenated_attention_mask;
 
-        if (is_chat_conversation && !m_is_cache_empty) {
-            OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
-            // If history is saved in KV cache, concatenate new attention_mask with the already existing.
-            // Between subsequent runs attention_mask should not be modified.
-            auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
-            auto prompt_len = attention_mask.get_shape()[1];
-            kv_cache_len = atten_mask_history.get_shape()[1];
-
-            ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
-            auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
-            std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
-                new_atten_mask.data<int64_t>());
-            std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
-                new_atten_mask.data<int64_t>() + kv_cache_len);
-            concatenated_attention_mask = new_atten_mask;
+        if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
+            if (m_trust_encoded_history) {
+                OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
+                // If history is saved in KV cache, concatenate new attention_mask with the already existing.
+                // Between subsequent runs attention_mask should not be modified.
+                auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
+                auto prompt_len = attention_mask.get_shape()[1];
+                kv_cache_len = atten_mask_history.get_shape()[1];
+
+                ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
+                auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
+                std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
+                    new_atten_mask.data<int64_t>());
+                std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
+                    new_atten_mask.data<int64_t>() + kv_cache_len);
+                concatenated_attention_mask = new_atten_mask;
+            } else {
+                attention_mask = ov::genai::utils::init_attention_mask(tokenized_chat_history);
+                concatenated_attention_mask = attention_mask;
+            }
         } else {
             concatenated_attention_mask = attention_mask;
         }
@@ -247,28 +286,36 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         bool position_ids_available = (num_inputs == 4);
         std::optional<ov::Tensor> position_ids = std::nullopt;
         if (position_ids_available) {
-            position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
-            utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
+            if (is_chat_conversation && !m_trust_encoded_history) {
+                position_ids = ov::Tensor{ov::element::i64, tokenized_chat_history.get_shape()};
+            } else {
+                position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+            }
+            utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
         }
 
         if(m_adapter_controller) {
             m_adapter_controller->apply(m_model_runner, config.adapters);
         }
 
+        auto input_tokens = input_ids;
+        if (is_chat_conversation && !m_trust_encoded_history) {
+            input_tokens = tokenized_chat_history;
+            m_trust_encoded_history = true;
+        }
+
         ov::genai::EncodedResults result;
         if (config.is_beam_search() && is_chat_conversation) {
-            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
+            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_tokens, concatenated_attention_mask,
                                                             config, position_ids, m_selected_beam);
         } else {
             std::vector<SequenceGroup::Ptr> requests;
             size_t block_size = 1;
             bool enable_prefix_caching = false;
-
-            config.stop_token_ids.insert(config.eos_token_id);
             for (size_t request_id = 0; request_id < batch_size; request_id++) {
                 SequenceGroup::Ptr sequence_group;
-                if (is_chat_conversation && !m_is_cache_empty) {
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching);
+                if (is_chat_conversation) {
+                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
                 } else {
                     size_t seq_len = input_ids.get_shape().at(1);
                     size_t batch_offset = request_id * seq_len;
@@ -283,16 +330,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             }
 
             Sampler sampler = Sampler(m_tokenizer);
-            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
-                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
+            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_tokens, concatenated_attention_mask, streamer_ptr,
+                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
         }
 
-        if (!is_chat_conversation) {
+        if (is_chat_conversation) {
+            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+        } else {
             reset_kv_state();
             m_selected_beam = std::nullopt;
-        } else {
-            m_is_cache_empty = false;
         }
+
         auto stop_time = std::chrono::steady_clock::now();
 
         // If is called without tokenization then that stat will not be reported.
@@ -306,12 +354,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void start_chat(const std::string& system_message) override {
         is_chat_conversation = true;
-        m_selected_beam = std::nullopt;
-        if (!m_is_cache_empty) {
+        m_selected_beam = std::nullopt;
+        m_trust_encoded_history = true;
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        if (!m_tokenized_chat_history.empty()) {
            reset_kv_state();
-            m_is_cache_empty = true;
            m_history = {};
            m_templated_chat_history = "";
+            m_tokenized_chat_history.clear();
        }
        if (system_message.empty())
            return;
@@ -325,11 +375,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     void finish_chat() override {
         is_chat_conversation = false;
         m_selected_beam = std::nullopt;
-        if (!m_is_cache_empty) {
+        m_trust_encoded_history = true;
+        m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
-            m_is_cache_empty = true;
             m_history.clear();
             m_templated_chat_history.clear();
+            m_tokenized_chat_history.clear();
         }
     }
 };
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index dcc73f2ea3..5c2133d624 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -13,6 +13,8 @@
 #include "openvino/op/tanh.hpp"
 #include "openvino/op/transpose.hpp"
 
+#include "sampler.hpp"
+
 namespace ov {
 namespace genai {
 namespace utils {
@@ -265,6 +267,22 @@ ov::Core singleton_core() {
     return core;
 }
 
+bool is_tokenized_history_same(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history) {
+    size_t i = 0;
+    // encoded_history should be equal to or shorter than tokenized_history, since tokenized_history also includes the eos token
+    if (encoded_history.get_size() > tokenized_history.size())
+        return false;
+    auto encoded_history_data = encoded_history.data<int64_t>();
+    while(i < encoded_history.get_size()) {
+        if (encoded_history_data[i] != tokenized_history[i])
+            break;
+        i++;
+    }
+
+    // after a decode/encode round trip through the tokenizer, encoded_history may lose the last token (eos/stop token)
+    return i == tokenized_history.size() - 1 || i == tokenized_history.size() - 2;
+}
+
 } // namespace utils
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 9adc46c87a..ba1a29a8d3 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -12,6 +12,12 @@ namespace ov {
 namespace genai {
 namespace utils {
 
+enum class GenerationChatInputsType {
+    UNDEF = 0, // Default value, type of inputs is not defined
+    STRING = 1, // Type of inputs is StringInputs
+    ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
+};
+
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
@@ -66,6 +72,8 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);
 
 ov::Core singleton_core();
 
+bool is_tokenized_history_same(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history);
+
 } // namespace utils
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index d61ab57f60..7ffb6fa7a7 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -36,8 +36,12 @@ class InputsEmbedder::IInputsEmbedder {
     ChatHistory m_history;
     // Templated chat history
     std::string m_templated_chat_history;
+    // Tokenized chat history
+    std::vector<int64_t> m_tokenized_chat_history;
     // Whether we have computed some inputs already
     bool m_is_cache_empty = true;
+    // Whether the text history has an unambiguous encoded representation
+    bool m_trust_encoded_history = true;
 
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images) = 0;
@@ -50,8 +54,18 @@ class InputsEmbedder::IInputsEmbedder {
         return m_tokenizer;
     }
 
+    std::vector<int64_t> get_tokenized_chat_history() const {
+        return m_tokenized_chat_history;
+    }
+
+    void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
+        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
+    }
+
     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
+        m_trust_encoded_history = true;
+        m_tokenized_chat_history.clear();
         if (!m_is_cache_empty) {
             m_history.clear();
             m_templated_chat_history.clear();
@@ -75,9 +89,11 @@ class InputsEmbedder::IInputsEmbedder {
     virtual void finish_chat() {
         m_is_chat_conversation = false;
         m_is_cache_empty = true;
+        m_trust_encoded_history = true;
 
         m_history.clear();
         m_templated_chat_history.clear();
+        m_tokenized_chat_history.clear();
     }
 
 protected:
@@ -112,19 +128,29 @@ class InputsEmbedder::IInputsEmbedder {
                 new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
             }
             ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
-            if (m_is_cache_empty) {
+            TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);
+
+            // some symbol combinations can be encoded by the tokenizer in different ways
+            // if we encounter a sequence with such a combination of symbols, we cannot correctly subtract the new history from the old one
+            // and find the difference as a prompt
+            // so check for this case and use the whole history instead
+            if (!m_is_cache_empty && m_trust_encoded_history) {
+                m_trust_encoded_history = ov::genai::utils::is_tokenized_history_same(prev_chat_tokens.input_ids, m_tokenized_chat_history);
+            }
+
+            if (m_is_cache_empty || m_trust_encoded_history) {
                 encoded_input_ids = new_chat_tokens;
                 // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty
                 m_is_cache_empty = false;
             } else {
-                TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
-                    m_templated_chat_history
-                );
                 encoded_input_ids = utils::subtract_chat_tokenized_inputs(
                     {new_chat_tokens}, prev_chat_tokens
                 ).input_ids;
             }
             m_templated_chat_history = std::move(new_templated_chat_history);
+            m_tokenized_chat_history.clear();
+            std::copy(new_chat_tokens.data<int64_t>(), new_chat_tokens.data<int64_t>() + new_chat_tokens.get_size(),
+                      std::back_inserter(m_tokenized_chat_history));
         } else {
             encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
         }
@@ -1032,6 +1058,14 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }
 
+std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
+    return m_impl->get_tokenized_chat_history();
+}
+
+void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
+    return m_impl->update_tokenized_chat_history(encoded_result);
+}
+
 Tokenizer InputsEmbedder::get_tokenizer() const {
     return m_impl->get_tokenizer();
 }
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 15df273ee8..a80e74adbb 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -31,6 +31,11 @@ class InputsEmbedder {
     // returns tokenizer
     Tokenizer get_tokenizer() const;
 
+    // returns tokenized chat history
+    std::vector<int64_t> get_tokenized_chat_history() const;
+    // adds new results to tokenized chat history
+    void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
+
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);
     // adds currently generated text to chat history
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 28077f3ece..835c18656d 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -99,8 +99,14 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         size_t request_id = 0;
         size_t block_size = 1; // not used
         bool enable_prefix_caching = false;
+
+        auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
         size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1);
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
+        // inputs_embeds contains the whole history
+        if (inputs_embeds_size == tokenized_chat_history.size()) {
+            history_size = 0;
+        }
 
         ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
         std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
@@ -125,16 +131,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
                         "Currently streaming is possible only for greedy or multinomial decoding");
 
-        ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds.get_shape()[1] }};
+        ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds_size }};
         std::fill_n(new_atten_mask.data<int64_t>(), new_atten_mask.get_size(), 1);
 
-        ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds.get_shape()[1] }};
+        ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
 
         ov::genai::EncodedResults encoded_result;
         int32_t m_selected_beam = 0;
         std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler,
                                                                                       requests, position_ids, m_embedding, std::nullopt);
+
+        m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
 
         DecodedResults decoded;
         for (size_t idx = 0; idx < encoded_result.tokens.size(); ++idx) {
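
The history-trust check introduced in src/cpp/src/utils.cpp can be exercised in isolation. Below is a minimal, illustrative sketch, not the patched function itself: it swaps the ov::Tensor argument for a plain std::vector<int64_t> so it compiles without OpenVINO, and it mirrors the same prefix comparison, tolerating the one or two trailing eos/stop tokens that a decode/encode round trip may drop. The token ids used in main() are hypothetical.

// Minimal sketch of the prefix comparison behind is_tokenized_history_same().
// Assumption: the re-encoded history is passed as a plain vector instead of an
// ov::Tensor so the snippet is self-contained.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

static bool history_prefix_matches(const std::vector<int64_t>& encoded_history,
                                   const std::vector<int64_t>& tokenized_history) {
    // The re-encoded history must not be longer than the stored history,
    // which additionally keeps the eos/stop token(s).
    if (encoded_history.size() > tokenized_history.size())
        return false;

    // Walk the common prefix.
    std::size_t i = 0;
    while (i < encoded_history.size() && encoded_history[i] == tokenized_history[i])
        ++i;

    // Accept a match that is short by one or two trailing tokens, mirroring
    // the tolerance for the eos/stop token in the patched utility.
    return i == tokenized_history.size() - 1 || i == tokenized_history.size() - 2;
}

int main() {
    std::vector<int64_t> stored    = {1, 15043, 29892, 3186, 2};  // hypothetical token ids + eos
    std::vector<int64_t> reencoded = {1, 15043, 29892, 3186};     // eos dropped after decode/encode
    std::cout << std::boolalpha << history_prefix_matches(reencoded, stored) << '\n';  // prints: true
    return 0;
}

With these hypothetical token ids the check succeeds because the re-encoded history differs from the stored one only by the trailing eos token; any earlier mismatch would make the prefix walk stop early and the check return false.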