Commit
Use whole history in case of undetermined tokenization of sequence
sbalandi committed Nov 27, 2024
1 parent b99d35c commit 0c7c4da
Showing 6 changed files with 172 additions and 47 deletions.
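In chat mode the pipeline re-encodes the full templated history each turn and subtracts the previously encoded history to obtain only the new prompt. That shortcut assumes tokenization is prefix-stable, which does not hold for every symbol combination, so this commit falls back to replaying the whole history whenever the cached tokens and a fresh re-encoding disagree. A self-contained sketch of the failure mode, using a hypothetical toy tokenizer (the merge rule and token ids are invented purely for illustration):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy tokenizer, purely illustrative: it merges the pair "ab" into a single
// token id 100 and maps every other character to its char code.
std::vector<int64_t> toy_encode(const std::string& text) {
    std::vector<int64_t> ids;
    for (size_t i = 0; i < text.size(); ++i) {
        if (i + 1 < text.size() && text[i] == 'a' && text[i + 1] == 'b') {
            ids.push_back(100);  // merged token "ab"
            ++i;
        } else {
            ids.push_back(static_cast<int64_t>(text[i]));
        }
    }
    return ids;
}

int main() {
    std::string old_history = "hi a";              // previously templated history
    std::string new_history = old_history + "bc";  // history plus the new user turn
    auto old_ids = toy_encode(old_history);        // ends with ... 'a'
    auto new_ids = toy_encode(new_history);        // ends with ... 100 ("ab"), 'c'

    // old_ids is no longer a prefix of new_ids, so "new minus old" would feed the
    // model a corrupted prompt; the safe fallback is to drop the KV cache and
    // resend the whole history, which is what the commit does.
    bool prefix_stable = old_ids.size() <= new_ids.size() &&
                         std::equal(old_ids.begin(), old_ids.end(), new_ids.begin());
    std::cout << std::boolalpha << prefix_stable << "\n";  // prints: false
    return 0;
}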
134 changes: 93 additions & 41 deletions src/cpp/src/llm_pipeline.cpp
@@ -36,13 +36,13 @@ std::pair<EncodedResults, int32_t> beam_search(
class StatefulLLMPipeline final : public LLMPipelineImplBase {
public:
ov::InferRequest m_model_runner;

bool is_chat_conversation = false;
bool m_is_cache_empty = true;
bool m_trust_encoded_history = true;
std::optional<int32_t> m_selected_beam = std::nullopt;
ChatHistory m_history;
std::string m_templated_chat_history = {};
TokenizedInputs m_tokenized_chat_history;
std::vector<int64_t> m_tokenized_chat_history;
ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;

StatefulLLMPipeline(
const ov::InferRequest& request,
@@ -94,6 +94,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
OptionalGenerationConfig generation_config,
StreamerVariant streamer
) override {
if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::STRING;

if (is_chat_conversation)
OPENVINO_ASSERT(m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS,
"Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");

auto start_time = std::chrono::steady_clock::now();
GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
TokenizedInputs encoded_input;
@@ -119,14 +126,30 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
// Do not add special tokens in chat scenario to be aligned with HF.
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false));
if (m_is_cache_empty) {
encoded_input = new_chat_tokens;
} else {
auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));

// some symbol combinations can be encoded by the tokenizer in different ways
// if we encounter a sequence with such a combination of symbols, we cannot reliably subtract the old history from the new one
// to isolate the new prompt, so check for this case and use the whole history instead
if (!m_tokenized_chat_history.empty() && m_trust_encoded_history)
m_trust_encoded_history = ov::genai::utils::is_tokenized_history_same(prev_chat_tokens.input_ids, m_tokenized_chat_history);

if (!m_trust_encoded_history) {
reset_kv_state();
m_selected_beam = std::nullopt;
}

if (!m_tokenized_chat_history.empty() && m_trust_encoded_history) {
encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
} else {
encoded_input = new_chat_tokens;
}
m_templated_chat_history = new_templated_chat_history;
m_tokenized_chat_history = new_chat_tokens;
m_tokenized_chat_history.clear();
m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
std::back_inserter(m_tokenized_chat_history));

// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
encoded_input = m_tokenizer.encode(prompt);
@@ -180,6 +203,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
OptionalGenerationConfig generation_config,
StreamerVariant streamer
) override {
if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::UNDEF)
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS;

if (is_chat_conversation)
// if the chat was run in StringInputs mode but generate() is now called with EncodedInputs, the last m_history entry will have the assistant role
OPENVINO_ASSERT(m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
"Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");

auto start_time = std::chrono::steady_clock::now();
ov::Tensor input_ids;
ov::Tensor attention_mask;
@@ -191,6 +222,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
attention_mask = data->attention_mask;
}

if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));

GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;

// If eos_token_id was not provided, take value from default m_generation_config
@@ -222,53 +256,66 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
"(input_ids, attention_mask, position_ids, beam_idx) "
"but you have '" + std::to_string(num_inputs) + "' inputs");


ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
size_t kv_cache_len = 0;
ov::Tensor concatenated_attention_mask;
if (is_chat_conversation && !m_is_cache_empty) {
OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
// If history is saved in KV cache, concatenate new attention_mask with the already existing.
// Between subsequent runs attention_mask should not be modified.
auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
auto prompt_len = attention_mask.get_shape()[1];
kv_cache_len = atten_mask_history.get_shape()[1];

ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
new_atten_mask.data<int64_t>());
std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
new_atten_mask.data<int64_t>() + kv_cache_len);
concatenated_attention_mask = new_atten_mask;
if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
if (m_trust_encoded_history) {
OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
// If history is saved in KV cache, concatenate new attention_mask with the already existing.
// Between subsequent runs attention_mask should not be modified.
auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
auto prompt_len = attention_mask.get_shape()[1];
kv_cache_len = atten_mask_history.get_shape()[1];

ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
new_atten_mask.data<int64_t>());
std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
new_atten_mask.data<int64_t>() + kv_cache_len);
concatenated_attention_mask = new_atten_mask;
} else {
attention_mask = ov::genai::utils::init_attention_mask(tokenized_chat_history);
concatenated_attention_mask = attention_mask;
}
} else {
concatenated_attention_mask = attention_mask;
}

bool position_ids_available = (num_inputs == 4);
std::optional<ov::Tensor> position_ids = std::nullopt;
if (position_ids_available) {
position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
if (is_chat_conversation && !m_trust_encoded_history) {
position_ids = ov::Tensor{ov::element::i64, tokenized_chat_history.get_shape()};
} else {
position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
}
utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
}

if(m_adapter_controller) {
m_adapter_controller->apply(m_model_runner, config.adapters);
}

auto input_tokens = input_ids;
if (is_chat_conversation && !m_trust_encoded_history) {
input_tokens = tokenized_chat_history;
m_trust_encoded_history = true;
}

ov::genai::EncodedResults result;
if (config.is_beam_search() && is_chat_conversation) {
std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_tokens, concatenated_attention_mask,
config, position_ids, m_selected_beam);
} else {
std::vector<SequenceGroup::Ptr> requests;
size_t block_size = 1;
bool enable_prefix_caching = false;

config.stop_token_ids.insert(config.eos_token_id);
for (size_t request_id = 0; request_id < batch_size; request_id++) {
SequenceGroup::Ptr sequence_group;
if (is_chat_conversation && !m_is_cache_empty) {
sequence_group = std::make_shared<SequenceGroup>(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching);
if (is_chat_conversation) {
sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
} else {
size_t seq_len = input_ids.get_shape().at(1);
size_t batch_offset = request_id * seq_len;
@@ -283,16 +330,17 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
}

Sampler sampler = Sampler(m_tokenizer);
std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
sampler, requests, position_ids, std::nullopt, m_selected_beam);
std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_tokens, concatenated_attention_mask, streamer_ptr,
sampler, requests, position_ids, std::nullopt, m_selected_beam);
}

if (!is_chat_conversation) {
if (is_chat_conversation) {
std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
} else {
reset_kv_state();
m_selected_beam = std::nullopt;
} else {
m_is_cache_empty = false;
}

auto stop_time = std::chrono::steady_clock::now();

// If called without tokenization, that stat will not be reported.
@@ -306,12 +354,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

void start_chat(const std::string& system_message) override {
is_chat_conversation = true;
m_selected_beam = std::nullopt;
if (!m_is_cache_empty) {
m_selected_beam = std::nullopt;
m_trust_encoded_history = true;
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
if (!m_tokenized_chat_history.empty()) {
reset_kv_state();
m_is_cache_empty = true;
m_history = {};
m_templated_chat_history = "";
m_tokenized_chat_history.clear();
}
if (system_message.empty())
return;
@@ -325,11 +375,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
void finish_chat() override {
is_chat_conversation = false;
m_selected_beam = std::nullopt;
if (!m_is_cache_empty) {
m_trust_encoded_history = true;
m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
if (!m_tokenized_chat_history.empty()) {
reset_kv_state();
m_is_cache_empty = true;
m_history.clear();
m_templated_chat_history.clear();
m_tokenized_chat_history.clear();
}
}
};
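When the fallback path is taken in the encoded-inputs branch above, the attention mask and position ids are rebuilt over the whole tokenized history instead of continuing from the KV-cache length. A standalone sketch of the two shapes, with plain vectors standing in for ov::Tensor and illustrative sizes:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // Illustrative sizes only.
    size_t kv_cache_len = 12;  // tokens already behind the KV cache
    size_t prompt_len   = 3;   // tokens of the new turn only
    size_t history_len  = 15;  // whole tokenized chat history

    // Trusted path: the mask spans cache + new prompt, positions continue after the cache.
    std::vector<int64_t> mask_incremental(kv_cache_len + prompt_len, 1);
    std::vector<int64_t> pos_incremental(prompt_len);
    std::iota(pos_incremental.begin(), pos_incremental.end(),
              static_cast<int64_t>(kv_cache_len));                 // 12, 13, 14

    // Fallback path: the cache was reset, so mask and positions span the whole history.
    std::vector<int64_t> mask_full(history_len, 1);
    std::vector<int64_t> pos_full(history_len);
    std::iota(pos_full.begin(), pos_full.end(), int64_t{0});       // 0 .. 14

    std::cout << mask_incremental.size() << " " << pos_incremental.front() << "\n";  // 15 12
    std::cout << mask_full.size() << " " << pos_full.front() << "\n";                // 15 0
    return 0;
}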
18 changes: 18 additions & 0 deletions src/cpp/src/utils.cpp
@@ -13,6 +13,8 @@
#include "openvino/op/tanh.hpp"
#include "openvino/op/transpose.hpp"

#include "sampler.hpp"

namespace ov {
namespace genai {
namespace utils {
@@ -265,6 +267,22 @@ ov::Core singleton_core() {
return core;
}

bool is_tokenized_history_same(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history) {
size_t i = 0;
// encoded_history should be equal in length to or shorter than tokenized_history; tokenized_history also includes the eos token
if (encoded_history.get_size() > tokenized_history.size())
return false;
auto encoded_history_data = encoded_history.data<int64_t>();
while(i < encoded_history.get_size()) {
if (encoded_history_data[i] != tokenized_history[i])
break;
i++;
}

// after the tokenizer's decode/encode round trip, encoded_history can lose the trailing token(s) (eos/stop token)
return i == tokenized_history.size() - 1 || i == tokenized_history.size() - 2;
}

} // namespace utils
} // namespace genai
} // namespace ov
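The helper's return value tolerates the cached history carrying one or two trailing tokens (eos/stop) that a re-encoding of the decoded text no longer contains. A standalone restatement with plain vectors, for illustration only (the committed function takes an ov::Tensor for encoded_history and lives in the internal utils namespace):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative restatement of the new is_tokenized_history_same() helper.
bool same_up_to_trailing_stop_tokens(const std::vector<int64_t>& encoded_history,
                                     const std::vector<int64_t>& tokenized_history) {
    if (encoded_history.size() > tokenized_history.size())
        return false;
    size_t i = 0;
    while (i < encoded_history.size() && encoded_history[i] == tokenized_history[i])
        ++i;
    // The cached history also holds the generated eos/stop token(s), which a
    // decode/encode round trip can drop, so one or two trailing tokens may be missing.
    return i == tokenized_history.size() - 1 || i == tokenized_history.size() - 2;
}

int main() {
    std::vector<int64_t> cached   = {11, 22, 33, 44, 2};  // cached history, 2 = eos kept in cache
    std::vector<int64_t> re_enc   = {11, 22, 33, 44};     // eos lost on re-encode -> still "same"
    std::vector<int64_t> diverged = {11, 99, 33, 44};     // tokenization changed -> not "same"
    std::cout << same_up_to_trailing_stop_tokens(re_enc, cached) << "\n";    // 1
    std::cout << same_up_to_trailing_stop_tokens(diverged, cached) << "\n";  // 0
    return 0;
}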
8 changes: 8 additions & 0 deletions src/cpp/src/utils.hpp
@@ -12,6 +12,12 @@ namespace ov {
namespace genai {
namespace utils {

enum class GenerationChatInputsType {
UNDEF = 0, // Default value, type of inputs is not defined
STRING = 1, // Type of inputs is StringInputs
ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
};

Tensor init_attention_mask(const Tensor& position_ids);

void print_tensor(const ov::Tensor& tensor);
@@ -66,6 +72,8 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);

ov::Core singleton_core();

bool is_tokenized_history_same(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history);

} // namespace utils
} // namespace genai
} // namespace ov
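The enum backs a simple guard in StatefulLLMPipeline::generate(): the first call fixes the chat's input mode, and later calls with the other input type are rejected. A minimal standalone sketch of that guard (the check_chat_input_type function and the thrown exception are illustrative, not part of the commit):

#include <iostream>
#include <stdexcept>

enum class GenerationChatInputsType { UNDEF = 0, STRING = 1, ENCODED_INPUTS = 2 };

void check_chat_input_type(GenerationChatInputsType& current, GenerationChatInputsType incoming) {
    if (current == GenerationChatInputsType::UNDEF)
        current = incoming;  // the first generate() call fixes the chat's input mode
    else if (current != incoming)
        throw std::runtime_error("Chat doesn't support switching between input types. "
                                 "Please continue with the original input type or restart the chat.");
}

int main() {
    GenerationChatInputsType mode = GenerationChatInputsType::UNDEF;
    check_chat_input_type(mode, GenerationChatInputsType::STRING);  // ok, chat is now string-based
    try {
        check_chat_input_type(mode, GenerationChatInputsType::ENCODED_INPUTS);  // mixing types -> error
    } catch (const std::exception& e) {
        std::cout << e.what() << "\n";
    }
    return 0;
}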
42 changes: 38 additions & 4 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -36,8 +36,12 @@ class InputsEmbedder::IInputsEmbedder {
ChatHistory m_history;
// Templated chat history
std::string m_templated_chat_history;
// Tokenized chat history
std::vector<int64_t> m_tokenized_chat_history;
// Whether we have computed some inputs already
bool m_is_cache_empty = true;
// Whether the text history has an unambiguous encoded representation
bool m_trust_encoded_history = true;

public:
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images) = 0;
@@ -50,8 +54,18 @@ class InputsEmbedder::IInputsEmbedder {
return m_tokenizer;
}

std::vector<int64_t> get_tokenized_chat_history() const {
return m_tokenized_chat_history;
}

void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
}

virtual void start_chat(const std::string& system_message) {
m_is_chat_conversation = true;
m_trust_encoded_history = true;
m_tokenized_chat_history.clear();
if (!m_is_cache_empty) {
m_history.clear();
m_templated_chat_history.clear();
@@ -75,9 +89,11 @@ class InputsEmbedder::IInputsEmbedder {
virtual void finish_chat() {
m_is_chat_conversation = false;
m_is_cache_empty = true;
m_trust_encoded_history = true;

m_history.clear();
m_templated_chat_history.clear();
m_tokenized_chat_history.clear();
}

protected:
@@ -112,19 +128,29 @@ class InputsEmbedder::IInputsEmbedder {
new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
}
ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
if (m_is_cache_empty) {
TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);

// some symbol combinations can be encoded by the tokenizer in different ways
// if we encounter a sequence with such a combination of symbols, we cannot reliably subtract the old history from the new one
// to isolate the new prompt
// so check for this case and use the whole history instead
if (!m_is_cache_empty && m_trust_encoded_history) {
m_trust_encoded_history = ov::genai::utils::is_tokenized_history_same(prev_chat_tokens.input_ids, m_tokenized_chat_history);
}

if (m_is_cache_empty || m_trust_encoded_history) {
encoded_input_ids = new_chat_tokens;
// after the first `get_inputs_embeds` call, we assume the LLM has been run and the cache is no longer empty
m_is_cache_empty = false;
} else {
TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
m_templated_chat_history
);
encoded_input_ids = utils::subtract_chat_tokenized_inputs(
{new_chat_tokens}, prev_chat_tokens
).input_ids;
}
m_templated_chat_history = std::move(new_templated_chat_history);
m_tokenized_chat_history.clear();
std::copy(new_chat_tokens.data<int64_t>(), new_chat_tokens.data<int64_t>() + new_chat_tokens.get_size(),
std::back_inserter(m_tokenized_chat_history));
} else {
encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
}
@@ -1032,6 +1058,14 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
return m_impl->get_embedding_model();
}

std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
return m_impl->get_tokenized_chat_history();
}

void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
return m_impl->update_tokenized_chat_history(encoded_result);
}

Tokenizer InputsEmbedder::get_tokenizer() const {
return m_impl->get_tokenizer();
}
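The two new accessors let the owning pipeline keep the tokenized history complete: the embedder records the prompt-side tokens while encoding, and the caller appends the generated ids afterwards, so the next turn's consistency check sees the full history. A minimal stand-in illustrating that contract (the class and method names below are invented; only the vector-append bookkeeping mirrors the diff):

#include <cstdint>
#include <iostream>
#include <vector>

class TokenizedHistory {
    std::vector<int64_t> m_tokens;
public:
    // mirrors the embedder caching the freshly encoded chat tokens
    void record_prompt_tokens(const std::vector<int64_t>& prompt_ids) {
        m_tokens.insert(m_tokens.end(), prompt_ids.begin(), prompt_ids.end());
    }
    // mirrors update_tokenized_chat_history(): append what the model generated
    void update(const std::vector<int64_t>& generated_ids) {
        m_tokens.insert(m_tokens.end(), generated_ids.begin(), generated_ids.end());
    }
    // mirrors get_tokenized_chat_history()
    const std::vector<int64_t>& get() const { return m_tokens; }
};

int main() {
    TokenizedHistory history;
    history.record_prompt_tokens({101, 7, 8, 9});  // tokens of the templated user turn
    history.update({42, 43, 2});                   // tokens produced by the model (incl. eos)
    std::cout << history.get().size() << "\n";     // 7
    return 0;
}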
5 changes: 5 additions & 0 deletions src/cpp/src/visual_language/inputs_embedder.hpp
@@ -31,6 +31,11 @@ class InputsEmbedder {
// returns tokenizer
Tokenizer get_tokenizer() const;

// returns tokenized chat history
std::vector<int64_t> get_tokenized_chat_history() const;
// adds new results to the tokenized chat history
void update_tokenized_chat_history(std::vector<int64_t> encoded_result);

// starts chat and adds optional system_message to chat history
void start_chat(const std::string& system_message);
// adds currently generated text to chat history