
Commit

update
sbalandi committed Nov 25, 2024
1 parent a2dbeba commit 6f9335e
Showing 5 changed files with 75 additions and 30 deletions.
86 changes: 63 additions & 23 deletions src/cpp/src/llm_pipeline.cpp
@@ -21,21 +21,30 @@
#include "sampler.hpp"
#include "lm_encoding.hpp"

#include "debug_utils.hpp"
namespace {

enum class GenerationChatInputsType {
UNDEF = 0, // Default value, type of inputs is not defined
STRING = 1, // Type of inputs is StringInputs
ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
};

} // namespace

namespace ov {
namespace genai {

class StatefulLLMPipeline final : public LLMPipelineImplBase {
public:
ov::InferRequest m_model_runner;

bool is_chat_conversation = false;
bool m_history_available = false;
std::optional<int32_t> m_selected_beam = std::nullopt;
bool m_is_cache_empty = true;
ChatHistory m_history;
std::string m_templated_chat_history = {};
std::vector<int64_t> m_tokenized_chat_history;
GenerationChatInputsType m_chat_input_type = GenerationChatInputsType::UNDEF;
std::optional<int64_t> m_last_disappeared_token = std::nullopt;

StatefulLLMPipeline(
const ov::InferRequest& request,
@@ -87,6 +96,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
OptionalGenerationConfig generation_config,
StreamerVariant streamer
) override {
if (is_chat_conversation && m_chat_input_type == GenerationChatInputsType::UNDEF)
m_chat_input_type = GenerationChatInputsType::STRING;

if (is_chat_conversation)
OPENVINO_ASSERT(m_chat_input_type != GenerationChatInputsType::ENCODED_INPUTS,
"Chat doesn't support switching between input types. Please, continue using EncodedInputs or restart the chat.");

auto start_time = std::chrono::steady_clock::now();
GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
TokenizedInputs encoded_input;
@@ -119,6 +135,16 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
encoded_input = new_chat_tokens;
}
m_templated_chat_history = new_templated_chat_history;

m_tokenized_chat_history.clear();
std::copy(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.data<int64_t>() + new_chat_tokens.input_ids.get_size(),
std::back_inserter(m_tokenized_chat_history));

// no need to add m_last_disappeared_token to encoded_input, it was kept by subtract_chat_tokenized_inputs
if (m_last_disappeared_token.has_value() && *m_last_disappeared_token == encoded_input.input_ids.data<int64_t>()[0]) {
m_last_disappeared_token = std::nullopt;
}

// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
encoded_input = m_tokenizer.encode(prompt);
@@ -172,6 +198,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
OptionalGenerationConfig generation_config,
StreamerVariant streamer
) override {
if (is_chat_conversation && m_chat_input_type == GenerationChatInputsType::UNDEF)
m_chat_input_type = GenerationChatInputsType::ENCODED_INPUTS;

if (is_chat_conversation)
// if the chat was run in StringInputs mode but generate() is now called with EncodedInputs, the last m_history entry will have the assistant role
OPENVINO_ASSERT(m_chat_input_type == GenerationChatInputsType::ENCODED_INPUTS || m_history.back()["role"] == "user",
"Chat doesn't support switching between input types. Please, continue using StringInputs or restart the chat.");

auto start_time = std::chrono::steady_clock::now();
ov::Tensor input_ids;
ov::Tensor attention_mask;
@@ -183,6 +217,16 @@
attention_mask = data->attention_mask;
}

if (is_chat_conversation && m_chat_input_type == GenerationChatInputsType::ENCODED_INPUTS) {
std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
}

// Tail of previous output in chat mode is missing in KV cache.
if (is_chat_conversation && m_last_disappeared_token.has_value()) {
attention_mask = ov::genai::utils::push_front_inputs(attention_mask, std::vector<int64_t>{1});
input_ids = ov::genai::utils::push_front_inputs(input_ids, std::vector<int64_t>{*m_last_disappeared_token});
}

GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;

// If eos_token_id was not provided, take value from default m_generation_config
@@ -214,16 +258,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
"(input_ids, attention_mask, position_ids, beam_idx) "
"but you have '" + std::to_string(num_inputs) + "' inputs");

if (is_chat_conversation) {
std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
}
ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
bool is_cache_empty = !m_selected_beam.has_value();

size_t kv_cache_len = 0;
ov::Tensor concatenated_attention_mask;
if (is_chat_conversation && m_history_available) {
if (is_cache_empty) {
if (m_is_cache_empty) {
attention_mask = ov::genai::utils::init_attention_mask(tokenized_chat_history);
concatenated_attention_mask = attention_mask;
} else {
@@ -249,7 +288,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
bool position_ids_available = (num_inputs == 4);
std::optional<ov::Tensor> position_ids = std::nullopt;
if (position_ids_available) {
if (is_chat_conversation && is_cache_empty) {
if (is_chat_conversation && m_is_cache_empty) {
position_ids = ov::Tensor{ov::element::i64, tokenized_chat_history.get_shape()};
} else {
position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
@@ -261,7 +300,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
m_adapter_controller->apply(m_model_runner, config.adapters);
}

ov::genai::EncodedResults result;
std::vector<SequenceGroup::Ptr> requests;
size_t block_size = 1;
bool enable_prefix_caching = false;
@@ -285,25 +323,25 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
Sampler sampler = Sampler(m_tokenizer);
// we can't properly refer to the history in the chat scenario with beam search, so we reset the KV state and use the whole history for each new prompt
auto input_tokens = input_ids;
if (is_chat_conversation && is_cache_empty) {
if (is_chat_conversation && m_is_cache_empty) {
input_tokens = tokenized_chat_history;
}
result = ov::genai::get_lm_encoded_results(m_model_runner, input_tokens, concatenated_attention_mask, streamer_ptr, sampler, requests, position_ids, std::nullopt);
ov::genai::EncodedResults result = ov::genai::get_lm_encoded_results(m_model_runner, input_tokens, concatenated_attention_mask,
streamer_ptr, sampler, requests, position_ids, std::nullopt);

m_selected_beam = 0;
m_is_cache_empty = false;
if (!is_chat_conversation || config.is_beam_search()) {
reset_kv_state();
m_selected_beam = std::nullopt;
m_is_cache_empty = true;
}

if (is_chat_conversation) {
m_history_available = true;
m_last_disappeared_token = result.tokens[0].back();
}

if (is_chat_conversation) {
auto decoded_result = m_tokenizer.decode(result.tokens[0]);
auto answer = m_tokenizer.encode(decoded_result, ov::genai::add_special_tokens(false)).input_ids;
std::copy(answer.data<int64_t>(), answer.data<int64_t>() + answer.get_size(), std::back_inserter(m_tokenized_chat_history));
if (is_chat_conversation && m_chat_input_type == GenerationChatInputsType::ENCODED_INPUTS) {
std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
}

auto stop_time = std::chrono::steady_clock::now();
@@ -319,13 +357,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

void start_chat(const std::string& system_message) override {
is_chat_conversation = true;
m_selected_beam = std::nullopt;
m_is_cache_empty = true;
m_chat_input_type = GenerationChatInputsType::UNDEF;
m_tokenized_chat_history = {};
if (m_history_available) {
reset_kv_state();
m_history_available = false;
m_history = {};
m_templated_chat_history = "";
m_tokenized_chat_history = {};
}
if (system_message.empty())
return;
@@ -338,13 +377,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

void finish_chat() override {
is_chat_conversation = false;
m_selected_beam = std::nullopt;
m_is_cache_empty = true;
m_chat_input_type = GenerationChatInputsType::UNDEF;
m_tokenized_chat_history = {};
if (m_history_available) {
reset_kv_state();
m_history_available = false;
m_history.clear();
m_templated_chat_history.clear();
m_tokenized_chat_history = {};
}
}
};
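
Note: one pattern the reworked generate() relies on is keeping m_tokenized_chat_history as a std::vector<int64_t> and wrapping it in a non-owning ov::Tensor view when the whole history has to be passed to the model. A minimal standalone sketch of that trick (the token IDs are made up; the tensor is only valid while the vector is alive and not reallocated):

#include <cstdint>
#include <iostream>
#include <vector>
#include <openvino/runtime/tensor.hpp>

int main() {
    std::vector<int64_t> history = {1, 15043, 29892, 3186};  // illustrative token IDs

    // Wrap the vector's storage without copying, shaped [1, history.size()],
    // mirroring how tokenized_chat_history is built in StatefulLLMPipeline.
    ov::Tensor view(ov::element::i64, {1, history.size()}, history.data());

    std::cout << "tokens: " << view.get_shape().at(1)
              << ", first id: " << view.data<int64_t>()[0] << std::endl;
    return 0;
}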
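Note: the GenerationChatInputsType guard added above means a single chat session must stick to one kind of input; mixing them now trips the new OPENVINO_ASSERT. A rough user-level sketch, assuming the public ov::genai::LLMPipeline API is used as documented (the model directory and prompts are placeholders):

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main() {
    ov::genai::LLMPipeline pipe("path/to/model_dir", "CPU");  // placeholder path

    pipe.start_chat();
    std::string first = pipe.generate("What is OpenVINO?");  // chat is now in StringInputs mode
    std::cout << first << std::endl;

    // Calling generate() with EncodedInputs inside the same chat would now throw
    // "Chat doesn't support switching between input types...". Either keep using
    // strings, or finish_chat() and start a new chat before switching.
    std::string second = pipe.generate("And what is GenAI?");
    std::cout << second << std::endl;

    pipe.finish_chat();
    return 0;
}
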
5 changes: 2 additions & 3 deletions src/cpp/src/lm_encoding.cpp
@@ -9,12 +9,11 @@
#include <regex>
#include <vector>

#include "utils.hpp"
#include "debug_utils.hpp"
#include "lm_encoding.hpp"
#include "openvino/genai/perf_metrics.hpp"

#include "utils.hpp"

#include "debug_utils.hpp"

namespace ov {
namespace genai {
4 changes: 0 additions & 4 deletions src/cpp/src/lm_encoding.hpp
@@ -12,9 +12,5 @@ EncodedResults get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor&
const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
std::optional<ov::Tensor> position_ids, std::optional<EmbeddingsModel> m_embedding);

void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams);

void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask);

}
}
8 changes: 8 additions & 0 deletions src/cpp/src/utils.cpp
@@ -266,6 +266,14 @@ ov::Core singleton_core() {
return core;
}

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front) {
ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}};
auto new_tensor_data = new_tensor.data<int64_t>();
std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data);
std::copy(base_tensor.data<int64_t>(), base_tensor.data<int64_t>() + base_tensor.get_size(), new_tensor_data + add_to_front.size());
return new_tensor;
}

} // namespace utils
} // namespace genai
} // namespace ov
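
Note: a minimal usage sketch of the new push_front_inputs helper, assuming it is compiled inside the GenAI source tree where this internal utils.hpp is visible and that the batch size is 1 (which is how the pipeline calls it). The pipeline uses it above to re-insert the last generated token that is not yet reflected in the KV cache, extending input_ids and the attention mask by one leading position; the token value 42 below is made up.

#include <cstdint>
#include <iostream>
#include <vector>
#include <openvino/runtime/tensor.hpp>
#include "utils.hpp"  // internal src/cpp/src/utils.hpp

int main() {
    // A [1, 3] i64 tensor standing in for freshly tokenized user input.
    ov::Tensor input_ids{ov::element::i64, {1, 3}};
    int64_t* data = input_ids.data<int64_t>();
    data[0] = 10; data[1] = 11; data[2] = 12;

    // Prepend the token that is missing from the KV cache.
    ov::Tensor extended = ov::genai::utils::push_front_inputs(input_ids, std::vector<int64_t>{42});

    // Prints: 42 10 11 12 (shape becomes [1, 4]).
    for (size_t i = 0; i < extended.get_size(); ++i)
        std::cout << extended.data<int64_t>()[i] << " ";
    std::cout << std::endl;
    return 0;
}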
2 changes: 2 additions & 0 deletions src/cpp/src/utils.hpp
@@ -86,6 +86,8 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);

ov::Core singleton_core();

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front);

} // namespace utils
} // namespace genai
} // namespace ov
