[LLM][NPU] Ported sampler from Stateless to Stateful pipeline (#1507)
- *Ported sampler functionality from Stateless to Stateful pipeline*
AsyaPronina authored Jan 13, 2025
1 parent 71e6769 commit 5146984
Showing 2 changed files with 71 additions and 24 deletions.
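With the sampler ported, the stateful NPU pipeline supports both greedy and multinomial decoding (see the asserts in the generate() hunk below), still restricted to batch size 1 and num_return_sequences == 1. A minimal usage sketch — the MAX_PROMPT_LEN/MIN_RESPONSE_LEN property names and the sampling fields follow the diff and the public GenerationConfig API; the model path, prompt, and parameter values are purely illustrative:

#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // NPU-specific sizing knobs consumed by setupAndCompileModel() in the diff below.
    ov::AnyMap pipeline_config = { {"MAX_PROMPT_LEN", 1024u}, {"MIN_RESPONSE_LEN", 128u} };
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "NPU", pipeline_config);

    ov::genai::GenerationConfig config;
    config.do_sample = true;       // multinomial sampling, handled by the ported Sampler
    config.temperature = 0.7f;
    config.top_p = 0.9f;
    config.rng_seed = 42;          // forwarded to m_sampler.set_seed(...) in the constructors below
    config.max_new_tokens = 64;

    std::cout << pipe.generate("What is OpenVINO?", config) << std::endl;
    return 0;
}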
91 changes: 67 additions & 24 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -686,14 +686,16 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& config
) : LLMPipelineImplBase(tokenizer,
utils::from_config_json_if_exists(models_path)) {
utils::from_config_json_if_exists(models_path)),
m_sampler(m_tokenizer) {

auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;

auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}


@@ -704,10 +706,12 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config) {
) : LLMPipelineImplBase(tokenizer, generation_config),
m_sampler(m_tokenizer) {
ov::AnyMap properties_copy = properties;
auto compiled = setupAndCompileModel(model, model_desc, properties_copy);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
@@ -717,6 +721,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_max_prompt_len = kMaxPromptLen;
m_kvcache_total = kMaxPromptLen + kMinResponseLen;
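// For illustration (defaults as above): with MAX_PROMPT_LEN=1024 and MIN_RESPONSE_LEN=128,
// the total sequence budget is m_kvcache_total = 1024 + 128 = 1152 positions (prompt plus
// generated tokens); the decode loop in generate() stops once this budget is exhausted.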

update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
@@ -817,7 +822,9 @@ EncodedResults StatefulLLMPipeline::generate(
attention_mask = data->attention_mask;
}

OPENVINO_ASSERT(input_ids.get_shape().at(0) == 1u, "Currently only batch size=1 is supported");
ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
OPENVINO_ASSERT(batch_size == 1u, "Currently only batch size=1 is supported");

GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
// If eos_token_id was not provided, take value from default m_generation_config
@@ -834,19 +841,26 @@
streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
}

OPENVINO_ASSERT(config.is_greedy_decoding(), "Currently only greedy decoding is supported");
OPENVINO_ASSERT(config.is_greedy_decoding() || config.is_multinomial(),
"Currently only greedy and multinomial decoding are supported");

OPENVINO_ASSERT(config.num_return_sequences == 1u,
"Currently only \"num_return_sequences\" equal to 1 is supported!");

ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
ov::genai::EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
// NB: Only batch=1 is supported now
results.scores.resize(1u);
results.scores[0] = 0u;
results.tokens.resize(1u);

// TODO: Check if there is enough space in KV-cache to process input prompt
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_max_prompt_len) {
OPENVINO_THROW("Static Stateful LLM pipeline may only process prompts up to "
+ std::to_string(m_max_prompt_len) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}

ov::Tensor position_ids{ov::element::i64, input_ids.get_shape()};
utils::initialize_position_ids(position_ids, attention_mask);
@@ -857,26 +871,53 @@

m_request.infer();

int64_t last_token = utils::argmax(m_request.get_tensor("logits"), 0);

results.tokens[0].push_back(last_token);
if (streamer_ptr && streamer_ptr->put(last_token)) {
return results;
auto padded_logits = m_request.get_tensor("logits");
// FIXME: This is a workaround to extract only the useful part of the returned logits.
// If SliceOut is applied, only 1 useful logit is returned and nothing needs to be done here.
// Otherwise, the model returns logits for the full context length, as the prefill model is
// internally reshaped to do so. The proper fix belongs on the OpenVINO side: the model should
// return only the useful logits for the input prompt length, dropping the
// implementation-related padding ones.
auto logits = padded_logits;
auto padded_sequence_len = padded_logits.get_shape()[1];
if (padded_sequence_len > 1) {
// If SliceOut is not applied:
logits = make_tensor_slice(padded_logits, 1, padded_sequence_len - prompt_len, padded_sequence_len);
}
int64_t output_sequence_len = logits.get_shape().at(1);
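// For illustration, assuming make_tensor_slice(tensor, axis, start, end) semantics: if the
// prefill model returns logits for the full padded context (e.g. padded_sequence_len == 1152)
// and the real prompt is 30 tokens, the slice keeps only the last 30 positions along axis 1,
// so output_sequence_len == prompt_len. With SliceOut applied, padded_sequence_len is already 1
// and the logits are used as-is.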

auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
sequence_group->schedule_tokens(output_sequence_len);

// NB: Controls what tokens are ready to be pushed into the streamer
GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());

SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);

int64_t input_ids_data = -1;
int64_t position_ids_data = prompt_len - 1;
std::vector<int64_t> attention_mask_data(prompt_len - 1, 1);
m_request.set_tensor("input_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&input_ids_data)));
m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&position_ids_data)));

const size_t max_tokens = config.get_max_new_tokens(prompt_len);
for (int i = 0; i < max_tokens - 1; ++i) {
while (sequence_group->is_running()) {
// KV Cache is full, no further generation is possible
if (position_ids_data + 1 == m_kvcache_total) {
sequence_group->set_out_of_memory();
break;
}

sequence_group->schedule_tokens(1);
const auto running_sequences = sequence_group->get_running_sequences();
OPENVINO_ASSERT(running_sequences.size() == 1u);
auto last_token = running_sequences.front()->get_generated_ids().back();

// Just change the variables here, as pointers to them are already set to corresponding tensors
input_ids_data = last_token;
++position_ids_data;
Expand All @@ -886,24 +927,24 @@ EncodedResults StatefulLLMPipeline::generate(

m_request.infer();

last_token = utils::argmax(m_request.get_tensor("logits"), 0);
results.tokens[0].push_back(last_token);

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
if (streamer_ptr && streamer_ptr->put(last_token)) {
break;
}

if (last_token == config.eos_token_id && !config.ignore_eos) {
break;
}
SamplerOutput sampler_output = m_sampler.sample(
{sequence_group}, m_request.get_tensor("logits"));
stream_generated_tokens(streamer_ptr, handle);
}

if (streamer_ptr) {
streamer_ptr->end();
}

OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u);
auto sequence = sequence_group->get_finished_sequences().front();
results.tokens[0] = sequence->get_generated_ids();
results.scores[0] = sequence->get_cumulative_log_prob();
m_sampler.clear_request_info(sequence_group->get_request_id());

auto stop_time = std::chrono::steady_clock::now();
// If called without tokenization, that stat will not be reported.
auto& metrics = results.perf_metrics;
@@ -1288,7 +1329,7 @@ EncodedResults StatelessLLMPipeline::generate(
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_kvcache_desc.max_prompt_size) {
OPENVINO_THROW("Static LLM pipeline may only process prompts up to "
OPENVINO_THROW("Static Stateless LLM pipeline may only process prompts up to "
+ std::to_string(m_kvcache_desc.max_prompt_size) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}
@@ -1318,6 +1359,8 @@
auto logits = m_prefill_request.get_tensor("logits");
int64_t output_sequence_len = logits.get_shape().at(1);

// TODO: Pass input_ids to indicate that there is room for generation.
// Retrieve only the useful logits and work only with them here.
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
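One user-visible effect of the port, shown at the end of the generate() hunk above: the stateful path now fills EncodedResults::scores with the finished sequence's cumulative log-prob instead of a hard-coded 0, and the token ids come from the sampled sequence. A short sketch of reading them through the public API — pipe and config as in the example near the top; the prompt and variable names are illustrative:

ov::genai::Tokenizer tokenizer = pipe.get_tokenizer();
ov::genai::TokenizedInputs inputs = tokenizer.encode("What is OpenVINO?");
ov::genai::EncodedResults res = pipe.generate(inputs, config);
// tokens[0] holds the generated ids, scores[0] the cumulative log-prob of that sequence.
std::cout << res.tokens[0].size() << " tokens, cumulative log-prob " << res.scores[0] << std::endl;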
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -75,8 +75,12 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
void finish_chat() override;

private:
uint32_t m_max_prompt_len = 0u;
uint32_t m_kvcache_total = 0u;
ov::InferRequest m_request;

Sampler m_sampler;

bool m_is_chat_conversation = false;
ChatHistory m_history;
};
