[LLM/VLM] Stop generation when streaming callback returns true (#1410)
Affects only the stateful VLM and LLM pipelines and continuous batching (CB). The speculative decoding (SD) implementation should be fixed separately, since its two pipelines need to be aborted on an exception or on cancellation via the streaming callback.
ilya-lavrenov authored Dec 20, 2024
1 parent 4d18f8b commit 04d9728
Showing 7 changed files with 113 additions and 114 deletions.
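
For reference, a minimal user-level sketch of the behavior this commit enables: a streaming callback that returns `true` now stops generation in the stateful pipelines. The model directory, device, and limits below are placeholders, not values taken from this change.

```cpp
#include <iostream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model directory and device.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 256;

    size_t printed_chunks = 0;
    // Returning true from the callback asks the pipeline to stop generating.
    auto streamer = [&printed_chunks](std::string subword) {
        std::cout << subword << std::flush;
        return ++printed_chunks >= 10;  // stop early after ~10 streamed chunks
    };

    pipe.generate("Why is the sky blue?", config, streamer);
    std::cout << std::endl;
    return 0;
}
```
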
81 changes: 50 additions & 31 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -22,7 +22,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
m_tokenizer = tokenizer;
m_generation_config = generation_config;
m_is_validation_mode_enabled = is_validation_mode_enabled;

ov::Core core;

auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
@@ -255,18 +255,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
}
}, streamer);

OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
"Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");

std::vector<GenerationHandle> generations;
for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
}

std::vector<EncodedGenerationResult> results;
results.reserve(m_awaiting_requests.size());

auto drop_requests = [&] () {
for (const std::shared_ptr<ov::genai::SequenceGroup> request : m_requests) {
for (const auto& sequence: request->get_sequences()) {
@@ -279,25 +267,40 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
m_requests.clear();
};

OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && sampling_params[0].num_return_sequences == 1 &&
(sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
"Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");

std::vector<GenerationHandle> generations;
for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
}
auto all_requests = m_awaiting_requests; // we need to store all requests to get results from them once generation has finished

bool continue_generation = true;
while (has_non_finished_requests() && continue_generation) {
try {
step();
} catch (...) {
drop_requests();
drop_requests(); // remove all requests from pipeline state in case of exception
throw;
}
if (streamer_ptr && generations.at(0)->can_read()) {
std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();

auto & generation = generations.at(0);
if (streamer_ptr && generation->can_read()) {
std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (!streamer_ptr->put(gen_token)) {
continue_generation = !streamer_ptr->put(gen_token);
if (!continue_generation) {
generation->drop();
break;
}
}
}
}

if (streamer_ptr) {
if (streamer_ptr) { // push streamer's cache
streamer_ptr->end();
}

@@ -307,16 +310,32 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed");
}

for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) {
const auto& generation = generations[generation_idx];
std::vector<EncodedGenerationResult> results;
results.reserve(all_requests.size());

for (size_t request_id = 0; request_id < all_requests.size(); ++request_id) {
const auto& request = all_requests[request_id];
auto sampling_params = request->get_sampling_parameters();
const auto& sequences = request->get_finished_sequences();
size_t num_outputs = std::min(sampling_params.num_return_sequences, sequences.size());

EncodedGenerationResult result;
result.m_request_id = 1;
std::vector<GenerationOutput> generation_outputs = generation->read_all();
for (const auto& generation_output : generation_outputs) {
result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
result.m_scores.push_back(generation_output.score);
result.m_request_id = request_id;
result.m_generation_ids.resize(num_outputs);
result.m_scores.resize(num_outputs);

for (size_t i = 0; i < num_outputs; ++i) {
const auto & sequence = sequences[i];
const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
const auto & generated_ids = sequence->get_generated_ids();

if (sampling_params.echo)
result.m_generation_ids[i] = request->get_prompt_ids();
std::copy(generated_ids.begin(), generated_ids.end(), std::back_inserter(result.m_generation_ids[i]));
result.m_scores[i] = score;
}
result.m_status = generation->get_status();

result.m_status = generations[request_id]->get_status();
results.push_back(std::move(result));
}

@@ -408,7 +427,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
// requests not scheduled, in decoding phase or not echoing are not processed
if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() ||
if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() ||
!sequence_group->get_sampling_parameters().echo)
continue;

@@ -421,10 +440,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(

size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());

// if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
// otherwise we include it as it will be used in the next part of the prompt
int exclude_last_logprob = 1;
// otherwise we include it as it will be used in the next part of the prompt
int exclude_last_logprob = 1;
if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
exclude_last_logprob = 0;

@@ -435,7 +454,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
token_logits_offset < actual_seq_len - exclude_last_logprob;
token_logits_offset++, token_id_offset++) {

const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
int64_t token_id = sequence_group->get_prompt_ids()[token_id_offset];
float token_logit = token_logits[token_id];
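
To make the new control flow in continuous_batching_impl.cpp easier to follow, here is a small self-contained sketch of the cancellation contract: `put()` returning true makes the loop drop the handle and stop pulling tokens, after which the streamer cache is flushed with `end()`. The types below are illustrative stand-ins, not the real openvino.genai classes.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct ToyStreamer {
    size_t limit;
    size_t seen = 0;
    // Mirrors the streamer's put(): return true to request cancellation.
    bool put(int64_t token) {
        std::cout << token << ' ';
        return ++seen >= limit;
    }
    void end() { std::cout << "\n[stream flushed]\n"; }
};

struct ToyHandle {
    bool dropped = false;
    void drop() { dropped = true; }   // analogous to GenerationHandle::drop()
};

int main() {
    std::vector<int64_t> generated = {11, 22, 33, 44, 55};
    ToyStreamer streamer{3};
    ToyHandle handle;

    bool continue_generation = true;
    for (size_t step = 0; step < generated.size() && continue_generation; ++step) {
        continue_generation = !streamer.put(generated[step]);
        if (!continue_generation)
            handle.drop();            // stop pulling tokens once the callback asks to
    }
    streamer.end();
    std::cout << "handle dropped: " << std::boolalpha << handle.dropped << '\n';
    return 0;
}
```
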
2 changes: 1 addition & 1 deletion src/cpp/src/generation_handle.cpp
@@ -17,7 +17,7 @@ GenerationStatus GenerationHandleImpl::get_status() {
}

bool GenerationHandleImpl::can_read() {
return !is_dropped() && m_generation_stream->can_read();
return !is_dropped() && m_generation_stream->can_read();
}

bool GenerationHandleImpl::is_dropped() {
5 changes: 2 additions & 3 deletions src/cpp/src/generation_stream.hpp
@@ -14,8 +14,6 @@ class GenerationStream {
GenerationStatus m_status = GenerationStatus::RUNNING;
SynchronizedQueue<GenerationOutputs> m_output_queue;

std::vector<uint64_t> last_sequence_ids;

public:
using Ptr = std::shared_ptr<GenerationStream>;

@@ -30,10 +28,11 @@ class GenerationStream {
m_output_queue.push(std::move(outputs));
}

// Retrieving vector of pairs <sequence_id, token_id> as we can generate multiple outputs for a single prompt
// Retrieving vector of pairs <sequence_id, token_ids> as we can generate multiple outputs for a single prompt
GenerationOutputs back() {
return m_output_queue.back();
}

GenerationOutputs read() {
return m_output_queue.pull();
}
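
The comment fix above clarifies that `back()` hands the streaming path a batch of generated token ids per sequence. As a rough sketch of the queue semantics the stream appears to rely on — an assumption about behavior, not the actual SynchronizedQueue implementation — `back()` waits for data and peeks at the newest element, while `pull()` (exposed as `read()`) consumes the oldest:

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>

// Toy stand-in for the synchronized queue behind GenerationStream.
template <typename T>
class ToySynchronizedQueue {
    std::queue<T> m_queue;
    std::mutex m_mutex;
    std::condition_variable m_cv;
public:
    void push(T value) {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_queue.push(std::move(value));
        }
        m_cv.notify_one();
    }
    // Blocks until data is available, then consumes the oldest element.
    T pull() {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return !m_queue.empty(); });
        T value = std::move(m_queue.front());
        m_queue.pop();
        return value;
    }
    // Blocks until data is available, then returns the newest element without consuming it.
    T back() {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_cv.wait(lock, [this] { return !m_queue.empty(); });
        return m_queue.back();
    }
};
```
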
15 changes: 5 additions & 10 deletions src/cpp/src/llm_pipeline.cpp
@@ -284,10 +284,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
}

auto batch_size = input_ids.get_shape().at(0);
if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) {
OPENVINO_THROW("Currently streaming is possible only with batch size=1 and "
"only for greedy or multinomial decoding");
}
OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
(config.is_greedy_decoding() || config.is_multinomial()),
"Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");

auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
@@ -587,9 +586,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
std::vector<std::string> plain_replies;
std::vector<float> plain_scores;
for (GenerationResult& res : generated) {
if (GenerationStatus::FINISHED != res.m_status) {
OPENVINO_THROW("Got unfinished GenerationStatus");
}
OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
}
@@ -645,9 +642,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
std::vector<std::vector<int64_t>> plain_tokens;
std::vector<float> plain_scores;
for (EncodedGenerationResult& res : generated) {
if (GenerationStatus::FINISHED != res.m_status) {
OPENVINO_THROW("Got unfinished GenerationStatus");
}
OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
}
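
The llm_pipeline.cpp changes replace the hard OPENVINO_THROW on any non-FINISHED result with an assert that also accepts DROPPED_BY_HANDLE, so a generation cancelled through the streaming callback counts as a valid terminal state. A compact sketch of that check, with the enum reduced to a stand-in:

```cpp
#include <stdexcept>

// Stand-in for the generation status enum; only the values relevant to the check are shown.
enum class GenerationStatus { RUNNING, FINISHED, DROPPED_BY_HANDLE };

// A result is acceptable if it finished normally or was cancelled via its handle
// (i.e. the streaming callback returned true).
void require_terminal(GenerationStatus status) {
    if (status != GenerationStatus::FINISHED && status != GenerationStatus::DROPPED_BY_HANDLE)
        throw std::runtime_error("Got unfinished GenerationStatus");
}
```
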
105 changes: 45 additions & 60 deletions src/cpp/src/lm_encoding.cpp
@@ -67,33 +67,49 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
generations.push_back(std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters()));
}

auto active_sequence_groups{sequence_groups};

auto stream_generated_tokens = [&streamer_ptr, &generations, &active_sequence_groups]() {
GenerationHandle& handle = generations.at(0);
if (streamer_ptr && handle->can_read()) {
std::unordered_map<uint64_t, GenerationOutput> token = handle->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (streamer_ptr->put(gen_token)) {
handle->drop();
break;
}
}
}

// free non running requests
auto removed_it = std::remove_if(active_sequence_groups.begin(), active_sequence_groups.end(),
[](SequenceGroup::Ptr sg) -> bool {
return sg->has_finished() || sg->out_of_memory() || sg->handle_dropped();
});
active_sequence_groups.erase(removed_it, active_sequence_groups.end());
};

ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];

// Initialize results and performance metrics.

EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};

// Initialize inputs
if (m_embedding.has_value())
m_llm.set_tensor("inputs_embeds", input_ids);
else
m_llm.set_tensor("input_ids", input_ids);

m_llm.set_tensor(m_embedding.has_value() ? "inputs_embeds" : "input_ids", input_ids);
m_llm.set_tensor("attention_mask", attention_mask);

if (position_ids.has_value())
m_llm.set_tensor("position_ids", *position_ids);

ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
auto beam_data = beam_idx.data<int32_t>();
if (selected_beam_idx.has_value())
beam_data[0] = *selected_beam_idx;
else
std::fill_n(beam_data, batch_size, 0);
std::fill_n(beam_idx.data<int32_t>(), batch_size, selected_beam_idx.has_value() ? *selected_beam_idx : 0);
m_llm.set_tensor("beam_idx", beam_idx);

// "Prompt" phase

const auto infer_start = std::chrono::steady_clock::now();
m_llm.infer();
const auto infer_end = std::chrono::steady_clock::now();
@@ -109,35 +125,18 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
for (auto& sequence_group : sequence_groups) {
sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len);
sequence_group->schedule_tokens(sequence_len);

}

std::map<size_t, size_t> beam_offets;
for (size_t i = 0; i < sequence_groups.size(); i++)
beam_offets.insert({sequence_groups.at(i)->get_request_id(), i});

SamplerOutput sampler_output = sampler.sample(sequence_groups, logits);
stream_generated_tokens();

auto active_sequence_groups{sequence_groups};
auto get_active_sequence_groups = [](SequenceGroup::Ptr sg) { return sg->has_finished(); };

active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
active_sequence_groups.end(),
get_active_sequence_groups),
active_sequence_groups.end());

auto stream_generated_tokens = [&streamer_ptr, &generations]() {
if (streamer_ptr && generations.at(0).get()->can_read()) {
std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
for (const auto& gen_token : token.begin()->second.generated_ids) {
if (!streamer_ptr->put(gen_token)) {
break;
}
}
}
};
// "Generation" phase

while (active_sequence_groups.size() > 0) {
while (!active_sequence_groups.empty()) {
size_t total_num_tokens = 0;

for (auto& sequence_group : active_sequence_groups) {
@@ -178,20 +177,13 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
}

for (size_t i = 0; i < sequence_groups.size(); i++) {
if (i == 0)
beam_offets[sequence_groups.at(i)->get_request_id()] = 0;
else {
beam_offets[sequence_groups.at(i)->get_request_id()] = sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i -1];
}
beam_offets[sequence_groups.at(i)->get_request_id()] = i == 0 ? 0 : (sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i - 1]);
}

if (m_embedding.has_value()) {
const ov::Tensor& embed_prompt_tensor = (*m_embedding).infer(new_input_ids);

m_llm.get_tensor("inputs_embeds").set_shape(embed_prompt_tensor.get_shape());
m_llm.set_tensor("inputs_embeds", embed_prompt_tensor);
} else {
m_llm.get_tensor("input_ids").set_shape(new_input_ids.get_shape());
m_llm.set_tensor("input_ids", new_input_ids);
}

@@ -201,7 +193,6 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
update_position_ids(m_llm.get_tensor("position_ids"), m_llm.get_tensor("attention_mask"));
}

m_llm.get_tensor("beam_idx").set_shape({ total_num_tokens });
m_llm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()});

const auto infer_start = std::chrono::steady_clock::now();
@@ -213,36 +204,30 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
raw_perf_counters.m_new_token_times.emplace_back(infer_end);
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

stream_generated_tokens();

sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));

active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
active_sequence_groups.end(),
get_active_sequence_groups),
active_sequence_groups.end());
stream_generated_tokens();
}

// to stream last token
stream_generated_tokens();
if (streamer_ptr) {
if (streamer_ptr) { // push streamer's cache
streamer_ptr->end();
}


// Collect results

size_t next_selected_beam = 0;
for (size_t i = 0; i < sequence_groups.size(); i++) {
auto request = sequence_groups[i];
auto generation_outputs = generations[i]->read_all();
std::vector<GenerationOutput> generation_outputs;
auto sampling_params = request->get_sampling_parameters();
const auto& sequences = request->get_finished_sequences();
size_t num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, sequences.size());

std::sort(generation_outputs.begin(), generation_outputs.end(), [] (const GenerationOutput& r1, const GenerationOutput& r2) {
return r1.score > r2.score;
});
for (size_t seq_id = 0; seq_id < num_outputs; ++seq_id) {
const auto & sequence = sequences[seq_id];
const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();

auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size());
for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
const auto& generation_output = generation_outputs[generation_output_idx];
results.tokens.push_back(std::move(generation_output.generated_ids));
results.scores.push_back(generation_output.score);
results.tokens.push_back(sequence->get_generated_ids());
results.scores.push_back(score);
}
// next_selected_beam = sampler.last_selected_beam(request);
}
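
The lm_encoding.cpp rewrite collects results directly from each request's finished sequences instead of sorting the handle's read_all() output: it takes at most num_return_sequences sequences and scores each one with the beam-search score under beam search, or the cumulative log-probability otherwise. A simplified sketch of that selection, using stand-in types rather than the real SequenceGroup classes:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct ToySequence {
    std::vector<int64_t> generated_ids;
    float cumulative_log_probs;
    float beam_search_score;
};

struct ToyParams {
    size_t num_return_sequences = 1;
    bool beam_search = false;
};

struct ToyResults {
    std::vector<std::vector<int64_t>> tokens;
    std::vector<float> scores;
};

ToyResults collect(const std::vector<ToySequence>& finished, const ToyParams& params) {
    ToyResults results;
    // Cap the number of outputs by how many sequences actually finished
    // (a dropped request may have fewer than num_return_sequences).
    size_t num_outputs = std::min(params.num_return_sequences, finished.size());
    for (size_t i = 0; i < num_outputs; ++i) {
        const ToySequence& seq = finished[i];
        results.tokens.push_back(seq.generated_ids);
        results.scores.push_back(params.beam_search ? seq.beam_search_score
                                                    : seq.cumulative_log_probs);
    }
    return results;
}
```
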
