Handle chat and general modes for LLaVa and LLaVa-Next models (openvi…
yatarkan authored Oct 17, 2024
1 parent bc270f4 commit efaf0c8
Showing 1 changed file with 57 additions and 42 deletions.
99 changes: 57 additions & 42 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -804,8 +804,15 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {

ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::string image_token = m_vlm_config.im_start;
-std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:";
-ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids;
+std::string formatted_prompt = images.empty() ? prompt : image_token + "\n" + prompt;
+
+// std::string chat_template_fallback = m_templated_chat_history + " USER: " + formatted_prompt + " ASSISTANT: ";
+// chat_template_fallback = chat_template_fallback.erase(0, chat_template_fallback.find_first_not_of(' '));
+
+// Adapted from llava-1.5-7b-hf chat_template.json
+std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
+ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback);
+
if (images.empty()) {
return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
} else {
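For context, the Jinja-style fallback added above is only used when the tokenizer provides no chat template of its own; it renders each turn as `USER: ...` / `ASSISTANT: ...` and appends a bare `ASSISTANT:` generation prompt. Below is a minimal, self-contained C++ sketch of the string that template produces. The `Message` struct and `render_llava_fallback` helper are illustrative only; the real pipeline keeps turns in `m_history` and renders them with `m_tokenizer.apply_chat_template`.

```cpp
// Illustration only: shows the string the fallback template above produces.
#include <iostream>
#include <string>
#include <vector>

struct Message {
    std::string role;
    std::string content;
};

std::string render_llava_fallback(const std::vector<Message>& messages, bool add_generation_prompt) {
    std::string out;
    for (const auto& m : messages) {
        // Mirrors the Jinja branches: 'USER: ' or 'ASSISTANT: ' plus the content and a trailing space.
        out += (m.role == "user" ? "USER: " : "ASSISTANT: ") + m.content + " ";
    }
    if (add_generation_prompt) {
        out += "ASSISTANT:";  // bare generation prompt, no trailing space
    }
    return out;
}

int main() {
    // In the pipeline, the user content already starts with the image token and
    // a newline when images are passed (image_token + "\n" + prompt).
    std::vector<Message> history{{"user", "What is on the picture?"}};
    // Prints: USER: What is on the picture? ASSISTANT:
    std::cout << render_llava_fallback(history, true) << "\n";
    return 0;
}
```

The same fallback string is reused for the LLaVa-Next path in the next hunk.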
@@ -824,18 +831,12 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {

ov::Tensor get_inputs_embeds_llava_next(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::string image_token = m_vlm_config.im_start;
-std::string content = images.empty() ? prompt : image_token + "\n" + prompt;
-m_history.push_back({{"role", "user"}, {"content", content}});
-constexpr bool add_generation_prompt = true;
-std::string new_templated_chat_history;
-try {
-new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-} catch (const std::exception& error) {
-// TODO Consider using template syntax instead of concatenating
-new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, "USER: " + content + " ASSISTANT:");
-}
-
-ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids;
+std::string formatted_prompt = images.empty() ? prompt : image_token + "\n" + prompt;
+
+// Adapted from llava-1.5-7b-hf chat_template.json
+std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
+ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback);
+
if (images.empty()) {
return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
} else {
@@ -909,34 +910,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
}
}
images_prompt += prompt;
-ov::Tensor encoded_input;
-if (m_is_chat_conversation) {
-// KV cache in model already contains prompts and answers from previous iterations.
-// So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
-// token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,
-// <bos token> will be inserted on every iteration.
-// So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
-// and takes only the difference between them.
-// The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but
-// KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
-m_history.push_back({{"role", "user"}, {"content", images_prompt}});
-constexpr bool add_generation_prompt = true;
-std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
-if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) {
-encoded_input = new_chat_tokens;
-} else {
-TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
-m_templated_chat_history
-);
-encoded_input = utils::subtract_chat_tokenized_inputs(
-{new_chat_tokens}, prev_chat_tokens
-).input_ids;
-}
-m_templated_chat_history = std::move(new_templated_chat_history);
-} else {
-encoded_input = m_tokenizer.encode(images_prompt).input_ids;
-}
+
+ov::Tensor encoded_input = get_encoded_input_ids(images_prompt);
+
m_embedding.set_input_tensor(encoded_input);
m_embedding.infer();
ov::Tensor inputs_embeds = m_embedding.get_output_tensor();
Expand Down Expand Up @@ -1038,6 +1014,45 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
pipe.m_resampler.infer();
return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
}

+ov::Tensor get_encoded_input_ids(const std::string& prompt, const std::string& chat_template_fallback = "") {
+ov::Tensor encoded_input_ids;
+if (m_is_chat_conversation) {
+// KV cache in model already contains prompts and answers from previous iterations.
+// So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
+// token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt,
+// <bos token> will be inserted on every iteration.
+// So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
+// and takes only the difference between them.
+// The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but
+// KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
+m_history.push_back({{"role", "user"}, {"content", prompt}});
+constexpr bool add_generation_prompt = true;
+std::string new_templated_chat_history;
+try {
+new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
+} catch (const std::exception& error) {
+// Use fallback chat template if it was not found in tokenizer_config.json
+new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
+}
+ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
+auto history_len = m_language.get_tensor("attention_mask").get_shape().at(1);
+if (history_len == 0) {
+encoded_input_ids = new_chat_tokens;
+} else {
+TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
+m_templated_chat_history
+);
+encoded_input_ids = utils::subtract_chat_tokenized_inputs(
+{new_chat_tokens}, prev_chat_tokens
+).input_ids;
+}
+m_templated_chat_history = std::move(new_templated_chat_history);
+} else {
+encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
+}
+return encoded_input_ids;
+}
};

VLMPipeline::VLMPipeline(
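The comment block inside `get_encoded_input_ids` above describes re-tokenizing the whole templated history and keeping only the part that is not yet in the language model's KV cache. Here is a rough sketch of that subtraction on plain token-id vectors. It illustrates the idea only; the actual `utils::subtract_chat_tokenized_inputs` operates on `TokenizedInputs` (input ids plus attention mask), not raw vectors.

```cpp
// Illustration of the "take only the difference" step from the comment above.
#include <cstddef>
#include <cstdint>
#include <vector>

// Assumes the previously templated history tokenizes to a prefix of the newly
// templated history, which holds for append-only chat templates.
std::vector<int64_t> new_tokens_only(const std::vector<int64_t>& new_chat_tokens,
                                     const std::vector<int64_t>& prev_chat_tokens) {
    std::size_t common = 0;
    while (common < prev_chat_tokens.size() && common < new_chat_tokens.size() &&
           prev_chat_tokens[common] == new_chat_tokens[common]) {
        ++common;  // length of the shared prefix already covered by the KV cache
    }
    // Only this suffix (the newly templated user turn) still has to be fed to the model.
    return std::vector<int64_t>(new_chat_tokens.begin() + static_cast<std::ptrdiff_t>(common),
                                new_chat_tokens.end());
}
```

With this in place, each chat turn appends only the tokens of the newly templated user message, while everything before it is served from the KV cache.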
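Finally, a usage-level sketch of the two modes this change distinguishes. Everything API-specific here (header path, constructor arguments, the `ov::genai::image` property, `start_chat()`/`finish_chat()`) is an assumption modelled on the openvino.genai visual-language samples rather than something shown in this diff; it is meant only to illustrate that chat mode keeps history and the KV cache across turns, while general mode tokenizes each prompt on its own.

```cpp
// Usage-level sketch only; the pipeline API details below are assumptions.
#include <openvino/openvino.hpp>

#include "openvino/genai/visual_language/pipeline.hpp"  // assumed header location

int main() {
    ov::genai::VLMPipeline pipe("./llava-1.5-7b-hf-ov", "CPU");  // assumed: exported model dir + device name

    // Dummy NHWC u8 tensor standing in for a decoded RGB image.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});

    // General mode: the formatted prompt is tokenized directly,
    // without any chat-history templating.
    pipe.generate("What is on the picture?", ov::genai::image(image));

    // Chat mode: m_is_chat_conversation is set, the history is re-templated on
    // every turn and only the new token suffix is pushed through the model.
    pipe.start_chat();
    pipe.generate("What is on the picture?", ov::genai::image(image));
    pipe.generate("What color is it?");
    pipe.finish_chat();
    return 0;
}
```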