[LLM] Enabled CB by default #1455

Merged
merged 20 commits into master from cb-by-default
Feb 5, 2025
Changes from all commits (20 commits)
bf462f5
Enabled CB by default
ilya-lavrenov Dec 30, 2024
2f99472
Enable prefix caching
ilya-lavrenov Jan 2, 2025
6038663
Merge remote-tracking branch 'upstream/master' into cb-by-default
ilya-lavrenov Jan 7, 2025
7fc5b4e
Merge branch 'master' into cb-by-default
andrei-kochin Jan 8, 2025
ba3a61b
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 9, 2025
52644b3
Merge branch 'master' into cb-by-default
andrei-kochin Jan 9, 2025
5245f42
Merge branch 'master' into cb-by-default
andrei-kochin Jan 10, 2025
f1a715f
Merge branch 'master' into cb-by-default
andrei-kochin Jan 10, 2025
5d98114
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 14, 2025
4136cfe
Merge remote-tracking branch 'upstream/master' into cb-by-default
ilya-lavrenov Jan 14, 2025
abc02db
Merge branch 'master' into cb-by-default
andrei-kochin Jan 15, 2025
36398cd
Merge branch 'master' into cb-by-default
andrei-kochin Jan 16, 2025
5999f95
Merge branch 'master' into cb-by-default
andrei-kochin Jan 22, 2025
c547e05
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 22, 2025
e7c7ed8
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 24, 2025
c8d35e5
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 28, 2025
e293a33
Merge branch 'master' into cb-by-default
ilya-lavrenov Jan 30, 2025
dcb3f3a
Merge branch 'master' into cb-by-default
ilya-lavrenov Feb 3, 2025
9d71fad
Ability to select Attention backend
ilya-lavrenov Feb 5, 2025
062e339
Merge branch 'master' into cb-by-default
Wovchena Feb 5, 2025
148 changes: 121 additions & 27 deletions src/cpp/src/llm_pipeline.cpp
@@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

@@ -18,7 +19,7 @@ namespace genai {

namespace {

/*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
 * from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
@@ -37,10 +38,27 @@ std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(
pop_property(main_properties, "name_or_path", model_descr.name_or_path);
pop_property(main_properties, "type", model_descr.type);
pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

return {main_properties, model_descr};
}

const std::string PA_BACKEND = "PA";
const std::string SPDA_BACKEND = "SPDA";

std::pair<ov::AnyMap, std::string> extract_attention_backend(const ov::AnyMap& external_properties) {
ov::AnyMap properties = external_properties;
auto it = properties.find("ATTENTION_BACKEND");
std::string attention_backend = PA_BACKEND;
if (it != properties.end()) {
attention_backend = it->second.as<std::string>();
OPENVINO_ASSERT(attention_backend == PA_BACKEND || attention_backend == SPDA_BACKEND,
"Attention backend must be either '", PA_BACKEND, "' or '", SPDA_BACKEND, "', got '", attention_backend, "'");
properties.erase(it);
}
return {properties, attention_backend};
};


} // namespace
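
The ATTENTION_BACKEND property is stripped from the user-supplied map here before the remaining properties reach the plugin. A minimal usage sketch, assuming an exported model directory (the path and prompt are placeholders; the key and the "PA"/"SPDA" values come from the constants above):

#include "openvino/genai/llm_pipeline.hpp"

#include <iostream>
#include <string>

int main() {
    // "PA" (Paged Attention / continuous batching, the new default) or "SPDA"
    // to force the stateful pipeline; any other value fails the OPENVINO_ASSERT
    // in extract_attention_backend().
    ov::genai::LLMPipeline pipe("/path/to/exported/model", "CPU",
                                ov::AnyMap{{"ATTENTION_BACKEND", "SPDA"}});

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 16;
    std::string result = pipe.generate("Hello,", config);
    std::cout << result << std::endl;
    return 0;
}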


Expand All @@ -62,7 +80,7 @@ std::pair<std::string, Any> draft_model(
const std::string& device,
const ov::AnyMap& properties) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
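
The constructors below treat a draft model as an explicit request for continuous batching, so a speculative-decoding setup is one way to hit that branch. A rough sketch, assuming the public ov::genai::draft_model() helper wraps the function above; the paths are placeholders:

std::filesystem::path main_model_dir = "/path/to/main/model";    // placeholder
std::filesystem::path draft_model_dir = "/path/to/draft/model";  // placeholder

// draft_model() adds DRAFT_MODEL_ARG_NAME to the properties, which selects the
// explicit CB branch in the LLMPipeline constructors (errors re-thrown, no fallback).
ov::genai::LLMPipeline pipe(main_model_dir, "CPU",
                            ov::genai::draft_model(draft_model_dir, "CPU"));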
@@ -91,41 +109,92 @@ ov::genai::LLMPipeline::LLMPipeline(
OptionalGenerationConfig generation_config) {
auto start_time = std::chrono::steady_clock::now();
m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config);
m_pimpl->save_load_time(start_time);
}

ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& user_properties) {
auto start_time = std::chrono::steady_clock::now();
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||

auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, device_properties);
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
} else {
}

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}

m_pimpl->save_load_time(start_time);
}
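
// Illustrative sketch (not part of this file): passing a SchedulerConfig explicitly
// takes the first branch above, so construction errors are re-thrown instead of
// silently falling back to StatefulLLMPipeline:
//
//   ov::genai::SchedulerConfig scheduler_config;
//   scheduler_config.cache_size = 2;  // KV-cache size in GB (illustrative value)
//   ov::genai::LLMPipeline pipe(models_path, tokenizer, "CPU",
//                               ov::AnyMap{ov::genai::scheduler_config(scheduler_config)});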

ov::genai::LLMPipeline::LLMPipeline(
const std::filesystem::path& models_path,
const std::string& device,
const ov::AnyMap& properties) {
const ov::AnyMap& user_properties) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
} else {
}

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}
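
// Illustrative sketch (not part of this file): with {"ATTENTION_BACKEND", "SPDA"} in the
// user properties, the Paged Attention attempt above is skipped and this stateful branch
// is taken directly:
//
//   ov::genai::LLMPipeline pipe("/path/to/model", "CPU",
//                               ov::AnyMap{{"ATTENTION_BACKEND", "SPDA"}});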

@@ -137,40 +206,65 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::Tensor& weights_tensor,
const ov::genai::Tokenizer& tokenizer,
const std::string& device,
const ov::AnyMap& properties,
const ov::AnyMap& user_properties,
const ov::genai::GenerationConfig& generation_config) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
auto [properties, attention_backend] = extract_attention_backend(user_properties);

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()){

auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// Therefore, we need to pass these properties manually.
// This is necessary only for NPU; for other plugins it can be omitted.
// Example of usage:
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
auto [filtered_properties, model_descr] = split_model_descr(properties);
auto [device_properties, model_descr] = split_model_descr(properties);

m_pimpl = static_llm::LLMPipelineFactory::create(
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
filtered_properties,
device_properties,
generation_config
);
} else {
}
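
// Illustrative sketch (not part of this file), expanding the usage comment above; how the
// model string and weights are obtained is schematic, and the model name is just the
// example given in that comment:
//
//   std::string model_str = /* contents of openvino_model.xml */;
//   ov::Tensor weights_tensor = /* openvino_model.bin mapped into a tensor */;
//   ov::genai::Tokenizer tokenizer("/path/to/model_dir");
//   ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, "NPU",
//                               ov::AnyMap{{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
//                                          {"type", "llama"},
//                                          {"num_key_value_heads", 32}});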

// try the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr && attention_backend == PA_BACKEND) {
try {
// we use CB only for x86, because for other architectures like arm64 or risc-v we can create a Paged Attention based model
// but cannot run its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,