Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[LLM] Enabled CB by default #1455

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
114 changes: 92 additions & 22 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

Expand All @@ -18,7 +19,7 @@ namespace genai {

namespace {

/*
/*
* NPU reads some properties from the config file, but when LLMPipeline is initialized
* from the model_str and weights_tensor, there are no files.
* In the latter case ModelDesc is stored in properties.
Expand All @@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(
pop_property(main_properties, "name_or_path", model_descr.name_or_path);
pop_property(main_properties, "type", model_descr.type);
pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

return {main_properties, model_descr};
}

Expand All @@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
const std::string& device,
const ov::AnyMap& properties) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
Expand Down Expand Up @@ -99,16 +100,40 @@ ov::genai::LLMPipeline::LLMPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||

// If CB is requested explicitly, create the CB adapter as is and let any internal errors propagate to the caller
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, tokenizer, device, properties);
} else {
}

// Try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// We use CB only on x86: on other architectures such as arm64 or RISC-V we can create a Paged Attention based model
// but cannot run inference on it later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}

m_pimpl->save_load_time(start_time);
}

Expand All @@ -118,14 +143,36 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is requested explicitly, create the CB adapter as is and let any internal errors propagate to the caller
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = static_llm::LLMPipelineFactory::create(models_path, device, properties);
} else {
}

// Try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// We use CB only on x86: on other architectures such as arm64 or RISC-V we can create a Paged Attention based model
// but cannot run inference on it later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}

Expand All @@ -141,36 +188,59 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::genai::GenerationConfig& generation_config) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is requested explicitly, create the CB adapter as is and let any internal errors propagate to the caller
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()){

auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// Therefore, we need to pass these properties manually.
// This is necessary only for NPU; for other plugins it can be omitted.
// Example of usage:
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
auto [filtered_properties, model_descr] = split_model_descr(properties);
auto [device_properties, model_descr] = split_model_descr(properties);

m_pimpl = static_llm::LLMPipelineFactory::create(
utils::singleton_core().read_model(model_str, weights_tensor),
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
filtered_properties,
device_properties,
generation_config
);
} else {
}

// Try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// We use CB only on x86: on other architectures such as arm64 or RISC-V we can create a Paged Attention based model
// but cannot run inference on it later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size
default_config.enable_prefix_caching = true; // for better TTFT in chat scenarios

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,
Expand Down
Loading