Skip to content

Commit

Permalink
Enabled CB by default
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov committed Dec 31, 2024
1 parent afb4ad0 commit b0c004b
Showing 1 changed file with 90 additions and 23 deletions.
113 changes: 90 additions & 23 deletions src/cpp/src/llm_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <nlohmann/json.hpp>

#include "openvino/core/visibility.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"

Expand All @@ -18,9 +19,9 @@ namespace genai {

namespace {

/*
/*
* NPU reads some properties from the config file, but when LLMPipeline is initialized
* from the model_str and weights_tensor, there are no files.
* from the model_str and weights_tensor, there are no files.
* In the latter case ModelDescr is stored in properties.
* This function pops ModelDescr from the properties and returns a pair of updated properties and ModelDescr.
*/
Expand All @@ -37,7 +38,7 @@ std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::An
pop_property(main_properties, "name_or_path", model_descr.name_or_path);
pop_property(main_properties, "type", model_descr.type);
pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads);

return {main_properties, model_descr};
}

Expand All @@ -62,7 +63,7 @@ std::pair<std::string, Any> draft_model(
const std::string& device,
const ov::AnyMap& properties) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);

std::filesystem::path openvino_model_name = "openvino_model.xml";
auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config);
auto generation_config = utils::from_config_json_if_exists(models_path);
Expand Down Expand Up @@ -99,16 +100,39 @@ ov::genai::LLMPipeline::LLMPipeline(
const std::string& device,
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||

// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
} else {
}

// try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// we need to use CB only on x86: on other architectures like arm64 or risc-v we can create a Paged Attention based model,
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
}

m_pimpl->save_load_time(start_time);
}

Expand All @@ -118,14 +142,35 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::AnyMap& properties) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, device_properties);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, properties);
} else {
}

// try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// we need to use CB only on x86: on other architectures like arm64 or risc-v we can create a Paged Attention based model,
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, default_config, device, properties);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, properties);
}

Expand All @@ -141,36 +186,58 @@ ov::genai::LLMPipeline::LLMPipeline(
const ov::genai::GenerationConfig& generation_config) {
auto start_time = std::chrono::steady_clock::now();

if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
// If CB is invoked explicitly, create the CB adapter as is and re-throw in case of internal issues
if (properties.find(ov::genai::scheduler_config.name()) != properties.end() ||
properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() ||
properties.find(ov::genai::prompt_lookup.name()) != properties.end()){

auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties);
m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
tokenizer, scheduler_config, device, device_properties, generation_config);
} else if (device == "NPU") {
}

if (m_pimpl == nullptr && device == "NPU") {
// TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// NPU reads some properties from the config file, but when LLMPipeline is initialized
// from the model_str and weights_tensor, there are no files.
// Therefore, we need to pass these properties manually.
// This is necessary only for NPU; for other plugins it can be omitted.
// Example of usage:
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"},
// {"type", "llama"},
// {"num_key_value_heads", 32}};
// ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties);
// This will convert from AnyMap to ModelDesc.
auto [filtered_properties, model_descr] = split_model_descr(properties);
auto [device_properties, model_descr] = split_model_descr(properties);

m_pimpl = std::make_unique<StaticLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
utils::singleton_core().read_model(model_str, weights_tensor),
model_descr,
tokenizer,
device,
filtered_properties,
device_properties,
generation_config
);
} else {
}

// try to create the CB adapter one more time, but with a safeguard that silences exceptions
if (m_pimpl == nullptr) {
try {
// we need to use CB only on x86: on other architectures like arm64 or risc-v we can create a Paged Attention based model,
// but cannot perform its inference later
#ifdef OPENVINO_ARCH_X86_64
SchedulerConfig default_config;
default_config.max_num_batched_tokens = std::numeric_limits<size_t>::max(); // don't limit total batch size

m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor, tokenizer,
default_config, device, properties, generation_config);
#endif
} catch (ov::Exception&) {
// ignore exceptions from PA
}
}

if (m_pimpl == nullptr) {
m_pimpl = std::make_unique<StatefulLLMPipeline>(
utils::singleton_core().read_model(model_str, weights_tensor),
tokenizer,
Expand Down

0 comments on commit b0c004b

Please sign in to comment.