Refactoring LLMCompiledModel according to review comments in GenAI static_llm::StatefulLLMPipeline #28267

Merged
48 changes: 3 additions & 45 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -68,60 +68,18 @@ DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fai
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
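The new scalar options above use DEFINE_OPT instead of a hand-written OptionBase specialization like the NPUW_LLM_MODEL_DESC struct removed below. As a rough sketch only (not the actual macro expansion), each DEFINE_OPT line is assumed to produce something along these lines:

    // Hypothetical expansion of
    // DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime)
    struct NPUW_LLM_MAX_PROMPT_LEN final : OptionBase<NPUW_LLM_MAX_PROMPT_LEN, uint32_t> {
        static std::string_view key() {
            return ov::intel_npu::npuw::llm::max_prompt_len.name();
        }
        static uint32_t defaultValue() {
            return 1024u;
        }
        static OptionMode mode() {
            return OptionMode::CompileTime;
        }
    };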

namespace npuw {
namespace llm {
struct ModelDesc {
std::string type;
std::string name_or_path;
int num_key_value_heads;
};
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
} // namespace llm
} // namespace npuw

struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
static std::string_view key() {
return ov::intel_npu::npuw::llm::model_desc.name();
}

static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::llm::ModelDesc";
}

static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
return {};
}

static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
::intel_npu::npuw::llm::ModelDesc res;
std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
res.type = res_map["type"];
res.name_or_path = res_map["name_or_path"];
res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
return res;
}

static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
std::string res;
std::map<std::string, std::string> res_map;
res_map["type"] = val.type;
res_map["name_or_path"] = val.name_or_path;
res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads);
return OptionPrinter<std::map<std::string, std::string>>::toString(res_map);
}

static OptionMode mode() {
return OptionMode::CompileTime;
}

static bool isPublic() {
return true;
}
};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
static std::string_view key() {
return ov::intel_npu::npuw::llm::generate_hint.name();
@@ -389,41 +389,60 @@ static constexpr ov::Property<bool> enabled{"NPUW_LLM"};

/**
* @brief
* Type: std::map<std::string, std::string>.
* Tell NPUW about your LLM model. Use following structure for that:
* "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
* Default value: empty structure defined above.
* FIXME: Should be removed.
* Type: uint32_t.
* Dimension of the batch in input tensor shape.
* Default value: 0.
*/
static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};

/**
* @brief
* FIXME: Should be removed.
* Type: uint32_t.
* Dimension of KV-Cache size in input tensor shape.
* Default value: 2.
*/
static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"};
static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable max prompt length.
* Desirable max prompt length.
* Default value: 1024.
*/
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable min response length.
* Desirable min response length.
* Default value: 128.
*/
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
* FIXME: Should be removed.
* Type: bool.
* Tell NPUW to apply values transpose optimization for the model.
* Default value: false.
*/
static constexpr ov::Property<bool> optimize_v_tensors{"NPUW_LLM_OPTIMIZE_V_TENSORS"};

/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of prefill model.
* Configuration for compilation of prefill model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};

/**
* @brief
* Type: std::string.
* Tell NPUW the preferrable hint for generation stage, that leads to usage of optimal configuration for it.
* Hint for the generation stage. NPUW will pick an optimal configuration based on the passed hint.
* Hint is ignored if used with "NPUW_LLM_GENERATE_CONFIG".
* Possible values: "FAST_COMPILE", "BEST_PERF".
* Default value: "FAST_COMPILE".
*/
@@ -432,7 +451,7 @@ static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT
/**
* @brief
* Type: ov::AnyMap.
* Tell NPUW the configuration for compilation of generate model.
* Configuration for compilation of generate model.
* NOTE: !! Write-only !!
*/
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
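Taken together, these scalar properties replace the old NPUW_LLM_MODEL_DESC map. A minimal caller-side sketch, assuming the standard OpenVINO API; the model path and concrete values are illustrative only and mirror the documented defaults:

    #include <openvino/openvino.hpp>

    ov::Core core;
    auto model = core.read_model("model.xml");  // path is illustrative

    ov::AnyMap npuw_llm_cfg = {
        {"NPUW_LLM", true},                  // enable the NPUW LLM pipeline
        {"NPUW_LLM_BATCH_DIM", 0u},          // batch axis in KV-cache tensors
        {"NPUW_LLM_SEQ_LEN_DIM", 2u},        // sequence-length axis in KV-cache tensors
        {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},
        {"NPUW_LLM_MIN_RESPONSE_LEN", 128u},
        {"NPUW_LLM_OPTIMIZE_V_TENSORS", true},
        {"NPUW_LLM_GENERATE_HINT", "FAST_COMPILE"}
    };
    ov::CompiledModel compiled = core.compile_model(model, "NPU", npuw_llm_cfg);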
4 changes: 3 additions & 1 deletion src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -57,8 +57,10 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {

void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM>();
desc.add<NPUW_LLM_MODEL_DESC>();
desc.add<NPUW_LLM_BATCH_DIM>();
desc.add<NPUW_LLM_SEQ_LEN_DIM>();
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
desc.add<NPUW_LLM_GENERATE_HINT>();
}
37 changes: 13 additions & 24 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -280,22 +280,6 @@ void reshape_to_static(std::shared_ptr<ov::Model> model,
model->reshape(new_shapes);
}

KVAxesPosition get_kv_axes(const std::string& model_type) {
KVAxesPosition axes;
if (model_type == "chatglm") {
axes.batch = 1u;
axes.seq_len = 0u;
} else if (model_type == "qwen") {
// Note, qwen2 does not fall into this category and conforms to default layout
axes.batch = 0u;
axes.seq_len = 1u;
} else {
axes.batch = 0u;
axes.seq_len = 2u;
}
return axes;
}
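With get_kv_axes() removed, the KV-cache axes now have to arrive from the caller through NPUW_LLM_BATCH_DIM / NPUW_LLM_SEQ_LEN_DIM. A minimal caller-side sketch of the equivalent mapping, covering the same model families the removed helper handled (`model_type` is a hypothetical variable holding the model's type string):

    ov::AnyMap npuw_llm_cfg;
    if (model_type == "chatglm") {
        npuw_llm_cfg["NPUW_LLM_BATCH_DIM"] = 1u;
        npuw_llm_cfg["NPUW_LLM_SEQ_LEN_DIM"] = 0u;
    } else if (model_type == "qwen") {
        // Note: qwen2 does not fall into this category and conforms to the default layout
        npuw_llm_cfg["NPUW_LLM_BATCH_DIM"] = 0u;
        npuw_llm_cfg["NPUW_LLM_SEQ_LEN_DIM"] = 1u;
    } else {
        npuw_llm_cfg["NPUW_LLM_BATCH_DIM"] = 0u;
        npuw_llm_cfg["NPUW_LLM_SEQ_LEN_DIM"] = 2u;
    }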

bool is_cw_compressed(const std::shared_ptr<ov::Model>& model) {
std::vector<std::string> rt_info_path = {"nncf", "weight_compression", "group_size"};
if (!model->has_rt_info(rt_info_path)) {
@@ -444,19 +428,22 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
auto prefill_model = kvcache_model->clone();
prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");

const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
KVAxesPosition axes = get_kv_axes(model_desc.type);
m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
const uint32_t batch_dim = m_cfg.get<::intel_npu::NPUW_LLM_BATCH_DIM>();
const uint32_t seq_len_dim = m_cfg.get<::intel_npu::NPUW_LLM_SEQ_LEN_DIM>();
KVAxesPosition axes{batch_dim, seq_len_dim};
const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);

m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
LOG_DEBUG("4. Make prefill model with static shapes");
reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
LOG_DEBUG("5. Make kvcache model with static shapes");
reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
LOG_DEBUG("6.Check and apply opt layout if applicable.");

const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
// NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
(model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
if (optimize_v_tensors) {
if (optimize_value_tensors(kvcache_model)) {
// NB: Check if TransposeValueTensors transformation was applied
m_kvcache_desc.v_tensors_transposed = true;
@@ -542,9 +529,11 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
}

m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString),
BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
BIND(npuw::llm::seq_len_dim, NPUW_LLM_SEQ_LEN_DIM, get),
BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)});
#undef BIND
}