diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index a1fa7a4453..7b5bf84762 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -678,13 +678,13 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
 /*
 * NPU reads some properties from the config file, but when LLMPipeline is initialized
-* from the model_str and weights_tensor, there are not files.
+* from the model_str and weights_tensor, there are no files.
 * In the latter case ModelDesc is stored in properties.
 * This function pops ModelDescr from the properties and returns a pair of updated properties and ModelDescr.
 */
-std::pair<ov::AnyMap, ov::genai::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
+std::pair<ov::AnyMap, ov::genai::static_llm::ModelConfigDesc> split_model_descr(const ov::AnyMap& properties) {
     ov::AnyMap main_properties = properties;
-    ov::genai::ModelConfigDesc model_descr;
+    ov::genai::static_llm::ModelConfigDesc model_descr;
 
     auto pop_property = [](ov::AnyMap& orig_propertis, const std::string& key, auto& value) {
         if (orig_propertis.find(key) != orig_propertis.end()) {
@@ -778,7 +778,7 @@ ov::genai::LLMPipeline::LLMPipeline(
         // This will convert from AnyMap to ModelDesc.
         auto [properties, model_descr] = split_model_descr(plugin_config);
 
-        m_pimpl = std::make_unique<static_llm::StatelessLLMPipeline>(
+        m_pimpl = static_llm::LLMPipelineFactory::create(
             utils::singleton_core().read_model(model_str, weights_tensor),
             model_descr,
             tokenizer,
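For context, the pop-and-erase pattern behind `split_model_descr` can be sketched in isolation. Everything below is illustrative: `ToyModelDesc` and the key names are stand-ins for demonstration, not the real `static_llm::ModelConfigDesc` contract or the exact keys the pipeline pops.

```cpp
// Minimal sketch: split an ov::AnyMap into consumed model-descriptor fields
// and the remaining properties. ToyModelDesc and the key names are assumptions.
#include <openvino/core/any.hpp>

#include <string>
#include <utility>

struct ToyModelDesc {
    std::string type;
    std::string name_or_path;
    int num_key_value_heads = 0;
};

std::pair<ov::AnyMap, ToyModelDesc> toy_split_model_descr(const ov::AnyMap& properties) {
    ov::AnyMap main_properties = properties;
    ToyModelDesc desc;

    // Pop a key if present: copy its value out, then erase it so the remaining
    // map can be forwarded to the plugin without unknown-key errors.
    auto pop_property = [&main_properties](const std::string& key, auto& value) {
        auto it = main_properties.find(key);
        if (it != main_properties.end()) {
            value = it->second.as<std::decay_t<decltype(value)>>();
            main_properties.erase(it);
        }
    };

    pop_property("type", desc.type);
    pop_property("name_or_path", desc.name_or_path);
    pop_property("num_key_value_heads", desc.num_key_value_heads);

    return {std::move(main_properties), desc};
}
```

Erasing consumed keys matters because the leftover map is handed straight to `compile_model`, and plugins typically reject properties they do not recognize.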
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 97ac6d484d..11979a7628 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -396,12 +396,12 @@ KVAxesPosition get_kv_axes(const std::string& model_type) {
     return axes;
 }
 
-ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
+ov::genai::static_llm::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path& filepath) {
     std::ifstream file(filepath);
     OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string());
     nlohmann::json config_data = nlohmann::json::parse(file);
 
-    ov::genai::ModelConfigDesc desc;
+    ov::genai::static_llm::ModelConfigDesc desc;
     desc.type = config_data["model_type"].get<std::string>();
     // NB: In case _name_or_path field isn't present in config.json
     if (config_data.contains("_name_or_path")) {
@@ -412,7 +412,7 @@ ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path&
     return desc;
 }
 
-std::string model_desc_to_string(const ov::genai::ModelConfigDesc& model_desc) {
+std::string model_desc_to_string(const ov::genai::static_llm::ModelConfigDesc& model_desc) {
     std::map<std::string, std::string> model_desc_map;
     model_desc_map["type"] = model_desc.type;
     model_desc_map["name_or_path"] = model_desc.name_or_path;
@@ -672,7 +672,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
         utils::from_config_json_if_exists(models_path)) {
     auto model = genai::utils::singleton_core().read_model((models_path / "openvino_model.xml").string());
 
-    ov::genai::ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+    ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
     ov::AnyMap properties = config;
     auto compiled = setupAndCompileModel(model, model_desc, properties);
@@ -698,7 +698,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     OPENVINO_ASSERT(!use_blobs, "blobs cannot be used with model string and weights tensor");
     ov::AnyMap properties_copy = properties;
-    auto compiled = setupAndCompileModel(model, model_desc, properties);
+    auto compiled = setupAndCompileModel(model, model_desc, properties_copy);
     m_request = compiled->create_infer_request();
 }
@@ -721,10 +721,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
     rename_key(pipeline_config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG");
     rename_key(pipeline_config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG");
-
-    // FIXME: Support CACHE_DIR in future
-    drop_cache_dir(pipeline_config);
-
+
     return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
 }
@@ -882,6 +879,19 @@ EncodedResults StatefulLLMPipeline::generate(
         // TODO: How to check that KV-Cache is full?
     }
+
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
+    auto stop_time = std::chrono::steady_clock::now();
+    // If generate() is called without tokenization, tokenization stats will not be reported.
+    auto& metrics = results.perf_metrics;
+    metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
+    metrics.load_time = this->m_load_time_ms;
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.evaluate_statistics(start_time);
+
     return results;
 }
 
 void StatefulLLMPipeline::start_chat(const std::string& system_message) {
@@ -918,7 +928,7 @@ StatelessLLMPipeline::StatelessLLMPipeline(
      */
     const auto use_blobs = pop_or_default(properties, "USE_BLOBS", false);
     if (!use_blobs) {
-        ov::genai::ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+        ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
         auto model = genai::utils::singleton_core().read_model((models_path / "openvino_model.xml").string());
         setupAndCompileModels(model, device, model_desc, properties);
     } else {
@@ -1397,6 +1407,32 @@ LLMPipelineFactory::create(const std::filesystem::path& models_path,
                            const ov::AnyMap& config) {
     return create(models_path, Tokenizer(models_path), device, config);
 }
+
+std::unique_ptr<LLMPipelineImplBase> LLMPipelineFactory::create(const std::shared_ptr<ov::Model>& model,
+                                                                const ModelConfigDesc& model_desc,
+                                                                const ov::genai::Tokenizer& tokenizer,
+                                                                const std::string& device,
+                                                                const ov::AnyMap& properties,
+                                                                const ov::genai::GenerationConfig& generation_config) {
+    auto properties_copy = properties;
+    const auto pipeline_mode = pop_or_default(properties_copy, "NPU_PIPELINE", std::string("STATELESS"));
+    OPENVINO_ASSERT(pipeline_mode == "STATELESS" || pipeline_mode == "STATEFUL",
+                    "Only STATELESS and STATEFUL NPU_PIPELINE modes are supported!");
+    if (pipeline_mode == "STATEFUL") {
+        return std::make_unique<StatefulLLMPipeline>(model,
+                                                     model_desc,
+                                                     tokenizer,
+                                                     device,
+                                                     properties_copy,
+                                                     generation_config);
+    }
+    return std::make_unique<StatelessLLMPipeline>(model,
+                                                  model_desc,
+                                                  tokenizer,
+                                                  device,
+                                                  properties_copy,
+                                                  generation_config);
+}
 }  // namespace static_llm
 }  // namespace genai
 }  // namespace ov
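The new factory overload dispatches on an `NPU_PIPELINE` string property, popping it so it never reaches the plugin configuration. Reduced to a self-contained sketch with stand-in types (the real pipeline classes take model, tokenizer, device, and config arguments), the selection logic looks like this:

```cpp
// Self-contained sketch of the NPU_PIPELINE dispatch; Impl types are stand-ins.
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct ImplBase { virtual ~ImplBase() = default; };
struct StatefulImpl : ImplBase {};   // stand-in for StatefulLLMPipeline
struct StatelessImpl : ImplBase {};  // stand-in for StatelessLLMPipeline

std::unique_ptr<ImplBase> create(std::map<std::string, std::string> properties) {
    // Pop the selector with a default, mirroring pop_or_default(): the key
    // must not leak into the configuration forwarded to compile_model().
    std::string mode = "STATELESS";
    if (auto it = properties.find("NPU_PIPELINE"); it != properties.end()) {
        mode = it->second;
        properties.erase(it);
    }
    if (mode == "STATEFUL") {
        return std::make_unique<StatefulImpl>();
    }
    if (mode == "STATELESS") {
        return std::make_unique<StatelessImpl>();
    }
    throw std::invalid_argument("Only STATELESS and STATEFUL NPU_PIPELINE modes are supported!");
}
```

Returning the base-class pointer keeps `LLMPipeline` unaware of which NPU implementation it holds; only the factory knows about the two concrete types.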
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 5c42d2506d..9fbdad89ec 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -26,6 +26,13 @@ struct LLMPipelineFactory {
     static std::unique_ptr<LLMPipelineImplBase> create(const std::filesystem::path& path,
                                                        const std::string& device,
                                                        const ov::AnyMap& config);
+
+    static std::unique_ptr<LLMPipelineImplBase> create(const std::shared_ptr<ov::Model>& model,
+                                                       const ModelConfigDesc& model_desc,
+                                                       const ov::genai::Tokenizer& tokenizer,
+                                                       const std::string& device,
+                                                       const ov::AnyMap& properties,
+                                                       const ov::genai::GenerationConfig& generation_config = {});
 };
 
 class StatefulLLMPipeline : public LLMPipelineImplBase {
@@ -48,7 +55,6 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
     std::shared_ptr<ov::CompiledModel> setupAndCompileModel(
         const std::shared_ptr<ov::Model>& model,
-        const std::string& device,
         const ModelConfigDesc& model_desc,
         ov::AnyMap& pipeline_config);
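Taken together, these changes let an NPU pipeline be built with no files on disk. A hedged usage sketch follows: the file names, the `"type"`/`"name_or_path"` property keys, and the tokenizer path are assumptions to be checked against `split_model_descr`; only the `LLMPipeline`, `Tokenizer`, and `max_new_tokens` signatures are the public GenAI API.

```cpp
// Hypothetical end-to-end use of the in-memory constructor on NPU.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Load the IR XML as a string and the weights as a raw byte tensor.
    // File names are illustrative.
    std::ifstream xml_file("openvino_model.xml");
    std::stringstream xml_buf;
    xml_buf << xml_file.rdbuf();
    const std::string model_str = xml_buf.str();

    std::ifstream bin_file("openvino_model.bin", std::ios::binary | std::ios::ate);
    const auto nbytes = static_cast<std::size_t>(bin_file.tellg());
    bin_file.seekg(0);
    ov::Tensor weights(ov::element::u8, ov::Shape{nbytes});
    bin_file.read(static_cast<char*>(weights.data()), static_cast<std::streamsize>(nbytes));

    // With no config.json on disk, the ModelDesc fields travel through the
    // property map; the key names below are assumptions (see split_model_descr).
    ov::AnyMap properties = {
        {"NPU_PIPELINE", std::string("STATEFUL")},  // selector introduced by this patch
        {"type", std::string("llama")},
        {"name_or_path", std::string("meta-llama/Llama-2-7b-chat-hf")},
    };

    ov::genai::Tokenizer tokenizer("path/to/tokenizer_dir");  // illustrative path
    ov::genai::LLMPipeline pipe(model_str, weights, tokenizer, "NPU", properties);

    std::string answer = pipe.generate("What is OpenVINO?", ov::genai::max_new_tokens(64));
    std::cout << answer << std::endl;
    return 0;
}
```

Without `NPU_PIPELINE` in the map, the factory falls back to the `STATELESS` default, so existing callers of the file-based path are unaffected.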