
Commit

Merge branch 'master' into cb-by-default
andrei-kochin authored Jan 15, 2025
2 parents 4136cfe + 2e5c2a1 commit abc02db
Showing 5 changed files with 43 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bandit.yml
@@ -13,5 +13,5 @@ jobs:
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - run: python -m pip install bandit==1.8.0
      - run: python -m pip install bandit
      - run: python -m bandit --recursive --configfile bandit.yml .
2 changes: 1 addition & 1 deletion bandit.yml
@@ -79,7 +79,7 @@
# IPAS Required Checkers. Do not disable these
# Additional checkers may be added if desired
tests:
[ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413']
[ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413']

# (optional) list skipped test IDs here, eg '[B101, B406]':
# The following checkers are not required but may be added to the tests list if desired
46 changes: 35 additions & 11 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -690,19 +690,38 @@ namespace static_llm {
StatefulLLMPipeline::StatefulLLMPipeline(
const std::filesystem::path& models_path,
const ov::genai::Tokenizer& tokenizer,
const std::string&,
const std::string& device,
const ov::AnyMap& config
) : LLMPipelineImplBase(tokenizer,
utils::from_config_json_if_exists(models_path)),
m_sampler(m_tokenizer) {

auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;

auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
const auto use_blob = pop_or_default(properties, "USE_BLOB", false);
if (use_blob) {
auto blob_path = pop_or_default(properties, "BLOB_PATH", std::string{});
if (blob_path.empty()) {
blob_path = (models_path / "openvino_model.blob").string();
}
if (!std::filesystem::exists(blob_path)) {
OPENVINO_THROW("Blob file is not found at: " + blob_path);
}
std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
if (!fin.is_open()) {
OPENVINO_THROW("Blob file can't be opened: " + blob_path);
}
auto compiled = genai::utils::singleton_core().import_model(fin, device, {});
m_max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
auto min_resp_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
m_kvcache_total = m_max_prompt_len + min_resp_len;
m_request = compiled.create_infer_request();
} else {
auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;
auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}
}
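
The new USE_BLOB branch above bypasses read_model()/setupAndCompileModel() and imports a precompiled blob instead, reading the prompt/response limits back from the compiled model's NPUW properties. Below is a minimal usage sketch (not part of this commit) of how these options might be passed from application code; it assumes the public ov::genai::LLMPipeline constructor forwards the property map to this static NPU pipeline, and the paths are placeholders.

// Illustrative sketch only: enabling blob import via the properties introduced above.
// Assumption: the public LLMPipeline constructor forwards this AnyMap to the static
// NPU pipeline; "/models/llm" and the blob path are placeholders.
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::AnyMap properties = {
        {"USE_BLOB", true},                               // take the import_model() branch
        {"BLOB_PATH", "/models/llm/openvino_model.blob"}  // optional; defaults to <models_path>/openvino_model.blob
    };
    ov::genai::LLMPipeline pipe("/models/llm", "NPU", properties);
    std::cout << pipe.generate("Hello", ov::genai::max_new_tokens(32)) << std::endl;
    return 0;
}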


@@ -721,11 +740,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
m_sampler.set_seed(m_generation_config.rng_seed);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const std::shared_ptr<ov::Model>& model,
void StatefulLLMPipeline::updateStatefulConfig(
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config) {

const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_max_prompt_len = kMaxPromptLen;
@@ -755,6 +772,13 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const std::shared_ptr<ov::Model>& model,
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config) {
updateStatefulConfig(model_desc, pipeline_config);

return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
}
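
The refactor above separates the NPUW property preparation (updateStatefulConfig) from compilation (setupAndCompileModel): the new helper pops MAX_PROMPT_LEN and MIN_RESPONSE_LEN, falling back to 1024 and 128 when they are absent. A small illustrative sketch of a pipeline_config that overrides those defaults (the values are invented for the example):

// Illustrative only: overriding the shape-related defaults consumed by
// updateStatefulConfig(); keys it pops are removed from the map, and the
// remaining entries are passed on to compile_model() for the NPU plugin.
#include <openvino/openvino.hpp>

int main() {
    ov::AnyMap pipeline_config = {
        {"MAX_PROMPT_LEN", 2048u},    // default is 1024 when the key is absent
        {"MIN_RESPONSE_LEN", 256u}    // default is 128 when the key is absent
    };
    // pipeline_config would then be handed to the pipeline/compile step.
    return 0;
}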
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -59,6 +59,10 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config);

void updateStatefulConfig(
const ModelConfigDesc& model_desc,
ov::AnyMap& pipeline_config);

DecodedResults generate(
StringInputs inputs,
OptionalGenerationConfig generation_config,
4 changes: 2 additions & 2 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -172,6 +172,8 @@ class InputsEmbedder::IInputsEmbedder {
auto start_tokenizer_time = std::chrono::steady_clock::now();
ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids;
TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));

// some symbol combinations can be encoded by the tokenizer in different ways
// if we meet a sequence with such a combination of symbols, we cannot correctly subtract the new history from the old history
@@ -211,8 +213,6 @@ class InputsEmbedder::IInputsEmbedder {
if (m_last_disappeared_token.has_value())
encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token);
}
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
m_templated_chat_history = std::move(new_templated_chat_history);
m_tokenized_history.clear();
std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
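
In inputs_embedder.cpp, the end-of-tokenization timestamp and the push into tokenization_durations move up so that only the two m_tokenizer.encode() calls are measured, not the chat-history reconciliation that follows them. A standalone sketch of that measurement pattern, with a local stand-in for PerfMetrics::get_microsec (the real helper lives in the GenAI metrics code):

// Standalone sketch of the timing pattern: only the work between the two
// timestamps (the encode() calls in the real code) is attributed to tokenization.
#include <chrono>
#include <cstdint>
#include <vector>

// Stand-in for PerfMetrics::get_microsec used in the diff above.
static int64_t get_microsec(std::chrono::steady_clock::duration d) {
    return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
}

int main() {
    std::vector<int64_t> tokenization_durations;
    auto start_tokenizer_time = std::chrono::steady_clock::now();
    // ... the two tokenizer encode() calls would run here ...
    auto end_tokenizer_time = std::chrono::steady_clock::now();
    tokenization_durations.emplace_back(get_microsec(end_tokenizer_time - start_tokenizer_time));
    return 0;
}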
