From a7644562769e2be6bce688180c3230949a8231db Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Sat, 4 May 2024 15:31:13 +0400 Subject: [PATCH 01/22] rm requirements_2024.1.txt (#413) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merging releases/2024/1 into master added requirements_2024.1.txt. Which I thought was fine. But now I see that the versions here aren’t going to be modified and they are detected as vulnerabilities over time. Remove requirements_2024.1.txt from master to resolve the vulnerabilities. --- llm_bench/python/requirements_2024.1.txt | 111 ----------------------- 1 file changed, 111 deletions(-) delete mode 100644 llm_bench/python/requirements_2024.1.txt diff --git a/llm_bench/python/requirements_2024.1.txt b/llm_bench/python/requirements_2024.1.txt deleted file mode 100644 index a0d4388870..0000000000 --- a/llm_bench/python/requirements_2024.1.txt +++ /dev/null @@ -1,111 +0,0 @@ -nncf @ git+https://github.com/openvinotoolkit/nncf.git@ec497ce0781fe867d73d5c5bdf8310fdb40604a4#egg=nncf -about-time==4.2.1 -accelerate==0.29.2 -aiohttp==3.9.4 -aiosignal==1.3.1 -alive-progress==3.1.5 -async-timeout==4.0.3 -attrs==23.2.0 -auto_gptq==0.7.1 -autograd==1.6.2 -bitsandbytes==0.43.1 -blobfile==2.1.1 -certifi==2019.11.28 -chardet==3.0.4 -charset-normalizer==3.3.2 -cma==3.2.2 -coloredlogs==15.0.1 -contourpy==1.2.1 -cycler==0.12.1 -datasets==2.18.0 -Deprecated==1.2.14 -diffusers==0.27.2 -dill==0.3.8 -einops==0.7.0 -filelock==3.13.4 -fonttools==4.51.0 -frozenlist==1.4.1 -fsspec==2024.2.0 -future==1.0.0 -gekko==1.1.1 -grapheme==0.6.0 -huggingface-hub==0.22.2 -humanfriendly==10.0 -idna==2.8 -importlib_metadata==7.1.0 -influxdb-client==1.41.0 -Jinja2==3.1.3 -joblib==1.4.0 -jsonschema==4.21.1 -jsonschema-specifications==2023.12.1 -jstyleson==0.0.2 -kiwisolver==1.4.5 -lxml==4.9.4 -Mako==1.1.0 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.8.4 -mdurl==0.1.2 -mpmath==1.3.0 -multidict==6.0.5 -multiprocess==0.70.16 -natsort==8.4.0 -networkx==3.3 -ninja==1.11.1.1 -numpy==1.26.4 -onnx==1.16.0 -openvino-telemetry==2024.1.0 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@ff792c278502a85444dd116413dbca71aa660599 -packaging==24.0 -pandas==2.2.2 -peft==0.10.0 -pillow==10.3.0 -pip==24.0 -protobuf==5.26.1 -psutil==5.9.8 -py-cpuinfo==9.0.0 -pyarrow==15.0.2 -pyarrow-hotfix==0.6 -pycryptodomex==3.20.0 -pydot==2.0.0 -Pygments==2.17.2 -pymoo==0.6.1.1 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-git==2018.2.1 -pytz==2024.1 -PyYAML==6.0.1 -reactivex==4.0.4 -referencing==0.34.0 -regex==2023.12.25 -requests==2.31.0 -requests-unixsocket==0.2.0 -rich==13.7.1 -rouge==1.0.1 -rpds-py==0.18.0 -safetensors==0.4.3 -scikit-learn==1.4.2 -scipy==1.13.0 -Send2Trash==1.8.3 -sentencepiece==0.2.0 -setuptools==65.5.0 -six==1.14.0 -sympy==1.12 -tabulate==0.9.0 -threadpoolctl==3.4.0 -tiktoken==0.6.0 -timm==0.9.16 -tokenizers==0.15.2 -torch==2.2.2 -torchvision==0.17.2 -tqdm==4.66.2 -transformers==4.39.3 -transformers-stream-generator==0.0.5 -typing_extensions==4.11.0 -tzdata==2024.1 -urllib3==2.2.1 -wheel==0.41.2 -wrapt==1.16.0 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.18.1 From 96ad4ec742ae70032c33e231bb2f60c8d3e8597f Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Wed, 8 May 2024 09:47:26 +0200 Subject: [PATCH 02/22] Update NNCF default configs (#419) **Updated NNCF defaults for:** - dolly-v2-3b - opt-2.7b - red-pajama-incite-chat-3b-v1 - vicuna-7b-v1.5 - stablelm-tuned-alpha-3b Based on the experiments from 135227 --- 
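For context: each entry in INT4_MODEL_CONFIGURATION maps a model name to NNCF weight-compression parameters — "mode" (symmetric or asymmetric INT4), "group_size" (quantization group length), and "ratio" (the fraction of weights compressed to INT4, with the remainder left in the backup precision). A minimal Python sketch of how such an entry is typically passed to nncf.compress_weights follows; the model path and the wiring are illustrative placeholders, not the actual llm_bench code.

    import nncf
    import openvino as ov

    # Placeholder: an exported OpenVINO IR of the LLM (llm_bench produces this elsewhere).
    ov_model = ov.Core().read_model("dolly-v2-3b/openvino_model.xml")

    # New default for dolly-v2-3b from this patch: INT4 asymmetric quantization,
    # group size 128, 80% of eligible weights compressed to INT4.
    config = {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}

    compressed_model = nncf.compress_weights(
        ov_model,
        mode=config["mode"],
        group_size=config["group_size"],
        ratio=config["ratio"],
    )

Entries that also carry a "dataset" key (see the awq=True entry in the diff below) enable data-aware compression; that path requires building an nncf.Dataset and is not shown here.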
llm_bench/python/utils/nncf_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index 7142abd3bd..b0d0d93aa1 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -33,7 +33,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5}, + "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, @@ -64,4 +64,8 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, + "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, + "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, + "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, } From 3a09f817c226862e5416f976554eba352be496b1 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 8 May 2024 21:28:42 +0400 Subject: [PATCH 03/22] Split convert_tokenizer and greedy_causal_lm, ubuntu-20.04-4-cores (#420) cpp-greedy_causal_lm-redpajama-3b-chat randomly fails. 
Split the commands to find whether it's convert_tokenizer or runtime --- .github/workflows/causal_lm_cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f3ff8a6ee7..faf37f86b6 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -444,7 +444,7 @@ jobs: echo Phi-1_5 passed cpp-greedy_causal_lm-redpajama-3b-chat: - runs-on: ubuntu-20.04 + runs-on: ubuntu-20.04-4-cores steps: - uses: actions/checkout@v4 with: @@ -464,10 +464,10 @@ jobs: cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j wait + - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/pytorch/dldt/FP16/ --output ./redpajama-3b-chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - convert_tokenizer ./redpajama-3b-chat/pytorch/dldt/FP16/ --output ./redpajama-3b-chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | From 563e05df82fcd116b8dba5bf27cd182bd6fb3e10 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 May 2024 09:39:26 +0200 Subject: [PATCH 04/22] Consume all draft tokens for speculative decoding (#424) Ticket: 140110 --- .../causal_lm/cpp/speculative_decoding_lm.cpp | 317 +++++++++++------- 1 file changed, 203 insertions(+), 114 deletions(-) diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp index 92523f82a5..4aefec14db 100644 --- a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp @@ -1,18 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include #include +#include #include constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually SEQ_LEN_AXIS = 2 constexpr size_t SEQ_LEN_AXIS = 2; -int64_t SPECIAL_EOS_TOKEN; - namespace { std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -43,7 +41,7 @@ struct TextStreamer { std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; token_cache.clear(); print_len = 0; - return; + return; } if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { // Don't print incomplete text @@ -60,22 +58,23 @@ struct TextStreamer { print_len = 0; } }; -} ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { // Copy elements from the old to a new tensor and return it. // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], // It that's not the case for your model please implement your own trim method. 
- OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); - + OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, + "Cannot trim key/values with sequence length axis = ", + seq_len_axis); + auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); size_t num_kv_heads = shape[1]; size_t old_seq_len = shape[2]; size_t head_size = shape[3]; - + OPENVINO_ASSERT(new_seq_len <= old_seq_len); - + // if new_seq_len equal to old one no need to copy tensor, return as is if (old_seq_len == new_seq_len) return tensor; @@ -88,11 +87,13 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor auto new_tensor = ov::Tensor{ov::element::f32, {BATCH_SIZE, num_kv_heads, new_seq_len, head_size}}; auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < BATCH_SIZE; ++batch){ + for (size_t batch = 0; batch < BATCH_SIZE; ++batch) { for (size_t i = 0; i < num_kv_heads; ++i) { for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; + auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + + new_seq_len * head_size * i + head_size * j; + auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + + old_seq_len * head_size * i + head_size * j; std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); } } @@ -102,12 +103,120 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state: request.query_state()) { + for (auto& state : request.query_state()) { ov::Tensor old_tensor = state.get_state(); state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); } } +class AssistedCandidateGenerator { +private: + ov::InferRequest draft_model; + size_t max_seq_length; + size_t num_pred_tokens = 5; + const size_t max_pred_tokens = 10; + int64_t out_of_kv_cache_token = -1; + size_t draft_model_seq_length = 0; + +public: + AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) + : draft_model{draft_model}, + max_seq_length{max_seq_length}, + num_pred_tokens{num_pred_tokens} {}; + + int64_t generate_next_token(const std::vector tokens) { + size_t tokens_size = tokens.size(); + auto input_ids = draft_model.get_tensor("input_ids"); + input_ids.set_shape({BATCH_SIZE, tokens_size}); + std::copy_n(tokens.begin(), tokens_size, input_ids.data()); + + auto attention_mask = draft_model.get_tensor("attention_mask"); + attention_mask.set_shape({BATCH_SIZE, draft_model_seq_length + tokens_size}); + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + auto position_ids = draft_model.get_tensor("position_ids"); + position_ids.set_shape({BATCH_SIZE, tokens_size}); + std::iota(position_ids.data(), + position_ids.data() + position_ids.get_size(), + draft_model_seq_length); + + draft_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + draft_model.get_tensor("beam_idx").data()[0] = 0; + + draft_model.infer(); + + auto logits = draft_model.get_tensor("logits"); + size_t 
vocab_size = logits.get_shape().back(); + auto sequence_logits = logits.data() + (tokens_size - 1) * vocab_size; + + draft_model_seq_length += tokens_size; + + return std::max_element(sequence_logits, sequence_logits + vocab_size) - sequence_logits; + } + + std::vector generate_candidates(int64_t out_token) { + std::vector candidates; + + // limit candidates size by num_pred_tokens or by max_seq_length + size_t candidates_to_generate = std::min(num_pred_tokens, max_seq_length - draft_model_seq_length - 1); + + candidates.reserve(candidates_to_generate); + + // generate cadidates + for (size_t i = 0; i < candidates_to_generate; i++) { + // if out_of_kv_cache_token is present, prepend it to out_token in order to collect kv cache for it + if (out_of_kv_cache_token != -1) { + out_token = generate_next_token(std::vector{out_of_kv_cache_token, out_token}); + out_of_kv_cache_token = -1; + } else { + out_token = generate_next_token(std::vector{out_token}); + } + + candidates.push_back(out_token); + } + + out_of_kv_cache_token = candidates.back(); + return candidates; + } + + void update_candidate_strategy(const size_t num_matches) { + // dynamically adjust number of generated candidates based on number of matches + // we want to balance the benefits of getting candidates tokens correct with the + // cost of forecasting incorrect candidates tokens. + if (num_matches == num_pred_tokens) { + num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens); + } else { + num_pred_tokens = std::max(int64_t(num_pred_tokens) - 1, int64_t(1)); + } + } + + void update_kv_cache(const size_t seq_length) { + // this is the case when main model accepted all candidates from draft model + // we need to collect kv cache for out_of_kv_cache_token by infering it + // on next candidates generation cycle out_of_kv_cache_token will be prefixed + // to main models's latest out token + if (draft_model_seq_length < seq_length) { + return; + } + + out_of_kv_cache_token = -1; + ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); + draft_model_seq_length = seq_length; + } +}; + +int64_t get_eos_token(const std::shared_ptr tokenizer) { + auto rt_info = tokenizer->get_rt_info(); // Get the runtime info for the model + + auto it = rt_info.find("eos_token_id"); + if (it == rt_info.end()) { + throw std::runtime_error("EOS token ID not found in model's runtime information."); + } + return it->second.as(); +} + +} // namespace + int main(int argc, char* argv[]) try { if (argc != 4) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + "
''"); @@ -118,150 +227,130 @@ int main(int argc, char* argv[]) try { core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - auto [draft_input_ids, draft_attention_mask] = tokenize(tokenizer, argv[3]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); + auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); + ov::InferRequest detokenizer = + core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; - // draft model - ov::InferRequest draft_model = core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); + // draft model (which is smaller, less accurate but faster) + ov::InferRequest draft_model = + core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - draft_model.set_tensor("input_ids", draft_input_ids); - draft_model.set_tensor("attention_mask", draft_attention_mask); - - ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); - draft_position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); - uint64_t seq_len = draft_input_ids.get_shape()[1]; + uint64_t seq_len = input_ids.get_shape()[1]; - // main model - ov::InferRequest main_model = core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); + // main model (which is bigger, more accurate but slower) + ov::InferRequest main_model = + core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - // Input tensors for the main model should not be mixed with draft. 
- // Do not feed the same draft_postion_ids to the main, but copy input_ids from the draft_input_ids - auto input_ids = main_model.get_tensor("input_ids"); - input_ids.set_shape(draft_input_ids.get_shape()); - draft_input_ids.copy_to(input_ids); + size_t max_sequence_length = 100; - auto attention_mask = main_model.get_tensor("attention_mask"); - attention_mask.set_shape(draft_input_ids.get_shape()); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; + + main_model.set_tensor("input_ids", input_ids); + main_model.set_tensor("attention_mask", attention_mask); auto position_ids = main_model.get_tensor("position_ids"); - position_ids.set_shape(draft_input_ids.get_shape()); + position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - draft_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - draft_model.get_tensor("beam_idx").data()[0] = 0; main_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); main_model.get_tensor("beam_idx").data()[0] = 0; // To coollect kv-cache for the and to get the next token run the very first infer request - draft_model.infer(); + candidateGenerator.generate_next_token( + std::vector(input_ids.data(), input_ids.data() + input_ids.get_size())); + main_model.infer(); size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); - OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for the both models"); - + OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), + "vocab size should be the same for the both models"); + // logits shape is [BATCH_SIZE, seq_len, vocab_size] auto logits = main_model.get_tensor("logits"); auto data_logits = logits.data() + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - // the first token which is fed to both draft and main netwoks on each iteration - auto first_token = out_token; - text_streamer.put(out_token); - - // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = 5; - std::vector draft_tokens; - - // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. - draft_input_ids.set_shape({BATCH_SIZE, 1}); - draft_position_ids.set_shape({BATCH_SIZE, 1}); - auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model - - if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } + text_streamer.put(out_token); -/* Speculative decoding works the following way. The draft model predicts the next K - tokens one by one in an autoregressive manner, while the main model validates these - predictions and corrects them if necessary. We go through each predicted token, and - if a difference is detected between the draft and main model, we stop and keep the - last token predicted by the main model. Then the draft model gets the latest main - prediction and again tries to predict the next K tokens, repeating the cycle. 
- - This approach reduces the need for multiple infer requests to the main model, - enhancing performance. For instance, in more predictable parts of text generation, - the draft model can, in best-case scenarios, generate the next K tokens that exactly - match the target. In tha caste the are validated in a single inference request to - the main model (which is bigger, more accurate but slower) instead of running K - subsequent requests. - */ - int max_sequence_length = 100; - while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { - // infer the K next tokens with draft model - for (int i = 0; i < K; ++i) { - draft_input_ids.data()[0] = out_token; - draft_attention_mask.set_shape({BATCH_SIZE, seq_len + i + 1}); - std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); - draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); - - draft_model.infer(); - - auto draft_logits = draft_model.get_tensor("logits").data(); - int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; - out_token = arg_max_token; - draft_tokens.emplace_back(arg_max_token); + const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + + /* Speculative decoding works the following way. The draft model predicts the next K + tokens one by one in an autoregressive manner, while the main model validates these + predictions and corrects them if necessary. We go through each predicted token, and + if a difference is detected between the draft and main model, we stop and keep the + last token predicted by the main model. Then the draft model gets the latest main + prediction and again tries to predict the next K tokens, repeating the cycle. + + This approach reduces the need for multiple infer requests to the main model, + enhancing performance. For instance, in more predictable parts of text generation, + the draft model can, in best-case scenarios, generate the next K tokens that exactly + match the target. In that case they are validated in a single inference call to + the main model instead of running K subsequent requests. + */ + + while (out_token != EOS_TOKEN && seq_len < max_sequence_length) { + // generate candidates from the draft model + std::vector candidates = candidateGenerator.generate_candidates(out_token); + size_t candidates_size = candidates.size(); + + // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request. + input_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + + input_ids.data()[0] = out_token; + if (candidates_size > 0) { + std::copy_n(candidates.begin(), candidates_size, input_ids.data() + 1); } - // For the main network, K tokens will be fed at once in a single infer request. - input_ids.set_shape({BATCH_SIZE, K}); - // Set the first token for the main model to be the same as for the draft model. 
- input_ids.data()[0] = first_token; - for (int i = 0; i < K - 1; i++) - input_ids.data()[i + 1] = draft_tokens[i]; - - attention_mask.set_shape({BATCH_SIZE, seq_len + K}); + attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1}); std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - position_ids.set_shape({BATCH_SIZE, K}); + position_ids.set_shape({BATCH_SIZE, candidates_size + 1}); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); main_model.infer(); data_logits = logits.data(); // [BATCH_SIZE, K, vocab_size] - size_t disagree_idx = K - 1; - // Iterate through the predicted tokens from the main model and compare them with draft predictions. - // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. - // In the best-case scenario, all elements match, and K predicted tokens will be taken. - for (size_t i = 0; i < K; i++) { + + // match model tokens with candidate tokens + // 1. accept current out token (if not eos) + // 2. check if it matches apropriate candidate + // 2.1 if it's match, continue - accept next token + // 2.2 it it's mismatch, stop iteration but still accept current token as it was last token generated by + // model from a valid sequence. + size_t accepted_tokens_number = 0; + for (size_t i = 0; i < candidates_size + 1; i++) { auto start = data_logits + vocab_size * i; auto stop = data_logits + vocab_size * (i + 1); out_token = std::max_element(start, stop) - start; + + if (out_token == EOS_TOKEN) { + break; + } + text_streamer.put(out_token); + accepted_tokens_number++; - disagree_idx = i; - if (out_token != draft_tokens[i] || out_token == SPECIAL_EOS_TOKEN || seq_len + disagree_idx + 1 >= max_sequence_length) + if (i == candidates_size || out_token != candidates[i]) { break; + } } // After the inference request, key/values have shape [BATCH_SIZE, seq_len + K, vocab_size]. // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
- seq_len += disagree_idx + 1; - update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_len); + seq_len += accepted_tokens_number; + + if (accepted_tokens_number > 0) { + candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1); + } + + candidateGenerator.update_kv_cache(seq_len); update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); - - draft_tokens.clear(); - first_token = out_token; + + candidates.clear(); } text_streamer.end(); // Model is stateful which means that context (kv-cache) which belongs to a particular From 8a6d85734f0fc87a0ada5c17778768f476fb487e Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Thu, 9 May 2024 18:54:54 +0400 Subject: [PATCH 05/22] Merge releases/2024/1 into master (#427) Co-authored-by: guozhong wang Co-authored-by: Chen Peter Co-authored-by: yatarkan --- .github/dependabot.yml | 4 + .github/workflows/bandit.yml | 16 + .github/workflows/causal_lm_cpp.yml | 138 +++--- .github/workflows/lcm_dreamshaper_cpp.yml | 138 +++--- .../workflows/stable_diffusion_1_5_cpp.yml | 18 +- bandit.yml | 398 ++++++++++++++++++ .../common/diffusers/src/scheduler_lcm.cpp | 2 +- .../lcm_dreamshaper_v7/cpp/README.md | 46 +- .../np_latents_512x512.txt | 0 .../torch_noise_step_0.txt | 0 .../torch_noise_step_1.txt | 0 .../torch_noise_step_2.txt | 0 .../lcm_dreamshaper_v7/cpp/requirements.txt | 4 + .../cpp/scripts/convert_model.py | 41 -- .../cpp/scripts/requirements.txt | 4 - .../lcm_dreamshaper_v7/cpp/src/main.cpp | 122 +++++- .../stable_diffusion_1_5/cpp/README.md | 11 +- .../stable_diffusion_1_5/cpp/requirements.txt | 2 +- .../stable_diffusion_1_5/cpp/src/main.cpp | 2 +- llm_bench/python/benchmark.py | 6 +- llm_bench/python/requirements.txt | 2 +- text_generation/causal_lm/cpp/README.md | 34 +- .../causal_lm/cpp/requirements.txt | 5 + thirdparty/openvino_tokenizers | 2 +- 24 files changed, 742 insertions(+), 253 deletions(-) create mode 100644 .github/workflows/bandit.yml create mode 100644 bandit.yml rename image_generation/lcm_dreamshaper_v7/cpp/{scripts => latents}/np_latents_512x512.txt (100%) rename image_generation/lcm_dreamshaper_v7/cpp/{scripts => latents}/torch_noise_step_0.txt (100%) rename image_generation/lcm_dreamshaper_v7/cpp/{scripts => latents}/torch_noise_step_1.txt (100%) rename image_generation/lcm_dreamshaper_v7/cpp/{scripts => latents}/torch_noise_step_2.txt (100%) create mode 100644 image_generation/lcm_dreamshaper_v7/cpp/requirements.txt delete mode 100644 image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py delete mode 100644 image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt create mode 100644 text_generation/causal_lm/cpp/requirements.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a9b468dff4..9ab4587c2a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,3 +8,7 @@ updates: directory: "image_generation/lcm_dreamshaper_v7/cpp/scripts/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "text_generation/causal_lm/cpp/" + schedule: + interval: "weekly" diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml new file mode 100644 index 0000000000..9faa853a2f --- /dev/null +++ b/.github/workflows/bandit.yml @@ -0,0 +1,16 @@ +name: python -m bandit --recursive --configfile bandit.yml . 
+on: + pull_request: + paths-ignore: + - 'thirdparty' + - '**.md' +jobs: + bandit: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: 3.11 + - run: python -m pip install bandit + - run: python -m bandit --recursive --configfile bandit.yml . diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index faf37f86b6..52f8656344 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -3,7 +3,6 @@ on: pull_request: paths: - .github/workflows/causal_lm_cpp.yml - - llm_bench/python/** - text_generation/causal_lm/cpp/* - thirdparty/openvino_tokenizers - "!**.md" @@ -24,20 +23,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id openlm-research/open_llama_3b_v2 --output_dir ./open_llama_3b_v2/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - - name: convert_tokenizer and run + - name: greedy_causal_lm run: | source ./ov/setupvars.sh - convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer - ./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0" + ./build/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -51,21 +50,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 & + python -m pip install 
--upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?" > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -81,7 +80,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -97,7 +96,7 @@ jobs: " echo "69" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ Hi > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -113,7 +112,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "return 0" > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -129,7 +128,7 @@ jobs: " echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "你好! 你好嗎?" > ./pred.txt + ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -145,7 +144,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -178,23 +177,23 @@ jobs: - name: Install OpenVINO shell: bash run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/windows/w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64.zip unzip ov.zip - name: Download, convert and build shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 + call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat + python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat - convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --with-detokenizer + call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "69" > .\pred.txt + .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py @@ -219,20 +218,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id Qwen/Qwen-7B-Chat --output_dir ./Qwen-7B-Chat/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S 
./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./Qwen-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -246,20 +245,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id Qwen/Qwen1.5-7B-Chat --output_dir ./Qwen1.5-7B-Chat/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Run run: | source ./ov/setupvars.sh - convert_tokenizer ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
> ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -273,20 +272,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id microsoft/phi-2 --output_dir ./Phi-2/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./Phi-2/pytorch/dldt/FP16/ --output ./Phi-2/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Phi-2/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 50s ./build/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores steps: @@ -299,20 +299,20 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id argilla/notus-7b-v1 --output_dir ./notus-7b-v1/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./notus-7b-v1/pytorch/dldt/FP16/ --output ./notus-7b-v1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt 
cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -331,19 +331,17 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id databricks/dolly-v2-3b --output_dir ./dolly-v2-3b/ --precision FP16 - python ./llm_bench/python/convert.py --model_id databricks/dolly-v2-7b --output_dir ./dolly-v2-7b/ --precision FP16 - convert_tokenizer ./dolly-v2-3b/pytorch/dldt/FP16/ --output ./dolly-v2-3b/pytorch/dldt/FP16/ --with-detokenizer - convert_tokenizer ./dolly-v2-7b/pytorch/dldt/FP16/ --output ./dolly-v2-7b/pytorch/dldt/FP16/ --with-detokenizer + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: run and compare run: | source ./ov/setupvars.sh - ./build/speculative_decoding_lm ./dolly-v2-3b/pytorch/dldt/FP16/ ./dolly-v2-7b/pytorch/dldt/FP16/ "Alan Turing was a" > predictions_speculative.txt - ./build/greedy_causal_lm ./dolly-v2-7b/pytorch/dldt/FP16/ "Alan Turing was a" > predictions_greedy.txt + ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -366,17 +364,16 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 - convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config 
Release -j - wait - name: run and compare run: | source ./ov/setupvars.sh @@ -388,8 +385,8 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "$( predictions_prompt_lookup.txt - ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "$( predictions_greedy.txt + ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -411,21 +408,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id microsoft/phi-1_5 --output_dir ./Phi-1_5/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 - wait - name: Run Generation run: | source ./ov/setupvars.sh - convert_tokenizer ./Phi-1_5/pytorch/dldt/FP16/ --output ./Phi-1_5/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/greedy_causal_lm ./Phi-1_5/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/beam_search_causal_lm ./Phi-1_5/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -455,20 +452,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id ikala/redpajama-3b-chat --output_dir ./redpajama-3b-chat/ --precision 
FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/pytorch/dldt/FP16/ --output ./redpajama-3b-chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code + - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 0d21e42f95..de06153570 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -1,4 +1,5 @@ name: lcm_dreamshaper + on: pull_request: paths: @@ -7,75 +8,106 @@ on: - .github/workflows/lcm_dreamshaper_cpp.yml - thirdparty/openvino_tokenizers - "!**.md" + +env: + working_directory: "./image_generation/lcm_dreamshaper_v7/cpp/" + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true + jobs: lcm_dreamshaper_v7_cpp-linux: runs-on: ubuntu-20.04 + defaults: + run: + # Do not ignore bash profile files. From: + # https://github.com/marketplace/actions/setup-miniconda#important + shell: bash -l {0} steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: actions/setup-python@v4 + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 with: - python-version: 3.8 - - name: Initialize OpenVINO + miniconda-version: "latest" + activate-environment: openvino_lcm_cpp + python-version: "3.10" + + - name: Install OpenVINO and other conda dependencies run: | - mkdir openvino - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./openvino/ --strip-components 1 -xz - sudo ./openvino/install_dependencies/install_openvino_dependencies.sh - - name: Download / convert a model / tokenizer + conda activate openvino_lcm_cpp + conda update -c conda-forge --all + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH + + - name: Install python dependencies + working-directory: ${{ env.working_directory }} run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/scripts/ - python -m pip install -U pip - python -m pip install -r ./requirements.txt - python -m pip install ../../../../thirdparty/openvino_tokenizers/ - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t "FP16" + conda activate openvino_lcm_cpp + python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + + - name: Download and convert model and tokenizer + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + optimum-cli 
export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + - name: Build app + working-directory: ${{ env.working_directory }} run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/ + conda activate openvino_lcm_cpp cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --parallel + - name: Run app - run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/build/ - ./lcm_dreamshaper + working-directory: ${{ env.working_directory }} + run: ./build/lcm_dreamshaper + lcm_dreamshaper_v7_cpp-windows: - runs-on: windows-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: Initialize OpenVINO - shell: cmd - run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip - unzip ov.zip - - name: Download / convert a model / tokenizer - shell: cmd - run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/scripts/ - python -m pip install -r ./requirements.txt - python -m pip install ../../../../thirdparty/openvino_tokenizers/ - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t "FP16" - - name: Build app - shell: cmd - run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/ - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - - name: Run app - shell: cmd - run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/build/ - call "./Release/lcm_dreamshaper.exe" + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 + with: + miniconda-version: "latest" + activate-environment: openvino_lcm_cpp + python-version: "3.10" + + - name: Install OpenVINO and other conda dependencies + run: | + conda activate openvino_lcm_cpp + conda update -c conda-forge --all + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH + + - name: Install python dependencies + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + + - name: Download and convert model and tokenizer + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + + - name: Build app + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + + - name: Run app + working-directory: ${{ env.working_directory }} + run: '& "./build/Release/lcm_dreamshaper.exe" -r --dynamic' diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 41c68becd8..38a2022e1d 100644 --- 
a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -39,7 +39,7 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies @@ -53,9 +53,7 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - export MODEL_PATH="models/stable_diffusion_v1_5_ov/FP16" - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --convert-tokenizer --weight-format fp16 $MODEL_PATH - convert_tokenizer $MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $MODEL_PATH/tokenizer/ + optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --weight-format fp16 models/stable_diffusion_v1_5_ov/FP16 - name: Build app working-directory: ${{ env.working_directory }} @@ -66,8 +64,7 @@ jobs: - name: Run app working-directory: ${{ env.working_directory }} - run: | - ./build/stable_diffusion -m ./models/stable_diffusion_v1_5_ov -t FP16 + run: ./build/stable_diffusion -m ./models/stable_diffusion_v1_5_ov -t FP16 stable_diffusion_1_5_cpp-windows: runs-on: windows-latest @@ -86,7 +83,7 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake - name: Install python dependencies working-directory: ${{ env.working_directory }} @@ -99,9 +96,7 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - $env:MODEL_PATH='models/stable_diffusion_v1_5_ov/FP16' - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --convert-tokenizer --weight-format fp16 $env:MODEL_PATH - convert_tokenizer $env:MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $env:MODEL_PATH/tokenizer/ + optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --weight-format fp16 models/stable_diffusion_v1_5_ov/FP16 - name: Build app working-directory: ${{ env.working_directory }} @@ -112,5 +107,4 @@ jobs: - name: Run app working-directory: ${{ env.working_directory }} - run: | - & "./build/Release/stable_diffusion.exe" -m ./models/stable_diffusion_v1_5_ov -t FP16 --dynamic + run: '& "./build/Release/stable_diffusion.exe" -m ./models/stable_diffusion_v1_5_ov -t FP16 --dynamic' diff --git a/bandit.yml b/bandit.yml new file mode 100644 index 0000000000..be2fd3da5b --- /dev/null +++ b/bandit.yml @@ -0,0 +1,398 @@ +### This config may optionally select a subset of tests to run or skip by +### filling out the 'tests' and 'skips' lists given below. If no tests are +### specified for inclusion then it is assumed all tests are desired. The skips +### set will remove specific tests from the include set. This can be controlled +### using the -t/-s CLI options. Note that the same test ID should not appear +### in both 'tests' and 'skips', this would be nonsensical and is detected by +### Bandit at runtime. 
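+
+# Example invocation (illustrative; the scanned path is an assumption, adjust to your checkout):
+#   bandit -c bandit.yml -r llm_bench/python
+# The -c option points Bandit at this config so the 'tests' and 'skips' lists below take effect.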
+ +# Available tests: +# B101 : assert_used +# B102 : exec_used +# B103 : set_bad_file_permissions +# B104 : hardcoded_bind_all_interfaces +# B105 : hardcoded_password_string +# B106 : hardcoded_password_funcarg +# B107 : hardcoded_password_default +# B108 : hardcoded_tmp_directory +# B110 : try_except_pass +# B112 : try_except_continue +# B201 : flask_debug_true +# B301 : pickle +# B302 : marshal +# B303 : md5 +# B304 : ciphers +# B305 : cipher_modes +# B306 : mktemp_q +# B307 : eval +# B308 : mark_safe +# B310 : urllib_urlopen +# B311 : random +# B312 : telnetlib +# B313 : xml_bad_cElementTree +# B314 : xml_bad_ElementTree +# B315 : xml_bad_expatreader +# B316 : xml_bad_expatbuilder +# B317 : xml_bad_sax +# B318 : xml_bad_minidom +# B319 : xml_bad_pulldom +# B320 : xml_bad_etree +# B321 : ftplib +# B323 : unverified_context +# B324 : hashlib_new_insecure_functions +# B401 : import_telnetlib +# B402 : import_ftplib +# B403 : import_pickle +# B404 : import_subprocess +# B405 : import_xml_etree +# B406 : import_xml_sax +# B407 : import_xml_expat +# B408 : import_xml_minidom +# B409 : import_xml_pulldom +# B410 : import_lxml +# B411 : import_xmlrpclib +# B412 : import_httpoxy +# B413 : import_pycrypto +# B501 : request_with_no_cert_validation +# B502 : ssl_with_bad_version +# B503 : ssl_with_bad_defaults +# B504 : ssl_with_no_version +# B505 : weak_cryptographic_key +# B506 : yaml_load +# B507 : ssh_no_host_key_verification +# B601 : paramiko_calls +# B602 : subprocess_popen_with_shell_equals_true +# B603 : subprocess_without_shell_equals_true +# B604 : any_other_function_with_shell_equals_true +# B605 : start_process_with_a_shell +# B606 : start_process_with_no_shell +# B607 : start_process_with_partial_path +# B608 : hardcoded_sql_expressions +# B609 : linux_commands_wildcard_injection +# B610 : django_extra_used +# B611 : django_rawsql_used +# B701 : jinja2_autoescape_false +# B702 : use_of_mako_templates +# B703 : django_mark_safe + +# (optional) list included test IDs here, eg '[B101, B406]': +# IPAS Required Checkers. Do not disable these +# Additional checkers may be added if desired +tests: + [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413'] + +# (optional) list skipped test IDs here, eg '[B101, B406]': +# The following checkers are not required but be added to tests list if desired +skips: + [ 'B101', 'B102', 'B103', 'B104', 'B105', 'B106', 'B107', 'B108', 'B110', 'B112', 'B201', 'B501', 'B502', 'B503', 'B504', 'B505', 'B506', 'B507', 'B601', 'B602', 'B603', 'B604', 'B605', 'B606', 'B607', 'B608', 'B609', 'B610', 'B611', 'B701', 'B702', 'B703'] + +### (optional) plugin settings - some test plugins require configuration data +### that may be given here, per-plugin. All bandit test plugins have a built in +### set of sensible defaults and these will be used if no configuration is +### provided. It is not necessary to provide settings for every (or any) plugin +### if the defaults are acceptable. 
+ +any_other_function_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +assert_used: + skips: [] +hardcoded_tmp_directory: + tmp_dirs: + - /tmp + - /var/tmp + - /dev/shm +linux_commands_wildcard_injection: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +ssl_with_bad_defaults: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +ssl_with_bad_version: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +start_process_with_a_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +start_process_with_no_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +start_process_with_partial_path: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: 
+ - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_popen_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_without_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +try_except_continue: + check_typed_exception: false +try_except_pass: + check_typed_exception: false +weak_cryptographic_key: + weak_key_size_dsa_high: 1024 + weak_key_size_dsa_medium: 2048 + weak_key_size_ec_high: 160 + weak_key_size_ec_medium: 224 + weak_key_size_rsa_high: 1024 + weak_key_size_rsa_medium: 2048 +exclude_dirs: + - thirdparty diff --git a/image_generation/common/diffusers/src/scheduler_lcm.cpp b/image_generation/common/diffusers/src/scheduler_lcm.cpp index af82c981a4..d5f97b6772 100644 --- a/image_generation/common/diffusers/src/scheduler_lcm.cpp +++ b/image_generation/common/diffusers/src/scheduler_lcm.cpp @@ -192,7 +192,7 @@ std::map LCMScheduler::step(ov::Tensor noise_pred, ov:: if (inference_step != num_inference_steps - 1) { std::vector noise; if (read_torch_noise) { - std::string noise_file = "../scripts/torch_noise_step_" + std::to_string(inference_step) + ".txt"; + std::string noise_file = "./latents/torch_noise_step_" + std::to_string(inference_step) + ".txt"; noise = read_vector_from_txt(noise_file); } else { noise = randn_function(noise_pred.get_size(), seed); diff --git a/image_generation/lcm_dreamshaper_v7/cpp/README.md b/image_generation/lcm_dreamshaper_v7/cpp/README.md index 1d4f1f7ace..6077b8a1c7 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/README.md +++ b/image_generation/lcm_dreamshaper_v7/cpp/README.md @@ -2,19 +2,25 @@ The pure C++ text-to-image pipeline, driven by the OpenVINO native API for SD v1.5 Latent Consistency Model with LCM Scheduler. It includes advanced features like LoRA integration with safetensors and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. [The common folder](../../common/) contains schedulers for image generation and `imwrite()` for saving `bmp` images. This demo has been tested for Linux platform only. 
There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/263-latent-consistency-models-image-generation/263-lcm-lora-controlnet.ipynb) which provides an example of image generaztion in Python. > [!NOTE] ->This tutorial assumes that the current working directory is `/image_generation/lcm_dreamshaper_v7/cpp/` and all paths are relative to this folder. +> This tutorial assumes that the current working directory is `/image_generation/lcm_dreamshaper_v7/cpp/` and all paths are relative to this folder. ## Step 1: Prepare build environment +Prerequisites: +- Conda ([installation guide](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)) + C++ Packages: * [CMake](https://cmake.org/download/): Cross-platform build tool -* [OpenVINO](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html): Model inference +* [OpenVINO](https://docs.openvino.ai/2024/get-started/install-openvino.html): Model inference Prepare a python environment and install dependencies: ```shell conda create -n openvino_lcm_cpp python==3.10 conda activate openvino_lcm_cpp -conda install -c conda-forge openvino c-compiler cxx-compiler make +conda update -c conda-forge --all +conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +# Ensure that Conda standard libraries are used +conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` ## Step 2: Latent Consistency Model and Tokenizer models @@ -26,21 +32,18 @@ conda install -c conda-forge openvino c-compiler cxx-compiler make ```shell git submodule update --init conda activate openvino_lcm_cpp - python -m pip install -r scripts/requirements.txt + python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] ``` -2. Run model conversion script to download and convert PyTorch model to OpenVINO IR via [optimum-intel](https://github.com/huggingface/optimum-intel). Please, use the script `scripts/convert_model.py` to convert the model: +2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). Example command for downloading and exporting FP16 model: - ```shell - cd scripts - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t FP16 - ``` + `optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16` If https://huggingface.co/ is down, the script won't be able to download the model. > [!NOTE] ->Only static model is currently supported for this sample. +> Only static model is currently supported for this sample. ### LoRA enabling with safetensors @@ -67,19 +70,20 @@ Usage: lcm_dreamshaper [OPTION...] ``` -* `-p, --posPrompt arg` Initial positive prompt for SD (default: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting) +* `-p, --posPrompt arg` Initial positive prompt for LCM (default: a beautiful pink unicorn) * `-d, --device arg` AUTO, CPU, or GPU. 
Doesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only (default: CPU) -* `--step arg` Number of diffusion step ( default: 20) +* `--step arg` Number of diffusion step (default: 4) * `-s, --seed arg` Number of random seed to generate latent (default: 42) -* `--num arg` Number of image output(default: 1) +* `--num arg` Number of image output (default: 1) * `--height arg` Height of output image (default: 512) * `--width arg` Width of output image (default: 512) * `-c, --useCache` Use model caching -* `-r, --readNPLatent` Read numpy generated latents from file -* `-m, --modelPath arg` Specify path of SD model IR (default: ../scripts/SimianLuo/LCM_Dreamshaper_v7) -* `-t, --type arg` Specify the type of SD model IR (FP16_static or FP16_dyn) (default: FP16_static) -* `-l, --loraPath arg` Specify path of lora file. (*.safetensors). (default: ) -* `-a, --alpha arg` alpha for lora (default: 0.75) +* `-r, --readNPLatent` Read numpy generated latents from file, only supported for one output image +* `-m, --modelPath arg` Specify path to LCM model IRs (default: ./models/lcm_dreamshaper_v7) +* `-t, --type arg` Specify the type of LCM model IRs (e.g., FP32, FP16 or INT8) (default: FP16) +* `--dynamic` Specify the model input shape to use dynamic shape +* `-l, --loraPath arg` Specify path to LoRA file (*.safetensors) (default: ) +* `-a, --alpha arg` Specify alpha for LoRA (default: 0.75) * `-h, --help` Print usage > [!NOTE] @@ -91,15 +95,15 @@ Positive prompt: a beautiful pink unicorn Read the numpy latent input and noise for scheduler instead of C++ std lib for the alignment with Python pipeline. -* Generate image with random data generated by Python `./build/lcm_dreamshaper -r` +* Generate image with random data generated by Python: `./build/lcm_dreamshaper -r` ![image](./python_random.bmp) -* Generate image with C++ lib generated latent and noise : `./build/lcm_dreamshaper` +* Generate image with C++ lib generated latent and noise: `./build/lcm_dreamshaper` ![image](./cpp_random.bmp) -* Generate image with soulcard lora and C++ generated latent and noise `./stable_diffusion -r -l path/to/soulcard.safetensors` +* Generate image with soulcard lora and C++ generated latent and noise: `./stable_diffusion -r -l path/to/soulcard.safetensors` ![image](./lora_cpp_random.bmp) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/np_latents_512x512.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/np_latents_512x512.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/np_latents_512x512.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/np_latents_512x512.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_0.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_0.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_0.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_0.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_1.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_1.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_1.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_1.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_2.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_2.txt similarity index 100% rename from 
image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_2.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_2.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt new file mode 100644 index 0000000000..7ffbb92137 --- /dev/null +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.2.2+cpu +diffusers==0.27.2 +optimum-intel[nncf,openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py b/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py deleted file mode 100644 index c55ec0ecc9..0000000000 --- a/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py +++ /dev/null @@ -1,41 +0,0 @@ -from pathlib import Path -import argparse -from optimum.intel.openvino import OVLatentConsistencyModelPipeline -from transformers import AutoTokenizer -from openvino_tokenizers import convert_tokenizer -from openvino import Type, save_model - - -def parse_args() -> argparse.Namespace: - """Parse and return command line arguments.""" - parser = argparse.ArgumentParser(add_help=False) - args = parser.add_argument_group('Options') - args.add_argument('-h', '--help', action = 'help', - help='Show this help message and exit.') - args.add_argument('-t', '--type', type = str, default = "FP32", required = True, - help='Required. data type, FP32, FP16.') - args.add_argument('-lcm','--lcm_weights', type = str, default="SimianLuo/LCM_Dreamshaper_v7", required = True, - help='Specify the path of lcm model') - return parser.parse_args() - -args = parse_args() -output_path = Path(args.lcm_weights) / (args.type + "_static") - -###convert LCM model to IR - -model = OVLatentConsistencyModelPipeline.from_pretrained(args.lcm_weights, trust_remote_code=True, export=True, compile=False) -if args.type == "FP16": - model.half() - -model.reshape(1, 512, 512, 1) - -model.compile() -model.save_pretrained(output_path) - -# convert tokenizer - -tokenizer_path = output_path / "tokenizer" -hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) -ov_tokenizer_encoder = convert_tokenizer(hf_tokenizer, tokenizer_output_type=Type.i32) - -save_model(ov_tokenizer_encoder, tokenizer_path / "openvino_tokenizer.xml", compress_to_fp16=False) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt deleted file mode 100644 index f7003f7218..0000000000 --- a/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -torch==2.3.0+cpu -diffusers==0.27.2 -optimum-intel[nncf,openvino]==1.16.1 diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 1df11bee29..546bd170ba 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -21,11 +21,17 @@ #include "lora.hpp" #include "imwrite.hpp" +const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' +const int64_t UNET_IN_CHANNELS = 4; // 'in_channels' parameter from 'unet/config.json' +const int64_t UNET_TIME_COND_PROJ_DIM = 256; // 'time_cond_proj_dim' parameter from 'unet/config.json' +const 
int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' +const size_t VAE_SCALE_FACTOR = 8; + ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, 4, height / 8, width / 8}); + ov::Tensor noise(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); if (use_np_latents) { // read np generated latents with defaut seed 42 - const char * latent_file_name = "../scripts/np_latents_512x512.txt"; + const char * latent_file_name = "./latents/np_latents_512x512.txt"; std::ifstream latent_copy_file(latent_file_name, std::ios::ate); OPENVINO_ASSERT(latent_copy_file.is_open(), "Cannot open ", latent_file_name); @@ -60,13 +66,67 @@ void apply_lora(std::shared_ptr model, InsertLoRA::LoRAMap& lora_map) } } -StableDiffusionModels compile_models(const std::string& model_path, const std::string& device, - const std::string& lora_path, const float alpha, const bool use_cache) { +void reshape_text_encoder(std::shared_ptr model, size_t batch_size, size_t tokenizer_model_max_length) { + ov::PartialShape input_shape = model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = tokenizer_model_max_length; + std::map idx_to_shape{{0, input_shape}}; + model->reshape(idx_to_shape); +} + +void reshape_unet(std::shared_ptr model, + int64_t batch_size, + int64_t height, + int64_t width, + int64_t tokenizer_model_max_length) { + height = height / VAE_SCALE_FACTOR; + width = width / VAE_SCALE_FACTOR; + + std::map name_to_shape; + + for (auto input : model->inputs()) { + std::string input_name = input.get_any_name(); + name_to_shape[input_name] = input.get_partial_shape(); + if (input_name == "timestep") { + name_to_shape[input_name][0] = 1; + } else if (input_name == "sample") { + name_to_shape[input_name] = {batch_size, UNET_IN_CHANNELS, height, width}; + } else if (input_name == "time_ids") { + name_to_shape[input_name][0] = batch_size; + } else if (input_name == "timestep_cond") { + name_to_shape[input_name] = {batch_size, UNET_TIME_COND_PROJ_DIM}; + } else { + name_to_shape[input_name][0] = batch_size; + name_to_shape[input_name][1] = TOKENIZER_MODEL_MAX_LENGTH; + } + } + + model->reshape(name_to_shape); +} + +void reshape_vae_decoder(std::shared_ptr model, int64_t height, int64_t width) { + height = height / VAE_SCALE_FACTOR; + width = width / VAE_SCALE_FACTOR; + + std::map idx_to_shape{{0, {1, VAE_DECODER_LATENT_CHANNELS, height, width}}}; + model->reshape(idx_to_shape); +} + +StableDiffusionModels compile_models(const std::string& model_path, + const std::string& device, + const std::string& lora_path, + const float alpha, + const bool use_cache, + const bool use_dynamic_shapes, + const size_t batch_size, + const size_t height, + const size_t width) { StableDiffusionModels models; ov::Core core; if (use_cache) core.set_property(ov::cache_dir("./cache_dir")); + core.add_extension(TOKENIZERS_LIBRARY_PATH); // read LoRA weights @@ -78,6 +138,9 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s // Text encoder { auto text_encoder_model = core.read_model(model_path + "/text_encoder/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_text_encoder(text_encoder_model, batch_size, TOKENIZER_MODEL_MAX_LENGTH); + } apply_lora(text_encoder_model, lora_weights["text_encoder"]); models.text_encoder = core.compile_model(text_encoder_model, device); } @@ -85,6 +148,9 @@ StableDiffusionModels 
compile_models(const std::string& model_path, const std::s // UNet { auto unet_model = core.read_model(model_path + "/unet/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_unet(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH); + } apply_lora(unet_model, lora_weights["unet"]); models.unet = core.compile_model(unet_model, device); } @@ -92,6 +158,9 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s // VAE decoder { auto vae_decoder_model = core.read_model(model_path + "/vae_decoder/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_vae_decoder(vae_decoder_model, height, width); + } ov::preprocess::PrePostProcessor ppp(vae_decoder_model); ppp.output().model().set_layout("NCHW"); ppp.output().tensor().set_layout("NHWC"); @@ -108,15 +177,14 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s } ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt) { - const size_t MAX_LENGTH = 77; // 'model_max_length' from 'tokenizer_config.json' const size_t HIDDEN_SIZE = static_cast(models.text_encoder.output(0).get_partial_shape()[2].get_length()); const int32_t EOS_TOKEN_ID = 49407, PAD_TOKEN_ID = EOS_TOKEN_ID; - const ov::Shape input_ids_shape({1, MAX_LENGTH}); + const ov::Shape input_ids_shape({1, TOKENIZER_MODEL_MAX_LENGTH}); ov::InferRequest tokenizer_req = models.tokenizer.create_infer_request(); ov::InferRequest text_encoder_req = models.text_encoder.create_infer_request(); - ov::Tensor text_embeddings(ov::element::f32, {1, MAX_LENGTH, HIDDEN_SIZE}); + ov::Tensor text_embeddings(ov::element::f32, {1, TOKENIZER_MODEL_MAX_LENGTH, HIDDEN_SIZE}); ov::Tensor input_ids(ov::element::i32, input_ids_shape); std::fill_n(input_ids.data(), input_ids.get_size(), PAD_TOKEN_ID); @@ -124,7 +192,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt) { tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &pos_prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); @@ -192,20 +260,23 @@ ov::Tensor postprocess_image(ov::Tensor decoded_image) { } int32_t main(int32_t argc, char* argv[]) try { - cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in C++ using OpenVINO\n"); + cxxopts::Options options("lcm_dreamshaper", "LCM_Dreamshaper_v7 implementation in C++ using OpenVINO\n"); options.add_options() - ("p,posPrompt", "Initial positive prompt for LCM ", cxxopts::value()->default_value("a beautiful pink unicorn")) + ("p,posPrompt", "Initial positive prompt for LCM", cxxopts::value()->default_value("a beautiful pink unicorn")) ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) ("step", "Number of diffusion steps", cxxopts::value()->default_value("4")) ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) ("num", "Number of image output", cxxopts::value()->default_value("1")) + ("height","Height of output image",cxxopts::value()->default_value("512")) + ("width", "Width of output image", cxxopts::value()->default_value("512")) ("c,useCache", "Use model caching", 
cxxopts::value()->default_value("false")) ("r,readNPLatent", "Read numpy generated latents from file, only supported for one output image", cxxopts::value()->default_value("false")) - ("m,modelPath", "Specify path of LCM model IRs", cxxopts::value()->default_value("../scripts/SimianLuo/LCM_Dreamshaper_v7")) - ("t,type", "Specify the type of LCM model IRs (e.g., FP16_static or FP16_dyn)", cxxopts::value()->default_value("FP16_static")) + ("m,modelPath", "Specify path to LCM model IRs", cxxopts::value()->default_value("./models/lcm_dreamshaper_v7")) + ("t,type", "Specify the type of LCM model IRs (e.g., FP32, FP16 or INT8)", cxxopts::value()->default_value("FP16")) + ("dynamic","Specify the model input shape to use dynamic shape",cxxopts::value()->default_value("false")) ("l,loraPath", "Specify path of LoRA file. (*.safetensors).", cxxopts::value()->default_value("")) - ("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75")) + ("a,alpha", "Specify alpha for LoRA", cxxopts::value()->default_value("0.75")) ("h,help", "Print usage"); cxxopts::ParseResult result; @@ -227,13 +298,15 @@ int32_t main(int32_t argc, char* argv[]) try { const uint32_t num_inference_steps = result["step"].as(); const uint32_t user_seed = result["seed"].as(); const uint32_t num_images = result["num"].as(); + const uint32_t height = result["height"].as(); + const uint32_t width = result["width"].as(); const bool use_cache = result["useCache"].as(); const bool read_np_latent = result["readNPLatent"].as(); const std::string model_base_path = result["modelPath"].as(); const std::string model_type = result["type"].as(); + const bool use_dynamic_shapes = result["dynamic"].as(); const std::string lora_path = result["loraPath"].as(); const float alpha = result["alpha"].as(); - const uint32_t height = 512, width = 512; OPENVINO_ASSERT(!read_np_latent || (read_np_latent && (num_images == 1)), "\"readNPLatent\" option is only supported for one output image. Number of image output was set to: " + std::to_string(num_images)); @@ -248,14 +321,23 @@ int32_t main(int32_t argc, char* argv[]) try { std::cout << "OpenVINO version: " << ov::get_openvino_version() << std::endl; std::cout << "Running (may take some time) ..." << std::endl; - // Stable Diffusion pipeline + const std::string model_path = model_base_path + "/" + model_type; + if (!std::filesystem::exists(model_path)) { + std::cerr << "Model IRs for type " << model_type << " don't exist in directory " << model_path << "\n"; + std::cerr << "Refer to README.md to know how to export OpenVINO model with particular data type." 
<< std::endl; + return EXIT_FAILURE; + } - StableDiffusionModels models = compile_models(model_base_path + "/" + model_type, device, lora_path, alpha, use_cache); + // Stable Diffusion pipeline + const size_t batch_size = 1; + StableDiffusionModels models = + compile_models(model_path, device, lora_path, alpha, use_cache, use_dynamic_shapes, batch_size, height, width); ov::InferRequest unet_infer_request = models.unet.create_infer_request(); ov::PartialShape sample_shape = models.unet.input("sample").get_partial_shape(); - OPENVINO_ASSERT(sample_shape.is_dynamic() || (sample_shape[2] * 8 == width && sample_shape[3] * 8 == height), - "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); + OPENVINO_ASSERT(sample_shape.is_dynamic() || + (sample_shape[2] * VAE_SCALE_FACTOR == height && sample_shape[3] * VAE_SCALE_FACTOR == width), + "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); // no negative prompt for LCM model: // https://huggingface.co/docs/diffusers/api/pipelines/latent_consistency_models#diffusers.LatentConsistencyModelPipeline @@ -269,9 +351,9 @@ int32_t main(int32_t argc, char* argv[]) try { std::vector timesteps = scheduler->get_timesteps(); float guidance_scale = 8.0; - ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, 256); + ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, UNET_TIME_COND_PROJ_DIM); - ov::Tensor denoised(ov::element::f32, {1, 4, height / 8, width / 8}); + ov::Tensor denoised(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? user_seed: user_seed + n; ov::Tensor latent_model_input = randn_tensor(height, width, read_np_latent, seed); diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index 0ff3ad0906..fb01326ea5 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: ```shell conda create -n openvino_sd_cpp python==3.10 conda activate openvino_sd_cpp -conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake +conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` @@ -40,13 +40,8 @@ python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) to run Stable Diffusion with LoRA adapters. 
Example command for downloading and exporting FP16 model: - ```shell - export MODEL_PATH="models/dreamlike_anime_1_0_ov/FP16" - # Using optimum-cli for exporting model to OpenVINO format - optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --convert-tokenizer --weight-format fp16 $MODEL_PATH - # Converting tokenizer manually (`--convert-tokenizer` flag of `optimum-cli` results in "OpenVINO Tokenizer export for CLIPTokenizer is not supported.") - convert_tokenizer $MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $MODEL_PATH/tokenizer/ - ``` + + `optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike_anime_1_0_ov/FP16` You can also choose other precision and export FP32 or INT8 model. diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 289149d134..5e6bfe0372 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[nncf,openvino]==1.16.0 +optimum-intel[nncf,openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 huggingface_hub[cli]==0.22.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index d1c24c32a8..3d6c8a7994 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -216,7 +216,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, s tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 3f1d1fa118..6b39fc9360 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -129,7 +129,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, result_text = generated_text[bs_idx] if args["output_dir"] is not None: utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) - result_md5_list.append(hashlib.md5(result_text.encode()).hexdigest()) + result_md5_list.append(hashlib.md5(result_text.encode(), usedforsecurity=False).hexdigest()) if num == 0: warmup_md5[prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) @@ -239,7 +239,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, mem_consumption.clear_max_memory_consumption() for bs_idx in range(args['batch_size']): rslt_img_fn = utils.output_file.output_gen_image(res[bs_idx], args, image_id, num, bs_idx, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( iter_idx=num, @@ -339,7 +339,7 @@ def run_ldm_super_resolution(img, 
num, pipe, args, framework, iter_data_list, im result_md5_list = [] if framework == 'ov': rslt_img_fn = utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index 0dc3476328..f2d406c230 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy -openvino>=2024.0.0 +openvino~=2024.1.0 auto-gptq>=0.5.1 # for gptq pillow torch diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 55bc919ce0..35615e1f57 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -1,6 +1,6 @@ # Text generation C++ samples that support most popular models like LLaMA 2 -These examples showcase inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `convert_tokenizer` to generate IRs for the samples. [group_beam_searcher.hpp](group_beam_searcher.hpp) implements the algorithm of the same name, which is used by `beam_search_causal_lm`. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These examples showcase inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. [group_beam_searcher.hpp](group_beam_searcher.hpp) implements the algorithm of the same name, which is used by `beam_search_causal_lm`. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. ## How it works @@ -53,7 +53,7 @@ This approach reduces the need for multiple infer requests to the main model, en ## Install OpenVINO -Install [OpenVINO Archives >= 2024.0](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. +Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. 
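For reference, a minimal sketch of what extracting and activating an archive usually looks like (the archive name and the `ov` directory are placeholders, not part of these instructions; pick the package matching your OS from the links above):

```sh
# Hypothetical example: extract a downloaded Linux archive into ./ov and activate it
mkdir ov
tar --directory ./ov/ --strip-components 1 -xzf l_openvino_toolkit_<version>.tgz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh   # Linux only
source ./ov/setupvars.sh
```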
## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` @@ -81,18 +81,20 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upg ```sh source /setupvars.sh -python3 -m pip install --upgrade-strategy eager "transformers<4.38" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu -python3 ../../../llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 -convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code +python3 -m pip install --upgrade-strategy eager -r requirements.txt +# Update openvino_tokenizers from the submodule +python3 -m pip install ./../../../thirdparty/openvino_tokenizers/[transformers] +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` #### Windows ```bat \setupvars.bat -python -m pip install --upgrade-strategy eager "transformers<4.38" -r ..\..\..\llm_bench\python\requirements.txt ..\..\..\thirdparty\openvino_tokenizers\[transformers] --extra-index-url https://download.pytorch.org/whl/cpu -python ..\..\..\llm_bench\python\convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir .\TinyLlama-1.1B-Chat-v1.0\ --precision FP16 -convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --with-detokenizer --trust-remote-code +python -m pip install --upgrade-strategy eager -r requirements.txt +REM Update openvino_tokenizers from the submodule +python -m pip install .\..\..\..\thirdparty\openvino_tokenizers\[transformers] +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run @@ -106,16 +108,16 @@ convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyL ### Examples: #### Linux/MacOS: -1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ ./Llama-2-7b-chat-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -4. `./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` +1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` +2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` +3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ ./Llama-2-7b-chat-hf/ "Why is the Sun yellow?"` +4. `./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` #### Windows: -1. `.\build\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -2. `.\build\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -3. `.\build\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ .\Llama-2-7b-chat-hf\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -4. `.\build\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` +1. 
`.\build\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` +2. `.\build\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` +3. `.\build\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ .\Llama-2-7b-chat-hf\ "Why is the Sun yellow?"` +4. `.\build\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt new file mode 100644 index 0000000000..019e172dd6 --- /dev/null +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum[openvino]==1.19.1 +optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +einops==0.7.0 # For Qwen +transformers_stream_generator==0.0.4 # For Qwen diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 0e4bb32ca3..37d20ce209 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 0e4bb32ca3412f589e1d094faa8b0aad19ee47ca +Subproject commit 37d20ce209b120f6ffd450484e207ef71f8c8d03 From b2bf38f76dc783ab68ba3aa8b07cd15c4ceeb733 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Thu, 9 May 2024 18:56:13 +0400 Subject: [PATCH 06/22] Add hardware suggestion to samples (#394) Ticket 139548 --- text_generation/causal_lm/cpp/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 35615e1f57..21f3a066a4 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -121,6 +121,8 @@ optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Tin To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + ## Supported models 1. chatglm From ab489c02dfbe824f857a2117d9445220958026d9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 02:00:44 +0400 Subject: [PATCH 07/22] Bump einops from 0.7.0 to 0.8.0 in /text_generation/causal_lm/cpp (#430) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [einops](https://github.com/arogozhnikov/einops) from 0.7.0 to 0.8.0.
Release notes (sourced from einops's releases):

v0.8.0: tinygrad, small fixes and updates

TLDR:
- tinygrad backend added
- resolve warning in py3.11 related to docstring
- remove graph break for unpack
- breaking: TF layers were updated to follow new instructions; the new layers are compatible with TF 2.16 and not compatible with old TF (they certainly do not work with TF 2.13)

Full Changelog: https://github.com/arogozhnikov/einops/compare/v0.7.0...v0.8.0
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- text_generation/causal_lm/cpp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt index 019e172dd6..3fe4f4050d 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum[openvino]==1.19.1 optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 -einops==0.7.0 # For Qwen +einops==0.8.0 # For Qwen transformers_stream_generator==0.0.4 # For Qwen From 4859a9752b9e451e27049d45e7025df80f49f02f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 12:11:25 +0400 Subject: [PATCH 08/22] Bump optimum[openvino] from 1.19.1 to 1.19.2 in /text_generation/causal_lm/cpp (#429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [optimum[openvino]](https://github.com/huggingface/optimum) from 1.19.1 to 1.19.2.
Release notes (sourced from optimum[openvino]'s releases):

v1.19.2: Patch release

Full Changelog: https://github.com/huggingface/optimum/compare/v1.19.1...v1.19.2
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- text_generation/causal_lm/cpp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt index 3fe4f4050d..b682be3c92 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.19.1 +optimum[openvino]==1.19.2 optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.4 # For Qwen From 1b9d3896fe8d6c7444609df4f5b306ca26861293 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 May 2024 08:12:08 +0000 Subject: [PATCH 09/22] Bump transformers-stream-generator from 0.0.4 to 0.0.5 in /text_generation/causal_lm/cpp (#431) --- text_generation/causal_lm/cpp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt index b682be3c92..bb43d745c8 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -2,4 +2,4 @@ optimum[openvino]==1.19.2 optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 einops==0.8.0 # For Qwen -transformers_stream_generator==0.0.4 # For Qwen +transformers_stream_generator==0.0.5 # For Qwen From 71a634d8288adc44b1e9315b86bb492514265213 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 10 May 2024 14:06:28 +0400 Subject: [PATCH 10/22] Merged releases/2024/1 --- image_generation/lcm_dreamshaper_v7/cpp/requirements.txt | 2 +- image_generation/stable_diffusion_1_5/cpp/requirements.txt | 2 +- thirdparty/openvino_tokenizers | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt index 7ffbb92137..208daf8f65 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch==2.2.2+cpu diffusers==0.27.2 -optimum-intel[nncf,openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 5e6bfe0372..0c3834717b 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[nncf,openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 huggingface_hub[cli]==0.22.2 diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 37d20ce209..c754503462 160000 --- 
a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 37d20ce209b120f6ffd450484e207ef71f8c8d03 +Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1 From 0441e27386b2fb790227d919e152ec7e74021577 Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Fri, 10 May 2024 14:55:16 +0300 Subject: [PATCH 11/22] Remove hardcoded constants in SD sample (#421) --- .../stable_diffusion_1_5/cpp/src/main.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index 3d6c8a7994..da01610f69 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -17,8 +17,6 @@ #include "scheduler_lms_discrete.hpp" const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' -const int64_t UNET_IN_CHANNELS = 4; // 'in_channels' parameter from 'unet/config.json' -const int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' const size_t VAE_SCALE_FACTOR = 8; class Timer { @@ -35,8 +33,8 @@ class Timer { } }; -ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); +ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { + ov::Tensor noise(ov::element::f32, shape); if (use_np_latents) { // read np generated latents with defaut seed 42 const char* latent_file_name = "../np_latents_512x512.txt"; @@ -111,7 +109,7 @@ void reshape_unet_encoder(std::shared_ptr model, if (input_name == "timestep") { name_to_shape[input_name][0] = 1; } else if (input_name == "sample") { - name_to_shape[input_name] = {batch_size, UNET_IN_CHANNELS, height, width}; + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; } else if (input_name == "time_ids") { name_to_shape[input_name][0] = batch_size; } else { @@ -127,7 +125,8 @@ void reshape_vae_decoder(std::shared_ptr model, int64_t height, int64 height = height / VAE_SCALE_FACTOR; width = width / VAE_SCALE_FACTOR; - std::map idx_to_shape{{0, {1, VAE_DECODER_LATENT_CHANNELS, height, width}}}; + ov::PartialShape input_shape = model->input(0).get_partial_shape(); + std::map idx_to_shape{{0, {1, input_shape[1], height, width}}}; model->reshape(idx_to_shape); } @@ -397,10 +396,13 @@ int32_t main(int32_t argc, char* argv[]) try { for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? 
user_seed : user_seed + n; - ov::Tensor noise = randn_tensor(height, width, read_np_latent, seed); + + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); // latents are multiplied by 'init_noise_sigma' - ov::Shape latent_shape = noise.get_shape(), latent_model_input_shape = latent_shape; + ov::Shape latent_shape = ov::Shape({batch_size, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + ov::Shape latent_model_input_shape = latent_shape; + ov::Tensor noise = randn_tensor(latent_shape, read_np_latent, seed); latent_model_input_shape[0] = 2; // Unet accepts batch 2 ov::Tensor latent(ov::element::f32, latent_shape), latent_model_input(ov::element::f32, latent_model_input_shape); From 8f2ce8de23ab122503ecf38b2b0916a972817b08 Mon Sep 17 00:00:00 2001 From: Chen Peter Date: Fri, 10 May 2024 22:25:12 +0800 Subject: [PATCH 12/22] openvino>=2024.1.0 on master (#434) --- llm_bench/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index f2d406c230..87224e5d85 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy -openvino~=2024.1.0 +openvino>=2024.1.0 auto-gptq>=0.5.1 # for gptq pillow torch From f411e6180b131136e956c281c9b4bc92bfa26a37 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 10 May 2024 17:39:34 +0400 Subject: [PATCH 13/22] Add gitignore --- image_generation/lcm_dreamshaper_v7/cpp/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 image_generation/lcm_dreamshaper_v7/cpp/.gitignore diff --git a/image_generation/lcm_dreamshaper_v7/cpp/.gitignore b/image_generation/lcm_dreamshaper_v7/cpp/.gitignore new file mode 100644 index 0000000000..cf7dbce266 --- /dev/null +++ b/image_generation/lcm_dreamshaper_v7/cpp/.gitignore @@ -0,0 +1,2 @@ +images +models From b9eb93f5d6729ecb363f74ac22c8287c5521bd29 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 10 May 2024 17:40:32 +0400 Subject: [PATCH 14/22] Remove unet in channels constant --- .../lcm_dreamshaper_v7/cpp/src/main.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 546bd170ba..076bbbe55f 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -22,13 +22,12 @@ #include "imwrite.hpp" const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' -const int64_t UNET_IN_CHANNELS = 4; // 'in_channels' parameter from 'unet/config.json' const int64_t UNET_TIME_COND_PROJ_DIM = 256; // 'time_cond_proj_dim' parameter from 'unet/config.json' const int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' const size_t VAE_SCALE_FACTOR = 8; -ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); +ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { + ov::Tensor noise(ov::element::f32, shape); if (use_np_latents) { // read np generated latents with defaut seed 42 const char * latent_file_name = "./latents/np_latents_512x512.txt"; @@ -90,7 +89,7 @@ void reshape_unet(std::shared_ptr 
model, if (input_name == "timestep") { name_to_shape[input_name][0] = 1; } else if (input_name == "sample") { - name_to_shape[input_name] = {batch_size, UNET_IN_CHANNELS, height, width}; + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; } else if (input_name == "time_ids") { name_to_shape[input_name][0] = batch_size; } else if (input_name == "timestep_cond") { @@ -353,10 +352,14 @@ int32_t main(int32_t argc, char* argv[]) try { float guidance_scale = 8.0; ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, UNET_TIME_COND_PROJ_DIM); - ov::Tensor denoised(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); + ov::Shape latent_model_input_shape = ov::Shape({1, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + + ov::Tensor denoised(ov::element::f32, latent_model_input_shape); + for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? user_seed: user_seed + n; - ov::Tensor latent_model_input = randn_tensor(height, width, read_np_latent, seed); + ov::Tensor latent_model_input = randn_tensor(latent_model_input_shape, read_np_latent, seed); for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); From f98391fbce44a353f92ca061a442d45ce2c536d7 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 10 May 2024 17:49:03 +0400 Subject: [PATCH 15/22] Remove unet_time_cond_proj_dim constant --- image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 076bbbe55f..3758dc3e3b 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -22,7 +22,6 @@ #include "imwrite.hpp" const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' -const int64_t UNET_TIME_COND_PROJ_DIM = 256; // 'time_cond_proj_dim' parameter from 'unet/config.json' const int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' const size_t VAE_SCALE_FACTOR = 8; @@ -93,7 +92,7 @@ void reshape_unet(std::shared_ptr model, } else if (input_name == "time_ids") { name_to_shape[input_name][0] = batch_size; } else if (input_name == "timestep_cond") { - name_to_shape[input_name] = {batch_size, UNET_TIME_COND_PROJ_DIM}; + name_to_shape[input_name][0] = batch_size; } else { name_to_shape[input_name][0] = batch_size; name_to_shape[input_name][1] = TOKENIZER_MODEL_MAX_LENGTH; @@ -350,13 +349,14 @@ int32_t main(int32_t argc, char* argv[]) try { std::vector timesteps = scheduler->get_timesteps(); float guidance_scale = 8.0; - ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, UNET_TIME_COND_PROJ_DIM); + const size_t unet_time_cond_proj_dim = static_cast(models.unet.input("timestep_cond").get_partial_shape()[1].get_length()); + ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, unet_time_cond_proj_dim); const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); ov::Shape latent_model_input_shape = ov::Shape({1, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); ov::Tensor denoised(ov::element::f32, latent_model_input_shape); 
- + for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? user_seed: user_seed + n; ov::Tensor latent_model_input = randn_tensor(latent_model_input_shape, read_np_latent, seed); From c8c259bf57c164ddb4e1d9636537f41ef3d1b97e Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 10 May 2024 17:53:14 +0400 Subject: [PATCH 16/22] Remove vae_decoder_latent_channels constant --- image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 3758dc3e3b..0b06d22067 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -22,7 +22,6 @@ #include "imwrite.hpp" const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' -const int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' const size_t VAE_SCALE_FACTOR = 8; ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { @@ -105,8 +104,8 @@ void reshape_unet(std::shared_ptr model, void reshape_vae_decoder(std::shared_ptr model, int64_t height, int64_t width) { height = height / VAE_SCALE_FACTOR; width = width / VAE_SCALE_FACTOR; - - std::map idx_to_shape{{0, {1, VAE_DECODER_LATENT_CHANNELS, height, width}}}; + ov::Dimension vae_decoder_latent_channels = model->input(0).get_partial_shape()[1]; + std::map idx_to_shape{{0, {1, vae_decoder_latent_channels, height, width}}}; model->reshape(idx_to_shape); } From c799d4baf95f95cae148e5fbe1b6a5a6d73501db Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 10 May 2024 18:02:58 +0400 Subject: [PATCH 17/22] Format options, fix default path to model --- .../stable_diffusion_1_5/cpp/src/main.cpp | 56 ++++++------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index da01610f69..c51e509494 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -215,7 +215,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, s tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); @@ -285,44 +285,22 @@ ov::Tensor postprocess_image(ov::Tensor decoded_image) { int32_t main(int32_t argc, char* argv[]) try { cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in C++ using OpenVINO\n"); - options.add_options()( - "p,posPrompt", - "Initial positive prompt for SD ", - cxxopts::value()->default_value( - "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"))( - "n,negPrompt", - "Defaut is empty with space", - cxxopts::value()->default_value(" "))( - "d,device", - "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device " - "only", - cxxopts::value()->default_value( - "CPU"))("step", "Number of diffusion steps", 
cxxopts::value()->default_value("20"))( - "s,seed", - "Number of random seed to generate latent for one image output", - cxxopts::value()->default_value( - "42"))("num", "Number of image output", cxxopts::value()->default_value("1"))( - "height", - "Destination image height", - cxxopts::value()->default_value( - "512"))("width", "Destination image width", cxxopts::value()->default_value("512"))( - "c,useCache", - "Use model caching", - cxxopts::value()->default_value("false"))("r,readNPLatent", - "Read numpy generated latents from file", - cxxopts::value()->default_value("false"))( - "m,modelPath", - "Specify path of SD model IRs", - cxxopts::value()->default_value("../models/dreamlike_anime_1_0_ov"))( - "t,type", - "Specify the type of SD model IRs (FP32, FP16 or INT8)", - cxxopts::value()->default_value("FP16"))("dynamic", - "Specify the model input shape to use dynamic shape", - cxxopts::value()->default_value("false"))( - "l,loraPath", - "Specify path of LoRA file. (*.safetensors).", - cxxopts::value()->default_value( - ""))("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75"))("h,help", "Print usage"); + options.add_options() + ("p,posPrompt", "Initial positive prompt for SD ", cxxopts::value()->default_value("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")) + ("n,negPrompt", "Defaut is empty with space", cxxopts::value()->default_value(" ")) + ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) + ("step", "Number of diffusion steps", cxxopts::value()->default_value("20")) + ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) + ("num", "Number of image output", cxxopts::value()->default_value("1")) + ("height", "Destination image height", cxxopts::value()->default_value("512")) + ("width", "Destination image width", cxxopts::value()->default_value("512")) + ("c,useCache", "Use model caching", cxxopts::value()->default_value("false")) + ("r,readNPLatent", "Read numpy generated latents from file", cxxopts::value()->default_value("false")) + ("m,modelPath", "Specify path of SD model IRs", cxxopts::value()->default_value("./models/dreamlike_anime_1_0_ov")) + ("t,type", "Specify the type of SD model IRs (FP32, FP16 or INT8)", cxxopts::value()->default_value("FP16")) + ("dynamic", "Specify the model input shape to use dynamic shape", cxxopts::value()->default_value("false")) + ("l,loraPath", "Specify path of LoRA file. 
(*.safetensors).", cxxopts::value()->default_value("")) + ("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75"))("h,help", "Print usage"); cxxopts::ParseResult result; try { From 4196b0bb2ec8c15167ebf1b2c6b5f0aebd1929b2 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Mon, 13 May 2024 10:25:10 +0400 Subject: [PATCH 18/22] Fix type --- image_generation/stable_diffusion_1_5/cpp/src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index c51e509494..7f9f9afc3b 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -215,7 +215,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, s tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); From 07193b62f0a39a53dd3c5dab106960e7125123b8 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 13 May 2024 15:09:24 +0200 Subject: [PATCH 19/22] Improve trim tensor implementation (#423) Ticket: 140109 --- .github/workflows/causal_lm_cpp.yml | 10 +++++++ text_generation/causal_lm/cpp/CMakeLists.txt | 4 +++ text_generation/causal_lm/cpp/README.md | 9 ++++++ .../cpp/prompt_lookup_decoding_lm.cpp | 10 ++++--- .../causal_lm/cpp/speculative_decoding_lm.cpp | 30 ++++++++----------- 5 files changed, 41 insertions(+), 22 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 52f8656344..df03bab7c6 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -30,6 +30,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -57,6 +58,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -225,6 +227,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -252,6 +255,7 @@ jobs: source ./ov/setupvars.sh 
python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -279,6 +283,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -306,6 +311,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -333,6 +339,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ @@ -371,6 +378,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j @@ -415,6 +423,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -459,6 +468,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt 
b/text_generation/causal_lm/cpp/CMakeLists.txt index eb4cab5048..6da39c6abe 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -28,6 +28,8 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") @@ -36,3 +38,5 @@ find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 21f3a066a4..08b91ab70e 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -55,6 +55,15 @@ This approach reduces the need for multiple infer requests to the main model, en Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. +## Install `libtbb-dev` on Linux + +> [!NOTE] +> `tbb` development files are installed with OpenVINO Archive on Windows and macOS. 
+ +```sh +sudo apt-get install libtbb-dev +``` + ## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` ### Linux/macOS diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp index f4a50e94bb..5060b88642 100644 --- a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include #include namespace { @@ -94,10 +95,11 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state : request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); } class PromptLookupCandidateGenerator { diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp index 4aefec14db..b0c40a7a9f 100644 --- a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include @@ -69,6 +70,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; size_t num_kv_heads = shape[1]; size_t old_seq_len = shape[2]; size_t head_size = shape[3]; @@ -82,31 +84,23 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (seq_len_axis == 0) { shape[0] = new_seq_len; tensor.set_shape(shape); + return tensor; } - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {BATCH_SIZE, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < BATCH_SIZE; ++batch) { - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + - new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + - old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); + return new_tensor; } void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state : request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + 
states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); } class AssistedCandidateGenerator { From ee0f75aab2f74427f7101dcbb81209aa2fbb3078 Mon Sep 17 00:00:00 2001 From: guozhong wang Date: Tue, 14 May 2024 09:36:41 +0800 Subject: [PATCH 20/22] Force to generate "inference count" tokens (#289) CVS-133717 1. if has option -ic, output token size is same as infer count 2. if without option -ic, output token size is generated by default according to the model. 3. remove the default output limit of 512 tokens. 4. if set env LOGLEVEL=DEBUG, will print latency of all tokens. examples: set env LOGLEVEL=DEBUG [bloomz-560m-without-ic.txt](https://github.com/openvinotoolkit/openvino.genai/files/15245407/bloomz-560m-without-ic.txt) [bloomz-560m-ic-1024.txt](https://github.com/openvinotoolkit/openvino.genai/files/15245409/bloomz-560m-ic-1024.txt) [llama-2-7b-chat-without-ic-.txt](https://github.com/openvinotoolkit/openvino.genai/files/15245412/llama-2-7b-chat-without-ic-.txt) [llama-2-7b-chat-without-ic-.txt](https://github.com/openvinotoolkit/openvino.genai/files/15245415/llama-2-7b-chat-without-ic-.txt) --------- Co-authored-by: Chen Peter --- llm_bench/python/benchmark.py | 37 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 6b39fc9360..a9a696c68b 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -34,7 +34,6 @@ DEFAULT_SUPER_RESOLUTION_STEPS = 50 DEFAULT_SUPER_RESOLUTION_WIDTH = 128 DEFAULT_SUPER_RESOLUTION_HEIGHT = 128 -DEFAULT_OUTPUT_TOKEN_SIZE = 512 MAX_OUTPUT_TOKEN_SIZE = 64 * 1024 mem_consumption = MemConsumption() @@ -88,22 +87,22 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Remove `token_type_ids` from inputs input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data input_token_size = input_tokens[0].numel() - - max_output_token_size = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] - max_output_token_size = MAX_OUTPUT_TOKEN_SIZE if max_output_token_size > MAX_OUTPUT_TOKEN_SIZE else max_output_token_size if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - out_str += 'all max_output_token_size: {} * {}'.format(max_output_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) log.info(out_str) max_rss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() + min_gen_tokens = 0 if args['infer_count'] is None else args['infer_count'] + max_gen_tokens = MAX_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] start = time.perf_counter() - result = model.generate(**input_data, max_new_tokens=int(max_output_token_size), num_beams=args['num_beams'], use_cache=True) + result = model.generate(**input_data, min_new_tokens=int(min_gen_tokens), max_new_tokens=int(max_gen_tokens), num_beams=args['num_beams'], use_cache=True) end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() @@ 
-124,7 +123,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, else: generated_text_len = len(result[bs_idx]) num_tokens += generated_text_len - if generated_text_len > max_output_token_size: + if generated_text_len > max_gen_tokens: log.error('Output token size is over max output token size!') result_text = generated_text[bs_idx] if args["output_dir"] is not None: @@ -133,10 +132,14 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, if num == 0: warmup_md5[prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = bench_hook.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tm_infer_list = bench_hook.get_time_infer_list() iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], - max_output_token_size, + len(tm_infer_list), num_tokens, generation_time, per_token_time, @@ -147,8 +150,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, tokenization_time=(tok_encode_time, tok_decode_time) ) iter_data_list.append(iter_data) - tm_list = bench_hook.get_time_list() - tm_infer_list = bench_hook.get_time_infer_list() utils.metrics_print.print_metrics( num, iter_data, @@ -412,6 +413,15 @@ def num_iters_type(x): return x +def num_infer_count_type(x): + x = int(x) + if x < 1: + raise argparse.ArgumentTypeError('Minimum input value is 1') + elif x > MAX_OUTPUT_TOKEN_SIZE: + raise argparse.ArgumentTypeError(f'Max input value is {MAX_OUTPUT_TOKEN_SIZE}') + return x + + def get_argprser(): parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError) @@ -425,9 +435,8 @@ def get_argprser(): '-ic', '--infer_count', default=None, - type=int, - help='limit the output token size ' - f'(default {DEFAULT_OUTPUT_TOKEN_SIZE}) of text_gen and code_gen models.', + type=num_infer_count_type, + help='set the output token size, the value must be greater than 0.' 
) parser.add_argument( '-n', @@ -501,7 +510,7 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout) args = get_argprser() model_path, framework, model_args, model_name = utils.model_utils.analyze_args(args) From 2bc9a7f0ad7b464ed5ce2167af767bfcc2817aa8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 14 May 2024 12:46:53 +0400 Subject: [PATCH 21/22] Migrate to official optimum-intel (#439) --- image_generation/lcm_dreamshaper_v7/cpp/requirements.txt | 2 +- image_generation/stable_diffusion_1_5/cpp/requirements.txt | 2 +- text_generation/causal_lm/cpp/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt index 208daf8f65..047e0d826f 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch==2.2.2+cpu diffusers==0.27.2 -optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 0c3834717b..29b40d70c4 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 huggingface_hub[cli]==0.22.2 diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt index bb43d745c8..e1c10930ad 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum[openvino]==1.19.2 -optimum-intel[openvino] @ git+https://github.com/apaniukov/optimum-intel.git@0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen From 234ad874af678d471e385e08824f1e17ecb4591a Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 14 May 2024 13:43:39 +0400 Subject: [PATCH 22/22] fix phi3 conversion (#440) --- llm_bench/python/convert.py | 9 ++++----- llm_bench/python/utils/ov_utils.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index 01312b4ebe..221752bcfc 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -972,7 +972,7 @@ def ts_patched_forward( remote_code = False pt_model = None try: - config = AutoConfig.from_pretrained(args.model_id) + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) except Exception: config = AutoConfig.from_pretrained(args.model_id, 
trust_remote_code=True) remote_code = True @@ -1215,14 +1215,13 @@ def convert_falcon(args): def convert_phi(args): trust_remote_code = False try: - config = AutoConfig.from_pretrained(args.model_id) + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) except Exception: config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) trust_remote_code = True cuda, post_init = patch_gptq(config) model_kwargs = {} - if trust_remote_code: - model_kwargs["trust_remote_code"] = trust_remote_code + model_kwargs["trust_remote_code"] = trust_remote_code precision = args.precision compression_only = ( args.compress_weights @@ -1238,7 +1237,7 @@ def convert_phi(args): if not compression_only: pt_model = AutoModelForCausalLM.from_pretrained( args.model_id, - config=AutoConfig.from_pretrained(args.model_id), + config=config, **model_kwargs, ) pt_model.config.use_cache = True diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index ed62498fc6..a2416ccb92 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -143,7 +143,7 @@ def create_text_gen_model(model_path, device, **kwargs): else: remote_code = False try: - model_config = AutoConfig.from_pretrained(model_path) + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) except Exception: model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) remote_code = True
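
Editor's note (not part of the patch series): the last patch settles on the same config-loading fallback in both `convert.py` and `ov_utils.py` — try `trust_remote_code=False` first and only trust the repository's own modeling code when that fails. A minimal standalone sketch of that pattern follows; the helper name `load_config` is illustrative and only `AutoConfig.from_pretrained` is taken from the Hugging Face transformers API used in the hunks above.

```python
from transformers import AutoConfig


def load_config(model_id: str):
    """Illustrative helper: return (config, remote_code) for a model id or local path."""
    try:
        # Most models ship a config that transformers can parse on its own.
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=False)
        return config, False
    except Exception:
        # Models with custom modeling code need trust_remote_code=True;
        # remember that so later loading calls can reuse the same setting.
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        return config, True
```

The returned flag is then forwarded as `trust_remote_code` to the subsequent `from_pretrained` calls, matching the `model_kwargs` handling in the `convert_phi` hunk above.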