diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2e9d72e26..4aad3d4bc 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -16,10 +16,10 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241205_x86_64.tgz
-  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz
+  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
 jobs:
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml
index 166284bd4..5f4634616 100644
--- a/.github/workflows/job_vlm_sample_llava.yml
+++ b/.github/workflows/job_vlm_sample_llava.yml
@@ -11,7 +11,7 @@ on:
         type: string
 
 env:
-  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
 
 jobs:
   visual_language_chat_sample-ubuntu-llava:
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
index 258184e9e..c525b0be6 100644
--- a/.github/workflows/lcm_dreamshaper_cpp.yml
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -18,8 +18,8 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz
-  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip
+  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz
+  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip
   OV_INSTALL_DIR: ${{ github.workspace }}/ov
 
 jobs:
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0bb0c1af6..6c94a907e 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -270,7 +270,7 @@ jobs:
           - name: 'Whisper'
             cmd: 'tests/python_tests/test_whisper_generate_api.py'
           - name: 'LLM & VLM'
-            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct
+            cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py'
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index 1999bafcf..56145c080 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -114,14 +114,14 @@ jobs:
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel
         run: |
           huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI
         run: |
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4
       - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
-          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4
           rm -rf ./ov_models/lcm_dreamshaper_v7/
       - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux
         run: |
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 7a4ee31be..a9af13bc6 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:
@@ -225,7 +225,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
 
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 649d678c0..f88bc4c6f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:
 
 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: master
   OV_TARBALL: ''
 
 jobs:
@@ -236,7 +236,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 
diff --git a/README.md b/README.md
index c2509528c..be3de5e8c 100644
--- a/README.md
+++ b/README.md
@@ -331,10 +331,14 @@ For more examples check out our [Generative AI workflow](https://docs.openvino.a
 
 NOTE: Whisper Pipeline requires preprocessing of audio input (to adjust sampling rate and normalize)
  
- ### Converting and compressing image generation model from Hugging Face library
+ ### Converting and quantizing speech-to-text model from Hugging Face library
 ```sh
 #Download and convert to OpenVINO whisper-base model
 optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base
+
+#Download, convert and apply int8 static quantization to whisper-base model
+optimum-cli export openvino --trust-remote-code --model openai/whisper-base \
+--quant-mode int8 --dataset librispeech --num-samples 32 whisper-base-int8
 ```
 
 ### Run generation using Whisper Pipeline API in Python
diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
index e69211002..8b48dbade 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
+++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp
@@ -22,14 +22,10 @@ int main(int argc, char* argv[]) try {
     
     std::string device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         model_path,
         device,
-        ov::genai::prompt_lookup(true),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::prompt_lookup(true));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
index ea99dd73f..283d39c61 100644
--- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
+++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp
@@ -26,14 +26,10 @@ int main(int argc, char* argv[]) try {
     // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft.
     std::string main_device = "CPU", draft_device = "CPU";
 
-    ov::genai::SchedulerConfig scheduler_config;
-    scheduler_config.cache_size = 5;
-
     ov::genai::LLMPipeline pipe(
         main_model_path,
         main_device,
-        ov::genai::draft_model(draft_model_path, draft_device),
-        ov::genai::scheduler_config(scheduler_config));
+        ov::genai::draft_model(draft_model_path, draft_device));
 
     auto streamer = [](std::string subword) {
         std::cout << subword << std::flush;
diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md
index 773135b64..d64926661 100644
--- a/samples/cpp/whisper_speech_recognition/README.md
+++ b/samples/cpp/whisper_speech_recognition/README.md
@@ -33,6 +33,91 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```c++
+#include "openvino/genai/whisper_pipeline.hpp"
+
+ov::genai::WhisperPipeline pipeline(model_dir, "CPU");
+// Pipeline expects normalized audio with Sample Rate of 16kHz
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech);
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech);
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio language is known in advance, it can be specified as an argument to the `generate` method:
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::language("<|en|>"));
+//  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav");
+result = pipeline.generate(raw_speech, ov::genai::language("<|fr|>"));
+//  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```c++
+ov::genai::RawSpeechInput raw_speech = read_wav("fr_sample.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::task("translate"));
+//  It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```C++
+ov::genai::RawSpeechInput raw_speech = read_wav("how_are_you_doing_today.wav");
+auto result = pipeline.generate(raw_speech, ov::genai::return_timestamps(true));
+
+std::cout << std::setprecision(2);
+for (auto& chunk : *result.chunks) {
+    std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
+}
+// timestamps: [0, 2] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. The Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length.
+The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```c++
+auto result = pipeline.generate(raw_speech);
+//  He has gone and gone for good answered Paul Icrom who...
+
+result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+//  He has gone and gone for good answered Polychrome who...
+```
+
+
 ### Troubleshooting
 
 #### Empty or rubbish output
diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
index 31d3f8c55..3df17a77f 100644
--- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
+++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp
@@ -28,6 +28,7 @@ int main(int argc, char* argv[]) try {
 
     std::cout << result << "\n";
 
+    std::cout << std::setprecision(2);
     for (auto& chunk : *result.chunks) {
         std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
     }
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
index 557897b6b..726391ba9 100755
--- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -18,11 +18,8 @@ def main():
     args = parser.parse_args()
 
     device = 'CPU'
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True)
     
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
index 612e59474..217b8a273 100755
--- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
+++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
@@ -25,13 +25,9 @@ def main():
     main_device = 'CPU'  # GPU can be used as well
     draft_device = 'CPU'
 
-    scheduler_config = openvino_genai.SchedulerConfig()
-    # cache params
-    scheduler_config.cache_size = 2
-
     draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
 
-    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model)
+    pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
     
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md
index 158bd1831..aeb46444b 100644
--- a/samples/python/whisper_speech_recognition/README.md
+++ b/samples/python/whisper_speech_recognition/README.md
@@ -40,6 +40,93 @@ timestamps: [0, 2] text:  How are you doing today?
 
 See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#whisper-models) for the list of supported models.
 
+# Whisper pipeline usage
+
+```python
+import openvino_genai
+import librosa
+
+def read_wav(filepath):
+    raw_speech, samplerate = librosa.load(filepath, sr=16000)
+    return raw_speech.tolist()
+
+pipe = openvino_genai.WhisperPipeline(model_dir, "CPU")
+# Pipeline expects normalized audio with Sample Rate of 16kHz
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+```
+
+### Transcription
+
+Whisper pipeline predicts the language of the source audio automatically.
+
+```python
+raw_speech = read_wav('how_are_you_doing_today.wav')
+result = pipe.generate(raw_speech)
+#  How are you doing today?
+
+raw_speech = read_wav('fr_sample.wav')
+result = pipe.generate(raw_speech)
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+If the source audio language is known in advance, it can be specified as an argument to the `generate` method:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, language="<|en|>")
+#  How are you doing today?
+
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, language="<|fr|>")
+#  Il s'agit d'une entité très complexe qui consiste...
+```
+
+### Translation
+
+By default, Whisper performs the task of speech transcription, where the source audio language is the same as the target text language. To perform speech translation, where the target text is in English, set the task to "translate":
+
+```python
+raw_speech = read_wav("fr_sample.wav")
+result = pipe.generate(raw_speech, task="translate")
+# It is a very complex entity that consists...
+```
+
+### Timestamps prediction
+
+The model can predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:
+
+```python
+raw_speech = read_wav("how_are_you_doing_today.wav")
+result = pipe.generate(raw_speech, return_timestamps=True)
+
+for chunk in result.chunks:
+    print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
+# timestamps: [0.00, 2.00] text:  How are you doing today?
+```
+
+### Long-Form audio Transcription
+
+The Whisper model is designed to work on audio samples of up to 30s in duration. The Whisper pipeline uses a sequential chunking algorithm to transcribe audio samples of arbitrary length.
+The sequential chunking algorithm uses a "sliding window", transcribing 30-second slices one after the other.
+
+### Initial prompt and hotwords
+
+Whisper pipeline has `initial_prompt` and `hotwords` generate arguments:
+* `initial_prompt`: initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window
+* `hotwords`: hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows
+
+The Whisper model can use that context to better understand the speech and maintain a consistent writing style. However, prompts do not need to be genuine transcripts from prior audio segments. Such prompts can be used to steer the model to use particular spellings or styles:
+
+```python
+result = pipe.generate(raw_speech)
+#  He has gone and gone for good answered Paul Icrom who...
+
+result = pipe.generate(raw_speech, initial_prompt="Polychrome")
+#  He has gone and gone for good answered Polychrome who...
+```
+
 ### Troubleshooting
 
 #### Empty or rubbish output
diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
index 3fddfc8ff..9cf3be5fa 100755
--- a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
+++ b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py
@@ -18,7 +18,7 @@ def main():
     parser.add_argument("wav_file_path")
     args = parser.parse_args()
 
-    device = "CPU"  # GPU can be used as well
+    device = "CPU"  # GPU, NPU can be used as well
     pipe = openvino_genai.WhisperPipeline(args.model_dir, device)
 
     config = pipe.get_generation_config()
@@ -34,8 +34,9 @@ def main():
 
     print(result)
 
-    for chunk in result.chunks:
-        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
+    if result.chunks:
+        for chunk in result.chunks:
+            print(f"timestamps: [{chunk.start_ts:.2f}, {chunk.end_ts:.2f}] text: {chunk.text}")
 
 
 if "__main__" == __name__:
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index b8b222e34..4ea75e94c 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  * @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
  *                 Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
  *
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
+ * @param presence_penalty reduces absolute log prob if the token was generated at least once.
+ * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
+ *
  * Beam search specific parameters:
  * @param num_beams number of beams for beam search. 1 disables beam search.
  * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
  *        "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
  *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  *
- * Random sampling parameters:
+ * Random (or multinomial) sampling parameters:
+ * @param do_sample whether or not to use multinomial random sampling.
  * @param temperature the value used to modulate token probabilities for random sampling.
  * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
- * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
- * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
- * @param presence_penalty reduces absolute log prob if the token was generated at least once.
- * @param frequency_penalty reduces absolute log prob as many times as the token was generated.
  * @param rng_seed initializes random generator.
+ * @param num_return_sequences the number of sequences to generate from a single prompt.
  *
  * Assisting generation parameters:
  * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update.
@@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     size_t min_new_tokens = 0;
     bool echo = false;
     size_t logprobs = 0;
-    
+
     std::set<std::string> stop_strings;
     // Default setting in vLLM (and OpenAI API) is not to include stop string in the output
     bool include_stop_str_in_output = false;
diff --git a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
index 21c266aa5..25c5e07a2 100644
--- a/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/scheduler.hpp
@@ -19,7 +19,8 @@ class OPENVINO_GENAI_EXPORTS Scheduler {
         DDIM,
         EULER_DISCRETE,
         FLOW_MATCH_EULER_DISCRETE,
-        PNDM
+        PNDM,
+        EULER_ANCESTRAL_DISCRETE
     };
 
     static std::shared_ptr<Scheduler> from_config(const std::filesystem::path& scheduler_config_path,
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 38fc0aaf8..548e4dc33 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -36,9 +36,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights
-     * 
-     * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. 
-     * When this constructor is used bos, eos, pad token ids are expected to be in IR. 
+     *
+     * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory.
+     * When this constructor is used bos, eos, pad token ids are expected to be in IR.
      * If an IR is older (< 2024.3) then this tokens are default initialized to be ignored.
      * @param tokenizer_model_str tokenizer model string
      * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights
@@ -55,9 +55,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     );
 
     /**
-     * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. 
-     * 
-     * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's 
+     * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights.
+     *
+     * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's
      * tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids
      * are expected to be in IR. If an IR is older (< 2024.3) then this tokens are default initialized to be ignored.
      * @param model_str model string
@@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
         ov::Tensor& detokenizer_weights_tensor,
         Properties&&... properties
         ) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-    
+
     /**
      * @brief ov::genai::Tokenizer constructor with variable number of properties
      * @param model_str model string
@@ -93,7 +93,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor,
               Properties&&... properties)
         : Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward<Properties>(properties)...}) { }
-    
+
     /**
      * @brief ov::genai::Tokenizer constructor with variable number of properties
      * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
@@ -111,7 +111,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     * @return pair of [input_ids, attention_mask]
     */
     TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});
-    
+
     /**
     * @brief encode batch of prompts. Left padding will be applied by default
     * @param prompts vector storing batch of prompts
@@ -127,7 +127,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     * @param prompt std::string with input prompt
     * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
     * @return pair of [input_ids, attention_mask]
-    */    
+    */
     template <typename... Properties>
     util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) {
         return encode(prompt, AnyMap{std::forward<Properties>(properties)...});
@@ -164,7 +164,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     }
 
     /**
-    * @brief decode tokens. 
+    * @brief decode tokens.
     * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len]
     * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size = batch_size
@@ -183,7 +183,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     }
 
     /**
-    * @brief batched decoding of tokens. 
+    * @brief batched decoding of tokens.
     * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size
     * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false}
     * @return vector of std::string, with size equal to batch_size
@@ -203,8 +203,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief Embeds input prompts with special tags for a chat scenario.
-     * 
-     * For example, for Qwen family models, the prompt "1+1=" would be transformed into 
+     *
+     * For example, for Qwen family models, the prompt "1+1=" would be transformed into
      * <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n.
      *
      * @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...].
@@ -214,7 +214,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
      * @throws Exception if the chat template was unable to parse the input history.
      */
     std::string apply_chat_template(ChatHistory history,
-                                    bool add_generation_prompt, 
+                                    bool add_generation_prompt,
                                     const std::string& chat_template = {}) const;
 
     /// @brief Override a chat_template read from tokenizer_config.json.
diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp
index 37b23cde7..44d611923 100644
--- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp
+++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include <optional>
 #include <filesystem>
+#include <optional>
 
 #include "openvino/genai/tokenizer.hpp"
 #include "openvino/runtime/compiled_model.hpp"
@@ -46,6 +46,9 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Transcribe token id.
     int64_t transcribe_token_id = 50359;
 
+    // Corresponds to the "<|startofprev|>" token.
+    int64_t prev_sot_token_id = 50361;
+
     // No timestamps token id.
     int64_t no_timestamps_token_id = 50363;
 
@@ -75,6 +78,32 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Note that a segment of text refers to a sequence of one or more words, rather than individual words.
     bool return_timestamps = false;
 
+    /*
+     * Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+     * window. Can be used to steer the model to use particular spellings or styles.
+     *
+     * Example:
+     *  auto result = pipeline.generate(raw_speech);
+     *  //  He has gone and gone for good answered Paul Icrom who...
+     *
+     *  auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+     *  //  He has gone and gone for good answered Polychrome who...
+     */
+    std::optional<std::string> initial_prompt = std::nullopt;
+
+    /*
+     * Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows.
+     * Can be used to steer the model to use particular spellings or styles.
+     *
+     * Example:
+     *  auto result = pipeline.generate(raw_speech);
+     *  //  He has gone and gone for good answered Paul Icrom who...
+     *
+     *  auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+     *  //  He has gone and gone for good answered Polychrome who...
+     */
+    std::optional<std::string> hotwords = std::nullopt;
+
     // A list containing tokens that will be suppressed at the beginning of the sampling process.
     std::vector<int64_t> begin_suppress_tokens;
 
@@ -111,9 +140,12 @@ static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
 static constexpr ov::Property<int64_t> transcribe_token_id{"transcribe_token_id"};
 static constexpr ov::Property<int64_t> translate_token_id{"translate_token_id"};
 static constexpr ov::Property<int64_t> no_timestamps_token_id{"no_timestamps_token_id"};
+static constexpr ov::Property<int64_t> prev_sot_token_id{"prev_sot_token_id"};
 static constexpr ov::Property<std::string> language{"language"};
 static constexpr ov::Property<std::string> task{"task"};
 static constexpr ov::Property<bool> return_timestamps{"return_timestamps"};
+static constexpr ov::Property<std::string> initial_prompt{"initial_prompt"};
+static constexpr ov::Property<std::string> hotwords{"hotwords"};
 static constexpr ov::Property<std::map<std::string, int64_t>> lang_to_id{"lang_to_id"};
 
 }  // namespace genai
diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp
index dc82897dc..4ca263777 100644
--- a/src/cpp/src/block_manager.hpp
+++ b/src/cpp/src/block_manager.hpp
@@ -205,14 +205,20 @@ class BlockAllocator {
      * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache.
      */
     BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) :
-            m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
+            m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) {
         OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
         m_free_blocks.resize(m_num_layers);
-        for (auto& per_layer_block_list : m_free_blocks) {
-            for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) {
-                per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+        if (num_blocks > 0) {
+            m_free_blocks_num = std::vector<size_t>(num_layers, num_blocks);
+            for (auto& per_layer_block_list : m_free_blocks) {
+                for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) {
+                    per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+                }
             }
         }
+        else {
+            m_free_blocks_num = std::vector<size_t>(m_num_layers, 0);
+        }
     }
 
     ~BlockAllocator() {
@@ -220,6 +226,21 @@ class BlockAllocator {
         // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size());
     }
 
+    /**
+     * Grows the allocator to new_kv_blocks_count blocks per layer (must exceed the current total); new blocks start free.
+     */
+    void increase_kv_blocks_number(size_t new_kv_blocks_count) {
+        OPENVINO_ASSERT(new_kv_blocks_count > m_total_num_blocks, "New blocks number should be more than previous blocks number.");
+        const size_t added_blocks = new_kv_blocks_count - m_total_num_blocks;
+        for (auto& free_blocks_num : m_free_blocks_num) free_blocks_num += added_blocks;  // every layer gains the same amount
+        for (auto& per_layer_block_list : m_free_blocks) {
+            for (int block_id = m_total_num_blocks; block_id < new_kv_blocks_count; ++block_id) {  // ids continue after the old range
+                per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id));
+            }
+        }
+        m_total_num_blocks = new_kv_blocks_count;
+    }
+
     /**
      * Returns the number of free blocks for a given layer.
      * @param layer_idx Index of the layer.
@@ -459,6 +480,13 @@ class BlockAllocator {
         for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) sum += num_free_blocks(layer_idx);
         return static_cast<float>(m_num_layers * m_total_num_blocks - sum) / (m_num_layers * m_total_num_blocks) * 100;
     }
+
+    /**
+     * @return The total number of KV blocks.
+     */
+    size_t get_total_number_of_kv_blocks() const {
+        return m_total_num_blocks;
+    }
 };
 
 /**
@@ -713,6 +741,21 @@ class BlockManager {
         return m_allocator.get_used_percentage();
     }
 
+    /**
+     * Increases the number of KV blocks.
+     * @param num_blocks The new number of KV-blocks.
+     */
+    void increase_kv_blocks_number(size_t num_blocks) {
+        m_allocator.increase_kv_blocks_number(num_blocks);
+    }
+
+    /**
+     * @return The total number of KV blocks.
+     */
+    size_t get_total_number_of_kv_blocks() const {
+        return m_allocator.get_total_number_of_kv_blocks();
+    }
+
     /**
      * @brief Forks a sequence, establishing a new sequence from an existing one, reusing
      * currently allocated blocks of the existing sequence.
diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
index a7444555a..0c04823f4 100644
--- a/src/cpp/src/cache_manager.hpp
+++ b/src/cpp/src/cache_manager.hpp
@@ -15,38 +15,118 @@ class CacheManager {
     DeviceConfig m_device_config;
     std::vector<ov::Tensor> m_key_cache;
     std::vector<ov::Tensor> m_value_cache;
+    size_t m_num_allocated_kv_blocks = 0;
     ov::Core m_core;
+    ov::InferRequest m_request;
+
+    ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) {
+        ov::PartialShape res_shape = shape;
+        res_shape[0] = dim;
+        OPENVINO_ASSERT(res_shape.is_static());
+        return res_shape.to_shape();
+    }
+
+    void update_request_tensor(size_t decoder_layer_id) {
+        m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]);
+        m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]);
+    }
 
 public:
-    explicit CacheManager(const DeviceConfig &device_config, ov::Core core) :
+    explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) :
             m_device_config(device_config),
+            m_request(request),
             m_core(core) {
         m_key_cache.reserve(m_device_config.get_num_layers());
         m_value_cache.reserve(m_device_config.get_num_layers());
+    }
+
+    void allocate_cache_if_needed(size_t num_kv_blocks) {
+        if (m_num_allocated_kv_blocks >= num_kv_blocks) {
+            return;
+        }
+        OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
+        m_num_allocated_kv_blocks = num_kv_blocks;
+        ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks);
+        ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks);
+
+        const std::string device_name = m_device_config.get_device();
+
+        ov::Coordinate start_key{0,0,0,0};
+        ov::Coordinate start_value{0,0,0,0};
 
-        const std::string device_name = device_config.get_device();
         if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
-                ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape());
-                ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape());
+                ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
+                ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);
+
+                auto key_cache_roi_end = static_cast<unsigned char*>(key_cache.data());
+                auto value_cache_roi_end = static_cast<unsigned char*>(value_cache.data());
+                size_t key_roi_size_byte = 0;
+                size_t value_roi_size_byte = 0;
+
+                if (m_key_cache.size() > decoder_layer_id) {
+                    ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape();
+                    ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape();
+
+                    key_roi_size_byte = m_key_cache[decoder_layer_id].get_byte_size();
+                    value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size();
+                    key_cache_roi_end = static_cast<unsigned char*>(key_cache.data()) + key_roi_size_byte;
+                    value_cache_roi_end = static_cast<unsigned char*>(value_cache.data()) + value_roi_size_byte;
+                    
+                    // copy current cache data
+                    ov::Tensor dst_key_roi(key_cache, start_key, end_key);
+                    ov::Tensor dst_value_roi(value_cache, start_value, end_value);
+
+                    m_key_cache[decoder_layer_id].copy_to(dst_key_roi);
+                    m_value_cache[decoder_layer_id].copy_to(dst_value_roi);
+
+                }
 
-                // force allocation
-                std::memset(key_cache.data(), 0, key_cache.get_byte_size());
-                std::memset(value_cache.data(), 0, value_cache.get_byte_size());
+                // Some optimizations like AVX2, AVX512, AMX require a minimal shape and
+                // perform multiplying by zero on the excess data. Uninitialized tensor data may contain NaNs,
+                // and NaN * 0 yields NaN rather than zero.
+                // So we need to zero-fill all newly allocated tensor data.
+                std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte);
+                std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - value_roi_size_byte);
+                
+                // set new cache tensors
+                if (m_key_cache.size() > decoder_layer_id) {
+                    m_key_cache[decoder_layer_id] = key_cache;
+                    m_value_cache[decoder_layer_id] = value_cache;
+                }
+                else {
+                    m_key_cache.emplace_back(key_cache);
+                    m_value_cache.emplace_back(value_cache);
+                }
 
-                m_key_cache.emplace_back(key_cache);
-                m_value_cache.emplace_back(value_cache);
+                update_request_tensor(decoder_layer_id);
             }
         } else {
             auto remote_context = m_core.get_default_context(device_name);
             for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
-                ov::Tensor key_cache = remote_context.create_tensor(device_config.get_cache_precision(),
-                                                                    device_config.get_key_cache_shape());
-                ov::Tensor value_cache = remote_context.create_tensor(device_config.get_cache_precision(),
-                                                                      device_config.get_value_cache_shape());
-
-                m_key_cache.emplace_back(key_cache);
-                m_value_cache.emplace_back(value_cache);
+                ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
+                                                                    key_cache_shape);
+                ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
+                                                                      value_cache_shape);
+                
+                if (m_key_cache.size() > decoder_layer_id) {
+                    ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape();
+                    ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape();
+
+                    // copy current cache data
+                    ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key);
+                    ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value);
+                    dst_key_roi.copy_from(m_key_cache[decoder_layer_id]);
+                    dst_value_roi.copy_from(m_value_cache[decoder_layer_id]);
+
+                    m_key_cache[decoder_layer_id] = key_cache;
+                    m_value_cache[decoder_layer_id] = value_cache;
+                }
+                else {
+                    m_key_cache.emplace_back(key_cache);
+                    m_value_cache.emplace_back(value_cache);
+                }
+                update_request_tensor(decoder_layer_id);
             }
         }
     }
@@ -62,8 +142,8 @@ class CacheManager {
     }
 
     void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) {
-        ov::Shape key_shape = m_device_config.get_key_cache_shape();
-        ov::Shape value_shape = m_device_config.get_value_cache_shape();
+        ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks);
+        ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks);
 
         ov::Coordinate key_src_start_roi(key_shape.size(), 0);
         ov::Coordinate key_src_end_roi = key_shape;
@@ -98,5 +178,13 @@ class CacheManager {
             }
         }
     }
+
+    std::shared_ptr<Core> get_core() {
+        return std::make_shared<Core>(m_core);
+    }
+
+    std::shared_ptr<DeviceConfig> get_device_config() {
+        return std::make_shared<DeviceConfig>(m_device_config);
+    }
 };
 }
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 740d4c2b4..c810ee874 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -22,7 +22,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
     m_tokenizer = tokenizer;
     m_generation_config = generation_config;
     m_is_validation_mode_enabled = is_validation_mode_enabled;
-    
+
     ov::Core core;
 
     auto [core_properties, compile_properties] = utils::split_core_compile_config(properties);
@@ -53,11 +53,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
     ov::InferRequest infer_request = compiled_model.create_infer_request();
 
     // setup KV caches
-    m_cache_manager = std::make_shared<CacheManager>(device_config, core);
-    for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) {
-        infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id));
-        infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id));
-    }
+    m_cache_manager = std::make_shared<CacheManager>(device_config, infer_request, core);
 
     SchedulerConfig updated_config = scheduler_config;
     // update KV blocks number in scheduler config
@@ -71,8 +67,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
         // as it may lead to performance slowdown
         can_use_partial_preemption = false;
     }
-
-    m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption);
+    m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);
     // and finally create model runner
     bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
     m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
@@ -133,7 +128,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
     _pull_awaiting_requests();
 
     m_pipeline_metrics.requests = m_requests.size();
-
     Scheduler::Output scheduler_output;
     {
         static ManualTimer timer("scheduling");
@@ -255,18 +249,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         }
     }, streamer);
 
-    OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
-        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
-
-    std::vector<GenerationHandle> generations;
-    for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
-        OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
-        generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
-    }
-
-    std::vector<EncodedGenerationResult> results;
-    results.reserve(m_awaiting_requests.size());
-
     auto drop_requests = [&] () {
         for (const std::shared_ptr<ov::genai::SequenceGroup> request : m_requests) {
             for (const auto& sequence: request->get_sequences()) {
@@ -279,25 +261,40 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         m_requests.clear();
     };
 
+    OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && sampling_params[0].num_return_sequences == 1 &&
+        (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
+        "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
+
+    std::vector<GenerationHandle> generations;
+    for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
+        OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
+        generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id]));
+    }
+    auto all_requests = m_awaiting_requests; // we need to store all requests to get results from them once generation has finished
+
     bool continue_generation = true;
     while (has_non_finished_requests() && continue_generation) {
         try {
             step();
         } catch (...) {
-            drop_requests();
+            drop_requests(); // remove all requests from pipeline state in case of exception
             throw;
         }
-        if (streamer_ptr && generations.at(0)->can_read()) {
-            std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
+
+        auto & generation = generations.at(0);
+        if (streamer_ptr && generation->can_read()) {
+            std::unordered_map<uint64_t, GenerationOutput> token = generation->back();
             for (const auto& gen_token : token.begin()->second.generated_ids) {
-                if (!streamer_ptr->put(gen_token)) {
+                continue_generation = !streamer_ptr->put(gen_token);
+                if (!continue_generation) {
+                    generation->drop();
                     break;
                 }
             }
         }
     }
 
-    if (streamer_ptr) {
+    if (streamer_ptr) { // push streamer's cache
         streamer_ptr->end();
     }
 
@@ -307,16 +304,32 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
         OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed");
     }
 
-    for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) {
-        const auto& generation = generations[generation_idx];
+    std::vector<EncodedGenerationResult> results;
+    results.reserve(all_requests.size());
+
+    for (size_t request_id = 0; request_id < all_requests.size(); ++request_id) {
+        const auto& request = all_requests[request_id];
+        auto sampling_params = request->get_sampling_parameters();
+        const auto& sequences = request->get_finished_sequences();
+        size_t num_outputs = std::min(sampling_params.num_return_sequences, sequences.size());
+
         EncodedGenerationResult result;
-        result.m_request_id = 1;
-        std::vector<GenerationOutput> generation_outputs = generation->read_all();
-        for (const auto& generation_output : generation_outputs) {
-            result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
-            result.m_scores.push_back(generation_output.score);
+        result.m_request_id = request_id;
+        result.m_generation_ids.resize(num_outputs);
+        result.m_scores.resize(num_outputs);
+
+        for (size_t i = 0; i < num_outputs; ++i) {
+            const auto & sequence = sequences[i];
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+            const auto & generated_ids = sequence->get_generated_ids();
+
+            if (sampling_params.echo)
+                result.m_generation_ids[i] = request->get_prompt_ids();
+            std::copy(generated_ids.begin(), generated_ids.end(), std::back_inserter(result.m_generation_ids[i]));
+            result.m_scores[i] = score;
         }
-        result.m_status = generation->get_status();
+
+        result.m_status = generations[request_id]->get_status();
         results.push_back(std::move(result));
     }
 
@@ -416,7 +429,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
     for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
         SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
         // requests not scheduled, in decoding phase or not echoing are not processed
-        if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() || 
+        if (!sequence_group->is_scheduled() || sequence_group->get_context_len() > sequence_group->get_prompt_len() ||
             !sequence_group->get_sampling_parameters().echo)
             continue;
 
@@ -429,10 +442,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
 
         size_t num_prompt_tokens_processed = sequence_group->get_num_processed_tokens();
         OPENVINO_ASSERT(num_prompt_tokens_processed + actual_seq_len <= sequence_group->get_prompt_len());
-        
+
         // if we processed the whole prompt we don't include last logprob as it will be processed by the sampler (it's already completion)
-        // otherwise we include it as it will be used in the next part of the prompt 
-        int exclude_last_logprob = 1; 
+        // otherwise we include it as it will be used in the next part of the prompt
+        int exclude_last_logprob = 1;
         if (num_prompt_tokens_processed + actual_seq_len < sequence_group->get_prompt_len())
             exclude_last_logprob = 0;
 
@@ -443,7 +456,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
         for (int token_logits_offset = 0, token_id_offset = num_prompt_tokens_processed + 1;
              token_logits_offset < actual_seq_len - exclude_last_logprob;
              token_logits_offset++, token_id_offset++) {
-            
+
             const float* token_logits = (sequence_group_logits_data + token_logits_offset * vocab_size);
             int64_t token_id = sequence_group->get_prompt_ids()[token_id_offset];
             float token_logit = token_logits[token_id];
diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2af4559ef..371142701 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -12,7 +12,7 @@
 namespace ov::genai {
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
-    ov::Shape m_key_cache_shape, m_value_cache_shape;
+    ov::PartialShape m_key_cache_shape, m_value_cache_shape;
     ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers;
     size_t m_num_kv_blocks = 0;
     size_t m_block_size = 0;
@@ -80,11 +80,10 @@ class DeviceConfig {
             OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching");
         }
 
-        OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero.");
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
         }
-        else {
+        else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
@@ -104,23 +103,22 @@ class DeviceConfig {
                 m_head_size += 8;
         }
 
-        if (m_num_kv_blocks == 0) {
-            OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero.");
+        if (m_num_kv_blocks == 0 && m_cache_size > 0) {
             size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
             m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size());
         }
 
-        m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks,
-                                                            m_num_kv_heads,
-                                                            m_block_size,
-                                                            m_head_size};
+        m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
+                                                                   ov::Dimension(m_num_kv_heads),
+                                                                   ov::Dimension(m_block_size),
+                                                                   ov::Dimension(m_head_size)};
 
         if (m_device.find("GPU") != std::string::npos) {
             // Update key shape, as the key's shape is different from the value's shape
-            m_key_cache_shape = ov::Shape{m_num_kv_blocks,
-                                          m_num_kv_heads,
-                                          m_head_size,
-                                          m_block_size};
+            m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(),
+                                                 ov::Dimension(m_num_kv_heads),
+                                                 ov::Dimension(m_head_size),
+                                                 ov::Dimension(m_block_size)};
         }
     }
 
@@ -136,13 +134,13 @@ class DeviceConfig {
         return m_num_decoder_layers;
     }
 
-    ov::Shape get_key_cache_shape() const {
-        OPENVINO_ASSERT(!m_key_cache_shape.empty());
+    ov::PartialShape get_key_cache_shape() const {
+        OPENVINO_ASSERT(m_key_cache_shape.size());
         return m_key_cache_shape;
     }
 
-    ov::Shape get_value_cache_shape() const {
-        OPENVINO_ASSERT(!m_value_cache_shape.empty());
+    ov::PartialShape get_value_cache_shape() const {
+        OPENVINO_ASSERT(m_value_cache_shape.size());
         return m_value_cache_shape;
     }
 
@@ -153,5 +151,9 @@ class DeviceConfig {
     size_t get_block_size() const {
         return m_block_size;
     }
+
+    size_t get_block_size_in_bytes() const {
+        return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size();
+    }
 };
 }
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 35ae92d60..4ff184547 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -185,6 +185,9 @@ void GenerationConfig::validate() const {
                     "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined.");
     if (is_beam_search()) {
         OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive");
+        if (num_beam_groups > 1) {
+            OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search");
+        }
     } else {
         OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]");
         OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]");
diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp
index a1dd46752..0f10a85a8 100644
--- a/src/cpp/src/generation_handle.cpp
+++ b/src/cpp/src/generation_handle.cpp
@@ -17,7 +17,7 @@ GenerationStatus GenerationHandleImpl::get_status() {
 }
 
 bool GenerationHandleImpl::can_read() {
-    return !is_dropped() &&  m_generation_stream->can_read();
+    return !is_dropped() && m_generation_stream->can_read();
 }
 
 bool GenerationHandleImpl::is_dropped() {
diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp
index 4d41f160e..518699ba3 100644
--- a/src/cpp/src/generation_stream.hpp
+++ b/src/cpp/src/generation_stream.hpp
@@ -14,8 +14,6 @@ class GenerationStream {
     GenerationStatus m_status = GenerationStatus::RUNNING;
     SynchronizedQueue<GenerationOutputs> m_output_queue;
 
-    std::vector<uint64_t> last_sequence_ids;
-
 public:
     using Ptr = std::shared_ptr<GenerationStream>;
 
@@ -30,10 +28,11 @@ class GenerationStream {
         m_output_queue.push(std::move(outputs));
     }
 
-    // Retrieving vector of pairs <sequence_id, token_id> as we can generate multiple outputs for a single prompt
+    // Retrieving vector of pairs <sequence_id, token_ids> as we can generate multiple outputs for a single prompt
     GenerationOutputs back() {
         return m_output_queue.back();
     }
+
     GenerationOutputs read() {
         return m_output_queue.pull();
     }
diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp
deleted file mode 100644
index a0262c0dc..000000000
--- a/src/cpp/src/group_beam_searcher.cpp
+++ /dev/null
@@ -1,455 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include <openvino/runtime/tensor.hpp>
-
-#include <cassert>
-
-#include "openvino/genai/llm_pipeline.hpp"
-#include "utils.hpp"
-#include "lm_encoding.hpp"
-
-namespace {
-
-// Modified Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurrence in haystack
-std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std::vector<int64_t>& needle) {
-    if (needle.empty()) {  // no_repeat_ngram_size == 1, ban every token
-        return {haystack.begin(), haystack.end()};
-    }
-    std::vector<int> partial_match_table(needle.size() + 1, -1);
-    int cnd = 0;
-    for (size_t pos = 1; pos < needle.size(); ++pos) {
-        if (needle.at(pos) == needle.at(size_t(cnd))) {
-            partial_match_table.at(pos) = partial_match_table.at(size_t(cnd));
-        } else {
-            partial_match_table.at(pos) = cnd;
-            while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) {
-                cnd = partial_match_table.at(size_t(cnd));
-            }
-        }
-        ++cnd;
-    }
-    partial_match_table.back() = cnd;
-    std::vector<int64_t> res;
-    size_t haystack_id = 0;
-    int needle_id = 0;
-    while (haystack_id < haystack.size() - 1) {
-        if (needle.at(size_t(needle_id)) == haystack.at(haystack_id)) {
-            ++haystack_id;
-            ++needle_id;
-            if (needle_id == int(needle.size())) {
-                res.push_back(haystack.at(haystack_id));
-                needle_id = partial_match_table.at(size_t(needle_id));
-            }
-        } else {
-            needle_id = partial_match_table.at(size_t(needle_id));
-            if (needle_id < 0) {
-                ++haystack_id;
-                ++needle_id;
-            }
-        }
-    }
-    return res;
-}
-
-struct Token {
-    float log_prob;
-    int64_t idx;
-};
-
-std::vector<Token> log_softmax(const ov::Tensor& logits, const size_t batch_idx) {
-    if (logits.get_shape().at(0) <= batch_idx) {
-        throw std::runtime_error("logits batch size doesn't match the number of beams");
-    }
-    size_t vocab_size = logits.get_shape().back();
-    size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size;
-    size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
-    const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
-    float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(
-        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
-            return accumulated + std::exp(to_add - max_logit);
-        }));
-    std::vector<Token> tokens;
-    tokens.reserve(vocab_size);
-    for (size_t idx = 0; idx < vocab_size; ++idx) {
-        tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)});
-    }
-    return tokens;
-}
-
-struct Beam {
-    float score = -std::numeric_limits<float>::infinity();  // The bigger, the better
-    std::vector<int64_t> tokens;
-    size_t global_beam_idx = 0;
-};
-
-bool greater(const Beam& left, const Beam& right) {
-    return left.score > right.score;
-}
-
-struct Parameters {
-    std::vector<std::vector<int64_t>> prompts;
-    int64_t eos_token_id;
-    size_t n_groups = 3;
-    size_t group_size = 5;
-    float diversity_penalty = 1.0;
-    size_t max_new_tokens = 20;
-    ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC;
-    float length_penalty = 1.0;
-    size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-
-    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-        return false;
-    };
-};
-
-struct Group {
-    std::vector<Beam> ongoing;   // Best beams in front
-    std::vector<Beam> min_heap;  // The worst of the best completed beams is the first
-    bool done = false;
-
-    void finish(Beam&& beam, const Parameters& parameters) {
-        beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty);
-
-        min_heap.push_back(std::move(beam));
-        std::push_heap(min_heap.begin(), min_heap.end(), greater);
-        if (min_heap.size() > parameters.group_size) {
-            std::pop_heap(min_heap.begin(), min_heap.end(), greater);
-            min_heap.pop_back();
-        }
-    }
-    void is_done(const Parameters& parameters) {
-        if (min_heap.size() < parameters.group_size) {
-            return;
-        }
-        size_t cur_len = ongoing.front().tokens.size();
-        float best_sum_logprobs = ongoing.front().score;
-        float worst_score = min_heap.front().score;
-        switch (parameters.stop_criteria) {
-        case ov::genai::StopCriteria::EARLY:
-            done = true;
-            return;
-        case ov::genai::StopCriteria::HEURISTIC: {
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        case ov::genai::StopCriteria::NEVER: {
-            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        default:
-            throw std::runtime_error("Never reached");
-        }
-    }
-};
-
-// GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search
-// algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
-// are used for next inference. select_next_tokens() returns empty, if all groups are completed
-struct GroupBeamSearcher {
-    Parameters parameters;
-    std::vector<std::vector<Group>> prompts_groups;
-
-    GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} {
-        if (parameters.no_repeat_ngram_size == 0) {
-            throw std::runtime_error("no_repeat_ngram_size must be positive");
-        }
-        for (std::vector<Group>& prompts_groups : prompts_groups) {
-            prompts_groups.resize(parameters.n_groups);
-            for (Group& group : prompts_groups) {
-                group.ongoing.resize(parameters.group_size);
-                group.ongoing.front().score = 0.0;
-            }
-        }
-    }
-
-    std::pair<std::vector<int64_t>, std::vector<int32_t>> select_next_tokens(const ov::Tensor& logits) {
-        std::vector<int64_t> next_tokens;
-        std::vector<int32_t> next_beams;
-
-        const size_t promts_size = parameters.prompts.size();
-
-        next_tokens.reserve(promts_size * parameters.n_groups * parameters.group_size);
-        next_beams.reserve(promts_size * parameters.n_groups * parameters.group_size);
-
-        size_t beam_count = 0;
-        size_t prompt_id = 0;
-        for (std::vector<Group>& groups : prompts_groups) {
-            for (Group& group : groups) {
-                if (group.done) {
-                    continue;
-                }
-                for (Beam& beam : group.ongoing) {
-                    // beam.tokens.empty() holds for the first select_next_tokens() call.
-                    // Every beam is constructed from the single batch at first call
-                    if (beam.tokens.empty()) {
-                        beam.global_beam_idx = prompt_id;
-                    } else {
-                        beam.global_beam_idx = beam_count;
-                        ++beam_count;
-                    }
-                }
-            }
-
-            prompt_id += 1;
-        }
-
-        for (int prompt_id = 0; prompt_id < promts_size; prompt_id++) {
-            const std::vector<int64_t> prompt = parameters.prompts[prompt_id];
-            std::vector<Group>& groups = prompts_groups[prompt_id];
-            auto [prompt_next_tokens, prompt_next_beams] = select_prompt_next_tokens(logits, prompt, groups);
-
-            next_tokens.insert(next_tokens.end(), prompt_next_tokens.begin(), prompt_next_tokens.end());
-            next_beams.insert(next_beams.end(), prompt_next_beams.begin(), prompt_next_beams.end());
-        }
-
-        return {next_tokens, next_beams};
-    }
-
-    std::pair<std::vector<int64_t>, std::vector<int32_t>> select_prompt_next_tokens(const ov::Tensor& logits,
-                                                                                    const std::vector<int64_t>& prompt,
-                                                                                    std::vector<Group>& groups) {
-        std::vector<int64_t> next_tokens;
-        std::vector<int32_t> next_beams;
-        next_tokens.reserve(parameters.n_groups * parameters.group_size);
-        next_beams.reserve(parameters.n_groups * parameters.group_size);
-
-        for (auto group = groups.begin(); group != groups.end(); ++group) {
-            if (group->done) {
-                continue;
-            }
-            std::vector<Beam> candidates;
-            candidates.reserve(parameters.group_size * 2 * parameters.group_size);
-            for (const Beam& beam : group->ongoing) {
-                std::vector<Token> tokens = log_softmax(logits, beam.global_beam_idx);
-                for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) {
-                    for (const Beam& prev_beam : prev_group->ongoing) {
-                        if (prev_beam.tokens.size() > beam.tokens.size()) {
-                            tokens.at(size_t(prev_beam.tokens.back())).log_prob -= parameters.diversity_penalty;
-                        }
-                    }
-                }
-                std::vector<int64_t> full_text{prompt};
-                full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end());
-                if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) {
-                    auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1;
-                    for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) {
-                        tokens.at(size_t(banned_token)).log_prob = -std::numeric_limits<float>::infinity();
-                    }
-                }
-                std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) {
-                    return left.log_prob > right.log_prob;  // Most probable tokens in front
-                });
-                size_t add_count = 0;
-                for (Token token : tokens) {
-                    Beam new_candidate = beam;
-                    new_candidate.score += token.log_prob;
-                    new_candidate.tokens.push_back(token.idx);
-                    if (parameters.early_finish(new_candidate)) {
-                        group->finish(std::move(new_candidate), parameters);
-                    } else {
-                        candidates.push_back(std::move(new_candidate));
-                        ++add_count;
-                        if (add_count == 2 * parameters.group_size) {
-                            break;
-                        }
-                    }
-                }
-            }
-            // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam
-            if (candidates.size() < 2 * parameters.group_size) {
-                throw std::runtime_error("No beams left to search");
-            }
-            auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size);
-            std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater);
-            group->ongoing.clear();
-            for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) {
-                if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) {
-                    // If beam_token does not belong to top num_beams tokens, it should not be added
-                    if (cand_idx >= parameters.group_size) {
-                        continue;
-                    }
-                    group->finish(std::move(candidates.at(cand_idx)), parameters);
-                } else {
-                    group->ongoing.push_back(std::move(candidates.at(cand_idx)));
-                    if (group->ongoing.size() == parameters.group_size) {
-                        break;
-                    }
-                }
-            }
-            group->is_done(parameters);
-            if (!group->done) {
-                for (const Beam& beam : group->ongoing) {
-                    next_tokens.push_back(beam.tokens.back());
-                    next_beams.push_back(int32_t(beam.global_beam_idx));
-                }
-            }
-        }
-        return {next_tokens, next_beams};
-    }
-};
-
-// Consume group_beam_searcher because beams are consumed
-std::vector<std::vector<std::vector<Beam>>> finalize(GroupBeamSearcher&& group_beam_searcher) {
-    std::vector<std::vector<std::vector<Beam>>> finalized;
-    finalized.resize(group_beam_searcher.prompts_groups.size());
-
-    for (size_t prompt_id = 0; prompt_id < group_beam_searcher.prompts_groups.size(); prompt_id++) {
-        std::vector<Group>& groups = group_beam_searcher.prompts_groups.at(prompt_id);
-        finalized.at(prompt_id).reserve(groups.size());
-
-        for (Group& group : groups) {
-            if (!group.done) {
-                for (Beam& beam : group.ongoing) {
-                    group.finish(std::move(beam), group_beam_searcher.parameters);
-                }
-            }
-            finalized.at(prompt_id).push_back(std::move(group.min_heap));
-        }
-    }
-
-    return finalized;
-}
-
-void reset_all_inputs_to_empty_tensors(ov::InferRequest& request) {
-    request.set_tensor("input_ids", ov::Tensor(ov::element::i64, {0, 0}));
-    request.set_tensor("beam_idx", ov::Tensor(ov::element::i32, {0}));
-    if (request.get_compiled_model().inputs().size() == 4)
-        request.set_tensor("position_ids", ov::Tensor(ov::element::i64, {0, 0}));
-}
-}  // namespace
-
-namespace ov {
-namespace genai {
-
-std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
-                           ov::Tensor input_ids,
-                           ov::Tensor attention_mask,
-                           GenerationConfig config, 
-                           std::optional<ov::Tensor> position_ids,
-                           std::optional<int32_t> selected_beam_idx) {
-    OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0,
-                    "number of beams should be divisible by number of groups");
-    
-    auto batch_size = input_ids.get_shape().at(0);
-    auto sequence_length = input_ids.get_shape().at(1);
-    
-    // Initialize beam search.
-    const int64_t* prompt_data = input_ids.data<const int64_t>();
-    std::vector<std::vector<int64_t>> prompts;
-    prompts.reserve(batch_size);
-    for (size_t batch = 0; batch < batch_size; batch++) {
-        size_t batch_offset = batch * sequence_length;
-        const int64_t* prompt_start = prompt_data + batch_offset;
-        prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length});
-    }
-
-    lm.set_tensor("input_ids", input_ids);
-    lm.set_tensor("attention_mask", attention_mask);
-    if (position_ids.has_value())
-        lm.set_tensor("position_ids", *position_ids);
-
-    ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
-    auto beam_data = beam_idx.data<int32_t>();
-    if (selected_beam_idx.has_value())
-        beam_data[0] = *selected_beam_idx;
-    else
-        std::fill_n(beam_data, batch_size, 0);
-    lm.set_tensor("beam_idx", beam_idx);
-
-    Parameters parameters{std::move(prompts)};
-    parameters.max_new_tokens = config.get_max_new_tokens(sequence_length);
-    parameters.eos_token_id = config.eos_token_id;
-    parameters.n_groups = config.num_beam_groups;
-    parameters.group_size = config.num_beams / config.num_beam_groups;
-    parameters.diversity_penalty = config.diversity_penalty;
-    parameters.length_penalty = config.length_penalty;
-    parameters.stop_criteria = config.stop_criteria;
-    parameters.no_repeat_ngram_size = config.no_repeat_ngram_size;
-    GroupBeamSearcher group_beam_searcher{parameters};
-
-    std::vector<int64_t> next_tokens;
-    std::vector<int32_t> next_beams;
-
-    // Reserve for performance counters.
-    std::vector<std::chrono::steady_clock::time_point> new_token_times;
-    std::vector<size_t> batch_sizes;
-    new_token_times.reserve(parameters.max_new_tokens);
-    batch_sizes.reserve(parameters.max_new_tokens);
-
-    for (size_t length_count = 0; ; ++length_count) {
-        lm.infer();
-
-        std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
-        new_token_times.emplace_back(std::chrono::steady_clock::now());
-        batch_sizes.emplace_back(batch_size);
-
-        if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) {
-            // Break the cycle before masks are extended in update_attention_mask_with_beams.
-            // If generation is continued, attention_mask length should be equal to KV cache size.
-            break;
-        }
-        
-        size_t running_batch_size = next_tokens.size();
-        // Set pointers
-        lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {running_batch_size, 1}, next_tokens.data()});
-        lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {running_batch_size}, next_beams.data()});
-
-        // Set auxiliary inputs
-        update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams);
-        if (position_ids.has_value())
-            update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask"));
-    }
-
-    reset_all_inputs_to_empty_tensors(lm);
-
-    auto scores_comparator = [](Beam& left, Beam& right) {
-        return (left.score > right.score);
-    };
-
-    auto result = finalize(std::move(group_beam_searcher));
-    ov::genai::EncodedResults results;
-    int32_t res_selected_beam_idx = 0;
-    results.scores.reserve(config.num_return_sequences * result.size());
-    results.tokens.reserve(config.num_return_sequences * result.size());
-    auto& raw_perf_counters = results.perf_metrics.raw_metrics;
-    raw_perf_counters.m_new_token_times = new_token_times;
-    raw_perf_counters.m_batch_sizes = batch_sizes;
-    
-    // align output with HF
-    for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) {
-        auto prompt_group = result.at(prompt_id);
-        std::vector<std::reference_wrapper<Beam>> plain_beams;
-        plain_beams.reserve(parameters.n_groups * parameters.group_size);
-        for (std::vector<Beam>& group : prompt_group) {
-            for (Beam& beam : group) {
-                plain_beams.push_back(beam);
-            }
-        }
-        assert(config.num_return_sequences <= plain_beams.size());
-        std::partial_sort(
-            plain_beams.begin(),
-            plain_beams.begin() + config.num_return_sequences,
-            plain_beams.end(),
-            scores_comparator
-        );
-        res_selected_beam_idx = plain_beams.at(0).get().global_beam_idx;
-        for (
-            auto beam = plain_beams.begin();
-            beam != plain_beams.begin() + config.num_return_sequences;
-            ++beam
-        ) {
-            results.scores.push_back(beam->get().score);
-            results.tokens.push_back(std::move(beam->get().tokens));
-        }
-    }
-    
-    return {results, res_selected_beam_idx};
-}
-
-}  // namespace genai
-}  // namespace ov
diff --git a/src/cpp/src/image_generation/models/clip_text_model.cpp b/src/cpp/src/image_generation/models/clip_text_model.cpp
index efbc840d4..72fdc6308 100644
--- a/src/cpp/src/image_generation/models/clip_text_model.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model.cpp
@@ -118,13 +118,20 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int32_t>());
+        const size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());  // never copy past the end of input_ids
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i32, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -141,7 +148,6 @@ ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string
                                                {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
index 982800a70..1160c30b6 100644
--- a/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
+++ b/src/cpp/src/image_generation/models/clip_text_model_with_projection.cpp
@@ -109,13 +109,20 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
     const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1;
 
     auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) {
-        std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
-
         ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids;
-        std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int64_t>());
+        const size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());  // never copy past the end of input_ids
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
-    ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings});
+    ov::Tensor input_ids = m_request.get_input_tensor();
+    input_ids.set_shape({text_embedding_batch_size, m_config.max_position_embeddings});
+
     size_t current_batch_idx = 0;
 
     if (do_classifier_free_guidance) {
@@ -132,7 +139,6 @@ ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, con
                                                {current_batch_idx + 1, m_config.max_position_embeddings}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/t5_encoder_model.cpp b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
index 21df456d4..a83697b2e 100644
--- a/src/cpp/src/image_generation/models/t5_encoder_model.cpp
+++ b/src/cpp/src/image_generation/models/t5_encoder_model.cpp
@@ -80,8 +80,13 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
         ov::Tensor input_ids_token = m_tokenizer.encode(prompt).input_ids;
         size_t min_length = std::min(input_ids.get_size(), input_ids_token.get_size());
 
-        std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
-        std::copy_n(input_ids_token.data<std::int64_t>(), min_length, input_ids.data<std::int32_t>());
+        if (input_ids.get_element_type() == ov::element::i32) {
+            std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int32_t>());
+        } else {
+            std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id);
+            std::copy_n(input_ids_token.data<int64_t>(), min_length, input_ids.data<int64_t>());
+        }
     };
 
     ov::Tensor input_ids = m_request.get_input_tensor();
@@ -114,7 +119,6 @@ ov::Tensor T5EncoderModel::infer(const std::string& pos_prompt, const std::strin
                                                {current_batch_idx + 1, input_ids.get_shape()[1]}));
 
     // text embeddings
-    m_request.set_tensor("input_ids", input_ids);
     m_request.infer();
 
     return m_request.get_output_tensor(0);
diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
index 6dc285f76..914fbcf50 100644
--- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
@@ -12,11 +12,8 @@ namespace genai {
 
 
 class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
-
 public:
-
-    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override
-    {
+    virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
         ov::Core core = utils::singleton_core();
 
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
@@ -24,20 +21,17 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
         m_request = compiled_model.create_infer_request();
     }
 
-    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override
-    {
+    virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         m_request.set_tensor(tensor_name, encoder_hidden_states);
     }
 
-    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override
-    {
+    virtual void set_adapters(AdapterController &adapter_controller, const AdapterConfig& adapters) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first");
         adapter_controller.apply(m_request, adapters);
     }
 
-    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override
-    {
+    virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
         OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");
 
         m_request.set_tensor("sample", sample);
@@ -49,10 +43,8 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::
     }
 
 private:
-
     ov::InferRequest m_request;
 };
 
-
 }  // namespace genai
 }  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
index 7aa6f6301..f63a8ea23 100644
--- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
+++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
@@ -42,8 +42,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
         ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
         ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
 
-        for (int i = 0; i < m_native_batch_size; i++)
-        {
+        for (int i = 0; i < m_native_batch_size; i++) {
             m_requests[i] = compiled_model.create_infer_request();
         }
     }
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
new file mode 100644
index 000000000..a63a073cf
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
@@ -0,0 +1,261 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cassert>
+#include <random>
+#include <fstream>
+#include <iterator>
+
+#include "image_generation/schedulers/euler_ancestral_discrete.hpp"
+#include "image_generation/numpy_utils.hpp"
+
+namespace ov {
+namespace genai {
+
+EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) {
+    // Parse the scheduler JSON file and let read_json_param fill each field from its key.
+    std::ifstream config_stream(scheduler_config_path);
+    OPENVINO_ASSERT(config_stream.is_open(), "Failed to open ", scheduler_config_path);
+    nlohmann::json config_json = nlohmann::json::parse(config_stream);
+
+    using utils::read_json_param;
+    read_json_param(config_json, "num_train_timesteps", num_train_timesteps);
+    read_json_param(config_json, "beta_start", beta_start);
+    read_json_param(config_json, "beta_end", beta_end);
+    read_json_param(config_json, "beta_schedule", beta_schedule);
+    read_json_param(config_json, "trained_betas", trained_betas);
+    read_json_param(config_json, "steps_offset", steps_offset);
+    read_json_param(config_json, "prediction_type", prediction_type);
+    read_json_param(config_json, "timestep_spacing", timestep_spacing);
+    read_json_param(config_json, "rescale_betas_zero_snr", rescale_betas_zero_snr);
+}
+
+// Convenience constructor: parses the JSON config at the given path and delegates to the Config-based constructor.
+EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path)
+    : EulerAncestralDiscreteScheduler(Config(scheduler_config_path)) {}
+
+EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const Config& scheduler_config): m_config(scheduler_config) {
+    // Builds the beta schedule, the cumulative alpha products and the default full-length sigma/timestep tables.
+    std::vector<float> betas;
+
+    using numpy_utils::linspace;
+
+    if (!m_config.trained_betas.empty()) {
+        betas = m_config.trained_betas;
+    } else if (m_config.beta_schedule == BetaSchedule::LINEAR) {
+        betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps);
+    } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) {
+        float start = std::sqrt(m_config.beta_start);
+        float end = std::sqrt(m_config.beta_end);
+        betas = linspace<float>(start, end, m_config.num_train_timesteps);
+        std::for_each(betas.begin(), betas.end(), [](float& x) {
+            x *= x;
+        });
+    // TODO: else if beta_schedule == "squaredcos_cap_v2"
+    } else {
+        OPENVINO_THROW(
+            "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types");
+    }
+
+    if (m_config.rescale_betas_zero_snr) {
+        using numpy_utils::rescale_zero_terminal_snr;
+        rescale_zero_terminal_snr(betas);
+    }
+
+    // alphas_cumprod[i] = (1 - beta_0) * ... * (1 - beta_i), kept as a running float product: one pass
+    // instead of re-multiplying every prefix from scratch.
+    m_alphas_cumprod.reserve(betas.size());
+    float alpha_cumprod = 1.0f;
+    for (const float beta : betas) {
+        alpha_cumprod *= 1.0f - beta;
+        m_alphas_cumprod.push_back(alpha_cumprod);
+    }
+
+    if (m_config.rescale_betas_zero_snr) {
+        m_alphas_cumprod.back() = std::pow(2, -24);
+    }
+
+    for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) {
+        float sigma = std::pow(((1 - (*it)) / (*it)), 0.5);
+        m_sigmas.push_back(sigma);
+    }
+    m_sigmas.push_back(0);
+
+    // settable values: set_timesteps() clears and rebuilds the tables and resets the indices
+    auto linspaced =
+        linspace<float>(0.0f, static_cast<float>(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true);
+    for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
+        m_timesteps.push_back(static_cast<int64_t>(std::round(*it)));
+    }
+    m_num_inference_steps = -1;
+    m_step_index = -1;
+    m_begin_index = -1;
+    m_is_scale_input_called = false;
+}
+
+// Rebuilds timesteps and sigmas for `num_inference_steps` steps, then trims the schedule start according to `strength`.
+void EulerAncestralDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) {
+    m_timesteps.clear();
+    m_sigmas.clear();
+    m_step_index = m_begin_index = -1;
+    m_num_inference_steps = num_inference_steps;
+
+    switch (m_config.timestep_spacing) {
+    case TimestepSpacing::LINSPACE: {
+        float end = static_cast<float>(m_config.num_train_timesteps - 1);
+        auto linspaced = numpy_utils::linspace<float>(0.0f, end, num_inference_steps, true);
+        for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
+            m_timesteps.push_back(static_cast<int64_t>(std::round(*it)));
+        }
+        break;
+    }
+    case TimestepSpacing::LEADING: {
+        // Integer division by zero is undefined behaviour; the countdown below also avoids unsigned wrap-around.
+        OPENVINO_ASSERT(num_inference_steps > 0, "'num_inference_steps' must be greater than zero");
+        size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps;
+        for (size_t i = num_inference_steps; i-- > 0;) {
+            m_timesteps.push_back(i * step_ratio + m_config.steps_offset);
+        }
+        break;
+    }
+    case TimestepSpacing::TRAILING: {
+        float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps);
+        for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) {
+            m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1);
+        }
+        break;
+    }
+    default:
+        OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
+    }
+
+    std::vector<float> sigmas;
+    for (const float& i : m_alphas_cumprod) {
+        sigmas.push_back(std::pow(((1 - i) / i), 0.5));
+    }
+
+    std::vector<size_t> x_data_points(sigmas.size());
+    std::iota(x_data_points.begin(), x_data_points.end(), 0);
+    m_sigmas = numpy_utils::interp(m_timesteps, x_data_points, sigmas);
+    m_sigmas.push_back(0.0f);
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = num_inference_steps - init_timestep;  // init_timestep <= num_inference_steps, cannot underflow
+        // keep original timesteps
+        m_schedule_timesteps = m_timesteps;
+        // while return patched ones by 'strength' parameter
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+        m_begin_index = t_start;
+    }
+}
+
+std::map<std::string, ov::Tensor> EulerAncestralDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
+    // One ancestral Euler update: `noise_pred` is the model output, `latents` the current sample (both read as float).
+    // Returns {"latent": next sample, "denoised": predicted original sample}; `inference_step` is unused here.
+    if (m_step_index == -1)
+        m_step_index = m_begin_index;
+    // m_sigmas[m_step_index] and m_sigmas[m_step_index + 1] are both read below.
+    OPENVINO_ASSERT(m_step_index >= 0 && static_cast<size_t>(m_step_index) + 1 < m_sigmas.size(),
+                    "Scheduler step index is out of range: call set_timesteps() and do not exceed the number of inference steps");
+
+    float sigma = m_sigmas[m_step_index];
+    float* model_output_data = noise_pred.data<float>();
+    float* sample_data = latents.data<float>();
+
+    ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape());
+    float* pred_original_sample_data = pred_original_sample.data<float>();
+
+    switch (m_config.prediction_type) {
+    case PredictionType::EPSILON:
+        for (size_t i = 0; i < noise_pred.get_size(); ++i) {
+            pred_original_sample_data[i] = sample_data[i] - sigma * model_output_data[i];
+        }
+        break;
+    case PredictionType::V_PREDICTION: {
+        // Loop-invariant factors, kept in the (double) type std::pow returns so the arithmetic is unchanged.
+        const auto sigma_sq_plus_one = std::pow(sigma, 2) + 1;
+        const auto output_scale = -sigma / std::pow(sigma_sq_plus_one, 0.5);
+        for (size_t i = 0; i < noise_pred.get_size(); ++i) {
+            pred_original_sample_data[i] = model_output_data[i] * output_scale + (sample_data[i] / sigma_sq_plus_one);
+        }
+        break;
+    }
+    default:
+        OPENVINO_THROW("Unsupported value for 'PredictionType': must be one of `epsilon`, or `v_prediction`");
+    }
+
+    float sigma_to = m_sigmas[m_step_index + 1];
+    float sigma_up = std::sqrt(std::pow(sigma_to, 2) * (std::pow(sigma, 2) - std::pow(sigma_to, 2)) / std::pow(sigma, 2));
+    float sigma_down = std::sqrt(std::pow(sigma_to, 2) - std::pow(sigma_up, 2));
+    float dt = sigma_down - sigma;
+
+    ov::Tensor prev_sample = ov::Tensor(latents.get_element_type(), latents.get_shape());
+    float* prev_sample_data = prev_sample.data<float>();
+
+    ov::Tensor noise = generator->randn_tensor(noise_pred.get_shape());
+    const float* noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < prev_sample.get_size(); ++i) {
+        float derivative = (sample_data[i] - pred_original_sample_data[i]) / sigma;
+        prev_sample_data[i] = (sample_data[i] + derivative * dt) + noise_data[i] * sigma_up;
+    }
+
+    m_step_index++;
+
+    return {{"latent", prev_sample}, {"denoised", pred_original_sample}};
+}
+
+// Returns the position of the first entry equal to `timestep` in the full (untrimmed) schedule; throws if absent.
+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
+    size_t position = 0;
+    for (const int64_t scheduled : m_schedule_timesteps) {
+        if (scheduled == timestep) return position;
+        ++position;
+    }
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
+// Adds noise to `init_latent` in place: latent += sigma(latent_timestep) * noise. Tensor data is accessed as float.
+void EulerAncestralDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const {
+    const float sigma = m_sigmas[_index_for_timestep(latent_timestep)];
+    float* latent_values = init_latent.data<float>();
+    const float* noise_values = noise.data<float>();
+    const size_t element_count = init_latent.get_size();
+
+    for (size_t idx = 0; idx < element_count; ++idx) {
+        latent_values[idx] += sigma * noise_values[idx];
+    }
+}
+
+// Returns a copy of the currently active timesteps: the constructor's default table, or the schedule
+// produced (and trimmed by 'strength') in the last set_timesteps() call.
+std::vector<int64_t> EulerAncestralDiscreteScheduler::get_timesteps() const { return m_timesteps; }
+
+// Divides `sample` in place by sqrt(sigma^2 + 1) for the current step; `inference_step` is unused.
+void EulerAncestralDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
+    if (m_step_index == -1)
+        m_step_index = m_begin_index;
+    const auto scale_divisor = std::pow((std::pow(m_sigmas[m_step_index], 2) + 1), 0.5);  // loop-invariant, stays double
+    float* sample_data = sample.data<float>();
+    for (size_t i = 0; i < sample.get_size(); i++) {
+        sample_data[i] /= scale_divisor;
+    }
+    m_is_scale_input_called = true;
+}
+
+// Initial noise scale: the largest sigma for LINSPACE/TRAILING spacing, sqrt(max_sigma^2 + 1) otherwise.
+float EulerAncestralDiscreteScheduler::get_init_noise_sigma() const {
+    const float largest_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end());
+
+    const bool use_raw_maximum = m_config.timestep_spacing == TimestepSpacing::LINSPACE ||
+                                 m_config.timestep_spacing == TimestepSpacing::TRAILING;
+
+    // Both branches end up narrowed to float by the return type.
+    return use_raw_maximum ? largest_sigma : std::sqrt(std::pow(largest_sigma, 2) + 1);
+}
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp
new file mode 100644
index 000000000..9d82c9a0a
--- /dev/null
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.hpp
@@ -0,0 +1,61 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+#include <list>
+#include <string>
+
+#include "image_generation/schedulers/types.hpp"
+#include "image_generation/schedulers/ischeduler.hpp"
+
+namespace ov {
+namespace genai {
+
+// Euler ancestral discrete scheduler: configuration plus the sigma/timestep state used by the denoising loop.
+class EulerAncestralDiscreteScheduler : public IScheduler {
+public:
+    struct Config {
+        int32_t num_train_timesteps = 1000;
+        float beta_start = 0.0001f, beta_end = 0.02f;
+        BetaSchedule beta_schedule = BetaSchedule::LINEAR;
+        std::vector<float> trained_betas = {};  // when non-empty, used as the betas instead of beta_schedule
+        size_t steps_offset = 0;  // added to every timestep for TimestepSpacing::LEADING
+        PredictionType prediction_type = PredictionType::EPSILON;
+        TimestepSpacing timestep_spacing = TimestepSpacing::LEADING;
+        bool rescale_betas_zero_snr = false;
+
+        Config() = default;
+        explicit Config(const std::filesystem::path& scheduler_config_path);  // reads the fields from a JSON file
+    };
+
+    explicit EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path);
+    explicit EulerAncestralDiscreteScheduler(const Config& scheduler_config);
+    // Rebuilds the timestep/sigma tables for num_inference_steps and trims the schedule start by strength.
+    void set_timesteps(size_t num_inference_steps, float strength) override;
+    // Returns a copy of the currently active timesteps.
+    std::vector<std::int64_t> get_timesteps() const override;
+    // Returns the largest sigma (LINSPACE/TRAILING spacing) or sqrt(max_sigma^2 + 1) (other spacing).
+    float get_init_noise_sigma() const override;
+    // Divides sample in place by sqrt(sigma^2 + 1) for the current step; inference_step is not read.
+    void scale_model_input(ov::Tensor sample, size_t inference_step) override;
+    // Performs one update and advances the internal step index; returns the "latent" and "denoised" tensors.
+    std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) override;
+    // Adds sigma(latent_timestep) * noise to init_latent in place; throws if the timestep is not in the schedule.
+    void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override;
+
+private:
+    Config m_config;
+    std::vector<float> m_alphas_cumprod, m_sigmas;
+    std::vector<int64_t> m_timesteps, m_schedule_timesteps;  // active (strength-trimmed) schedule, full schedule
+    size_t m_num_inference_steps;
+
+    int m_step_index, m_begin_index;  // -1 until set; m_begin_index is the first active step from set_timesteps()
+    bool m_is_scale_input_called;  // set to true by scale_model_input()
+    // Position of timestep in m_schedule_timesteps; throws when it is not found.
+    size_t _index_for_timestep(int64_t timestep) const;
+};
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/scheduler.cpp b/src/cpp/src/image_generation/schedulers/scheduler.cpp
index f9cd09834..868f6f05c 100644
--- a/src/cpp/src/image_generation/schedulers/scheduler.cpp
+++ b/src/cpp/src/image_generation/schedulers/scheduler.cpp
@@ -11,6 +11,7 @@
 #include "image_generation/schedulers/euler_discrete.hpp"
 #include "image_generation/schedulers/flow_match_euler_discrete.hpp"
 #include "image_generation/schedulers/pndm.hpp"
+#include "image_generation/schedulers/euler_ancestral_discrete.hpp"
 
 namespace ov {
 namespace genai {
@@ -41,6 +42,8 @@ std::shared_ptr<Scheduler> Scheduler::from_config(const std::filesystem::path& s
         scheduler = std::make_shared<FlowMatchEulerDiscreteScheduler>(scheduler_config_path);
     } else if (scheduler_type == Scheduler::Type::PNDM) {
         scheduler = std::make_shared<PNDMScheduler>(scheduler_config_path);
+    } else if (scheduler_type == Scheduler::Type::EULER_ANCESTRAL_DISCRETE) {
+        scheduler = std::make_shared<EulerAncestralDiscreteScheduler>(scheduler_config_path);
     } else {
         OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one");
     }
diff --git a/src/cpp/src/image_generation/schedulers/types.cpp b/src/cpp/src/image_generation/schedulers/types.cpp
index 2f7c6d3f2..5a9e5b686 100644
--- a/src/cpp/src/image_generation/schedulers/types.cpp
+++ b/src/cpp/src/image_generation/schedulers/types.cpp
@@ -57,6 +57,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Schedu
             param = Scheduler::FLOW_MATCH_EULER_DISCRETE;
         else if (scheduler_type_str == "PNDMScheduler")
             param = Scheduler::PNDM;
+        else if (scheduler_type_str == "EulerAncestralDiscreteScheduler")
+            param = Scheduler::EULER_ANCESTRAL_DISCRETE;
         else if (!scheduler_type_str.empty()) {
             OPENVINO_THROW("Unsupported value for 'scheduler' ", scheduler_type_str);
         }
diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index 15f15219c..c3ebcdf1f 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -320,7 +320,7 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline {
             } else if (m_pipeline_type == PipelineType::INPAINTING) {
                 m_generation_config.guidance_scale = 7.5f;
                 m_generation_config.num_inference_steps = 50;
-                m_generation_config.strength == 0.9999f;
+                m_generation_config.strength = 0.9999f;
             }
         } else {
             OPENVINO_THROW("Unsupported class_name '", class_name, "'. Please, contact OpenVINO GenAI developers");
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 6d9aae30f..be5ecf17f 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -24,27 +24,23 @@
 namespace ov {
 namespace genai {
 
-std::pair<EncodedResults, int32_t> beam_search(
-    ov::InferRequest& lm,
-    ov::Tensor prompts,
-    ov::Tensor attention_mask,
-    GenerationConfig config,
-    std::optional<ov::Tensor> position_ids,
-    std::optional<int32_t> selected_beam_idx
-);
-
 class StatefulLLMPipeline final : public LLMPipelineImplBase {
 public:
     ov::InferRequest m_model_runner;
     bool is_chat_conversation = false;
     bool m_trust_encoded_history = true;
-    std::optional<int32_t> m_selected_beam = std::nullopt;
     ChatHistory m_history;
     std::string m_templated_chat_history = {};
     std::vector<int64_t> m_tokenized_chat_history;
     ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
-    size_t m_to_remove_from_hist = 0;
     size_t m_kv_cache_seq_length_axis = 2;
+    Sampler m_sampler;
+    // Tail of previous output in chat mode is missing in KV cache, let's keep it
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache
+    // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history 
+    // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history
+    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -75,7 +71,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         const std::string& device,
         const ov::AnyMap& config,
         const ov::genai::GenerationConfig& generation_config
-    ) : LLMPipelineImplBase(tokenizer, generation_config) {
+    ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) {
         ov::Core core;
         ov::CompiledModel compiled_model;
         auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config);
@@ -96,6 +92,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1)
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
+
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     StatefulLLMPipeline(
@@ -151,35 +149,44 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                 // some symbols combinations can be encoded by the tokenizer in different ways
                 // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
                 // so let's check it out, find the trusted part and use it in on the next step
-                size_t last_same_hist_token = 0;
+                size_t trusted_history_length = 0;
                 if (!m_tokenized_chat_history.empty()) {
                     std::set<int64_t> stop_tokens = config.stop_token_ids;
-                    last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
-                    m_trust_encoded_history = last_same_hist_token == SIZE_MAX;
+                    trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                    m_trust_encoded_history = trusted_history_length == SIZE_MAX;
                 }
 
                 if (m_tokenized_chat_history.empty()) {
                     encoded_input = new_chat_tokens;
-                } else if (last_same_hist_token != SIZE_MAX) {
-                    m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+                } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                    // does_kv_cache_need_to_update() is true here when beam search is active.
+                    // In beam search mode we want to remove the whole last model answer from the KV cache and add the best answer directly.
+                    // Any difference between the model answer and the decoded answer is in any case smaller than the entire history, so let's use the data from m_kv_history_manager.
+                    if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+                        trusted_history_length = m_kv_history_manager.trusted_history_length;
+                    } else {
+                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_chat_history.size() - trusted_history_length;
+                        // if the previous generation finished because the max length was reached, the KV cache is missing the last token, so let's keep it
+                        m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+                    }
 
                     ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token},
-                                                       new_chat_tokens.input_ids.data<int64_t>() + last_same_hist_token);
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length},
+                                                       new_chat_tokens.input_ids.data<int64_t>() + trusted_history_length);
 
                     ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
                     std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
 
                     encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
-                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token});
+                                                       {1, new_chat_tokens.input_ids.get_shape().at(1) - trusted_history_length});
                     new_tensor.copy_to(encoded_input.input_ids);
                     encoded_input.attention_mask = new_attention_mask;
-
-                    m_selected_beam = std::nullopt;
+                    m_last_disappeared_token = std::nullopt;
                 } else {
                     encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
                 }
                 m_templated_chat_history = new_templated_chat_history;
+
                 m_tokenized_chat_history.clear();
                 m_tokenized_chat_history.reserve(new_chat_tokens.input_ids.get_size());
                 std::copy_n(new_chat_tokens.input_ids.data<int64_t>(), new_chat_tokens.input_ids.get_size(),
@@ -261,6 +268,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
             std::copy(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size(), std::back_inserter(m_tokenized_chat_history));
 
+        // Tail of previous output in chat mode is missing in KV cache.
+        if (m_last_disappeared_token.has_value()) {
+            attention_mask = ov::genai::utils::push_front_inputs(attention_mask, 1);
+            input_ids = ov::genai::utils::push_front_inputs(input_ids, *m_last_disappeared_token);
+        }
+
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
 
         // If eos_token_id was not provided, take value from default m_generation_config
@@ -281,10 +294,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         }
 
         auto batch_size = input_ids.get_shape().at(0);
-        if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) {
-            OPENVINO_THROW("Currently streaming is possible only with batch size=1 and "
-                            "only for greedy or multinomial decoding");
-        }
+        OPENVINO_ASSERT(streamer_ptr == nullptr || batch_size == 1 && config.num_return_sequences == 1 &&
+            (config.is_greedy_decoding() || config.is_multinomial()),
+            "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
 
         auto num_inputs = m_model_runner.get_compiled_model().inputs().size();
         OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: "
@@ -292,7 +304,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                         "(input_ids, attention_mask, position_ids, beam_idx) "
                         "but you have '" + std::to_string(num_inputs) + "' inputs");
 
-        ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_kv_cache_seq_length_axis, m_adapter_controller);
+        ov::genai::utils::trim_kv_cache(m_model_runner, m_kv_history_manager.num_tokens_to_remove_from_kv_cache, m_kv_cache_seq_length_axis, m_adapter_controller);
 
         size_t kv_cache_len = 0;
         ov::Tensor concatenated_attention_mask;
@@ -302,10 +314,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             // Between subsequent runs attention_mask should not be modified.
             auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
             auto prompt_len = attention_mask.get_shape()[1];
-            kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist;
+
+            kv_cache_len = atten_mask_history.get_shape()[1] - m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
 
             ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
-            auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
+            auto start_atten_hst = atten_mask_history.data<int64_t>();
+
             std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
                     new_atten_mask.data<int64_t>());
             std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
@@ -315,6 +329,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             concatenated_attention_mask = attention_mask;
         }
 
+        size_t prev_attn_mask_size = concatenated_attention_mask.get_shape()[1];
+
         bool position_ids_available = (num_inputs == 4);
         std::optional<ov::Tensor> position_ids = std::nullopt;
         if (position_ids_available) {
@@ -328,48 +344,55 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
         if (is_chat_conversation && !m_trust_encoded_history) {
             m_trust_encoded_history = true;
-            m_to_remove_from_hist = 0;
+            m_kv_history_manager.reset();
         }
 
-        ov::genai::EncodedResults result;
-        if (config.is_beam_search() && is_chat_conversation) {
-            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
-                                                            config, position_ids, m_selected_beam);
-        } else {
-            std::vector<SequenceGroup::Ptr> requests;
-            size_t block_size = 1;
-            bool enable_prefix_caching = false;
-
-            for (size_t request_id = 0; request_id < batch_size; request_id++) {
-                SequenceGroup::Ptr sequence_group;
-                if (is_chat_conversation) {
-                    ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
-                } else {
-                    size_t seq_len = input_ids.get_shape().at(1);
-                    size_t batch_offset = request_id * seq_len;
-                    const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
-                    std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
+        std::vector<SequenceGroup::Ptr> requests;
+        size_t block_size = 1;
+        bool enable_prefix_caching = false;
 
-                    sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
-                }
+        for (size_t request_id = 0; request_id < batch_size; request_id++) {
+            SequenceGroup::Ptr sequence_group;
+            if (is_chat_conversation) {
+                ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
+            } else {
+                size_t seq_len = input_ids.get_shape().at(1);
+                size_t batch_offset = request_id * seq_len;
+                const int64_t* prompt_start = input_ids.data<const int64_t>() + batch_offset;
+                std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
 
-                sequence_group->set_sequence_group_ptr(sequence_group);
-                requests.push_back(sequence_group);
+                sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
             }
 
-            Sampler sampler = Sampler(m_tokenizer);
-            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
-                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
+            sequence_group->set_sequence_group_ptr(sequence_group);
+            requests.push_back(sequence_group);
+        }
+
+        if (m_sampler.get_seed() != config.rng_seed) {
+            m_sampler.set_seed(config.rng_seed);
         }
 
+        ov::genai::EncodedResults result;
+        std::tie(result, m_last_disappeared_token) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask,
+                                                                                       streamer_ptr, m_sampler, requests, position_ids, std::nullopt);
+
         if (is_chat_conversation) {
+            // force remove from kv_cache last answer
+            if (config.is_beam_search() && m_chat_input_type != ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
+                m_kv_history_manager.trusted_history_length = m_tokenized_chat_history.size();
+                m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_model_runner.get_tensor("attention_mask").get_shape()[1] - prev_attn_mask_size;
+            }
+
             std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
         } else {
             reset_kv_state();
-            m_selected_beam = std::nullopt;
+            m_last_disappeared_token = std::nullopt;
         }
 
+        if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS)
+            std::copy(result.tokens[0].begin(), result.tokens[0].end(), std::back_inserter(m_tokenized_chat_history));
+
         auto stop_time = std::chrono::steady_clock::now();
 
         // If is called without tokenization then that stat will not be reported.
@@ -383,10 +406,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void start_chat(const std::string& system_message) override {
         is_chat_conversation = true;
-        m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        m_last_disappeared_token = std::nullopt;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
             m_history = {};
@@ -404,10 +427,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
     void finish_chat() override {
         is_chat_conversation = false;
-        m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+        m_last_disappeared_token = std::nullopt;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
             m_history.clear();
@@ -581,9 +604,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         std::vector<std::string> plain_replies;
         std::vector<float> plain_scores;
         for (GenerationResult& res : generated) {
-            if (GenerationStatus::FINISHED != res.m_status) {
-                OPENVINO_THROW("Got unfinished GenerationStatus");
-            }
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
             std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
             std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
         }
@@ -639,9 +660,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         std::vector<std::vector<int64_t>> plain_tokens;
         std::vector<float> plain_scores;
         for (EncodedGenerationResult& res : generated) {
-            if (GenerationStatus::FINISHED != res.m_status) {
-                OPENVINO_THROW("Got unfinished GenerationStatus");
-            }
+            OPENVINO_ASSERT(res.m_status == GenerationStatus::FINISHED || res.m_status == GenerationStatus::DROPPED_BY_HANDLE, "Got unfinished GenerationStatus");
             std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens));
             std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
         }
@@ -699,7 +718,9 @@ ov::genai::LLMPipeline::LLMPipeline(
     const ov::AnyMap& properties
 ){
     auto start_time = std::chrono::steady_clock::now();
-    if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
+    if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || 
+        properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || 
+        properties.find(ov::genai::prompt_lookup.name()) != properties.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, plugin_config);
     } else if (device == "NPU") {
@@ -718,7 +739,9 @@ ov::genai::LLMPipeline::LLMPipeline(
 ){
     auto start_time = std::chrono::steady_clock::now();
 
-    if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
+    if (config.find(ov::genai::scheduler_config.name()) != config.end() || 
+        config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() || 
+        config.find(ov::genai::prompt_lookup.name()) != config.end()) {
         auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, plugin_config);
     } else if (device == "NPU") {
@@ -741,7 +764,10 @@ ov::genai::LLMPipeline::LLMPipeline(
     auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config);
 
     auto start_time = std::chrono::steady_clock::now();
-    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) {
+    if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || 
+        plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() || 
+        plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){
+
         auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config);
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_str, weights_tensor,
                                                               tokenizer, scheduler_config, device, plugin_config_, generation_config);
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 090aed965..6f4f12489 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -407,7 +407,8 @@ ov::genai::ModelConfigDesc get_modeldesc_from_json(const std::filesystem::path&
     if (config_data.contains("_name_or_path")) {
         desc.name_or_path = config_data["_name_or_path"].get<std::string>();
     }
-    desc.num_key_value_heads = config_data["num_key_value_heads"].get<int>();
+    desc.num_key_value_heads = config_data.contains("num_key_value_heads")
+        ? config_data["num_key_value_heads"].get<int>() : -1;
     return desc;
 }
 
@@ -1102,6 +1103,11 @@ EncodedResults StaticLLMPipeline::generate(
             m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice);
         }
     }
+
+    if (streamer_ptr) {
+        streamer_ptr->end();
+    }
+
     auto stop_time = std::chrono::steady_clock::now();
     // If is called without tokenization then that stat will not be reported.
     auto& metrics = results.perf_metrics;
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 3ab041fa5..17a20dd96 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -9,12 +9,11 @@
 #include <regex>
 #include <vector>
 
+#include "utils.hpp"
+#include "debug_utils.hpp"
 #include "lm_encoding.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 
-#include "debug_utils.hpp"
-
-#include "utils.hpp"
 
 namespace ov {
 namespace genai {
@@ -51,7 +50,7 @@ void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<i
 }
 
 
-std::pair<EncodedResults, int32_t> get_lm_encoded_results(
+std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(
     ov::InferRequest& m_llm,
     const ov::Tensor& input_ids,
     const ov::Tensor& attention_mask,
@@ -59,41 +58,56 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
     Sampler& sampler,
     std::vector<SequenceGroup::Ptr> sequence_groups,
     std::optional<ov::Tensor> position_ids,
-    std::optional<EmbeddingsModel> m_embedding,
-    std::optional<int32_t> selected_beam_idx
+    std::optional<EmbeddingsModel> m_embedding
 ) {
     std::vector<GenerationHandle> generations;
     for (SequenceGroup::Ptr sequence_group : sequence_groups) {
         generations.push_back(std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters()));
     }
 
+    auto active_sequence_groups{sequence_groups};
+
+    auto stream_generated_tokens = [&streamer_ptr, &generations, &active_sequence_groups]() {
+        GenerationHandle& handle = generations.at(0);
+        if (streamer_ptr && handle->can_read()) {
+            std::unordered_map<uint64_t, GenerationOutput> token = handle->back();
+            for (const auto& gen_token : token.begin()->second.generated_ids) {
+                if (streamer_ptr->put(gen_token)) {
+                    handle->drop();
+                    break;
+                }
+            }
+        }
+
+        // free non running requests
+        auto removed_it = std::remove_if(active_sequence_groups.begin(), active_sequence_groups.end(),
+            [](SequenceGroup::Ptr sg) -> bool {
+                return sg->has_finished() || sg->out_of_memory() || sg->handle_dropped();
+            });
+        active_sequence_groups.erase(removed_it, active_sequence_groups.end());
+    };
+
     ov::Shape prompts_shape = input_ids.get_shape();
     const size_t batch_size = prompts_shape[0];
 
     // Initialize results and performance metrics.
+
     EncodedResults results;
     auto& raw_perf_counters = results.perf_metrics.raw_metrics;
     raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }};
 
     // Initialize inputs
-    if (m_embedding.has_value())
-        m_llm.set_tensor("inputs_embeds", input_ids);
-    else
-        m_llm.set_tensor("input_ids", input_ids);
-
+    m_llm.set_tensor(m_embedding.has_value() ? "inputs_embeds" : "input_ids", input_ids);
     m_llm.set_tensor("attention_mask", attention_mask);
-    
     if (position_ids.has_value())
         m_llm.set_tensor("position_ids", *position_ids);
 
     ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size});
-    auto beam_data = beam_idx.data<int32_t>();
-    if (selected_beam_idx.has_value())
-        beam_data[0] = *selected_beam_idx;
-    else
-        std::fill_n(beam_data, batch_size, 0);
+    std::fill_n(beam_idx.data<int32_t>(), batch_size, 0);
     m_llm.set_tensor("beam_idx", beam_idx);
 
+    // "Prompt" phase
+
     const auto infer_start = std::chrono::steady_clock::now();
     m_llm.infer();
     const auto infer_end = std::chrono::steady_clock::now();
@@ -109,7 +123,6 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
     for (auto& sequence_group : sequence_groups) {
         sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len);
         sequence_group->schedule_tokens(sequence_len);
-
     }
 
     std::map<size_t, size_t> beam_offets;
@@ -117,27 +130,11 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         beam_offets.insert({sequence_groups.at(i)->get_request_id(), i});
 
     SamplerOutput sampler_output = sampler.sample(sequence_groups, logits);
+    stream_generated_tokens();
 
-    auto active_sequence_groups{sequence_groups};
-    auto get_active_sequence_groups = [](SequenceGroup::Ptr sg) { return sg->has_finished(); };
-
-    active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
-                                                active_sequence_groups.end(),
-                                                get_active_sequence_groups),
-                                 active_sequence_groups.end());
-    
-    auto stream_generated_tokens = [&streamer_ptr, &generations]() {
-        if (streamer_ptr && generations.at(0).get()->can_read()) {
-            std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
-            for (const auto& gen_token : token.begin()->second.generated_ids) {
-                if (!streamer_ptr->put(gen_token)) {
-                    break;
-                }
-            }
-        }
-    };
+    // "Generation" phase
 
-    while (active_sequence_groups.size() > 0) {
+    while (!active_sequence_groups.empty()) {
         size_t total_num_tokens = 0;
 
         for (auto& sequence_group : active_sequence_groups) {
@@ -172,26 +169,19 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
                 // apply strides to shift to a next sequence
                 input_ids_data += num_scheduled_tokens;
 
-                // for different sequences iteration of beams started from 0, but we collect it to one input_ids#
+                // for different sequences iteration of beams started from 0, but we collect it to one input_ids
                 next_beams.push_back(beam_idxs[sequence->get_id()] + beam_offets.at(sequence_group->get_request_id()));
             }
         }
 
-        for (size_t i = 0; i < sequence_groups.size(); i++) {
-            if (i == 0)
-                beam_offets[sequence_groups.at(i)->get_request_id()] = 0;
-            else {
-                beam_offets[sequence_groups.at(i)->get_request_id()] = sequence_groups.at(i - 1)->num_running_seqs() + beam_offets[i -1];
-            }
+        for (size_t i = 0; i < active_sequence_groups.size(); i++) {  // beam_offets is keyed by request id, so the previous group's offset is looked up by its id, not by loop index
+            beam_offets[active_sequence_groups.at(i)->get_request_id()] = i == 0 ? 0 : (active_sequence_groups.at(i - 1)->num_running_seqs() + beam_offets.at(active_sequence_groups.at(i - 1)->get_request_id()));
+        }
 
         if (m_embedding.has_value()) {
             const ov::Tensor& embed_prompt_tensor = (*m_embedding).infer(new_input_ids);
-
-            m_llm.get_tensor("inputs_embeds").set_shape(embed_prompt_tensor.get_shape());
             m_llm.set_tensor("inputs_embeds", embed_prompt_tensor);
         } else {
-            m_llm.get_tensor("input_ids").set_shape(new_input_ids.get_shape());
             m_llm.set_tensor("input_ids", new_input_ids);
         }
 
@@ -201,7 +191,6 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
             update_position_ids(m_llm.get_tensor("position_ids"), m_llm.get_tensor("attention_mask"));
         }
 
-        m_llm.get_tensor("beam_idx").set_shape({ total_num_tokens });
         m_llm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()});
 
         const auto infer_start = std::chrono::steady_clock::now();
@@ -213,42 +202,38 @@ std::pair<EncodedResults, int32_t> get_lm_encoded_results(
         raw_perf_counters.m_new_token_times.emplace_back(infer_end);
         raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
-        stream_generated_tokens();
-
         sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));
-
-        active_sequence_groups.erase(std::remove_if(active_sequence_groups.begin(),
-                                                    active_sequence_groups.end(),
-                                                    get_active_sequence_groups),
-                                    active_sequence_groups.end());
+        stream_generated_tokens();
     }
 
-    // to stream last token
-    stream_generated_tokens();
-    if (streamer_ptr) {
+    if (streamer_ptr) { // push streamer's cache
         streamer_ptr->end();
     }
-    
-    size_t next_selected_beam = 0;
-    for (size_t i = 0; i < sequence_groups.size(); i++) {
-        auto request = sequence_groups[i];
-        auto generation_outputs = generations[i]->read_all();
-
-        std::sort(generation_outputs.begin(), generation_outputs.end(), [] (const GenerationOutput& r1, const GenerationOutput& r2) {
-            return r1.score > r2.score;
-        });
-
-        auto num_outputs = std::min(request->get_sampling_parameters().num_return_sequences, generation_outputs.size());
-        for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
-            const auto& generation_output = generation_outputs[generation_output_idx];
-            results.tokens.push_back(std::move(generation_output.generated_ids));
-            results.scores.push_back(generation_output.score);
+
+    for (auto& sequence_group : sequence_groups) {
+        auto sampling_params = sequence_group->get_sampling_parameters();
+        const auto& sequences = sequence_group->get_finished_sequences();
+        size_t num_outputs = std::min(sequence_group->get_sampling_parameters().num_return_sequences, sequences.size());
+
+        for (size_t seq_id = 0; seq_id < num_outputs; ++seq_id) {
+            const auto & sequence = sequences[seq_id];
+            const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) : sequence->get_cumulative_log_probs();
+
+            results.tokens.push_back(sequence->get_generated_ids());
+            results.scores.push_back(score);
         }
-        // next_selected_beam = sampler.last_selected_beam(request);
     }
 
-    return {results, next_selected_beam};
+    for (SequenceGroup::Ptr sequence_group : sequence_groups)
+        sampler.clear_request_info(sequence_group->get_request_id());
+
+    // it is not saved in KV cache, we need to add it for some cases
+    std::optional<int64_t> last_token_of_best_sequence = std::nullopt;
+    if (sequence_groups[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH || sequence_groups[0]->handle_dropped())
+        last_token_of_best_sequence = results.tokens[0].back();
+
+    return {results, last_token_of_best_sequence};
 }
 
 }  // namespace genai
-}  // namespace ov
\ No newline at end of file
+}  // namespace ov
diff --git a/src/cpp/src/lm_encoding.hpp b/src/cpp/src/lm_encoding.hpp
index fa6692ede..c31cffb9b 100644
--- a/src/cpp/src/lm_encoding.hpp
+++ b/src/cpp/src/lm_encoding.hpp
@@ -8,13 +8,9 @@
 namespace ov {
 namespace genai {
 
-std::pair<EncodedResults, int32_t> get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
-                                                          const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
-                                                          std::optional<ov::Tensor> position_ids, std::optional<EmbeddingsModel> m_embedding, std::optional<int32_t> selected_beam_idx);
-
-void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams);
-
-void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask);
+std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
+                                                                         const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
+                                                                         std::optional<ov::Tensor> position_ids, std::optional<EmbeddingsModel> m_embedding);
 
 }
 }
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 3bd6252c7..3725dc0cf 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -101,7 +101,7 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
 
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
-        raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
+        raw_metrics.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = batch_sizes[0];
         
         // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index f77463d76..9c18dc772 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -85,75 +85,63 @@ std::string clean_wrapped_text(const std::string& wrapped_text, const std::strin
     return clean_text;
 }
 
+std::vector<int64_t> encode_and_process_string(const std::string& stop_string, ov::genai::Tokenizer& tokenizer) {
+    // Tokenize a local copy of the stop string; special tokens are not added.
+    std::string text_to_encode(stop_string);
+    ov::Tensor token_tensor = tokenizer.encode(text_to_encode, ov::genai::add_special_tokens(false)).input_ids;
+    // Copy the tensor contents into an owning vector of token ids.
+    const int64_t* first_token = token_tensor.data<int64_t>();
+    const size_t token_count = token_tensor.get_size();
+    return std::vector<int64_t>(first_token, first_token + token_count);
+}
+
+struct MatchStopStringResult {
+    // number of trailing tokens the callers pass to remove_last_tokens() after a match
+    size_t to_remove = 0;
+    // set to true by match_stop_string() when a stop string is found in the decoded tail
+    bool is_matched = false;
+};
+
 // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned.
-int match_stop_string(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set<std::string> & stop_strings) {
-    /*
-    For catching stop_string hit we run comparisons character-wise to catch cases where stop string 
-    overlaps with part of another token on both sides or is just a part of a single token. 
-    For every stop_string we iterate over generated tokens starting from the last one and going backwards. 
-    Every token is wrapped with prefix tokens to ensure tokenizer doesn't remove prefix whitespace of the actual token.
-    After that all tokens are decoded and prefix is removed from the decoded text, so we end up with decoded token.
-    Its characters are compared to the stop_string character at a current_position 
-    (position of a character in the stop_string counting from the last one) - at the beginning position is 0.
-    When characters match we increase current_position and check if we have a full match already, if not we continue.
-    If we have already matched some characters (current_position > 0) and next character is not matching 
-    before we reach the full match, then we reset current_position to 0. 
-    */ 
-    std::string prefix = "a";
-    auto prefix_ov = tokenizer.encode(prefix).input_ids;
-    std::vector<int64_t> prefix_tokens(prefix_ov.data<int64_t>(), prefix_ov.data<int64_t>() + prefix_ov.get_size());
-    std::string suffix = "b";
-    auto suffix_ov = tokenizer.encode(suffix).input_ids;
-    std::vector<int64_t> suffix_tokens(suffix_ov.data<int64_t>(), suffix_ov.data<int64_t>() + suffix_ov.get_size());
-
-    // Since whitespace can be added at the beginning of the suffix we also try to capture that behavior here
-    // and get suffix string that will actually be part of the decoded string so we can remove it correctly
-    auto wrapped_suffix_tokens = suffix_tokens;
-    wrapped_suffix_tokens.insert(wrapped_suffix_tokens.begin(), prefix_tokens.begin(), prefix_tokens.end());
-    std::string wrapped_suffix = tokenizer.decode(wrapped_suffix_tokens);
-    auto wrapper_pos = wrapped_suffix.find(prefix);
-    suffix = wrapped_suffix.substr(wrapper_pos + prefix.size());
-    
-    for (auto stop_string: stop_strings) {
-        int current_position = 0;
-        int num_matched_tokens = 0; 
-        // Getting reverse iterator to check tokens starting from the last one generated and going backwards
-        auto generated_tokens_rit = generated_tokens.rbegin();
-        std::vector<int64_t> tokens_buffer;
-        while (generated_tokens_rit != generated_tokens.rend()) {
-            num_matched_tokens++;
-            tokens_buffer.insert(tokens_buffer.begin(), *generated_tokens_rit);
-
-            std::vector<int64_t> wrapped_tokens = wrap_tokens(tokens_buffer, prefix_tokens, suffix_tokens);
-            std::string wrapped_text = tokenizer.decode(wrapped_tokens);
-            std::string clean_text = clean_wrapped_text(wrapped_text, prefix, suffix);
-
-            if (clean_text == "" || (clean_text.size() >= 3 && (clean_text.compare(clean_text.size() - 3, 3, "�") == 0))) { 
-                generated_tokens_rit++;
-                continue;
-            } else {
-                tokens_buffer.clear();
-            }
-            // Checking clean_text characters starting from the last one
-            for (auto clean_text_rit = clean_text.rbegin(); clean_text_rit != clean_text.rend(); clean_text_rit++) {
-                // On character match increment current_position for the next comparisons
-                if (*clean_text_rit == *(stop_string.rbegin() + current_position)) {
-                    current_position++;
-                    // If this is the last character from the stop_string we have a match
-                    if ((stop_string.rbegin() + current_position) == stop_string.rend()) {
-                        return num_matched_tokens;
-                    } 
-                } else if (current_position) {
-                    // Already found matching characters, but the last one didn't match, so we reset current_position
-                    current_position = 0;
-                    // Looking for the match will start over from this character so we decrement iterator
-                    clean_text_rit--;
+MatchStopStringResult match_stop_string(Tokenizer& tokenizer,
+                      const TokenIds& generated_tokens,
+                      const std::pair<size_t, std::set<std::string>>& stop_strings,  // {tail window size in tokens, stop strings}
+                      bool is_include_to_output) {
+    MatchStopStringResult result;  // stays {to_remove = 0, is_matched = false} when nothing matches
+    if (generated_tokens.size() >= stop_strings.first) {
+        size_t offset = generated_tokens.size() - stop_strings.first;  // start of the tail window that is decoded and searched
+        TokenIds buffer(generated_tokens.begin() + offset, generated_tokens.end());
+        std::string decoded_buffer = tokenizer.decode(buffer);
+        for (const auto& stop_string : stop_strings.second) {
+            auto pos = decoded_buffer.find(stop_string);
+            if (pos != std::string::npos) {
+                result.is_matched = true;
+                // keep the text before the match (plus the stop string itself when it is included in the output)
+                auto stop_string_len = is_include_to_output ? stop_string.length() : 0;
+                decoded_buffer = decoded_buffer.substr(0, pos + stop_string_len);
+                // to remove word splitting symbols from tail; the emptiness check keeps back() from being called on an empty string
+                while (!decoded_buffer.empty() && (decoded_buffer.back() == ' ' || decoded_buffer.back() == '\n')) {
+                    decoded_buffer.pop_back();
+                }
+                if (decoded_buffer.empty()) {  // nothing of the window is kept, so the whole window is removed
+                    result.to_remove = buffer.size();
+                    return result;
                 }
+
+                // find token cnt to be removed from sequence by decoding token by token
+                std::string decoded_partially_string = "";
+                for (size_t i = 0; i < buffer.size(); ++i) {
+                    decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]});
+                    if (decoded_partially_string.find(decoded_buffer) != std::string::npos) {
+                        result.to_remove = buffer.size() - i - 1;  // every token after the one that completes the kept text
+                        break;
+                    }
+                }
+                return result;
             }
-            generated_tokens_rit++;
         }
     }
-    return 0;
+    return result;
 }
 
 // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned.
@@ -245,7 +233,9 @@ std::map<size_t, int32_t> Sampler::GroupBeamSearcher::get_beam_idxs() {
     return next_beams;
 }
 
-void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) {
+void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits,
+    SamplerOutput& sampler_output,
+    const std::pair<size_t, std::set<std::string>>& stop_strings) {
     assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 &&
         "number of beams should be divisible by number of groups");
     size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups;
@@ -392,19 +382,17 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, Sa
                 // There's probably a better way to do that, than copying whole vector...
                 std::vector<int64_t> token_ids = candidate.m_sequence->get_generated_ids();
                 token_ids.push_back(candidate.m_token_id);
-                int num_last_matched_tokens = match_stop_string(m_tokenizer, token_ids, m_sequence_group->get_sampling_parameters().stop_strings);
-                if (num_last_matched_tokens) {
+                auto match_result = match_stop_string(m_tokenizer, token_ids, stop_strings, m_parameters.include_stop_str_in_output);
+                if (match_result.is_matched) {
                     // If beam_token does not belong to top num_beams tokens, it should not be added
                     if (cand_idx >= group_size)
                         continue;
 
-                    if(!m_parameters.include_stop_str_in_output) {
-                        // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point)
-                        candidate.m_sequence->remove_last_tokens(num_last_matched_tokens - 1);
-                    }
+                    // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point)
+                    candidate.m_sequence->remove_last_tokens(match_result.to_remove);
 
                     // try to finish candidate
-                    try_to_finish_candidate(group, candidate, m_parameters.include_stop_str_in_output);
+                    try_to_finish_candidate(group, candidate);
                     continue;
                 }
             }
@@ -576,10 +564,11 @@ std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen
         }
 
         if (!sampling_params.stop_strings.empty()) {
-            int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings);
-            if (num_matched_last_tokens) {
-                if (!sampling_params.include_stop_str_in_output)
-                    running_sequence->remove_last_tokens(num_matched_last_tokens);
+            auto& stop_strings = m_stop_strings.at(sequence_group->get_request_id());
+            auto match_result = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), stop_strings, sampling_params.include_stop_str_in_output);
+            if (match_result.is_matched) {
+                running_sequence->remove_last_tokens(match_result.to_remove);
+
                 running_sequence->set_status(SequenceStatus::FINISHED);
                 running_sequence->set_finish_reason(GenerationFinishReason::STOP);
                 dropped_seq_ids.push_back(running_sequence->get_id());
@@ -741,6 +730,19 @@ float get_p_prime(Sequence::Ptr& running_sequence,
     return p_prime;
 }
 
+std::pair<size_t, std::set<std::string>>
+process_stop_strings(const std::set<std::string>& stop_strings, Tokenizer& tokenizer) {
+    // Builds {largest token count among the encoded stop strings, copy of the stop strings}.
+    size_t longest_encoded_len = 0;
+    for (const std::string& candidate : stop_strings) {
+        const size_t encoded_len = encode_and_process_string(candidate, tokenizer).size();
+        if (encoded_len > longest_encoded_len) {
+            longest_encoded_len = encoded_len;
+        }
+    }
+    return {longest_encoded_len, stop_strings};
+}
+
 SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
                               ov::Tensor logits,
                               bool is_validation_mode_enabled) {
@@ -764,6 +766,12 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
         if (!m_logit_processors.count(request_id)) {
             m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())});
         }
+        if (!m_stop_strings.count(request_id)) {
+            auto processed_stop_string = process_stop_strings(sampling_params.stop_strings, m_tokenizer);
+            m_stop_strings.insert({request_id, processed_stop_string});
+            sequence_group->set_stream_window_size(processed_stop_string.first);
+        }
+        auto& stop_strings = m_stop_strings.at(request_id);
         auto& logit_processor = m_logit_processors.at(request_id);
         const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;
         ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data);
@@ -873,7 +881,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
                 }
 
                 // current algorithm already adds new tokens to running sequences and
-                m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output);
+                m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output, stop_strings);
 
                 // check max length stop criteria
                 std::vector<Sequence::Ptr> running_sequences = sequence_group->get_running_sequences();
@@ -886,8 +894,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
             // Notify handle after sampling is done. 
             // For non-streaming this is effective only when the generation is finished.
             OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request);
-            size_t num_output_token_to_push = num_tokens_to_process - max_removed_tokens_per_request + 1;
-            sequence_group->notify_handle(num_output_token_to_push);
+            sequence_group->notify_handle();
         } else {
             // we are in prompt processing phase when prompt is split into chunks and processed step by step
         }
@@ -926,6 +933,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig
 void Sampler::clear_request_info(uint64_t request_id) { 
     m_beam_search_info.erase(request_id);
     m_logit_processors.erase(request_id);
+    m_stop_strings.erase(request_id);
 }
 
 int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) {
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 0f7876cbf..981e11560 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -55,8 +55,11 @@ class Sampler {
     std::map<uint64_t, GroupBeamSearcher> m_beam_search_info;
 
     std::mt19937 rng_engine;
+    size_t seed = rng_engine.default_seed;
     // { request_id, logit_processor }
     std::map<uint64_t, LogitProcessor> m_logit_processors;
+    // { request_id, { max_encoded_len, { stop_strings }}}
+    std::map<uint64_t, std::pair<size_t, std::set<std::string>>> m_stop_strings;
 
     Tokenizer m_tokenizer;
 
@@ -65,7 +68,11 @@ class Sampler {
     Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {};
 
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
-    void set_seed(size_t seed) { rng_engine.seed(seed); }
+    void set_seed(size_t new_seed) {
+        rng_engine.seed(new_seed);
+        seed = new_seed;  // remembered so get_seed() reports the value last applied to rng_engine
+    }
+    size_t get_seed() const { return seed; }
 
     void clear_request_info(uint64_t request_id);
 
@@ -115,7 +122,7 @@ class Sampler::GroupBeamSearcher {
 public:
     explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer);
 
-    void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output);
+    void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output, const std::pair<size_t, std::set<std::string>>& stop_strings);
     void finalize(SamplerOutput& sampler_output);
     std::map<size_t, int32_t> get_beam_idxs();
 };
diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp
index 6de4adaa4..da65c68be 100644
--- a/src/cpp/src/scheduler.hpp
+++ b/src/cpp/src/scheduler.hpp
@@ -7,10 +7,12 @@
 #include <cstdlib>
 #include <vector>
 
+#include "openvino/runtime/intel_gpu/properties.hpp"
 #include "openvino/genai/scheduler_config.hpp"
 #include "device_config.hpp"
 #include "block_manager.hpp"
 #include "sequence_group.hpp"
+#include "cache_manager.hpp"
 
 namespace ov::genai {
 class Scheduler {
@@ -20,6 +22,13 @@ class Scheduler {
     BlockManager m_block_manager;
     friend class CacheStateDumper;
 
+    bool m_dynamic_memory_allocation = false;
+
+    // Dynamic KV-cache allocation params
+    size_t m_kv_blocks_initial_multiplier = 2;
+    const float m_cache_growth_factor = 2; // common values 1.5 or 2
+
+    std::shared_ptr<CacheManager> m_cache_manager;
 public:
     struct Output {
         // IDs of scheduled groups
@@ -36,15 +45,20 @@ class Scheduler {
         float m_cache_usage = 0.0;
     };
 
-    explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) :
+    explicit Scheduler(size_t block_size, std::shared_ptr<CacheManager> cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) :
+            m_cache_manager(cache_manager),
             m_can_use_partial_preemption(can_use_partial_preemption),
             m_config(config),
             m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) {
+        
         OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero");
     }
 
     Output schedule(std::vector<SequenceGroup::Ptr>& sequence_groups) {
         Output scheduler_output;
+        if (m_block_manager.get_total_number_of_kv_blocks() == 0) {
+            _initialize_cache(sequence_groups);
+        }
 
         if (m_config.dynamic_split_fuse) {
             // deepspeed-mii case
@@ -64,9 +78,9 @@ class Scheduler {
             }
         }
 
+        m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks());
         _clear_waiting_sequences(sequence_groups);
         scheduler_output.m_cache_usage = m_block_manager.get_used_percentage();
-
         return scheduler_output;
     }
 
@@ -236,8 +250,13 @@ class Scheduler {
                 OPENVINO_ASSERT(currently_allocated_token_slots >= occupied_token_slots, "internal error");
                 size_t available_slots = currently_allocated_token_slots - occupied_token_slots,
                        required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0;
-                size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks();
-                size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks);
+                size_t num_required_blocks = (required_slots + block_size - 1) / block_size;
+                while (num_required_blocks > m_block_manager.num_free_blocks()) {
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
+                size_t num_scheduled_blocks = std::min(num_required_blocks, m_block_manager.num_free_blocks());
                 // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks
                 // and total "scheduled capacity"
                 num_scheduled_tokens = std::min(num_scheduled_tokens, available_slots + num_scheduled_blocks * block_size);
@@ -289,10 +308,16 @@ class Scheduler {
                 size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq);
                 sequence_group->schedule_tokens(num_scheduled_tokens_per_seq);
 
+                while (!m_block_manager.can_append_slots(sequence_group)){
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
+
                 _apply_preemption(sequence_group_id, sequence_groups);
 
                 // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence
-                if (!m_block_manager.can_append_slots(sequence_group)){
+                if (!m_block_manager.can_append_slots(sequence_group)) {
                     sequence_group->clear_scheduled_tokens();
                     continue;
                 }
@@ -370,6 +395,11 @@ class Scheduler {
                 // apply KV cache limitations
                 size_t block_size = get_block_size();
                 const size_t num_required_blocks = (sequence_len + block_size - 1) / block_size;
+                while (!m_block_manager.can_allocate_blocks(num_required_blocks)){
+                    if (!_try_increase_cache()) {
+                        break;
+                    }
+                }
                 if (!m_block_manager.can_allocate_blocks(num_required_blocks))
                     break;
 
@@ -405,6 +435,86 @@ class Scheduler {
             sequence_groups[sequence_group_id]->clear_waiting_sequences();
         }
     }
+
+    // Estimates the GPU memory (in bytes) still free for the KV cache: device total minus the
+    // memory reported as in use, padded by 10%. Yields 0 instead of wrapping when usage >= total.
+    size_t _get_available_gpu_memory() {
+        auto device_config = m_cache_manager->get_device_config();
+        auto core = m_cache_manager->get_core();
+        auto device = device_config->get_device();
+        OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only.");
+        auto memory_statistics = core->get_property(device, ov::intel_gpu::memory_statistics);
+        auto device_type = core->get_property(device, ov::device::type);
+
+        // sum up all used device memory
+        std::vector<std::string> device_memory_types = {"cl_mem", "usm_device"};
+        size_t used_device_mem = 0;
+        for (const auto& mem_type : device_memory_types) {
+            used_device_mem += memory_statistics[mem_type];
+        }
+        if (device_type == ov::device::Type::INTEGRATED) {
+            used_device_mem += memory_statistics["usm_host"];
+        }
+
+        // there could be unaccounted extra memory reserved by kernels, kept
+        // in memory pools, etc
+        // therefore, add a threshold to account for this
+        float used_memory_threshold = 1.1;
+        used_device_mem *= used_memory_threshold;
+
+        // total device memory in bytes; the subtraction is unsigned, so clamp at 0 rather than wrap around
+        auto total_device_memory = core->get_property(device, ov::intel_gpu::device_total_mem_size);
+        return total_device_memory > used_device_mem ? total_device_memory - used_device_mem : 0;
+    }
+
+    // Sizes the initial KV cache from the given requests, then switches on dynamic cache growth.
+    void _initialize_cache(const std::vector<SequenceGroup::Ptr>& sequence_groups) {
+        size_t blocks_sum = 0, block_size = m_block_manager.get_block_size();
+        for (const auto& sequence_group : sequence_groups) {
+            auto gen_config = sequence_group->get_sampling_parameters();
+            auto seq_length = std::min(sequence_group->get_prompt_len() * m_kv_blocks_initial_multiplier, sequence_group->get_prompt_len() + gen_config.get_max_new_tokens(sequence_group->get_prompt_len()));
+            size_t blocks_num = (seq_length + block_size - 1) / block_size;  // integer ceil-division, as elsewhere in the scheduler
+            if (gen_config.is_beam_search()) {
+                blocks_num *= gen_config.num_beams;
+            } else if (gen_config.is_multinomial()) {
+                blocks_num *= gen_config.num_return_sequences;
+            }
+            blocks_sum += blocks_num;
+        }
+        m_block_manager.increase_kv_blocks_number(blocks_sum);
+        m_dynamic_memory_allocation = true;
+    }
+
+    // Tries to enlarge the KV cache by m_cache_growth_factor; on GPU the growth is capped by
+    // the memory _get_available_gpu_memory() reports as free.
+    // Returns true only if the block manager was asked for more blocks than it currently has,
+    // so the `while (...) { if (!_try_increase_cache()) break; }` loops in this class terminate.
+    bool _try_increase_cache() {
+        if (!m_dynamic_memory_allocation) {
+            return false;
+        }
+
+        auto device_config = m_cache_manager->get_device_config();
+        auto device = device_config->get_device();
+        size_t current_num_of_kv_blocks = m_block_manager.get_total_number_of_kv_blocks();
+        size_t new_blocks_num = current_num_of_kv_blocks * m_cache_growth_factor;
+
+        if (device.find("GPU") != std::string::npos) {
+            // grow only as far as the free device memory allows
+            size_t block_size_in_bytes = device_config->get_block_size_in_bytes();
+            size_t possible_blocks_to_add = _get_available_gpu_memory() / block_size_in_bytes;
+            new_blocks_num = std::min(new_blocks_num, current_num_of_kv_blocks + possible_blocks_to_add);
+        }
+
+        // no growth possible (no free GPU memory, or a current block count of 0 that cannot be scaled):
+        // report failure rather than "succeeding" without adding any block
+        if (new_blocks_num <= current_num_of_kv_blocks) {
+            return false;
+        }
+        m_block_manager.increase_kv_blocks_number(new_blocks_num);
+        return true;
+    }
+
 };
 
 }
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 6755255fe..220e93c03 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -126,23 +126,28 @@ class Sequence {
         }
     }
 
-    GenerationOutput get_last_generation_output(size_t token_cnt = 1) {
+    GenerationOutput get_last_generation_output(size_t token_cnt = 1, size_t num_token_to_ignore = 0) {
         GenerationOutput output;
-        OPENVINO_ASSERT(m_generated_ids.size());
-        output.score = get_cumulative_log_probs();
+        if (token_cnt > 0) {
+            OPENVINO_ASSERT(m_generated_ids.size());
+            output.score = get_cumulative_log_probs();
 
-        auto generated_token_id = get_generated_ids();
-        auto generated_log_probs = get_generated_log_probs();
+            auto generated_token_id = get_generated_ids();
+            auto generated_log_probs = get_generated_log_probs();
 
-        OPENVINO_ASSERT(get_generated_len() >= token_cnt);
-        auto offset = get_generated_len() - token_cnt;
+            OPENVINO_ASSERT(get_generated_len() >= token_cnt);
+            if (get_generated_len() >= token_cnt + num_token_to_ignore) {  // keeps the unsigned offset below from wrapping
+                auto offset = get_generated_len() - token_cnt - num_token_to_ignore;
+                auto offset_back = get_generated_len() - num_token_to_ignore;
 
-        std::vector<int64_t> token_id(generated_token_id.begin() + offset, generated_token_id.end());
-        std::vector<float> log_probs(generated_log_probs.begin() + offset, generated_log_probs.end());
+                std::vector<int64_t> token_id(generated_token_id.begin() + offset, generated_token_id.begin() + offset_back);
+                std::vector<float> log_probs(generated_log_probs.begin() + offset, generated_log_probs.begin() + offset_back);
 
-        output.generated_ids = token_id;
-        output.generated_log_probs = log_probs;
-        output.finish_reason = get_finish_reason();
+                output.generated_ids = token_id;
+                output.generated_log_probs = log_probs;
+                output.finish_reason = get_finish_reason();
+            }
+        }
         return output;
     }
 
@@ -173,8 +178,6 @@ class Sequence {
         return score;
     }
 
-
-
     // Each KV block can be uniquely identified by
     void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) {
         m_sequence_group = sequence_group;
@@ -221,6 +224,8 @@ class SequenceGroup {
     // flag to enable/disable token generation, e.g. in speculative decoding scenario
     bool m_is_gen_paused = false;
 
+    size_t m_num_streamed_tokens = 0, m_stream_window_size = 0;
+
 
     SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching)
         : m_request_id(request_id),
@@ -332,14 +337,16 @@ class SequenceGroup {
     std::vector<Sequence::CPtr> get_finished_sequences() const {
         std::vector<Sequence::CPtr> finished_seqs;
         for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) {
-            if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory()) {
+            if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory() || handle_dropped()) {
                 finished_seqs.push_back(m_sequences[seq_id]);
             }
         }
 
-        // do we need to sort sequences here or sampler can handle it for us?
-        std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) {
-            return s1->get_beam_search_score(m_sampling_params) > s2->get_beam_search_score(m_sampling_params);
+        std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) -> bool {
+            bool is_beam_search = m_sampling_params.is_beam_search();
+            const float score_1 = is_beam_search ? s1->get_beam_search_score(m_sampling_params) : s1->get_cumulative_log_probs();
+            const float score_2 = is_beam_search ? s2->get_beam_search_score(m_sampling_params) : s2->get_cumulative_log_probs();
+            return score_1 > score_2;
         });
 
         return finished_seqs;
@@ -454,6 +461,10 @@ class SequenceGroup {
     size_t get_num_tokens_to_validate() {
         return m_num_validation_tokens;
     }
+    
+    void set_stream_window_size(size_t k) {
+        m_stream_window_size = k;  // number of most recent tokens notify_handle() withholds from streaming until the group has finished
+    }
 
     size_t get_num_available_tokens_for_batching() const {
         OPENVINO_ASSERT(!has_finished(), "Internal error: this function cannot be called on finished sequence group");
@@ -571,7 +582,7 @@ class SequenceGroup {
         m_generation_stream->set_generation_status(status);
     }
 
-    bool handle_dropped() {
+    bool handle_dropped() const {
         return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
     }
 
@@ -601,7 +612,7 @@ class SequenceGroup {
         for (auto& sequence : m_sequences) {
             // todo: check seq.is_finished() to generate without several </s>
             // or is it ok to use padding?
-            auto output = sequence->get_last_generation_output(token_cnt);
+            auto output = sequence->get_last_generation_output(token_cnt, m_stream_window_size);
             if (m_sampling_params.echo && !m_has_echoed) {
                 output.generated_ids.insert(output.generated_ids.begin(), m_prompt_ids.begin(), m_prompt_ids.end());
                 output.generated_log_probs.insert(output.generated_log_probs.begin(), m_prompt_log_probs.begin(), m_prompt_log_probs.end());
@@ -612,24 +623,36 @@ class SequenceGroup {
         m_generation_stream->push(std::move(outputs));
     }
 
-    void notify_handle(size_t num_output_token_to_push = 0) {
+    void notify_handle() {
         if (out_of_memory()) {
             set_generation_status(GenerationStatus::IGNORED);
         } else if (has_finished()) {
             set_generation_status(GenerationStatus::FINISHED);
         }
         // For beam search streaming is not available, so we notify only upon finishing
-        if(m_sampling_params.is_beam_search()) {
+        if (m_sampling_params.is_beam_search()) {
             if (has_finished() || out_of_memory()) {
                 push_outputs();
             }
         } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) {
             // We can stream only when one sequence is returned and we don't use stop strings that would be excluded from the output
             // (after stop string is detected its tokens are already sent)
-            if (num_total_seqs() == 1 &&
-                (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) {
-                if (num_output_token_to_push)
-                    push_partial_outputs(num_output_token_to_push);
+            if (num_total_seqs() == 1) {
+                const auto generated_len = m_sequences.front()->get_generated_len();
+                if (has_finished()) {
+                    m_stream_window_size = 0;
+                }
+                if (generated_len <= (m_num_streamed_tokens + m_stream_window_size)) {
+                    return;
+                }
+                // speculative decoding draft handling
+                if (generated_len < m_num_streamed_tokens) {
+                    m_num_streamed_tokens = generated_len;
+                }
+                OPENVINO_ASSERT(generated_len >= (m_num_streamed_tokens + m_stream_window_size));
+                size_t num_output_token_to_push = generated_len - m_num_streamed_tokens - m_stream_window_size;
+                push_partial_outputs(num_output_token_to_push);
+                m_num_streamed_tokens += (num_output_token_to_push);
             } else if (has_finished() || out_of_memory()) {
                 push_outputs();
             }
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
index 01202c086..ff56b78b1 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
@@ -46,14 +46,14 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
                                draft_scheduler_config = is_scheduler_undefined ? main_scheduler_config : draft_model_desc.scheduler_config;
     if (is_scheduler_undefined) {
         // split KV cache to 2 caches for main and draft models
-        size_t main_model_cache_size = utils::get_kv_cache_size(main_model),
-            draft_model_cache_size = utils::get_kv_cache_size(draft_model);
-        auto k = static_cast<float>(draft_model_cache_size) / (main_model_cache_size + draft_model_cache_size);
+        size_t main_model_hidden_size = utils::get_hidden_size(main_model),
+               draft_model_hidden_size = utils::get_hidden_size(draft_model);
+        auto k = static_cast<float>(draft_model_hidden_size) / (main_model_hidden_size + draft_model_hidden_size);
 
-        size_t main_cache_size = main_scheduler_config.cache_size * (1 - k),
+        size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)),
                draft_cache_size = main_scheduler_config.cache_size - main_cache_size;
-        if (draft_cache_size == 0) {
-            main_cache_size -= main_cache_size > 1 ? 1 : 0;
+        if (draft_cache_size == 0 && main_cache_size > 0) {
+            main_cache_size -= (main_cache_size > 1 ? 1 : 0);
             draft_cache_size = 1;
         }
 
@@ -63,7 +63,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
 
     ov::AnyMap draft_properties = draft_model_desc.properties == ov::AnyMap{} ? compile_properties : draft_model_desc.properties;
 
-    DeviceConfig main_device_config(core, main_scheduler_config, main_device, compile_properties),
+    DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, compile_properties),
                  draft_device_config(core, draft_scheduler_config, draft_device, draft_properties);
 
     utils::set_kv_cache_type_and_shape(main_model, main_device_config);
@@ -82,7 +82,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con
     // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
     m_main_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         main_model, main_model_tokenizer, main_model_desc.generation_config,
-        main_device_config, main_scheduler_config, main_device, compile_properties, true);
+        main_device_config, main_scheduler_config_updated, main_device, compile_properties, true);
     m_draft_pipeline = std::make_shared<ContinuousBatchingForSpeculativeDecodingImpl>(core,
         draft_model, draft_model_tokenizer, draft_model_desc.generation_config,
         draft_device_config, draft_scheduler_config, draft_device, draft_properties, false);
@@ -304,4 +304,4 @@ SpeculativeDecodingMetrics
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::get_speculative_decoding_metrics() {
     return m_sd_metrics;
 };
-}
\ No newline at end of file
+}
diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp
index 314a7ffa4..5938b55f6 100644
--- a/src/cpp/src/text_callback_streamer.cpp
+++ b/src/cpp/src/text_callback_streamer.cpp
@@ -52,4 +52,4 @@ void TextCallbackStreamer::end() {
 ov::genai::StreamerBase::~StreamerBase() = default;
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp
index a03b0decc..6f0872ad1 100644
--- a/src/cpp/src/text_callback_streamer.hpp
+++ b/src/cpp/src/text_callback_streamer.hpp
@@ -25,4 +25,4 @@ class TextCallbackStreamer: public StreamerBase {
 };
 
 }  // namespace genai
-}  // namespace ov
+}  // namespace ov
\ No newline at end of file
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 642236d32..82c0a17a5 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -89,15 +89,16 @@ class Tokenizer::TokenizerImpl {
 public:
     ov::CompiledModel m_tokenizer;
     ov::CompiledModel m_detokenizer;
-    
+
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
-    // To change the adding special tokens mode we use a statefull subgraph, 
+
+    // To change the adding special tokens mode we use a stateful subgraph,
     // this flag holds the current state value of the CompiledModel.
     bool m_add_special_tokens = true;
     bool m_skip_special_tokens = true;
     bool m_older_than_24_5 = false;
-    
+
     int64_t m_pad_token_id = -1;
     int64_t m_bos_token_id = -1;
     int64_t m_eos_token_id = -1;
@@ -111,6 +112,7 @@ class Tokenizer::TokenizerImpl {
     void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
         bool add_special_tokens_flag = m_add_special_tokens;
         bool skip_special_tokens_flag = m_skip_special_tokens;
+
         ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
         ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
 
@@ -126,11 +128,11 @@ class Tokenizer::TokenizerImpl {
             // state but the effect is incorrect.
             return;
         }
-        
+
         // add_special_tokens is managed by Select op with a bool input.
         ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
         *add_special_tensor.data<bool>() = add_special_tokens_flag;
-        
+
         // skip_special_tokens is managed by multiplication with a number, therefore i32.
         ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
         *skip_special_tensor.data<int>() = skip_special_tokens_flag;
@@ -148,19 +150,19 @@ class Tokenizer::TokenizerImpl {
 
     TokenizerImpl() = default;
 
-    TokenizerImpl(const std::filesystem::path& models_papth,  const ov::AnyMap& properties) {
-        setupTokenizer(models_papth, properties);
+    TokenizerImpl(const std::filesystem::path& models_path,  const ov::AnyMap& properties) {
+        setup_tokenizer(models_path, properties);
     }
 
     TokenizerImpl(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
-        setupTokenizer(models, properties);
+        setup_tokenizer(models, properties);
     }
 
-    void setupTokenizer(const std::filesystem::path& models_path,  const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
         ScopedVar env_manager(tokenizers_relative_to_genai().string());
         auto core = get_core_singleton();
 
-        OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file");
+        OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file");
 
         std::shared_ptr<ov::Model> ov_tokenizer = nullptr;
         std::shared_ptr<ov::Model> ov_detokenizer = nullptr;
@@ -168,12 +170,12 @@ class Tokenizer::TokenizerImpl {
         if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) {
             ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml");
         }
-        
+
         if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) {
             ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml");
         }
 
-        setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
+        setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
 
         // If special tokens were not found from IR, try to read them from config.
         // This will be triggered only for IRs older than 2024.3.
@@ -184,20 +186,25 @@ class Tokenizer::TokenizerImpl {
             // Try to read tokenizer_config if some token ids or token str are not defined.
             read_tokenizer_config_if_necessary(models_path);
         }
-        
+
         // If chat_template was not found in IR, try to read them from config.
         if (m_chat_template.empty()) {
             m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path);
         }
     }
-    
 
-    void setupTokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models,  const ov::AnyMap& properties) {
+    void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
         auto [ov_tokenizer, ov_detokenizer] = models;
+        OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided");
 
-        m_older_than_24_5 = ov_tokenizer->get_rt_info().count("openvino_tokenizers_version") != 1;
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
+
+        std::string version_str;
+        utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
+        // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
+        m_older_than_24_5 = version_str.empty();
+
         if (ov_tokenizer) {
             ov::pass::Manager manager;
             manager.register_pass<MakeCombineSegmentsSatateful>();
@@ -225,12 +232,13 @@ class Tokenizer::TokenizerImpl {
                     return std::move(this->m_detokenizer.create_infer_request());
                 });
         }
-        
+
         // Initialize tokenizer's cache to save time later.
         if (m_tokenizer) {
             // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
-            encode("non empty string").input_ids;
-        if (m_detokenizer)
+            encode("non empty string");
+        }
+        if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});
         }
 
@@ -279,10 +287,11 @@ class Tokenizer::TokenizerImpl {
 
         nlohmann::json data = nlohmann::json::parse(f);
 
-        using ov::genai::utils::read_json_param;
         // they are in the format {"bos_token": { "content": "<s>",... }}
-        auto read_token_content_str = [&data](std::string key_name, std::string& val) {
-            if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); }
+        auto read_token_content_str = [&data](const std::string& key_name, std::string& val) {
+            if (val.empty() && data.contains(key_name)) {
+                utils::read_json_param(data[key_name], "content", val);
+            }
         };
         read_token_content_str(pad_token_key_name, m_pad_token);
         read_token_content_str(bos_token_key_name, m_bos_token);
@@ -377,6 +386,9 @@ class Tokenizer::TokenizerImpl {
     }
 
     TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
+
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
         set_state_if_necessary(infer_request_guard, tokenization_params);
         size_t batch_size = 1;
@@ -384,12 +396,14 @@ class Tokenizer::TokenizerImpl {
         infer_request_guard.get().start_async();
         infer_request_guard.get().wait();
         return get_copied_results(
-            infer_request_guard.get().get_tensor("input_ids"),
-            infer_request_guard.get().get_tensor("attention_mask")
+            infer_request_guard.get().get_output_tensor(0),
+            infer_request_guard.get().get_output_tensor(1)
         );
     }
 
     TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
+        OPENVINO_ASSERT(m_ireq_queue_tokenizer, "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. "
+                                                "Tokenizer::encode is not available");
         TokenizedInputs unpadded;
         {
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
@@ -400,8 +414,8 @@ class Tokenizer::TokenizerImpl {
             infer_request_guard.get().wait();
 
             unpadded = get_copied_results(
-                infer_request_guard.get().get_tensor("input_ids"),
-                infer_request_guard.get().get_tensor("attention_mask")
+                infer_request_guard.get().get_output_tensor(0),
+                infer_request_guard.get().get_output_tensor(1)
             );
         }
         return pad_left(unpadded.input_ids, unpadded.attention_mask);
@@ -482,7 +496,7 @@ class Tokenizer::TokenizerImpl {
             {"is none", "is undefined"},
             {"= none", "= undefined"},
             // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
-            // If chat template contains such slicing, we replace it with 
+            // If chat template contains such slicing, we replace it with
             // a placeholder at the moment.
             {"messages[1:]", "slice(messages, 1)"},
         };
@@ -525,7 +539,7 @@ class Tokenizer::TokenizerImpl {
         env.GetSettings().trimBlocks = true;
         jinja2::Template tpl(&env);
         tpl.Load(chat_tpl);
-        
+
         jinja2::UserCallable slice_callable = jinja2::MakeCallable(
             [](const jinja2::GenericList& messages, const size_t& start) {
                 jinja2::ValuesList result;
@@ -595,7 +609,7 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
     ScopedVar env_manager(tokenizers_relative_to_genai().string());
     auto core = get_core_singleton();
     auto model = core.read_model(model_str, weights_tensor);
-    
+
     auto parameters = model->get_parameters();
     OPENVINO_ASSERT(!parameters.empty());
     if (parameters.front()->get_element_type() == ov::element::string) {
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 9fa14b7f9..be9fc972d 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -381,6 +381,27 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t se
     }
 }
 
+// Builds a copy of a 2D i64 tensor with `add_to_front` inserted as the first element of every row.
+//
+// base_tensor:  tensor indexed as [batch, seq_len]; its data is read through data<int64_t>().
+// add_to_front: value written to column 0 of each row of the result.
+// Returns a newly allocated i64 tensor of shape [batch, seq_len + 1].
+ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front) {
+    const size_t batch_size = base_tensor.get_shape().at(0);
+    const size_t seq_len = base_tensor.get_shape().at(1);
+
+    ov::Tensor new_tensor{ov::element::i64, {batch_size, seq_len + 1}};
+    const int64_t* src_row = base_tensor.data<int64_t>();
+    int64_t* dst_row = new_tensor.data<int64_t>();
+    // Rows of the result are one element longer than the source rows, so every row has to be
+    // shifted on its own; a single flat copy is only correct when batch == 1.
+    for (size_t row = 0; row < batch_size; ++row, src_row += seq_len, dst_row += seq_len + 1) {
+        dst_row[0] = add_to_front;
+        std::copy_n(src_row, seq_len, dst_row + 1);
+    }
+    return new_tensor;
+}
+
 void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title) {
     // Specify the name of the environment variable
     const char* env_var_name = "OPENVINO_LOG_LEVEL";
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 5342ac427..57225e60f 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -28,6 +28,26 @@ enum class GenerationChatInputsType {
     ENCODED_INPUTS = 2, // Type of inputs is EncodedInputs
 };
 
+// Bookkeeping used to decide how much of the KV cache has to be rolled back between generations.
+struct HistoryRemoveManager
+{
+    // Number of tokens to drop from the end of the KV cache before the next generation.
+    size_t num_tokens_to_remove_from_kv_cache = 0;
+    // Length of the tokenized-history prefix that is treated as trusted (kept as-is).
+    size_t trusted_history_length = 0;
+
+    // Returns true when either counter is non-zero. Does not modify the object.
+    bool does_kv_cache_need_to_update() const {
+        return (trusted_history_length > 0 || num_tokens_to_remove_from_kv_cache > 0);
+    }
+
+    // Sets both counters back to zero.
+    void reset() {
+        num_tokens_to_remove_from_kv_cache = 0;
+        trusted_history_length = 0;
+    }
+};
+
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
@@ -104,6 +119,8 @@ size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
 
 void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, size_t seq_length_axis, std::optional<AdapterController> adapter_controller);
 
+ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, int64_t add_to_front);
+
 void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const char* model_title);
 
 }  // namespace utils
diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp
index 53690f770..4dedcf989 100644
--- a/src/cpp/src/utils/paged_attention_transformations.cpp
+++ b/src/cpp/src/utils/paged_attention_transformations.cpp
@@ -10,13 +10,8 @@ namespace ov {
 namespace genai {
 namespace utils {
 
-inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) {
-    ov::PartialShape partial_shape = static_shape;
-    partial_shape[0] = ov::Dimension::dynamic();
-    return partial_shape;
-}
 
-size_t get_kv_cache_size(const std::shared_ptr<ov::Model> model) {
+size_t get_hidden_size(const std::shared_ptr<ov::Model> model) {
     const auto& parameters = model->get_parameters();
     // extract num_kv_heads and head_size
     size_t kv_caches_inputs_offset = 2;
@@ -65,9 +60,8 @@ void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig&
     for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) {
         it_k->second->set_element_type(device_config.get_cache_precision());
         it_v->second->set_element_type(device_config.get_cache_precision());
-        // TODO: CVS-145270
-        it_k->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape()));
-        it_v->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape()));
+        it_k->second->set_partial_shape(device_config.get_key_cache_shape());
+        it_v->second->set_partial_shape(device_config.get_value_cache_shape());
     }
 
     model->validate_nodes_and_infer_types();
diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp b/src/cpp/src/utils/paged_attention_transformations.hpp
index 3bc423d7b..88ac0876c 100644
--- a/src/cpp/src/utils/paged_attention_transformations.hpp
+++ b/src/cpp/src/utils/paged_attention_transformations.hpp
@@ -23,7 +23,7 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
 
 void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false);
 
-size_t get_kv_cache_size(const std::shared_ptr<ov::Model> model);
+size_t get_hidden_size(const std::shared_ptr<ov::Model> model);
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index cf77dfce3..e53be4e1c 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -10,6 +10,7 @@
 
 #include "utils.hpp"
 
+
 namespace {
 
 constexpr size_t BATCH_SIZE = 1;
@@ -40,10 +41,13 @@ class InputsEmbedder::IInputsEmbedder {
     // Templated chat history
     std::string m_templated_chat_history;
     // Tokenized chat history
-    std::vector<int64_t> m_tokenized_chat_history;
-    // The number of elements, which need to remove from the end of KV cache
-    // removed elements will be added to inputs_ids
-    size_t m_to_remove_from_hist = 0;
+    std::vector<int64_t> m_tokenized_history;
+    // Tail of previous output for LM in chat mode is missing in KV cache.
+    std::optional<int64_t> m_last_disappeared_token = std::nullopt;
+    // If a sequence contains symbols that the tokenizer can encode ambiguously, we need to trim the KV cache.
+    // If beam search sampling is used in chat mode, we need to remove the model's last answer from the KV cache and add the best answer to the history,
+    // so keep track of the number of tokens to trim from the KV cache and the number of tokens to keep in the history.
+    ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
 public:
     virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -56,26 +60,34 @@ class InputsEmbedder::IInputsEmbedder {
         return m_tokenizer;
     }
 
-    std::vector<int64_t> get_tokenized_chat_history() const {
-        return m_tokenized_chat_history;
+    std::vector<int64_t> get_tokenized_history() const {
+        return m_tokenized_history;
     }
 
-    size_t get_amount_to_remove_from_hist() const {
-        return m_to_remove_from_hist;
+    size_t get_num_tokens_to_remove_from_hist() const {
+        return m_kv_history_manager.num_tokens_to_remove_from_kv_cache;
     }
 
-    void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
-        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
-        m_to_remove_from_hist = 0;
+    void update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len) {
+        if (is_beam_search) {
+            m_kv_history_manager.trusted_history_length = m_tokenized_history.size();
+            m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len;
+        } else {
+            m_kv_history_manager.reset();
+        }
+
+        m_last_disappeared_token = last_disappeared_token;
+  
+        std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history));
     }
 
     virtual void start_chat(const std::string& system_message) {
         m_is_chat_conversation = true;
-        m_to_remove_from_hist = 0;
-        if (!m_tokenized_chat_history.empty()) {
+        m_kv_history_manager.reset();
+        if (!m_tokenized_history.empty()) {
             m_history.clear();
             m_templated_chat_history.clear();
-            m_tokenized_chat_history.clear();
+            m_tokenized_history.clear();
         }
         if (system_message.empty()) {
             return;
@@ -94,11 +106,11 @@ class InputsEmbedder::IInputsEmbedder {
 
     virtual void finish_chat() {
         m_is_chat_conversation = false;
-        m_to_remove_from_hist = 0;
+        m_kv_history_manager.reset();
 
         m_history.clear();
         m_templated_chat_history.clear();
-        m_tokenized_chat_history.clear();
+        m_tokenized_history.clear();
     }
 
 protected:
@@ -164,38 +176,55 @@ class InputsEmbedder::IInputsEmbedder {
             // some symbols combinations can be encoded by the tokenizer in different ways
             // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
             // so let's check it out, find the trusted part and use it in on the next step
-            size_t last_same_hist_token = 0;
-            if (!m_tokenized_chat_history.empty()) {
+            size_t trusted_history_length = 0;
+            if (!m_tokenized_history.empty()) {
                 std::set<int64_t> stop_tokens = {m_tokenizer.get_eos_token_id()};
-                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens);
             }
 
-            if (m_tokenized_chat_history.empty()) {
+            if (m_tokenized_history.empty()) {
                 encoded_input_ids = new_chat_tokens;
-            } else if (last_same_hist_token != SIZE_MAX) {
-                m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
+
+            } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) {
+                // does_kv_cache_need_to_update() is true here if beam search is activated.
+                // In beam search mode we want to remove the whole last model answer from the KV cache and add the best answer directly.
+                // If the model answer and the decoded answer differ, the difference is in any case shorter than the entire history, so use the data from m_kv_history_manager.
+                if (m_kv_history_manager.does_kv_cache_need_to_update()) {
+                    trusted_history_length = m_kv_history_manager.trusted_history_length;
+                } else {
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length;
+                    // if the previous generation finished because the max length was reached, the KV cache is missing the last token, so keep it
+                    m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0;
+                }
 
                 ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
-                                                   {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
-                                                   new_chat_tokens.data<int64_t>() + last_same_hist_token);
-                encoded_input_ids = new_tensor;
+                                                   {1, new_chat_tokens.get_shape().at(1) - trusted_history_length},
+                                                   new_chat_tokens.data<int64_t>() + trusted_history_length);
+                encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(),
+                                                    {1, new_chat_tokens.get_shape().at(1) - trusted_history_length});
+                new_tensor.copy_to(encoded_input_ids);
             } else {
                 encoded_input_ids = utils::subtract_chat_tokenized_inputs(
                     {new_chat_tokens}, prev_chat_tokens
                 ).input_ids;
+
+                if (m_last_disappeared_token.has_value())
+                    encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token);
             }
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
             m_templated_chat_history = std::move(new_templated_chat_history);
-            m_tokenized_chat_history.clear();
-            std::copy(new_chat_tokens.data<int64_t>(), new_chat_tokens.data<int64_t>() + new_chat_tokens.get_size(),
-                        std::back_inserter(m_tokenized_chat_history));
+            m_tokenized_history.clear();
+            std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
         } else {
             auto start_tokenizer_time = std::chrono::steady_clock::now();
             encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
+            m_tokenized_history.clear();
+            std::copy_n(encoded_input_ids.data<int64_t>(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history));
         }
+
         return encoded_input_ids;
     }
 
@@ -1172,16 +1201,16 @@ EmbeddingsModel InputsEmbedder::get_embedding_model() const {
     return m_impl->get_embedding_model();
 }
 
-std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
-    return m_impl->get_tokenized_chat_history();
+std::vector<int64_t> InputsEmbedder::get_tokenized_history() const {
+    return m_impl->get_tokenized_history();
 }
 
-void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
-    return m_impl->update_tokenized_chat_history(encoded_result);
+void InputsEmbedder::update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len) {
+    return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len);
 }
 
-size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
-    return m_impl->get_amount_to_remove_from_hist();
+size_t InputsEmbedder::get_num_tokens_to_remove_from_hist() const {
+    return m_impl->get_num_tokens_to_remove_from_hist();
 }
 
 Tokenizer InputsEmbedder::get_tokenizer() const {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 5c5b9d2b8..1d72b742a 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -41,16 +41,20 @@ class InputsEmbedder {
     Tokenizer get_tokenizer() const;
 
     // returns tokenized chat history
-    std::vector<int64_t> get_tokenized_chat_history() const;
-    // add new results to tokenized chat history
-    void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
+    std::vector<int64_t> get_tokenized_history() const;
+
+    // add new results to tokenized history
+    void update_tokenized_history(const std::vector<int64_t>& encoded_result, std::optional<int64_t> last_disappeared_token, bool is_beam_search, size_t last_answer_len);
+
     // returns amount of elements, which need to remove from the end of the KV cache
-    size_t get_amount_to_remove_from_hist() const;
+    size_t get_num_tokens_to_remove_from_hist() const;
 
     // starts chat and adds optional system_message to chat history
     void start_chat(const std::string& system_message);
+
     // adds currently generated text to chat history
     void update_chat_history(const std::string& decoded_results);
+
     // finishes chat and clears a chat history 
     void finish_chat();
 private:
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 1ce0cbf21..d62548520 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -17,6 +17,7 @@
 #include "utils.hpp"
 #include "lm_encoding.hpp"
 
+
 using namespace ov::genai;
 
 namespace {
@@ -66,6 +67,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
     float m_load_time_ms = 0;
     // Axis num in kv cache from m_language model, which contains information about history len
     size_t m_kv_cache_seq_length_axis = 2;
+    // Component for applying sampling to lm outputs
+    Sampler m_sampler;
 
     VLMPipelineImpl(
         const std::filesystem::path& models_dir,
@@ -104,6 +107,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
         }
+
+        m_sampler = Sampler(m_tokenizer);
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     VLMPipelineImpl(
@@ -139,6 +145,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         if (m_generation_config.eos_token_id == -1) {
             m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
         }
+
+        m_sampler = Sampler(m_tokenizer);
+        m_sampler.set_seed(m_generation_config.rng_seed);
     }
 
     VLMDecodedResults generate(
@@ -160,22 +169,21 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
 
-        auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
+        auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist();
         ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, m_kv_cache_seq_length_axis, std::nullopt);
 
-        Sampler sampler = Sampler(m_tokenizer);
-
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1; // not used
         bool enable_prefix_caching = false;
 
-        auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
         size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
         size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
 
+        auto tokenized_history = m_inputs_embedder->get_tokenized_history();
         ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
-        std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
+        std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
+        std::copy(tokenized_history.begin(), tokenized_history.end(), prompt_ids.data<int64_t>());
 
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
         sequence_group->set_sequence_group_ptr(sequence_group);
@@ -195,8 +203,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             },
         }, streamer);
 
-        OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
-                        "Currently streaming is possible only for greedy or multinomial decoding");
+        OPENVINO_ASSERT(streamer_ptr == nullptr || generation_config.num_return_sequences == 1 &&
+            (generation_config.is_greedy_decoding() || generation_config.is_multinomial()),
+            "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");
 
         ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, { 1, history_size + inputs_embeds_size }};
         std::fill_n(new_atten_mask.data<int64_t>(), new_atten_mask.get_size(), 1);
@@ -204,10 +213,14 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
         std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);
 
+        if (m_sampler.get_seed() != generation_config.rng_seed) {
+            m_sampler.set_seed(generation_config.rng_seed);
+        }
+
         ov::genai::EncodedResults encoded_result;
-        int32_t m_selected_beam = 0;
-        std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,
-                                                                                      position_ids, m_embedding, std::nullopt);
+        std::optional<int64_t> last_disappeared_token;
+        std::tie(encoded_result, last_disappeared_token) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, requests,
+                                                                                             position_ids, m_embedding);
 
         auto decode_start_time = std::chrono::steady_clock::now();
         VLMDecodedResults decoded;
@@ -217,6 +230,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         }
         auto decode_end_time = std::chrono::steady_clock::now();
 
+        m_inputs_embedder->update_tokenized_history(encoded_result.tokens[0], last_disappeared_token, generation_config.is_beam_search(),
+                                                    m_language.get_tensor("attention_mask").get_shape()[1] - (history_size + inputs_embeds_size));
+
         std::string decoded_results = decoded.texts.at(0);
         if (m_is_chat_conversation) {
             m_inputs_embedder->update_chat_history(decoded_results);
@@ -243,8 +259,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         decoded.perf_metrics.m_evaluated = false;
         decoded.perf_metrics.evaluate_statistics(generate_start_time);
 
-        m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
-
         return decoded;
     }
 
diff --git a/src/cpp/src/whisper/context_tokens.cpp b/src/cpp/src/whisper/context_tokens.cpp
new file mode 100644
index 000000000..75ee44255
--- /dev/null
+++ b/src/cpp/src/whisper/context_tokens.cpp
@@ -0,0 +1,102 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "context_tokens.hpp"
+
+#include <chrono>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+// Encodes `text` with add_special_tokens(false) and strips the special tokens listed below.
+// Returns the remaining token ids and the encode duration reported by PerfMetrics::get_microsec;
+// an empty `text` yields an empty vector and a zero duration without calling the tokenizer.
+std::pair<std::vector<int64_t>, float> tokenize(std::string&& text,
+                                                const ov::genai::WhisperGenerationConfig& config,
+                                                ov::genai::Tokenizer& tokenizer) {
+    if (text.empty()) {
+        return {{}, 0.0f};
+    }
+
+    const auto start_time = std::chrono::steady_clock::now();
+    auto encoded = tokenizer.encode(text, ov::genai::add_special_tokens(false));
+    const auto duration = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - start_time);
+
+    const auto& input_ids = encoded.input_ids;
+    const int64_t* input_ids_data = input_ids.data<int64_t>();
+    const size_t num_tokens = input_ids.get_size();
+
+    // even with ov::genai::add_special_tokens(false) tokenizer adds next special tokens. Ticket: 159569
+    const std::set<int64_t> special_tokens{config.decoder_start_token_id, config.eos_token_id, config.no_timestamps_token_id};
+
+    std::vector<int64_t> prompt_tokens;
+    prompt_tokens.reserve(num_tokens);
+    for (size_t i = 0; i < num_tokens; i++) {
+        if (special_tokens.count(input_ids_data[i]) == 0) {
+            prompt_tokens.emplace_back(input_ids_data[i]);
+        }
+    }
+
+    // Move the vector into the returned pair instead of copying it.
+    return {std::move(prompt_tokens), duration};
+}
+}  // namespace
+
+namespace ov {
+namespace genai {
+
+// Tokenizes config.initial_prompt and config.hotwords (each prefixed with a single space) when they
+// are set. Returns the resulting tokens and the sum of the tokenization durations.
+std::pair<WhisperContextTokens, float> prepare_context_tokens(const WhisperGenerationConfig& config,
+                                                              Tokenizer& tokenizer) {
+    WhisperContextTokens context_tokens;
+    float duration = 0.0f;
+
+    if (config.initial_prompt.has_value()) {
+        auto [initial_prompt_tokens, initial_prompt_duration] =
+            tokenize(" " + *config.initial_prompt, config, tokenizer);
+        context_tokens.initial_prompt = std::move(initial_prompt_tokens);
+        duration += initial_prompt_duration;
+    }
+
+    if (config.hotwords.has_value()) {
+        auto [hotwords_tokens, hotwords_duration] = tokenize(" " + *config.hotwords, config, tokenizer);
+        context_tokens.hotwords = std::move(hotwords_tokens);
+        duration += hotwords_duration;
+    }
+
+    return {std::move(context_tokens), duration};
+}
+
+// Builds the prompt tokens for the chunk at `chunk_offset`: config.prev_sot_token_id, followed by the
+// initial prompt (only when chunk_offset == 0) and then the hotwords. Returns an empty vector when
+// neither the initial prompt nor the hotwords are to be added.
+std::vector<int64_t> get_prompt_tokens(const WhisperContextTokens& context_tokens,
+                                       const WhisperGenerationConfig& config,
+                                       size_t chunk_offset) {
+    const bool should_add_initial_prompt = !context_tokens.initial_prompt.empty() && chunk_offset == 0;
+    const bool should_add_hotwords = !context_tokens.hotwords.empty();
+
+    if (!should_add_initial_prompt && !should_add_hotwords) {
+        return {};
+    }
+
+    std::vector<int64_t> prompt_tokens{config.prev_sot_token_id};
+
+    if (should_add_initial_prompt) {
+        prompt_tokens.insert(prompt_tokens.end(),
+                             context_tokens.initial_prompt.begin(),
+                             context_tokens.initial_prompt.end());
+    }
+
+    if (should_add_hotwords) {
+        prompt_tokens.insert(prompt_tokens.end(), context_tokens.hotwords.begin(), context_tokens.hotwords.end());
+    }
+
+    return prompt_tokens;
+}
+
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/context_tokens.hpp b/src/cpp/src/whisper/context_tokens.hpp
new file mode 100644
index 000000000..0042ba813
--- /dev/null
+++ b/src/cpp/src/whisper/context_tokens.hpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "openvino/genai/perf_metrics.hpp"
+#include "openvino/genai/whisper_generation_config.hpp"
+
+namespace ov {
+namespace genai {
+
+// Token ids of the optional Whisper context prompts.
+struct WhisperContextTokens {
+    // Tokens produced from WhisperGenerationConfig::initial_prompt; empty when none were produced.
+    std::vector<int64_t> initial_prompt;
+    // Tokens produced from WhisperGenerationConfig::hotwords; empty when none were produced.
+    std::vector<int64_t> hotwords;
+};
+
+// Tokenizes config.initial_prompt and config.hotwords when they are set.
+// Returns the tokens together with the accumulated tokenization duration.
+std::pair<WhisperContextTokens, float> prepare_context_tokens(const WhisperGenerationConfig& config,
+                                                              Tokenizer& tokenizer);
+
+// Returns the prompt tokens for the chunk at `chunk_offset`, or an empty vector when neither the
+// initial prompt (used only for chunk_offset == 0) nor the hotwords contribute any tokens.
+std::vector<int64_t> get_prompt_tokens(const WhisperContextTokens& context_tokens,
+                                       const WhisperGenerationConfig& config,
+                                       size_t chunk_offset);
+
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp
index 355ccc619..04993f288 100644
--- a/src/cpp/src/whisper/whisper.cpp
+++ b/src/cpp/src/whisper/whisper.cpp
@@ -8,6 +8,7 @@
 #include <regex>
 #include <thread>
 
+#include "context_tokens.hpp"
 #include "logit_processor.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 #include "openvino/genai/whisper_generation_config.hpp"
@@ -17,6 +18,7 @@
 #include "whisper_config.hpp"
 #include "whisper_feature_extractor.hpp"
 #include "whisper_models.hpp"
+#include "whisper_utils.hpp"
 
 using ov::genai::MicroSeconds;
 
@@ -78,17 +80,6 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
     }
 }
 
-void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
-    const auto infer_start = std::chrono::steady_clock::now();
-    request.infer();
-    const auto infer_end = std::chrono::steady_clock::now();
-    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
-    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
-    raw_metrics.m_token_infer_durations.emplace_back(infer_ms);
-    raw_metrics.m_new_token_times.emplace_back(infer_end);
-    raw_metrics.m_batch_sizes.emplace_back(1);
-}
-
 int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
@@ -101,7 +92,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
     decoder.set_tensor("input_ids", input_ids_tensor);
 
-    infer_with_perf_metrics(decoder, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -137,7 +128,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
     cache_position_tensor.set_shape({1});
     cache_position_tensor.data<int64_t>()[0] = cache_position;
 
-    infer_with_perf_metrics(decoder_with_past, raw_metrics);
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
 
@@ -175,11 +166,11 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
     return output_token;
 }
 
-std::vector<int64_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
-                                      ov::InferRequest decoder,
-                                      const ov::genai::WhisperGenerationConfig& config,
-                                      const bool return_timestamps,
-                                      ov::genai::RawPerfMetrics& raw_metrics) {
+std::vector<int64_t> prepare_init_tokens(ov::Tensor& encoder_hidden_state,
+                                         ov::InferRequest decoder,
+                                         const ov::genai::WhisperGenerationConfig& config,
+                                         const bool return_timestamps,
+                                         ov::genai::RawPerfMetrics& raw_metrics) {
     if (!config.is_multilingual) {
         if (return_timestamps) {
             return std::vector<int64_t>{config.decoder_start_token_id};
@@ -264,25 +255,6 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
     return {false, output_tokens};
 }
 
-template <typename T>
-void filter_by_ranges(std::vector<T>& value, size_t offset, std::vector<std::pair<size_t, size_t>>& ranges) {
-    OPENVINO_ASSERT(ranges.empty() || value.size() >= (offset + ranges.back().second));
-    std::vector<T> result{value.begin(), value.begin() + offset};
-    for (auto [start, end] : ranges) {
-        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
-    }
-
-    value = result;
-}
-
-void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
-                                size_t offset,
-                                std::vector<std::pair<size_t, size_t>>& ranges) {
-    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
-    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
-    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
-}
-
 }  // namespace
 
 namespace ov {
@@ -290,6 +262,7 @@ namespace genai {
 
 WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
                                        const ov::genai::WhisperConfig& model_config,
+                                       const WhisperContextTokens& context_tokens,
                                        const RawSpeechInput& raw_speech,
                                        ov::genai::WhisperInitializedModels& models,
                                        WhisperFeatureExtractor& feature_extractor,
@@ -313,7 +286,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
     // long-form audio processing requires timestamps to be enabled
     const bool return_timestamps = config.return_timestamps || !is_shortform;
 
-    std::vector<int64_t> init_ids;
+    std::vector<int64_t> init_tokens;
     std::vector<int64_t>& output_tokens = result.output_tokens;
     std::vector<Segment> segments;
 
@@ -335,14 +308,18 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                                 raw_metrics);
 
-        // prepare init_ids just once for whole input
+        // prepare init_tokens just once for the whole input
-        if (init_ids.empty()) {
-            init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics);
+        if (init_tokens.empty()) {
+            init_tokens =
+                prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics);
         }
 
+        std::vector<int64_t> chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset);
+        chunk_init_tokens.insert(chunk_init_tokens.end(), init_tokens.begin(), init_tokens.end());
+
         auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
                                                             config,
                                                             models,
-                                                            init_ids,
+                                                            chunk_init_tokens,
                                                             max_new_tokens - output_tokens.size(),
                                                             return_timestamps,
                                                             raw_metrics,
@@ -356,7 +333,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig&
                                                                   feature_extractor.nb_max_frames,
                                                                   time_precision);
 
-            filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
 
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp
index 4904edf92..81f559db9 100644
--- a/src/cpp/src/whisper/whisper.hpp
+++ b/src/cpp/src/whisper/whisper.hpp
@@ -5,6 +5,7 @@
 
 #include <openvino/openvino.hpp>
 
+#include "context_tokens.hpp"
 #include "openvino/genai/whisper_generation_config.hpp"
 #include "openvino/genai/whisper_pipeline.hpp"
 #include "whisper_config.hpp"
@@ -28,6 +29,7 @@ struct WhisperGenerateResult {
 
 WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config,
                                        const ov::genai::WhisperConfig& model_config,
+                                       const WhisperContextTokens& context_tokens,
                                        const ov::genai::RawSpeechInput& raw_speech,
                                        ov::genai::WhisperInitializedModels& models,
                                        ov::genai::WhisperFeatureExtractor& feature_extractor,
diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp
new file mode 100644
index 000000000..6e56a1439
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.cpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "whisper_utils.hpp"
+
+#include <chrono>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Keeps the first `offset` elements of `value` untouched and, from the elements after them, retains only
+// those covered by `ranges`. Each range is a half-open [start, end) index pair counted from `offset`.
+template <typename T>
+void filter_by_ranges(std::vector<T>& value, size_t offset, const std::vector<std::pair<size_t, size_t>>& ranges) {
+    // Checked even when `ranges` is empty: the prefix copy below reads `offset` elements regardless.
+    OPENVINO_ASSERT(value.size() >= offset);
+
+    std::vector<T> result{value.begin(), value.begin() + offset};
+    for (const auto& [start, end] : ranges) {
+        // Validate every range (not just the last one) so unsorted or inverted ranges cannot read out of bounds.
+        OPENVINO_ASSERT(start <= end && value.size() >= (offset + end));
+        result.insert(result.end(), value.begin() + offset + start, value.begin() + offset + end);
+    }
+
+    // `result` is not used afterwards, so hand its storage over instead of copying it.
+    value = std::move(result);
+}
+
+}  // namespace
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+// Runs `request.infer()` and records the call in `raw_metrics`: the elapsed time is added to
+// m_inference_durations[0] and appended to m_token_infer_durations, the completion time point is appended
+// to m_new_token_times, and a batch size of 1 is appended to m_batch_sizes.
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics) {
+    // The accumulator slot is indexed directly below, so the caller must have created it beforehand.
+    OPENVINO_ASSERT(!raw_metrics.m_inference_durations.empty());
+
+    const auto infer_start = std::chrono::steady_clock::now();
+    request.infer();
+    const auto infer_end = std::chrono::steady_clock::now();
+    const auto infer_us = ov::genai::PerfMetrics::get_microsec(infer_end - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_us);
+    raw_metrics.m_token_infer_durations.emplace_back(infer_us);
+    raw_metrics.m_new_token_times.emplace_back(infer_end);
+    raw_metrics.m_batch_sizes.emplace_back(1);
+}
+
+// Drops the per-token metrics recorded after the first `offset` entries that fall outside `ranges`,
+// keeping m_token_infer_durations, m_new_token_times and m_batch_sizes aligned with each other.
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges) {
+    filter_by_ranges(raw_metrics.m_token_infer_durations, offset, ranges);
+    filter_by_ranges(raw_metrics.m_new_token_times, offset, ranges);
+    filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges);
+}
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp
new file mode 100644
index 000000000..234feed6a
--- /dev/null
+++ b/src/cpp/src/whisper/whisper_utils.hpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "openvino/genai/perf_metrics.hpp"
+
+namespace ov {
+namespace genai {
+namespace utils {
+
+// Runs `request.infer()` and records its duration, completion time and a batch size of 1 in `raw_metrics`.
+// The duration is accumulated into raw_metrics.m_inference_durations[0], so that element must already exist.
+void infer_with_perf_metrics(ov::InferRequest& request, ov::genai::RawPerfMetrics& raw_metrics);
+
+// Filters the per-token metric vectors of `raw_metrics` (m_token_infer_durations, m_new_token_times,
+// m_batch_sizes): the first `offset` entries are kept as is, and of the entries after them only those inside
+// `ranges` are kept. Each range is a half-open [start, end) index pair counted from `offset`.
+void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics,
+                                size_t offset,
+                                std::vector<std::pair<size_t, size_t>>& ranges);
+
+}  // namespace utils
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp
index 0fba4e962..beb663caa 100644
--- a/src/cpp/src/whisper_generation_config.cpp
+++ b/src/cpp/src/whisper_generation_config.cpp
@@ -8,8 +8,8 @@
 #include <nlohmann/json.hpp>
 #include <openvino/runtime/core.hpp>
 
-#include "utils.hpp"
 #include "json_utils.hpp"
+#include "utils.hpp"
 
 namespace ov {
 namespace genai {
@@ -31,6 +31,7 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js
     read_json_param(data, "pad_token_id", pad_token_id);
     read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id);
     read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index);
+    read_json_param(data, "prev_sot_token_id", prev_sot_token_id);
 
     read_json_param(data, "is_multilingual", is_multilingual);
     if (is_multilingual) {
@@ -73,6 +74,8 @@ void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_
     read_anymap_param(config_map, "lang_to_id", lang_to_id);
     read_anymap_param(config_map, "task", task);
     read_anymap_param(config_map, "return_timestamps", return_timestamps);
+    read_anymap_param(config_map, "initial_prompt", initial_prompt);
+    read_anymap_param(config_map, "hotwords", hotwords);
 }
 
 size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const {
diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp
index d472a2023..f0fb34cdf 100644
--- a/src/cpp/src/whisper_pipeline.cpp
+++ b/src/cpp/src/whisper_pipeline.cpp
@@ -9,6 +9,7 @@
 #include <variant>
 
 #include "utils.hpp"
+#include "whisper/context_tokens.hpp"
 #include "whisper/streamer.hpp"
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
@@ -91,8 +92,11 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
             streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
         }
 
+        auto [context_tokens, tokenization_duration_microseconds] = prepare_context_tokens(config, m_tokenizer);
+
         auto generate_result = ov::genai::whisper_generate(config,
                                                            m_model_config,
+                                                           context_tokens,
                                                            raw_speech_input,
                                                            m_models,
                                                            m_feature_extractor,
@@ -102,6 +106,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi
         generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
             PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
 
+        result.perf_metrics.raw_metrics.tokenization_durations.emplace_back(tokenization_duration_microseconds);
+
         result.perf_metrics = generate_result.perf_metrics;
         auto& segments = generate_result.segments;
 
diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index 136819fa0..cc61eb065 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -14,6 +14,7 @@
 #include "whisper/timestamps.hpp"
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
+#include "whisper/whisper_utils.hpp"
 
 #include "openvino/core/layout.hpp"
 #include "openvino/core/preprocess/pre_post_process.hpp"
@@ -26,6 +27,8 @@
 #include "openvino/op/convert.hpp"
 #include "openvino/op/parameter.hpp"
 
+using ov::genai::MicroSeconds;
+
 namespace {
 
 template <typename T>
@@ -44,7 +47,8 @@ void copy_to_tensor(const std::vector<T>& src_vec, ov::Tensor dst_tensor) {
 ov::Tensor encode(ov::InferRequest& request,
                   std::vector<float>& mel_data,
                   const size_t feature_size,
-                  const size_t nb_max_frames) {
+                  const size_t nb_max_frames,
+                  ov::genai::RawPerfMetrics& raw_metrics) {
     OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames,
                     "Mel spectrogram required size: ",
                     feature_size,
@@ -54,7 +58,12 @@ ov::Tensor encode(ov::InferRequest& request,
                     mel_data.size(),
                     ".");
     copy_to_tensor(mel_data, request.get_tensor("input_features"));
+
+    const auto infer_start = std::chrono::steady_clock::now();
     request.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
+
     return request.get_tensor("last_hidden_state");
 }
 
@@ -140,13 +149,14 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                const std::vector<int32_t>& init_ids,
                const ov::genai::WhisperGenerationConfig& config,
+               ov::genai::RawPerfMetrics& raw_metrics,
                const bool apply_logit_processors = true,
                const bool return_timestamps = false) {
     // NB: Fill decoder inputs
     encoder_hidden_state.copy_to(decoder.get_tensor("encoder_hidden_states"));
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
-    decoder.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -167,6 +177,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
                          const int64_t input_id,
                          const int64_t position_id,
                          const ov::genai::WhisperGenerationConfig& config,
+                         ov::genai::RawPerfMetrics& raw_metrics,
                          const bool return_timestamps,
                          const std::vector<int64_t>& generated_tokens) {
     // FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
@@ -175,7 +186,7 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
     // FIXME: Is "attention_mask" supposed to be f16?
     decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 0u;
 
-    decoder_with_past.infer();
+    ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics);
 
     auto output_tensor = decoder_with_past.get_tensor("logits");
     ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
@@ -217,13 +228,17 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq
 
 int64_t detect_language(ov::Tensor& encoder_hidden_state,
                         ov::InferRequest decoder,
-                        const ov::genai::WhisperGenerationConfig& config) {
+                        const ov::genai::WhisperGenerationConfig& config,
+                        ov::genai::RawPerfMetrics& raw_metrics) {
     decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
 
     std::vector<int32_t> init_ids{static_cast<int32_t>(config.decoder_start_token_id)};
     set_decoder_input_ids_attention_mask(decoder, init_ids, config.pad_token_id);
 
+    const auto infer_start = std::chrono::steady_clock::now();
     decoder.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+    raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms);
 
     auto output_tensor = decoder.get_tensor("logits");
 
@@ -246,7 +261,8 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
 std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
                                       ov::InferRequest& decoder,
                                       const ov::genai::WhisperGenerationConfig& config,
-                                      const bool return_timestamps) {
+                                      const bool return_timestamps,
+                                      ov::genai::RawPerfMetrics& raw_metrics) {
     if (!config.is_multilingual) {
         if (return_timestamps) {
             return std::vector<int32_t>{static_cast<int32_t>(config.decoder_start_token_id)};
@@ -263,7 +279,7 @@ std::vector<int32_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
             language_token_id = static_cast<int32_t>(config.lang_to_id.at(language));
         }
     } else {
-        language_token_id = detect_language(encoder_hidden_state, decoder, config);
+        language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics);
     }
 
     int32_t task_token_id = static_cast<int32_t>(config.transcribe_token_id);
@@ -289,8 +305,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                                   std::vector<int32_t> init_ids,
                                                   const size_t max_new_tokens,
                                                   const bool return_timestamps,
+                                                  ov::genai::RawPerfMetrics& raw_metrics,
                                                   const std::shared_ptr<ov::genai::ChunkStreamerBase> streamer) {
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps);
     std::vector<int64_t> output_tokens{output_token};
 
     if (!return_timestamps && streamer && streamer->put(output_token)) {
@@ -308,6 +325,7 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
                                              output_tokens.back(),
                                              i + init_ids.size(),
                                              config,
+                                             raw_metrics,
                                              return_timestamps,
                                              output_tokens);
         update_past_key_value(models.decoder_with_past, models.decoder_with_past, i + init_ids.size());
@@ -576,9 +594,13 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
     const RawSpeechInput& raw_speech_input,
     OptionalWhisperGenerationConfig generation_config,
     ChunkStreamerVariant streamer) {
+    auto start_time = std::chrono::steady_clock::now();
     WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
     config.validate();
 
+    OPENVINO_ASSERT(!config.initial_prompt.has_value(), "'initial_prompt' parameter is not supported on NPU device.");
+    OPENVINO_ASSERT(!config.hotwords.has_value(), "'hotwords' parameter is not supported on NPU device.");
+
     std::shared_ptr<ChunkStreamerBase> streamer_ptr;
     if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) {
         streamer_ptr = nullptr;
@@ -588,14 +610,25 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr = std::make_shared<ChunkTextCallbackStreamer>(m_tokenizer, *callback);
     }
 
+    size_t max_new_tokens = config.get_max_new_tokens();
+
+    WhisperPerfMetrics perf_metrics;
+    perf_metrics.num_input_tokens = 0;
+    RawPerfMetrics& raw_metrics = perf_metrics.raw_metrics;
+    raw_metrics.m_new_token_times.reserve(max_new_tokens);
+    raw_metrics.m_batch_sizes.reserve(max_new_tokens);
+    raw_metrics.m_token_infer_durations.reserve(max_new_tokens);
+    raw_metrics.m_inference_durations = {{MicroSeconds(0.0f)}};
+
+    const auto extract_start = std::chrono::steady_clock::now();
     auto input_features = m_feature_extractor.extract(raw_speech_input);
+    const auto extract_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - extract_start);
+    perf_metrics.whisper_raw_metrics.features_extraction_durations.emplace_back(extract_ms);
 
     const bool is_shortform = input_features.n_frames <= m_feature_extractor.nb_max_frames;
     // long-form audio processing requires timestamps to be enabled
     const bool return_timestamps = config.return_timestamps || !is_shortform;
 
-    size_t max_new_tokens = config.get_max_new_tokens();
-
     std::vector<int32_t> init_ids;
     std::vector<int64_t> output_tokens;
     std::vector<Segment> segments;
@@ -616,11 +649,12 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         ov::Tensor hidden_state_tensor = encode(m_models.encoder,
                                                 input_features_chunk,
                                                 m_feature_extractor.feature_size,
-                                                m_feature_extractor.nb_max_frames);
+                                                m_feature_extractor.nb_max_frames,
+                                                raw_metrics);
 
         // prepare init_ids just once for whole input
         if (init_ids.empty()) {
-            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps);
+            init_ids = prepare_init_ids(hidden_state_tensor, m_models.decoder, config, return_timestamps, raw_metrics);
         }
 
         auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
@@ -629,6 +663,7 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
                                                             init_ids,
                                                             max_new_tokens - output_tokens.size(),
                                                             return_timestamps,
+                                                            raw_metrics,
                                                             streamer_ptr);
 
         if (return_timestamps) {
@@ -637,6 +672,8 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
                                                                   m_feature_extractor.nb_max_frames,
                                                                   time_precision);
 
+            ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges);
+
             segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
 
             output_tokens.insert(output_tokens.end(),
@@ -666,7 +703,11 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         streamer_ptr->end();
     }
 
+    auto decode_start_time = std::chrono::steady_clock::now();
     WhisperDecodedResults result{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}};
+    result.perf_metrics = perf_metrics;
+    result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+            PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
 
     // if return_timestamps wasn't enabled by user
     if (!config.return_timestamps) {
@@ -678,13 +719,23 @@ WhisperDecodedResults WhisperPipeline::StaticWhisperPipeline::generate(
         chunks.reserve(segments.size());
 
         for (auto& segment : segments) {
+            decode_start_time = std::chrono::steady_clock::now();
             chunks.push_back(
                 WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)});
+            result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(
+                    PerfMetrics::get_microsec(std::chrono::steady_clock::now() - decode_start_time));
         }
 
         result.chunks = chunks;
     }
 
+    auto& metrics = result.perf_metrics;
+    metrics.load_time = this->m_load_time_ms;
+    auto stop_time = std::chrono::steady_clock::now();
+    metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time));
+    metrics.raw_metrics.tokenization_durations.emplace_back(MicroSeconds(0.0f));
+    metrics.evaluate_statistics(start_time);
+
     return result;
 }
 
diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index 8c922ee64..976287459 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -217,6 +217,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
         <ul>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9"><code>stabilityai/stable-diffusion-xl-base-0.9</code></a></li>
           <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><code>stabilityai/stable-diffusion-xl-base-1.0</code></a></li>
+          <li><a href="https://huggingface.co/stabilityai/sdxl-turbo"><code>stabilityai/sdxl-turbo</code></a></li>
         </ul>
       </td>
     </tr>
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 524ff0f92..8510a8389 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -361,10 +361,10 @@ class ContinuousBatchingPipeline:
-    This class is used for generation with LLMs with continuous batchig
+    This class is used for generation with LLMs with continuous batching
     """
     @typing.overload
-    def __init__(self, models_path: str, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: str, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle:
@@ -522,17 +522,17 @@ class FluxTransformer2DModel:
 class GenerationConfig:
     """
     
-        Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-        and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+        Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+        and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
         be used while greedy and beam search parameters will not affect decoding at all.
     
-        Parameters: 
+        Parameters:
         max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                        max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
         max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+        min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
         ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
         eos_token_id:  token_id of <eos> (end of sentence)
-        min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
         stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
         include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
         stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -540,6 +540,10 @@ class GenerationConfig:
         logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                         Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
     
+        repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+        presence_penalty: reduces absolute log prob if the token was generated at least once.
+        frequency_penalty: reduces absolute log prob as many times as the token was generated.
+    
         Beam search specific parameters:
         num_beams:         number of beams for beam search. 1 disables beam search.
         num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -550,8 +554,8 @@ class GenerationConfig:
             length_penalty < 0.0 encourages shorter sequences.
         num_return_sequences: the number of sequences to return for grouped beam search decoding.
         no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-        stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-            "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+        stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+            "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
-            "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
+            "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when it is very unlikely to find better candidates;
             "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
     
@@ -560,7 +564,7 @@ class GenerationConfig:
         top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
         top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
         do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-        repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+        num_return_sequences: the number of sequences to generate from a single prompt.
     """
     adapters: AdapterConfig | None
     assistant_confidence_threshold: float
@@ -951,17 +955,17 @@ class LLMPipeline:
             :rtype: DecodedResults, EncodedResults, str
          
          
-            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
             be used while greedy and beam search parameters will not affect decoding at all.
         
-            Parameters: 
+            Parameters:
             max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                            max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
             max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
             ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
             eos_token_id:  token_id of <eos> (end of sentence)
-            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
             include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
             stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -969,6 +973,10 @@ class LLMPipeline:
             logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                             Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
         
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+            presence_penalty: reduces absolute log prob if the token was generated at least once.
+            frequency_penalty: reduces absolute log prob as many times as the token was generated.
+        
             Beam search specific parameters:
             num_beams:         number of beams for beam search. 1 disables beam search.
             num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -979,8 +987,8 @@ class LLMPipeline:
                 length_penalty < 0.0 encourages shorter sequences.
             num_return_sequences: the number of sequences to return for grouped beam search decoding.
             no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
                 "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
                 "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
         
@@ -989,7 +997,7 @@ class LLMPipeline:
             top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
             top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
             do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+            num_return_sequences: the number of sequences to generate from a single prompt.
         """
     @typing.overload
     def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None:
@@ -1032,17 +1040,17 @@ class LLMPipeline:
             :rtype: DecodedResults, EncodedResults, str
          
          
-            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+            Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+            and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
             be used while greedy and beam search parameters will not affect decoding at all.
         
-            Parameters: 
+            Parameters:
             max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                            max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
             max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+            min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
             ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
             eos_token_id:  token_id of <eos> (end of sentence)
-            min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
             stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
             include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
             stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -1050,6 +1058,10 @@ class LLMPipeline:
             logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                             Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
         
+            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+            presence_penalty: reduces absolute log prob if the token was generated at least once.
+            frequency_penalty: reduces absolute log prob as many times as the token was generated.
+        
             Beam search specific parameters:
             num_beams:         number of beams for beam search. 1 disables beam search.
             num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -1060,8 +1072,8 @@ class LLMPipeline:
                 length_penalty < 0.0 encourages shorter sequences.
             num_return_sequences: the number of sequences to return for grouped beam search decoding.
             no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+            stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+                "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
                 "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
                 "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
         
@@ -1070,7 +1082,7 @@ class LLMPipeline:
             top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
             top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
             do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-            repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+            num_return_sequences: the number of sequences to generate from a single prompt.
         """
     def get_generation_config(self) -> GenerationConfig:
         ...
@@ -1343,15 +1355,18 @@ class Scheduler:
           FLOW_MATCH_EULER_DISCRETE
         
           PNDM
+        
+          EULER_ANCESTRAL_DISCRETE
         """
         AUTO: typing.ClassVar[Scheduler.Type]  # value = <Type.AUTO: 0>
         DDIM: typing.ClassVar[Scheduler.Type]  # value = <Type.DDIM: 3>
+        EULER_ANCESTRAL_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.EULER_ANCESTRAL_DISCRETE: 7>
         EULER_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.EULER_DISCRETE: 4>
         FLOW_MATCH_EULER_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.FLOW_MATCH_EULER_DISCRETE: 5>
         LCM: typing.ClassVar[Scheduler.Type]  # value = <Type.LCM: 1>
         LMS_DISCRETE: typing.ClassVar[Scheduler.Type]  # value = <Type.LMS_DISCRETE: 2>
         PNDM: typing.ClassVar[Scheduler.Type]  # value = <Type.PNDM: 6>
-        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>, 'PNDM': <Type.PNDM: 6>}
+        __members__: typing.ClassVar[dict[str, Scheduler.Type]]  # value = {'AUTO': <Type.AUTO: 0>, 'LCM': <Type.LCM: 1>, 'LMS_DISCRETE': <Type.LMS_DISCRETE: 2>, 'DDIM': <Type.DDIM: 3>, 'EULER_DISCRETE': <Type.EULER_DISCRETE: 4>, 'FLOW_MATCH_EULER_DISCRETE': <Type.FLOW_MATCH_EULER_DISCRETE: 5>, 'PNDM': <Type.PNDM: 6>, 'EULER_ANCESTRAL_DISCRETE': <Type.EULER_ANCESTRAL_DISCRETE: 7>}
         def __eq__(self, other: typing.Any) -> bool:
             ...
         def __getstate__(self) -> int:
@@ -1417,7 +1432,7 @@ class StopCriteria:
     """
     
         StopCriteria controls the stopping condition for grouped beam search.
-        
+    
         The following values are possible:
             "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates.
             "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates.
@@ -1945,6 +1960,9 @@ class WhisperGenerationConfig:
         :param no_timestamps_token_id: No timestamps token id.
         :type no_timestamps_token_id: int
     
+        :param prev_sot_token_id: Corresponds to the `<|startofprev|>` token.
+        :type prev_sot_token_id: int
+    
         :param is_multilingual:
         :type is_multilingual: bool
     
@@ -1973,10 +1991,34 @@ class WhisperGenerationConfig:
                            then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                            Note that a segment of text refers to a sequence of one or more words, rather than individual words.
         :type return_timestamps: bool
+    
+        :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+        window. Can be used to steer the model to use particular spellings or styles.
+    
+        Example:
+          auto result = pipeline.generate(raw_speech);
+          //  He has gone and gone for good answered Paul Icrom who...
+    
+          auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+          //  He has gone and gone for good answered Polychrome who...
+        :type initial_prompt: Optional[str]
+    
+        :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows.
+        Can be used to steer the model to use particular spellings or styles.
+    
+        Example:
+          auto result = pipeline.generate(raw_speech);
+          //  He has gone and gone for good answered Paul Icrom who...
+    
+          auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+          //  He has gone and gone for good answered Polychrome who...
+        :type hotwords: Optional[str]
     """
     begin_suppress_tokens: list[int]
     decoder_start_token_id: int
     eos_token_id: int
+    hotwords: str | None
+    initial_prompt: str | None
     is_multilingual: bool
     lang_to_id: dict[str, int]
     language: str | None
@@ -1985,6 +2027,7 @@ class WhisperGenerationConfig:
     max_new_tokens: int
     no_timestamps_token_id: int
     pad_token_id: int
+    prev_sot_token_id: int
     return_timestamps: bool
     suppress_tokens: list[int]
     task: str | None
@@ -2077,6 +2120,9 @@ class WhisperPipeline:
             :param no_timestamps_token_id: No timestamps token id.
             :type no_timestamps_token_id: int
         
+            :param prev_sot_token_id: Corresponds to the `<|startofprev|>` token.
+            :type prev_sot_token_id: int
+        
             :param is_multilingual:
             :type is_multilingual: bool
         
@@ -2105,6 +2151,28 @@ class WhisperPipeline:
                                then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                                Note that a segment of text refers to a sequence of one or more words, rather than individual words.
             :type return_timestamps: bool
+        
+            :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+            window. Can be used to steer the model to use particular spellings or styles.
+        
+            Example:
+              auto result = pipeline.generate(raw_speech);
+              //  He has gone and gone for good answered Paul Icrom who...
+        
+              auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+              //  He has gone and gone for good answered Polychrome who...
+            :type initial_prompt: Optional[str]
+        
+            :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows.
+            Can be used to steer the model to use particular spellings or styles.
+        
+            Example:
+              auto result = pipeline.generate(raw_speech);
+              //  He has gone and gone for good answered Paul Icrom who...
+        
+              auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+              //  He has gone and gone for good answered Polychrome who...
+            :type hotwords: Optional[str]
         """
     def get_generation_config(self) -> WhisperGenerationConfig:
         ...
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index 772ba0af8..be7a72481 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -212,7 +212,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
             .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage);
 
     py::class_<ContinuousBatchingPipeline>(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig")
-        .def(py::init([](const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ContinuousBatchingPipeline>(models_path, scheduler_config, device, pyutils::properties_to_any_map(llm_plugin_config), pyutils::properties_to_any_map(tokenizer_plugin_config));
         }),
@@ -222,7 +222,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))
 
-        .def(py::init([](const std::string& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
             return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
         }),
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index b1a5c6cd2..f49bcf29b 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -20,7 +20,7 @@ namespace {
 
 auto stop_criteria_docstring =  R"(
     StopCriteria controls the stopping condition for grouped beam search.
-    
+
     The following values are possible:
         "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates.
         "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates.
@@ -30,17 +30,17 @@ auto stop_criteria_docstring =  R"(
 } // namespace
 
 char generation_config_docstring[] = R"(
-    Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group 
-    and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will 
+    Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+    and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
     be used while greedy and beam search parameters will not affect decoding at all.
 
-    Parameters: 
+    Parameters:
     max_length:    the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
                    max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set.
     max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+    min_new_tokens: set 0 probability for eos_token_id for the first min_new_tokens generated tokens.
     ignore_eos:    if set to true, then generation will not stop even if <eos> token is met.
     eos_token_id:  token_id of <eos> (end of sentence)
-    min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens.
     stop_strings: a set of strings that will cause pipeline to stop generating further tokens.
     include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false)
     stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens.
@@ -48,6 +48,10 @@ char generation_config_docstring[] = R"(
     logprobs:       number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned.
                     Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0).
 
+    repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
+    presence_penalty: reduces absolute log prob if the token was generated at least once.
+    frequency_penalty: reduces absolute log prob as many times as the token was generated.
+
     Beam search specific parameters:
     num_beams:         number of beams for beam search. 1 disables beam search.
     num_beam_groups:   number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
@@ -58,8 +62,8 @@ char generation_config_docstring[] = R"(
         length_penalty < 0.0 encourages shorter sequences.
     num_return_sequences: the number of sequences to return for grouped beam search decoding.
     no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once.
-    stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values: 
-        "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; 
+    stop_criteria:        controls the stopping condition for grouped beam search. It accepts the following values:
+        "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates;
         "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates;
         "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
 
@@ -68,7 +72,7 @@ char generation_config_docstring[] = R"(
     top_p:              if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
     top_k:              the number of highest probability vocabulary tokens to keep for top-k-filtering.
     do_sample:          whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
-    repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.    
+    num_return_sequences: the number of sequences to generate from a single prompt.
 )";
 
 void init_generation_config(py::module_& m) {
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index f5347c279..311f3f376 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -198,7 +198,8 @@ void init_image_generation_pipelines(py::module_& m) {
         .value("DDIM", ov::genai::Scheduler::Type::DDIM)
         .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE)
         .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE)
-        .value("PNDM", ov::genai::Scheduler::Type::PNDM);
+        .value("PNDM", ov::genai::Scheduler::Type::PNDM)
+        .value("EULER_ANCESTRAL_DISCRETE", ov::genai::Scheduler::Type::EULER_ANCESTRAL_DISCRETE);
     image_generation_scheduler.def_static("from_config",
         &ov::genai::Scheduler::from_config,
         py::arg("scheduler_config_path"),
diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp
index 49152c03f..cd42dcf58 100644
--- a/src/python/py_whisper_pipeline.cpp
+++ b/src/python/py_whisper_pipeline.cpp
@@ -103,6 +103,9 @@ auto whisper_generation_config_docstring = R"(
     :param no_timestamps_token_id: No timestamps token id.
     :type no_timestamps_token_id: int
 
+    :param prev_sot_token_id: Corresponds to the `<|startofprev|>` token.
+    :type prev_sot_token_id: int
+
     :param is_multilingual:
     :type is_multilingual: bool
 
@@ -131,6 +134,28 @@ auto whisper_generation_config_docstring = R"(
                        then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                        Note that a segment of text refers to a sequence of one or more words, rather than individual words.
     :type return_timestamps: bool
+
+    :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing
+    window. Can be used to steer the model to use particular spellings or styles.
+
+    Example:
+      auto result = pipeline.generate(raw_speech);
+      //  He has gone and gone for good answered Paul Icrom who...
+
+      auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome"));
+      //  He has gone and gone for good answered Polychrome who...
+    :type initial_prompt: Optional[str]
+
+    :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to all processing windows.
+    Can be used to steer the model to use particular spellings or styles.
+
+    Example:
+      auto result = pipeline.generate(raw_speech);
+      //  He has gone and gone for good answered Paul Icrom who...
+
+      auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome"));
+      //  He has gone and gone for good answered Polychrome who...
+    :type hotwords: Optional[str]
 )";
 
 auto streamer_base_docstring = R"(
@@ -262,11 +287,14 @@ void init_whisper_pipeline(py::module_& m) {
         .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id)
         .def_readwrite("max_initial_timestamp_index", &WhisperGenerationConfig::max_initial_timestamp_index)
         .def_readwrite("no_timestamps_token_id", &WhisperGenerationConfig::no_timestamps_token_id)
+        .def_readwrite("prev_sot_token_id", &WhisperGenerationConfig::prev_sot_token_id)
         .def_readwrite("is_multilingual", &WhisperGenerationConfig::is_multilingual)
         .def_readwrite("language", &WhisperGenerationConfig::language)
         .def_readwrite("lang_to_id", &WhisperGenerationConfig::lang_to_id)
         .def_readwrite("task", &WhisperGenerationConfig::task)
         .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps)
+        .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt)
+        .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords)
         .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"));
 
     py::class_<WhisperRawPerfMetrics>(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring)
diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index b2a5396d5..7f0798038 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -7,8 +7,43 @@
 #include "scheduler.hpp"
 #include "device_config.hpp"
 #include "cache_manager.hpp"
+#include "openvino/op/concat.hpp"
 
-TEST(TestCacheManager, general_test) {
+using namespace ov::genai;
+
+// Builds a stub model with f16 `key_cache.<i>` / `value_cache.<i>` parameters (fully dynamic 4-D shape) per layer.
+std::shared_ptr<ov::Model> get_dummy_model(size_t num_layers) {
+    ov::NodeVector keys, values;
+    ov::ParameterVector params;
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    // Concatenate all keys and all values along axis 1 so that every parameter feeds one of the two results.
+    auto concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    auto concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
+
+// Returns the summed byte size of the key and value cache tensors over the first `num_decoder_layers` layers.
+size_t get_total_allocated_bytes(std::shared_ptr<ov::genai::CacheManager> cache_manager, size_t num_decoder_layers) {
+    size_t total_bytes = 0;
+    for (size_t layer = 0; layer < num_decoder_layers; ++layer) {
+        total_bytes += cache_manager->get_key_cache(layer).get_byte_size();
+        total_bytes += cache_manager->get_value_cache(layer).get_byte_size();
+    }
+    return total_bytes;
+}
+
+
+TEST(TestCacheManager, test_cache_size_param) {
     ov::Core core;
     ov::genai::SchedulerConfig scheduler_config;
     scheduler_config.max_num_batched_tokens = 32;
@@ -21,14 +56,73 @@ TEST(TestCacheManager, general_test) {
     size_t num_decoder_layers = 12;
     device_config.set_model_params(12, 64, num_decoder_layers);
 
-    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, core);
-
-    size_t allocated_bytes = 0;
-    for (size_t i = 0; i < num_decoder_layers; i++) {
-        auto key_cache = cache_manager->get_key_cache(i);
-        auto value_cache = cache_manager->get_value_cache(i);
-        allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size();
-    }
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
     
-    ASSERT_EQ(allocated_bytes, 2146959360);
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360);
 }
+
+
+// Checks that the block manager reports exactly the `num_kv_blocks` value set in the scheduler config.
+TEST(TestCacheManager, test_kv_blocks_param) {
+    ov::Core core;
+    ov::genai::SchedulerConfig scheduler_config;
+    scheduler_config.max_num_batched_tokens = 32;
+    scheduler_config.num_kv_blocks = 150;
+    scheduler_config.cache_size = 0;
+    scheduler_config.max_num_seqs = 2;
+
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    size_t num_decoder_layers = 12;
+    device_config.set_model_params(12, 64, num_decoder_layers);
+
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+    ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks);  // compare values, not just non-zero
+}
+
+
+// Checks that the cache grows with the block manager's block count and is not reallocated when that count is unchanged.
+TEST(TestCacheManager, test_dynamic_cache_increase) {
+    ov::Core core;
+    ov::genai::SchedulerConfig scheduler_config;
+    scheduler_config.max_num_batched_tokens = 32;
+    scheduler_config.num_kv_blocks = 0;
+    scheduler_config.cache_size = 0;
+    scheduler_config.max_num_seqs = 2;
+
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    size_t num_decoder_layers = 12;
+    size_t head_size = 64;
+    size_t num_kv_heads = 12;
+    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+    size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+
+
+    ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request();
+    auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
+    auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
+
+    // check initial cache allocation
+    block_manager.increase_kv_blocks_number(100);
+    ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), 100);
+
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes);
+
+
+    // check cache increase
+    block_manager.increase_kv_blocks_number(200);
+    ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), 200);
+
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes);
+
+
+    // check that cache does not increase if new blocks were not allocated
+    cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes);
+}
\ No newline at end of file
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index 40c3e7374..ea1720faa 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -4,6 +4,7 @@
 
 #include <gtest/gtest.h>
 #include "openvino/runtime/core.hpp"
+#include "openvino/op/concat.hpp"
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "sequence_group.hpp"
@@ -17,6 +18,37 @@ void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) {
     });
     requests.erase(new_end, requests.end());
 }
+// Builds a dummy model whose inputs are f16 "key_cache.<i>" / "value_cache.<i>" parameters for each of num_layers layers.
+std::shared_ptr<ov::Model> get_model(size_t num_layers) {
+    ov::NodeVector keys;
+    ov::NodeVector values;
+    ov::ParameterVector params;
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    auto concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    auto concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
+
+std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_config) {
+    // Test helper: wires a CacheManager to an infer request of the dummy get_model() graph and a "CPU" DeviceConfig.
+    const size_t num_decoder_layers = 12, num_kv_heads = 12;
+    const size_t head_size = 64, head_size_u8 = head_size + 8;
+    ov::Core core;
+    ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request();
+    ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
+    device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    return std::make_shared<CacheManager>(device_config, request, core);
+}
 
 TEST(TestScheduler, general_test) {
     std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()};
@@ -40,10 +72,9 @@ TEST(TestScheduler, general_test) {
                                                                                 ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
         auto idx2 = (*sequence_group3)[0]->get_id();
         std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2, sequence_group3};
-                                                                        
         
         // schedule 3 sequence groups that use 6 kv blocks 
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
 
         std::vector<uint64_t> ref_ids = {0, 1, 2};
@@ -144,7 +175,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) {
     auto idx1 = (*sequence_group2)[0]->get_id();
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
-    Scheduler scheduler = Scheduler(4, scheduler_config);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
     auto out1 = scheduler.schedule(requests);
 
     std::vector<uint64_t> ref_ids = {0, 1};
@@ -212,7 +243,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) {
 
 
     // schedule 2 sequence groups that use 5 kv blocks
-    Scheduler scheduler = Scheduler(4, scheduler_config);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
     auto out0 = scheduler.schedule(requests);
 
     for (auto seq: requests) {
@@ -297,7 +328,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) {
         sequence_group->set_sequence_group_ptr(sequence_group);
         std::vector<SequenceGroup::Ptr> requests = {sequence_group};
 
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out = scheduler.schedule(requests);
         for (auto sequence: sequence_group->get_not_finished_sequences()) {
             sequence->append_token(token, 0.7);
@@ -405,11 +436,10 @@ TEST(TestScheduler, test_partially_preempted_prompt) {
         SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()),
                                                                                 ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching);
         auto idx1 = (*sequence_group2)[0]->get_id();
-        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
-                                                                        
+        std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};                                                
         
         // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
         auto out1 = scheduler.schedule(requests);
 
         for (auto seq: requests) {
@@ -503,7 +533,7 @@ TEST(TestScheduler, prefix_caching_test) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         std::vector<uint64_t> histrory_tokens = {};
         // schedule prompt
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 10;
 
@@ -566,7 +596,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         std::vector<uint64_t> histrory_tokens = {};
         // schedule prompt
-        Scheduler scheduler = Scheduler(4, scheduler_config);
+        Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 10;
 
@@ -640,7 +670,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) {
     for (auto scheduler_config: configs) {
         std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7};
         // schedule prompt
-        Scheduler scheduler = Scheduler(32, scheduler_config);
+        Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config);
 
         size_t chat_iterations = 2;
 
@@ -701,7 +731,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) {
 
     // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
     const bool can_use_partial_preemption = false;
-    Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption);
     auto out1 = scheduler.schedule(requests);
 
     for (auto req : requests)
@@ -775,7 +805,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) {
 
     // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks.
     const bool can_use_partial_preemption = false;
-    Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption);
+    Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption);
     scheduler.schedule(requests);
     for (auto req: requests)
         req->finish_iteration();
@@ -874,7 +904,6 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) {
     scheduler_config.use_cache_eviction = true;
     scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(2, 2, 6, ov::genai::AggregationMode::NORM_SUM);
 
-
     std::vector<uint64_t> tokens1 = {0, 1};  // 1 full block
     SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0,
                                                                          ov::Tensor(ov::element::i64, {tokens1.size()},
@@ -890,7 +919,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) {
     std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2};
 
 
-    Scheduler scheduler = Scheduler(2, scheduler_config);
+    Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config);
     // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2
     auto out = scheduler.schedule(requests);
 
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 50ee452f5..7e3c07540 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -42,13 +42,6 @@ def get_greedy_with_penalties() -> GenerationConfig:
     generation_config.max_new_tokens = 30
     return generation_config
 
-def get_greedy_with_min_and_max_tokens() -> GenerationConfig:
-    generation_config = GenerationConfig()
-    generation_config.num_return_sequences = 1
-    generation_config.min_new_tokens = 15
-    generation_config.max_new_tokens = 30
-    return generation_config
-
 def get_greedy_with_single_stop_string() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_return_sequences = 1
@@ -125,6 +118,34 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig:
     generation_config.include_stop_str_in_output = True
     return generation_config
 
+def get_greedy_stop_strings_exclude_from_output() -> GenerationConfig:  # 30 new tokens max, one stop string, include_stop_str_in_output off
+    config = GenerationConfig()
+    config.include_stop_str_in_output = False
+    config.stop_strings = {"machines"}
+    config.max_new_tokens = 30
+    return config
+
+def get_greedy_stop_strings_include_to_output() -> GenerationConfig:  # 30 new tokens max, one stop string, include_stop_str_in_output on
+    config = GenerationConfig()
+    config.include_stop_str_in_output = True
+    config.stop_strings = {"machines"}
+    config.max_new_tokens = 30
+    return config
+
+def get_greedy_n_stop_strings_exclude_from_output() -> GenerationConfig:  # 30 new tokens max, two stop strings, include_stop_str_in_output off
+    config = GenerationConfig()
+    config.include_stop_str_in_output = False
+    config.stop_strings = {"machines", "manage"}
+    config.max_new_tokens = 30
+    return config
+
+def get_greedy_n_stop_strings_include_to_output() -> GenerationConfig:  # 30 new tokens max, two stop strings, include_stop_str_in_output on
+    config = GenerationConfig()
+    config.include_stop_str_in_output = True
+    config.stop_strings = {"machines", "manage"}
+    config.max_new_tokens = 30
+    return config
+
 def get_multinomial_temperature() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.do_sample = True
@@ -238,7 +259,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
 
 def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
     scheduler_config = SchedulerConfig()
-    scheduler_config.cache_size = 1
     if scheduler_params is None:
         scheduler_config.dynamic_split_fuse = True
         # vLLM specific
@@ -269,10 +289,12 @@ def convert_to_hf(
     kwargs['max_length'] = generation_config.max_length
     # has higher priority than 'max_length'
     kwargs['max_new_tokens'] = generation_config.max_new_tokens
+    kwargs['min_new_tokens'] = generation_config.min_new_tokens
     if generation_config.stop_strings:
         kwargs['stop_strings'] = generation_config.stop_strings
 
     # copy default parameters
+    kwargs['bos_token_id'] = default_generation_config.bos_token_id
     kwargs['eos_token_id'] = default_generation_config.eos_token_id
     kwargs['pad_token_id'] = default_generation_config.pad_token_id
     kwargs['repetition_penalty'] = generation_config.repetition_penalty
@@ -281,11 +303,12 @@ def convert_to_hf(
         # beam search case
         kwargs['num_beam_groups'] = generation_config.num_beam_groups
         kwargs['num_beams'] = generation_config.num_beams
-        kwargs['diversity_penalty'] = generation_config.diversity_penalty
         kwargs['length_penalty'] = generation_config.length_penalty
         kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size
         kwargs['num_return_sequences'] = generation_config.num_return_sequences
         kwargs['output_scores'] = True
+        if generation_config.num_beam_groups > 1:
+            kwargs['diversity_penalty'] = generation_config.diversity_penalty
     elif generation_config.do_sample:
         # mulitinomial
         kwargs['temperature'] = generation_config.temperature
@@ -301,7 +324,7 @@ def convert_to_hf(
 
 
 def run_hugging_face(
-    model,
+    opt_model,
     hf_tokenizer,
     prompts: List[str],
     generation_configs: List[GenerationConfig],
@@ -310,8 +333,9 @@ def run_hugging_face(
     for prompt, generation_config in zip(prompts, generation_configs):
         inputs = hf_tokenizer(prompt, return_tensors="pt")
         prompt_len = inputs['input_ids'].numel()
-        generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config),
-                                        return_dict_in_generate=True, tokenizer=hf_tokenizer)
+        generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
+                                              generation_config=convert_to_hf(opt_model.generation_config, generation_config),
+                                              return_dict_in_generate=True, tokenizer=hf_tokenizer)
         all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
         generation_result = GenerationResult()
@@ -322,7 +346,7 @@ def run_hugging_face(
         generation_results.append(generation_result)
 
     del hf_tokenizer
-    del model
+    del opt_model
 
     return generation_results
 
@@ -333,14 +357,14 @@ def run_continuous_batching(
     prompts: List[str],
     generation_configs : List[GenerationConfig]
 ) -> List[GenerationResult]:
-    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}, {})
+    pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
     output = pipe.generate(prompts, generation_configs)
     del pipe
     shutil.rmtree(models_path)
     return output
 
 
-def get_models_list(file_name: str):
+def read_models_list(file_name: str):
     models = []
     with open(file_name) as f:
         for model_name in f:
@@ -359,9 +383,22 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge
             # Note, that for fp32 / fp16 models scores are different less than 0.001
             assert abs(hf_score - ov_score) < 0.02
 
-    assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids)
-    for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
-        assert hf_text == ov_text
+    if not generation_config.include_stop_str_in_output and len(generation_config.stop_strings) > 0:
+        assert len(hf_result.m_generation_ids) >= len(ov_result.m_generation_ids)
+        for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
+            assert ov_text in hf_text
+    else:
+        assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids)
+        for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
+            assert hf_text == ov_text
+
+
+def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True):
+    # Returns (model, tokenizer): OVModelForCausalLM loaded with export=True when use_optimum, else AutoModelForCausalLM.
+    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    loader, load_kwargs = (OVModelForCausalLM, dict(export=True, trust_remote_code=True)) if use_optimum else (AutoModelForCausalLM, {})
+    return loader.from_pretrained(model_id, **load_kwargs), hf_tokenizer
+
 
 def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
     model.save_pretrained(models_path)
@@ -372,23 +409,6 @@ def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path):
     serialize(tokenizer, models_path / "openvino_tokenizer.xml")
     serialize(detokenizer, models_path / "openvino_detokenizer.xml")
 
-def get_model_and_tokenizer(model_id: str, use_optimum = True):
-    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \
-            AutoModelForCausalLM.from_pretrained(model_id)
-    return model, hf_tokenizer
-
-def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path):
-    use_optimum = True
-    models_path : Path = tmp_path / model_id
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum)
-
-    if use_optimum:
-        save_ov_model_from_optimum(model, hf_tokenizer, models_path)
-
-    hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs)
-    _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config)
-
 
 def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
     ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
@@ -401,19 +421,32 @@ def _generate_and_compare_with_reference_results(models_path: Path, prompts: Lis
         compare_results(ref_result, ov_result, generation_config)
 
 
+def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path):
+    # Produces Hugging Face reference results for the prompts and checks the continuous batching output against them.
+    use_optimum = True
+    opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum)
+    models_path : Path = tmp_path / model_id
+    if use_optimum:
+        save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path)
+
+    hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_configs)
+    _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config)
+
+
 def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig):
     ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs)
 
     assert len(prompts) == len(reference_texts_per_prompt)
     assert len(prompts) == len(ov_results)
 
-    for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs):
+    for prompt, ref_texts_for_this_prompt, ov_result in zip(prompts, reference_texts_per_prompt, ov_results):
         print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}")
 
         assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids)
         for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids):
             assert ref_text == ov_text
 
+
 def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None):
     prompts, generation_configs = get_test_dataset()
     scheduler_config = get_scheduler_config(scheduler_params)
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index f98f47ecf..e15904560 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -3,7 +3,8 @@
 
 def pytest_make_parametrize_id(config, val, argname):
     if argname in ['prompt', 'prompts', 'batched_prompts']:
-        return f'{val}'
+        # Print only first 1000 characters of long prompts.
+        return f'{val[:1000]}'
     elif argname == 'model_descr':
         return f"{val[0]}"
     elif argname == 'chat_config':
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index b633497d3..87b2147bc 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -57,33 +57,6 @@ def get_models_list():
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
 
 
-def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
-    precommit_models = [
-        "openai/whisper-tiny",
-        "openai/whisper-tiny.en",
-        "distil-whisper/distil-small.en",
-    ]
-    if multilingual:
-        precommit_models = ["openai/whisper-tiny"]
-    if en_only:
-        precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
-    if tiny_only:
-        precommit_models = ["openai/whisper-tiny"]
-
-    nightly_models = []
-
-    if pytest.run_marker == "precommit":
-        model_ids = precommit_models
-    else:
-        model_ids = nightly_models
-
-    if pytest.selected_model_ids:
-        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
-
-    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
-    return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
-
-
 def get_chat_models_list():
     precommit_models = [
         "Qwen/Qwen2-0.5B-Instruct",
@@ -101,90 +74,31 @@ def get_chat_models_list():
         model_ids = precommit_models
     else:
         model_ids = nightly_models
-    
+
     prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
 
 
-def get_chat_templates():
-    # Returns chat templates saved in tokenizer_configs.py, 
-    # but skips some models that currently are not processed correctly.
-
-    skipped_models = {
-        # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template.
-        # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads.
-        "openchat/openchat-3.5-0106",
-        
-        # These models fail even on HF so no need to check if applying chat matches.
-        "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy",
-        "codellama/CodeLlama-34b-Instruct-hf",
-        "deepseek-ai/deepseek-math-7b-rl",
-        "allenai/tulu-2-7b",
-        "alexsobolev/IcaroLM",
-        "tokyotech-llm/Swallow-7b-instruct-v0.1",
-        "bofenghuang/vigogne-2-7b-chat",
-        "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k",
-        "AliAbdelrasheed/maqa_llama_4bit",
-        "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored",
-
-        # TODO: Need to support chat templates in more models: CVS-145963
-        # Either ov_genai is unable to parse chat_template or results do not match with HF.
-        "meta-llama/Meta-Llama-3-8B-Instruct",
-        "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp
-        "mosaicml/mpt-30b-chat",
-        "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp
-        "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp
-        "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp
-        "openchat/openchat-3.5-0106",
-        "casperhansen/llama-3-70b-instruct-awq",
-        "TheBloke/deepseek-coder-33B-instruct-GPTQ",
-        "AI-Sweden-Models/gpt-sw3-356m-instruct",
-        "google/gemma-7b-it",
-        "THUDM/cogvlm2-llama3-chat-19B",
-        "KnutJaegersberg/internlm-20b-llama",
-        "maywell/Synatra-Mixtral-8x7B",
-        "MediaTek-Research/Breeze-7B-Instruct-v1_0",
-        "bofenghuang/vigostral-7b-chat",
-        "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp
-        "openchat/openchat-3.6-8b-20240522",
-        "tenyx/TenyxChat-7B-v1",
-        "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2",
-        "yam-peleg/Hebrew-Gemma-11B-V2",
-        "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError
-        "nlpai-lab/KULLM3",
-        "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
-        "MediaTek-Research/Breeze-7B-Instruct-v0_1", 
-        "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError
-        "MLP-KTLim/llama-3-Korean-Bllossom-8B",
-        "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp
-        "codellama/CodeLlama-70b-Instruct-hf",
-        "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp
-        "BramVanroy/Llama-2-13b-chat-dutch"
-    }
-    from tokenizer_configs import get_tokenizer_configs
-    return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models]
-
-
 @functools.lru_cache(1)
 def read_model(params, **tokenizer_kwargs):
     model_id, path = params
     
     from optimum.intel.openvino import OVModelForCausalLM
     from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
     if (path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, 
                                                        compile=False, device='CPU')
     else:
-        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, 
+        ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, 
                                                                              with_detokenizer=True,
                                                                              **tokenizer_kwargs)
         openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml")
         openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml")
         
         # to store tokenizer config jsons with special tokens
-        tokenizer.save_pretrained(path)
+        hf_tokenizer.save_pretrained(path)
         
         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, 
                                                        compile=False, device='CPU', load_in_8bit=False)
@@ -195,7 +109,7 @@ def read_model(params, **tokenizer_kwargs):
     return (
         model_id,
         path,
-        tokenizer,
+        hf_tokenizer,
         opt_model,
         ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}),
     )
@@ -256,20 +170,8 @@ def model_tokenizers_path_tmp_path(tmpdir_factory):
     yield model_id, Path(temp_path)
 
 
-def load_tok(configs: List[Tuple], temp_path):
-    # load Tokenizer where all configs are cleared.
-    # remove existing jsons from previous tests
-    for json_file in temp_path.glob("*.json"):
-        json_file.unlink()
-
-    for config_json, config_name in configs:
-        with (temp_path / config_name).open('w') as f:
-            json.dump(config_json, f)
-    return ov_genai.Tokenizer(temp_path)
-
-
-def load_pipe(configs: List[Tuple], temp_path):
-    # Load LLMPipline where all configs are cleared.
+def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
+    # Load LLMPipeline where all configs are cleared.
     # remove existing jsons from previous tests
     for json_file in temp_path.glob("*.json"):
         json_file.unlink()
@@ -283,5 +185,4 @@ def load_pipe(configs: List[Tuple], temp_path):
 @functools.lru_cache(1)
 def get_continuous_batching(path):
     scheduler_config = ov_genai.SchedulerConfig()
-    scheduler_config.cache_size = 1
     return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config})
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 3dac3f8b0..00bffb664 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
+diffusers==0.31.0
 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
-numpy<2.0.0; sys_platform == 'darwin'
+numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest
 
diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py
index 45704f9dc..d89697ba4 100644
--- a/tests/python_tests/test_cache_optimizations.py
+++ b/tests/python_tests/test_cache_optimizations.py
@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer
 
-from common import TESTS_ROOT
+from common import TESTS_ROOT, run_test_pipeline
 
 
 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching
 
     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {})
-    model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config_opt, "CPU", {})
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
 
     tokenizer = converted_model.tokenizer
 
@@ -145,3 +145,28 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     del model_cb_noopt
 
 
+def get_greedy_seq_len_300() -> GenerationConfig:  # 3 return sequences, up to 300 new tokens
+    config = GenerationConfig()
+    config.max_new_tokens = 300
+    config.num_return_sequences = 3
+    return config
+
+def get_beam_search_seq_len_300() -> GenerationConfig:  # 6 beams in 3 groups, every beam returned, up to 300 new tokens
+    config = GenerationConfig()
+    config.max_new_tokens = 300
+    config.num_beams = 6
+    config.num_beam_groups = 3
+    config.num_return_sequences = config.num_beams
+    return config
+
+scheduler_params_list = [  # (scheduler params dict, GenerationConfig) pairs for test_dynamic_memory_allocation; every entry sets num_kv_blocks and cache_size to 0
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_beam_search_seq_len_300()),
+                         ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())]
+@pytest.mark.parametrize("params", scheduler_params_list)
+@pytest.mark.precommit
+def test_dynamic_memory_allocation(tmp_path, params):  # params: one (scheduler params dict, generation config) pair
+    run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1])  # params[0] -> scheduler_params, params[1] -> generation_config
+
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 9260e671d..07b4f7c15 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -4,24 +4,21 @@
 import openvino_genai as ov_genai
 import pytest
 from typing import Dict, Tuple
+
 from ov_genai_test_utils import (
-    get_models_list,
     get_chat_models_list,
     read_model,
-    load_tok,
-    model_tmp_path,
-    get_chat_templates,
     get_continuous_batching,
 )
 
 
-configs = [
+generation_configs = [
     dict(do_sample=False, max_new_tokens=20),
     dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0)
 ]
 
 
-quenstions = [
+questions = [
     '1+1=',
     'What is the previous answer?',
     'Why is the Sun yellow?',
@@ -29,7 +26,7 @@
 ]
 
 
-@pytest.mark.parametrize("generation_config", configs)
+@pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
@@ -37,18 +34,18 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
     chat_history_hf = []
     chat_history_ov = []
     chat_prompt = ''
-    
+
     # Will set add_special_tokens=False inside pipeline when start_chat() is called.
     model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
-    pipe.start_chat()    
-    for prompt in quenstions:
+    pipe.start_chat()
+    for prompt in questions:
         chat_history_hf.append({'role': 'user', 'content': prompt})
         chat_history_ov.append({'role': 'user', 'content': prompt})
-        
+
         chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
         tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-        
+
         answer = model_opt.generate(**tokenized, **generation_config)
         answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
@@ -57,14 +54,15 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
         chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
 
     pipe.finish_chat()
-    
+
     if chat_history_ov != chat_history_hf:
         print(f'hf_output: {chat_history_hf}')
         print(f'ov_output: {chat_history_ov}')
+
     assert chat_history_ov == chat_history_hf
 
 
-@pytest.mark.parametrize("generation_config", configs)
+@pytest.mark.parametrize("generation_config", generation_configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
@@ -73,169 +71,48 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
     chat_history_hf = []
     chat_history_ov = []
     chat_prompt = ''
-    
+
     # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
     # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
-    
-    for prompt in quenstions:
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+
+    for prompt in questions:
         chat_history_hf.append({'role': 'user', 'content': prompt})
         chat_history_ov.append({'role': 'user', 'content': prompt})
-        
-        chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
-        tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
-        
+
+        chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True)
+        tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
+
         answer = model_opt.generate(**tokenized, **generation_config)
-        answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+        answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
         chat_history_hf.append({'role': 'assistant', 'content': answer_str})
-        
-        chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True)
-        answer_ov = pipe.generate(chat_prompt, **generation_config)
+
+        chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True)
+        answer_ov = ov_pipe.generate(chat_prompt, **generation_config)
         chat_history_ov.append({'role': 'assistant', 'content': answer_ov})
-  
+
     if chat_history_ov != chat_history_hf:
         print(f'hf_output: {chat_history_hf}')
         print(f'ov_output: {chat_history_ov}')
+
     assert chat_history_ov == chat_history_hf
 
 
-@pytest.mark.parametrize("generation_config", configs)
-@pytest.mark.parametrize("model_descr", get_chat_models_list())
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
-    # Check that when history is stored in KV cache results are the same as when history stored in a text.
-    device ='CPU'
-    
-    chat_history_with_kv_cache = []
-    chat_history_ov = []
-    
-    # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
-    # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
-    pipe_with_kv_cache = ov_genai.LLMPipeline(path, device, **{"ENABLE_MMAP": False})
-  
-    pipe_with_kv_cache.start_chat()
-    for question in quenstions:
-        chat_history_with_kv_cache.append({'role': 'user', 'content': question})
-        answer = pipe_with_kv_cache.generate(question, **generation_config)
-        chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer})
-        
-        chat_history_ov.append({'role': 'user', 'content': question})
-        prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True)
-        answer = pipe.generate(prompt, **generation_config)
-        chat_history_ov.append({'role': 'assistant', 'content': answer})
-    pipe_with_kv_cache.finish_chat()
-
-    if chat_history_ov != chat_history_with_kv_cache:
-        print(f'kvcache_hist: {chat_history_with_kv_cache}')
-        print(f'text_history: {chat_history_ov}')
-    assert chat_history_ov == chat_history_with_kv_cache
-
-
-conversation = [
-    {'role': 'user', 'content': '1+1='},
-    {'role': 'assistant', 'content': '1 + 1 = 2'},
-    {'role': 'user', 'content': 'What is the previous answer?'},
-    {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'},
-    {'role': 'user', 'content': 'Why is the sun yellow?'},
-    {'role': 'assistant', 'content': 'Because it emits yeloow light.'},
-    {'role': 'user', 'content': 'What was my first question?'},
-]
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize('chat_config', get_chat_templates())
-def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
-    tokenizer_config = chat_config[1]
-
-    # Will load openvino_model for tiny-random-phi as a placeholder
-    # but indeed only Tokenizer and apply_chat_template will be tested.
-    model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0])
-    
-    full_history_str_hf = tokenizer.apply_chat_template(conversation, 
-        add_generation_prompt=False,
-        tokenize=False,
-        **tokenizer_config)
-    
-    tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1])
-    tok.set_chat_template(tokenizer_config['chat_template'])
-    full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False)
-    if full_history_str != full_history_str_hf:
-        print(f'hf reference: {full_history_str_hf}')
-        print(f'ov_genai out: {full_history_str}')
-    assert full_history_str == full_history_str_hf
-
-
-@pytest.mark.parametrize("generation_config", configs[1:])
+@pytest.mark.parametrize("generation_config", generation_configs[1:])
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
 def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict):
-    model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    cb = get_continuous_batching(path)
-    stateful.start_chat()
-    cb.start_chat()
-    for question in quenstions:
-        generated = cb.generate(question, **generation_config)
-        reference = stateful.generate(question, **generation_config)
-        assert generated == reference
-    # Test that finish_chat() doesn't fail just in case.
-    cb.finish_chat()
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_set_chat_template():
-    model_descr = get_chat_models_list()[0]
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}")
-    pipe.start_chat()
-    generated = pipe.generate("a", max_new_tokens=1)
-    pipe.finish_chat()
-    reference = pipe.generate("a", max_new_tokens=1)
-    assert generated == reference
+    model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    cb_pipe = get_continuous_batching(path)
 
-prompts = [
-    '1+1=',
-    'What is the previous answer?',
-    'Why is the Sun yellow?',
-    'What was my first question?',
-    ['Why is the Sun yellow?'],
-    "若我有一亿美元，在人工智能盛行的今天，我怎样投资才能收益最大化？",
-    "מחרוזת בדיקה",
-    "Multiline\nstring!\nWow!",
-]
+    ov_stateful_pipe.start_chat()
+    cb_pipe.start_chat()
 
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize("add_special_tokens", [True, False])
-@pytest.mark.parametrize("prompt", prompts)
-def test_add_special_tokens(add_special_tokens, prompt):
-    import numpy as np
-    model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    genai_tokenzier = pipe.get_tokenizer()
-    
-    # Calling encode with add_special_tokens will set state flag.
-    res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data
-    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
-    assert np.all(res_genai == res_hf)
+    for question in questions:
+        generated = cb_pipe.generate(question, **generation_config)
+        reference = ov_stateful_pipe.generate(question, **generation_config)
+        assert generated == reference
 
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.parametrize("add_special_tokens", [True, False])
-@pytest.mark.parametrize("skip_special_tokens", [True, False])
-@pytest.mark.parametrize("prompt", prompts)
-def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt):
-    import numpy as np
-    model_descr = get_chat_models_list()[0]
-    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
-    genai_tokenizer = pipe.get_tokenizer()
-    
-    # Calling encode with add_special_tokens will set state flag.
-    res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data
-    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
-    assert np.all(res_genai == res_hf)
-    
-    # Decode with skip_special_tokens
-    decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0]
-    decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens)
-    assert decoded_genai == decoded_hf
+    # Test that finish_chat() doesn't fail just in case.
+    cb_pipe.finish_chat()
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index d15747be6..824a3cca2 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -4,7 +4,6 @@
 import openvino_genai as ov_genai
 from openvino_genai import StopCriteria
 import pytest
-import transformers
 from typing import Union, List, Dict, Optional
 import numpy as np
 import openvino as ov
@@ -15,8 +14,7 @@
 from ov_genai_test_utils import (
     get_models_list, 
     read_model, 
-    load_pipe,
-    load_tok, 
+    load_genai_pipe_with_configs,
     model_tmp_path, 
     STOP_CRITERIA_MAP, 
     get_continuous_batching,
@@ -24,7 +22,7 @@
 
 
 def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]):
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
     config = generation_config.copy()  # to avoid side effects
     num_beams = config['num_beams'] if 'num_beams' in config else 1
     config['num_return_sequences'] = num_beams
@@ -39,25 +37,25 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-    
+
     generation_config_hf = config.copy()
     if generation_config_hf.get('stop_criteria'):
         generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
     generation_config_hf.pop('ignore_eos', None)
 
     # Encode the batch of prompts
-    tokenizer.padding_side = "left"
-    encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
+    hf_tokenizer.padding_side = "left"
+    encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True)
     prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask']
-    
-    hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
+
+    hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
 
     hf_outputs = []
     for idx, hf_encoded_out in enumerate(hf_encoded_outputs):
         prompt_count = idx // num_beams
-        hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True))
+        hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True))
 
-    ov_outputs = pipe.generate(prompts, **config).texts
+    ov_outputs = ov_pipe.generate(prompts, **config).texts
 
     hf_outputs.sort()
     ov_outputs.sort()
@@ -67,8 +65,9 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro
             print(f'ov_output: {ov_output}')
         assert hf_output == ov_output
 
-def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str):
-    model_id, path, tokenizer, model, pipe = model_descr
+
+def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -85,12 +84,12 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str
         generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')]
     generation_config_hf.pop('ignore_eos', None)
 
-    encoded_prompt = tokenizer([prompt], return_tensors='pt', add_special_tokens=True)
+    encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True)
     prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask']
-    hf_encoded_output = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
-    hf_output = tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True)
+    hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf)
+    hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True)
 
-    ov_output = pipe.generate(prompt, **config)
+    ov_output = ov_pipe.generate(prompt, **config)
     if config.get('num_return_sequences', 1) > 1:
         assert hf_output in ov_output.texts
     else:
@@ -100,14 +99,15 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str
 
         assert hf_output == ov_output
 
-def hf_ov_genai_tensors_comparison(
+
+def run_hf_ov_genai_comparison_encoded_inputs(
         model_descr, 
         generation_config: Dict, 
         input_ids: np.ndarray, 
         attention_mask: Optional[np.array] = None
     ):
     device = 'CPU'
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -131,10 +131,8 @@ def hf_ov_genai_tensors_comparison(
         inputs_hf = dict(inputs=torch.tensor(input_ids))
         inputs_ov = ov.Tensor(input_ids)
 
-    hf_output = model.generate(**inputs_hf, **generation_config_hf)
-
-    pipe = ov_genai.LLMPipeline(path, device)
-    ov_output = pipe.generate(inputs_ov, **config)
+    hf_output = opt_model.generate(**inputs_hf, **generation_config_hf)
+    ov_output = ov_pipe.generate(inputs_ov, **config)
 
     hf_res = hf_output[0, input_ids.shape[1]:].numpy()
     ov_res = np.array(ov_output.tokens, dtype=np.int64)
@@ -154,7 +152,8 @@ def hf_ov_genai_tensors_comparison(
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_decoding(model_descr, generation_config, prompt):
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
+
 
 input_tensors_list = [
     # input_ids, attention_mask
@@ -165,62 +164,8 @@ def test_decoding(model_descr, generation_config, prompt):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_ov_tensors(model_descr, inputs):
-    hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)
-
-
-prompts = [
-    'table is made of',
-    '你好！ 你好嗎？',
-    'Alan Turing was a',
-    'The Sun is yellow because',
-    ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a']
-]
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.parametrize("prompt", prompts)
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_genai_tokenizer_encode(model_descr, prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-    tok = pipe.get_tokenizer()
-    
-    encoded_ov = tok.encode(prompt).input_ids.data
-    if isinstance(prompt, list):
-        encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids']
-        for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf):
-            assert np.all(tokens_ov == tokens_hf)
-    else:
-        encoded_hf = tokenizer.encode(prompt)
-        assert np.all(encoded_hf == encoded_ov[0])
-
-encoded_prompts = [
-    [1, 1591, 338, 1754, 310],
-    [1, 17102,   323,  3864,   471,   263],
-    
-    # chineze characters
-    [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882],
-
-    # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer  after removing the last token
-    [3113, 264, 364, 267],
-
-    # batched tokens
-    [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102,   323,  3864,   471,   263]]
-]
-@pytest.mark.parametrize("model_descr", get_models_list())
-@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
-@pytest.mark.precommit
-def test_genai_tokenizer_decode(model_descr, encoded_prompt):
-    model_id, path, tokenizer, model, pipe = read_model(model_descr)
-    tok = pipe.get_tokenizer()
-    decoded_ov = tok.decode(encoded_prompt)
-    
-    if isinstance(encoded_prompt[0], list):
-        decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True)
-        for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf):
-            assert np.all(tokens_ov == tokens_hf)
-    else:
-        decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True)
-        assert decoded_hf == decoded_ov
+def test_encoded_inputs(model_descr, inputs):
+    run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs)
 
 
 test_configs = [
@@ -239,7 +184,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_multibatch(model_descr, generation_config, prompts):
+def test_batch_text_input(model_descr, generation_config, prompts):
     run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
 
 
@@ -261,7 +206,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
         num_return_sequences=num_beam_groups * group_size, 
         max_new_tokens=max_new_tokens, 
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 @pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC])
@@ -283,7 +228,7 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
         max_new_tokens=max_new_tokens, 
         stop_criteria=stop_criteria,
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 # test long sequences
@@ -302,7 +247,7 @@ def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size,
         num_return_sequences=num_beam_groups * group_size, 
         max_new_tokens=max_new_tokens, 
     )
-    run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt)
 
 
 @pytest.mark.parametrize("prompt", prompts)
@@ -317,17 +262,17 @@ def test_greedy_repetition_penalty(model_descr, prompt):
         max_new_tokens=20,
         do_sample=False
     )
-    run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
 
     generation_config = dict(
         repetition_penalty=1.0,
         max_new_tokens=20,
         do_sample=False
     )
-    run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt)
+    run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt)
 
     ov_output = pipe.generate(prompt, **generation_config)
-    
+
     generation_config = dict(
         repetition_penalty=0.5,
         max_new_tokens=20,
@@ -346,19 +291,19 @@ def user_defined_callback(subword):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_callback_one_string(callback):
-    pipe = read_model(get_models_list()[0])[4]
-    generation_config = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    generation_config = ov_pipe.get_generation_config()
     generation_config.max_new_tokens = 10
-    pipe.generate('table is made of', generation_config, callback)
+    ov_pipe.generate('table is made of', generation_config, callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_callback_batch_fail(callback):
-    pipe = read_model(get_models_list()[0])[4]
+def test_callback_batch_throws(callback):
+    ov_pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
-        pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback)
+        ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@@ -368,24 +313,25 @@ def test_callback_kwargs_one_string(callback):
     pipe = read_model(get_models_list()[0])[4]
     pipe.generate('table is made of', max_new_tokens=10, streamer=callback)
 
+
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("model_descr", get_models_list())
 def test_callback_decoding_metallama(model_descr, callback):
-    # On metallam this prompt generates output which can shorten after adding new tokens.
+    # On Meta-Llama-3 this prompt generates output which can become shorter after new tokens are added.
     # Test that streamer correctly handles such cases.
     prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature'
     if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct':
         pytest.skip()
-    pipe = read_model(model_descr)[4]
-    pipe.generate(prompt, max_new_tokens=300, streamer=callback)
+    ov_pipe = read_model(model_descr)[4]
+    ov_pipe.generate(prompt, max_new_tokens=300, streamer=callback)
 
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_callback_kwargs_batch_fail(callback):
+def test_callback_kwargs_batch_throws(callback):
     pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
         pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback)
@@ -408,200 +354,73 @@ def end(self):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_streamer_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    generation_config = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    generation_config = ov_pipe.get_generation_config()
     generation_config.max_new_tokens = 10
-    printer = Printer(pipe.get_tokenizer())
-    pipe.generate('table is made of', generation_config, printer)
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe.generate('table is made of', generation_config, printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_streamer_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_streamer_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer)
+        ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_streamer_kwargs_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
-    pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_streamer_kwargs_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_streamer_kwargs_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe.generate('', num_beams=2, streamer=printer)
+        ov_pipe.generate('', num_beams=2, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 def test_operator_with_callback_one_string(callback):
-    pipe = read_model(get_models_list()[0])[4]
-    ten_tokens = pipe.get_generation_config()
+    ov_pipe = read_model(get_models_list()[0])[4]
+    ten_tokens = ov_pipe.get_generation_config()
     ten_tokens.max_new_tokens = 10
-    pipe('talbe is made of', ten_tokens, callback)
+    ov_pipe('talbe is made of', ten_tokens, callback)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
-def test_operator_with_callback_batch_fail(callback):
-    pipe = read_model(get_models_list()[0])[4]
+def test_operator_with_callback_batch_throws(callback):
+    ov_pipe = read_model(get_models_list()[0])[4]
     with pytest.raises(RuntimeError):
-        pipe(['1', '2'], ov_genai.GenerationConfig(), callback)
+        ov_pipe(['1', '2'], ov_pipe.get_generation_config(), callback)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_operator_with_streamer_kwargs_one_string():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
-    pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
+    ov_pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_operator_with_streamer_kwargs_batch_fail():
-    pipe = read_model(get_models_list()[0])[4]
-    printer = Printer(pipe.get_tokenizer())
+def test_operator_with_streamer_kwargs_batch_throws():
+    ov_pipe = read_model(get_models_list()[0])[4]
+    printer = Printer(ov_pipe.get_tokenizer())
     with pytest.raises(RuntimeError):
-        pipe('', num_beams=2, streamer=printer)
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_ids_1(model_tmp_path):
-    # test when there is an available config.json
-    config_json = { 
-        "pad_token_id": 422,
-        "bos_token_id": 42, 
-        "eos_token_id": 37,
-    }
-    tok = load_tok([(config_json, "config.json")], model_tmp_path[1])
-    assert tok.get_pad_token_id() == config_json['pad_token_id']
-    assert tok.get_bos_token_id() == config_json['bos_token_id']
-    assert tok.get_eos_token_id() == config_json['eos_token_id']
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_str_2(model_tmp_path):
-    # test with special_tokens_map
-    special_tokens_map_json = { 
-        "pad_token": {"content": "<custom_pad>"},
-        "bos_token": {"content": "<custom_bos>"},
-        "eos_token": {"content": "<custom_eos>"},
-    }
-    tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1])
-    assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"]
-    assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"]
-    assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"]
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons")
-def test_load_special_tokens_3_(model_tokenizers_path_tmp_path):
-    # special_tokens_map is not available 
-    # but tokenize_config.json exists
-    # will load both string and integer representations
-    tok_config_json = {
-        "added_tokens_decoder": {
-            "422": {"content": "<pad>"},
-            "37": {"content": "<s>"},
-            "42": {"content": "</s>"},
-        },
-        "pad_token": "<pad>",
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-    }
-
-    tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1])
-    assert tok.get_pad_token() == tok_config_json['pad_token']
-    assert tok.get_bos_token() == tok_config_json['bos_token']
-    assert tok.get_eos_token() == tok_config_json['eos_token']
-
-    assert tok.get_pad_token_id() == 422
-    assert tok.get_bos_token_id() == 37
-    assert tok.get_eos_token_id() == 42
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-def test_load_special_tokens_3(model_tmp_path):
-    # both config.json is available and tokenizer_config.json available
-    # check that it does not read int values from tokenizer_config.json if they are in config.json
-    tok_config_json = {
-    "added_tokens_decoder": {
-        # integers differ from config.json to check they don't override config.json
-        "777": {"content": "<pad>"},
-        "888": {"content": "<s>"},
-        "656": {"content": "</s>"},
-    },
-    "pad_token": "<pad>",
-    "bos_token": "<s>",
-    "eos_token": "</s>",
-    }
-    config_json = { 
-        "pad_token_id": 422,
-        "bos_token_id": 42, 
-        "eos_token_id": 37,
-    }
-    configs = [
-        (tok_config_json, "tokenizer_config.json"),
-        (config_json, "config.json")
-    ]
-    tok = load_tok(configs, model_tmp_path[1])
-    assert tok.get_pad_token_id() == config_json['pad_token_id']
-    assert tok.get_bos_token_id() == config_json['bos_token_id']
-    assert tok.get_eos_token_id() == config_json['eos_token_id']
-
-    assert tok.get_pad_token() == tok_config_json['pad_token']
-    assert tok.get_bos_token() == tok_config_json['bos_token']
-    assert tok.get_eos_token() == tok_config_json['eos_token']
-
-
-@pytest.mark.precommit
-@pytest.mark.nightly
-@pytest.mark.xfail(
-    raises=AssertionError, 
-    reason="CVS-143410 ov tokenizer should be aligned with hf",
-    strict=False,
-)
-def test_load_special_tokens_4(model_tmp_path):
-    # only string representation is provided, find token integers by inference
-    model_id, temp_path = model_tmp_path
-    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    
-    special_tokens_map_json = {}
-    token_str_int_map = {}
-    special_token_names = ['pad_token', 'bos_token', 'eos_token']
-    for token_str in special_token_names:
-        if hasattr(tokenizer, token_str):
-            token_val = getattr(tokenizer, token_str)
-            special_tokens_map_json.update({token_str: {"content": token_val}})
-            token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0]
-            token_str_int_map.update({token_str: token_id})
-
-    # since only string representations are present in the json will try to get by inference
-    tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path)
-
-    # check ids inferred correctly for special tokens existing if HF tokenizer
-    if 'pad_token' in token_str_int_map:
-        assert tok.get_pad_token_id() == token_str_int_map['pad_token']
-    if 'bos_token' in token_str_int_map:
-        assert tok.get_bos_token_id() == token_str_int_map['bos_token']
-    if 'eos_token' in token_str_int_map:
-        assert tok.get_eos_token_id() == token_str_int_map['eos_token']
+        ov_pipe('', num_beams=2, streamer=printer)
 
 
 invalid_configs = [
@@ -617,23 +436,24 @@ def test_load_special_tokens_4(model_tmp_path):
 @pytest.mark.parametrize("generation_config", invalid_configs)
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_invalid_configs(model_tmp_path, generation_config):
+def test_invalid_generation_configs_throws(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
     config_json = {}
-    pipe = load_pipe([(config_json, "config.json")], temp_path)
+    ov_pipe = load_genai_pipe_with_configs([(config_json, "config.json")], temp_path)
     with pytest.raises(RuntimeError):
-        pipe.generate('blah blah', **generation_config)
+        ov_pipe.generate('blah blah', **generation_config)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_valid_configs(model_tmp_path):
     model_id, temp_path = model_tmp_path
-    pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
+    ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
 
     config = ov_genai.GenerationConfig()
     config.do_sample = True  # no eos_token_id but it's loaded from config.json
-    pipe.set_generation_config(config)
+    ov_pipe.set_generation_config(config)
+
 
 invalid_py_configs = [
     dict(num_beam_groups=3, num_beams=15, do_sample=True),
@@ -648,49 +468,48 @@ def test_valid_configs(model_tmp_path):
 @pytest.mark.precommit
 @pytest.mark.nightly
 @pytest.mark.parametrize("generation_config", invalid_py_configs)
-def test_python_generation_config_validation(model_tmp_path, generation_config):
+def test_python_generation_config_validation_throws(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
-    pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
-    
+    ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path)
+
     # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned
     #  instead of RuntimeError, which is returned when GenerationConfig values are validated
     return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError
     with pytest.raises(return_exception_type):
-        pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
+        ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config))
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_1():
+def test_unicode_pybind_decoding_one_string():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
-    res_str = pipe.generate(',', max_new_tokens=4)
+    ov_pipe = read_model((model_id, path))[4]
+    res_str = ov_pipe.generate(',', max_new_tokens=4)
     assert '�' == res_str[-1]
 
 
-
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_2():
+def test_unicode_pybind_decoding_batched():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
-    res_str = pipe.generate([","], max_new_tokens=4)
+    ov_pipe = read_model((model_id, path))[4]
+    res_str = ov_pipe.generate([","], max_new_tokens=4)
     assert '�' == res_str.texts[0][-1]
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
-def test_unicode_pybind_decoding_3():
+def test_unicode_pybind_decoding_one_string_streamer():
     # On this model this prompt generates unfinished utf-8 string
     # and streams it. Test that pybind will not fail while we pass string to python.
     model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
-    pipe = read_model((model_id, path))[4]
+    ov_pipe = read_model((model_id, path))[4]
     res_str = []
-    pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x))
+    ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x))
     assert '�' == res_str[-1]
 
 
@@ -741,22 +560,24 @@ def test_continuous_batching_vs_stateful(prompt, generation_config):
         for gen, ref in zip(generated.scores, reference.scores):
             assert math.isclose(gen, ref, abs_tol=0.0003)
 
+
 @pytest.mark.parametrize("prompt", prompts)
 @pytest.mark.precommit
 def test_cb_streamer_vs_return_vs_stateful(prompt):
-    model_id, path, tokenizer, model, stateful = read_model((
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((
         "facebook/opt-125m",
         Path("opt-125m")
     ))
-    cb = get_continuous_batching(path)
+    cb_pipe = get_continuous_batching(path)
     streamed = []
-    generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
-    reference = stateful.generate(prompt, max_new_tokens=20)
+    generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword))
+    reference = ov_pipe.generate(prompt, max_new_tokens=20)
     assert generated == "".join(streamed)
     assert "".join(streamed) == reference
 
+
 def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics:
-    model_id, path, tokenizer, model, pipe = model_descr
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr
 
     config = generation_config.copy()  # to avoid side effects
 
@@ -767,7 +588,7 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st
         # Do not apply 'repetition_penalty' if sampling is not used.
         config['do_sample'] = False
         config['repetition_penalty'] = 1.0 # 1.0 means no penalty
-    return pipe.generate([prompt], **config).perf_metrics
+    return ov_pipe.generate([prompt], **config).perf_metrics
 
 
 test_cases = [
@@ -798,6 +619,12 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
     assert mean_ttft > 0 and mean_ttft < 1000.0
 
+    raw_metrics = perf_metrics.raw_metrics
+    durations = np.array(raw_metrics.m_durations) / 1000
+    # Check that prefill is not included in durations for TPOT calculation.
+    # For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration.
+    assert np.all(mean_ttft > durations * 2)
+
     mean_tpot, std_tpot = perf_metrics.get_tpot()
     assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
     assert mean_tpot > 0 and mean_ttft < 1000.0
@@ -822,7 +649,9 @@ def test_perf_metrics(model_descr, generation_config, prompt):
     assert std_detok_duration == 0
     
     # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics
-    raw_metrics = perf_metrics.raw_metrics
+    assert np.allclose(mean_tpot, np.mean(durations))
+    assert np.allclose(std_tpot, np.std(durations))
+
     raw_dur = np.array(raw_metrics.generate_durations) / 1000
     assert np.allclose(mean_gen_duration, np.mean(raw_dur))
     assert np.allclose(std_gen_duration, np.std(raw_dur))
@@ -843,19 +672,19 @@ def test_perf_metrics(model_descr, generation_config, prompt):
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_batch_switch():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    pipe.generate(["a"], max_new_tokens=2)
-    pipe.generate(["1", "2"], max_new_tokens=2)
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    ov_pipe.generate(["a"], max_new_tokens=2)
+    ov_pipe.generate(["1", "2"], max_new_tokens=2)
 
 
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_token_ids():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = pipe.generate(
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = ov_pipe.generate(
         ov.Tensor([(1,)]),
         max_new_tokens=3,
-        stop_token_ids={-1, 9935, pipe.get_tokenizer().get_eos_token_id()},
+        stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()},
         include_stop_str_in_output=False
     )
     assert 2 == len(res.tokens[0])
@@ -865,8 +694,8 @@ def test_stop_token_ids():
 @pytest.mark.precommit
 @pytest.mark.nightly
 def test_stop_strings():
-    pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
-    res = pipe.generate(
+    ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4]
+    res = ov_pipe.generate(
         "",
         max_new_tokens=5,
         stop_strings={"ignored", "боль"}
diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py
index 49d6c8f6b..7c648e73d 100644
--- a/tests/python_tests/test_preemption.py
+++ b/tests/python_tests/test_preemption.py
@@ -4,7 +4,7 @@
 import pytest
 
 from openvino_genai import GenerationConfig
-from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
+from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \
     get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -87,7 +87,7 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
         config.rng_seed = 0
         config.max_new_tokens = 30
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
@@ -168,7 +168,7 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
     for config in generation_configs:
         config.rng_seed = 0
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 9aa6931d8..fbcce76bf 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -10,7 +10,7 @@
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 from typing import List, TypedDict
 
-from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \
+from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \
     generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \
     get_greedy_with_penalties, get_multinomial_temperature, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \
@@ -21,23 +21,25 @@
     get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \
     get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \
     get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \
+    get_greedy_stop_strings_exclude_from_output, get_greedy_stop_strings_include_to_output, \
+    get_greedy_n_stop_strings_exclude_from_output, get_greedy_n_stop_strings_include_to_output, \
     generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \
     run_continuous_batching
 
 
 @pytest.mark.precommit
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 def test_sampling_precommit(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
 
 @pytest.mark.nightly
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly")))
 def test_sampling_nightly(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
 @pytest.mark.real_models
-@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
+@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models")))
 def test_real_models(tmp_path, model_id):
     run_test_pipeline(tmp_path, model_id)
 
@@ -77,7 +79,9 @@ def test_eos_greedy(tmp_path):
 @pytest.mark.precommit
 @pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(),
                                                get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), 
-                                               get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ],
+                                               get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(),
+                                               get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(),
+                                               get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ],
         ids=[
             "greedy",
             "greedy_with_min_and_max_tokens",
@@ -88,6 +92,10 @@ def test_eos_greedy(tmp_path):
             "beam",
             "beam_search_min_and_max_tokens",
             "beam_search_with_multiple_stop_strings_no_match",
+            "get_greedy_stop_strings_exclude_from_output",
+            "get_greedy_stop_strings_include_to_output",
+            "get_greedy_n_stop_strings_exclude_from_output",
+            "get_greedy_n_stop_strings_include_to_output"
             ])
 def test_individual_generation_configs_deterministic(tmp_path, generation_config):
     prompts = [
@@ -305,7 +313,7 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl
     generation_config.rng_seed = 0
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
@@ -329,12 +337,12 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -356,12 +364,12 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t
     scheduler_config.max_num_batched_tokens = max_num_batched_tokens
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU")
 
     outputs = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(outputs))
@@ -384,12 +392,12 @@ def test_post_oom_health(tmp_path, sampling_config):
     scheduler_config.num_kv_blocks = 10
     generation_configs = [generation_config]
     model_id : str = "facebook/opt-125m"
-    model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True)
+    model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True)
 
     models_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, models_path)
 
-    pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix()), scheduler_config, "CPU", {})
+    pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
     # First run should return incomplete response
     output = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert (len(output))
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
new file mode 100644
index 000000000..0c2a106d5
--- /dev/null
+++ b/tests/python_tests/test_tokenizer.py
@@ -0,0 +1,360 @@
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import numpy as np
+from transformers import AutoTokenizer
+from typing import Dict, Tuple, List
+import openvino_genai
+import json
+
+from ov_genai_test_utils import (
+    get_models_list,
+    get_chat_models_list,
+    read_model,
+    model_tmp_path
+)
+
+
+def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path):
+    # load Tokenizer where all configs are cleared.
+    # remove existing jsons from previous tests
+    for json_file in temp_path.glob("*.json"):
+        json_file.unlink()
+
+    for config_json, config_name in configs:
+        with (temp_path / config_name).open('w') as f:
+            json.dump(config_json, f)
+    return openvino_genai.Tokenizer(temp_path)
+
+
+def get_chat_templates():
+    # Returns chat templates saved in tokenizer_configs.py, 
+    # but skips some models that currently are not processed correctly.
+
+    skipped_models = {
+        # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template.
+        # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads.
+        "openchat/openchat-3.5-0106",
+        
+        # These models fail even on HF so no need to check if applying chat matches.
+        "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy",
+        "codellama/CodeLlama-34b-Instruct-hf",
+        "deepseek-ai/deepseek-math-7b-rl",
+        "allenai/tulu-2-7b",
+        "alexsobolev/IcaroLM",
+        "tokyotech-llm/Swallow-7b-instruct-v0.1",
+        "bofenghuang/vigogne-2-7b-chat",
+        "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k",
+        "AliAbdelrasheed/maqa_llama_4bit",
+        "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored",
+
+        # TODO: Need to support chat templates in more models: CVS-145963
+        # Either ov_genai is unable to parse chat_template or results do not match with HF.
+        "meta-llama/Meta-Llama-3-8B-Instruct",
+        "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp
+        "mosaicml/mpt-30b-chat",
+        "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp
+        "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp
+        "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp
+        "openchat/openchat-3.5-0106",
+        "casperhansen/llama-3-70b-instruct-awq",
+        "TheBloke/deepseek-coder-33B-instruct-GPTQ",
+        "AI-Sweden-Models/gpt-sw3-356m-instruct",
+        "google/gemma-7b-it",
+        "THUDM/cogvlm2-llama3-chat-19B",
+        "KnutJaegersberg/internlm-20b-llama",
+        "maywell/Synatra-Mixtral-8x7B",
+        "MediaTek-Research/Breeze-7B-Instruct-v1_0",
+        "bofenghuang/vigostral-7b-chat",
+        "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp
+        "openchat/openchat-3.6-8b-20240522",
+        "tenyx/TenyxChat-7B-v1",
+        "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2",
+        "yam-peleg/Hebrew-Gemma-11B-V2",
+        "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError
+        "nlpai-lab/KULLM3",
+        "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
+        "MediaTek-Research/Breeze-7B-Instruct-v0_1", 
+        "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError
+        "MLP-KTLim/llama-3-Korean-Bllossom-8B",
+        "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp
+        "codellama/CodeLlama-70b-Instruct-hf",
+        "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp
+        "BramVanroy/Llama-2-13b-chat-dutch"
+    }
+
+    from tokenizer_configs import get_tokenizer_configs
+    return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models]
+
+
+prompts = [
+    'table is made of',
+    '你好！ 你好嗎？',
+    'Alan Turing was a',
+    'The Sun is yellow because',
+    ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a']
+]
+@pytest.mark.parametrize("model_descr", get_models_list())
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_encode(model_descr, prompt):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+
+    encoded_ov = ov_tokenizer.encode(prompt).input_ids.data
+    if isinstance(prompt, list):
+        encoded_hf = hf_tokenizer.batch_encode_plus(prompt)['input_ids']
+        for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf):
+            assert np.all(tokens_ov == tokens_hf)
+    else:
+        encoded_hf = hf_tokenizer.encode(prompt)
+        assert np.all(encoded_hf == encoded_ov[0])
+
+
+encoded_prompts = [
+    [1, 1591, 338, 1754, 310],
+    [1, 17102,   323,  3864,   471,   263],
+
+    # Chinese characters
+    [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882],
+
+    # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token
+    [3113, 264, 364, 267],
+
+    # batched tokens
+    [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102,   323,  3864,   471,   263]]
+]
+@pytest.mark.parametrize("model_descr", get_models_list())
+@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
+@pytest.mark.precommit
+def test_decode(model_descr, encoded_prompt):
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr)
+    ov_tokenizer = ov_pipe.get_tokenizer()
+    decoded_ov = ov_tokenizer.decode(encoded_prompt)
+
+    if isinstance(encoded_prompt[0], list):
+        decoded_hf = hf_tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True)
+        for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf):
+            assert np.all(tokens_ov == tokens_hf)
+    else:
+        decoded_hf = hf_tokenizer.decode(encoded_prompt, skip_special_tokens=True)
+        assert decoded_hf == decoded_ov
+
+
+conversation = [
+    {'role': 'user', 'content': '1+1='},
+    {'role': 'assistant', 'content': '1 + 1 = 2'},
+    {'role': 'user', 'content': 'What is the previous answer?'},
+    {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'},
+    {'role': 'user', 'content': 'Why is the sun yellow?'},
+    {'role': 'assistant', 'content': 'Because it emits yeloow light.'},
+    {'role': 'user', 'content': 'What was my first question?'},
+]
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize('chat_config', get_chat_templates())
+def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
+    tokenizer_config = chat_config[1]
+
+    # Will load openvino_model for tiny-random-phi as a placeholder
+    # but in fact only Tokenizer and apply_chat_template will be tested.
+    model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(get_models_list()[0])
+
+    hf_full_history_str = hf_tokenizer.apply_chat_template(conversation,
+        add_generation_prompt=False,
+        tokenize=False,
+        **tokenizer_config)
+
+    ov_tokenizer = load_genai_tokenizer_with_configs([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1])
+    ov_tokenizer.set_chat_template(tokenizer_config['chat_template'])
+    ov_full_history_str = ov_tokenizer.apply_chat_template(conversation, add_generation_prompt=False)
+
+    if ov_full_history_str != hf_full_history_str:
+        print(f'hf reference: {hf_full_history_str}')
+        print(f'ov_genai out: {ov_full_history_str}')
+    assert ov_full_history_str == hf_full_history_str
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_set_chat_template():
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+
+    prompt = "how are you?"
+    dummy_conversation = [
+        {'role': 'user', 'content': prompt},
+    ]
+
+    ov_tokenizer = ov_pipe.get_tokenizer()
+    identity_chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}"
+
+    templated_prompt_inline = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False, chat_template=identity_chat_template)
+
+    ov_tokenizer.set_chat_template(identity_chat_template)
+    templated_prompt = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False)
+
+    assert templated_prompt_inline == templated_prompt
+    assert prompt == templated_prompt
+
+
+prompts = [
+    '1+1=',
+    'What is the previous answer?',
+    'Why is the Sun yellow?',
+    'What was my first question?',
+    ['Why is the Sun yellow?'],
+    "若我有一亿美元，在人工智能盛行的今天，我怎样投资才能收益最大化？",
+    "מחרוזת בדיקה",
+    "Multiline\nstring!\nWow!",
+]
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("add_special_tokens", [True, False])
+@pytest.mark.parametrize("skip_special_tokens", [True, False])
+@pytest.mark.parametrize("prompt", prompts)
+def test_encode_decode_with_special_tokens_option(add_special_tokens, skip_special_tokens, prompt):
+    import numpy as np
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    ov_tokenzier = ov_pipe.get_tokenizer()
+
+    # Calling encode with 'add_special_tokens' will set state flag.
+    ov_res = ov_tokenzier.encode(prompt, add_special_tokens=add_special_tokens).input_ids.data
+    hf_res = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
+    assert np.all(ov_res == hf_res)
+
+    # Decode with 'skip_special_tokens'
+    decoded_genai = ov_tokenzier.decode(ov_res, skip_special_tokens=skip_special_tokens)[0]
+    decoded_hf = hf_tokenizer.decode(hf_res[0], skip_special_tokens=skip_special_tokens)
+    assert decoded_genai == decoded_hf
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_config_json(model_tmp_path):
+    # test when there is an available config.json
+    config_json = {
+        "pad_token_id": 422,
+        "bos_token_id": 42,
+        "eos_token_id": 37,
+    }
+    tok = load_genai_tokenizer_with_configs([(config_json, "config.json")], model_tmp_path[1])
+    assert tok.get_pad_token_id() == config_json['pad_token_id']
+    assert tok.get_bos_token_id() == config_json['bos_token_id']
+    assert tok.get_eos_token_id() == config_json['eos_token_id']
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path):
+    # test with special_tokens_map
+    special_tokens_map_json = {
+        "pad_token": {"content": "<custom_pad>"},
+        "bos_token": {"content": "<custom_bos>"},
+        "eos_token": {"content": "<custom_eos>"},
+    }
+    tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1])
+    assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"]
+    assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"]
+    assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"]
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons")
+def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path):
+    # special_tokens_map is not available
+    # but tokenizer_config.json exists
+    # will load both string and integer representations
+    tok_config_json = {
+        "added_tokens_decoder": {
+            "422": {"content": "<pad>"},
+            "37": {"content": "<s>"},
+            "42": {"content": "</s>"},
+        },
+        "pad_token": "<pad>",
+        "bos_token": "<s>",
+        "eos_token": "</s>",
+    }
+
+    tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1])
+    assert tok.get_pad_token() == tok_config_json['pad_token']
+    assert tok.get_bos_token() == tok_config_json['bos_token']
+    assert tok.get_eos_token() == tok_config_json['eos_token']
+
+    assert tok.get_pad_token_id() == 422
+    assert tok.get_bos_token_id() == 37
+    assert tok.get_eos_token_id() == 42
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_load_special_tokens_from_tokenizer_config_and_config_json(model_tmp_path):
+    # both config.json and tokenizer_config.json are available
+    # check that it does not read int values from tokenizer_config.json if they are in config.json
+    tok_config_json = {
+    "added_tokens_decoder": {
+        # integers differ from config.json to check they don't override config.json
+        "777": {"content": "<pad>"},
+        "888": {"content": "<s>"},
+        "656": {"content": "</s>"},
+    },
+    "pad_token": "<pad>",
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    }
+    config_json = {
+        "pad_token_id": 422,
+        "bos_token_id": 42,
+        "eos_token_id": 37,
+    }
+    configs = [
+        (tok_config_json, "tokenizer_config.json"),
+        (config_json, "config.json")
+    ]
+    tok = load_genai_tokenizer_with_configs(configs, model_tmp_path[1])
+    assert tok.get_pad_token_id() == config_json['pad_token_id']
+    assert tok.get_bos_token_id() == config_json['bos_token_id']
+    assert tok.get_eos_token_id() == config_json['eos_token_id']
+
+    assert tok.get_pad_token() == tok_config_json['pad_token']
+    assert tok.get_bos_token() == tok_config_json['bos_token']
+    assert tok.get_eos_token() == tok_config_json['eos_token']
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.xfail(
+    raises=AssertionError,
+    reason="CVS-143410 ov tokenizer should be aligned with hf",
+    strict=False,
+)
+def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model_tmp_path):
+    # only string representation is provided, find token integers by inference
+    model_id, temp_path = model_tmp_path
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+
+    special_tokens_map_json = {}
+    token_str_int_map = {}
+    special_token_names = ['pad_token', 'bos_token', 'eos_token']
+    for token_str in special_token_names:
+        if hasattr(tokenizer, token_str):
+            token_val = getattr(tokenizer, token_str)
+            special_tokens_map_json.update({token_str: {"content": token_val}})
+            token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0]
+            token_str_int_map.update({token_str: token_id})
+
+    # since only string representations are present in the json, token ids will be obtained by inference
+    tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], temp_path)
+
+    # check ids are inferred correctly for special tokens existing in HF tokenizer
+    if 'pad_token' in token_str_int_map:
+        assert tok.get_pad_token_id() == token_str_int_map['pad_token']
+    if 'bos_token' in token_str_int_map:
+        assert tok.get_bos_token_id() == token_str_int_map['bos_token']
+    if 'eos_token' in token_str_int_map:
+        assert tok.get_eos_token_id() == token_str_int_map['eos_token']
+
diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py
index 5a68dd98b..aa78666e3 100644
--- a/tests/python_tests/test_whisper_generate_api.py
+++ b/tests/python_tests/test_whisper_generate_api.py
@@ -6,7 +6,6 @@
 import pytest
 import openvino_tokenizers
 import openvino
-from ov_genai_test_utils import get_whisper_models_list
 import datasets
 from transformers import WhisperProcessor, pipeline, AutoTokenizer
 from optimum.intel.openvino import OVModelForSpeechSeq2Seq
@@ -15,6 +14,8 @@
 import time
 import typing
 import numpy as np
+import os
+import pathlib
 
 @pytest.fixture(scope="class", autouse=True)
 def run_gc_after_test():
@@ -25,7 +26,37 @@ def run_gc_after_test():
     yield
     gc.collect()
 
-@functools.lru_cache(1)
+
+def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False):
+    """Return (model_id, model_path) pairs for the whisper models of the current run.
+
+    Flag precedence: tiny_only, then en_only, then multilingual. Runs whose marker is not
+    "precommit" get an empty list. When pytest.selected_model_ids is set (space separated),
+    ids that are not listed there are dropped.
+    """
+    tiny = "openai/whisper-tiny"
+    english = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"]
+    if tiny_only or (multilingual and not en_only):
+        precommit = [tiny]
+    elif en_only:
+        precommit = english
+    else:
+        precommit = [tiny, *english]
+
+    candidates = precommit if pytest.run_marker == "precommit" else []
+
+    selected = pytest.selected_model_ids
+    if selected:
+        candidates = [name for name in candidates if name in selected.split(' ')]
+
+    # each path is the optional env prefix joined with the part of the id after the slash
+    root = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
+    return [(name, root / name.split('/')[1]) for name in candidates]
+
+
+# the whisper models used here are relatively small,
+# so cache them in memory to speed up the tests
+@functools.lru_cache(3)
 def read_whisper_model(params, **tokenizer_kwargs):
     model_id, path = params
 
@@ -568,6 +599,31 @@ def test_longform_audio(model_descr, test_sample):
     assert genai_result.chunks == None
 
 
+@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
+@pytest.mark.parametrize(
+    "test_sample",
+    get_samples_from_dataset(length=1),
+)
+@pytest.mark.precommit
+def test_initial_prompt_hotwords(model_descr, test_sample):
+    """Check that initial_prompt and hotwords each change the transcribed spelling of a name."""
+    model_id, path, opt_pipe, pipe = read_whisper_model(model_descr)
+    default_name, hinted_name = "Joel Keaton", "Joel Kyton"
+
+    # without any hint the transcription is expected to contain the default spelling
+    baseline = pipe.generate(test_sample)
+    baseline_text = baseline.texts[0]
+    assert default_name in baseline_text
+    assert hinted_name not in baseline_text
+
+    # the same hint is passed first as initial_prompt, then as hotwords
+    for hint_kwargs in ({"initial_prompt": hinted_name}, {"hotwords": hinted_name}):
+        hinted = pipe.generate(test_sample, **hint_kwargs)
+        hinted_text = hinted.texts[0]
+        assert default_name not in hinted_text
+        assert hinted_name in hinted_text
+
+
 @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True))
 @pytest.mark.parametrize(
     "test_sample",
diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py
index 45d60f998..2b51dc2b0 100644
--- a/tests/python_tests/tokenizer_configs.py
+++ b/tests/python_tests/tokenizer_configs.py
@@ -2,1011 +2,1011 @@
 def get_tokenizer_configs():
     return {
         "meta-llama/Meta-Llama-3-8B-Instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "TheBloke/Mistral-7B-OpenOrca-GPTQ": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "upstage/SOLAR-10.7B-Instruct-v1.0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}"
         },
         "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "Qwen/Qwen1.5-0.5B": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "Felladrin/Llama-68M-Chat-v1": {
-        "bos_token": "<|im_start|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|im_end|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|im_start|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|im_end|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "databricks/dbrx-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|pad|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|pad|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
         },
         "speakleash/Bielik-7B-Instruct-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + eos_token }}{% endif %}{% endfor %}"
         },
         "internlm/internlm2-chat-7b": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "Qwen/Qwen2-7B-Instruct": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "codellama/CodeLlama-34b-Instruct-hf": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": {
-        "bos_token": None,
-        "eos_token": "<|end|>",
-        "pad_token": "<|pad|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|end|>",
+            "pad_token": "<|pad|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}"
         },
         "mosaicml/mpt-30b-chat": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}"
         },
         "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "deepseek-ai/deepseek-coder-6.7b-instruct": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "<|EOT|>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "<|EOT|>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
         },
         "deepseek-ai/deepseek-math-7b-rl": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": {
-        "__type": "AddedToken",
-        "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": {
+                "__type": "AddedToken",
+                "content": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
         },
         "FINGU-AI/FinguAI-Chat-v1": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "allenai/tulu-2-7b": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "maldv/winter-garden-7b-alpha": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}"
         },
         "mlabonne/NeuralMonarch-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
         },
         "meta-llama/Llama-2-7b-chat-hf": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "GritLM/GritLM-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "ishorn5/RTLCoder-Deepseek-v1.1": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
         },
         "jondurbin/bagel-34b-v0.2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}"
         },
         "openchat/openchat-3.5-0106": {
-        "bos_token": "<s>",
-        "eos_token": "<|end_of_turn|>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|end_of_turn|>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}"
         },
         "mobiuslabsgmbh/aanaphi2-v0.1": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "[PAD]",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: '  + message['content'].strip() + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "[PAD]",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: '  + message['content'].strip() + '\n'}}{% endif %}{% endfor %}"
         },
         "typeof/mistral-60m": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}"
         },
         "turboderp/Cat-Llama-3-70B-instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "saltlux/Ko-Llama3-Luxia-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}"
         },
         "h2oai/h2o-danube2-1.8b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
         },
         "abhishek/autotrain-llama3-70b-orpo-v1": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<pad>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<pad>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
         },
         "casperhansen/llama-3-70b-instruct-awq": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
         },
         "01-ai/Yi-1.5-34B-Chat": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "allenai/OLMo-7B-Instruct": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|padding|>",
-        "unk_token": None,
-        "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|padding|>",
+            "unk_token": None,
+            "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "TheBloke/deepseek-coder-33B-instruct-GPTQ": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<|EOT|>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<|EOT|>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
         },
         "cognitivecomputations/dolphin-2.8-mistral-7b-v02": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "alexsobolev/IcaroLM": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "tokyotech-llm/Swallow-7b-instruct-v0.1": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ''  + content.strip() + '' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ''  + content.strip() + '' + eos_token }}{% endif %}{% endfor %}"
         },
         "instructlab/merlinite-7b-lab": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|pad|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|pad|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}"
         },
         "microsoft/Phi-3-medium-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|placeholder6|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|placeholder6|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "katuni4ka/tiny-random-phi3": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "microsoft/Phi-3-mini-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|placeholder6|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|placeholder6|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "VAGOsolutions/SauerkrautLM-Qwen-32b": {
-        "bos_token": None,
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "AI-Sweden-Models/gpt-sw3-356m-instruct": {
-        "bos_token": None,
-        "eos_token": None,
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:"
+            "bos_token": None,
+            "eos_token": None,
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:"
         },
         "google/gemma-7b-it": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
         },
         "ise-uiuc/Magicoder-S-DS-6.7B": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}"
         },
         "Deci/DeciLM-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}"
         },
         "katuni4ka/tiny-random-minicpm": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"
         },
         "UnicomLLM/Unichat-llama3-Chinese-8B-28K": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content']  %}{% if loop.index0 == 0  %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content =  'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content =  bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and  not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content']  %}{% if loop.index0 == 0  %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content =  'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content =  bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and  not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}"
         },
         "RLHFlow/LLaMA3-SFT": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}"
         },
         "bofenghuang/vigogne-2-7b-chat": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": False,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": False,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}"
         },
         "aisingapore/sea-lion-7b-instruct": {
-        "bos_token": None,
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|padding|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}"
+            "bos_token": None,
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|padding|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}"
         },
         "microsoft/Phi-3-small-8k-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "THUDM/cogvlm2-llama3-chat-19B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "tiiuae/falcon-11B": {
-        "bos_token": ">>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n'  + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": ">>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n'  + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}"
         },
         "Mihaiii/Pallas-0.5": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}"
         },
         "prithivida/Asimov-7B-v2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}"
         },
         "dreamgen/opus-v1.2-7b": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}"
         },
         "KnutJaegersberg/internlm-20b-llama": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:'  + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:'  + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}"
         },
         "alpindale/WizardLM-2-8x22B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}"
         },
         "yentinglin/Taiwan-LLM-7B-v2.0-base": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}"
         },
         "maywell/Synatra-Mixtral-8x7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}"
         },
         "MediaTek-Research/Breeze-7B-Instruct-v1_0": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "MTSAIR/multi_verse_model": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}"
         },
         "bofenghuang/vigostral-7b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "SeaLLMs/SeaLLM-7B-v2.5": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<eos>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<eos>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "qnguyen3/Master-Yi-9B": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
         },
         "meetkai/functionary-small-v2.5": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "h2oai/h2o-danube-1.8b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}"
         },
         "TheBloke/CodeLlama-70B-Instruct-AWQ": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
         },
         "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
         },
         "ibm-granite/granite-8b-code-instruct": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
         },
         "dicta-il/dictalm2.0-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "nvidia/Llama3-ChatQA-1.5-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}"
         },
         "openchat/openchat-3.6-8b-20240522": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": {
-        "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "pad_token": None,
-        "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": False,
-        "normalized": True,
-        "rstrip": False,
-        "single_word": False
-        },
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}"
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "pad_token": None,
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}"
         },
         "tenyx/TenyxChat-7B-v1": {
-        "bos_token": "<s>",
-        "eos_token": "<|end_of_turn|>",
-        "pad_token": "<|end_of_turn|>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|end_of_turn|>",
+            "pad_token": "<|end_of_turn|>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}"
         },
         "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}"
         },
         "SeaLLMs/SeaLLM-7B-v2": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
         },
         "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": {
-        "bos_token": "<s>",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|im_end|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "<|im_end|>",
+            "pad_token": "<|im_end|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}"
         },
         "vaiv/llamion-14b-chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
         },
         "yam-peleg/Hebrew-Gemma-11B-V2": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
         },
         "shenzhi-wang/Llama3-8B-Chinese-Chat": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "ericzzz/falcon-rw-1b-chat": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}"
         },
         "NLPark/AnFeng_v3_Avocet": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
         },
         "microsoft/Phi-3-vision-128k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
         },
         "jphme/em_german_leo_mistral": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}"
         },
         "nlpai-lab/KULLM3": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. 
\uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. 
\uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": {
-        "bos_token": "<bos>",
-        "eos_token": "<eos>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}"
+            "bos_token": "<bos>",
+            "eos_token": "<eos>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}"
         },
         "MediaTek-Research/Breeze-7B-Instruct-v0_1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }}   {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }}   {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "microsoft/DialoGPT-large": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
         },
         "meta-llama/Meta-Llama-Guard-2-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}"
         },
         "chinoll/Yi-6b-200k-dpo": {
-        "bos_token": "<|startoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<|startoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "shanchen/llama3-8B-slerp-biomed-chat-chinese": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "MLP-KTLim/llama-3-Korean-Bllossom-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "UnfilteredAI/UNfilteredAI-1B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}"
         },
         "abacusai/Smaug-Mixtral-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
         },
         "ProbeMedicalYonseiMAILab/medllama3-v20": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] +  eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] +  eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}"
         },
         "vinai/PhoGPT-4B-Chat": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<pad>",
-        "unk_token": "<unk>",
-        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<pad>",
+            "unk_token": "<unk>",
+            "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}"
         },
         "lucyknada/microsoft_WizardLM-2-7B": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}"
         },
         "bigcode/starcoder2-15b-instruct-v0.1": {
-        "bos_token": "<|endoftext|>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": None,
-        "unk_token": "<|endoftext|>",
-        "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}"
+            "bos_token": "<|endoftext|>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": None,
+            "unk_token": "<|endoftext|>",
+            "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n        {{ raise_exception('System messages are not allowed in this template.') }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n        {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}"
         },
         "AliAbdelrasheed/maqa_llama_4bit": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|reserved_special_token_250|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|reserved_special_token_250|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
         },
         "lightonai/alfred-40b-1023": {
-        "bos_token": None,
-        "eos_token": "<end_message>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>'  + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}"
+            "bos_token": None,
+            "eos_token": "<end_message>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>'  + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}"
         },
         "aloobun/CosmicBun-8B": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}"
         },
         "Undi95/Mixtral-8x7B-MoE-RP-Story": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
         },
         "TIGER-Lab/MAmmoTH2-8B-Plus": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": "<|eot_id|>",
-        "unk_token": None,
-        "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": "<|eot_id|>",
+            "unk_token": None,
+            "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}"
         },
         "codellama/CodeLlama-70b-Instruct-hf": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}"
         },
         "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "[control_768]",
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "[control_768]",
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}"
         },
         "gorilla-llm/gorilla-openfunctions-v2": {
-        "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
-        "eos_token": "<|EOT|>",
-        "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
-        "unk_token": None,
-        "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+            "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>",
+            "eos_token": "<|EOT|>",
+            "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>",
+            "unk_token": None,
+            "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
         },
         "ghost-x/ghost-7b-alpha": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": "</s>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n'  + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute]('  + message['content'] + ')<//>' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": "</s>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n'  + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute]('  + message['content'] + ')<//>' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
         },
         "winninghealth/WiNGPT2-Llama-3-8B-Chat": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|end_of_text|>",
-        "pad_token": "<|end_of_text|>",
-        "unk_token": None,
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a"
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|end_of_text|>",
+            "pad_token": "<|end_of_text|>",
+            "unk_token": None,
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a"
         },
         "BramVanroy/Llama-2-13b-chat-dutch": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
         },
         "THUDM/chatglm3-6b": {
-        "bos_token": None,
-        "eos_token": "</s>",
-        "pad_token": "<unk>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
+            "bos_token": None,
+            "eos_token": "</s>",
+            "pad_token": "<unk>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}"
         },
         "microsoft/Phi-3-mini-4k-instruct": {
-        "bos_token": "<s>",
-        "eos_token": "<|endoftext|>",
-        "pad_token": "<|endoftext|>",
-        "unk_token": "<unk>",
-        "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
+            "bos_token": "<s>",
+            "eos_token": "<|endoftext|>",
+            "pad_token": "<|endoftext|>",
+            "unk_token": "<unk>",
+            "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
         },
         "mistralai/Mistral-7B-Instruct-v0.1": {
-        "bos_token": "<s>",
-        "eos_token": "</s>",
-        "pad_token": None,
-        "unk_token": "<unk>",
-        "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"
+            "bos_token": "<s>",
+            "eos_token": "</s>",
+            "pad_token": None,
+            "unk_token": "<unk>",
+            "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"
         },
         "meta-llama/Meta-Llama-3.1-8B-Instruct": {
-        "bos_token": "<|begin_of_text|>",
-        "eos_token": "<|eot_id|>",
-        "pad_token": None,
-        "unk_token": None,
-        "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n     
       {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+            "bos_token": "<|begin_of_text|>",
+            "eos_token": "<|eot_id|>",
+            "pad_token": None,
+            "unk_token": None,
+            "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n     
       {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
         }
     }
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 5fa22497c..39b6306e7 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -158,7 +158,9 @@ def get_argprser():
     parser.add_argument('--set_torch_thread', default=0, type=num_infer_count_type, help='Set the number of Torch thread. ')
     parser.add_argument('-tl', '--tokens_len', type=int, required=False, help='The length of tokens print each time in streaming mode, chunk streaming.')
     parser.add_argument('--streaming', action='store_true', help='Set whether to use streaming mode, only applicable to LLM.')
-
+    parser.add_argument("--num_steps", type=int, required=False, help="Number of inference steps for image generation")
+    parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.")
+    parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.")
     return parser.parse_args()
 
 
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index f3e7d2177..b3e2f23f0 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -97,6 +97,9 @@ def analyze_args(args):
     model_args['prompt'] = args.prompt
     model_args['prompt_file'] = args.prompt_file
     model_args['infer_count'] = args.infer_count
+    model_args["num_steps"] = args.num_steps
+    model_args["height"] = args.height
+    model_args["width"] = args.width
     model_args['images'] = args.images
     model_args['seed'] = args.seed
     model_args['mem_consumption'] = args.memory_consumption
@@ -137,6 +140,9 @@ def analyze_args(args):
 
     model_framework = args.framework
     model_path = Path(args.model)
+    if model_args["torch_compile_backend"]:
+        log.info("Setting Framework to PyTorch Since torch_compile_backend is provided.")
+        model_framework = 'pt'
     if not model_path.exists():
         raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}')
     if model_framework in ('ov', 'pt'):
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index c3df84925..316c9d0b8 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -421,7 +421,7 @@ def get_vae_decoder_step_count(self):
 
     scheduler_type = data.get("scheduler", ["", ""])[1]
     if (scheduler_type not in ["LCMScheduler", "DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler", "EulerDiscreteScheduler",
-                               "FlowMatchEulerDiscreteScheduler"]):
+                               "FlowMatchEulerDiscreteScheduler", "EulerAncestralDiscreteScheduler"]):
         scheduler = openvino_genai.Scheduler.from_config(model_path / "scheduler/scheduler_config.json", openvino_genai.Scheduler.Type.DDIM)
         log.warning(f'Type of scheduler {scheduler_type} is unsupported. Please, be aware that it will be replaced to DDIMScheduler')
 
diff --git a/tools/llm_bench/llm_bench_utils/pt_utils.py b/tools/llm_bench/llm_bench_utils/pt_utils.py
index 4c41efad0..dc2c6d05f 100644
--- a/tools/llm_bench/llm_bench_utils/pt_utils.py
+++ b/tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -131,6 +131,7 @@ def create_image_gen_model(model_path, device, **kwargs):
             model_class = PT_MODEL_CLASSES_MAPPING[model_type]
             start = time.perf_counter()
             pipe = model_class.from_pretrained(model_path)
+            pipe = set_bf16(pipe, device, **kwargs)
             end = time.perf_counter()
             from_pretrain_time = end - start
         else:
diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py
index 7f43afe6e..125794704 100644
--- a/tools/llm_bench/task/image_generation.py
+++ b/tools/llm_bench/task/image_generation.py
@@ -25,10 +25,10 @@
 stable_diffusion_hook = StableDiffusionHook()
 
 
-def collects_input_args(image_param, model_type, model_name, infer_count=None, callback=None):
+def collects_input_args(image_param, model_type, model_name, infer_count=None, height=None, width=None, callback=None):
     input_args = {}
-    input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH)
-    input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT)
+    input_args["width"] = image_param.get('width', width or DEFAULT_IMAGE_WIDTH)
+    input_args["height"] = image_param.get('height', height or DEFAULT_IMAGE_HEIGHT)
     if infer_count is None:
         input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS)
     else:
@@ -60,7 +60,7 @@ def collects_input_args(image_param, model_type, model_name, infer_count=None, c
 def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"])
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width"))
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
@@ -84,7 +84,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
         for bs_idx, in_text in enumerate(input_text_list):
             llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id)
     start = time.perf_counter()
-    res = pipe(input_text_list, **input_args).images
+    res = pipe(input_text_list, **input_args, num_images_per_prompt=2).images
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
@@ -123,7 +123,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list,
 def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None):
     set_seed(args['seed'])
     input_text = image_param['prompt']
-    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"], callback)
+    input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width"), callback)
     out_str = f"Input params: Batch_size={args['batch_size']}, " \
               f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}"
     if 'guidance_scale' in input_args:
diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py
index f1e7ac54a..15a47a8b6 100644
--- a/tools/llm_bench/task/speech_to_text_generation.py
+++ b/tools/llm_bench/task/speech_to_text_generation.py
@@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
             - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
         tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
-        tm_infer_list = None
+        tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
         result_text = result_text.texts[0]
     else:
         start = time.perf_counter()
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 3f5b5ed30..4822b228c 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -301,7 +301,8 @@ def token_printer():
         - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
     ).tolist()
 
-    tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
+    tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
+    inference_durations = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
     log.debug('latency of all tokens:')
     [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
     iter_data = gen_output_data.gen_iterate_data(
@@ -322,8 +323,8 @@ def token_printer():
     metrics_print.print_metrics(
         num,
         iter_data,
-        tm_list.tolist(),
-        None,
+        tm_list,
+        inference_durations,
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py
index c4144366b..068ae0cf6 100644
--- a/tools/llm_bench/task/visual_language_generation.py
+++ b/tools/llm_bench/task/visual_language_generation.py
@@ -268,11 +268,12 @@ def run_visual_language_generation_genai(
         mm_embeddings_preparation_time=perf_metrics.get_prepare_embeddings_duration().mean
     )
     iter_data_list.append(iter_data)
+    inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
     metrics_print.print_metrics(
         num,
         iter_data,
         tm_list.tolist(),
-        None,
+        inference_durations.tolist(),
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 026a6cc69..04813f5fd 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -1,7 +1,3 @@
-from .utils import patch_diffusers
-
-patch_diffusers()
-
 import argparse
 import difflib
 import numpy as np