From d5921487836103b7e9f32c8577021ac2a4d9d912 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Dec 2024 13:30:44 +0400 Subject: [PATCH 01/14] [Inpainting] Update stable_diffusion_xl_pipeline.hpp (#1427) --- src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index 15f15219c2..c3ebcdf1f4 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -320,7 +320,7 @@ class StableDiffusionXLPipeline : public StableDiffusionPipeline { } else if (m_pipeline_type == PipelineType::INPAINTING) { m_generation_config.guidance_scale = 7.5f; m_generation_config.num_inference_steps = 50; - m_generation_config.strength == 0.9999f; + m_generation_config.strength = 0.9999f; } } else { OPENVINO_THROW("Unsupported class_name '", class_name, "'. Please, contact OpenVINO GenAI developers"); From db28c8c5775fe61a03519355f433f9885460e9e3 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Dec 2024 13:31:24 +0400 Subject: [PATCH 02/14] Fix compile warnings in tokenizer.cpp (#1428) ``` /Users/runner/work/openvino.genai/openvino.genai/src/cpp/src/tokenizer.cpp:238:40: warning: expression result unused [-Wunused-value] encode("non empty string").input_ids; ``` --- src/cpp/src/tokenizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index ed6fbc0a06..5364acfd91 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -235,7 +235,7 @@ class Tokenizer::TokenizerImpl { // Initialize tokenizer's cache to save time later. if (m_tokenizer) { // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. - encode("non empty string").input_ids; + encode("non empty string"); } if (m_detokenizer) { decode({1, 33, 199, 42, 42}); From 0da48cd1fdb3dd9620b0a0f4d494d64d78d3a491 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 24 Dec 2024 21:34:34 +0400 Subject: [PATCH 03/14] Revert "Pin optimum-intel commit" (#1426) Reverts openvinotoolkit/openvino.genai#1420 Fixed here https://github.com/huggingface/optimum-intel/pull/1091 --- .github/workflows/llm_bench-python.yml | 4 ++-- samples/export-requirements.txt | 2 +- tests/python_tests/requirements.txt | 5 +++-- tools/llm_bench/requirements.txt | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 8356805e19..1999bafcfe 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -151,7 +151,7 @@ jobs: rm -rf ./ov_models/internvl2-1B - name: WWB Tests run: | - pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a + pip install git+https://github.com/huggingface/optimum-intel.git GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} python -m pytest -v ${{ env.WWB_PATH }}/tests stateful: @@ -190,7 +190,7 @@ jobs: - name: WWB Tests run: | pip install pytest - pip install git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a + pip install git+https://github.com/huggingface/optimum-intel.git GIT_CLONE_PROTECTION_ACTIVE=false PIP_PRE=1 PIP_EXTRA_INDEX_URL=https://storage.openvinotoolkit.org/simple/wheels/nightly pip install ${{ env.WWB_PATH }} python -m pytest -v ${{ env.WWB_PATH }}/tests diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index d75fdbacee..797b680b9a 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index bc5324b211..00bffb6646 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a -numpy<2.0.0; sys_platform == 'darwin' +diffusers==0.31.0 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt index acbc668c52..f5f4a3fdeb 100644 --- a/tools/llm_bench/requirements.txt +++ b/tools/llm_bench/requirements.txt @@ -10,7 +10,7 @@ torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@420fa87d039425a906b7f755e4562b65947f016a#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil From 021d88059d1367ef5ccc7938183de3dcdaafe82f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 24 Dec 2024 19:43:58 +0100 Subject: [PATCH 04/14] Dynamic KV cache allocation (#1364) Dynamic KV cache allocation Ticket: CVS-158409 --------- Co-authored-by: Ilya Lavrenov --- .../prompt_lookup_decoding_lm.cpp | 6 +- .../speculative_decoding_lm.cpp | 6 +- .../prompt_lookup_decoding_lm.py | 5 +- .../speculative_decoding_lm.py | 6 +- src/cpp/src/block_manager.hpp | 51 ++++++- src/cpp/src/cache_manager.hpp | 124 +++++++++++++++--- src/cpp/src/continuous_batching_impl.cpp | 10 +- src/cpp/src/device_config.hpp | 36 ++--- src/cpp/src/llm_pipeline.cpp | 13 +- src/cpp/src/scheduler.hpp | 120 ++++++++++++++++- .../speculative_decoding_impl.cpp | 3 +- .../utils/paged_attention_transformations.cpp | 10 +- tests/cpp/cache_manager.cpp | 114 ++++++++++++++-- tests/cpp/scheduler.cpp | 59 ++++++--- tests/python_tests/common.py | 1 - tests/python_tests/ov_genai_test_utils.py | 1 - .../python_tests/test_cache_optimizations.py | 27 +++- 17 files changed, 480 insertions(+), 112 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index e692110027..8b48dbade0 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -22,14 +22,10 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; - ov::genai::SchedulerConfig scheduler_config; - scheduler_config.cache_size = 5; - ov::genai::LLMPipeline pipe( model_path, device, - ov::genai::prompt_lookup(true), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::prompt_lookup(true)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 487296566b..e10228863f 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -26,14 +26,10 @@ int main(int argc, char* argv[]) try { // Please, set device for main model in `LLMPipeline` constructor and in in `ov::genai::draft_model` for draft. std::string main_device = "CPU", draft_device = "CPU"; - ov::genai::SchedulerConfig scheduler_config; - scheduler_config.cache_size = 5; - ov::genai::LLMPipeline pipe( main_model_path, main_device, - ov::genai::draft_model(draft_model_path, draft_device), - ov::genai::scheduler_config(scheduler_config)); + ov::genai::draft_model(draft_model_path, draft_device)); auto streamer = [](std::string subword) { std::cout << subword << std::flush; diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py index 557897b6b1..726391ba9b 100755 --- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py +++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py @@ -18,11 +18,8 @@ def main(): args = parser.parse_args() device = 'CPU' - scheduler_config = openvino_genai.SchedulerConfig() - # cache params - scheduler_config.cache_size = 2 - pipe = openvino_genai.LLMPipeline(args.model_dir, device, scheduler_config=scheduler_config, prompt_lookup=True) + pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 612e59474e..217b8a2730 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -25,13 +25,9 @@ def main(): main_device = 'CPU' # GPU can be used as well draft_device = 'CPU' - scheduler_config = openvino_genai.SchedulerConfig() - # cache params - scheduler_config.cache_size = 2 - draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) - pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, scheduler_config=scheduler_config, draft_model=draft_model) + pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index dc82897dc8..4ca263777b 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -205,14 +205,20 @@ class BlockAllocator { * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache. */ BlockAllocator(size_t num_blocks, bool enable_prefix_caching, size_t num_layers = 1) : - m_free_blocks_num(num_layers, num_blocks), m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { + m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); m_free_blocks.resize(m_num_layers); - for (auto& per_layer_block_list : m_free_blocks) { - for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { - per_layer_block_list.push_back(std::make_shared(block_id)); + if (num_blocks > 0) { + m_free_blocks_num = std::vector(num_layers, num_blocks); + for (auto& per_layer_block_list : m_free_blocks) { + for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { + per_layer_block_list.push_back(std::make_shared(block_id)); + } } } + else { + m_free_blocks_num = std::vector(m_num_layers, 0); + } } ~BlockAllocator() { @@ -220,6 +226,21 @@ class BlockAllocator { // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size()); } + void increase_kv_blocks_number(size_t new_kv_blocks_count) { + OPENVINO_ASSERT(new_kv_blocks_count > m_total_num_blocks, "New blocks number should be more than previous blocks number."); + size_t added_blocks = new_kv_blocks_count - m_total_num_blocks; + for (auto idx = 0; idx < m_free_blocks_num.size(); idx++) { + m_free_blocks_num[idx] += added_blocks; + } + for (auto& per_layer_block_list : m_free_blocks) { + for (int block_id = m_total_num_blocks; block_id < new_kv_blocks_count; ++block_id) { + per_layer_block_list.push_back(std::make_shared(block_id)); + } + } + m_total_num_blocks = new_kv_blocks_count; + } + + /** * Returns the number of free blocks for a given layer. * @param layer_idx Index of the layer. @@ -459,6 +480,13 @@ class BlockAllocator { for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) sum += num_free_blocks(layer_idx); return static_cast(m_num_layers * m_total_num_blocks - sum) / (m_num_layers * m_total_num_blocks) * 100; } + + /** + * @return The total number of KV blocks . + */ + size_t get_total_number_of_kv_blocks() const { + return m_total_num_blocks; + } }; /** @@ -713,6 +741,21 @@ class BlockManager { return m_allocator.get_used_percentage(); } + /** + * Increases the number of KV blocks. + * @param num_blocks The new number of KV-blocks. + */ + void increase_kv_blocks_number(size_t num_blocks) { + m_allocator.increase_kv_blocks_number(num_blocks); + } + + /** + * @return The total number of KV blocks . + */ + size_t get_total_number_of_kv_blocks() const { + return m_allocator.get_total_number_of_kv_blocks(); + } + /** * @brief Forks a sequence, establishing a new sequence from an existing one, reusing * currently allocated blocks of the existing sequence. diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index a7444555ab..0c04823f4f 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -15,38 +15,118 @@ class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; std::vector m_value_cache; + size_t m_num_allocated_kv_blocks = 0; ov::Core m_core; + ov::InferRequest m_request; + + ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) { + ov::PartialShape res_shape = shape; + res_shape[0] = dim; + OPENVINO_ASSERT(res_shape.is_static()); + return res_shape.to_shape(); + } + + void update_request_tensor(size_t decoder_layer_id) { + m_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_key_cache[decoder_layer_id]); + m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); + } public: - explicit CacheManager(const DeviceConfig &device_config, ov::Core core) : + explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) : m_device_config(device_config), + m_request(request), m_core(core) { m_key_cache.reserve(m_device_config.get_num_layers()); m_value_cache.reserve(m_device_config.get_num_layers()); + } + + void allocate_cache_if_needed(size_t num_kv_blocks) { + if (m_num_allocated_kv_blocks >= num_kv_blocks) { + return; + } + OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size()); + m_num_allocated_kv_blocks = num_kv_blocks; + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); + + const std::string device_name = m_device_config.get_device(); + + ov::Coordinate start_key{0,0,0,0}; + ov::Coordinate start_value{0,0,0,0}; - const std::string device_name = device_config.get_device(); if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); - ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); + ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); + ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); + + auto key_cache_roi_end = static_cast(key_cache.data()); + auto value_cache_roi_end = static_cast(value_cache.data()); + size_t key_roi_size_byte = 0; + size_t value_roi_size_byte = 0; + + if (m_key_cache.size() > decoder_layer_id) { + ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); + ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); + + key_roi_size_byte = m_key_cache[decoder_layer_id].get_byte_size(); + value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size(); + key_cache_roi_end = static_cast(key_cache.data()) + key_roi_size_byte; + value_cache_roi_end = static_cast(value_cache.data()) + value_roi_size_byte; + + // copy current cache data + ov::Tensor dst_key_roi(key_cache, start_key, end_key); + ov::Tensor dst_value_roi(value_cache, start_value, end_value); + + m_key_cache[decoder_layer_id].copy_to(dst_key_roi); + m_value_cache[decoder_layer_id].copy_to(dst_value_roi); + + } - // force allocation - std::memset(key_cache.data(), 0, key_cache.get_byte_size()); - std::memset(value_cache.data(), 0, value_cache.get_byte_size()); + // Some optimizations like AVX2, AVX512, AMX require a minimal shape and + // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, + // so NAN * 0 returns non-zero invalid data. + // So we need to set zeros to all newly allocated tensors data. + std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte); + std::memset(value_cache_roi_end, 0, value_cache.get_byte_size() - value_roi_size_byte); + + // set new cache tensors + if (m_key_cache.size() > decoder_layer_id) { + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + else { + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } - m_key_cache.emplace_back(key_cache); - m_value_cache.emplace_back(value_cache); + update_request_tensor(decoder_layer_id); } } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Tensor key_cache = remote_context.create_tensor(device_config.get_cache_precision(), - device_config.get_key_cache_shape()); - ov::Tensor value_cache = remote_context.create_tensor(device_config.get_cache_precision(), - device_config.get_value_cache_shape()); - - m_key_cache.emplace_back(key_cache); - m_value_cache.emplace_back(value_cache); + ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + key_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), + value_cache_shape); + + if (m_key_cache.size() > decoder_layer_id) { + ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); + ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); + + // copy current cache data + ov::RemoteTensor dst_key_roi(key_cache, start_key, end_key); + ov::RemoteTensor dst_value_roi(value_cache, start_value, end_value); + dst_key_roi.copy_from(m_key_cache[decoder_layer_id]); + dst_value_roi.copy_from(m_value_cache[decoder_layer_id]); + + m_key_cache[decoder_layer_id] = key_cache; + m_value_cache[decoder_layer_id] = value_cache; + } + else { + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } + update_request_tensor(decoder_layer_id); } } } @@ -62,8 +142,8 @@ class CacheManager { } void copy_blocks(const std::map>& block_copy_map) { - ov::Shape key_shape = m_device_config.get_key_cache_shape(); - ov::Shape value_shape = m_device_config.get_value_cache_shape(); + ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks); + ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks); ov::Coordinate key_src_start_roi(key_shape.size(), 0); ov::Coordinate key_src_end_roi = key_shape; @@ -98,5 +178,13 @@ class CacheManager { } } } + + std::shared_ptr get_core() { + return std::make_shared(m_core); + } + + std::shared_ptr get_device_config() { + return std::make_shared(m_device_config); + } }; } diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index e1ffd062de..52ec6a8302 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -53,11 +53,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( ov::InferRequest infer_request = compiled_model.create_infer_request(); // setup KV caches - m_cache_manager = std::make_shared(device_config, core); - for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) { - infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id)); - infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id)); - } + m_cache_manager = std::make_shared(device_config, infer_request, core); SchedulerConfig updated_config = scheduler_config; // update KV blocks number in scheduler config @@ -71,8 +67,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( // as it may lead to performance slowdown can_use_partial_preemption = false; } - - m_scheduler = std::make_shared(device_config.get_block_size(), updated_config, device_config.get_num_layers(), can_use_partial_preemption); + m_scheduler = std::make_shared(device_config.get_block_size(), m_cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption); // and finally create model runner bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; m_model_runner = std::make_shared(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction); @@ -133,7 +128,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { _pull_awaiting_requests(); m_pipeline_metrics.requests = m_requests.size(); - Scheduler::Output scheduler_output; { static ManualTimer timer("scheduling"); diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 2af4559ef1..371142701c 100644 --- a/src/cpp/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -12,7 +12,7 @@ namespace ov::genai { class DeviceConfig { ov::element::Type m_kv_cache_type; - ov::Shape m_key_cache_shape, m_value_cache_shape; + ov::PartialShape m_key_cache_shape, m_value_cache_shape; ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; size_t m_num_kv_blocks = 0; size_t m_block_size = 0; @@ -80,11 +80,10 @@ class DeviceConfig { OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); } - OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); if (scheduling_config.num_kv_blocks > 0) { m_num_kv_blocks = scheduling_config.num_kv_blocks; } - else { + else if (scheduling_config.cache_size > 0) { m_cache_size = scheduling_config.cache_size; } } @@ -104,23 +103,22 @@ class DeviceConfig { m_head_size += 8; } - if (m_num_kv_blocks == 0) { - OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); + if (m_num_kv_blocks == 0 && m_cache_size > 0) { size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); } - m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks, - m_num_kv_heads, - m_block_size, - m_head_size}; + m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}; if (m_device.find("GPU") != std::string::npos) { // Update key shape, as the key's shape is different from the value's shape - m_key_cache_shape = ov::Shape{m_num_kv_blocks, - m_num_kv_heads, - m_head_size, - m_block_size}; + m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads), + ov::Dimension(m_head_size), + ov::Dimension(m_block_size)}; } } @@ -136,13 +134,13 @@ class DeviceConfig { return m_num_decoder_layers; } - ov::Shape get_key_cache_shape() const { - OPENVINO_ASSERT(!m_key_cache_shape.empty()); + ov::PartialShape get_key_cache_shape() const { + OPENVINO_ASSERT(m_key_cache_shape.size()); return m_key_cache_shape; } - ov::Shape get_value_cache_shape() const { - OPENVINO_ASSERT(!m_value_cache_shape.empty()); + ov::PartialShape get_value_cache_shape() const { + OPENVINO_ASSERT(m_value_cache_shape.size()); return m_value_cache_shape; } @@ -153,5 +151,9 @@ class DeviceConfig { size_t get_block_size() const { return m_block_size; } + + size_t get_block_size_in_bytes() const { + return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size(); + } }; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 33180a9199..be5ecf17fa 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -718,7 +718,9 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::AnyMap& properties ){ auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) { + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -737,7 +739,9 @@ ov::genai::LLMPipeline::LLMPipeline( ){ auto start_time = std::chrono::steady_clock::now(); - if (config.find(ov::genai::scheduler_config.name()) != config.end()) { + if (config.find(ov::genai::scheduler_config.name()) != config.end() || + config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end() || + config.find(ov::genai::prompt_lookup.name()) != config.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(config); m_pimpl = std::make_unique(models_path, scheduler_config, device, plugin_config); } else if (device == "NPU") { @@ -760,7 +764,10 @@ ov::genai::LLMPipeline::LLMPipeline( auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config); auto start_time = std::chrono::steady_clock::now(); - if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { + if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end() || + plugin_config.find(utils::DRAFT_MODEL_ARG_NAME) != plugin_config.end() || + plugin_config.find(ov::genai::prompt_lookup.name()) != plugin_config.end()){ + auto [plugin_config_, scheduler_config] = utils::split_scheduler_config(plugin_config); m_pimpl = std::make_unique(model_str, weights_tensor, tokenizer, scheduler_config, device, plugin_config_, generation_config); diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 6de4adaa47..da65c68bec 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -7,10 +7,12 @@ #include #include +#include "openvino/runtime/intel_gpu/properties.hpp" #include "openvino/genai/scheduler_config.hpp" #include "device_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" +#include "cache_manager.hpp" namespace ov::genai { class Scheduler { @@ -20,6 +22,13 @@ class Scheduler { BlockManager m_block_manager; friend class CacheStateDumper; + bool m_dynamic_memory_allocation = false; + + // Dynamic KV-cache allocation params + size_t m_kv_blocks_initial_multiplier = 2; + const float m_cache_growth_factor = 2; // commmon values 1.5 or 2 + + std::shared_ptr m_cache_manager; public: struct Output { // IDs of scheduled groups @@ -36,15 +45,20 @@ class Scheduler { float m_cache_usage = 0.0; }; - explicit Scheduler(size_t block_size, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + explicit Scheduler(size_t block_size, std::shared_ptr cache_manager, const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + m_cache_manager(cache_manager), m_can_use_partial_preemption(can_use_partial_preemption), m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, block_size, num_layers) { + OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); } Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (m_block_manager.get_total_number_of_kv_blocks() == 0) { + _initialize_cache(sequence_groups); + } if (m_config.dynamic_split_fuse) { // deepspeed-mii case @@ -64,9 +78,9 @@ class Scheduler { } } + m_cache_manager->allocate_cache_if_needed(m_block_manager.get_total_number_of_kv_blocks()); _clear_waiting_sequences(sequence_groups); scheduler_output.m_cache_usage = m_block_manager.get_used_percentage(); - return scheduler_output; } @@ -236,8 +250,13 @@ class Scheduler { OPENVINO_ASSERT(currently_allocated_token_slots >= occupied_token_slots, "internal error"); size_t available_slots = currently_allocated_token_slots - occupied_token_slots, required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0; - size_t num_required_blocks = (required_slots + block_size - 1) / block_size, num_free_blocks = m_block_manager.num_free_blocks(); - size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks); + size_t num_required_blocks = (required_slots + block_size - 1) / block_size; + while (num_required_blocks > m_block_manager.num_free_blocks()) { + if (!_try_increase_cache()) { + break; + } + } + size_t num_scheduled_blocks = std::min(num_required_blocks, m_block_manager.num_free_blocks()); // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks // and total "scheduled capacity" num_scheduled_tokens = std::min(num_scheduled_tokens, available_slots + num_scheduled_blocks * block_size); @@ -289,10 +308,16 @@ class Scheduler { size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq); sequence_group->schedule_tokens(num_scheduled_tokens_per_seq); + while (!m_block_manager.can_append_slots(sequence_group)){ + if (!_try_increase_cache()) { + break; + } + } + _apply_preemption(sequence_group_id, sequence_groups); // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence - if (!m_block_manager.can_append_slots(sequence_group)){ + if (!m_block_manager.can_append_slots(sequence_group)) { sequence_group->clear_scheduled_tokens(); continue; } @@ -370,6 +395,11 @@ class Scheduler { // apply KV cache limitations size_t block_size = get_block_size(); const size_t num_required_blocks = (sequence_len + block_size - 1) / block_size; + while (!m_block_manager.can_allocate_blocks(num_required_blocks)){ + if (!_try_increase_cache()) { + break; + } + } if (!m_block_manager.can_allocate_blocks(num_required_blocks)) break; @@ -405,6 +435,86 @@ class Scheduler { sequence_groups[sequence_group_id]->clear_waiting_sequences(); } } + + size_t _get_available_gpu_memory() { + auto device_config = m_cache_manager->get_device_config(); + auto core = m_cache_manager->get_core(); + auto device = device_config->get_device(); + OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); + auto memory_statistics = core->get_property(device, ov::intel_gpu::memory_statistics); + auto device_type = core->get_property(device, ov::device::type); + + // sum up all used device memory + std::vector device_memory_types = {"cl_mem", "usm_device"}; + size_t used_device_mem = 0; + for (auto mem_type: device_memory_types) { + used_device_mem += memory_statistics[mem_type]; + } + + if (device_type == ov::device::Type::INTEGRATED) { + used_device_mem += memory_statistics["usm_host"]; + } + + // there could be unaccounted extra memory reserved by kernels, kept + // in memory pools, etc + // therefore, add a threshold to account for this + float used_memory_threshold = 1.1; + used_device_mem *= used_memory_threshold; + + // total device memory in bytes + auto total_device_memory = core->get_property(device, ov::intel_gpu::device_total_mem_size); + + return total_device_memory - used_device_mem; + } + + void _initialize_cache(const std::vector& sequence_groups) { + size_t blocks_sum = 0; + for (auto idx = 0; idx < sequence_groups.size(); idx++) { + auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; + auto gen_config = sequence_groups[idx]->get_sampling_parameters(); + seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); + size_t blocks_num = std::ceil((float)seq_length / m_block_manager.get_block_size()); + if (gen_config.is_beam_search()) { + blocks_num *= gen_config.num_beams; + } else if (gen_config.is_multinomial()) { + blocks_num *= gen_config.num_return_sequences; + } + blocks_sum += blocks_num; + } + m_block_manager.increase_kv_blocks_number(blocks_sum); + m_dynamic_memory_allocation = true; + } + + bool _try_increase_cache() { + if (!m_dynamic_memory_allocation) { + return false; + } + auto device_config = m_cache_manager->get_device_config(); + auto device = device_config->get_device(); + size_t current_num_of_kv_blocks = m_block_manager.get_total_number_of_kv_blocks(); + size_t new_blocks_num = current_num_of_kv_blocks * m_cache_growth_factor; + + if (device.find("GPU") == std::string::npos) { + m_block_manager.increase_kv_blocks_number(new_blocks_num); + } + else { + size_t available_gpu_memory = _get_available_gpu_memory(); + size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * device_config->get_block_size_in_bytes(); + if (required_memory <= available_gpu_memory) { + m_block_manager.increase_kv_blocks_number(new_blocks_num); + } else { + size_t possible_blocks_to_add = available_gpu_memory / device_config->get_block_size_in_bytes(); + if (possible_blocks_to_add > 0) { + m_block_manager.increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add); + } + else { + return false; + } + } + } + return true; + } + }; } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 46b7b106a6..257c20bf01 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -52,8 +52,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con size_t main_cache_size = std::ceil(main_scheduler_config.cache_size * (1.f - k)), draft_cache_size = main_scheduler_config.cache_size - main_cache_size; - OPENVINO_ASSERT(main_cache_size > 0, "KV cache model cache size should be > 0"); - if (draft_cache_size == 0) { + if (draft_cache_size == 0 && main_cache_size > 0) { main_cache_size -= (main_cache_size > 1 ? 1 : 0); draft_cache_size = 1; } diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index 16c9556151..4dedcf989a 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -10,11 +10,6 @@ namespace ov { namespace genai { namespace utils { -inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) { - ov::PartialShape partial_shape = static_shape; - partial_shape[0] = ov::Dimension::dynamic(); - return partial_shape; -} size_t get_hidden_size(const std::shared_ptr model) { const auto& parameters = model->get_parameters(); @@ -65,9 +60,8 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) { it_k->second->set_element_type(device_config.get_cache_precision()); it_v->second->set_element_type(device_config.get_cache_precision()); - // TODO: CVS-145270 - it_k->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape())); - it_v->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape())); + it_k->second->set_partial_shape(device_config.get_key_cache_shape()); + it_v->second->set_partial_shape(device_config.get_value_cache_shape()); } model->validate_nodes_and_infer_types(); diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index b2a5396d5f..7f07980389 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -7,8 +7,43 @@ #include "scheduler.hpp" #include "device_config.hpp" #include "cache_manager.hpp" +#include "openvino/op/concat.hpp" -TEST(TestCacheManager, general_test) { +using namespace ov::genai; + +std::shared_ptr get_dummy_model(size_t num_layers) { + ov::NodeVector keys; + ov::NodeVector values; + ov::ParameterVector params; + auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(ov::element::f16, shape); + auto value = std::make_shared(ov::element::f16, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} + +size_t get_total_allocated_bytes(std::shared_ptr cache_manager, size_t num_decoder_layers) { + size_t allocated_bytes = 0; + for (size_t i = 0; i < num_decoder_layers; i++) { + auto key_cache = cache_manager->get_key_cache(i); + auto value_cache = cache_manager->get_value_cache(i); + allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + } + return allocated_bytes; +} + + +TEST(TestCacheManager, test_cache_size_param) { ov::Core core; ov::genai::SchedulerConfig scheduler_config; scheduler_config.max_num_batched_tokens = 32; @@ -21,14 +56,73 @@ TEST(TestCacheManager, general_test) { size_t num_decoder_layers = 12; device_config.set_model_params(12, 64, num_decoder_layers); - auto cache_manager = std::make_shared(device_config, core); - - size_t allocated_bytes = 0; - for (size_t i = 0; i < num_decoder_layers; i++) { - auto key_cache = cache_manager->get_key_cache(i); - auto value_cache = cache_manager->get_value_cache(i); - allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); - } + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - ASSERT_EQ(allocated_bytes, 2146959360); + ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); } + + +TEST(TestCacheManager, test_kv_blocks_param) { + ov::Core core; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 150; + scheduler_config.cache_size = 0; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + device_config.set_model_params(12, 64, num_decoder_layers); + + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks); +} + + +TEST(TestCacheManager, test_dynamic_cache_increase) { + ov::Core core; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 0; + scheduler_config.cache_size = 0; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + size_t head_size = 64; + size_t num_kv_heads = 12; + device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers); + size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + + + ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); + auto cache_manager = std::make_shared(device_config, request, core); + auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + + // check initial cache allocation + block_manager.increase_kv_blocks_number(100); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100); + + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes); + + + // check cache increase + block_manager.increase_kv_blocks_number(200); + OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200); + + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + + + // check that cache does not increase if new blocks were not allocated + cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); + OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); +} \ No newline at end of file diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 40c3e73747..ea1720faa2 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -4,6 +4,7 @@ #include #include "openvino/runtime/core.hpp" +#include "openvino/op/concat.hpp" #include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "sequence_group.hpp" @@ -17,6 +18,37 @@ void clear_finished_sequences(std::vector& requests) { }); requests.erase(new_end, requests.end()); } +std::shared_ptr get_model(size_t num_layers) { + ov::NodeVector keys; + ov::NodeVector values; + ov::ParameterVector params; + auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(ov::element::f16, shape); + auto value = std::make_shared(ov::element::f16, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} + +std::shared_ptr init_cache_manager(SchedulerConfig scheduler_config) { + ov::Core core = ov::Core(); + size_t num_decoder_layers = 12; + ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); + size_t head_size = 64, head_size_u8 = head_size + 8; + size_t num_kv_heads = 12; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); + return std::make_shared(device_config, request, core); +} TEST(TestScheduler, general_test) { std::array configs = {SchedulerConfig(), SchedulerConfig()}; @@ -40,10 +72,9 @@ TEST(TestScheduler, general_test) { ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching); auto idx2 = (*sequence_group3)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; - // schedule 3 sequence groups that use 6 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1, 2}; @@ -144,7 +175,7 @@ TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); std::vector ref_ids = {0, 1}; @@ -212,7 +243,7 @@ TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) { // schedule 2 sequence groups that use 5 kv blocks - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out0 = scheduler.schedule(requests); for (auto seq: requests) { @@ -297,7 +328,7 @@ TEST(TestScheduler, test_partial_preemption_beam_search) { sequence_group->set_sequence_group_ptr(sequence_group); std::vector requests = {sequence_group}; - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out = scheduler.schedule(requests); for (auto sequence: sequence_group->get_not_finished_sequences()) { sequence->append_token(token, 0.7); @@ -405,11 +436,10 @@ TEST(TestScheduler, test_partially_preempted_prompt) { SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), ov::genai::greedy(), 4, scheduler_config.enable_prefix_caching); auto idx1 = (*sequence_group2)[0]->get_id(); - std::vector requests = {sequence_group1, sequence_group2}; - + std::vector requests = {sequence_group1, sequence_group2}; // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); auto out1 = scheduler.schedule(requests); for (auto seq: requests) { @@ -503,7 +533,7 @@ TEST(TestScheduler, prefix_caching_test) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -566,7 +596,7 @@ TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; std::vector histrory_tokens = {}; // schedule prompt - Scheduler scheduler = Scheduler(4, scheduler_config); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 10; @@ -640,7 +670,7 @@ TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { for (auto scheduler_config: configs) { std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; // schedule prompt - Scheduler scheduler = Scheduler(32, scheduler_config); + Scheduler scheduler = Scheduler(32, init_cache_manager(scheduler_config), scheduler_config); size_t chat_iterations = 2; @@ -701,7 +731,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); auto out1 = scheduler.schedule(requests); for (auto req : requests) @@ -775,7 +805,7 @@ TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. const bool can_use_partial_preemption = false; - Scheduler scheduler = Scheduler(4, scheduler_config, 1, can_use_partial_preemption); + Scheduler scheduler = Scheduler(4, init_cache_manager(scheduler_config), scheduler_config, 1, can_use_partial_preemption); scheduler.schedule(requests); for (auto req: requests) req->finish_iteration(); @@ -874,7 +904,6 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { scheduler_config.use_cache_eviction = true; scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(2, 2, 6, ov::genai::AggregationMode::NORM_SUM); - std::vector tokens1 = {0, 1}; // 1 full block SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens1.size()}, @@ -890,7 +919,7 @@ TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { std::vector requests = {sequence_group1, sequence_group2}; - Scheduler scheduler = Scheduler(2, scheduler_config); + Scheduler scheduler = Scheduler(2, init_cache_manager(scheduler_config), scheduler_config); // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 auto out = scheduler.schedule(requests); diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 163a00192e..cf5fbb3403 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -266,7 +266,6 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: scheduler_config = SchedulerConfig() - scheduler_config.cache_size = 1 if scheduler_params is None: scheduler_config.dynamic_split_fuse = True # vLLM specific diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index b633497d32..5f2702a774 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -283,5 +283,4 @@ def load_pipe(configs: List[Tuple], temp_path): @functools.lru_cache(1) def get_continuous_batching(path): scheduler_config = ov_genai.SchedulerConfig() - scheduler_config.cache_size = 1 return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config}) diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index 45704f9dc6..3c09d34756 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT +from common import TESTS_ROOT, run_test_pipeline def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -145,3 +145,28 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t del model_cb_noopt +def get_greedy_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 300 + return generation_config + +def get_beam_search_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 300 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +scheduler_params_list = [ + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())] +@pytest.mark.parametrize("params", scheduler_params_list) +@pytest.mark.precommit +def test_dynamic_memory_allocation(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + From c83f8160896994d5c2a917d7dbc7465c368d1c8e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Dec 2024 06:08:42 +0400 Subject: [PATCH 05/14] [GHA] Updated OpenVINO nightly (#1433) To catch up https://github.com/openvinotoolkit/openvino/pull/28067 --- .github/workflows/causal_lm_cpp.yml | 8 ++++---- .github/workflows/job_vlm_sample_llava.yml | 2 +- .github/workflows/lcm_dreamshaper_cpp.yml | 4 ++-- src/cpp/src/tokenizer.cpp | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 2e9d72e263..4aad3d4bc3 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -16,10 +16,10 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241205_x86_64.tgz - l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241205_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu20_2025.0.0.dev20241224_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/m_openvino_toolkit_macos_12_6_2025.0.0.dev20241224_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores diff --git a/.github/workflows/job_vlm_sample_llava.yml b/.github/workflows/job_vlm_sample_llava.yml index 166284bd4b..5f4634616a 100644 --- a/.github/workflows/job_vlm_sample_llava.yml +++ b/.github/workflows/job_vlm_sample_llava.yml @@ -11,7 +11,7 @@ on: type: string env: - l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz jobs: visual_language_chat_sample-ubuntu-llava: diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 258184e9e4..c525b0be68 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -18,8 +18,8 @@ concurrency: env: PYTHON_VERSION: '3.9' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241205_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17539-6abe2e39391/w_openvino_toolkit_windows_2025.0.0.dev20241205_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241224_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17709-688f0428cfc/w_openvino_toolkit_windows_2025.0.0.dev20241224_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov jobs: diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 5364acfd91..b098f96fe6 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -394,8 +394,8 @@ class Tokenizer::TokenizerImpl { infer_request_guard.get().start_async(); infer_request_guard.get().wait(); return get_copied_results( - infer_request_guard.get().get_tensor("input_ids"), - infer_request_guard.get().get_tensor("attention_mask") + infer_request_guard.get().get_output_tensor(0), + infer_request_guard.get().get_output_tensor(1) ); } @@ -412,8 +412,8 @@ class Tokenizer::TokenizerImpl { infer_request_guard.get().wait(); unpadded = get_copied_results( - infer_request_guard.get().get_tensor("input_ids"), - infer_request_guard.get().get_tensor("attention_mask") + infer_request_guard.get().get_output_tensor(0), + infer_request_guard.get().get_output_tensor(1) ); } return pad_left(unpadded.input_ids, unpadded.attention_mask); From fabb5b312f92c3cf3bfae86f80c6a3bfbba95d78 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 25 Dec 2024 06:15:31 +0400 Subject: [PATCH 06/14] temporary use num_steps instead of infer_count for image generation (#1432) workaround for CVS-159838 proper fix required on validation pipeline side --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/llm_bench-python.yml | 6 +++--- tools/llm_bench/benchmark.py | 4 +++- tools/llm_bench/llm_bench_utils/model_utils.py | 3 +++ tools/llm_bench/task/image_generation.py | 12 ++++++------ 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 1999bafcfe..56145c080c 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -114,14 +114,14 @@ jobs: - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux Optimum Intel run: | huggingface-cli download OpenVINO/LCM_Dreamshaper_v7-int8-ov --local-dir ov_models/lcm_dreamshaper_v7 - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum -ic 4 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --optimum --num_steps 4 - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI run: | - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 -ic 4 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --num_steps 4 - name: Test OpenVINO/LCM_Dreamshaper_v7-int8-ov on Linux with GenAI and LoRA run: | wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591 - python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 -ic 4 + python ./tools/llm_bench/benchmark.py -m ./ov_models/lcm_dreamshaper_v7/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7 --num_steps 4 rm -rf ./ov_models/lcm_dreamshaper_v7/ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Deconding mode on Linux run: | diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index 5fa22497c1..39b6306e7f 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -158,7 +158,9 @@ def get_argprser(): parser.add_argument('--set_torch_thread', default=0, type=num_infer_count_type, help='Set the number of Torch thread. ') parser.add_argument('-tl', '--tokens_len', type=int, required=False, help='The length of tokens print each time in streaming mode, chunk streaming.') parser.add_argument('--streaming', action='store_true', help='Set whether to use streaming mode, only applicable to LLM.') - + parser.add_argument("--num_steps", type=int, required=False, help="Number of inference steps for image generation") + parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.") + parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.") return parser.parse_args() diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py index 78f72147c7..b3e2f23f0b 100644 --- a/tools/llm_bench/llm_bench_utils/model_utils.py +++ b/tools/llm_bench/llm_bench_utils/model_utils.py @@ -97,6 +97,9 @@ def analyze_args(args): model_args['prompt'] = args.prompt model_args['prompt_file'] = args.prompt_file model_args['infer_count'] = args.infer_count + model_args["num_steps"] = args.num_steps + model_args["height"] = args.height + model_args["width"] = args.width model_args['images'] = args.images model_args['seed'] = args.seed model_args['mem_consumption'] = args.memory_consumption diff --git a/tools/llm_bench/task/image_generation.py b/tools/llm_bench/task/image_generation.py index 7f43afe6e2..125794704d 100644 --- a/tools/llm_bench/task/image_generation.py +++ b/tools/llm_bench/task/image_generation.py @@ -25,10 +25,10 @@ stable_diffusion_hook = StableDiffusionHook() -def collects_input_args(image_param, model_type, model_name, infer_count=None, callback=None): +def collects_input_args(image_param, model_type, model_name, infer_count=None, height=None, width=None, callback=None): input_args = {} - input_args["width"] = image_param.get('width', DEFAULT_IMAGE_WIDTH) - input_args["height"] = image_param.get('height', DEFAULT_IMAGE_HEIGHT) + input_args["width"] = image_param.get('width', width or DEFAULT_IMAGE_WIDTH) + input_args["height"] = image_param.get('height', height or DEFAULT_IMAGE_HEIGHT) if infer_count is None: input_args["num_inference_steps"] = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in model_name else LCM_DEFAULT_INFERENCE_STEPS) else: @@ -60,7 +60,7 @@ def collects_input_args(image_param, model_type, model_name, infer_count=None, c def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] - input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"]) + input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width")) out_str = f"Input params: Batch_size={args['batch_size']}, " \ f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}" if 'guidance_scale' in input_args: @@ -84,7 +84,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, for bs_idx, in_text in enumerate(input_text_list): llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id) start = time.perf_counter() - res = pipe(input_text_list, **input_args).images + res = pipe(input_text_list, **input_args, num_images_per_prompt=2).images end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() @@ -123,7 +123,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, def run_image_generation_genai(image_param, num, image_id, pipe, args, iter_data_list, proc_id, mem_consumption, callback=None): set_seed(args['seed']) input_text = image_param['prompt'] - input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["infer_count"], callback) + input_args = collects_input_args(image_param, args['model_type'], args['model_name'], args["num_steps"], args.get("height"), args.get("width"), callback) out_str = f"Input params: Batch_size={args['batch_size']}, " \ f"steps={input_args['num_inference_steps']}, width={input_args['width']}, height={input_args['height']}" if 'guidance_scale' in input_args: From ca4460a71c95982177f5e119f74ac6e2ee33830e Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Dec 2024 14:02:42 +0400 Subject: [PATCH 07/14] [GHA] Use latest OV on macos and windows (#1434) --- .github/workflows/mac.yml | 2 +- .github/workflows/windows.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 7a4ee31beb..5cc8772ac5 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -17,7 +17,7 @@ concurrency: env: PYTHON_VERSION: '3.9' - OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753 + OV_BRANCH: master OV_TARBALL: '' jobs: diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 649d678c02..7e1aacc715 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -17,7 +17,7 @@ concurrency: env: PYTHON_VERSION: '3.11' - OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753 + OV_BRANCH: master OV_TARBALL: '' jobs: From 0789c7b8273343908fb717824d52a74e73efd668 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Dec 2024 15:20:14 +0400 Subject: [PATCH 08/14] [Text generation] Enable tests with Qwen2-0.5B-Instruct (#1438) --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0bb0c1af6e..6c94a907ea 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -270,7 +270,7 @@ jobs: - name: 'Whisper' cmd: 'tests/python_tests/test_whisper_generate_api.py' - name: 'LLM & VLM' - cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py -k "not Qwen2-0.5B-Instruct"' # Skip failed tests Qwen2-0.5B-Instruct + cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py' defaults: run: shell: bash From 812163a2e15e31e94fa1261010c07f9a106f774a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 25 Dec 2024 18:24:05 +0400 Subject: [PATCH 09/14] Moved tokenizers tests to a dedicated file (#1436) --- .github/workflows/mac.yml | 2 +- .github/workflows/windows.yml | 2 +- .../openvino/genai/generation_config.hpp | 14 +- src/cpp/include/openvino/genai/tokenizer.hpp | 30 +- src/cpp/src/generation_config.cpp | 3 + src/cpp/src/tokenizer.cpp | 48 +- .../openvino_genai/py_openvino_genai.pyi | 60 +- .../py_continuous_batching_pipeline.cpp | 4 +- src/python/py_generation_config.cpp | 20 +- tests/python_tests/common.py | 65 +- tests/python_tests/ov_genai_test_utils.py | 112 +- .../python_tests/test_cache_optimizations.py | 4 +- tests/python_tests/test_chat_generate_api.py | 202 +-- tests/python_tests/test_generate_api.py | 391 ++--- tests/python_tests/test_preemption.py | 6 +- tests/python_tests/test_sampling.py | 22 +- tests/python_tests/test_tokenizer.py | 360 ++++ .../python_tests/test_whisper_generate_api.py | 31 +- tests/python_tests/tokenizer_configs.py | 1536 ++++++++--------- 19 files changed, 1461 insertions(+), 1451 deletions(-) create mode 100644 tests/python_tests/test_tokenizer.py diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 5cc8772ac5..a9af13bc66 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -225,7 +225,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 7e1aacc715..f88bc4c6f3 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -236,7 +236,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -v ./tests/python_tests/test_tokenizer.py::test_set_chat_template env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index b8b222e347..4ea75e94c5 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -45,6 +45,10 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param logprobs number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. * Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). * + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * @param presence_penalty reduces absolute log prob if the token was generated at least once. + * @param frequency_penalty reduces absolute log prob as many times as the token was generated. + * * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -61,15 +65,13 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; * "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). * - * Random sampling parameters: + * Random (or multinomial) sampling parameters: + * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. * @param temperature the value used to modulate token probabilities for random sampling. * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. - * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. - * @param presence_penalty reduces absolute log prob if the token was generated at least once. - * @param frequency_penalty reduces absolute log prob as many times as the token was generated. * @param rng_seed initializes random generator. + * @param num_return_sequences the number of sequences to generate from a single prompt. * * Assisting generation parameters: * @param assistant_confidence_threshold the lower token probability of candidate to be validated by main model in case of dynamic strategy candidates number update. @@ -90,7 +92,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t min_new_tokens = 0; bool echo = false; size_t logprobs = 0; - + std::set stop_strings; // Default setting in vLLM (and OpenAI API) is not to include stop string in the output bool include_stop_str_in_output = false; diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 38fc0aaf8c..548e4dc332 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -36,9 +36,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights - * - * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. - * When this constructor is used bos, eos, pad token ids are expected to be in IR. + * + * This constructor is used when tokenizer and detokenizer are separate models already loaded into memory. + * When this constructor is used bos, eos, pad token ids are expected to be in IR. * If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. * @param tokenizer_model_str tokenizer model string * @param tokenizer_weights_tensor ov::Tensor with tokenizer weights @@ -55,9 +55,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { ); /** - * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. - * - * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's + * @brief ov::genai::Tokenizer constructor to initialize directly from model and weights. + * + * This constructor is used when tokenizer (or detokenizer) already loaded into memory. Whether it's * tokenizer or detokenizer is defined from model input signature. When this constructor is used bos, eos, pad token ids * are expected to be in IR. If an IR is older (< 2024.3) then this tokens are default initialized to be ignored. * @param model_str model string @@ -82,7 +82,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { ov::Tensor& detokenizer_weights_tensor, Properties&&... properties ) : Tokenizer(tokenizer_model_str, tokenizer_weights_tensor, detokenizer_model_str, detokenizer_weights_tensor, ov::AnyMap{std::forward(properties)...}) { } - + /** * @brief ov::genai::Tokenizer constructor with variable number of properties * @param model_str model string @@ -93,7 +93,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, Properties&&... properties) : Tokenizer(model_str, weights_tensor, ov::AnyMap{std::forward(properties)...}) { } - + /** * @brief ov::genai::Tokenizer constructor with variable number of properties * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path @@ -111,7 +111,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @return pair of [input_ids, attention_mask] */ TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {}); - + /** * @brief encode batch of prompts. Left padding will be applied by default * @param prompts vector storing batch of prompts @@ -127,7 +127,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param prompt std::string with input prompt * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false) * @return pair of [input_ids, attention_mask] - */ + */ template util::EnableIfAllStringAny encode(std::string& prompt, Properties&&... properties) { return encode(prompt, AnyMap{std::forward(properties)...}); @@ -164,7 +164,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { } /** - * @brief decode tokens. + * @brief decode tokens. * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size = batch_size @@ -183,7 +183,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { } /** - * @brief batched decoding of tokens. + * @brief batched decoding of tokens. * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size * @param detokenization_params AnyMap with detokenization parameters, e.g. {"skip_special_tokens", false} * @return vector of std::string, with size equal to batch_size @@ -203,8 +203,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief Embeds input prompts with special tags for a chat scenario. - * - * For example, for Qwen family models, the prompt "1+1=" would be transformed into + * + * For example, for Qwen family models, the prompt "1+1=" would be transformed into * <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n. * * @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...]. @@ -214,7 +214,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @throws Exception if the chat template was unable to parse the input history. */ std::string apply_chat_template(ChatHistory history, - bool add_generation_prompt, + bool add_generation_prompt, const std::string& chat_template = {}) const; /// @brief Override a chat_template read from tokenizer_config.json. diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 35ae92d605..4ff184547e 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -185,6 +185,9 @@ void GenerationConfig::validate() const { "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); if (is_beam_search()) { OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + if (num_beam_groups > 1) { + OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search"); + } } else { OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index b098f96fe6..82c0a17a55 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -89,15 +89,16 @@ class Tokenizer::TokenizerImpl { public: ov::CompiledModel m_tokenizer; ov::CompiledModel m_detokenizer; - + std::unique_ptr> m_ireq_queue_tokenizer; std::unique_ptr> m_ireq_queue_detokenizer; - // To change the adding special tokens mode we use a statefull subgraph, + + // To change the adding special tokens mode we use a statefull subgraph, // this flag holds the current state value of the CompiledModel. bool m_add_special_tokens = true; bool m_skip_special_tokens = true; bool m_older_than_24_5 = false; - + int64_t m_pad_token_id = -1; int64_t m_bos_token_id = -1; int64_t m_eos_token_id = -1; @@ -111,6 +112,7 @@ class Tokenizer::TokenizerImpl { void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, const ov::AnyMap& params) { bool add_special_tokens_flag = m_add_special_tokens; bool skip_special_tokens_flag = m_skip_special_tokens; + ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag); ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag); @@ -126,11 +128,11 @@ class Tokenizer::TokenizerImpl { // state but the effect is incorrect. return; } - + // add_special_tokens is managed by Select op with a bool input. ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {}); *add_special_tensor.data() = add_special_tokens_flag; - + // skip_special_tokens is managed by multiplication with a number, therefore i32. ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1}); *skip_special_tensor.data() = skip_special_tokens_flag; @@ -148,19 +150,19 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(const std::filesystem::path& models_papth, const ov::AnyMap& properties) { - setupTokenizer(models_papth, properties); + TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + setup_tokenizer(models_path, properties); } TokenizerImpl(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { - setupTokenizer(models, properties); + setup_tokenizer(models, properties); } - void setupTokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) { + void setup_tokenizer(const std::filesystem::path& models_path, const ov::AnyMap& properties) { ScopedVar env_manager(tokenizers_relative_to_genai().string()); auto core = get_core_singleton(); - OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_papth' parameter should be a path to a dir not a xml file"); + OPENVINO_ASSERT(models_path.extension() != ".xml", "'models_path' parameter should be a path to a dir not a xml file"); std::shared_ptr ov_tokenizer = nullptr; std::shared_ptr ov_detokenizer = nullptr; @@ -168,12 +170,12 @@ class Tokenizer::TokenizerImpl { if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) { ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml"); } - + if (std::filesystem::exists(models_path / "openvino_detokenizer.xml")) { ov_detokenizer = core.read_model(models_path / "openvino_detokenizer.xml"); } - setupTokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties); + setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties); // If special tokens were not found from IR, try to read them from config. // This will be triggered only for IRs older than 2024.3. @@ -184,21 +186,20 @@ class Tokenizer::TokenizerImpl { // Try to read tokenizer_config if some token ids or token str are not defined. read_tokenizer_config_if_necessary(models_path); } - + // If chat_template was not found in IR, try to read them from config. if (m_chat_template.empty()) { m_chat_template = chat_template_from_tokenizer_json_if_exists(models_path); } } - - void setupTokenizer(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { + void setup_tokenizer(const std::pair, std::shared_ptr>& models, const ov::AnyMap& properties) { auto [ov_tokenizer, ov_detokenizer] = models; OPENVINO_ASSERT(ov_tokenizer || ov_detokenizer, "Neither tokenizer nor detokenzier models were provided"); auto core = get_core_singleton(); std::string device = "CPU"; // only CPU is supported for now - + std::string version_str; utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str); // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5 @@ -231,7 +232,7 @@ class Tokenizer::TokenizerImpl { return std::move(this->m_detokenizer.create_infer_request()); }); } - + // Initialize tokenizer's cache to save time later. if (m_tokenizer) { // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. @@ -286,10 +287,11 @@ class Tokenizer::TokenizerImpl { nlohmann::json data = nlohmann::json::parse(f); - using ov::genai::utils::read_json_param; // they are in the format {"bos_token": { "content": "",... }} - auto read_token_content_str = [&data](std::string key_name, std::string& val) { - if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); } + auto read_token_content_str = [&data](const std::string& key_name, std::string& val) { + if (val.empty() && data.contains(key_name)) { + utils::read_json_param(data[key_name], "content", val); + } }; read_token_content_str(pad_token_key_name, m_pad_token); read_token_content_str(bos_token_key_name, m_bos_token); @@ -494,7 +496,7 @@ class Tokenizer::TokenizerImpl { {"is none", "is undefined"}, {"= none", "= undefined"}, // Jinja2Cpp does not support Python-style slicing, e.g. [1:]. - // If chat template contains such slicing, we replace it with + // If chat template contains such slicing, we replace it with // a placeholder at the moment. {"messages[1:]", "slice(messages, 1)"}, }; @@ -537,7 +539,7 @@ class Tokenizer::TokenizerImpl { env.GetSettings().trimBlocks = true; jinja2::Template tpl(&env); tpl.Load(chat_tpl); - + jinja2::UserCallable slice_callable = jinja2::MakeCallable( [](const jinja2::GenericList& messages, const size_t& start) { jinja2::ValuesList result; @@ -607,7 +609,7 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c ScopedVar env_manager(tokenizers_relative_to_genai().string()); auto core = get_core_singleton(); auto model = core.read_model(model_str, weights_tensor); - + auto parameters = model->get_parameters(); OPENVINO_ASSERT(!parameters.empty()); if (parameters.front()->get_element_type() == ov::element::string) { diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 3d27b23052..8510a8389f 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -361,10 +361,10 @@ class ContinuousBatchingPipeline: This class is used for generation with LLMs with continuous batchig """ @typing.overload - def __init__(self, models_path: str, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None: + def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload - def __init__(self, models_path: str, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: + def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, sampling_params: GenerationConfig) -> GenerationHandle: @@ -522,17 +522,17 @@ class FluxTransformer2DModel: class GenerationConfig: """ - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -540,6 +540,10 @@ class GenerationConfig: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -550,8 +554,8 @@ class GenerationConfig: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -560,7 +564,7 @@ class GenerationConfig: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. """ adapters: AdapterConfig | None assistant_confidence_threshold: float @@ -951,17 +955,17 @@ class LLMPipeline: :rtype: DecodedResults, EncodedResults, str - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -969,6 +973,10 @@ class LLMPipeline: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -979,8 +987,8 @@ class LLMPipeline: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -989,7 +997,7 @@ class LLMPipeline: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. """ @typing.overload def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, device: str, config: dict[str, typing.Any] = {}, **kwargs) -> None: @@ -1032,17 +1040,17 @@ class LLMPipeline: :rtype: DecodedResults, EncodedResults, str - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -1050,6 +1058,10 @@ class LLMPipeline: logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -1060,8 +1072,8 @@ class LLMPipeline: length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -1070,7 +1082,7 @@ class LLMPipeline: top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. """ def get_generation_config(self) -> GenerationConfig: ... @@ -1420,7 +1432,7 @@ class StopCriteria: """ StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 772ba0af8a..be7a72481f 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -212,7 +212,7 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); py::class_(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig") - .def(py::init([](const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { + .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(models_path, scheduler_config, device, pyutils::properties_to_any_map(llm_plugin_config), pyutils::properties_to_any_map(tokenizer_plugin_config)); }), @@ -222,7 +222,7 @@ void init_continuous_batching_pipeline(py::module_& m) { py::arg("properties") = ov::AnyMap({}), py::arg("tokenizer_properties") = ov::AnyMap({})) - .def(py::init([](const std::string& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { + .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); return std::make_unique(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config)); }), diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index b1a5c6cd2e..f49bcf29bd 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -20,7 +20,7 @@ namespace { auto stop_criteria_docstring = R"( StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. @@ -30,17 +30,17 @@ auto stop_criteria_docstring = R"( } // namespace char generation_config_docstring[] = R"( - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. ignore_eos: if set to true, then generation will not stop even if token is met. eos_token_id: token_id of (end of sentence) - min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. stop_strings: a set of strings that will cause pipeline to stop generating further tokens. include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. @@ -48,6 +48,10 @@ char generation_config_docstring[] = R"( logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -58,8 +62,8 @@ char generation_config_docstring[] = R"( length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -68,7 +72,7 @@ char generation_config_docstring[] = R"( top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + num_return_sequences: the number of sequences to generate from a single prompt. )"; void init_generation_config(py::module_& m) { diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index cf5fbb3403..7e3c075405 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -42,13 +42,6 @@ def get_greedy_with_penalties() -> GenerationConfig: generation_config.max_new_tokens = 30 return generation_config -def get_greedy_with_min_and_max_tokens() -> GenerationConfig: - generation_config = GenerationConfig() - generation_config.num_return_sequences = 1 - generation_config.min_new_tokens = 15 - generation_config.max_new_tokens = 30 - return generation_config - def get_greedy_with_single_stop_string() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 @@ -296,10 +289,12 @@ def convert_to_hf( kwargs['max_length'] = generation_config.max_length # has higher priority than 'max_length' kwargs['max_new_tokens'] = generation_config.max_new_tokens + kwargs['min_new_tokens'] = generation_config.min_new_tokens if generation_config.stop_strings: kwargs['stop_strings'] = generation_config.stop_strings # copy default parameters + kwargs['bos_token_id'] = default_generation_config.bos_token_id kwargs['eos_token_id'] = default_generation_config.eos_token_id kwargs['pad_token_id'] = default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty @@ -308,11 +303,12 @@ def convert_to_hf( # beam search case kwargs['num_beam_groups'] = generation_config.num_beam_groups kwargs['num_beams'] = generation_config.num_beams - kwargs['diversity_penalty'] = generation_config.diversity_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size kwargs['num_return_sequences'] = generation_config.num_return_sequences kwargs['output_scores'] = True + if generation_config.num_beam_groups > 1: + kwargs['diversity_penalty'] = generation_config.diversity_penalty elif generation_config.do_sample: # mulitinomial kwargs['temperature'] = generation_config.temperature @@ -328,7 +324,7 @@ def convert_to_hf( def run_hugging_face( - model, + opt_model, hf_tokenizer, prompts: List[str], generation_configs: List[GenerationConfig], @@ -337,8 +333,9 @@ def run_hugging_face( for prompt, generation_config in zip(prompts, generation_configs): inputs = hf_tokenizer(prompt, return_tensors="pt") prompt_len = inputs['input_ids'].numel() - generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), - return_dict_in_generate=True, tokenizer=hf_tokenizer) + generate_outputs = opt_model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], + generation_config=convert_to_hf(opt_model.generation_config, generation_config), + return_dict_in_generate=True, tokenizer=hf_tokenizer) all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) generation_result = GenerationResult() @@ -349,7 +346,7 @@ def run_hugging_face( generation_results.append(generation_result) del hf_tokenizer - del model + del opt_model return generation_results @@ -360,14 +357,14 @@ def run_continuous_batching( prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) + pipe = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU") output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(models_path) return output -def get_models_list(file_name: str): +def read_models_list(file_name: str): models = [] with open(file_name) as f: for model_name in f: @@ -395,6 +392,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): assert hf_text == ov_text + +def get_hugging_face_model_and_tokenizer(model_id: str, use_optimum = True): + hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ + AutoModelForCausalLM.from_pretrained(model_id) + return opt_model, hf_tokenizer + + def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): model.save_pretrained(models_path) # convert tokenizers as well @@ -404,23 +409,6 @@ def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): serialize(tokenizer, models_path / "openvino_tokenizer.xml") serialize(detokenizer, models_path / "openvino_detokenizer.xml") -def get_model_and_tokenizer(model_id: str, use_optimum = True): - hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ - AutoModelForCausalLM.from_pretrained(model_id) - return model, hf_tokenizer - -def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): - use_optimum = True - models_path : Path = tmp_path / model_id - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) - - if use_optimum: - save_ov_model_from_optimum(model, hf_tokenizer, models_path) - - hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) - def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) @@ -433,19 +421,32 @@ def _generate_and_compare_with_reference_results(models_path: Path, prompts: Lis compare_results(ref_result, ov_result, generation_config) +def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): + use_optimum = True + models_path : Path = tmp_path / model_id + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum) + + if use_optimum: + save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + + hf_results = run_hugging_face(opt_model=opt_model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) + _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) + + def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) assert len(prompts) == len(reference_texts_per_prompt) assert len(prompts) == len(ov_results) - for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs): + for prompt, ref_texts_for_this_prompt, ov_result in zip(prompts, reference_texts_per_prompt, ov_results): print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}") assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids) for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): assert ref_text == ov_text + def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 5f2702a774..87b2147bcd 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -57,33 +57,6 @@ def get_models_list(): return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] -def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False): - precommit_models = [ - "openai/whisper-tiny", - "openai/whisper-tiny.en", - "distil-whisper/distil-small.en", - ] - if multilingual: - precommit_models = ["openai/whisper-tiny"] - if en_only: - precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] - if tiny_only: - precommit_models = ["openai/whisper-tiny"] - - nightly_models = [] - - if pytest.run_marker == "precommit": - model_ids = precommit_models - else: - model_ids = nightly_models - - if pytest.selected_model_ids: - model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] - - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] - - def get_chat_models_list(): precommit_models = [ "Qwen/Qwen2-0.5B-Instruct", @@ -101,90 +74,31 @@ def get_chat_models_list(): model_ids = precommit_models else: model_ids = nightly_models - + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] -def get_chat_templates(): - # Returns chat templates saved in tokenizer_configs.py, - # but skips some models that currently are not processed correctly. - - skipped_models = { - # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. - # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads. - "openchat/openchat-3.5-0106", - - # These models fail even on HF so no need to check if applying chat matches. - "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", - "codellama/CodeLlama-34b-Instruct-hf", - "deepseek-ai/deepseek-math-7b-rl", - "allenai/tulu-2-7b", - "alexsobolev/IcaroLM", - "tokyotech-llm/Swallow-7b-instruct-v0.1", - "bofenghuang/vigogne-2-7b-chat", - "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", - "AliAbdelrasheed/maqa_llama_4bit", - "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored", - - # TODO: Need to support chat templates in more models: CVS-145963 - # Either ov_genai is unable to parse chat_template or results do not match with HF. - "meta-llama/Meta-Llama-3-8B-Instruct", - "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp - "mosaicml/mpt-30b-chat", - "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp - "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp - "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp - "openchat/openchat-3.5-0106", - "casperhansen/llama-3-70b-instruct-awq", - "TheBloke/deepseek-coder-33B-instruct-GPTQ", - "AI-Sweden-Models/gpt-sw3-356m-instruct", - "google/gemma-7b-it", - "THUDM/cogvlm2-llama3-chat-19B", - "KnutJaegersberg/internlm-20b-llama", - "maywell/Synatra-Mixtral-8x7B", - "MediaTek-Research/Breeze-7B-Instruct-v1_0", - "bofenghuang/vigostral-7b-chat", - "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp - "openchat/openchat-3.6-8b-20240522", - "tenyx/TenyxChat-7B-v1", - "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", - "yam-peleg/Hebrew-Gemma-11B-V2", - "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError - "nlpai-lab/KULLM3", - "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", - "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError - "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp - "codellama/CodeLlama-70b-Instruct-hf", - "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp - "BramVanroy/Llama-2-13b-chat-dutch" - } - from tokenizer_configs import get_tokenizer_configs - return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] - - @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): model_id, path = params from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) if (path / "openvino_model.xml").exists(): opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, compile=False, device='CPU') else: - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") # to store tokenizer config jsons with special tokens - tokenizer.save_pretrained(path) + hf_tokenizer.save_pretrained(path) opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, compile=False, device='CPU', load_in_8bit=False) @@ -195,7 +109,7 @@ def read_model(params, **tokenizer_kwargs): return ( model_id, path, - tokenizer, + hf_tokenizer, opt_model, ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}), ) @@ -256,20 +170,8 @@ def model_tokenizers_path_tmp_path(tmpdir_factory): yield model_id, Path(temp_path) -def load_tok(configs: List[Tuple], temp_path): - # load Tokenizer where all configs are cleared. - # remove existing jsons from previous tests - for json_file in temp_path.glob("*.json"): - json_file.unlink() - - for config_json, config_name in configs: - with (temp_path / config_name).open('w') as f: - json.dump(config_json, f) - return ov_genai.Tokenizer(temp_path) - - -def load_pipe(configs: List[Tuple], temp_path): - # Load LLMPipline where all configs are cleared. +def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): + # Load LLMPipeline where all configs are cleared. # remove existing jsons from previous tests for json_file in temp_path.glob("*.json"): json_file.unlink() diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index 3c09d34756..d89697ba42 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.enable_prefix_caching = enable_prefix_caching models_path = converted_model.models_path - model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}) - model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) + model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU") + model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU") tokenizer = converted_model.tokenizer diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index d9661e538b..07b4f7c15f 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -4,24 +4,21 @@ import openvino_genai as ov_genai import pytest from typing import Dict, Tuple + from ov_genai_test_utils import ( - get_models_list, get_chat_models_list, read_model, - load_tok, - model_tmp_path, - get_chat_templates, get_continuous_batching, ) -configs = [ +generation_configs = [ dict(do_sample=False, max_new_tokens=20), dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) ] -quenstions = [ +questions = [ '1+1=', 'What is the previous answer?', 'Why is the Sun yellow?', @@ -29,7 +26,7 @@ ] -@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit @pytest.mark.nightly @@ -37,18 +34,18 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): chat_history_hf = [] chat_history_ov = [] chat_prompt = '' - + # Will set add_special_tokens=False inside pipeline when start_chat() is called. model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - pipe.start_chat() - for prompt in quenstions: + pipe.start_chat() + for prompt in questions: chat_history_hf.append({'role': 'user', 'content': prompt}) chat_history_ov.append({'role': 'user', 'content': prompt}) - + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - + answer = model_opt.generate(**tokenized, **generation_config) answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) @@ -57,14 +54,15 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) pipe.finish_chat() - + if chat_history_ov != chat_history_hf: print(f'hf_output: {chat_history_hf}') print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf -@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit @pytest.mark.nightly @@ -73,172 +71,48 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict) chat_history_hf = [] chat_history_ov = [] chat_prompt = '' - + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - - for prompt in quenstions: + model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + ov_tokenizer = ov_pipe.get_tokenizer() + + for prompt in questions: chat_history_hf.append({'role': 'user', 'content': prompt}) chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - + + chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + answer = model_opt.generate(**tokenized, **generation_config) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer_ov = pipe.generate(chat_prompt, **generation_config) + + chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer_ov = ov_pipe.generate(chat_prompt, **generation_config) chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - + if chat_history_ov != chat_history_hf: print(f'hf_output: {chat_history_hf}') print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf -@pytest.mark.parametrize("generation_config", configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): - # Check that when history is stored in KV cache results are the same as when history stored in a text. - device ='CPU' - - chat_history_with_kv_cache = [] - chat_history_ov = [] - - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - pipe_with_kv_cache = ov_genai.LLMPipeline(path, device, **{"ENABLE_MMAP": False}) - - pipe_with_kv_cache.start_chat() - for question in quenstions: - chat_history_with_kv_cache.append({'role': 'user', 'content': question}) - answer = pipe_with_kv_cache.generate(question, **generation_config) - chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) - - chat_history_ov.append({'role': 'user', 'content': question}) - prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer}) - pipe_with_kv_cache.finish_chat() - - if chat_history_ov != chat_history_with_kv_cache: - print(f'kvcache_hist: {chat_history_with_kv_cache}') - print(f'text_history: {chat_history_ov}') - assert chat_history_ov == chat_history_with_kv_cache - - -conversation = [ - {'role': 'user', 'content': '1+1='}, - {'role': 'assistant', 'content': '1 + 1 = 2'}, - {'role': 'user', 'content': 'What is the previous answer?'}, - {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'}, - {'role': 'user', 'content': 'Why is the sun yellow?'}, - {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, - {'role': 'user', 'content': 'What was my first question?'}, -] -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.parametrize('chat_config', get_chat_templates()) -def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): - tokenizer_config = chat_config[1] - - # Will load openvino_model for tiny-random-phi as a placeholder - # but indeed only Tokenizer and apply_chat_template will be tested. - model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) - - full_history_str_hf = tokenizer.apply_chat_template(conversation, - add_generation_prompt=False, - tokenize=False, - **tokenizer_config) - - tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) - tok.set_chat_template(tokenizer_config['chat_template']) - full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) - if full_history_str != full_history_str_hf: - print(f'hf reference: {full_history_str_hf}') - print(f'ov_genai out: {full_history_str}') - assert full_history_str == full_history_str_hf - - -@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("generation_config", generation_configs[1:]) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): - model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) - cb = get_continuous_batching(path) - stateful.start_chat() - cb.start_chat() - for question in quenstions: - generated = cb.generate(question, **generation_config) - reference = stateful.generate(question, **generation_config) - assert generated == reference - # Test that finish_chat() doesn't fail just in case. - cb.finish_chat() - -@pytest.mark.precommit -@pytest.mark.nightly -def test_set_chat_template(): - model_descr = get_chat_models_list()[0] - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}") - config = ov_genai.GenerationConfig() - config.max_new_tokens = 1 - config.do_sample = False - pipe.start_chat() - generated = pipe.generate("a", config) - pipe.finish_chat() - reference = pipe.generate("a", config) - assert generated == reference + model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb_pipe = get_continuous_batching(path) -prompts = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?', - ['Why is the Sun yellow?'], - "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?", - "מחרוזת בדיקה", - "Multiline\nstring!\nWow!", -] + ov_stateful_pipe.start_chat() + cb_pipe.start_chat() -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.parametrize("add_special_tokens", [True, False]) -@pytest.mark.parametrize("prompt", prompts) -def test_add_special_tokens(add_special_tokens, prompt): - import numpy as np - model_descr = get_chat_models_list()[0] - model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - genai_tokenzier = pipe.get_tokenizer() - - # Calling encode with add_special_tokens will set state flag. - res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data - res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] - assert np.all(res_genai == res_hf) + for question in questions: + generated = cb_pipe.generate(question, **generation_config) + reference = ov_stateful_pipe.generate(question, **generation_config) + assert generated == reference -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.parametrize("add_special_tokens", [True, False]) -@pytest.mark.parametrize("skip_special_tokens", [True, False]) -@pytest.mark.parametrize("prompt", prompts) -def test_add_special_tokens(add_special_tokens, skip_special_tokens, prompt): - import numpy as np - model_descr = get_chat_models_list()[0] - model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - genai_tokenizer = pipe.get_tokenizer() - - # Calling encode with add_special_tokens will set state flag. - res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data - res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] - assert np.all(res_genai == res_hf) - - # Decode with skip_special_tokens - decoded_genai = genai_tokenizer.decode(res_genai, skip_special_tokens=skip_special_tokens)[0] - decoded_hf = hf_tokenizer.decode(res_hf[0], skip_special_tokens=skip_special_tokens) - assert decoded_genai == decoded_hf + # Test that finish_chat() doesn't fail just in case. + cb_pipe.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 9bb9eff49c..824a3cca26 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -4,7 +4,6 @@ import openvino_genai as ov_genai from openvino_genai import StopCriteria import pytest -import transformers from typing import Union, List, Dict, Optional import numpy as np import openvino as ov @@ -15,8 +14,7 @@ from ov_genai_test_utils import ( get_models_list, read_model, - load_pipe, - load_tok, + load_genai_pipe_with_configs, model_tmp_path, STOP_CRITERIA_MAP, get_continuous_batching, @@ -24,7 +22,7 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - model_id, path, tokenizer, model, pipe = model_descr + model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 config['num_return_sequences'] = num_beams @@ -39,25 +37,25 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty - + generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) # Encode the batch of prompts - tokenizer.padding_side = "left" - encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) + hf_tokenizer.padding_side = "left" + encoded_prompts = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] - - hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) + + hf_encoded_outputs = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) hf_outputs = [] for idx, hf_encoded_out in enumerate(hf_encoded_outputs): prompt_count = idx // num_beams - hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) + hf_outputs.append(hf_tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) - ov_outputs = pipe.generate(prompts, **config).texts + ov_outputs = ov_pipe.generate(prompts, **config).texts hf_outputs.sort() ov_outputs.sort() @@ -67,8 +65,9 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro print(f'ov_output: {ov_output}') assert hf_output == ov_output -def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - model_id, path, tokenizer, model, pipe = model_descr + +def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, prompt: str): + model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -85,12 +84,12 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) - encoded_prompt = tokenizer([prompt], return_tensors='pt', add_special_tokens=True) + encoded_prompt = hf_tokenizer([prompt], return_tensors='pt', add_special_tokens=True) prompt_ids, attention_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask'] - hf_encoded_output = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) - hf_output = tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True) + hf_encoded_output = opt_model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) + hf_output = hf_tokenizer.decode(hf_encoded_output[0, prompt_ids.shape[1]:], skip_special_tokens=True) - ov_output = pipe.generate(prompt, **config) + ov_output = ov_pipe.generate(prompt, **config) if config.get('num_return_sequences', 1) > 1: assert hf_output in ov_output.texts else: @@ -100,14 +99,15 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str assert hf_output == ov_output -def hf_ov_genai_tensors_comparison( + +def run_hf_ov_genai_comparison_encoded_inputs( model_descr, generation_config: Dict, input_ids: np.ndarray, attention_mask: Optional[np.array] = None ): device = 'CPU' - model_id, path, tokenizer, model, pipe = model_descr + model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -131,10 +131,8 @@ def hf_ov_genai_tensors_comparison( inputs_hf = dict(inputs=torch.tensor(input_ids)) inputs_ov = ov.Tensor(input_ids) - hf_output = model.generate(**inputs_hf, **generation_config_hf) - - pipe = ov_genai.LLMPipeline(path, device) - ov_output = pipe.generate(inputs_ov, **config) + hf_output = opt_model.generate(**inputs_hf, **generation_config_hf) + ov_output = ov_pipe.generate(inputs_ov, **config) hf_res = hf_output[0, input_ids.shape[1]:].numpy() ov_res = np.array(ov_output.tokens, dtype=np.int64) @@ -154,7 +152,8 @@ def hf_ov_genai_tensors_comparison( @pytest.mark.precommit @pytest.mark.nightly def test_decoding(model_descr, generation_config, prompt): - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) + input_tensors_list = [ # input_ids, attention_mask @@ -165,62 +164,8 @@ def test_decoding(model_descr, generation_config, prompt): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_ov_tensors(model_descr, inputs): - hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) - - -prompts = [ - 'table is made of', - '你好! 你好嗎?', - 'Alan Turing was a', - 'The Sun is yellow because', - ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] -] -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.precommit -@pytest.mark.nightly -def test_genai_tokenizer_encode(model_descr, prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - tok = pipe.get_tokenizer() - - encoded_ov = tok.encode(prompt).input_ids.data - if isinstance(prompt, list): - encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] - for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): - assert np.all(tokens_ov == tokens_hf) - else: - encoded_hf = tokenizer.encode(prompt) - assert np.all(encoded_hf == encoded_ov[0]) - -encoded_prompts = [ - [1, 1591, 338, 1754, 310], - [1, 17102, 323, 3864, 471, 263], - - # chineze characters - [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], - - # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token - [3113, 264, 364, 267], - - # batched tokens - [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] -] -@pytest.mark.parametrize("model_descr", get_models_list()) -@pytest.mark.parametrize("encoded_prompt", encoded_prompts) -@pytest.mark.precommit -def test_genai_tokenizer_decode(model_descr, encoded_prompt): - model_id, path, tokenizer, model, pipe = read_model(model_descr) - tok = pipe.get_tokenizer() - decoded_ov = tok.decode(encoded_prompt) - - if isinstance(encoded_prompt[0], list): - decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) - for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): - assert np.all(tokens_ov == tokens_hf) - else: - decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True) - assert decoded_hf == decoded_ov +def test_encoded_inputs(model_descr, inputs): + run_hf_ov_genai_comparison_encoded_inputs(read_model(model_descr), dict(max_new_tokens=20), *inputs) test_configs = [ @@ -239,7 +184,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_multibatch(model_descr, generation_config, prompts): +def test_batch_text_input(model_descr, generation_config, prompts): run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) @@ -261,7 +206,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, num_return_sequences=num_beam_groups * group_size, max_new_tokens=max_new_tokens, ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) @@ -283,7 +228,7 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): max_new_tokens=max_new_tokens, stop_criteria=stop_criteria, ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) # test long sequences @@ -302,7 +247,7 @@ def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, num_return_sequences=num_beam_groups * group_size, max_new_tokens=max_new_tokens, ) - run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @pytest.mark.parametrize("prompt", prompts) @@ -317,17 +262,17 @@ def test_greedy_repetition_penalty(model_descr, prompt): max_new_tokens=20, do_sample=False ) - run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) generation_config = dict( repetition_penalty=1.0, max_new_tokens=20, do_sample=False ) - run_hf_ov_genai_comparison((model_id, path, tokenizer, model, pipe), generation_config, prompt) + run_hf_ov_genai_comparison_text_inputs((model_id, path, tokenizer, model, pipe), generation_config, prompt) ov_output = pipe.generate(prompt, **generation_config) - + generation_config = dict( repetition_penalty=0.5, max_new_tokens=20, @@ -346,19 +291,19 @@ def user_defined_callback(subword): @pytest.mark.precommit @pytest.mark.nightly def test_callback_one_string(callback): - pipe = read_model(get_models_list()[0])[4] - generation_config = pipe.get_generation_config() + ov_pipe = read_model(get_models_list()[0])[4] + generation_config = ov_pipe.get_generation_config() generation_config.max_new_tokens = 10 - pipe.generate('table is made of', generation_config, callback) + ov_pipe.generate('table is made of', generation_config, callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit @pytest.mark.nightly -def test_callback_batch_fail(callback): - pipe = read_model(get_models_list()[0])[4] +def test_callback_batch_throws(callback): + ov_pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback) + ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @@ -368,24 +313,25 @@ def test_callback_kwargs_one_string(callback): pipe = read_model(get_models_list()[0])[4] pipe.generate('table is made of', max_new_tokens=10, streamer=callback) + @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("model_descr", get_models_list()) def test_callback_decoding_metallama(model_descr, callback): - # On metallam this prompt generates output which can shorten after adding new tokens. + # On metallama this prompt generates output which can shorten after adding new tokens. # Test that streamer correctly handles such cases. prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct': pytest.skip() - pipe = read_model(model_descr)[4] - pipe.generate(prompt, max_new_tokens=300, streamer=callback) + ov_pipe = read_model(model_descr)[4] + ov_pipe.generate(prompt, max_new_tokens=300, streamer=callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit @pytest.mark.nightly -def test_callback_kwargs_batch_fail(callback): +def test_callback_kwargs_batch_throws(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) @@ -408,200 +354,73 @@ def end(self): @pytest.mark.precommit @pytest.mark.nightly def test_streamer_one_string(): - pipe = read_model(get_models_list()[0])[4] - generation_config = pipe.get_generation_config() + ov_pipe = read_model(get_models_list()[0])[4] + generation_config = ov_pipe.get_generation_config() generation_config.max_new_tokens = 10 - printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', generation_config, printer) + printer = Printer(ov_pipe.get_tokenizer()) + ov_pipe.generate('table is made of', generation_config, printer) @pytest.mark.precommit @pytest.mark.nightly -def test_streamer_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) +def test_streamer_batch_throws(): + ov_pipe = read_model(get_models_list()[0])[4] + printer = Printer(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) + ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), printer) @pytest.mark.precommit @pytest.mark.nightly def test_streamer_kwargs_one_string(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) + ov_pipe = read_model(get_models_list()[0])[4] + printer = Printer(ov_pipe.get_tokenizer()) + ov_pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) @pytest.mark.precommit @pytest.mark.nightly -def test_streamer_kwargs_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) +def test_streamer_kwargs_batch_throws(): + ov_pipe = read_model(get_models_list()[0])[4] + printer = Printer(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe.generate('', num_beams=2, streamer=printer) + ov_pipe.generate('', num_beams=2, streamer=printer) @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_one_string(callback): - pipe = read_model(get_models_list()[0])[4] - ten_tokens = pipe.get_generation_config() + ov_pipe = read_model(get_models_list()[0])[4] + ten_tokens = ov_pipe.get_generation_config() ten_tokens.max_new_tokens = 10 - pipe('talbe is made of', ten_tokens, callback) + ov_pipe('talbe is made of', ten_tokens, callback) @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -def test_operator_with_callback_batch_fail(callback): - pipe = read_model(get_models_list()[0])[4] +def test_operator_with_callback_batch_throws(callback): + ov_pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): - pipe(['1', '2'], ov_genai.GenerationConfig(), callback) + ov_pipe(['1', '2'], ov_pipe.get_generation_config(), callback) @pytest.mark.precommit @pytest.mark.nightly def test_operator_with_streamer_kwargs_one_string(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) - pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) + ov_pipe = read_model(get_models_list()[0])[4] + printer = Printer(ov_pipe.get_tokenizer()) + ov_pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit @pytest.mark.nightly -def test_operator_with_streamer_kwargs_batch_fail(): - pipe = read_model(get_models_list()[0])[4] - printer = Printer(pipe.get_tokenizer()) +def test_operator_with_streamer_kwargs_batch_throws(): + ov_pipe = read_model(get_models_list()[0])[4] + printer = Printer(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): - pipe('', num_beams=2, streamer=printer) - - -@pytest.mark.precommit -@pytest.mark.nightly -def test_load_special_tokens_ids_1(model_tmp_path): - # test when there is an available config.json - config_json = { - "pad_token_id": 422, - "bos_token_id": 42, - "eos_token_id": 37, - } - tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] - - -@pytest.mark.precommit -@pytest.mark.nightly -def test_load_special_tokens_str_2(model_tmp_path): - # test with special_tokens_map - special_tokens_map_json = { - "pad_token": {"content": ""}, - "bos_token": {"content": ""}, - "eos_token": {"content": ""}, - } - tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) - assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] - assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] - assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] - - -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") -def test_load_special_tokens_3_(model_tokenizers_path_tmp_path): - # special_tokens_map is not available - # but tokenize_config.json exists - # will load both string and integer representations - tok_config_json = { - "added_tokens_decoder": { - "422": {"content": ""}, - "37": {"content": ""}, - "42": {"content": ""}, - }, - "pad_token": "", - "bos_token": "", - "eos_token": "", - } - - tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) - assert tok.get_pad_token() == tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] - - assert tok.get_pad_token_id() == 422 - assert tok.get_bos_token_id() == 37 - assert tok.get_eos_token_id() == 42 - - -@pytest.mark.precommit -@pytest.mark.nightly -def test_load_special_tokens_3(model_tmp_path): - # both config.json is available and tokenizer_config.json available - # check that it does not read int values from tokenizer_config.json if they are in config.json - tok_config_json = { - "added_tokens_decoder": { - # integers differ from config.json to check they don't override config.json - "777": {"content": ""}, - "888": {"content": ""}, - "656": {"content": ""}, - }, - "pad_token": "", - "bos_token": "", - "eos_token": "", - } - config_json = { - "pad_token_id": 422, - "bos_token_id": 42, - "eos_token_id": 37, - } - configs = [ - (tok_config_json, "tokenizer_config.json"), - (config_json, "config.json") - ] - tok = load_tok(configs, model_tmp_path[1]) - assert tok.get_pad_token_id() == config_json['pad_token_id'] - assert tok.get_bos_token_id() == config_json['bos_token_id'] - assert tok.get_eos_token_id() == config_json['eos_token_id'] - - assert tok.get_pad_token() == tok_config_json['pad_token'] - assert tok.get_bos_token() == tok_config_json['bos_token'] - assert tok.get_eos_token() == tok_config_json['eos_token'] - - -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.xfail( - raises=AssertionError, - reason="CVS-143410 ov tokenizer should be aligned with hf", - strict=False, -) -def test_load_special_tokens_4(model_tmp_path): - # only string representation is provided, find token integers by inference - model_id, temp_path = model_tmp_path - tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - - special_tokens_map_json = {} - token_str_int_map = {} - special_token_names = ['pad_token', 'bos_token', 'eos_token'] - for token_str in special_token_names: - if hasattr(tokenizer, token_str): - token_val = getattr(tokenizer, token_str) - special_tokens_map_json.update({token_str: {"content": token_val}}) - token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] - token_str_int_map.update({token_str: token_id}) - - # since only string representations are present in the json will try to get by inference - tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) - - # check ids inferred correctly for special tokens existing if HF tokenizer - if 'pad_token' in token_str_int_map: - assert tok.get_pad_token_id() == token_str_int_map['pad_token'] - if 'bos_token' in token_str_int_map: - assert tok.get_bos_token_id() == token_str_int_map['bos_token'] - if 'eos_token' in token_str_int_map: - assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + ov_pipe('', num_beams=2, streamer=printer) invalid_configs = [ @@ -617,23 +436,24 @@ def test_load_special_tokens_4(model_tmp_path): @pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit @pytest.mark.nightly -def test_invalid_configs(model_tmp_path, generation_config): +def test_invalid_generation_configs_throws(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path config_json = {} - pipe = load_pipe([(config_json, "config.json")], temp_path) + ov_pipe = load_genai_pipe_with_configs([(config_json, "config.json")], temp_path) with pytest.raises(RuntimeError): - pipe.generate('blah blah', **generation_config) + ov_pipe.generate('blah blah', **generation_config) @pytest.mark.precommit @pytest.mark.nightly def test_valid_configs(model_tmp_path): model_id, temp_path = model_tmp_path - pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) config = ov_genai.GenerationConfig() config.do_sample = True # no eos_token_id but it's loaded from config.json - pipe.set_generation_config(config) + ov_pipe.set_generation_config(config) + invalid_py_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), @@ -648,49 +468,48 @@ def test_valid_configs(model_tmp_path): @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.parametrize("generation_config", invalid_py_configs) -def test_python_generation_config_validation(model_tmp_path, generation_config): +def test_python_generation_config_validation_throws(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path - pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) - + ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) + # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned # instead of RuntimeError, which is returned when GenerationConfig values are validated return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError with pytest.raises(return_exception_type): - pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) + ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) @pytest.mark.precommit @pytest.mark.nightly -def test_unicode_pybind_decoding_1(): +def test_unicode_pybind_decoding_one_string(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] - res_str = pipe.generate(',', max_new_tokens=4) + ov_pipe = read_model((model_id, path))[4] + res_str = ov_pipe.generate(',', max_new_tokens=4) assert '�' == res_str[-1] - @pytest.mark.precommit @pytest.mark.nightly -def test_unicode_pybind_decoding_2(): +def test_unicode_pybind_decoding_batched(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] - res_str = pipe.generate([","], max_new_tokens=4) + ov_pipe = read_model((model_id, path))[4] + res_str = ov_pipe.generate([","], max_new_tokens=4) assert '�' == res_str.texts[0][-1] @pytest.mark.precommit @pytest.mark.nightly -def test_unicode_pybind_decoding_3(): +def test_unicode_pybind_decoding_one_string_streamer(): # On this model this prompt generates unfinished utf-8 string # and streams it. Test that pybind will not fail while we pass string to python. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') - pipe = read_model((model_id, path))[4] + ov_pipe = read_model((model_id, path))[4] res_str = [] - pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) assert '�' == res_str[-1] @@ -741,22 +560,24 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): for gen, ref in zip(generated.scores, reference.scores): assert math.isclose(gen, ref, abs_tol=0.0003) + @pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id, path, tokenizer, model, stateful = read_model(( + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(( "facebook/opt-125m", Path("opt-125m") )) - cb = get_continuous_batching(path) + cb_pipe = get_continuous_batching(path) streamed = [] - generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) - reference = stateful.generate(prompt, max_new_tokens=20) + generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = ov_pipe.generate(prompt, max_new_tokens=20) assert generated == "".join(streamed) assert "".join(streamed) == reference + def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: - model_id, path, tokenizer, model, pipe = model_descr + model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -767,7 +588,7 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty - return pipe.generate([prompt], **config).perf_metrics + return ov_pipe.generate([prompt], **config).perf_metrics test_cases = [ @@ -851,19 +672,19 @@ def test_perf_metrics(model_descr, generation_config, prompt): @pytest.mark.precommit @pytest.mark.nightly def test_batch_switch(): - pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - pipe.generate(["a"], max_new_tokens=2) - pipe.generate(["1", "2"], max_new_tokens=2) + ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] + ov_pipe.generate(["a"], max_new_tokens=2) + ov_pipe.generate(["1", "2"], max_new_tokens=2) @pytest.mark.precommit @pytest.mark.nightly def test_stop_token_ids(): - pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = pipe.generate( + ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] + res = ov_pipe.generate( ov.Tensor([(1,)]), max_new_tokens=3, - stop_token_ids={-1, 9935, pipe.get_tokenizer().get_eos_token_id()}, + stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()}, include_stop_str_in_output=False ) assert 2 == len(res.tokens[0]) @@ -873,8 +694,8 @@ def test_stop_token_ids(): @pytest.mark.precommit @pytest.mark.nightly def test_stop_strings(): - pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - res = pipe.generate( + ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] + res = ov_pipe.generate( "", max_new_tokens=5, stop_strings={"ignored", "боль"} diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py index 49d6c8f6b0..7c648e73dc 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_preemption.py @@ -4,7 +4,7 @@ import pytest from openvino_genai import GenerationConfig -from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ +from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p @@ -87,7 +87,7 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): config.rng_seed = 0 config.max_new_tokens = 30 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) @@ -168,7 +168,7 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): for config in generation_configs: config.rng_seed = 0 model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index d5df28bfd6..fbcce76bf7 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,7 +10,7 @@ from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List, TypedDict -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ +from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ get_greedy_with_penalties, get_multinomial_temperature, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ @@ -28,18 +28,18 @@ @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) def test_sampling_precommit(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) @pytest.mark.nightly -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) def test_sampling_nightly(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) @pytest.mark.real_models -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) def test_real_models(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) @@ -313,7 +313,7 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl generation_config.rng_seed = 0 generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) @@ -337,12 +337,12 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") outputs = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) @@ -364,12 +364,12 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") outputs = pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) @@ -392,12 +392,12 @@ def test_post_oom_health(tmp_path, sampling_config): scheduler_config.num_kv_blocks = 10 generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) models_path : Path = tmp_path / model_id save_ov_model_from_optimum(model, hf_tokenizer, models_path) - pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix()), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert (len(output)) diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py new file mode 100644 index 0000000000..0c2a106d50 --- /dev/null +++ b/tests/python_tests/test_tokenizer.py @@ -0,0 +1,360 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import numpy as np +from transformers import AutoTokenizer +from typing import Dict, Tuple, List +import openvino_genai +import json + +from ov_genai_test_utils import ( + get_models_list, + get_chat_models_list, + read_model, + model_tmp_path +) + + +def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): + # load Tokenizer where all configs are cleared. + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return openvino_genai.Tokenizer(temp_path) + + +def get_chat_templates(): + # Returns chat templates saved in tokenizer_configs.py, + # but skips some models that currently are not processed correctly. + + skipped_models = { + # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. + # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads. + "openchat/openchat-3.5-0106", + + # These models fail even on HF so no need to check if applying chat matches. + "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", + "codellama/CodeLlama-34b-Instruct-hf", + "deepseek-ai/deepseek-math-7b-rl", + "allenai/tulu-2-7b", + "alexsobolev/IcaroLM", + "tokyotech-llm/Swallow-7b-instruct-v0.1", + "bofenghuang/vigogne-2-7b-chat", + "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", + "AliAbdelrasheed/maqa_llama_4bit", + "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored", + + # TODO: Need to support chat templates in more models: CVS-145963 + # Either ov_genai is unable to parse chat_template or results do not match with HF. + "meta-llama/Meta-Llama-3-8B-Instruct", + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp + "mosaicml/mpt-30b-chat", + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.5-0106", + "casperhansen/llama-3-70b-instruct-awq", + "TheBloke/deepseek-coder-33B-instruct-GPTQ", + "AI-Sweden-Models/gpt-sw3-356m-instruct", + "google/gemma-7b-it", + "THUDM/cogvlm2-llama3-chat-19B", + "KnutJaegersberg/internlm-20b-llama", + "maywell/Synatra-Mixtral-8x7B", + "MediaTek-Research/Breeze-7B-Instruct-v1_0", + "bofenghuang/vigostral-7b-chat", + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.6-8b-20240522", + "tenyx/TenyxChat-7B-v1", + "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", + "yam-peleg/Hebrew-Gemma-11B-V2", + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError + "nlpai-lab/KULLM3", + "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError + "MLP-KTLim/llama-3-Korean-Bllossom-8B", + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp + "codellama/CodeLlama-70b-Instruct-hf", + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp + "BramVanroy/Llama-2-13b-chat-dutch" + } + + from tokenizer_configs import get_tokenizer_configs + return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] + + +prompts = [ + 'table is made of', + '你好! 你好嗎?', + 'Alan Turing was a', + 'The Sun is yellow because', + ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +@pytest.mark.nightly +def test_encode(model_descr, prompt): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + ov_tokenizer = ov_pipe.get_tokenizer() + + encoded_ov = ov_tokenizer.encode(prompt).input_ids.data + if isinstance(prompt, list): + encoded_hf = hf_tokenizer.batch_encode_plus(prompt)['input_ids'] + for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + encoded_hf = hf_tokenizer.encode(prompt) + assert np.all(encoded_hf == encoded_ov[0]) + + +encoded_prompts = [ + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], + + # chineze characters + [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], + + # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token + [3113, 264, 364, 267], + + # batched tokens + [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("encoded_prompt", encoded_prompts) +@pytest.mark.precommit +def test_decode(model_descr, encoded_prompt): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(model_descr) + ov_tokenizer = ov_pipe.get_tokenizer() + decoded_ov = ov_tokenizer.decode(encoded_prompt) + + if isinstance(encoded_prompt[0], list): + decoded_hf = hf_tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) + for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + decoded_hf = hf_tokenizer.decode(encoded_prompt, skip_special_tokens=True) + assert decoded_hf == decoded_ov + + +conversation = [ + {'role': 'user', 'content': '1+1='}, + {'role': 'assistant', 'content': '1 + 1 = 2'}, + {'role': 'user', 'content': 'What is the previous answer?'}, + {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. Please ask me your next question.'}, + {'role': 'user', 'content': 'Why is the sun yellow?'}, + {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, + {'role': 'user', 'content': 'What was my first question?'}, +] +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize('chat_config', get_chat_templates()) +def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): + tokenizer_config = chat_config[1] + + # Will load openvino_model for tiny-random-phi as a placeholder + # but indeed only Tokenizer and apply_chat_template will be tested. + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(get_models_list()[0]) + + hf_full_history_str = hf_tokenizer.apply_chat_template(conversation, + add_generation_prompt=False, + tokenize=False, + **tokenizer_config) + + ov_tokenizer = load_genai_tokenizer_with_configs([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + ov_tokenizer.set_chat_template(tokenizer_config['chat_template']) + ov_full_history_str = ov_tokenizer.apply_chat_template(conversation, add_generation_prompt=False) + + if ov_full_history_str != hf_full_history_str: + print(f'hf reference: {hf_full_history_str}') + print(f'ov_genai out: {ov_full_history_str}') + assert ov_full_history_str == hf_full_history_str + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_set_chat_template(): + model_descr = get_chat_models_list()[0] + model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + + prompt = "how are you?" + dummy_conversation = [ + {'role': 'user', 'content': prompt}, + ] + + ov_tokenizer = ov_pipe.get_tokenizer() + identity_chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}" + + templated_prompt_inline = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False, chat_template=identity_chat_template) + + ov_tokenizer.set_chat_template(identity_chat_template) + templated_prompt = ov_tokenizer.apply_chat_template(dummy_conversation, add_generation_prompt=False) + + assert templated_prompt_inline == templated_prompt + assert prompt == templated_prompt + + +prompts = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?', + ['Why is the Sun yellow?'], + "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?", + "מחרוזת בדיקה", + "Multiline\nstring!\nWow!", +] +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("add_special_tokens", [True, False]) +@pytest.mark.parametrize("skip_special_tokens", [True, False]) +@pytest.mark.parametrize("prompt", prompts) +def test_encode_decode_with_special_tokens_option(add_special_tokens, skip_special_tokens, prompt): + import numpy as np + model_descr = get_chat_models_list()[0] + model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + ov_tokenzier = ov_pipe.get_tokenizer() + + # Calling encode with 'add_special_tokens' will set state flag. + ov_res = ov_tokenzier.encode(prompt, add_special_tokens=add_special_tokens).input_ids.data + hf_res = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] + assert np.all(ov_res == hf_res) + + # Decode with 'skip_special_tokens' + decoded_genai = ov_tokenzier.decode(ov_res, skip_special_tokens=skip_special_tokens)[0] + decoded_hf = hf_tokenizer.decode(hf_res[0], skip_special_tokens=skip_special_tokens) + assert decoded_genai == decoded_hf + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_from_config_json(model_tmp_path): + # test when there is an available config.json + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + tok = load_genai_tokenizer_with_configs([(config_json, "config.json")], model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path): + # test with special_tokens_map + special_tokens_map_json = { + "pad_token": {"content": ""}, + "bos_token": {"content": ""}, + "eos_token": {"content": ""}, + } + tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], model_tmp_path[1]) + assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] + assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] + assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") +def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path): + # special_tokens_map is not available + # but tokenize_config.json exists + # will load both string and integer representations + tok_config_json = { + "added_tokens_decoder": { + "422": {"content": ""}, + "37": {"content": ""}, + "42": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + + tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + assert tok.get_pad_token_id() == 422 + assert tok.get_bos_token_id() == 37 + assert tok.get_eos_token_id() == 42 + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_from_tokenizer_config_and_config_json(model_tmp_path): + # both config.json is available and tokenizer_config.json available + # check that it does not read int values from tokenizer_config.json if they are in config.json + tok_config_json = { + "added_tokens_decoder": { + # integers differ from config.json to check they don't override config.json + "777": {"content": ""}, + "888": {"content": ""}, + "656": {"content": ""}, + }, + "pad_token": "", + "bos_token": "", + "eos_token": "", + } + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + configs = [ + (tok_config_json, "tokenizer_config.json"), + (config_json, "config.json") + ] + tok = load_genai_tokenizer_with_configs(configs, model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.xfail( + raises=AssertionError, + reason="CVS-143410 ov tokenizer should be aligned with hf", + strict=False, +) +def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model_tmp_path): + # only string representation is provided, find token integers by inference + model_id, temp_path = model_tmp_path + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + special_tokens_map_json = {} + token_str_int_map = {} + special_token_names = ['pad_token', 'bos_token', 'eos_token'] + for token_str in special_token_names: + if hasattr(tokenizer, token_str): + token_val = getattr(tokenizer, token_str) + special_tokens_map_json.update({token_str: {"content": token_val}}) + token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] + token_str_int_map.update({token_str: token_id}) + + # since only string representations are present in the json will try to get by inference + tok = load_genai_tokenizer_with_configs([(special_tokens_map_json, "special_tokens_map.json")], temp_path) + + # check ids inferred correctly for special tokens existing if HF tokenizer + if 'pad_token' in token_str_int_map: + assert tok.get_pad_token_id() == token_str_int_map['pad_token'] + if 'bos_token' in token_str_int_map: + assert tok.get_bos_token_id() == token_str_int_map['bos_token'] + if 'eos_token' in token_str_int_map: + assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py index 1450ef1f2e..aa78666e32 100644 --- a/tests/python_tests/test_whisper_generate_api.py +++ b/tests/python_tests/test_whisper_generate_api.py @@ -6,7 +6,6 @@ import pytest import openvino_tokenizers import openvino -from ov_genai_test_utils import get_whisper_models_list import datasets from transformers import WhisperProcessor, pipeline, AutoTokenizer from optimum.intel.openvino import OVModelForSpeechSeq2Seq @@ -15,6 +14,8 @@ import time import typing import numpy as np +import os +import pathlib @pytest.fixture(scope="class", autouse=True) def run_gc_after_test(): @@ -25,6 +26,34 @@ def run_gc_after_test(): yield gc.collect() + +def get_whisper_models_list(tiny_only=False, multilingual=False, en_only=False): + precommit_models = [ + "openai/whisper-tiny", + "openai/whisper-tiny.en", + "distil-whisper/distil-small.en", + ] + if multilingual: + precommit_models = ["openai/whisper-tiny"] + if en_only: + precommit_models = ["openai/whisper-tiny.en", "distil-whisper/distil-small.en"] + if tiny_only: + precommit_models = ["openai/whisper-tiny"] + + nightly_models = [] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + if pytest.selected_model_ids: + model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache(3) diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py index 45d60f998d..2b51dc2b0d 100644 --- a/tests/python_tests/tokenizer_configs.py +++ b/tests/python_tests/tokenizer_configs.py @@ -2,1011 +2,1011 @@ def get_tokenizer_configs(): return { "meta-llama/Meta-Llama-3-8B-Instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "TheBloke/Mistral-7B-OpenOrca-GPTQ": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|im_end|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" }, "upstage/SOLAR-10.7B-Instruct-v1.0": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}" }, "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}" }, "Qwen/Qwen1.5-0.5B": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "<|endoftext|>", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "Felladrin/Llama-68M-Chat-v1": { - "bos_token": "<|im_start|>", - "eos_token": "<|im_end|>", - "pad_token": "<|im_end|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|im_start|>", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "databricks/dbrx-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|pad|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}" }, "speakleash/Bielik-7B-Instruct-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}" }, "internlm/internlm2-chat-7b": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "Qwen/Qwen2-7B-Instruct": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "codellama/CodeLlama-34b-Instruct-hf": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}" }, "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": { - "bos_token": None, - "eos_token": "<|end|>", - "pad_token": "<|pad|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}" + "bos_token": None, + "eos_token": "<|end|>", + "pad_token": "<|pad|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}" }, "mosaicml/mpt-30b-chat": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}" }, "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "deepseek-ai/deepseek-coder-6.7b-instruct": { - "bos_token": { - "__type": "AddedToken", - "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "<|EOT|>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<|EOT|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" }, "deepseek-ai/deepseek-math-7b-rl": { - "bos_token": { - "__type": "AddedToken", - "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": { - "__type": "AddedToken", - "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" }, "FINGU-AI/FinguAI-Chat-v1": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "allenai/tulu-2-7b": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" }, "maldv/winter-garden-7b-alpha": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}" }, "mlabonne/NeuralMonarch-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" }, "meta-llama/Llama-2-7b-chat-hf": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" }, "GritLM/GritLM-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" }, "ishorn5/RTLCoder-Deepseek-v1.1": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" }, "jondurbin/bagel-34b-v0.2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}" }, "openchat/openchat-3.5-0106": { - "bos_token": "", - "eos_token": "<|end_of_turn|>", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}" + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}" }, "mobiuslabsgmbh/aanaphi2-v0.1": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "[PAD]", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "[PAD]", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}" }, "typeof/mistral-60m": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" }, "turboderp/Cat-Llama-3-70B-instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|im_end|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "saltlux/Ko-Llama3-Luxia-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}" }, "h2oai/h2o-danube2-1.8b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" }, "abhishek/autotrain-llama3-70b-orpo-v1": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}" }, "casperhansen/llama-3-70b-instruct-awq": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" }, "01-ai/Yi-1.5-34B-Chat": { - "bos_token": "<|startoftext|>", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" }, "allenai/OLMo-7B-Instruct": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|padding|>", - "unk_token": None, - "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": None, + "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" }, "TheBloke/deepseek-coder-33B-instruct-GPTQ": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<|EOT|>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" }, "cognitivecomputations/dolphin-2.8-mistral-7b-v02": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "alexsobolev/IcaroLM": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "tokyotech-llm/Swallow-7b-instruct-v0.1": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" }, "instructlab/merlinite-7b-lab": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|pad|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}" }, "microsoft/Phi-3-medium-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|placeholder6|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" }, "katuni4ka/tiny-random-phi3": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" }, "microsoft/Phi-3-mini-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|placeholder6|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" }, "VAGOsolutions/SauerkrautLM-Qwen-32b": { - "bos_token": None, - "eos_token": "<|im_end|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" }, "AI-Sweden-Models/gpt-sw3-356m-instruct": { - "bos_token": None, - "eos_token": None, - "pad_token": None, - "unk_token": None, - "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:" + "bos_token": None, + "eos_token": None, + "pad_token": None, + "unk_token": None, + "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:" }, "google/gemma-7b-it": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" }, "ise-uiuc/Magicoder-S-DS-6.7B": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}" }, "Deci/DeciLM-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}" }, "katuni4ka/tiny-random-minicpm": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}" }, "UnicomLLM/Unichat-llama3-Chinese-8B-28K": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}" }, "RLHFlow/LLaMA3-SFT": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}" }, "bofenghuang/vigogne-2-7b-chat": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": False, - "rstrip": False, - "single_word": False - }, - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}" }, "aisingapore/sea-lion-7b-instruct": { - "bos_token": None, - "eos_token": "<|endoftext|>", - "pad_token": "<|padding|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}" + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}" }, "microsoft/Phi-3-small-8k-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" }, "THUDM/cogvlm2-llama3-chat-19B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}" }, "tiiuae/falcon-11B": { - "bos_token": ">>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}" + "bos_token": ">>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}" }, "Mihaiii/Pallas-0.5": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}" }, "prithivida/Asimov-7B-v2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}" }, "dreamgen/opus-v1.2-7b": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}" }, "KnutJaegersberg/internlm-20b-llama": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}" }, "alpindale/WizardLM-2-8x22B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' }}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}" }, "yentinglin/Taiwan-LLM-7B-v2.0-base": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}" }, "maywell/Synatra-Mixtral-8x7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}" }, "MediaTek-Research/Breeze-7B-Instruct-v1_0": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "MTSAIR/multi_verse_model": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}" }, "bofenghuang/vigostral-7b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" }, "SeaLLMs/SeaLLM-7B-v2.5": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "qnguyen3/Master-Yi-9B": { - "bos_token": "<|startoftext|>", - "eos_token": "<|im_end|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" }, "meetkai/functionary-small-v2.5": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "h2oai/h2o-danube-1.8b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" }, "TheBloke/CodeLlama-70B-Instruct-AWQ": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" }, "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" }, "ibm-granite/granite-8b-code-instruct": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" }, "dicta-il/dictalm2.0-instruct": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "nvidia/Llama3-ChatQA-1.5-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}" }, "openchat/openchat-3.6-8b-20240522": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}" }, "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": { - "bos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "eos_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "pad_token": None, - "unk_token": { - "__type": "AddedToken", - "content": "", - "lstrip": False, - "normalized": True, - "rstrip": False, - "single_word": False - }, - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}" + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}" }, "tenyx/TenyxChat-7B-v1": { - "bos_token": "", - "eos_token": "<|end_of_turn|>", - "pad_token": "<|end_of_turn|>", - "unk_token": "", - "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": "<|end_of_turn|>", + "unk_token": "", + "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}" }, "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}" }, "SeaLLMs/SeaLLM-7B-v2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + ''}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + ''}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" }, "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": { - "bos_token": "", - "eos_token": "<|im_end|>", - "pad_token": "<|im_end|>", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}" }, "vaiv/llamion-14b-chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}" }, "yam-peleg/Hebrew-Gemma-11B-V2": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" }, "shenzhi-wang/Llama3-8B-Chinese-Chat": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "ericzzz/falcon-rw-1b-chat": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}" }, "NLPark/AnFeng_v3_Avocet": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}" }, "microsoft/Phi-3-vision-128k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" }, "jphme/em_german_leo_mistral": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}" }, "nlpai-lab/KULLM3": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" }, "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}" }, "MediaTek-Research/Breeze-7B-Instruct-v0_1": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "microsoft/DialoGPT-large": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" }, "meta-llama/Meta-Llama-Guard-2-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n\n\n\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n\n\n\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}" }, "chinoll/Yi-6b-200k-dpo": { - "bos_token": "<|startoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}" }, "shanchen/llama3-8B-slerp-biomed-chat-chinese": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "MLP-KTLim/llama-3-Korean-Bllossom-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "UnfilteredAI/UNfilteredAI-1B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}" }, "abacusai/Smaug-Mixtral-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" }, "ProbeMedicalYonseiMAILab/medllama3-v20": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}" }, "vinai/PhoGPT-4B-Chat": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}" }, "lucyknada/microsoft_WizardLM-2-7B": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}" }, "bigcode/starcoder2-15b-instruct-v0.1": { - "bos_token": "<|endoftext|>", - "eos_token": "<|endoftext|>", - "pad_token": None, - "unk_token": "<|endoftext|>", - "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}" + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}" }, "AliAbdelrasheed/maqa_llama_4bit": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|reserved_special_token_250|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|reserved_special_token_250|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" }, "lightonai/alfred-40b-1023": { - "bos_token": None, - "eos_token": "", - "pad_token": None, - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'system' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'assistant' %}{{ '' + message['content'] + '' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '' }}{% endif %}{% endfor %}" + "bos_token": None, + "eos_token": "", + "pad_token": None, + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'system' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'assistant' %}{{ '' + message['content'] + '' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '' }}{% endif %}{% endfor %}" }, "aloobun/CosmicBun-8B": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}" }, "Undi95/Mixtral-8x7B-MoE-RP-Story": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n" }, "TIGER-Lab/MAmmoTH2-8B-Plus": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": "<|eot_id|>", - "unk_token": None, - "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}" }, "codellama/CodeLlama-70b-Instruct-hf": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" }, "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": { - "bos_token": "", - "eos_token": "", - "pad_token": "[control_768]", - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "[control_768]", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}" }, "gorilla-llm/gorilla-openfunctions-v2": { - "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", - "eos_token": "<|EOT|>", - "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", - "unk_token": None, - "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" }, "ghost-x/ghost-7b-alpha": { - "bos_token": "", - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" }, "winninghealth/WiNGPT2-Llama-3-8B-Chat": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|end_of_text|>", - "pad_token": "<|end_of_text|>", - "unk_token": None, - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a" + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a" }, "BramVanroy/Llama-2-13b-chat-dutch": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" }, "THUDM/chatglm3-6b": { - "bos_token": None, - "eos_token": "", - "pad_token": "", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}" + "bos_token": None, + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}" }, "microsoft/Phi-3-mini-4k-instruct": { - "bos_token": "", - "eos_token": "<|endoftext|>", - "pad_token": "<|endoftext|>", - "unk_token": "", - "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" }, "mistralai/Mistral-7B-Instruct-v0.1": { - "bos_token": "", - "eos_token": "", - "pad_token": None, - "unk_token": "", - "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" }, "meta-llama/Meta-Llama-3.1-8B-Instruct": { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>", - "pad_token": None, - "unk_token": None, - "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", } } From e8db2ef894267760e40cf3066110eadd72880a83 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:33:06 +0000 Subject: [PATCH 10/14] Bump diffusers from 0.31.0 to 0.32.1 (#1441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.31.0 to 0.32.1.
Release notes

Sourced from diffusers's releases.

v0.32.1

TorchAO Quantizer fixes

This patch release fixes a few bugs related to the TorchAO Quantizer introduced in v0.32.0.

  • Importing Diffusers would raise an error in PyTorch versions lower than 2.3.0. This should no longer be a problem.
  • Device Map does not work as expected when using the quantizer. We now raise an error if it is used. Support for using device maps with different quantization backends will be added in the near future.
  • Quantization was not performed due to faulty logic. This is now fixed and better tested.

Refer to our documentation to learn more about how to use different quantization backends.

All commits

Diffusers 0.32.0: New video pipelines, new image pipelines, new quantization backends, new training scripts, and more

https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f

This release took a while, but it has many exciting updates. It contains several new pipelines for image and video generation, new quantization backends, and more.

Going forward, to provide more transparency to the community about ongoing developments and releases in Diffusers, we will be making use of a roadmap tracker.

New Video Generation Pipelines 📹

Open video generation models are on the rise, and we’re pleased to provide comprehensive integration support for all of them. The following video pipelines are bundled in this release:

Check out this section to learn more about the fine-tuning options available for these new video models.

New Image Generation Pipelines

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=diffusers&package-manager=pip&previous-version=0.31.0&new-version=0.32.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- samples/export-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 797b680b9a..a589696beb 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -6,7 +6,7 @@ optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen -diffusers==0.31.0 # For image generation pipelines +diffusers==0.32.1 # For image generation pipelines timm==1.0.12 # For exporting InternVL2 torchvision # For visual language models transformers>=4.43 # For Whisper From 94547e9d3bb8afa1d64054db186a334dcf92d6be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:41:17 +0000 Subject: [PATCH 11/14] Bump diffusers from 0.31.0 to 0.32.1 in /tests/python_tests (#1442) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [diffusers](https://github.com/huggingface/diffusers) from 0.31.0 to 0.32.1.
Release notes

Sourced from diffusers's releases.

v0.32.1

TorchAO Quantizer fixes

This patch release fixes a few bugs related to the TorchAO Quantizer introduced in v0.32.0.

  • Importing Diffusers would raise an error in PyTorch versions lower than 2.3.0. This should no longer be a problem.
  • Device Map does not work as expected when using the quantizer. We now raise an error if it is used. Support for using device maps with different quantization backends will be added in the near future.
  • Quantization was not performed due to faulty logic. This is now fixed and better tested.

Refer to our documentation to learn more about how to use different quantization backends.

All commits

Diffusers 0.32.0: New video pipelines, new image pipelines, new quantization backends, new training scripts, and more

https://github.com/user-attachments/assets/34d5f7ca-8e33-4401-8109-5c245ce7595f

This release took a while, but it has many exciting updates. It contains several new pipelines for image and video generation, new quantization backends, and more.

Going forward, to provide more transparency to the community about ongoing developments and releases in Diffusers, we will be making use of a roadmap tracker.

New Video Generation Pipelines 📹

Open video generation models are on the rise, and we’re pleased to provide comprehensive integration support for all of them. The following video pipelines are bundled in this release:

Check out this section to learn more about the fine-tuning options available for these new video models.

New Image Generation Pipelines

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=diffusers&package-manager=pip&previous-version=0.31.0&new-version=0.32.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- tests/python_tests/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 00bffb6646..c2c7d634f5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -diffusers==0.31.0 +diffusers==0.32.1 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 From 8fe0ff595015bae822b4c2867372e219900c4421 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 26 Dec 2024 20:33:10 +0400 Subject: [PATCH 12/14] Added more FLUX supported models (#1444) --- src/docs/SUPPORTED_MODELS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 9762874596..44da29ced4 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -243,6 +243,8 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
  • Freepik/flux.1-lite-8B-alpha
  • black-forest-labs/FLUX.1-dev
  • shuttleai/shuttle-3-diffusion
  • +
  • shuttleai/shuttle-3.1-aesthetic
  • +
  • Shakker-Labs/AWPortrait-FL
  • From 82b44fab5b538ef9e11ff47fcd245f7885c1a25f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 27 Dec 2024 07:47:50 +0400 Subject: [PATCH 13/14] LLM tests restructuring (#1440) - Merged chat scenario tests to test_llm_pipeline.py - Created CB dedicated test_continuous_batching.py file with CB-specific tests (in addition to test_llm_pipeline.py, which cover basic LLM pipeline functionality) CVS-159921 --- .github/labeler.yml | 29 +- .github/workflows/linux.yml | 4 +- .github/workflows/mac.yml | 8 +- .github/workflows/windows.yml | 8 +- src/cpp/src/llm_pipeline.cpp | 12 +- tests/python_tests/common.py | 14 +- tests/python_tests/ov_genai_test_utils.py | 29 +- tests/python_tests/test_chat_generate_api.py | 118 -------- ...emption.py => test_continuous_batching.py} | 165 ++++++++++- ...mizations.py => test_kv_cache_eviction.py} | 4 +- ...t_generate_api.py => test_llm_pipeline.py} | 273 ++++++++++-------- .../python_tests/test_llm_pipeline_static.py | 2 +- tests/python_tests/test_sampling.py | 140 +++------ .../{test_vlm_api.py => test_vlm_pipeline.py} | 0 ...nerate_api.py => test_whisper_pipeline.py} | 0 15 files changed, 418 insertions(+), 388 deletions(-) delete mode 100644 tests/python_tests/test_chat_generate_api.py rename tests/python_tests/{test_preemption.py => test_continuous_batching.py} (62%) rename tests/python_tests/{test_cache_optimizations.py => test_kv_cache_eviction.py} (98%) rename tests/python_tests/{test_generate_api.py => test_llm_pipeline.py} (87%) rename tests/python_tests/{test_vlm_api.py => test_vlm_pipeline.py} (100%) rename tests/python_tests/{test_whisper_generate_api.py => test_whisper_pipeline.py} (100%) diff --git a/.github/labeler.yml b/.github/labeler.yml index c162f6aff4..f618bdb7fc 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -13,17 +13,20 @@ - 'src/python/py_tokenizer.cpp' - 'thirdparty/openvino_tokenizers' - 'tests/python_tests/tokenizer_configs.py' +- 'tests/python_tests/test_tokenizer.py' 'category: LLM': - 'src/cpp/include/openvino/genai/llm_pipeline.hpp' - 'src/cpp/src/llm_pipeline.cpp' +- 'src/cpp/src/lm_encoding.hpp' - 'src/cpp/src/lm_encoding.cpp' - 'src/cpp/src/llm_pipeline_base.hpp' - 'src/cpp/src/llm_pipeline_static.hpp' - 'src/cpp/src/llm_pipeline_static.cpp' +- 'src/cpp/src/text_callback_streamer.cpp' +- 'src/cpp/src/text_callback_streamer.hpp' - 'src/python/py_llm_pipeline.cpp' -- 'tests/python_tests/test_generate_api.py' -- 'tests/python_tests/test_chat_generate_api.py' +- 'tests/python_tests/test_llm_pipeline.py' 'category: sampling': - 'src/cpp/include/openvino/genai/generation_config.hpp' @@ -35,6 +38,7 @@ - 'tests/cpp/logit_filtering.cpp' - 'tests/cpp/generate_config.cpp' - 'tests/cpp/sampler.cpp' +- 'tests/python_tests/test_sampling.py' 'category: LoRA': - 'src/cpp/include/openvino/genai/lora_adapter.hpp' @@ -54,9 +58,12 @@ - 'src/cpp/include/openvino/genai/whisper_pipeline.hpp' - 'src/cpp/src/whisper/**/*' - 'src/cpp/src/whisper_generation_config.cpp' +- 'src/cpp/src/whisper_pipeline_base.hpp' - 'src/cpp/src/whisper_pipeline.cpp' +- 'src/cpp/src/whisper_pipeline_static.cpp' +- 'src/cpp/src/whisper_pipeline_static.hpp' - 'src/python/py_whisper_pipeline.cpp' -- 'tests/python_tests/test_whisper_generate_api.py' +- 'tests/python_tests/test_whisper_pipeline.py' 'category: Python API': - 'src/python/**/*' @@ -65,10 +72,14 @@ - 'src/include/openvino/genai/visual_language/**/*' - 'src/cpp/src/visual_language/**/*' - 'src/python/py_vlm_pipeline.cpp' -- 'tests/python_tests/test_vlm_api.py' +- 'tests/python_tests/test_vlm_pipeline.py' 'category: speculative decoding': - 'src/cpp/src/speculative_decoding/**/*' +- 'tests/cpp/speculative_decoding.cpp' + +'category: prompt lookup': +- 'src/cpp/src/prompt_lookup/**/*' 'category: continuous batching': - 'src/cpp/include/openvino/genai/cache_eviction.hpp' @@ -91,19 +102,19 @@ - 'src/cpp/src/generation_handle.cpp' - 'src/cpp/src/generation_stream.hpp' - 'src/cpp/src/model_runner.hpp' -- 'src/cpp/src/paged_attention_transformations.cpp' -- 'src/cpp/src/paged_attention_transformations.hpp' +- 'src/cpp/src/utils/paged_attention_transformations.cpp' +- 'src/cpp/src/utils/paged_attention_transformations.hpp' - 'src/cpp/src/scheduler.hpp' - 'src/cpp/src/sequence_group.cpp' - 'src/cpp/src/sequence_group.hpp' - 'src/cpp/src/timer.hpp' - 'src/python/py_continuous_batching_pipeline.cpp' -- 'tests/python_tests/test_cache_optimizations.py' -- 'tests/python_tests/test_preemption.py' -- 'tests/python_tests/test_sampling.py' +- 'tests/python_tests/test_continuous_batching.py' +- 'tests/python_tests/test_kv_cache_eviction.py' - 'tests/cpp/block_allocator.cpp' - 'tests/cpp/block_hash_store.cpp' - 'tests/cpp/block_manager.cpp' +- 'tests/cpp/cache_eviction.cpp' - 'tests/cpp/cache_manager.cpp' - 'tests/cpp/device_config.cpp' - 'tests/cpp/scheduler.cpp' diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 6c94a907ea..9b21491f9b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -268,9 +268,9 @@ jobs: matrix: test: - name: 'Whisper' - cmd: 'tests/python_tests/test_whisper_generate_api.py' + cmd: 'tests/python_tests/test_whisper_pipeline.py' - name: 'LLM & VLM' - cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_generate_api.py' + cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py' defaults: run: shell: bash diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index a9af13bc66..4d9b7f032b 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -178,7 +178,7 @@ jobs: if: | always() && (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') - timeout-minutes: 90 + timeout-minutes: 120 defaults: run: shell: bash @@ -235,7 +235,7 @@ jobs: python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels python -c "from openvino_genai import LLMPipeline" python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template" + python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template" genai_python_lib_whisper: name: OpenVINO genai extension whisper tests (cmake + wheel) @@ -290,7 +290,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -300,7 +300,7 @@ jobs: python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels python -c "from openvino_genai import LLMPipeline" python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_package: name: OpenVINO genai extension (install to OpenVINO package) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index f88bc4c6f3..fc63129281 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -245,7 +245,7 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template" + python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template" genai_python_lib_whisper: name: OpenVINO genai extension whisper tests (cmake + wheel) @@ -301,7 +301,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -310,7 +310,7 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_python_lib_vlm: name: OpenVINO genai VLM tests (cmake + wheel) @@ -366,7 +366,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_vlm_api.py + python -m pytest -v ./tests/python_tests/test_vlm_pipeline.py env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index be5ecf17fa..5e448fe88c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -703,8 +703,7 @@ std::pair split_model_descr(const ov::An ov::genai::LLMPipeline::LLMPipeline( const ov::InferRequest& request, const ov::genai::Tokenizer& tokenizer, - OptionalGenerationConfig generation_config -) { + OptionalGenerationConfig generation_config) { auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique(request, tokenizer, generation_config); auto stop_time = std::chrono::steady_clock::now(); @@ -715,8 +714,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& properties -){ + const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || @@ -735,8 +733,7 @@ ov::genai::LLMPipeline::LLMPipeline( ov::genai::LLMPipeline::LLMPipeline( const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& config -){ + const ov::AnyMap& config) { auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end() || @@ -759,8 +756,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& config, - const ov::genai::GenerationConfig& generation_config -){ + const ov::genai::GenerationConfig& generation_config) { auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config); auto start_time = std::chrono::steady_clock::now(); diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 7e3c075405..f940d272ed 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -364,18 +364,6 @@ def run_continuous_batching( return output -def read_models_list(file_name: str): - models = [] - with open(file_name) as f: - for model_name in f: - model_name = model_name.strip() - # skip comment in model scope file - if model_name.startswith('#'): - continue - models.append(model_name) - return models - - def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) @@ -447,7 +435,7 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st assert ref_text == ov_text -def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): +def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 87b2147bcd..3fc89cb8a7 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -32,7 +32,7 @@ def get_models_list(): "HuggingFaceH4/zephyr-7b-beta", "ikala/redpajama-3b-chat", "mistralai/Mistral-7B-v0.1", - + # "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token # "google/gemma-2b-it", # Cannot be downloaded without access token. # "google/gemma-7b-it", # Cannot be downloaded without access token. @@ -49,7 +49,7 @@ def get_models_list(): model_ids = precommit_models else: model_ids = nightly_models - + if pytest.selected_model_ids: model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] # pytest.set_trace() @@ -82,30 +82,30 @@ def get_chat_models_list(): @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): model_id, path = params - + from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) if (path / "openvino_model.xml").exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, compile=False, device='CPU') else: - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - + # to store tokenizer config jsons with special tokens hf_tokenizer.save_pretrained(path) - - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, + + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, compile=False, device='CPU', load_in_8bit=False) opt_model.generation_config.save_pretrained(path) opt_model.config.save_pretrained(path) opt_model.save_pretrained(path) - + return ( model_id, path, @@ -116,11 +116,11 @@ def read_model(params, **tokenizer_kwargs): # in OpenVINO GenAI this parameter is called stop_criteria, -# while in HF it's called early_stopping. +# while in HF it's called early_stopping. # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" STOP_CRITERIA_MAP = { - ov_genai.StopCriteria.NEVER: "never", - ov_genai.StopCriteria.EARLY: True, + ov_genai.StopCriteria.NEVER: "never", + ov_genai.StopCriteria.EARLY: True, ov_genai.StopCriteria.HEURISTIC: False } @@ -137,6 +137,7 @@ def model_tmp_path(tmpdir_factory): shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) + @pytest.fixture(scope="module") def model_tokenizers_path_tmp_path(tmpdir_factory): model_id, path, _, _, _ = read_model(get_models_list()[0]) @@ -146,7 +147,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory): # There was no easy way to add tokens to IR in tests, so we remove them # and set tokens in configs and to check if they are read and validated correctly. import openvino as ov - + # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: for src_file in path.glob(pattern): @@ -162,7 +163,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory): ov_model.set_rt_info("eos_token_id", "") ov_model.set_rt_info("chat_template", "") ov.save_model(ov_model, str(temp_path / src_file.name)) - + if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: continue if src_file.is_file(): diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py deleted file mode 100644 index 07b4f7c15f..0000000000 --- a/tests/python_tests/test_chat_generate_api.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import openvino_genai as ov_genai -import pytest -from typing import Dict, Tuple - -from ov_genai_test_utils import ( - get_chat_models_list, - read_model, - get_continuous_batching, -) - - -generation_configs = [ - dict(do_sample=False, max_new_tokens=20), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) -] - - -questions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?' -] - - -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_chat_compare_with_HF(model_descr, generation_config: Dict): - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - - # Will set add_special_tokens=False inside pipeline when start_chat() is called. - model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - - pipe.start_chat() - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - answer_ov = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - pipe.finish_chat() - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - - assert chat_history_ov == chat_history_hf - - -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): - # compares with HF when history in ov_genai is save as a text - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - ov_tokenizer = ov_pipe.get_tokenizer() - - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config) - answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer_ov = ov_pipe.generate(chat_prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - - assert chat_history_ov == chat_history_hf - - -@pytest.mark.parametrize("generation_config", generation_configs[1:]) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): - model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - cb_pipe = get_continuous_batching(path) - - ov_stateful_pipe.start_chat() - cb_pipe.start_chat() - - for question in questions: - generated = cb_pipe.generate(question, **generation_config) - reference = ov_stateful_pipe.generate(question, **generation_config) - assert generated == reference - - # Test that finish_chat() doesn't fail just in case. - cb_pipe.finish_chat() diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_continuous_batching.py similarity index 62% rename from tests/python_tests/test_preemption.py rename to tests/python_tests/test_continuous_batching.py index 7c648e73dc..3a1e9fa092 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_continuous_batching.py @@ -1,15 +1,172 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os import pytest +import math +from typing import Dict + +from pathlib import Path +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer -from openvino_genai import GenerationConfig from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ + get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts +from ov_genai_test_utils import ( + get_chat_models_list, + read_model, + get_continuous_batching, +) + +def read_models_list(file_name: str): + models = [] + with open(file_name) as f: + for model_name in f: + model_name = model_name.strip() + # skip comment in model scope file + if model_name.startswith('#'): + continue + models.append(model_name) + return models + +# +# e2e tests on random and real models +# + +@pytest.mark.precommit +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +def test_e2e_precommit(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + + +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +def test_e2e_nightly(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + + +@pytest.mark.real_models +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +def test_e2e_real_models(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + +# +# Comparison with stateful +# TODO: remove these tests once test_llm_pipeline.py are generalized and parametrized to test both Stateful and PA paths +# + +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) +] +batched_prompts = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. +@pytest.mark.precommit +def test_continuous_batching_vs_stateful(prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. + for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + + +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +def test_cb_streamer_vs_return_vs_stateful(prompt): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb_pipe = get_continuous_batching(path) + streamed = [] + generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = ov_pipe.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference + + +generation_configs = [ + dict(do_sample=False, max_new_tokens=20), + dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) +] +questions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] +@pytest.mark.parametrize("generation_config", generation_configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_scenario_vs_stateful(model_descr, generation_config: Dict): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb_pipe = get_continuous_batching(path) + + ov_pipe.start_chat() + cb_pipe.start_chat() + + for question in questions: + generated = cb_pipe.generate(question, **generation_config) + reference = ov_pipe.generate(question, **generation_config) + assert generated == reference + + # Test that finish_chat() doesn't fail just in case. + cb_pipe.finish_chat() + +# +# Stress tests to check OOM case +# + +@pytest.mark.precommit +@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) +def test_post_oom_health(tmp_path, sampling_config): + generation_config = sampling_config + generation_config.ignore_eos = True + generation_config.max_new_tokens = 1000000 + + scheduler_config = get_scheduler_config() + scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly + + model_id : str = "facebook/opt-125m" + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + + cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") + + # First run should return incomplete response + output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) + assert (len(output)) + assert (len(output[0].m_generation_ids)) + + # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM + output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) + assert (len(output)) + assert (len(output[0].m_generation_ids)) + +# +# Pre-emption +# def get_greedy_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() @@ -36,7 +193,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): - run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) multinomial_params = RandomSamplingTestStruct( @@ -175,4 +332,4 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file + generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_kv_cache_eviction.py similarity index 98% rename from tests/python_tests/test_cache_optimizations.py rename to tests/python_tests/test_kv_cache_eviction.py index d89697ba42..bbd0da6bb2 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_test_pipeline +from common import TESTS_ROOT, run_continuous_batching_pipeline_test def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -168,5 +168,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): - run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1]) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_llm_pipeline.py similarity index 87% rename from tests/python_tests/test_generate_api.py rename to tests/python_tests/test_llm_pipeline.py index 824a3cca26..9f00996a58 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -12,11 +12,12 @@ import torch import math from ov_genai_test_utils import ( - get_models_list, - read_model, + get_models_list, + read_model, load_genai_pipe_with_configs, - model_tmp_path, - STOP_CRITERIA_MAP, + get_chat_models_list, + model_tmp_path, + STOP_CRITERIA_MAP, get_continuous_batching, ) @@ -26,12 +27,12 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 config['num_return_sequences'] = num_beams - + if not isinstance(prompts, list): prompts = [prompts] if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. @@ -72,7 +73,7 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. @@ -101,9 +102,9 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, def run_hf_ov_genai_comparison_encoded_inputs( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, + model_descr, + generation_config: Dict, + input_ids: np.ndarray, attention_mask: Optional[np.array] = None ): device = 'CPU' @@ -112,18 +113,18 @@ def run_hf_ov_genai_comparison_encoded_inputs( config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty - + generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) - + if attention_mask is not None: inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) @@ -138,6 +139,9 @@ def run_hf_ov_genai_comparison_encoded_inputs( ov_res = np.array(ov_output.tokens, dtype=np.int64) assert np.all(ov_res == hf_res) +# +# e2e work +# test_cases = [ (dict(max_new_tokens=20), 'table is made of'), @@ -197,14 +201,13 @@ def test_batch_text_input(model_descr, generation_config, prompts): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, - max_new_tokens, diversity_penalty, prompt): +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -215,17 +218,17 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): +def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return invalid out with sentence # while genai ends sentence with if (stop_criteria == StopCriteria.EARLY): pytest.skip() generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, stop_criteria=stop_criteria, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -241,11 +244,11 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, max_new_tokens, prompt): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -283,6 +286,72 @@ def test_greedy_repetition_penalty(model_descr, prompt): assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' ')))) +@pytest.mark.precommit +@pytest.mark.nightly +def test_batch_size_switch(): + ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] + ov_pipe.generate(["a"], max_new_tokens=2) + ov_pipe.generate(["1", "2"], max_new_tokens=2) + ov_pipe.generate(["a"], max_new_tokens=2) + +# +# Chat scenario +# + +generation_configs = [ + dict(do_sample=False, max_new_tokens=20), + dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) +] + + +questions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] + + +@pytest.mark.parametrize("generation_config", generation_configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_with_HF(model_descr, generation_config: Dict): + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # Will set add_special_tokens=False inside pipeline when start_chat() is called. + model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + + ov_pipe.start_chat() + for prompt in questions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = opt_model.generate(**tokenized, **generation_config) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + answer_ov = ov_pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + ov_pipe.finish_chat() + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + + assert chat_history_ov == chat_history_hf + + +# +# Streaming with callback +# + def user_defined_callback(subword): print(subword) @@ -422,11 +491,14 @@ def test_operator_with_streamer_kwargs_batch_throws(): with pytest.raises(RuntimeError): ov_pipe('', num_beams=2, streamer=printer) +# +# Tests on generation configs (invalid cases and handling within LLMPipeline) +# invalid_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests - # dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + # dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp @@ -446,7 +518,7 @@ def test_invalid_generation_configs_throws(model_tmp_path, generation_config): @pytest.mark.precommit @pytest.mark.nightly -def test_valid_configs(model_tmp_path): +def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path): model_id, temp_path = model_tmp_path ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) @@ -454,6 +526,8 @@ def test_valid_configs(model_tmp_path): config.do_sample = True # no eos_token_id but it's loaded from config.json ov_pipe.set_generation_config(config) + assert 37 == ov_pipe.get_generation_config().eos_token_id + invalid_py_configs = [ dict(num_beam_groups=3, num_beams=15, do_sample=True), @@ -478,6 +552,9 @@ def test_python_generation_config_validation_throws(model_tmp_path, generation_c with pytest.raises(return_exception_type): ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) +# +# Work with Unicode in Python API +# @pytest.mark.precommit @pytest.mark.nightly @@ -512,69 +589,9 @@ def test_unicode_pybind_decoding_one_string_streamer(): ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) assert '�' == res_str[-1] - -@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") -def test_left_pad(): - # test left pad tokenizer post processing implementation - prompts = [ - "The Sun is yellow because", - "The Sun is yellow because [force left pad tokens]" - ] - models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) - - config = { - "max_new_tokens": 20, - "num_beam_groups": 2, - "num_beams": 2, - "num_return_sequences": 2, - "do_sample": False, - "diversity_penalty": 1.0, - # phi 1_5 has no eos_token_id in model configuration - # ov genai will detect eos_token_id from tokenizer config - # hf implementation doesn't fetch it from tokenizer config and defaults to None - # align ov genai and hf by setting eos_token_id explicitly - "eos_token_id": 50256, - } - - models[2].pad_token = models[2].eos_token - run_hf_ov_genai_comparison_batched(models, config, prompts) - - -@pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. -@pytest.mark.precommit -def test_continuous_batching_vs_stateful(prompt, generation_config): - model_id, path, tokenizer, model, stateful = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) - cb = get_continuous_batching(path) - generated = cb.generate(prompt, **generation_config) - reference = stateful.generate(prompt, **generation_config) - assert generated.texts == reference.texts - if 1 != generation_config.get("num_return_sequences", 1): - # Stateful puts zeroes to generated.scores. Don't compare them. - for gen, ref in zip(generated.scores, reference.scores): - assert math.isclose(gen, ref, abs_tol=0.0003) - - -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.precommit -def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) - cb_pipe = get_continuous_batching(path) - streamed = [] - generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) - reference = ov_pipe.generate(prompt, max_new_tokens=20) - assert generated == "".join(streamed) - assert "".join(streamed) == reference - +# +# Perf metrics +# def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr @@ -582,12 +599,13 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty + return ov_pipe.generate([prompt], **config).perf_metrics @@ -598,20 +616,21 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly +@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.") def test_perf_metrics(model_descr, generation_config, prompt): import time start_time = time.perf_counter() perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) total_time = (time.perf_counter() - start_time) * 1000 - + # Check that load time is adequate. load_time = perf_metrics.get_load_time() - assert load_time > 0 and load_time < 1000.0 - + assert load_time > 0 and load_time < 1000.0 + # Check that num input and generated tokens are adequate. num_generated_tokens = perf_metrics.get_num_generated_tokens() - assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] - + assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] + num_input_tokens = perf_metrics.get_num_input_tokens() assert num_input_tokens > 0 and num_input_tokens <= len(prompt) @@ -622,7 +641,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): raw_metrics = perf_metrics.raw_metrics durations = np.array(raw_metrics.m_durations) / 1000 # Check that prefill is not included in durations for TPOT calculation. - # For the very long prompt prefill is slow and TTFT is much larger than any other token genration duration. + # For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration. assert np.all(mean_ttft > durations * 2) mean_tpot, std_tpot = perf_metrics.get_tpot() @@ -632,7 +651,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): mean_throughput, std_throughput = perf_metrics.get_throughput() assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std) assert mean_throughput > 0 and mean_throughput < 20000.0 - + mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration() assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std) assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time @@ -647,7 +666,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std) assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration assert std_detok_duration == 0 - + # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics assert np.allclose(mean_tpot, np.mean(durations)) assert np.allclose(std_tpot, np.std(durations)) @@ -668,15 +687,11 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert len(raw_metrics.m_batch_sizes) > 0 assert len(raw_metrics.m_durations) > 0 +# +# Misc +# -@pytest.mark.precommit -@pytest.mark.nightly -def test_batch_switch(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - ov_pipe.generate(["a"], max_new_tokens=2) - ov_pipe.generate(["1", "2"], max_new_tokens=2) - - +# TODO: move to test_sampling.py @pytest.mark.precommit @pytest.mark.nightly def test_stop_token_ids(): @@ -691,6 +706,7 @@ def test_stop_token_ids(): assert 9935 in res.tokens[0] +# TODO: move to test_sampling.py @pytest.mark.precommit @pytest.mark.nightly def test_stop_strings(): @@ -701,3 +717,34 @@ def test_stop_strings(): stop_strings={"ignored", "боль"} ) assert "боль" not in res + + +# TODO: move this test to test_tokenizer.py +@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") +def test_left_pad(): + # test left pad tokenizer post processing implementation + prompts = [ + "The Sun is yellow because", + "The Sun is yellow because [force left pad tokens]" + ] + models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) + + config = { + "max_new_tokens": 20, + "num_beam_groups": 2, + "num_beams": 2, + "num_return_sequences": 2, + "do_sample": False, + "diversity_penalty": 1.0, + # phi 1_5 has no eos_token_id in model configuration + # ov genai will detect eos_token_id from tokenizer config + # hf implementation doesn't fetch it from tokenizer config and defaults to None + # align ov genai and hf by setting eos_token_id explicitly + "eos_token_id": 50256, + } + + models[2].pad_token = models[2].eos_token + run_hf_ov_genai_comparison_batched(models, config, prompts) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index cad8b0fea0..c3500d15ac 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -145,7 +145,7 @@ def test_chat_generation(model_descr): 'What was my first question?' ] - model_path = get_chat_models_lists()[0][1] + model_path = get_chat_models_list()[0][1] chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index fbcce76bf7..25ae9d8afa 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,13 +10,13 @@ from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List, TypedDict -from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ - generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ +from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ + get_greedy, get_beam_search, get_multinomial_temperature, \ get_greedy_with_penalties, get_multinomial_temperature, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ + get_greedy, get_greedy_with_min_and_max_tokens, \ get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ @@ -27,25 +27,9 @@ run_continuous_batching +# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) -def test_sampling_precommit(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.nightly -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) -def test_sampling_nightly(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - -@pytest.mark.real_models -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) -def test_real_models(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.precommit -def test_eos_beam_search(tmp_path): +def test_beam_search_has_eos_token_at_end(tmp_path): ''' Current test checks that in case of beam search, some generation results explicitly have EOS token at the end, which is aligned with HF @@ -61,8 +45,9 @@ def test_eos_beam_search(tmp_path): generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) +# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_eos_greedy(tmp_path): +def test_greedy_has_eos_token_at_end(tmp_path): ''' Current test checks that in case of gready, some generation results explicitly have EOS token at the end, which is aligned with HF: @@ -76,55 +61,44 @@ def test_eos_greedy(tmp_path): scheduler_config = get_scheduler_config() generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), - get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), - get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ], - ids=[ - "greedy", - "greedy_with_min_and_max_tokens", - "greedy_with_repetition_penalty", - "greedy_with_single_stop_string", - "greedy_with_multiple_stop_strings", - "greedy_with_multiple_stop_strings_no_match", - "beam", - "beam_search_min_and_max_tokens", - "beam_search_with_multiple_stop_strings_no_match", - "get_greedy_stop_strings_exclude_from_output", - "get_greedy_stop_strings_include_to_output", - "get_greedy_n_stop_strings_exclude_from_output", - "get_greedy_n_stop_strings_include_to_output" - ]) -def test_individual_generation_configs_deterministic(tmp_path, generation_config): - prompts = [ - "What is OpenVINO?", - ] +@pytest.mark.parametrize("generation_config", + [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), + get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()], + ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string", + "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens", + "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", "greedy_stop_strings_include_to_output", + "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"]) +def test_sampling_against_optimum(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + @pytest.mark.precommit @pytest.mark.xfail( raises=AssertionError, reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings(),], - ids=[ - "beam_search_with_single_stop_string", - "beam_search_with_multiple_stop_strings", - ]) +@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()], + ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"]) def test_beam_search_with_stop_string(tmp_path, generation_config): - prompts = [ - "What is OpenVINO?", - ] + prompts = [ "What is OpenVINO?" ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) +# TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF +# and merge this tests with 'test_sampling_against_optimum' by extending a list of generation configs + class PlatformsRefTexts(TypedDict, total=False): linux: List[List[str]] win32: List[List[str]] @@ -306,7 +280,7 @@ class RandomSamplingTestStruct: "multinomial_temperature_and_frequence_penalty", "greedy_with_penalties", "multinomial_max_and_min_token"]) -def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): +def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSamplingTestStruct): generation_config = test_struct.generation_config prompts = test_struct.prompts @@ -326,9 +300,10 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl @pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters]) +@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_without_completion(tmp_path, get_generation_config, max_num_batched_tokens): +def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens): generation_config = get_generation_config() generation_config.max_new_tokens = 0 generation_config.echo = True @@ -337,14 +312,14 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - outputs = pipe.generate(["What is OpenVINO?"], generation_configs) + outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) for output in outputs: assert(len(output.m_generation_ids)) @@ -353,9 +328,10 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche @pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters]) +@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_tokens): +def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens): generation_config = get_generation_config() generation_config.max_new_tokens = 10 generation_config.echo = True @@ -364,45 +340,17 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - outputs = pipe.generate(["What is OpenVINO?"], generation_configs) + cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) + for output in outputs: assert(len(output.m_generation_ids)) for sequence in output.m_generation_ids: assert(sequence.startswith("What is OpenVINO?")) assert(len(sequence) > len("What is OpenVINO?")) - - -@pytest.mark.precommit -@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()]) -def test_post_oom_health(tmp_path, sampling_config): - generation_config = sampling_config - generation_config.ignore_eos = True - generation_config.max_new_tokens = 1000000 - - scheduler_config = get_scheduler_config() - # Low cache size to trigger OOM quickly - scheduler_config.num_kv_blocks = 10 - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) - - pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") - # First run should return incomplete response - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert (len(output)) - assert(len(output[0].m_generation_ids)) - # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert (len(output)) - assert(len(output[0].m_generation_ids)) diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_pipeline.py similarity index 100% rename from tests/python_tests/test_vlm_api.py rename to tests/python_tests/test_vlm_pipeline.py diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_pipeline.py similarity index 100% rename from tests/python_tests/test_whisper_generate_api.py rename to tests/python_tests/test_whisper_pipeline.py From 842c99edb567a701c289677a34a3af87553054e0 Mon Sep 17 00:00:00 2001 From: Mang Guo Date: Fri, 27 Dec 2024 14:36:19 +0800 Subject: [PATCH 14/14] Support unfixed kv heads number (#1416) Fix decilm-7b-instruct benchmark test failure. The number heads per layer is not fixed in decilm-7b-instruct model, current code can not handle such case. JIRA ticket CVS-157864. Co-authored-by: Ilya Lavrenov --- src/cpp/src/cache_manager.hpp | 41 ++++++------- src/cpp/src/device_config.hpp | 61 ++++++++++++------- .../utils/paged_attention_transformations.cpp | 20 +++--- tests/cpp/cache_manager.cpp | 13 ++-- tests/cpp/device_config.cpp | 2 +- tests/cpp/scheduler.cpp | 2 +- 6 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 0c04823f4f..20d4c0c51c 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -46,8 +46,6 @@ class CacheManager { } OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size()); m_num_allocated_kv_blocks = num_kv_blocks; - ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); - ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); const std::string device_name = m_device_config.get_device(); @@ -56,6 +54,8 @@ class CacheManager { if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); @@ -104,6 +104,8 @@ class CacheManager { } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), @@ -142,30 +144,27 @@ class CacheManager { } void copy_blocks(const std::map>& block_copy_map) { - ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks); - ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks); - - ov::Coordinate key_src_start_roi(key_shape.size(), 0); - ov::Coordinate key_src_end_roi = key_shape; - ov::Coordinate key_dst_start_roi(key_shape.size(), 0); - ov::Coordinate key_dst_end_roi = key_shape; - - ov::Coordinate value_src_start_roi(value_shape.size(), 0); - ov::Coordinate value_src_end_roi = value_shape; - ov::Coordinate value_dst_start_roi(value_shape.size(), 0); - ov::Coordinate value_dst_end_roi = value_shape; - for (const auto & blocks_pair : block_copy_map) { size_t src_block_id = blocks_pair.first; - key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1; - value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1; - const std::list& dst_block_ids = blocks_pair.second; for (size_t dst_block_id : dst_block_ids) { - key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1; - value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1; - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); + ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); + ov::Coordinate key_src_start_roi(key_shape.size(), 0); + ov::Coordinate key_src_end_roi = key_shape; + ov::Coordinate key_dst_start_roi(key_shape.size(), 0); + ov::Coordinate key_dst_end_roi = key_shape; + + ov::Coordinate value_src_start_roi(value_shape.size(), 0); + ov::Coordinate value_src_end_roi = value_shape; + ov::Coordinate value_dst_start_roi(value_shape.size(), 0); + ov::Coordinate value_dst_end_roi = value_shape; + key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1; + value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1; + key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1; + value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1; + ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi); ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi); diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 371142701c..cc2e21b9a1 100644 --- a/src/cpp/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -12,8 +12,9 @@ namespace ov::genai { class DeviceConfig { ov::element::Type m_kv_cache_type; - ov::PartialShape m_key_cache_shape, m_value_cache_shape; - ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; + std::vector m_key_cache_shape, m_value_cache_shape; + std::vector m_num_kv_heads; + ov::Shape::value_type m_head_size, m_num_decoder_layers; size_t m_num_kv_blocks = 0; size_t m_block_size = 0; size_t m_cache_size = 0; @@ -88,11 +89,14 @@ class DeviceConfig { } } - void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) { - m_num_kv_heads = num_kv_heads; + void set_model_params(std::vector num_kv_heads, size_t head_size, size_t num_decoder_layers) { m_head_size = head_size; m_num_decoder_layers = num_decoder_layers; + m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end()); + m_key_cache_shape.reserve(m_num_decoder_layers); + m_value_cache_shape.reserve(m_num_decoder_layers); + if (m_device == "CPU") { // Scale, zero point and quantized data will be stored together. // The layout for per token per head: @@ -104,21 +108,32 @@ class DeviceConfig { } if (m_num_kv_blocks == 0 && m_cache_size > 0) { + size_t block_size = 0; size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; - m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size(); + } + m_num_kv_blocks = size_in_bytes / block_size; } - m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), - ov::Dimension(m_num_kv_heads), - ov::Dimension(m_block_size), - ov::Dimension(m_head_size)}; - - if (m_device.find("GPU") != std::string::npos) { - // Update key shape, as the key's shape is different from the value's shape - m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), - ov::Dimension(m_num_kv_heads), - ov::Dimension(m_head_size), - ov::Dimension(m_block_size)}; + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}); + + m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}); + + if (m_device.find("GPU") != std::string::npos) { + // Update key shape, as the key's shape is different from the value's shape + m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_head_size), + ov::Dimension(m_block_size)}); + } } } @@ -134,14 +149,14 @@ class DeviceConfig { return m_num_decoder_layers; } - ov::PartialShape get_key_cache_shape() const { + ov::PartialShape get_key_cache_shape(size_t id) const { OPENVINO_ASSERT(m_key_cache_shape.size()); - return m_key_cache_shape; + return m_key_cache_shape[id]; } - ov::PartialShape get_value_cache_shape() const { + ov::PartialShape get_value_cache_shape(size_t id) const { OPENVINO_ASSERT(m_value_cache_shape.size()); - return m_value_cache_shape; + return m_value_cache_shape[id]; } size_t get_num_kv_blocks() const { @@ -153,7 +168,11 @@ class DeviceConfig { } size_t get_block_size_in_bytes() const { - return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size(); + size_t block_size = 0; + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size(); + } + return block_size; } }; } diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index 4dedcf989a..f564be8f19 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -53,15 +53,21 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters"); ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape(); OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape); - size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length(); - + size_t head_size = k_shape[2].get_length(); + std::vector num_kv_heads(num_layers); + for (size_t idx = 0; idx < num_layers; idx++) { + size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length(); + num_kv_heads[idx] = num_heads; + } device_config.set_model_params(num_kv_heads, head_size, num_layers); - for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) { - it_k->second->set_element_type(device_config.get_cache_precision()); - it_v->second->set_element_type(device_config.get_cache_precision()); - it_k->second->set_partial_shape(device_config.get_key_cache_shape()); - it_v->second->set_partial_shape(device_config.get_value_cache_shape()); + for (size_t idx = 0; idx < num_layers; idx++) { + auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)]; + auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)]; + k->set_element_type(device_config.get_cache_precision()); + v->set_element_type(device_config.get_cache_precision()); + k->set_partial_shape(device_config.get_key_cache_shape(idx)); + v->set_partial_shape(device_config.get_value_cache_shape(idx)); } model->validate_nodes_and_infer_types(); diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 7f07980389..5dc848aba5 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -54,7 +54,8 @@ TEST(TestCacheManager, test_cache_size_param) { const std::string device = "CPU"; ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; - device_config.set_model_params(12, 64, num_decoder_layers); + std::vector num_kv_heads(12, 12); + device_config.set_model_params(num_kv_heads, 64, num_decoder_layers); ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared(device_config, request, core); @@ -76,7 +77,8 @@ TEST(TestCacheManager, test_kv_blocks_param) { const std::string device = "CPU"; ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; - device_config.set_model_params(12, 64, num_decoder_layers); + std::vector num_kv_heads(12, 12); + device_config.set_model_params(num_kv_heads, 64, num_decoder_layers); ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared(device_config, request, core); @@ -97,9 +99,12 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; size_t head_size = 64; - size_t num_kv_heads = 12; + std::vector num_kv_heads(12, 12); device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers); - size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + size_t block_size_in_bytes = 0; + for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) { + block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + } ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp index 0d7435818f..973648f637 100644 --- a/tests/cpp/device_config.cpp +++ b/tests/cpp/device_config.cpp @@ -18,7 +18,7 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) { const std::string device = "CPU"; size_t num_decoder_layers = 12; size_t head_size = 64, head_size_u8 = head_size + 8; - size_t num_kv_heads = 12; + std::vector num_kv_heads(12, 12); ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU"); device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index ea1720faa2..cc0b53a433 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -44,7 +44,7 @@ std::shared_ptr init_cache_manager(SchedulerConfig scheduler_confi size_t num_decoder_layers = 12; ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); size_t head_size = 64, head_size_u8 = head_size + 8; - size_t num_kv_heads = 12; + std::vector num_kv_heads(12, 12); ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); return std::make_shared(device_config, request, core);