diff --git a/.github/labeler.yml b/.github/labeler.yml index c162f6aff4..f618bdb7fc 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -13,17 +13,20 @@ - 'src/python/py_tokenizer.cpp' - 'thirdparty/openvino_tokenizers' - 'tests/python_tests/tokenizer_configs.py' +- 'tests/python_tests/test_tokenizer.py' 'category: LLM': - 'src/cpp/include/openvino/genai/llm_pipeline.hpp' - 'src/cpp/src/llm_pipeline.cpp' +- 'src/cpp/src/lm_encoding.hpp' - 'src/cpp/src/lm_encoding.cpp' - 'src/cpp/src/llm_pipeline_base.hpp' - 'src/cpp/src/llm_pipeline_static.hpp' - 'src/cpp/src/llm_pipeline_static.cpp' +- 'src/cpp/src/text_callback_streamer.cpp' +- 'src/cpp/src/text_callback_streamer.hpp' - 'src/python/py_llm_pipeline.cpp' -- 'tests/python_tests/test_generate_api.py' -- 'tests/python_tests/test_chat_generate_api.py' +- 'tests/python_tests/test_llm_pipeline.py' 'category: sampling': - 'src/cpp/include/openvino/genai/generation_config.hpp' @@ -35,6 +38,7 @@ - 'tests/cpp/logit_filtering.cpp' - 'tests/cpp/generate_config.cpp' - 'tests/cpp/sampler.cpp' +- 'tests/python_tests/test_sampling.py' 'category: LoRA': - 'src/cpp/include/openvino/genai/lora_adapter.hpp' @@ -54,9 +58,12 @@ - 'src/cpp/include/openvino/genai/whisper_pipeline.hpp' - 'src/cpp/src/whisper/**/*' - 'src/cpp/src/whisper_generation_config.cpp' +- 'src/cpp/src/whisper_pipeline_base.hpp' - 'src/cpp/src/whisper_pipeline.cpp' +- 'src/cpp/src/whisper_pipeline_static.cpp' +- 'src/cpp/src/whisper_pipeline_static.hpp' - 'src/python/py_whisper_pipeline.cpp' -- 'tests/python_tests/test_whisper_generate_api.py' +- 'tests/python_tests/test_whisper_pipeline.py' 'category: Python API': - 'src/python/**/*' @@ -65,10 +72,14 @@ - 'src/include/openvino/genai/visual_language/**/*' - 'src/cpp/src/visual_language/**/*' - 'src/python/py_vlm_pipeline.cpp' -- 'tests/python_tests/test_vlm_api.py' +- 'tests/python_tests/test_vlm_pipeline.py' 'category: speculative decoding': - 'src/cpp/src/speculative_decoding/**/*' +- 'tests/cpp/speculative_decoding.cpp' + +'category: prompt lookup': +- 'src/cpp/src/prompt_lookup/**/*' 'category: continuous batching': - 'src/cpp/include/openvino/genai/cache_eviction.hpp' @@ -91,19 +102,19 @@ - 'src/cpp/src/generation_handle.cpp' - 'src/cpp/src/generation_stream.hpp' - 'src/cpp/src/model_runner.hpp' -- 'src/cpp/src/paged_attention_transformations.cpp' -- 'src/cpp/src/paged_attention_transformations.hpp' +- 'src/cpp/src/utils/paged_attention_transformations.cpp' +- 'src/cpp/src/utils/paged_attention_transformations.hpp' - 'src/cpp/src/scheduler.hpp' - 'src/cpp/src/sequence_group.cpp' - 'src/cpp/src/sequence_group.hpp' - 'src/cpp/src/timer.hpp' - 'src/python/py_continuous_batching_pipeline.cpp' -- 'tests/python_tests/test_cache_optimizations.py' -- 'tests/python_tests/test_preemption.py' -- 'tests/python_tests/test_sampling.py' +- 'tests/python_tests/test_continuous_batching.py' +- 'tests/python_tests/test_kv_cache_eviction.py' - 'tests/cpp/block_allocator.cpp' - 'tests/cpp/block_hash_store.cpp' - 'tests/cpp/block_manager.cpp' +- 'tests/cpp/cache_eviction.cpp' - 'tests/cpp/cache_manager.cpp' - 'tests/cpp/device_config.cpp' - 'tests/cpp/scheduler.cpp' diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 6c94a907ea..9b21491f9b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -268,9 +268,9 @@ jobs: matrix: test: - name: 'Whisper' - cmd: 'tests/python_tests/test_whisper_generate_api.py' + cmd: 'tests/python_tests/test_whisper_pipeline.py' - name: 'LLM & VLM' - cmd: 'tests/python_tests 
--ignore tests/python_tests/test_whisper_generate_api.py' + cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py' defaults: run: shell: bash diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index a9af13bc66..4d9b7f032b 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -178,7 +178,7 @@ jobs: if: | always() && (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') - timeout-minutes: 90 + timeout-minutes: 120 defaults: run: shell: bash @@ -235,7 +235,7 @@ jobs: python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels python -c "from openvino_genai import LLMPipeline" python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template" + python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template" genai_python_lib_whisper: name: OpenVINO genai extension whisper tests (cmake + wheel) @@ -290,7 +290,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -300,7 +300,7 @@ jobs: python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels python -c "from openvino_genai import LLMPipeline" python -m pip install ./tools/who_what_benchmark --find-links ${OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_package: name: OpenVINO genai extension (install to OpenVINO package) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 4530068797..67c6cc8fdb 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -244,7 +244,7 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template" + python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_pipeline.py --ignore ./tests/python_tests/test_vlm_pipeline.py -k "not test_set_chat_template" genai_python_lib_whisper: name: OpenVINO genai extension whisper tests (cmake + wheel) @@ -300,7 +300,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k test_smoke env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. 
Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -309,7 +309,7 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" genai_python_lib_vlm: name: OpenVINO genai VLM tests (cmake + wheel) @@ -365,7 +365,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels - python -m pytest -v ./tests/python_tests/test_vlm_api.py + python -m pytest -v ./tests/python_tests/test_vlm_pipeline.py env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. diff --git a/pyproject.toml b/pyproject.toml index 5f952010f2..27318d42ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF"} [build-system] requires = [ - "py-build-cmake==0.3.3", + "py-build-cmake==0.3.4", "openvino~=2025.0.0.0.dev", "pybind11-stubgen==2.5.1", "cmake~=3.23.0" diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp index 236b31b351..fc18fa8e0c 100644 --- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try { config.max_new_tokens = 20; config.num_beam_groups = 3; config.num_beams = 15; + config.diversity_penalty = 1.0f; config.num_return_sequences = config.num_beams; // Since the streamer is set, the results will diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py index 16b8b76175..4e2430a47f 100755 --- a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py +++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py @@ -19,6 +19,7 @@ def main(): config.max_new_tokens = 20 config.num_beam_groups = 3 config.num_beams = 15 + config.diversity_penalty = 1 config.num_return_sequences = config.num_beams beams = pipe.generate(args.prompts, config) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 4ea75e94c5..164ff29131 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -93,15 +93,22 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool echo = false; size_t logprobs = 0; + // EOS special token + int64_t eos_token_id = -1; std::set stop_strings; // Default setting in vLLM (and OpenAI API) is not to include stop string in the output bool include_stop_str_in_output = false; std::set stop_token_ids; + // penalties (not used in beam search) + float repetition_penalty = 1.0f; + float presence_penalty = 0.0; + float frequency_penalty = 0.0f; + // Beam search specific size_t num_beam_groups = 1; size_t num_beams = 1; - float diversity_penalty = 1.0f; + float diversity_penalty = 0.0f; float length_penalty = 1.0f; size_t num_return_sequences = 1; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -112,9 +119,6 @@ 
class OPENVINO_GENAI_EXPORTS GenerationConfig { float top_p = 1.0f; size_t top_k = std::numeric_limits::max(); bool do_sample = false; - float repetition_penalty = 1.0f; - float presence_penalty = 0.0; - float frequency_penalty = 0.0f; size_t rng_seed = 0; // Assisting generation parameters @@ -122,9 +126,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t num_assistant_tokens = 0; size_t max_ngram_size = 0; - // EOS special token - int64_t eos_token_id = -1; - std::optional adapters; /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. @@ -136,11 +137,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool is_greedy_decoding() const; bool is_beam_search() const; bool is_multinomial() const; - OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2025.0.0 release") - bool is_speculative_decoding() const; bool is_assisting_generation() const; bool is_prompt_lookup() const; - void update_generation_config(const ov::AnyMap& config_map); + + OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release") + bool is_speculative_decoding() const; + + void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -187,8 +190,13 @@ static constexpr ov::Property assistant_confidence_threshold{"assistant_c static constexpr ov::Property num_assistant_tokens{"num_assistant_tokens"}; // Predefined Configs + +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); +OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. 
This method will be removed in 2026.0.0 release") OPENVINO_GENAI_EXPORTS GenerationConfig multinomial(); + } // namespace genai } // namespace ov diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 0c04823f4f..20d4c0c51c 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -46,8 +46,6 @@ class CacheManager { } OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size()); m_num_allocated_kv_blocks = num_kv_blocks; - ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), num_kv_blocks); - ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), num_kv_blocks); const std::string device_name = m_device_config.get_device(); @@ -56,6 +54,8 @@ class CacheManager { if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); @@ -104,6 +104,8 @@ class CacheManager { } else { auto remote_context = m_core.get_default_context(device_name); for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); + ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), key_cache_shape); ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), @@ -142,30 +144,27 @@ class CacheManager { } void copy_blocks(const std::map>& block_copy_map) { - ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(), m_num_allocated_kv_blocks); - ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(), m_num_allocated_kv_blocks); - - ov::Coordinate key_src_start_roi(key_shape.size(), 0); - ov::Coordinate key_src_end_roi = key_shape; - ov::Coordinate key_dst_start_roi(key_shape.size(), 0); - ov::Coordinate key_dst_end_roi = key_shape; - - ov::Coordinate value_src_start_roi(value_shape.size(), 0); - ov::Coordinate value_src_end_roi = value_shape; - ov::Coordinate value_dst_start_roi(value_shape.size(), 0); - ov::Coordinate value_dst_end_roi = value_shape; - for (const auto & blocks_pair : block_copy_map) { size_t src_block_id = blocks_pair.first; - key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1; - value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1; - const std::list& dst_block_ids = blocks_pair.second; for (size_t dst_block_id : dst_block_ids) { - key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1; - value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1; - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Shape key_shape = 
set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); + ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); + ov::Coordinate key_src_start_roi(key_shape.size(), 0); + ov::Coordinate key_src_end_roi = key_shape; + ov::Coordinate key_dst_start_roi(key_shape.size(), 0); + ov::Coordinate key_dst_end_roi = key_shape; + + ov::Coordinate value_src_start_roi(value_shape.size(), 0); + ov::Coordinate value_src_end_roi = value_shape; + ov::Coordinate value_dst_start_roi(value_shape.size(), 0); + ov::Coordinate value_dst_end_roi = value_shape; + key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1; + value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1; + key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1; + value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1; + ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi); ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi); diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 371142701c..cc2e21b9a1 100644 --- a/src/cpp/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -12,8 +12,9 @@ namespace ov::genai { class DeviceConfig { ov::element::Type m_kv_cache_type; - ov::PartialShape m_key_cache_shape, m_value_cache_shape; - ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; + std::vector m_key_cache_shape, m_value_cache_shape; + std::vector m_num_kv_heads; + ov::Shape::value_type m_head_size, m_num_decoder_layers; size_t m_num_kv_blocks = 0; size_t m_block_size = 0; size_t m_cache_size = 0; @@ -88,11 +89,14 @@ class DeviceConfig { } } - void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) { - m_num_kv_heads = num_kv_heads; + void set_model_params(std::vector num_kv_heads, size_t head_size, size_t num_decoder_layers) { m_head_size = head_size; m_num_decoder_layers = num_decoder_layers; + m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end()); + m_key_cache_shape.reserve(m_num_decoder_layers); + m_value_cache_shape.reserve(m_num_decoder_layers); + if (m_device == "CPU") { // Scale, zero point and quantized data will be stored together. 
// The layout for per token per head: @@ -104,21 +108,32 @@ class DeviceConfig { } if (m_num_kv_blocks == 0 && m_cache_size > 0) { + size_t block_size = 0; size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; - m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size(); + } + m_num_kv_blocks = size_in_bytes / block_size; } - m_key_cache_shape = m_value_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), - ov::Dimension(m_num_kv_heads), - ov::Dimension(m_block_size), - ov::Dimension(m_head_size)}; - - if (m_device.find("GPU") != std::string::npos) { - // Update key shape, as the key's shape is different from the value's shape - m_key_cache_shape = ov::PartialShape{ov::Dimension::dynamic(), - ov::Dimension(m_num_kv_heads), - ov::Dimension(m_head_size), - ov::Dimension(m_block_size)}; + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}); + + m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_block_size), + ov::Dimension(m_head_size)}); + + if (m_device.find("GPU") != std::string::npos) { + // Update key shape, as the key's shape is different from the value's shape + m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), + ov::Dimension(m_num_kv_heads[layer_id]), + ov::Dimension(m_head_size), + ov::Dimension(m_block_size)}); + } } } @@ -134,14 +149,14 @@ class DeviceConfig { return m_num_decoder_layers; } - ov::PartialShape get_key_cache_shape() const { + ov::PartialShape get_key_cache_shape(size_t id) const { OPENVINO_ASSERT(m_key_cache_shape.size()); - return m_key_cache_shape; + return m_key_cache_shape[id]; } - ov::PartialShape get_value_cache_shape() const { + ov::PartialShape get_value_cache_shape(size_t id) const { OPENVINO_ASSERT(m_value_cache_shape.size()); - return m_value_cache_shape; + return m_value_cache_shape[id]; } size_t get_num_kv_blocks() const { @@ -153,7 +168,11 @@ class DeviceConfig { } size_t get_block_size_in_bytes() const { - return m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * get_cache_precision().size(); + size_t block_size = 0; + for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size(); + } + return block_size; } }; } diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 4ff184547e..59be603fd9 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -24,6 +24,7 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { nlohmann::json data = nlohmann::json::parse(f); + read_json_param(data, "eos_token_id", eos_token_id); read_json_param(data, "max_new_tokens", max_new_tokens); read_json_param(data, "max_length", max_length); // note that ignore_eos is not present in HF GenerationConfig @@ -32,28 +33,40 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { read_json_param(data, "stop_strings", stop_strings); // note that include_stop_str_in_output is not present 
in HF GenerationConfig read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output); - // note that stop_token_ids is not present in HF GenerationConfig - read_json_param(data, "stop_token_ids", stop_token_ids); + // note that stop_token_ids is not present in HF GenerationConfig, but some generation_config.json define + // multiple eos_token_id (e.g. https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/generation_config.json) + // so, we need to read them as 'stop_token_ids' + std::vector ordered_stop_token_ids; + read_json_param(data, "eos_token_id", ordered_stop_token_ids); + + if (!ordered_stop_token_ids.empty()) { + for (int64_t stop_token_id : ordered_stop_token_ids) + stop_token_ids.insert(stop_token_id); + + if (eos_token_id == -1) { + eos_token_id = ordered_stop_token_ids[0]; + } + } + + // note that echo is not present in HF GenerationConfig + read_json_param(data, "echo", echo); + // note that logprobs is not present in HF GenerationConfig + read_json_param(data, "logprobs", logprobs); + + // penalties + read_json_param(data, "repetition_penalty", repetition_penalty); + // note that frequency_penalty is not present in HF GenerationConfig + read_json_param(data, "frequency_penalty", frequency_penalty); + // note that presence_penalty is not present in HF GenerationConfig + read_json_param(data, "presence_penalty", presence_penalty); + + // beam search read_json_param(data, "num_beam_groups", num_beam_groups); read_json_param(data, "num_beams", num_beams); read_json_param(data, "diversity_penalty", diversity_penalty); read_json_param(data, "length_penalty", length_penalty); read_json_param(data, "num_return_sequences", num_return_sequences); read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); - read_json_param(data, "temperature", temperature); - read_json_param(data, "top_p", top_p); - read_json_param(data, "top_k", top_k); - read_json_param(data, "do_sample", do_sample); - read_json_param(data, "repetition_penalty", repetition_penalty); - read_json_param(data, "eos_token_id", eos_token_id); - // note that echo is not present in HF GenerationConfig - read_json_param(data, "echo", echo); - // note that logprobs is not present in HF GenerationConfig - read_json_param(data, "logprobs", logprobs); - - // append EOS to stop_token_ids - if (eos_token_id != -1) - set_eos_token_id(eos_token_id); if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); @@ -65,6 +78,21 @@ GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { stop_criteria = StopCriteria::HEURISTIC; } } + + // multinomial + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + + // assistant generation + read_json_param(data, "assistant_confidence_threshold", assistant_confidence_threshold); + read_json_param(data, "num_assistant_tokens", num_assistant_tokens); + read_json_param(data, "max_ngram_size", max_ngram_size); + + // append EOS to stop_token_ids + if (eos_token_id != -1) + set_eos_token_id(eos_token_id); } void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { @@ -79,35 +107,50 @@ void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { stop_token_ids.insert(eos_token_id); } -void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { +void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { using 
utils::read_anymap_param; - read_anymap_param(config_map, "max_new_tokens", max_new_tokens); - read_anymap_param(config_map, "max_length", max_length); - read_anymap_param(config_map, "ignore_eos", ignore_eos); - read_anymap_param(config_map, "min_new_tokens", min_new_tokens); - read_anymap_param(config_map, "stop_strings", stop_strings); - read_anymap_param(config_map, "include_stop_str_in_output", include_stop_str_in_output); - read_anymap_param(config_map, "stop_token_ids", stop_token_ids); - read_anymap_param(config_map, "num_beam_groups", num_beam_groups); - read_anymap_param(config_map, "num_beams", num_beams); - read_anymap_param(config_map, "diversity_penalty", diversity_penalty); - read_anymap_param(config_map, "length_penalty", length_penalty); - read_anymap_param(config_map, "num_return_sequences", num_return_sequences); - read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size); - read_anymap_param(config_map, "stop_criteria", stop_criteria); - read_anymap_param(config_map, "temperature", temperature); - read_anymap_param(config_map, "top_p", top_p); - read_anymap_param(config_map, "top_k", top_k); - read_anymap_param(config_map, "do_sample", do_sample); - read_anymap_param(config_map, "repetition_penalty", repetition_penalty); - read_anymap_param(config_map, "eos_token_id", eos_token_id); - read_anymap_param(config_map, "echo", echo); - read_anymap_param(config_map, "logprobs", logprobs); - read_anymap_param(config_map, "adapters", adapters); + // stop conditions + read_anymap_param(properties, "eos_token_id", eos_token_id); + read_anymap_param(properties, "max_new_tokens", max_new_tokens); + read_anymap_param(properties, "max_length", max_length); + read_anymap_param(properties, "ignore_eos", ignore_eos); + read_anymap_param(properties, "min_new_tokens", min_new_tokens); + read_anymap_param(properties, "stop_strings", stop_strings); + read_anymap_param(properties, "include_stop_str_in_output", include_stop_str_in_output); + read_anymap_param(properties, "stop_token_ids", stop_token_ids); + + // generic + read_anymap_param(properties, "echo", echo); + read_anymap_param(properties, "logprobs", logprobs); + read_anymap_param(properties, "num_return_sequences", num_return_sequences); + read_anymap_param(properties, "adapters", adapters); + // penalties + read_anymap_param(properties, "frequency_penalty", frequency_penalty); + read_anymap_param(properties, "presence_penalty", presence_penalty); + read_anymap_param(properties, "repetition_penalty", repetition_penalty); + + // beam search + read_anymap_param(properties, "num_beam_groups", num_beam_groups); + read_anymap_param(properties, "num_beams", num_beams); + read_anymap_param(properties, "diversity_penalty", diversity_penalty); + read_anymap_param(properties, "length_penalty", length_penalty); + read_anymap_param(properties, "stop_criteria", stop_criteria); + read_anymap_param(properties, "no_repeat_ngram_size", no_repeat_ngram_size); + + // multinomial + read_anymap_param(properties, "do_sample", do_sample); + read_anymap_param(properties, "temperature", temperature); + read_anymap_param(properties, "top_p", top_p); + read_anymap_param(properties, "top_k", top_k); // TODO: add support of 'generator' property similar to Image generation - read_anymap_param(config_map, "rng_seed", rng_seed); + read_anymap_param(properties, "rng_seed", rng_seed); + + // assistant generation + read_anymap_param(properties, "assistant_confidence_threshold", assistant_confidence_threshold); + read_anymap_param(properties, 
"num_assistant_tokens", num_assistant_tokens); + read_anymap_param(properties, "max_ngram_size", max_ngram_size); } size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { @@ -136,69 +179,94 @@ bool GenerationConfig::is_speculative_decoding() const { } bool GenerationConfig::is_assisting_generation() const { - return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0); + return assistant_confidence_threshold > 0 || num_assistant_tokens > 0; } bool GenerationConfig::is_prompt_lookup() const { - return (max_ngram_size > 0 && num_assistant_tokens > 0); + return max_ngram_size > 0 && num_assistant_tokens > 0; } void GenerationConfig::validate() const { + OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); + + // Stop conditions + OPENVINO_ASSERT(eos_token_id == -1 || stop_token_ids.find(eos_token_id) != stop_token_ids.end(), "'stop_token_ids' must contain 'eos_token_id'. Please, call 'set_eos_token_id' with 'eos_token_id' value"); - OPENVINO_ASSERT(!do_sample || num_beams == 1, - "Beam search with sampling is not supported yet. " - "Please either set do_sample=false to use beam search " - "or set num_beams=1 if you with to use multinomial sampling."); - OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); + auto stop_token_ids_it = std::find_if(stop_token_ids.begin(), stop_token_ids.end(), [] (int64_t stop_token_id) -> bool { + return stop_token_id < 0; + }); + OPENVINO_ASSERT(stop_token_ids_it == stop_token_ids.end(), "'stop_token_ids' must be non-negative, but it contains a value ", *stop_token_ids_it); + + OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "ignore_eos is true, in this case either 'max_new_tokens', or 'max_length' should be defined."); + + OPENVINO_ASSERT(eos_token_id != -1 || !stop_token_ids.empty() || !stop_strings.empty() || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "Either 'eos_token_id', or 'stop_token_ids', or 'stop_strings', or 'max_new_tokens', or 'max_length' should be defined."); + OPENVINO_ASSERT(max_new_tokens > 0 || (max_new_tokens == 0 && echo), "'max_new_tokens' must be greater than 0, if `echo` is set, 0 is also accepted"); OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); - OPENVINO_ASSERT( - num_beams % num_beam_groups == 0, - "number of beams should be divisible by number of groups" - ); - - // max_new_tokens has priority over max_length - // if max_new_tokens is defined no need to check max_length - OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, - "'max_length' must be greater than 0 or 'max_new_tokens' should be defined"); - - OPENVINO_ASSERT(!do_sample || top_k > 0, - "top_k must be a strictly positive, but got ", - top_k); - OPENVINO_ASSERT(!do_sample || (top_p > 0 && top_p <= 1.0f), - "top_p must be a positive float > 0 and < 1, but got ", - top_p); - OPENVINO_ASSERT(!do_sample || temperature > 0, - "Temperature must be a strictly positive float, but got ", - temperature); - - OPENVINO_ASSERT(repetition_penalty > 0, - "Repetition penalty must be a strictly positive float, but got ", - repetition_penalty); - - OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "ignore_eos == true, in this case either 'max_new_tokens', or 'max_length' should be defined."); - OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "Either 'eos_token_id', or 'max_new_tokens', 
or 'max_length' should be defined."); + // Sampling strategies + + OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), + "'num_return_sequences' can be more than 1 only in case of beam search or multinomial sampling, but got ", num_return_sequences); + + // generic penalties, but not supported by beam search currently + if (!is_beam_search()) { + OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "'frequence_penalty' penalty must be within [-2.0; 2.0], but got ", frequency_penalty); + OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "'presence_penalty' penalty must be within [-2.0; 2.0], but got ", presence_penalty); + OPENVINO_ASSERT(repetition_penalty > 0.0f, "'repetition_penalty' must be a strictly positive float, but got ", repetition_penalty); + } else { + OPENVINO_ASSERT(frequency_penalty == 0.0f, "'frequency_penalty' is not currently supported by beam search and should be 0.0f, but got ", frequency_penalty); + OPENVINO_ASSERT(presence_penalty == 0.0f, "'presence_penalty' is not currently supported by beam search and should be 0.0f, but got ", presence_penalty); + OPENVINO_ASSERT(repetition_penalty == 1.0f, "'repetition_penalty' is not currently supported by beam search and should be 1.0f, but got ", repetition_penalty); + } + + if (is_multinomial()) { + OPENVINO_ASSERT(top_k >= 0, "When 'do_sample' is true, top_k must be a non-negative, but got ", top_k); + OPENVINO_ASSERT(top_p > 0 && top_p <= 1.0f, "When 'do_sample' is true, top_p must be a positive float > 0.0 and <= 1.0, but got ", top_p); + OPENVINO_ASSERT(temperature > 0, "When 'do_sample' is true, temperature must be a strictly positive float, but got ", temperature); + } else { + // parameters requiring multinomial + OPENVINO_ASSERT(top_k == std::numeric_limits::max(), "When 'do_sample' is false, top_k must be max of size_t, but got ", top_k); + OPENVINO_ASSERT(top_p == 1.0f, "When 'do_sample' is false, top_p must be 1.0f, but got ", top_p); + OPENVINO_ASSERT(temperature == 1.0f, "When 'do_sample' is false, temperature must be a 1.0f, but got ", temperature); + } + if (is_beam_search()) { - OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + OPENVINO_ASSERT(num_beams % num_beam_groups == 0, "'num_beams' (", num_beams, ") should be divisible by 'num_beam_groups' (", num_beam_groups, ")"); + OPENVINO_ASSERT(num_beams >= num_return_sequences, "'num_beams' (", num_beams, ") must be greater equal than 'num_return_sequences' (", num_return_sequences, ")"); + + OPENVINO_ASSERT(!do_sample, + "Beam search with sampling is not supported yet. 
" + "Please either set do_sample=false to use beam search " + "or set num_beams=1 if you with to use multinomial sampling."); + + OPENVINO_ASSERT(no_repeat_ngram_size > 0, "'no_repeat_ngram_size' must be positive"); if (num_beam_groups > 1) { - OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, it it fallbacks to non-grouped beam search"); + OPENVINO_ASSERT(diversity_penalty != 0.0f, "For grouped beam search 'diversity_penalty' should not be zero, otherwise it fallbacks to non-grouped beam search"); + } else { + OPENVINO_ASSERT(diversity_penalty == 0.0f, "For beam search 'diversity_penalty' is applicable only when grouped beam search is used, but got 'num_beam_groups' == 1"); } } else { - OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); - OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); + // parameters requiring beam search + OPENVINO_ASSERT(num_beam_groups == 1, "'num_beam_groups' is supported by beam search only and should be 1 otherwise, but got ", num_beam_groups); + OPENVINO_ASSERT(no_repeat_ngram_size == std::numeric_limits::max(), "'no_repeat_ngram_size' is supported only by beam search, otherwise should be set to max of size_t, but got ", no_repeat_ngram_size); + OPENVINO_ASSERT(diversity_penalty == 0.0f, "'diversity_penalty' is set to ", diversity_penalty, " (default is 0.0f), which is supported only by beam search sampling"); + OPENVINO_ASSERT(length_penalty == 1.0f, "'length_penalty' is set to ", length_penalty, " (default is 1.0f), which is supported only by beam search sampling"); } + + // assistant generation + if (is_assisting_generation()) { - if (assistant_confidence_threshold != 0.f) { - OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); - OPENVINO_ASSERT(!is_prompt_lookup(), "Parameters `assistant_confidence_threshold` cannot be used while Prompt Lookup decoding"); - } else { - OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); - }; + OPENVINO_ASSERT(!is_beam_search() && num_return_sequences == 1, "Beam search and parallel sampling are not compatible with assistant generation"); + OPENVINO_ASSERT(assistant_confidence_threshold == 0.0f || num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`"); + } + + if (num_assistant_tokens == 0) { + OPENVINO_ASSERT(max_ngram_size == 0, "'max_ngram_size' should be set to default value 0 when prompt lookup is disabled"); } } diff --git a/src/cpp/src/json_utils.hpp b/src/cpp/src/json_utils.hpp index 13d792e9db..4a4bb001df 100644 --- a/src/cpp/src/json_utils.hpp +++ b/src/cpp/src/json_utils.hpp @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include namespace ov { @@ -40,6 +43,15 @@ void read_json_param(const nlohmann::json& data, const std::string& name, std::v } } +template +void read_json_param(const nlohmann::json& data, const std::string& name, std::set& param) { + if (data.contains(name) && data[name].is_array()) { + for (const auto elem : data[name]) { + param.insert(elem.get()); + } + } +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp 
index 44fff08658..b34217beb8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -72,19 +72,18 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { const ov::AnyMap& config, const ov::genai::GenerationConfig& generation_config ) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { - ov::Core core; ov::CompiledModel compiled_model; auto [core_plugin_config, plugin_config] = ov::genai::utils::split_core_compile_config(config); - utils::slice_matmul_statefull_model(model); + utils::slice_matmul_stateful_model(model); m_kv_cache_seq_length_axis = ov::genai::utils::get_seq_len_axis(model); if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable - compiled_model = core.compile_model(model, device, *filtered_plugin_config); + compiled_model = utils::singleton_core().compile_model(model, device, *filtered_plugin_config); m_model_runner = compiled_model.create_infer_request(); } else { - compiled_model = core.compile_model(model, device, plugin_config); + compiled_model = utils::singleton_core().compile_model(model, device, plugin_config); m_model_runner = compiled_model.create_infer_request(); } ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model"); @@ -705,8 +704,7 @@ std::pair split_model_descr(const ov::An ov::genai::LLMPipeline::LLMPipeline( const ov::InferRequest& request, const ov::genai::Tokenizer& tokenizer, - OptionalGenerationConfig generation_config -) { + OptionalGenerationConfig generation_config) { OPENVINO_THROW("Not supported"); auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique(request, tokenizer, generation_config); @@ -718,8 +716,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& properties -){ + const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || @@ -749,8 +746,7 @@ ov::genai::LLMPipeline::LLMPipeline( ov::genai::LLMPipeline::LLMPipeline( const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& config -){ + const ov::AnyMap& config) { auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end() || @@ -783,8 +779,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& config, - const ov::genai::GenerationConfig& generation_config -){ + const ov::genai::GenerationConfig& generation_config) { auto [core_properties, plugin_config] = ov::genai::utils::split_core_compile_config(config); auto start_time = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index be9fc972dc..83dbf15376 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -259,7 +259,7 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token return {new_input_ids, new_attention_mask}; } -void slice_matmul_statefull_model(std::shared_ptr model) { +void slice_matmul_stateful_model(std::shared_ptr 
model) { auto last_node = model->output(0).get_node()->input_value(0).get_node(); ov::Node* matmul = dynamic_cast(last_node); if (matmul) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 57225e60ff..6207c889a2 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -106,7 +106,7 @@ std::shared_ptr read_model_with_config(const std::filesystem::path& m ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); -void slice_matmul_statefull_model(std::shared_ptr model); +void slice_matmul_stateful_model(std::shared_ptr model); ov::Core singleton_core(); diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp index 4dedcf989a..f564be8f19 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/utils/paged_attention_transformations.cpp @@ -53,15 +53,21 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters"); ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape(); OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape); - size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length(); - + size_t head_size = k_shape[2].get_length(); + std::vector num_kv_heads(num_layers); + for (size_t idx = 0; idx < num_layers; idx++) { + size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length(); + num_kv_heads[idx] = num_heads; + } device_config.set_model_params(num_kv_heads, head_size, num_layers); - for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) { - it_k->second->set_element_type(device_config.get_cache_precision()); - it_v->second->set_element_type(device_config.get_cache_precision()); - it_k->second->set_partial_shape(device_config.get_key_cache_shape()); - it_v->second->set_partial_shape(device_config.get_value_cache_shape()); + for (size_t idx = 0; idx < num_layers; idx++) { + auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)]; + auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)]; + k->set_element_type(device_config.get_cache_precision()); + v->set_element_type(device_config.get_cache_precision()); + k->set_partial_shape(device_config.get_key_cache_shape(idx)); + v->set_partial_shape(device_config.get_value_cache_shape(idx)); } model->validate_nodes_and_infer_types(); diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 9762874596..44da29ced4 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -243,6 +243,8 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
  • Freepik/flux.1-lite-8B-alpha
  • black-forest-labs/FLUX.1-dev
  • shuttleai/shuttle-3-diffusion
+ • shuttleai/shuttle-3.1-aesthetic
+ • Shakker-Labs/AWPortrait-FL
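The stub and binding diffs that follow rename `sampling_params` to `generation_config`, expose `validate()` and `is_multinomial()` to Python, and switch `update_generation_config` to keyword arguments. A minimal sketch of how the updated Python API reads after these changes — the model path is hypothetical and only illustrates the call pattern:

```python
import openvino_genai as ov_genai

# Hypothetical model directory; any LLM exported for OpenVINO GenAI would do.
pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")

config = pipe.get_generation_config()
# update_generation_config() now accepts keyword arguments instead of a dict of OVAny.
config.update_generation_config(max_new_tokens=20, num_beam_groups=3, num_beams=15)
# diversity_penalty now defaults to 0.0, so grouped beam search has to set it
# explicitly, as the updated beam_search_causal_lm samples do.
config.diversity_penalty = 1.0
config.num_return_sequences = config.num_beams
config.validate()  # validate() is exposed to Python by this change

print(pipe.generate("Why is the Sun yellow?", config))
```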
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 8510a8389f..5d82fa89a3 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -367,16 +367,16 @@ class ContinuousBatchingPipeline: def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None: ... @typing.overload
def get_tokenizer(self) -> Tokenizer: ... - def set_chat_template(self, new_template: str) -> None: + def set_chat_template(self, chat_template: str) -> None: ... - def set_generation_config(self, new_config: GenerationConfig) -> None: + def set_generation_config(self, config: GenerationConfig) -> None: ... def start_chat(self, system_message: str = '') -> None: ... @@ -2043,6 +2047,8 @@ class WhisperGenerationConfig: ... def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: ... + def update_generation_config(self, **kwargs) -> None: + ... class WhisperPerfMetrics(PerfMetrics): """ diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index be7a72481f..2b48e4d44d 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -235,22 +235,22 @@ void init_continuous_batching_pipeline(py::module_& m) { .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("sampling_params")) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("sampling_params")) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config")) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config")) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) .def( "generate", py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), py::arg("input_ids"), - py::arg("sampling_params"), + py::arg("generation_config"), py::arg("streamer") = std::monostate{} ) .def( "generate", py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), py::arg("prompts"), - py::arg("sampling_params"), + py::arg("generation_config"), py::arg("streamer") = std::monostate{} ); } diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index f49bcf29bd..a97a43fc5c 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -118,7 +118,13 @@ void init_generation_config(py::module_& m) { .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) + .def("is_multinomial", &GenerationConfig::is_multinomial) .def("is_assisting_generation", &GenerationConfig::is_assisting_generation) .def("is_prompt_lookup", &GenerationConfig::is_prompt_lookup) - .def("update_generation_config", static_cast(&ov::genai::GenerationConfig::update_generation_config), py::arg("config_map")); + .def("validate", &GenerationConfig::validate) + .def("update_generation_config", []( + ov::genai::GenerationConfig& config, + const py::kwargs& kwargs) { + config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + }); } diff --git a/src/python/py_image_generation_pipelines.cpp 
b/src/python/py_image_generation_pipelines.cpp index 311f3f3760..c246557a97 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -224,7 +224,7 @@ void init_image_generation_pipelines(py::module_& m) { .def_readwrite("max_sequence_length", &ov::genai::ImageGenerationConfig::max_sequence_length) .def("validate", &ov::genai::ImageGenerationConfig::validate) .def("update_generation_config", []( - ov::genai::ImageGenerationConfig config, + ov::genai::ImageGenerationConfig& config, const py::kwargs& kwargs) { config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); }); @@ -255,8 +255,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Text2ImagePipeline properties )") - .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config) - .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::Text2ImagePipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::Text2ImagePipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::Text2ImagePipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::Text2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::Text2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) @@ -323,8 +323,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). kwargs: Image2ImagePipeline properties )") - .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config) - .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::Image2ImagePipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::Image2ImagePipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::Image2ImagePipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::Image2ImagePipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) @@ -386,8 +386,8 @@ void init_image_generation_pipelines(py::module_& m) { device (str): Device to run the model on (e.g., CPU, GPU). 
kwargs: InpaintingPipeline properties )") - .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config) - .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("generation_config")) + .def("get_generation_config", &ov::genai::InpaintingPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::InpaintingPipeline::set_generation_config, py::arg("config")) .def("set_scheduler", &ov::genai::InpaintingPipeline::set_scheduler, py::arg("scheduler")) .def("reshape", &ov::genai::InpaintingPipeline::reshape, py::arg("num_images_per_prompt"), py::arg("height"), py::arg("width"), py::arg("guidance_scale")) .def_static("stable_diffusion", &ov::genai::InpaintingPipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index b1d5136253..7360975a0b 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -53,15 +53,10 @@ py::object call_common_generate( const pyutils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { - ov::genai::GenerationConfig default_config; - if (config.has_value()) { - default_config = *config; - } else { - default_config = pipe.get_generation_config(); - } + ov::genai::GenerationConfig default_config = config.has_value() ? *config : pipe.get_generation_config(); auto updated_config = pyutils::update_config_from_kwargs(default_config, kwargs); + py::object results; - EncodedInputs tensor_data; StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); // Call suitable generate overload for each type of input. diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 45a0c46174..34522409ea 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -358,7 +358,10 @@ ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::O ov::genai::GenerationConfig res_config; if(config.has_value()) res_config = *config; - res_config.update_generation_config(kwargs_to_any_map(kwargs)); + + if (!kwargs.empty()) + res_config.update_generation_config(kwargs_to_any_map(kwargs)); + return res_config; } diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 340cb3da62..b0cfa0a42a 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -150,10 +150,10 @@ void init_vlm_pipeline(py::module_& m) { .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) - .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("new_template")) + .def("set_chat_template", &ov::genai::VLMPipeline::set_chat_template, py::arg("chat_template")) .def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer) - .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) - .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("new_config")) + .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config, py::arg("config")) .def( "generate", [](ov::genai::VLMPipeline& pipe, diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index cd42dcf58d..d290612ed6 100644 --- a/src/python/py_whisper_pipeline.cpp +++ 
b/src/python/py_whisper_pipeline.cpp @@ -187,7 +187,10 @@ OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const Optional WhisperGenerationConfig res_config; if (config.has_value()) res_config = *config; - res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + + if (!kwargs.empty()) + res_config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + return res_config; } @@ -295,7 +298,12 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt) .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords) - .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")); + .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) + .def("update_generation_config", []( + ov::genai::WhisperGenerationConfig& config, + const py::kwargs& kwargs) { + config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); + }); py::class_<WhisperRawPerfMetrics>(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 093cd993de..b8c2e625c5 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -25,8 +25,8 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/text_callback_streamer.cpp") -add_executable(${TEST_TARGET_NAME} ${tests_src} - block_allocator.cpp) +add_executable(${TEST_TARGET_NAME} ${tests_src}) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE openvino::genai gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") target_sources(${TEST_TARGET_NAME} PRIVATE ${src_files}) diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 7f07980389..5dc848aba5 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -54,7 +54,8 @@ TEST(TestCacheManager, test_cache_size_param) { const std::string device = "CPU"; ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; - device_config.set_model_params(12, 64, num_decoder_layers); + std::vector<size_t> num_kv_heads(12, 12); + device_config.set_model_params(num_kv_heads, 64, num_decoder_layers); ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core); @@ -76,7 +77,8 @@ TEST(TestCacheManager, test_kv_blocks_param) { const std::string device = "CPU"; ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; - device_config.set_model_params(12, 64, num_decoder_layers); + std::vector<size_t> num_kv_heads(12, 12); + device_config.set_model_params(num_kv_heads, 64, num_decoder_layers); ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core); @@ -97,9 +99,12 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; size_t head_size = 64; - size_t num_kv_heads = 12; + std::vector<size_t> num_kv_heads(12, 12); device_config.set_model_params(num_kv_heads, head_size, 
num_decoder_layers); - size_t block_size_in_bytes = num_decoder_layers * 2 * num_kv_heads * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + size_t block_size_in_bytes = 0; + for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) { + block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size(); + } ov::InferRequest request = core.compile_model(get_dummy_model(num_decoder_layers)).create_infer_request(); diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp index 0d7435818f..973648f637 100644 --- a/tests/cpp/device_config.cpp +++ b/tests/cpp/device_config.cpp @@ -18,7 +18,7 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) { const std::string device = "CPU"; size_t num_decoder_layers = 12; size_t head_size = 64, head_size_u8 = head_size + 8; - size_t num_kv_heads = 12; + std::vector num_kv_heads(12, 12); ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU"); device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); diff --git a/tests/cpp/generate_config.cpp b/tests/cpp/generate_config.cpp deleted file mode 100644 index 974fd499f8..0000000000 --- a/tests/cpp/generate_config.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include "openvino/genai/generation_config.hpp" - - -using namespace ov::genai; - -TEST(GenerationConfigTest, invalid_temperature) { - GenerationConfig config; - config.max_new_tokens = 20; - config.temperature = -0.1; - config.do_sample = true; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_temperature) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.temperature = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_top_p) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.top_p = -0.5; - EXPECT_THROW(config.validate(), ov::Exception); - config.top_p = 1.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_top_p) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.top_p = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_repeatition_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.repetition_penalty = -3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.repetition_penalty = -0.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_repeatition_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.repetition_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - config.repetition_penalty = 0.1; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_presence_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.presence_penalty = 3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.presence_penalty = -3.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_presence_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.presence_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - 
config.presence_penalty = -2.0; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_frequency_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.frequency_penalty = 3.0; - EXPECT_THROW(config.validate(), ov::Exception); - config.frequency_penalty = -3.1; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_frequency_penalty) { - GenerationConfig config; - config.max_new_tokens = 20; - config.do_sample = true; - config.frequency_penalty = 1.8; - EXPECT_NO_THROW(config.validate()); - config.frequency_penalty = -2.0; - EXPECT_NO_THROW(config.validate()); -} - -ov::genai::GenerationConfig speculative_decoding_multinomial() { - auto speculative_decoding_multinomial_config = ov::genai::multinomial(); - speculative_decoding_multinomial_config.num_assistant_tokens = 5; - return speculative_decoding_multinomial_config; -} - -ov::genai::GenerationConfig speculative_decoding_greedy() { - auto speculative_decoding_greedy_config = ov::genai::greedy(); - speculative_decoding_greedy_config.assistant_confidence_threshold = 0.4f; - return speculative_decoding_greedy_config; -} - -TEST(GenerationConfigTest, invalid_static_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0.2; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_static_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0; - EXPECT_NO_THROW(config.validate()); -} - -TEST(GenerationConfigTest, invalid_dynamic_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.num_assistant_tokens = 5; - config.assistant_confidence_threshold = 0.5; - EXPECT_THROW(config.validate(), ov::Exception); -} - -TEST(GenerationConfigTest, valid_dynamic_spec_decoding) { - GenerationConfig config = speculative_decoding_greedy(); - config.assistant_confidence_threshold = 0.5; - config.num_assistant_tokens = 0; - EXPECT_NO_THROW(config.validate()); -} diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index ea1720faa2..cc0b53a433 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -44,7 +44,7 @@ std::shared_ptr init_cache_manager(SchedulerConfig scheduler_confi size_t num_decoder_layers = 12; ov::InferRequest request = core.compile_model(get_model(num_decoder_layers)).create_infer_request(); size_t head_size = 64, head_size_u8 = head_size + 8; - size_t num_kv_heads = 12; + std::vector num_kv_heads(12, 12); ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); return std::make_shared(device_config, request, core); diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 7e3c075405..9040fa435f 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -73,6 +73,7 @@ def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 generation_config.num_return_sequences = generation_config.num_beams @@ -82,6 +83,7 @@ def get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() 
generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.min_new_tokens = 15 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 @@ -92,6 +94,7 @@ def get_beam_search_with_single_stop_string() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 50 generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {"open sour"} # expected match on "open source" @@ -102,6 +105,7 @@ def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 50 generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {".", "software", "Intel"} @@ -112,6 +116,7 @@ def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = generation_config.num_beams generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} @@ -299,7 +304,7 @@ def convert_to_hf( kwargs['pad_token_id'] = default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty - if generation_config.num_beams > 1: + if generation_config.is_beam_search(): # beam search case kwargs['num_beam_groups'] = generation_config.num_beam_groups kwargs['num_beams'] = generation_config.num_beams @@ -309,7 +314,7 @@ def convert_to_hf( kwargs['output_scores'] = True if generation_config.num_beam_groups > 1: kwargs['diversity_penalty'] = generation_config.diversity_penalty - elif generation_config.do_sample: + elif generation_config.is_multinomial(): # mulitinomial kwargs['temperature'] = generation_config.temperature kwargs['top_k'] = generation_config.top_k @@ -364,18 +369,6 @@ def run_continuous_batching( return output -def read_models_list(file_name: str): - models = [] - with open(file_name) as f: - for model_name in f: - model_name = model_name.strip() - # skip comment in model scope file - if model_name.startswith('#'): - continue - models.append(model_name) - return models - - def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) @@ -447,7 +440,7 @@ def generate_and_compare_with_reference_text(models_path: Path, prompts: List[st assert ref_text == ov_text -def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): +def run_continuous_batching_pipeline_test(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): prompts, generation_configs = get_test_dataset() scheduler_config = get_scheduler_config(scheduler_params) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 87b2147bcd..9e8e4681f9 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -32,7 +32,7 @@ def get_models_list(): 
"HuggingFaceH4/zephyr-7b-beta", "ikala/redpajama-3b-chat", "mistralai/Mistral-7B-v0.1", - + # "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token # "google/gemma-2b-it", # Cannot be downloaded without access token. # "google/gemma-7b-it", # Cannot be downloaded without access token. @@ -49,7 +49,7 @@ def get_models_list(): model_ids = precommit_models else: model_ids = nightly_models - + if pytest.selected_model_ids: model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] # pytest.set_trace() @@ -82,45 +82,45 @@ def get_chat_models_list(): @functools.lru_cache(1) def read_model(params, **tokenizer_kwargs): model_id, path = params - + from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) if (path / "openvino_model.xml").exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, compile=False, device='CPU') else: - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - + # to store tokenizer config jsons with special tokens hf_tokenizer.save_pretrained(path) - - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, + + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, compile=False, device='CPU', load_in_8bit=False) opt_model.generation_config.save_pretrained(path) opt_model.config.save_pretrained(path) opt_model.save_pretrained(path) - + return ( model_id, path, hf_tokenizer, opt_model, - ov_genai.LLMPipeline(path, 'CPU', **{'ENABLE_MMAP': False}), + ov_genai.LLMPipeline(path, 'CPU', ENABLE_MMAP=False), ) # in OpenVINO GenAI this parameter is called stop_criteria, -# while in HF it's called early_stopping. +# while in HF it's called early_stopping. # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" STOP_CRITERIA_MAP = { - ov_genai.StopCriteria.NEVER: "never", - ov_genai.StopCriteria.EARLY: True, + ov_genai.StopCriteria.NEVER: "never", + ov_genai.StopCriteria.EARLY: True, ov_genai.StopCriteria.HEURISTIC: False } @@ -137,8 +137,9 @@ def model_tmp_path(tmpdir_factory): shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) + @pytest.fixture(scope="module") -def model_tokenizers_path_tmp_path(tmpdir_factory): +def model_tokenizers_tmp_path(tmpdir_factory): model_id, path, _, _, _ = read_model(get_models_list()[0]) temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) @@ -146,7 +147,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory): # There was no easy way to add tokens to IR in tests, so we remove them # and set tokens in configs and to check if they are read and validated correctly. 
import openvino as ov - + # copy openvino converted model and tokenizers for pattern in ['*.xml', '*.bin']: for src_file in path.glob(pattern): @@ -162,7 +163,7 @@ def model_tokenizers_path_tmp_path(tmpdir_factory): ov_model.set_rt_info("eos_token_id", "") ov_model.set_rt_info("chat_template", "") ov.save_model(ov_model, str(temp_path / src_file.name)) - + if src_file in ['openvino_tokenizer.bin', 'openvino_detokenizer.bin']: continue if src_file.is_file(): @@ -179,10 +180,15 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path): for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return ov_genai.LLMPipeline(temp_path, 'CPU') + + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU') + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_pipe @functools.lru_cache(1) def get_continuous_batching(path): - scheduler_config = ov_genai.SchedulerConfig() - return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', **{"scheduler_config": scheduler_config}) + return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig()) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 00bffb6646..c2c7d634f5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -diffusers==0.31.0 +diffusers==0.32.1 optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py deleted file mode 100644 index 07b4f7c15f..0000000000 --- a/tests/python_tests/test_chat_generate_api.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import openvino_genai as ov_genai -import pytest -from typing import Dict, Tuple - -from ov_genai_test_utils import ( - get_chat_models_list, - read_model, - get_continuous_batching, -) - - -generation_configs = [ - dict(do_sample=False, max_new_tokens=20), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) -] - - -questions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?' -] - - -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_chat_compare_with_HF(model_descr, generation_config: Dict): - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - - # Will set add_special_tokens=False inside pipeline when start_chat() is called. 
- model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - - pipe.start_chat() - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - answer_ov = pipe.generate(prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - pipe.finish_chat() - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - - assert chat_history_ov == chat_history_hf - - -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -@pytest.mark.nightly -def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): - # compares with HF when history in ov_genai is save as a text - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - - # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. - # Need to regenerate openvino_tokenizer/detokenizer. - model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - ov_tokenizer = ov_pipe.get_tokenizer() - - for prompt in questions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **generation_config) - answer_str = hf_tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - chat_prompt = ov_tokenizer.apply_chat_template(chat_history_ov, add_generation_prompt=True) - answer_ov = ov_pipe.generate(chat_prompt, **generation_config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - - assert chat_history_ov == chat_history_hf - - -@pytest.mark.parametrize("generation_config", generation_configs[1:]) -@pytest.mark.parametrize("model_descr", get_chat_models_list()) -@pytest.mark.precommit -def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): - model_id, path, hf_tokenizer, opt_model, ov_stateful_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) - cb_pipe = get_continuous_batching(path) - - ov_stateful_pipe.start_chat() - cb_pipe.start_chat() - - for question in questions: - generated = cb_pipe.generate(question, **generation_config) - reference = ov_stateful_pipe.generate(question, **generation_config) - assert generated == reference - - # Test that finish_chat() doesn't fail just in case. 
- cb_pipe.finish_chat() diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_continuous_batching.py similarity index 59% rename from tests/python_tests/test_preemption.py rename to tests/python_tests/test_continuous_batching.py index 7c648e73dc..01762bf9e3 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_continuous_batching.py @@ -1,19 +1,183 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os import pytest +import math +from typing import Dict + +from pathlib import Path +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer -from openvino_genai import GenerationConfig from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ + get_scheduler_config, get_greedy, run_continuous_batching_pipeline_test, get_beam_search, get_greedy, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p from test_sampling import RandomSamplingTestStruct, get_current_platform_ref_texts +from ov_genai_test_utils import ( + get_chat_models_list, + read_model, + get_continuous_batching, +) + +def read_models_list(file_name: str): + models = [] + with open(file_name) as f: + for model_name in f: + model_name = model_name.strip() + # skip comment in model scope file + if model_name.startswith('#'): + continue + models.append(model_name) + return models + +# +# e2e tests on random and real models +# + +@pytest.mark.precommit +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +def test_e2e_precommit(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + + +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +def test_e2e_nightly(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + + +@pytest.mark.real_models +@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +def test_e2e_real_models(tmp_path, model_id): + run_continuous_batching_pipeline_test(tmp_path, model_id) + +# +# Comparison with stateful +# TODO: remove these tests once test_llm_pipeline.py are generalized and parametrized to test both Stateful and PA paths +# + +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) +] +batched_prompts = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. 
+@pytest.mark.precommit +def test_continuous_batching_vs_stateful(prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. + for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + + +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +def test_cb_streamer_vs_return_vs_stateful(prompt): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb_pipe = get_continuous_batching(path) + streamed = [] + generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = ov_pipe.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference + + +generation_configs = [ + dict(do_sample=False, max_new_tokens=20), + dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0, repetition_penalty=1.0) +] +questions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] +@pytest.mark.parametrize("generation_config_kwargs", generation_configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_scenario_vs_stateful(model_descr, generation_config_kwargs: Dict): + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb_pipe = get_continuous_batching(path) + + ov_pipe.start_chat() + cb_pipe.start_chat() + + generation_config = GenerationConfig(**generation_config_kwargs) + ov_pipe.set_generation_config(generation_config) + + for question in questions: + generated = cb_pipe.generate(question, generation_config=generation_config) + reference = ov_pipe.generate(question) + assert generated == reference + + # Test that finish_chat() doesn't fail just in case. 
+ cb_pipe.finish_chat() + +# +# Stress tests to check OOM case +# + +@pytest.mark.precommit +@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) +def test_post_oom_health(tmp_path, sampling_config): + generation_config = sampling_config + generation_config.ignore_eos = True + generation_config.max_new_tokens = 1000000 + + scheduler_config = get_scheduler_config() + scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly + + model_id : str = "facebook/opt-125m" + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(opt_model, hf_tokenizer, models_path) + + cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") + + # First run should return incomplete response + output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) + assert (len(output)) + assert (len(output[0].m_generation_ids)) + + # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM + output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) + assert (len(output)) + assert (len(output[0].m_generation_ids)) + +# +# Pre-emption +# -def get_greedy_seq_len_300() -> GenerationConfig: +def get_parallel_sampling_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_return_sequences = 3 + # TODO: add generation_config.generator and return parameters below + # generation_config.num_return_sequences = 3 + # generation_config.do_sample = True + # generation_config.top_k = 10 + # generation_config.top_p = 0.5 generation_config.max_new_tokens = 300 return generation_config @@ -21,14 +185,15 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 300 generation_config.num_return_sequences = generation_config.num_beams return generation_config scheduler_params_list = [({"num_kv_blocks": 2, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), ({"num_kv_blocks": 2, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_parallel_sampling_seq_len_300()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_parallel_sampling_seq_len_300()), ({"num_kv_blocks": 34, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), ({"num_kv_blocks": 34, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), ({"num_kv_blocks": 100, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), @@ -36,7 +201,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_preemption(tmp_path, params): - run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", scheduler_params=params[0], 
generation_config=params[1]) multinomial_params = RandomSamplingTestStruct( @@ -175,4 +340,4 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file + generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py new file mode 100644 index 0000000000..110caaf0e5 --- /dev/null +++ b/tests/python_tests/test_generation_config.py @@ -0,0 +1,142 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino_genai import GenerationConfig +from typing import Tuple, List +import json +import os +import pytest + +configs = [ + # stop conditions + dict(max_new_tokens=12), + dict(max_length=12), + dict(stop_token_ids={2}), + dict(eos_token_id=1, stop_token_ids={1}), + dict(stop_strings={"a", "b"}), + dict(ignore_eos=True, max_new_tokens=10), + dict(ignore_eos=True, max_length=10), + dict(max_new_tokens=0, echo=True), + dict(min_new_tokens=1, max_new_tokens=1), + # multinomial + dict(max_new_tokens=1, do_sample=True, num_return_sequences=2), + dict(max_new_tokens=1, do_sample=True, top_k=1), + dict(max_new_tokens=1, do_sample=True, top_p=0.5), + dict(max_new_tokens=1, do_sample=True, temperature=0.5), + # beam search + dict(max_new_tokens=1, num_beams=2), + dict(max_new_tokens=1, num_beams=2, num_return_sequences=1), + dict(max_new_tokens=1, num_beams=2, num_return_sequences=2), + dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=1.0), + dict(max_new_tokens=1, num_beams=4, length_penalty=1.0), + dict(max_new_tokens=1, num_beams=4, no_repeat_ngram_size=2), + # assistant generation + dict(max_new_tokens=1, assistant_confidence_threshold=0.5), + dict(max_new_tokens=1, num_assistant_tokens=2), + dict(max_new_tokens=1, num_assistant_tokens=2, max_ngram_size=2), # prompt lookup +] +@pytest.mark.parametrize("generation_config_kwargs", configs) +@pytest.mark.precommit +@pytest.mark.nightly +def test_valid_configs(generation_config_kwargs): + config = GenerationConfig(**generation_config_kwargs) + config.validate() + + config = GenerationConfig() + config.update_generation_config(**generation_config_kwargs) + config.validate() + + +invalid_configs = [ + dict(num_return_sequences=0), # no reason to run with empty output + dict(num_return_sequences=2), # beam search or multinomial is required + # stop conditions + dict(), # no stop conditions at all + dict(eos_token_id=1), # 'stop_token_ids' does not contain 'eos_token_id' + dict(eos_token_id=1, stop_token_ids={2}), # 'stop_token_ids' is not empty, but does not contain 'eos_token_id' + dict(ignore_eos=True), # no 'max_new_tokens', no 'max_length' with 'ignore_eos' + dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative + dict(max_new_tokens=0), # max new tokens cannot be 0 (only when 'echo' is True) + dict(max_new_tokens=10, min_new_tokens=20), # 'max_new_tokens' must be >= 'min_new_tokens' + # penalties
+ dict(max_new_tokens=1, repetition_penalty=-1.0), # invalid repetition_penalty + dict(max_new_tokens=1, presence_penalty=-3.0), # invalid presence_penalty + dict(max_new_tokens=1, frequency_penalty=3.0), # invalid frequency_penalty + # multinomial sampling + dict(max_new_tokens=1, do_sample=True, top_p=1.1), # 'top_p' must be within (0, 1] when 'do_sample' is True + dict(max_new_tokens=1, do_sample=True, top_p=0), # 'top_p' must be within (0, 1] when 'do_sample' is True + dict(max_new_tokens=1, do_sample=True, temperature=-1.0), # invalid temp + # parameters requiring multinomial + dict(max_new_tokens=1, top_k=1), # requires do_sample=True + dict(max_new_tokens=1, top_p=0.5), # requires do_sample=True + dict(max_new_tokens=1, temperature=2.0), # requires do_sample=True + # beam search + dict(max_new_tokens=1, num_beams=2, num_return_sequences=3), # 'num_beams' must be >= 'num_return_sequences' + dict(max_new_tokens=1, num_beams=3, num_beam_groups=2), # 'num_beams' must be divisible by 'num_beam_groups' + dict(max_new_tokens=1, num_beams=3, do_sample=True), # beam sample is not supported + dict(max_new_tokens=1, num_beams=3, no_repeat_ngram_size=0), # invalid 'no_repeat_ngram_size' + dict(max_new_tokens=1, num_beams=4, num_beam_groups=2, diversity_penalty=0.0), # 'diversity_penalty' should not be a default value + dict(max_new_tokens=1, num_beams=4, diversity_penalty=1.0), # 'diversity_penalty' is used only for grouped beam search + dict(max_new_tokens=1, num_beams=2, frequency_penalty=1.0), # 'frequency_penalty' is not supported by beam search + dict(max_new_tokens=1, num_beams=2, presence_penalty=1.0), # 'presence_penalty' is not supported by beam search + dict(max_new_tokens=1, num_beams=2, repetition_penalty=0.0), # 'repetition_penalty' is not supported by beam search + # parameters requiring beam search + dict(max_new_tokens=1, num_beam_groups=2), # requiring beam search + dict(max_new_tokens=1, no_repeat_ngram_size=2), # requiring beam search + dict(max_new_tokens=1, diversity_penalty=1.0), # requiring beam search + dict(max_new_tokens=1, length_penalty=2), # requiring beam search + # assistant generation + dict(max_new_tokens=1, num_assistant_tokens=2, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group + dict(max_new_tokens=1, assistant_confidence_threshold=1.0, do_sample=True, num_return_sequences=2), # 'num_return_sequences' must be 1, as we cannot use different number of tokens per sequence within a group + dict(max_new_tokens=1, num_assistant_tokens=2, num_beams=2), # beam search is not compatible with assistant generation + dict(max_new_tokens=1, assistant_confidence_threshold=1.0, num_assistant_tokens=2), # 'assistant_confidence_threshold' and 'num_assistant_tokens' are mutually exclusive + dict(max_new_tokens=1, max_ngram_size=1), # 'max_ngram_size' is for prompt lookup, but assistant generation is turned off ('num_assistant_tokens' is 0) + # TODO: add tests for invalid properties +] +@pytest.mark.parametrize("generation_config_kwargs", invalid_configs) +@pytest.mark.precommit +@pytest.mark.nightly +def test_invalid_generation_configs_throws(generation_config_kwargs): + config = GenerationConfig(**generation_config_kwargs) + with pytest.raises(RuntimeError): + config.validate() + + config = GenerationConfig() + config.update_generation_config(**generation_config_kwargs) + with pytest.raises(RuntimeError): + config.validate() + + +def 
load_genai_generation_config_from_file(configs: List[Tuple], temp_path): + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + + ov_generation_config = GenerationConfig(temp_path / "generation_config.json") + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_generation_config + +@pytest.mark.precommit +@pytest.mark.nightly +def test_multiple_eos_are_read_as_stop_token_ids(tmp_path): + generation_config_json = { + "eos_token_id": [ + 2, + 32000, + 32007 + ] + } + configs = [ + (generation_config_json, "generation_config.json"), + ] + + generation_config = load_genai_generation_config_from_file(configs, tmp_path) + + assert generation_config.eos_token_id == 2 + assert generation_config.stop_token_ids == { 2, 32000, 32007 } diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_kv_cache_eviction.py similarity index 97% rename from tests/python_tests/test_cache_optimizations.py rename to tests/python_tests/test_kv_cache_eviction.py index d89697ba42..6228f53dd1 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -15,7 +15,7 @@ from openvino import serialize from transformers import AutoTokenizer -from common import TESTS_ROOT, run_test_pipeline +from common import TESTS_ROOT, run_continuous_batching_pipeline_test def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: @@ -147,7 +147,6 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t def get_greedy_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_return_sequences = 3 generation_config.max_new_tokens = 300 return generation_config @@ -155,6 +154,7 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_beam_groups = 3 generation_config.num_beams = 6 + generation_config.diversity_penalty = 1 generation_config.max_new_tokens = 300 generation_config.num_return_sequences = generation_config.num_beams return generation_config @@ -168,5 +168,5 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): - run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + run_continuous_batching_pipeline_test(tmp_path, "facebook/opt-125m", params[0], params[1]) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_llm_pipeline.py similarity index 82% rename from tests/python_tests/test_generate_api.py rename to tests/python_tests/test_llm_pipeline.py index 824a3cca26..6e3cce06d0 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino_genai import StopCriteria +from openvino_genai import StopCriteria, GenerationConfig import pytest from typing import Union, List, Dict, Optional import numpy as np @@ -12,12 +12,12 @@ import torch import math from ov_genai_test_utils import ( - get_models_list, - read_model, + get_models_list, + read_model, load_genai_pipe_with_configs, - model_tmp_path, - STOP_CRITERIA_MAP, - get_continuous_batching, + get_chat_models_list, + model_tmp_path, + STOP_CRITERIA_MAP, ) @@ -26,12 +26,12 @@ def 
run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 config['num_return_sequences'] = num_beams - + if not isinstance(prompts, list): prompts = [prompts] if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. @@ -72,7 +72,7 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. @@ -101,9 +101,9 @@ def run_hf_ov_genai_comparison_text_inputs(model_descr, generation_config: Dict, def run_hf_ov_genai_comparison_encoded_inputs( - model_descr, - generation_config: Dict, - input_ids: np.ndarray, + model_descr, + generation_config: Dict, + input_ids: np.ndarray, attention_mask: Optional[np.array] = None ): device = 'CPU' @@ -112,18 +112,18 @@ def run_hf_ov_genai_comparison_encoded_inputs( config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. 
config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty - + generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) - + if attention_mask is not None: inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) @@ -138,6 +138,9 @@ def run_hf_ov_genai_comparison_encoded_inputs( ov_res = np.array(ov_output.tokens, dtype=np.int64) assert np.all(ov_res == hf_res) +# +# e2e work +# test_cases = [ (dict(max_new_tokens=20), 'table is made of'), @@ -197,14 +200,13 @@ def test_batch_text_input(model_descr, generation_config, prompts): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_beam_search_decoding(model_descr, num_beam_groups, group_size, - max_new_tokens, diversity_penalty, prompt): +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=diversity_penalty, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -215,17 +217,17 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly -def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): +def test_beam_search_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return invalid out with sentence # while genai ends sentence with if (stop_criteria == StopCriteria.EARLY): pytest.skip() generation_config = dict( - num_beam_groups=2, - num_beams=2 * 3, - diversity_penalty=1.0, - num_return_sequences=2 * 3, - max_new_tokens=max_new_tokens, + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, stop_criteria=stop_criteria, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -241,11 +243,11 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, max_new_tokens, prompt): generation_config = dict( - num_beam_groups=num_beam_groups, - num_beams=num_beam_groups * group_size, - diversity_penalty=1.0, - num_return_sequences=num_beam_groups * group_size, - max_new_tokens=max_new_tokens, + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, ) run_hf_ov_genai_comparison_text_inputs(read_model(model_descr), generation_config, prompt) @@ -283,6 +285,74 @@ def test_greedy_repetition_penalty(model_descr, prompt): assert(len(set(ov_output.split(' '))) > len(set(ov_output_half_penalty.split(' ')))) +@pytest.mark.precommit 
+@pytest.mark.nightly +def test_batch_size_switch(): + ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] + ov_pipe.generate(["a"], max_new_tokens=2) + ov_pipe.generate(["1", "2"], max_new_tokens=2) + ov_pipe.generate(["a"], max_new_tokens=2) + +# +# Chat scenario +# + +generation_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=10, num_beam_groups=3, num_beams=15, num_return_sequences=1, diversity_penalty=1.0) +] + +questions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] + +@pytest.mark.parametrize("generation_config_kwargs", generation_configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_with_HF(model_descr, generation_config_kwargs: Dict): + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # Will set add_special_tokens=False inside pipeline when start_chat() is called. + model_id, path, tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + + from transformers import GenerationConfig as HFGenerationConfig + hf_generation_config = HFGenerationConfig(**generation_config_kwargs) + ov_generation_config = GenerationConfig(**generation_config_kwargs) + + ov_pipe.start_chat() + for prompt in questions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = opt_model.generate(**tokenized, generation_config=hf_generation_config) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + ov_pipe.finish_chat() + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + + assert chat_history_ov == chat_history_hf + + +# +# Streaming with callback +# + def user_defined_callback(subword): print(subword) @@ -422,31 +492,13 @@ def test_operator_with_streamer_kwargs_batch_throws(): with pytest.raises(RuntimeError): ov_pipe('', num_beams=2, streamer=printer) +# +# Tests on generation configs handling +# -invalid_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - # TODO: CVS-158682 eos_token_id is still read from tiny-random-phi3 and we cannot modify RTInfo in tests - # dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k -] -@pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit @pytest.mark.nightly -def test_invalid_generation_configs_throws(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - config_json = {} - ov_pipe = 
load_genai_pipe_with_configs([(config_json, "config.json")], temp_path) - with pytest.raises(RuntimeError): - ov_pipe.generate('blah blah', **generation_config) - - -@pytest.mark.precommit -@pytest.mark.nightly -def test_valid_configs(model_tmp_path): +def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path): model_id, temp_path = model_tmp_path ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) @@ -454,30 +506,21 @@ def test_valid_configs(model_tmp_path): config.do_sample = True # no eos_token_id but it's loaded from config.json ov_pipe.set_generation_config(config) + assert 37 == ov_pipe.get_generation_config().eos_token_id + -invalid_py_configs = [ - dict(num_beam_groups=3, num_beams=15, do_sample=True), - # TODO: Currently unexpected params do not cause exceptions. Need to implement it in c++ and return this test - # dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len - dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos - dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty - dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp - dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p - dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k -] @pytest.mark.precommit @pytest.mark.nightly -@pytest.mark.parametrize("generation_config", invalid_py_configs) -def test_python_generation_config_validation_throws(model_tmp_path, generation_config): - model_id, temp_path = model_tmp_path - ov_pipe = load_genai_pipe_with_configs([({"eos_token_id": 37}, "config.json")], temp_path) - - # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned - # instead of RuntimeError, which is returned when GenerationConfig values are validated - return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError - with pytest.raises(return_exception_type): - ov_pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) +def test_pipeline_validates_generation_config(): + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + ov_pipe = read_model((model_id, path))[4] + invalid_generation_config = dict(num_beam_groups=3, num_beams=15, do_sample=True) # beam sample is not supported + with pytest.raises(RuntimeError): + ov_pipe.generate("dummy prompt", **invalid_generation_config) +# +# Work with Unicode in Python API +# @pytest.mark.precommit @pytest.mark.nightly @@ -512,69 +555,9 @@ def test_unicode_pybind_decoding_one_string_streamer(): ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) assert '�' == res_str[-1] - -@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") -@pytest.mark.precommit -@pytest.mark.nightly -@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") -def test_left_pad(): - # test left pad tokenizer post processing implementation - prompts = [ - "The Sun is yellow because", - "The Sun is yellow because [force left pad tokens]" - ] - models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) - - config = { - "max_new_tokens": 20, - "num_beam_groups": 2, - "num_beams": 2, - "num_return_sequences": 2, - "do_sample": False, - "diversity_penalty": 1.0, - # phi 1_5 has no eos_token_id in model configuration - # ov genai will detect eos_token_id from 
tokenizer config - # hf implementation doesn't fetch it from tokenizer config and defaults to None - # align ov genai and hf by setting eos_token_id explicitly - "eos_token_id": 50256, - } - - models[2].pad_token = models[2].eos_token - run_hf_ov_genai_comparison_batched(models, config, prompts) - - -@pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. -@pytest.mark.precommit -def test_continuous_batching_vs_stateful(prompt, generation_config): - model_id, path, tokenizer, model, stateful = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) - cb = get_continuous_batching(path) - generated = cb.generate(prompt, **generation_config) - reference = stateful.generate(prompt, **generation_config) - assert generated.texts == reference.texts - if 1 != generation_config.get("num_return_sequences", 1): - # Stateful puts zeroes to generated.scores. Don't compare them. - for gen, ref in zip(generated.scores, reference.scores): - assert math.isclose(gen, ref, abs_tol=0.0003) - - -@pytest.mark.parametrize("prompt", prompts) -@pytest.mark.precommit -def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model(( - "facebook/opt-125m", - Path("opt-125m") - )) - cb_pipe = get_continuous_batching(path) - streamed = [] - generated = cb_pipe.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) - reference = ov_pipe.generate(prompt, max_new_tokens=20) - assert generated == "".join(streamed) - assert "".join(streamed) == reference - +# +# Perf metrics +# def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: model_id, path, hf_tokenizer, opt_model, ov_pipe = model_descr @@ -582,12 +565,13 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st config = generation_config.copy() # to avoid side effects if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config + # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = 1.0 # 1.0 means no penalty + return ov_pipe.generate([prompt], **config).perf_metrics @@ -598,20 +582,21 @@ def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: st @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit @pytest.mark.nightly +@pytest.mark.skip(reason="load_time + mean_gen_duration < total_time fails in https://github.com/openvinotoolkit/openvino.genai/actions/runs/12503590506/job/34884840100?pr=1440.") def test_perf_metrics(model_descr, generation_config, prompt): import time start_time = time.perf_counter() perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt) total_time = (time.perf_counter() - start_time) * 1000 - + # Check that load time is adequate. load_time = perf_metrics.get_load_time() - assert load_time > 0 and load_time < 1000.0 - + assert load_time > 0 and load_time < 1000.0 + # Check that num input and generated tokens are adequate. 
num_generated_tokens = perf_metrics.get_num_generated_tokens() - assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] - + assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens'] + num_input_tokens = perf_metrics.get_num_input_tokens() assert num_input_tokens > 0 and num_input_tokens <= len(prompt) @@ -622,7 +607,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): raw_metrics = perf_metrics.raw_metrics durations = np.array(raw_metrics.m_durations) / 1000 # Check that prefill is not included in durations for TPOT calculation. - # For the very long prompt prefill is slow and TTFT is much larger than any other token genration duration. + # For the very long prompt prefill is slow and TTFT is much larger than any other token generation duration. assert np.all(mean_ttft > durations * 2) mean_tpot, std_tpot = perf_metrics.get_tpot() @@ -632,7 +617,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): mean_throughput, std_throughput = perf_metrics.get_throughput() assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std) assert mean_throughput > 0 and mean_throughput < 20000.0 - + mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration() assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std) assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time @@ -647,7 +632,7 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std) assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration assert std_detok_duration == 0 - + # assert that calculating statistics manually from the raw counters we get the same restults as from PerfMetrics assert np.allclose(mean_tpot, np.mean(durations)) assert np.allclose(std_tpot, np.std(durations)) @@ -668,15 +653,11 @@ def test_perf_metrics(model_descr, generation_config, prompt): assert len(raw_metrics.m_batch_sizes) > 0 assert len(raw_metrics.m_durations) > 0 +# +# Misc +# -@pytest.mark.precommit -@pytest.mark.nightly -def test_batch_switch(): - ov_pipe = read_model(('katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')))[4] - ov_pipe.generate(["a"], max_new_tokens=2) - ov_pipe.generate(["1", "2"], max_new_tokens=2) - - +# TODO: move to test_sampling.py @pytest.mark.precommit @pytest.mark.nightly def test_stop_token_ids(): @@ -684,13 +665,14 @@ def test_stop_token_ids(): res = ov_pipe.generate( ov.Tensor([(1,)]), max_new_tokens=3, - stop_token_ids={-1, 9935, ov_pipe.get_tokenizer().get_eos_token_id()}, + stop_token_ids={9935, ov_pipe.get_tokenizer().get_eos_token_id()}, include_stop_str_in_output=False ) assert 2 == len(res.tokens[0]) assert 9935 in res.tokens[0] +# TODO: move to test_sampling.py @pytest.mark.precommit @pytest.mark.nightly def test_stop_strings(): @@ -701,3 +683,34 @@ def test_stop_strings(): stop_strings={"ignored", "боль"} ) assert "боль" not in res + + +# TODO: move this test to test_tokenizer.py +@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") +def test_left_pad(): + # test left pad tokenizer post processing implementation + 
prompts = [ + "The Sun is yellow because", + "The Sun is yellow because [force left pad tokens]" + ] + models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) + + config = { + "max_new_tokens": 20, + "num_beam_groups": 2, + "num_beams": 2, + "num_return_sequences": 2, + "do_sample": False, + "diversity_penalty": 1.0, + # phi 1_5 has no eos_token_id in model configuration + # ov genai will detect eos_token_id from tokenizer config + # hf implementation doesn't fetch it from tokenizer config and defaults to None + # align ov genai and hf by setting eos_token_id explicitly + "eos_token_id": 50256, + } + + models[2].pad_token = models[2].eos_token + run_hf_ov_genai_comparison_batched(models, config, prompts) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index cad8b0fea0..c3500d15ac 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -145,7 +145,7 @@ def test_chat_generation(model_descr): 'What was my first question?' ] - model_path = get_chat_models_lists()[0][1] + model_path = get_chat_models_list()[0][1] chat_history_stateful = generate_chat_history(model_path, "CPU", { }, questions) chat_history_static = generate_chat_history(model_path, "NPU", common_config, questions) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index fbcce76bf7..25ae9d8afa 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,13 +10,13 @@ from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer from typing import List, TypedDict -from common import run_test_pipeline, read_models_list, get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ - generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ +from common import get_hugging_face_model_and_tokenizer, save_ov_model_from_optimum, \ + get_greedy, get_beam_search, get_multinomial_temperature, \ get_greedy_with_penalties, get_multinomial_temperature, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ + get_greedy, get_greedy_with_min_and_max_tokens, \ get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ @@ -27,25 +27,9 @@ run_continuous_batching +# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) -def test_sampling_precommit(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.nightly -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) -def test_sampling_nightly(tmp_path, model_id): - run_test_pipeline(tmp_path, 
model_id) - -@pytest.mark.real_models -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) -def test_real_models(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.precommit -def test_eos_beam_search(tmp_path): +def test_beam_search_has_eos_token_at_end(tmp_path): ''' Current test checks that in case of beam search, some generation results explicitly have EOS token at the end, which is aligned with HF @@ -61,8 +45,9 @@ def test_eos_beam_search(tmp_path): generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) +# TODO: currently, this test drops EOS token as both HF and OV use `skip_special_tokens=True`, which should be disabled for samlpling tests @pytest.mark.precommit -def test_eos_greedy(tmp_path): +def test_greedy_has_eos_token_at_end(tmp_path): ''' Current test checks that in case of gready, some generation results explicitly have EOS token at the end, which is aligned with HF: @@ -76,55 +61,44 @@ def test_eos_greedy(tmp_path): scheduler_config = get_scheduler_config() generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +# TODO: consider removing all these functions with generation configs and use Dict with properties, which can be converted to generation config @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), - get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), - get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), - get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ], - ids=[ - "greedy", - "greedy_with_min_and_max_tokens", - "greedy_with_repetition_penalty", - "greedy_with_single_stop_string", - "greedy_with_multiple_stop_strings", - "greedy_with_multiple_stop_strings_no_match", - "beam", - "beam_search_min_and_max_tokens", - "beam_search_with_multiple_stop_strings_no_match", - "get_greedy_stop_strings_exclude_from_output", - "get_greedy_stop_strings_include_to_output", - "get_greedy_n_stop_strings_exclude_from_output", - "get_greedy_n_stop_strings_include_to_output" - ]) -def test_individual_generation_configs_deterministic(tmp_path, generation_config): - prompts = [ - "What is OpenVINO?", - ] +@pytest.mark.parametrize("generation_config", + [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), + get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output()], + ids=["greedy", "greedy_with_min_and_max_tokens", "greedy_with_repetition_penalty", "greedy_with_single_stop_string", + "greedy_with_multiple_stop_strings", "greedy_with_multiple_stop_strings_no_match", "beam_search", "beam_search_min_and_max_tokens", + "beam_search_with_multiple_stop_strings_no_match", "greedy_stop_strings_exclude_from_output", 
"greedy_stop_strings_include_to_output", + "greedy_n_stop_strings_exclude_from_output", "greedy_n_stop_strings_include_to_output"]) +def test_sampling_against_optimum(tmp_path, generation_config): + prompts = [ "What is OpenVINO?" ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + @pytest.mark.precommit @pytest.mark.xfail( raises=AssertionError, reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings(),], - ids=[ - "beam_search_with_single_stop_string", - "beam_search_with_multiple_stop_strings", - ]) +@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings()], + ids=["beam_search_with_single_stop_string", "beam_search_with_multiple_stop_strings"]) def test_beam_search_with_stop_string(tmp_path, generation_config): - prompts = [ - "What is OpenVINO?", - ] + prompts = [ "What is OpenVINO?" ] generation_configs = [generation_config] model_id : str = "facebook/opt-125m" generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) +# TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF +# and merge this tests with 'test_sampling_against_optimum' by extending a list of generation configs + class PlatformsRefTexts(TypedDict, total=False): linux: List[List[str]] win32: List[List[str]] @@ -306,7 +280,7 @@ class RandomSamplingTestStruct: "multinomial_temperature_and_frequence_penalty", "greedy_with_penalties", "multinomial_max_and_min_token"]) -def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): +def test_multinomial_sampling_against_reference(tmp_path, test_struct: RandomSamplingTestStruct): generation_config = test_struct.generation_config prompts = test_struct.prompts @@ -326,9 +300,10 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl @pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters]) +@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_without_completion(tmp_path, get_generation_config, max_num_batched_tokens): +def test_echo_prompt_phase_only(tmp_path, get_generation_config, max_num_batched_tokens): generation_config = get_generation_config() generation_config.max_new_tokens = 0 generation_config.echo = True @@ -337,14 +312,14 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + save_ov_model_from_optimum(opt_model, hf_tokenizer, 
model_path) - pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") - outputs = pipe.generate(["What is OpenVINO?"], generation_configs) + outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) for output in outputs: assert(len(output.m_generation_ids)) @@ -353,9 +328,10 @@ def test_echo_without_completion(tmp_path, get_generation_config, max_num_batche @pytest.mark.precommit -@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters]) +@pytest.mark.parametrize("get_generation_config", [get_greedy, get_beam_search, get_multinomial_all_parameters], + ids=["greedy", "beam_search", "multinomial_all_parameters"]) @pytest.mark.parametrize("max_num_batched_tokens", [2, 4, 256]) -def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_tokens): +def test_echo_with_generation_phase(tmp_path, get_generation_config, max_num_batched_tokens): generation_config = get_generation_config() generation_config.max_new_tokens = 10 generation_config.echo = True @@ -364,45 +340,17 @@ def test_echo_with_completion(tmp_path, get_generation_config, max_num_batched_t scheduler_config.max_num_batched_tokens = max_num_batched_tokens generation_configs = [generation_config] model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) + opt_model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + save_ov_model_from_optimum(opt_model, hf_tokenizer, model_path) - outputs = pipe.generate(["What is OpenVINO?"], generation_configs) + cb_pipe = ContinuousBatchingPipeline(model_path, Tokenizer(model_path), scheduler_config, "CPU") + outputs = cb_pipe.generate(["What is OpenVINO?"], generation_configs) assert(len(outputs)) + for output in outputs: assert(len(output.m_generation_ids)) for sequence in output.m_generation_ids: assert(sequence.startswith("What is OpenVINO?")) assert(len(sequence) > len("What is OpenVINO?")) - - -@pytest.mark.precommit -@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()]) -def test_post_oom_health(tmp_path, sampling_config): - generation_config = sampling_config - generation_config.ignore_eos = True - generation_config.max_new_tokens = 1000000 - - scheduler_config = get_scheduler_config() - # Low cache size to trigger OOM quickly - scheduler_config.num_kv_blocks = 10 - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_hugging_face_model_and_tokenizer(model_id, use_optimum=True) - - models_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, models_path) - - pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU") - # First run should return incomplete response - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert (len(output)) - assert(len(output[0].m_generation_ids)) - # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert 
(len(output)) - assert(len(output[0].m_generation_ids)) diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index 0c2a106d50..8129298763 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -1,6 +1,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import os import pytest import numpy as np from transformers import AutoTokenizer @@ -17,15 +18,19 @@ def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): - # load Tokenizer where all configs are cleared. - # remove existing jsons from previous tests for json_file in temp_path.glob("*.json"): json_file.unlink() for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return openvino_genai.Tokenizer(temp_path) + + ov_tokenizer = openvino_genai.Tokenizer(temp_path) + + for _, config_name in configs: + os.remove(temp_path / config_name) + + return ov_tokenizer def get_chat_templates(): @@ -181,7 +186,7 @@ def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): @pytest.mark.nightly def test_set_chat_template(): model_descr = get_chat_models_list()[0] - model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + model_id, path, hf_tokenizer, opt_model, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) prompt = "how are you?" dummy_conversation = [ @@ -265,7 +270,7 @@ def test_load_special_tokens_from_special_tokens_map_json(model_tmp_path): @pytest.mark.precommit @pytest.mark.nightly @pytest.mark.skip(reason="CVS-158682 - RTInfo is not modified in tests for unknown reasons") -def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tmp_path): +def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists # will load both string and integer representations @@ -280,7 +285,7 @@ def test_load_special_tokens_from_tokenizer_config_json(model_tokenizers_path_tm "eos_token": "", } - tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_path_tmp_path[1]) + tok = load_genai_tokenizer_with_configs([(tok_config_json, "tokenizer_config.json")], model_tokenizers_tmp_path[1]) assert tok.get_pad_token() == tok_config_json['pad_token'] assert tok.get_bos_token() == tok_config_json['bos_token'] assert tok.get_eos_token() == tok_config_json['eos_token'] diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_pipeline.py similarity index 100% rename from tests/python_tests/test_vlm_api.py rename to tests/python_tests/test_vlm_pipeline.py diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_pipeline.py similarity index 100% rename from tests/python_tests/test_whisper_generate_api.py rename to tests/python_tests/test_whisper_pipeline.py diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp index 6cf462fdf8..e0c50cda02 100644 --- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp +++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp @@ -123,11 +123,6 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data ov::genai::GenerationConfig greedy_search = ov::genai::greedy(); greedy_search.max_new_tokens 
= std::min(max_output_len, output_len); greedy_search.ignore_eos = true; - greedy_search.repetition_penalty = 1.0; - greedy_search.frequency_penalty = 0.0; - greedy_search.presence_penalty = 0.0; - greedy_search.diversity_penalty = 0.0; - greedy_search.length_penalty = 0.0; dataset.push_data(human_question, greedy_search); dataset.push_lens(input_len, output_len); diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index b2c2015f80..536d015612 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -20,6 +20,8 @@ def run_wwb(args): @pytest.mark.parametrize( ("model_id", "model_type", "backend"), [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "image-to-image", "hf"), + ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-to-image", "hf"), ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"), ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"), ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"), @@ -40,6 +42,8 @@ def test_image_model_types(model_id, model_type, backend): "CPU", "--model-type", model_type, + "--num-inference-steps", + "2", ] if backend == "hf": wwb_args.append("--hf") @@ -65,7 +69,8 @@ def test_image_model_types(model_id, model_type, backend): @pytest.mark.parametrize( ("model_id", "model_type"), [ - ("echarlaix/tiny-random-stable-diffusion-xl", "text-to-image"), + ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "image-to-image"), + ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "text-to-image"), ], ) def test_image_model_genai(model_id, model_type): @@ -73,15 +78,15 @@ def test_image_model_genai(model_id, model_type): GT_FILE = os.path.join(temp_dir, "gt.csv") MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--")) - result = subprocess.run(["optimum-cli", "export", - "openvino", "-m", model_id, + result = subprocess.run(["huggingface-cli", "download", + model_id, "--local-dir", MODEL_PATH], capture_output=True, text=True) assert result.returncode == 0 wwb_args = [ "--base-model", - MODEL_PATH, + model_id, "--num-samples", "1", "--gt-data", @@ -90,6 +95,8 @@ def test_image_model_genai(model_id, model_type): "CPU", "--model-type", model_type, + "--num-inference-steps", + "2", ] result = run_wwb(wwb_args) assert result.returncode == 0 @@ -108,6 +115,8 @@ def test_image_model_genai(model_id, model_type): "--model-type", model_type, "--genai", + "--num-inference-steps", + "2", ] result = run_wwb(wwb_args) @@ -131,6 +140,9 @@ def test_image_model_genai(model_id, model_type): model_type, "--output", output_dir, + "--genai", + "--num-inference-steps", + "2", ] result = run_wwb(wwb_args) assert result.returncode == 0 @@ -149,6 +161,8 @@ def test_image_model_genai(model_id, model_type): "CPU", "--model-type", model_type, + "--num-inference-steps", + "2", ] result = run_wwb(wwb_args) assert result.returncode == 0 @@ -182,6 +196,8 @@ def test_image_custom_dataset(model_id, model_type, backend): "google-research-datasets/conceptual_captions", "--dataset-field", "caption", + "--num-inference-steps", + "2", ] if backend == "hf": wwb_args.append("--hf") diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py index 278db2c6a1..f608601ec8 100644 --- a/tools/who_what_benchmark/whowhatbench/__init__.py +++ b/tools/who_what_benchmark/whowhatbench/__init__.py @@ -3,6 +3,7 @@ from .text_evaluator import TextEvaluator as Evaluator 
from .text2image_evaluator import Text2ImageEvaluator
 from .visualtext_evaluator import VisualTextEvaluator
+from .image2image import Image2ImageEvaluator
 
 
 __all__ = [
@@ -11,5 +12,6 @@
     "TextEvaluator",
     "Text2ImageEvaluator",
     "VisualTextEvaluator",
+    "Image2ImageEvaluator",
     "EVALUATOR_REGISTRY",
 ]
diff --git a/tools/who_what_benchmark/whowhatbench/image2image.py b/tools/who_what_benchmark/whowhatbench/image2image.py
new file mode 100644
index 0000000000..90eb6c7c87
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/image2image.py
@@ -0,0 +1,129 @@
+import os
+from typing import Any, Union
+
+import datasets
+import pandas as pd
+from tqdm import tqdm
+from transformers import set_seed
+import torch
+import openvino_genai
+
+from .registry import register_evaluator
+from .text2image_evaluator import Text2ImageEvaluator
+
+from .whowhat_metrics import ImageSimilarity
+
+
+def preprocess_fn(example):
+    return {
+        "prompts": example["Instruction_VLM-LLM"],
+        "images": example["source_img"],
+    }
+
+
+def prepare_default_data(num_samples=None):
+    DATASET_NAME = "paint-by-inpaint/PIPE"
+    NUM_SAMPLES = 10 if num_samples is None else num_samples
+    set_seed(42)
+    default_dataset = datasets.load_dataset(
+        DATASET_NAME, split="test", streaming=True
+    ).filter(lambda example: example["Instruction_VLM-LLM"] != "").take(NUM_SAMPLES)
+    return default_dataset.map(
+        lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
+    )
+
+
+@register_evaluator("image-to-image")
+class Image2ImageEvaluator(Text2ImageEvaluator):
+    def __init__(
+        self,
+        base_model: Any = None,
+        gt_data: str = None,
+        test_data: Union[str, list] = None,
+        metrics="similarity",
+        similarity_model_id: str = "openai/clip-vit-large-patch14",
+        num_inference_steps=4,
+        crop_prompts=True,
+        num_samples=None,
+        gen_image_fn=None,
+        seed=42,
+        is_genai=False,
+    ) -> None:
+        assert (
+            base_model is not None or gt_data is not None
+        ), "Image generation pipeline for evaluation or ground truth data must be defined"
+
+        self.test_data = test_data
+        self.metrics = metrics
+        self.crop_prompt = crop_prompts
+        self.num_samples = num_samples
+        self.num_inference_steps = num_inference_steps
+        self.seed = seed
+        self.similarity = None
+        self.similarity = ImageSimilarity(similarity_model_id)
+        self.last_cmp = None
+        self.gt_dir = os.path.dirname(gt_data)
+        self.generation_fn = gen_image_fn
+        self.is_genai = is_genai
+        self.resolution = None
+
+        if base_model:
+            self.gt_data = self._generate_data(
+                base_model, gen_image_fn, os.path.join(self.gt_dir, "reference")
+            )
+        else:
+            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
+
+    def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
+        def default_gen_image_fn(model, prompt, image, num_inference_steps, generator=None):
+            with torch.no_grad():
+                output = model(
+                    prompt,
+                    image=image,
+                    num_inference_steps=num_inference_steps,
+                    output_type="pil",
+                    strength=0.8,
+                    generator=generator,
+                )
+            return output.images[0]
+
+        generation_fn = gen_image_fn or default_gen_image_fn
+
+        if self.test_data:
+            if isinstance(self.test_data, str):
+                data = pd.read_csv(self.test_data)
+            else:
+                if isinstance(self.test_data, dict):
+                    assert "prompts" in self.test_data
+                    assert "images" in self.test_data
+                    data = dict(self.test_data)
+                data = pd.DataFrame.from_dict(data)
+        else:
+            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+
+        prompts = data["prompts"]
+        images = data["images"]
+        output_images = []
+        rng = torch.Generator(device="cpu")
+
+        if not 
os.path.exists(image_dir): + os.makedirs(image_dir) + + for i, (prompt, image) in tqdm(enumerate(zip(prompts, images)), desc="Evaluate pipeline"): + set_seed(self.seed) + rng = rng.manual_seed(self.seed) + output = generation_fn( + model, + prompt, + image=image, + num_inference_steps=self.num_inference_steps, + generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng + ) + image_path = os.path.join(image_dir, f"{i}.png") + output.save(image_path) + output_images.append(image_path) + + res_data = {"prompts": list(prompts), "images": output_images} + df = pd.DataFrame(res_data) + + return df diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py new file mode 100644 index 0000000000..f54d232bc2 --- /dev/null +++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py @@ -0,0 +1,252 @@ +import logging +import json + +from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq +from diffusers import DiffusionPipeline, AutoPipelineForImage2Image + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class GenAIModelWrapper: + """ + A helper class to store additional attributes for GenAI models + """ + + def __init__(self, model, model_dir, model_type): + self.model = model + self.model_type = model_type + + if model_type == "text" or model_type == "visual-text": + self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + elif model_type == "text-to-image": + self.config = DiffusionPipeline.load_config( + model_dir, trust_remote_code=True) + + def __getattr__(self, attr): + if attr in self.__dict__: + return getattr(self, attr) + else: + return getattr(self.model, attr) + + +def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None): + try: + import openvino_genai + except ImportError: + logger.error( + "Failed to import openvino_genai package. Please install it.") + exit(-1) + return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text") + + +def load_text_model( + model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if use_hf: + logger.info("Using HF Transformers API") + model = AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) + model.eval() + elif use_genai: + logger.info("Using OpenVINO GenAI API") + model = load_text_genai_pipeline(model_id, device, ov_config) + else: + logger.info("Using Optimum API") + from optimum.intel.openvino import OVModelForCausalLM + try: + model = OVModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_config + ) + except ValueError: + config = AutoConfig.from_pretrained( + model_id, trust_remote_code=True) + model = OVModelForCausalLM.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_config, + ) + + return model + + +def load_text2image_genai_pipeline(model_dir, device="CPU", ov_config=None): + try: + import openvino_genai + except ImportError: + logger.error( + "Failed to import openvino_genai package. 
Please install it.")
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.Text2ImagePipeline(model_dir, device=device, **ov_config),
+        model_dir,
+        "text-to-image"
+    )
+
+
+def load_text2image_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_text2image_genai_pipeline(model_id, device, ov_config)
+    elif use_hf:
+        logger.info("Using HF Transformers API")
+        model = DiffusionPipeline.from_pretrained(
+            model_id, trust_remote_code=True)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel import OVPipelineForText2Image
+        TEXT2IMAGEPipeline = OVPipelineForText2Image
+
+        try:
+            model = TEXT2IMAGEPipeline.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(
+                model_id, trust_remote_code=True)
+            model = TEXT2IMAGEPipeline.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+
+    return model
+
+
+def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.VLMPipeline(model_dir, device, **ov_config),
+        model_dir,
+        "visual-text"
+    )
+
+
+def load_visual_text_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+        try:
+            model = AutoModelForVision2Seq.from_pretrained(
+                model_id, trust_remote_code=True, device_map=device.lower()
+            )
+        except ValueError:
+            try:
+                model = AutoModel.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower()
+                )
+            except ValueError:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False
+                )
+        model.eval()
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_visual_text_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVModelForVisualCausalLM
+        try:
+            model = OVModelForVisualCausalLM.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            )
+        except ValueError:
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVModelForVisualCausalLM.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+            )
+    return model
+
+
+def load_image2image_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. 
Details:\n", e) + exit(-1) + + return GenAIModelWrapper( + openvino_genai.Image2ImagePipeline(model_dir, device, **ov_config), + model_dir, + "image-to-image" + ) + + +def load_imagetext2image_model( + model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if use_hf: + logger.info("Using HF Transformers API") + model = AutoPipelineForImage2Image.from_pretrained( + model_id, trust_remote_code=True + ) + elif use_genai: + logger.info("Using OpenVINO GenAI API") + model = load_image2image_genai_pipeline(model_id, device, ov_config) + else: + logger.info("Using Optimum API") + from optimum.intel.openvino import OVPipelineForImage2Image + try: + model = OVPipelineForImage2Image.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_config + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = OVPipelineForImage2Image.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_config, + ) + return model + + +def load_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if model_id is None: + return None + + if ov_config: + with open(ov_config) as f: + ov_options = json.load(f) + else: + ov_options = {} + + if model_type == "text": + return load_text_model(model_id, device, ov_options, use_hf, use_genai) + elif model_type == "text-to-image": + return load_text2image_model( + model_id, device, ov_options, use_hf, use_genai + ) + elif model_type == "visual-text": + return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai) + elif model_type == "image-to-image": + return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai) + else: + raise ValueError(f"Unsupported model type: {model_type}") diff --git a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py index 0cced117e4..e930c48b0a 100644 --- a/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py +++ b/tools/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -116,14 +116,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"): def _generate_data(self, model, gen_image_fn=None, image_dir="reference"): def default_gen_image_fn(model, prompt, num_inference_steps, generator=None): - output = model( - prompt, - num_inference_steps=num_inference_steps, - output_type="pil", - width=self.resolution[0], - height=self.resolution[0], - generator=generator, - ) + with torch.no_grad(): + output = model( + prompt, + num_inference_steps=num_inference_steps, + output_type="pil", + width=self.resolution[0], + height=self.resolution[0], + generator=generator, + ) return output.images[0] generation_fn = gen_image_fn or default_gen_image_fn diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 04813f5fd8..2ff8c45975 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -1,18 +1,17 @@ import argparse import difflib import numpy as np -import json import logging import os -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel, AutoModelForVision2Seq +from transformers import AutoTokenizer, AutoProcessor import openvino as ov import pandas as pd from datasets import load_dataset -from diffusers import DiffusionPipeline from PIL import Image +from 
whowhatbench.model_loaders import load_model from whowhatbench import EVALUATOR_REGISTRY # Configure logging @@ -20,224 +19,6 @@ logger = logging.getLogger(__name__) -class GenAIModelWrapper: - """ - A helper class to store additional attributes for GenAI models - """ - - def __init__(self, model, model_dir, model_type): - self.model = model - self.model_type = model_type - - if model_type == "text" or model_type == "visual-text": - self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - elif model_type == "text-to-image": - self.config = DiffusionPipeline.load_config( - model_dir, trust_remote_code=True) - - def __getattr__(self, attr): - if attr in self.__dict__: - return getattr(self, attr) - else: - return getattr(self.model, attr) - - -def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None): - try: - import openvino_genai - except ImportError: - logger.error( - "Failed to import openvino_genai package. Please install it.") - exit(-1) - return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text") - - -def load_text_model( - model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False -): - if use_hf: - logger.info("Using HF Transformers API") - model = AutoModelForCausalLM.from_pretrained( - model_id, trust_remote_code=True, device_map=device.lower() - ) - model.eval() - elif use_genai: - logger.info("Using OpenVINO GenAI API") - model = load_text_genai_pipeline(model_id, device, ov_config) - else: - logger.info("Using Optimum API") - from optimum.intel.openvino import OVModelForCausalLM - try: - model = OVModelForCausalLM.from_pretrained( - model_id, trust_remote_code=True, device=device, ov_config=ov_config - ) - except ValueError: - config = AutoConfig.from_pretrained( - model_id, trust_remote_code=True) - model = OVModelForCausalLM.from_pretrained( - model_id, - config=config, - trust_remote_code=True, - use_cache=True, - device=device, - ov_config=ov_config, - ) - - return model - - -def load_text2image_genai_pipeline(model_dir, device="CPU", ov_config=None): - try: - import openvino_genai - except ImportError: - logger.error( - "Failed to import openvino_genai package. Please install it.") - exit(-1) - - return GenAIModelWrapper( - openvino_genai.Text2ImagePipeline(model_dir, device=device, **ov_config), - model_dir, - "text-to-image" - ) - - -def load_text2image_model( - model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False -): - if use_genai: - logger.info("Using OpenvINO GenAI API") - model = load_text2image_genai_pipeline(model_id, device, ov_config) - elif use_hf: - logger.info("Using HF Transformers API") - model = DiffusionPipeline.from_pretrained( - model_id, trust_remote_code=True) - else: - logger.info("Using Optimum API") - from optimum.intel import OVPipelineForText2Image - TEXT2IMAGEPipeline = OVPipelineForText2Image - - try: - model = TEXT2IMAGEPipeline.from_pretrained( - model_id, trust_remote_code=True, device=device, ov_config=ov_config - ) - except ValueError: - config = AutoConfig.from_pretrained( - model_id, trust_remote_code=True) - model = TEXT2IMAGEPipeline.from_pretrained( - model_id, - config=config, - trust_remote_code=True, - use_cache=True, - device=device, - ov_config=ov_config, - ) - - return model - - -def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None): - try: - import openvino_genai - except ImportError as e: - logger.error("Failed to import openvino_genai package. Please install it. 
Details:\n", e) - exit(-1) - - return GenAIModelWrapper( - openvino_genai.VLMPipeline(model_dir, device, **ov_config), - model_dir, - "visual-text" - ) - - -def load_visual_text_model( - model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False -): - if use_hf: - logger.info("Using HF Transformers API") - config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) - try: - model = AutoModelForVision2Seq.from_pretrained( - model_id, trust_remote_code=True, device_map=device.lower() - ) - except ValueError: - try: - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, device_map=device.lower() - ) - except ValueError: - model = AutoModelForCausalLM.from_pretrained( - model_id, trust_remote_code=True, device_map=device.lower(), _attn_implementation="eager", use_flash_attention_2=False - ) - model.eval() - elif use_genai: - logger.info("Using OpenVINO GenAI API") - model = load_visual_text_genai_pipeline(model_id, device, ov_config) - else: - logger.info("Using Optimum API") - from optimum.intel.openvino import OVModelForVisualCausalLM - try: - model = OVModelForVisualCausalLM.from_pretrained( - model_id, trust_remote_code=True, device=device, ov_config=ov_config - ) - except ValueError: - config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) - model = OVModelForVisualCausalLM.from_pretrained( - model_id, - config=config, - trust_remote_code=True, - use_cache=True, - device=device, - ov_config=ov_config, - ) - return model - - -def load_model( - model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False -): - if model_id is None: - return None - - if ov_config: - with open(ov_config) as f: - ov_options = json.load(f) - else: - ov_options = {} - - if model_type == "text": - return load_text_model(model_id, device, ov_options, use_hf, use_genai) - elif model_type == "text-to-image": - return load_text2image_model( - model_type, model_id, device, ov_options, use_hf, use_genai - ) - elif model_type == "visual-text": - return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai) - else: - raise ValueError(f"Unsupported model type: {model_type}") - - -def load_prompts(args): - if args.dataset is None: - return None - split = "validation" - if args.split is not None: - split = args.split - if "," in args.dataset: - path_name = args.dataset.split(",") - path = path_name[0] - name = path_name[1] - else: - path = args.dataset - name = None - data = load_dataset(path=path, name=name, split=split) - - res = data[args.dataset_field] - - res = {"prompts": list(res)} - - return res - - def parse_args(): parser = argparse.ArgumentParser( prog="WWB CLI", @@ -274,9 +55,10 @@ def parse_args(): parser.add_argument( "--model-type", type=str, - choices=["text", "text-to-image", "visual-text"], + choices=["text", "text-to-image", "visual-text", "image-to-image"], default="text", - help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation.", + help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, " + "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt", ) parser.add_argument( "--data-encoder", @@ -385,6 +167,26 @@ def check_args(args): "Wether --target-model, --target-data or --gt-data should be provided") +def load_prompts(args): + if args.dataset is None: + return None + split = "validation" + if args.split is not None: + split = args.split + if "," in 
args.dataset: + path_name = args.dataset.split(",") + path = path_name[0] + name = path_name[1] + else: + path = args.dataset + name = None + data = load_dataset(path=path, name=name, split=split) + + res = data[args.dataset_field] + res = {"prompts": list(res)} + return res + + def load_tokenizer(args): tokenizer = None if args.tokenizer is not None: @@ -449,7 +251,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question): def genai_gen_image(model, prompt, num_inference_steps, generator=None): - if model.resolution[0] is not None: + if model.resolution is not None and model.resolution[0] is not None: image_tensor = model.generate( prompt, width=model.resolution[0], @@ -467,8 +269,21 @@ def genai_gen_image(model, prompt, num_inference_steps, generator=None): return image +def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=None): + image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)) + image_tensor = model.generate( + prompt, + image=image_data, + num_inference_steps=num_inference_steps, + strength=0.8, + generator=generator, + ) + image = Image.fromarray(image_tensor.data[0]) + return image + + def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question): - image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.byte)) + image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)) config = model.get_generation_config() config.max_new_tokens = max_new_tokens config.do_sample = False @@ -529,6 +344,17 @@ def create_evaluator(base_model, args): gen_answer_fn=genai_gen_visual_text if args.genai else None, processor=processor, ) + elif task == "image-to-image": + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + num_samples=args.num_samples, + num_inference_steps=args.num_inference_steps, + gen_image_fn=genai_gen_image2image if args.genai else None, + is_genai=args.genai, + seed=args.seed, + ) else: raise ValueError(f"Unsupported task: {task}") @@ -637,7 +463,7 @@ def main(): if args.verbose and (args.target_model or args.target_data): if args.model_type == "text" or args.model_type == "visual-text": print_text_results(evaluator) - elif "text-to-image" in args.model_type: + elif "text-to-image" in args.model_type or "image-to-image" in args.model_type: print_image_results(evaluator)
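
For reference, the renamed LLM pipeline tests above pass the stop_token_ids / stop_strings generation options directly as generate() keyword arguments. A minimal illustrative sketch of that usage outside the test harness follows; the model directory is a placeholder and not part of this patch.

# Illustrative sketch only, not part of the patch; "./tiny-random-phi3" stands in for
# any exported OpenVINO GenAI model directory.
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("./tiny-random-phi3", "CPU")
eos_id = pipe.get_tokenizer().get_eos_token_id()

# Stop on an explicit set of token ids and on a stop string, dropping the stop string from
# the output, mirroring test_stop_token_ids and test_stop_strings above.
result = pipe.generate(
    "What is OpenVINO?",
    max_new_tokens=30,
    stop_token_ids={eos_id},
    stop_strings={"Intel"},
    include_stop_str_in_output=False,
)
print(result)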
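
The new image-to-image WWB path can also be driven programmatically through the loader and evaluator registry introduced in model_loaders.py and image2image.py. Below is a rough sketch under assumed arguments; the model id, the gt path, and the import location of genai_gen_image2image are assumptions for the example, not requirements of the patch.

# Illustrative sketch only, not part of the patch; model id and paths are placeholders.
from whowhatbench import EVALUATOR_REGISTRY
from whowhatbench.model_loaders import load_model
from whowhatbench.wwb import genai_gen_image2image  # assumed importable from the CLI module

base_model = load_model(
    "image-to-image", "OpenVINO/LCM_Dreamshaper_v7-int8-ov", device="CPU", use_genai=True
)

# Constructing the evaluator with a base model renders reference images into a
# "reference" folder next to the gt CSV, as image2image.py above does in __init__.
evaluator = EVALUATOR_REGISTRY["image-to-image"](
    base_model=base_model,
    gt_data="gt/image2image.csv",
    num_samples=2,
    num_inference_steps=2,
    gen_image_fn=genai_gen_image2image,
    is_genai=True,
    seed=42,
)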