diff --git a/.github/actions/handle_docker/get_images_to_build.py b/.github/actions/handle_docker/get_images_to_build.py
index 84bdbe7bd449d2..3d72056f041c6d 100644
--- a/.github/actions/handle_docker/get_images_to_build.py
+++ b/.github/actions/handle_docker/get_images_to_build.py
@@ -64,7 +64,10 @@ def main():
         expected_tag = f'pr-{args.pr}'
 
         if head_tag != expected_tag:
-            logger.error(f"Please update docker tag in {args.head_tag_file} to {expected_tag}")
+            logger.error(f"Some of your changes affected the Docker environment for CI. "
+                         f"Please update the Docker tag in {args.head_tag_file} to {expected_tag}. "
+                         f"For more details, see "
+                         f"https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/ci/github_actions/docker_images.md")
             sys.exit(1)
 
     elif merge_queue_target_branch:
diff --git a/.github/actions/setup_python/action.yml b/.github/actions/setup_python/action.yml
index d1290508ab778f..ce85be46ced17e 100644
--- a/.github/actions/setup_python/action.yml
+++ b/.github/actions/setup_python/action.yml
@@ -79,9 +79,4 @@ runs:
     - if: ${{ inputs.show-cache-info == 'true' }}
       name: Get pip cache info
       shell: bash
-      run: |
-        echo "Cache size: "
-        du -h -d2 ${{ env.PIP_CACHE_DIR }}
-        echo "Cache info: "
-        python3 -m pip cache info
-      continue-on-error: true
+      run: python3 -m pip cache info
diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag
index 37b0ae41c049c4..9045291a72877a 100644
--- a/.github/dockerfiles/docker_tag
+++ b/.github/dockerfiles/docker_tag
@@ -1 +1 @@
-pr-28142
+pr-28381
\ No newline at end of file
diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml
index c0dac9816598e1..a68f5dbd976f33 100644
--- a/.github/workflows/build_doc.yml
+++ b/.github/workflows/build_doc.yml
@@ -78,13 +78,13 @@ jobs:
           echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
 
       - name: 'Upload sphinx.log'
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@v4.6.0
         with:
           name: sphinx_build_log_${{ env.PR_NUMBER }}.log
           path: build/docs/sphinx.log
 
       - name: 'Upload docs html'
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@v4.6.0
         with:
           name: openvino_docs_html_${{ env.PR_NUMBER }}.zip
           path: build/docs/openvino_docs_html.zip
@@ -101,7 +101,7 @@
 
       - name: 'Upload test results'
         if: failure()
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+        uses: actions/upload-artifact@v4.6.0
         with:
           name: openvino_docs_pytest
           path: build/docs/_artifacts/
diff --git a/.github/workflows/job_cxx_unit_tests.yml b/.github/workflows/job_cxx_unit_tests.yml
index 52a2b3f4d287c8..a2a5762b4ea0bf 100644
--- a/.github/workflows/job_cxx_unit_tests.yml
+++ b/.github/workflows/job_cxx_unit_tests.yml
@@ -195,6 +195,12 @@ jobs:
           ${{ env.SETUPVARS_COMMAND }}
           ${{ env.INSTALL_TEST_DIR }}/ov_cpu_unit_tests --gtest_print_time=1 --gtest_output=xml:${{ env.INSTALL_TEST_DIR }}/TEST-CPUUnitTests.xml
 
+      - name: CPU plugin unit tests (vectorized)
+        if: fromJSON(inputs.affected-components).CPU.test
+        run: |
+          ${{ env.SETUPVARS_COMMAND }}
+          ${{ env.INSTALL_TEST_DIR }}/ov_cpu_unit_tests_vectorized --gtest_print_time=1 --gtest_output=xml:${{ env.INSTALL_TEST_DIR }}/TEST-CPUUnitTestsVectorized.xml
+
       - name: ov_subgraphs_dumper_tests tests
         run: |
           ${{ env.SETUPVARS_COMMAND }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65a72ef8f4936e..1cbdbe72507f6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,4 +185,4 @@ endif()
 # provides a callback function to describe each component in repo
 include(cmake/packaging/packaging.cmake)
 
-ov_cpack(${OV_CPACK_COMPONENTS_ALL})
\ No newline at end of file
+ov_cpack(${OV_CPACK_COMPONENTS_ALL})
diff --git a/README.md b/README.md
index 7e9b173530de61..8019bb892023f2 100644
--- a/README.md
+++ b/README.md
@@ -127,6 +127,7 @@ Learn how to run LLMs and GenAI with [Samples](https://github.com/openvinotoolki
 - [OpenVINO Execution Provider for ONNX Runtime](https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html) - use OpenVINO as a backend with your existing ONNX Runtime code.
 - [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/openvino/) - build context-augmented GenAI applications with the LlamaIndex framework and enhance runtime performance with OpenVINO.
 - [LangChain](https://python.langchain.com/docs/integrations/llms/openvino/) - integrate OpenVINO with the LangChain framework to enhance runtime performance for GenAI applications.
+- [Keras 3](https://github.com/keras-team/keras) - use the multi-backend Keras 3 deep learning framework and switch model inference to the OpenVINO backend using the Keras API.
 
 Check out the [Awesome OpenVINO](https://github.com/openvinotoolkit/awesome-openvino) repository to discover a collection of community-made AI projects based on OpenVINO!
 
diff --git a/cmake/developer_package/compile_flags/os_flags.cmake b/cmake/developer_package/compile_flags/os_flags.cmake
index e70de45b9416b1..759a9080188639 100644
--- a/cmake/developer_package/compile_flags/os_flags.cmake
+++ b/cmake/developer_package/compile_flags/os_flags.cmake
@@ -104,6 +104,7 @@ macro(ov_check_compiler_supports_sve flags)
     int main() {
         svfloat64_t a;
         a = svdup_n_f64(0);
+        (void)a; // avoid unused-variable warnings
         return 0;
     }")
 
@@ -259,7 +260,6 @@ endmacro()
 macro(ov_arm_sve_optimization_flags flags)
     # Check for compiler SVE support
     ov_check_compiler_supports_sve("-march=armv8-a+sve")
-
     if(OV_COMPILER_IS_INTEL_LLVM)
         message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}")
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
@@ -449,6 +449,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     # Build with multiple processes
     ov_add_compiler_flags(/MP)
 
+    # Specifies both the source character set and the execution character set as UTF-8.
+    # For details, see https://learn.microsoft.com/en-us/cpp/build/reference/utf-8-set-source-and-executable-character-sets-to-utf-8?view=msvc-170
+    ov_add_compiler_flags(/utf-8)
+
     # Workaround for an MSVC compiler issue in some versions of Visual Studio 2022.
     # The issue involves a null dereference to a mutex. For details, refer to link https://github.com/microsoft/STL/wiki/Changelog#vs-2022-1710
     if(MSVC AND MSVC_VERSION GREATER_EQUAL 1930)
diff --git a/cmake/templates/OpenVINOConfig.cmake.in b/cmake/templates/OpenVINOConfig.cmake.in
index 46550e36aacde3..209d38106c2b7c 100644
--- a/cmake/templates/OpenVINOConfig.cmake.in
+++ b/cmake/templates/OpenVINOConfig.cmake.in
@@ -549,8 +549,13 @@ if(_ov_as_external_package)
     foreach(target IN LISTS _ov_imported_libs)
         if(TARGET ${target})
             get_target_property(imported_configs ${target} IMPORTED_CONFIGURATIONS)
-            if(NOT RELWITHDEBINFO IN_LIST imported_configs)
-                set_property(TARGET ${target} PROPERTY MAP_IMPORTED_CONFIG_RELWITHDEBINFO RELEASE)
+            if(RELEASE IN_LIST imported_configs)
+                if(NOT RELWITHDEBINFO IN_LIST imported_configs)
+                    set_property(TARGET ${target} PROPERTY MAP_IMPORTED_CONFIG_RELWITHDEBINFO RELEASE)
+                endif()
+                if(NOT MINSIZEREL IN_LIST imported_configs)
+                    set_property(TARGET ${target} PROPERTY MAP_IMPORTED_CONFIG_MINSIZEREL RELEASE)
+                endif()
             endif()
             unset(imported_configs)
         endif()
diff --git a/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst b/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst
index 3bb46116ee1748..e38bcb64d90530 100644
--- a/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst
+++ b/docs/articles_en/about-openvino/compatibility-and-support/supported-devices.rst
@@ -90,16 +90,3 @@ topic (step 3 "Configure input and output").
 
 | \* **Of the Linux systems, versions 22.04 and 24.04 include drivers for NPU.**
 | **For Windows, CPU inference on ARM64 is not supported.**
-
-.. note::
-
-   With the OpenVINO 2024.0 release, support for GNA has been discontinued. To keep using it
-   in your solutions, revert to the 2023.3 (LTS) version.
-
-   With the OpenVINO™ 2023.0 release, support has been cancelled for:
-
-   - Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X
-   - Intel® Vision Accelerator Design with Intel® Movidius™
-
-   To keep using the MYRIAD and HDDL plugins with your hardware,
-   revert to the OpenVINO 2022.3 (LTS) version.
diff --git a/docs/articles_en/about-openvino/release-notes-openvino.rst b/docs/articles_en/about-openvino/release-notes-openvino.rst
index bf475159380dff..0134ed15215541 100644
--- a/docs/articles_en/about-openvino/release-notes-openvino.rst
+++ b/docs/articles_en/about-openvino/release-notes-openvino.rst
@@ -91,7 +91,7 @@ Jupyter Notebooks
 
 
 
-Previous 2024 releases
+Previous 2025 releases
 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
 .. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -106,1555 +106,6 @@ Previous 2025 releases
 
 * More GenAI coverage and framework integrations to minimize code changes.
 
-  * New models supported: Llama 3.2 (1B & 3B), Gemma 2 (2B & 9B), and YOLO11.
-  * LLM support on NPU: Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, Qwen2-7B-Instruct and Phi-3
-    Mini-Instruct.
-  * Noteworthy notebooks added: Sam2, Llama3.2, Llama3.2 - Vision, Wav2Lip, Whisper, and Llava.
-  * Preview: support for Flax, a high-performance Python neural network library based on JAX.
-    Its modular design allows for easy customization and accelerated inference on GPUs.
-
-  * Broader Large Language Model (LLM) support and more model compression techniques.
- - * Optimizations for built-in GPUs on Intel® Core™ Ultra Processors (Series 1) and Intel® Arc™ - Graphics include KV Cache compression for memory reduction along with improved usability, - and model load time optimizations to improve first token latency for LLMs. - * Dynamic quantization was enabled to improve first token latency for LLMs on built-in - Intel® GPUs without impacting accuracy on Intel® Core™ Ultra Processors (Series 1). Second - token latency will also improve for large batch inference. - * A new method to generate synthetic text data is implemented in the Neural Network - Compression Framework (NNCF). This will allow LLMs to be compressed more accurately using - data-aware methods without datasets. Coming soon: This feature will soon be accessible via - Optimum Intel on Hugging Face. - - * More portability and performance to run AI at the edge, in the cloud, or locally. - - * Support for - `Intel® Xeon® 6 Processors with P-cores `__ - (formerly codenamed Granite Rapids) and - `Intel® Core™ Ultra 200V series processors `__ - (formerly codenamed Arrow Lake-S). - * Preview: GenAI API enables multimodal AI deployment with support for multimodal pipelines - for improved contextual awareness, transcription pipelines for easy audio-to-text - conversions, and image generation pipelines for streamlined text-to-visual conversions. - * Speculative decoding feature added to the GenAI API for improved performance and efficient - text generation using a small draft model that is periodically corrected by the full-size - model. - * Preview: LoRA adapters are now supported in the GenAI API for developers to quickly and - efficiently customize image and text generation models for specialized tasks. - * The GenAI API now also supports LLMs on NPU allowing developers to specify NPU as the - target device, specifically for WhisperPipeline (for whisper-base, whisper-medium, and - whisper-small) and LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, - Qwen2-7B-Instruct and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for - best performance. - - *Now deprecated* - - * Python 3.8 is no longer supported: - - - **OpenVINO™ Runtime** - - *Common* - - * Numpy 2.x has been adopted for all currently supported components, including NNCF. - * A new constant constructor has been added, enabling constants to be created from data pointer - as shared memory. Additionally, it can take ownership of a shared, or other, object, avoiding - a two-step process to wrap memory into ``ov::Tensor``. - * Asynchronous file reading with mmap library has been implemented, reducing loading times for - model files, especially for LLMs. - * CPU implementation of SliceScatter operator is now available, used for models such as Gemma, - supporting increased LLM performance. - - - *CPU Device Plugin* - - * Gold support of the Intel® Xeon® 6 platform with P-cores (formerly code name Granite Rapids) - has been reached. - * Support of Intel® Core™ Ultra 200V series processors (formerly codenamed Arrow Lake-S) has - been implemented. - * LLM performance has been further improved with Rotary Position Embedding optimization; Query, - Key, and Value; and multi-layer perceptron fusion optimization. - * FP16 support has been extended with SDPA and PagedAttention, improving performance of LLM via - both native APIs and the vLLM integration. - * Models with LoRA adapters are now supported. - - - *GPU Device Plugin* - - * The KV cache INT8 compression mechanism is now available for all supported GPUs. 
It enables a - significant reduction in memory consumption, increasing performance with a minimal impact to - accuracy (it affects systolic devices slightly more than non-systolic ones). The feature is - activated by default for non-systolic devices. - * LoRA adapters are now functionally supported on GPU. - * A new feature of GPU weightless blob caching enables caching model structure only and reusing - the weights from the original model file. Use the new OPTIMIZE_SIZE property to activate. - * Dynamic quantization with INT4 and INT8 precisions has been implemented and enabled by - default on Intel® Core™ Ultra platforms, improving LLM first token latency. - - - *NPU Device Plugin* - - * Models retrieved from the OpenVINO cache have a smaller memory footprint now. The plugin - releases the cached model (blob) after weights are loaded in NPU regions. Model export is not - available in this scenario. Memory consumption is reduced during inference execution with one - blob size. This optimization requires the latest NPU driver: 32.0.100.3104. - * A driver bug for ``ov::intel_npu::device_total_mem_size`` has been fixed. The plugin will now - report 2GB as the maximum allocatable memory for any driver that does not support graph - extension 1.8. Even if older drivers report a larger amount of memory to be available, memory - allocation would fail when 2GB are exceeded. Plugin reports the number that driver exposes - for any driver that supports graph extension 1.8 (or newer). - * A new API is used to initialize the model (available in graph extension 1.8). - * Inference request set_tensors is now supported. - * ``ov::device::LUID`` is now exposed on Windows. - * LLM-related improvements have been implemented in terms of both memory usage and performance. - * AvgPool and MaxPool operator support has been extended, adding support for more PyTorch models. - - * NOTE: for systems based on Intel® Core™ Ultra Processors Series 2, more than 16GB of RAM may - be required to use larger models, such as Llama-2-7B, Mistral-0.2-7B, and Qwen-2-7B - (exceeding 4B parameters) with prompt sizes over 1024 tokens. - - - *OpenVINO Python API* - - * Constant now can be created from openvino.Tensor. - * The “release_memory” method has been added for a compiled model, improving control over - memory consumption. - - - - *OpenVINO Node.js API* - - * Querying the best device to perform inference of a model with specific operations - is now available in JavaScript API. - * Contribution guidelines have been improved to make it easier for developers to contribute. - * Testing scope has been extended by inference in end-to-end tests. - * JavaScript API samples have been improved for readability and ease of running. - - - - *TensorFlow Framework Support* - - * TensorFlow 2.18.0, Keras 3.6.0, NumPy 2.0.2 in Python 3.12, and NumPy 1.26.4 in other Python - versions have been added to validation. - * Out-of-the-box conversion with static ranks has been improved by devising a new shape for - Switch-Merge condition sub-graphs. - * Complex type for the following operations is now supported: ExpandDims, Pack, Prod, Rsqrt, - ScatterNd, Sub. - * The following issues have been fixed: - - * the corner case with one element in LinSpace to avoid division by zero, - * support FP16 and FP64 input types for LeakyRelu, - * support non-i32/i64 output index type for ArgMin/Max operations. - - - - *PyTorch Framework Support* - - * PyTorch version 2.5 is now supported. 
- * OpenVINO Model Converter (OVC) now supports TorchScript and ExportedProgram saved on a drive. - * The issue of aten.index.Tensor conversion for indices with “None” values has been fixed, - helping to support the HF Stable Diffusion model in ExportedProgram format. - - - - *ONNX Framework Support* - - * ONNX version 1.17.0 is now used. - * Customers' models with DequantizeLinear-21, com.microsoft.MatMulNBits, and - com.microsoft.QuickGelu operations are now supported. - - *JAX/Flax Framework Support* - - * JAX 0.4.35 and Flax 0.10.0 has been added to validation. - * jax._src.core.ClosedJaxpr object conversion is now supported. - * Vision Transformer from google-research/vision_transformer is now supported - (with support for 37 new operations). - - - **OpenVINO Model Server** - - * The OpenAI API text embedding endpoint has been added, enabling OVMS to be used as a building - block for AI applications like RAG. - `(read more) `__ - * The rerank endpoint has been added based on Cohere API, enabling easy similarity detection - between a query and a set of documents. It is one of the building blocks for AI applications - like RAG and makes integration with frameworks such as langchain easy. - `(read more) `__ - * The following improvements have been done to LLM text generation: - - * The ``echo`` sampling parameter together with ``logprobs`` in the ``completions`` endpoint - is now supported. - * Performance has been increased on both CPU and GPU. - * Throughput in high-concurrency scenarios has been increased with dynamic_split_fuse for GPU. - * Testing coverage and stability has been improved. - * The procedure for service deployment and model repository preparation has been simplified. - - * An experimental version of a Windows binary package - native model server for Windows OS - is - available. This release includes a set of limitations and has limited tests coverage. It is - intended for testing, while the production-ready release is expected with 2025.0. All feedback - is welcome. - - - **Neural Network Compression Framework** - - * A new nncf.data.generate_text_data() method has been added for generating a synthetic dataset - for LLM compression. This approach helps to compress LLMs more accurately in situations when - the dataset is not available or not sufficient. - `See our example `__ - for more information about the usage. - * Support of data-free and data-aware weight compression methods - nncf.compress_weights() - - has been extended with NF4 per-channel quantization, making compressed LLMs more accurate and - faster on NPU. - * Caching of computed statistics in nncf.compress_weights() is now available, significantly - reducing compression time when performing compression of the same LLM multiple times, with - different compression parameters. To enable it, set the advanced ``statistics_path`` parameter - of nncf.compress_weights() to the desired file path location. - * The ``backup_mode`` optional parameter has been added to nncf.compress_weights(), for - specifying the data type for embeddings, convolutions, and last linear layers during 4-bit - weight compression. Available options are INT8_ASYM (default), INT8_SYM, and NONE (retains - the original floating-point precision of the model weights). In certain situations, - non-default value might give better accuracy of compressed LLMs. - * Preview support is now available for optimizing models in Torch - `FX format `__, nncf.quantize(), and - nncf.compress_weights() methods. 
After optimization such models can be directly executed - via torch.compile(compressed_model, backend="openvino"). For more details, see - `INT8 quantization example `__. - * Memory consumption of data-aware weight compression methods - nncf.compress_weights() – has - been reduced significantly, with some variation depending on the model and method. - * Support for the following has changed: - - * NumPy 2 added - * PyTorch upgraded to 2.5.1 - * ONNX upgraded to 1.17 - * Python 3.8 discontinued - - - - **OpenVINO Tokenizers** - - * Several operations have been introduced and optimized. - * Conversion parameters and environment info have been added to ``rt_info``, improving - reproducibility and debugging. - - - - **OpenVINO.GenAI** - - * The following has been added: - - * LoRA adapter for the LLMPipeline. - * Text2ImagePipeline with LoRA adapter and text2image samples. - * VLMPipeline and visual_language_chat sample for text generation models with text and image - inputs. - * WhisperPipeline and whisper_speech_recognition sample. - - * speculative_decoding_lm has been moved to LLMPipeline based implementation and is now - installed as part of the package. - * On NPU, a set of pipelines has been enabled: WhisperPipeline (for whisper-base, - whisper-medium, and whisper-small), LLMPipeline (for Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, - Qwen2-7B-Instruct, and Phi-3 Mini-instruct). Use driver version 32.0.100.3104 or later for - best performance. - - - - - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - * `Text-to-Image generation using OpenVINO GenAI `__ - * `Multi LoRA Image Generation `__ - * `Virtual Try-on using OpenVINO and CatVTON `__ - * `Visual Language Assistant using OpenVINO GenAI `__ - * `Speech recognition using OpenVINO GenAI `__ - * `YoloV11 `__ - * `Llama-3.2-vision `__ - * `Pixtral `__ - * `Segment Anything 2 `__ - * `Video Lips-sync using Wav2Lip `__ - * `Convert JAX to OpenVINO tutorial `__ - - - *Known Issues* - - | **Component: CPU Plugin** - | ID: 155898 - | Description: - | Description: When using new version of Transformer version to convert some of LLMs - (GPT-J/GPT-NeoX or falcon-7b), the inference accuracy may be impacted on 4th or 5th - generation of Intel® Xeon® processors, due to model structure update triggering inference - precision difference in part of the model. The workaround is to use transformer version of - 4.44.2 or lower. - - | **Component: GPU Plugin** - | ID: 154583 - | Description: - | LLM accuracy can be low especially on non-systolic platforms like Intel® Core™ Ultra. When - facing the low accuracy issue, user needs to manually set a config ACTIVATION_SCALING_FACOTR - with a value of 8.0 in the compile_model() function. From the next release, scaling factor - value will be automatically applied through updated IR. - - | **Component: GenAI** - | ID: 156437, 148933 - | Description: - | When using Python GenAI APIs, if ONNX 17.0 and later is installed, it may encounter the - error “DLL load failed while importing onnx_cpp2py_export: A dynamic link library (DLL) - initialization routine failed.” It is due to the ONNX dependency issue - `onnx/onnx#6267 `__, - Install - `Microsoft Visual C++ Redistributable `__ - latest supported downloads to fix the issue. 
- - | **Component: GenAI** - | ID: 156944 - | Description: - | There were backward incompatible changes resulting in different text generated by LLMs like - Mistralai/Mistral-7B-Instruct-v0.2 and TinyLlama/TinyLlama-1.1B-Chat-v1.0 when using a - tokenizer converted by older openvino_tolenizers. A way to resolve the issue is to convert - tokenizer and detokenizer models using the latest openvino_tokenizers. - - - - - - - - -.. dropdown:: 2024.4 - 19 September 2024 - :animate: fade-in-slide-down - :color: secondary - - **What's new** - - * More Gen AI coverage and framework integrations to minimize code changes. - - * Support for GLM-4-9B Chat, MiniCPM-1B, Llama 3 and 3.1, Phi-3-Mini, Phi-3-Medium and - YOLOX-s models. - * Noteworthy notebooks added: Florence-2, NuExtract-tiny Structure Extraction, Flux.1 Image - Generation, PixArt-α: Photorealistic Text-to-Image Synthesis, and Phi-3-Vision Visual - Language Assistant. - - * Broader Large Language Model (LLM) support and more model compression techniques. - - * OpenVINO™ runtime optimized for Intel® Xe Matrix Extensions (Intel® XMX) systolic arrays on - built-in GPUs for efficient matrix multiplication resulting in significant LLM performance - boost with improved 1st and 2nd token latency, as well as a smaller memory footprint on - Intel® Core™ Ultra Processors (Series 2). - * Memory sharing enabled for NPUs on Intel® Core™ Ultra Processors (Series 2) for efficient - pipeline integration without memory copy overhead. - * Addition of the PagedAttention feature for discrete GPUs* enables a significant boost in - throughput for parallel inferencing when serving LLMs on Intel® Arc™ Graphics or Intel® - Data Center GPU Flex Series. - - * More portability and performance to run AI at the edge, in the cloud, or locally. - - * Support for Intel® Core™ Ultra Processors Series 2 (formerly codenamed Lunar Lake) on Windows. - * OpenVINO™ Model Server now comes with production-quality support for OpenAI-compatible API - which enables significantly higher throughput for parallel inferencing on Intel® Xeon® - processors when serving LLMs to many concurrent users. - * Improved performance and memory consumption with prefix caching, KV cache compression, and - other optimizations for serving LLMs using OpenVINO™ Model Server. - * Support for Python 3.12. - * Support for Red Hat Enterprise Linux (RHEL) version 9.3 - 9.4. - - *Now deprecated* - - * The following will not be available beyond the 2024.4 OpenVINO version: - - * The macOS x86_64 debug bins - * Python 3.8 - * Discrete Keem Bay support - - * Intel® Streaming SIMD Extensions (Intel® SSE) will be supported in source code form, but not - enabled in the binary package by default, starting with OpenVINO 2025.0. - - Check the `deprecation section <#deprecation-and-support>`__ for more information. - - **OpenVINO™ Runtime** - - *Common* - - * Encryption and decryption of topology in model cache is now supported with callback functions - provided by the user (CPU only for now; ov::cache_encryption_callbacks). - * The Ubuntu20 and Ubuntu22 Docker images now include the tokenizers and GenAI CPP modules, - including pre-installed Python modules, in development versions of these images. - * Python 3.12 is now supported. - - *CPU Device Plugin* - - * The following is now supported: - - * Tensor parallel feature for multi-socket CPU inference, with performance improvement for - LLMs with 6B+ parameters (enabled through model_distribution_policy hint configurations). 
- * RMSNorm operator, optimized with JIT kernel to improve both the 1st and 2nd token - performance of LLMs. - - * The following has been improved: - - * vLLM support, with PagedAttention exposing attention score as the second output. It can now - be used in the cache eviction algorithm to improve LLM serving performance. - * 1st token performance with Llama series of models, with additional CPU operator optimization - (such as MLP, SDPA) on BF16 precision. - * Default oneTBB version on Linux is now 2021.13.0, improving overall performance on latest - Intel® Xeon® platforms. - * MXFP4 weight compression models (compressing weights to 4-bit with the e2m1 data type - without a zero point and with 8-bit e8m0 scales) have been optimized for Intel® Xeon® - platforms thanks to fullyconnected compressed weight LLM support. - - * The following has been fixed: - - * Memory leak when ov::num_streams value is 0. - * CPU affinity mask is changed after OpenVINO execution when OpenVINO is compiled - with -DTHREADING=SEQ. - - - *GPU Device Plugin* - - * Dynamic quantization for LLMs is now supported on discrete GPU platforms. - * Stable Diffusion 3 is now supported with good accuracy on Intel GPU platforms. - * Both first and second token latency for LLMs have been improved on Intel GPU platforms. - * The issue of model cache not regenerating with the value changes of - ``ov::hint::performance_mode`` or ``ov::hint::dynamic_quantization_group_size`` has been - fixed. - - - *NPU Device Plugin* - - * `Remote Tensor API `__ - is now supported. - * You can now query the available number of tiles (ov::intel_npu::max_tiles) and force a - specific number of tiles to be used by the model, per inference request - (ov::intel_npu::tiles). **Note:** ov::intel_npu::tiles overrides the default number of tiles - selected by the compiler based on performance hints (ov::hint::performance_mode). Any tile - number other than 1 may be a problem for cross platform compatibility, if not tested - explicitly versus the max_tiles value. - * You can now bypass the model caching mechanism in the driver - (ov::intel_npu::bypass_umd_caching). Read more about driver and OpenVINO caching. - * Memory footprint at model execution has been reduced by one blob (compiled model) size. - For execution, the plugin no longer retrieves the compiled model from the driver, it uses the - level zero graph handle directly, instead. The compiled model is now retrieved from the driver - only during the export method. - - - *OpenVINO Python API* - - * Openvino.Tensor, when created in the shared memory mode, now prevents “garbage collection” of - numpy memory. - * The ``openvino.experimental`` submodule is now available, providing access to experimental - functionalities under development. - * New python-exclusive openvino.Model constructors have been added. - * Image padding in PreProcessor is now available. - * OpenVINO Runtime is now compatible with numpy 2.0. - - - *OpenVINO Node.js API* - - * The following has been improved - - * Unit tests for increased efficiency and stability - * Security updates applied to dependencies - - * `Electron `__ - compatibility is now confirmed with new end-to-end tests. - * `New API methods `__ added. - - - *TensorFlow Framework Support* - - * TensorFlow 2.17.0 is now supported. - * JAX 0.4.31 is now supported via a path of jax2tf with native_serialization=False - * `8 NEW* operations `__ - have been added. 
- * Tensor lists with multiple undefined dimensions in element_shape are now supported, enabling - support for TF Hub lite0-detection/versions/1 model. - - - *PyTorch Framework Support* - - * Torch 2.4 is now supported. - * Inplace ops are now supported automatically if the regular version is supported. - * Symmetric GPTQ model from Hugging Face will now be automatically converted to the signed type - (INT4) and zero-points will be removed. - - - *ONNX Framework Support* - - * ONNX 1.16.0 is now supported - * models with constants/inputs of uINT4/INT4 types are now supported. - * 4 NEW operations have been added. - - - **OpenVINO Model Server** - - * OpenAI API for text generation is now officially supported and recommended for production - usage. It comes with the following new features: - - * Prefix caching feature, caching the prompt evaluation to speed up text generation. - * Ability to compress the KV Cache to a lower precision, reducing memory consumption without - a significant loss of accuracy. - * ``stop`` sampling parameters, to define a sequence that stops text generation. - * ``logprobs`` sampling parameter, returning the probabilities to returned tokens. - * Generic metrics related to execution of the MediaPipe graph that can be used for autoscaling - based on the current load and the level of concurrency. - * `Demo of text generation horizontal scalability `__ - using basic docker containers and Kubernetes. - * Automatic cancelling of text generation for disconnected clients. - * Non-UTF-8 responses from the model can be now automatically changed to Unicode replacement - characters, due to their configurable handling. - * Intel GPU with paged attention is now supported. - * Support for Llama3.1 models. - - * The following has been improved: - - * Handling of model templates without bos_token is now fixed. - * Performance of the multinomial sampling algorithm. - * ``finish_reason`` in the response correctly determines reaching max_tokens (length) and - completing the sequence (stop). - * Security and stability. - - - - **Neural Network Compression Framework** - - * The LoRA Correction algorithm is now included in the Weight Compression method, improving the - accuracy of INT4-compressed models on top of other data-aware algorithms, such as AWQ and - Scale Estimation. To enable it, set the lora_correction option to True in - nncf.compress_weights(). - * The GPTQ compression algorithm can now be combined with the Scale Estimation algorithm, - making it possible to run GPTQ, AWQ, and Scale Estimation together, for the optimum-accuracy - INT4-compressed models. - * INT8 quantization of LSTMSequence and Convolution operations for constant inputs is now - enabled, resulting in better performance and reduced model size. - - - **OpenVINO Tokenizers** - - * Split and BPE tokenization operations have been reimplemented, resulting in improved - tokenization accuracy and performance. - * New building options are now available, offering up to a 12x reduction in binary size. - * An operation is now available to validate and skip/replace model-generated non-Unicode - bytecode sequences during detokenization. - - **OpenVINO.GenAI** - - * New samples and pipelines are now available: - - * An example IterableStreamer implementation in - `multinomial_causal_lm/python sample `__ - - * GenAI compilation is now available as part of OpenVINO via the –DOPENVINO_EXTRA_MODULES CMake - option. 
- - - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - * `Florence-2 `__ - * `NuExtract: Structure Extraction `__ - * `Flux.1 Image Generation `__ - * `PixArt-α: Photorealistic Text-to-Image Synthesis `__ - * `Phi-3-Vision Visual Language Assistant `__ - * `MiniCPMV2.6 `__ - * `InternVL2 `__ - * The list of supported models in - `LLM chatbot `__ - now includes Phi3.5, Gemma2 support - - *Known Issues* - - | **Component: CPU** - | ID: CVS-150542, CVS-145996 - | Description: - | The upgrade of default oneTBB on Linux platforms to 2021.13.0 improves overall - performance on latest Intel® Xeon® platform but causes regression in some cases. Limit the - threads usage of postprocessing done by Torch can mitigate the regression (For example: - torch.set_num_threads(n), n can be 1, beam search number, prompt batch size or other - numbers). - - | **Component: OpenVINO.Genai** - | ID: 149694 - | Description: - | Passing openvino.Tensor instance to LLMPipleine triggers incompatible arguments error if - OpenVINO and GenAI are installed from PyPI on Windows. - - | **Component: OpenVINO.Genai** - | ID: 148308 - | Description: - | OpenVINO.GenAI archive doesn't have debug libraries for OpenVINO Tokenizers and - OpenVINO.GenAI. - - | **Component: ONNX for ARM** - | ID: n/a - | Description: - | For ARM binaries, the `1.16 ONNX library `__ - is not yet available. The ONNX library for ARM, version 1.15, does not include the latest - functional and security updates. Users should update to the latest version as it becomes - available. - | Currently, if an unverified AI model is supplied to the ONNX frontend, it could lead to a - directory traversal issue. Ensure that the file name and file path that a model contains - are verified and correct. To learn more about the vulnerability, see: - `CVE-2024-27318 `__ and - `CVE-2024-27319 `__. - - | **Component: Kaldi** - | ID: n/a - | Description: - | There is a known issue with the Kaldi DL framework support on the Python version 3.12 due - to the numpy version incompatibilities. As Kaldi support in OpenVINO is currently deprecated - and will be discontinued with version 2025.0, the issue will not be addressed. - - - - - -.. dropdown:: 2024.3 - 31 July 2024 - :animate: fade-in-slide-down - :color: secondary - - **What's new** - - * More Gen AI coverage and framework integrations to minimize code changes. - - * OpenVINO pre-optimized models are now available in Hugging Face making it easier for developers - to get started with these models. - - * Broader Large Language Model (LLM) support and more model compression techniques. - - * Significant improvement in LLM performance on Intel discrete GPUs with the addition of - Multi-Head Attention (MHA) and OneDNN enhancements. - - * More portability and performance to run AI at the edge, in the cloud, or locally. - - * Improved CPU performance when serving LLMs with the inclusion of vLLM and continuous batching - in the OpenVINO Model Server (OVMS). vLLM is an easy-to-use open-source library that supports - efficient LLM inferencing and model serving. - * Ubuntu 24.04 is now officially supported. - - **OpenVINO™ Runtime** - - *Common* - - * OpenVINO may now be used as a backend for vLLM, offering better CPU performance due to - fully-connected layer optimization, fusing multiple fully-connected layers (MLP), U8 KV cache, - and dynamic split fuse. - * Ubuntu 24.04 is now officially supported, which means OpenVINO is now validated on this - system (preview support). 
- * The following have been improved: - - * Increasing support for models like YoloV10 or PixArt-XL-2, thanks to enabling Squeeze and - Concat layers. - * Performance of precision conversion FP16/BF16 -> FP32. - - *AUTO Inference Mode* - - * Model cache is now disabled for CPU acceleration even when cache_dir is set, because CPU - acceleration is skipped when the cached model is ready for the target device in the 2nd run. - - *Heterogeneous Inference Mode* - - * PIPELINE_PARALLEL policy is now available, to inference large models on multiple devices per - available memory size, being especially useful for large language models that don't fit into - one discrete GPU (a preview feature). - - *CPU Device Plugin* - - * Fully Connected layers have been optimized together with RoPE optimization with JIT kernel to - improve performance for LLM serving workloads on Intel AMX platforms. - * Dynamic quantization of Fully Connected layers is now enabled by default on Intel AVX2 and - AVX512 platforms, improving out-of-the-box performance for 8bit/4bit weight-compressed LLMs. - * Performance has been improved for: - - * ARM server configuration, due to migration to Intel® oneAPI Threading Building Blocks 2021.13. - * ARM for FP32 and FP16. - - *GPU Device Plugin* - - * Performance has been improved for: - - * LLMs and Stable Diffusion on discrete GPUs, due to latency decrease, through optimizations - such as Multi-Head Attention (MHA) and oneDNN improvements. - * Whisper models on discrete GPU. - - - *NPU Device Plugin* - - * NPU inference of LLMs is now supported with GenAI API (preview feature). To support LLMs on - NPU (requires the most recent version of the NPU driver), additional relevant features are - also part of the NPU plugin now. - * Models bigger than 2GB are now supported on both NPU driver - (Intel® NPU Driver - Windows* 32.0.100.2540) and NPU plugin side (both Linux and Windows). - * Memory optimizations have been implemented: - - * Weights are no longer copied from NPU compiler adapter. - * Improved memory and first-ever inference latency for inference on NPU. - - *OpenVINO Python API* - - * visit_attributes is now available for custom operation implemented in Python, enabling - serialization of operation attributes. - * Python API is now extended with new methods for Model class, e.g. Model.get_sink_index, new - overloads for Model.get_result_index. - - *OpenVINO Node.js API* - - * Tokenizers and StringTensor are now supported for LLM inference. - * Compatibility with electron.js is now restored for desktop application developers. - * Async version of Core.import_model and enhancements for Core.read_model methods are now - available, for more efficient model reading, especially for LLMs. - - *TensorFlow Framework Support* - - * Models with keras.LSTM operations are now more performant in CPU inference. - * The tensor list initialized with an undefined element shape value is now supported. - - *TensorFlow Lite Framework Support* - - * Constants containing spare tensors are now supported. - - *PyTorch Framework Support* - - * Setting types/shapes for nested structures (e.g., dictionaries and tuples) is now supported. - * The aten::layer_norm has been updated to support dynamic shape normalization. - * Dynamic shapes support in the FX graph has been improved, benefiting torch.compile and - torch.export based applications, improving performance for gemma and chatglm model - families. 
- - *ONNX Framework Support* - - * More models are now supported: - - * Models using the new version of the ReduceMean operation (introduced in ONNX opset 18). - * Models using the Multinomial operation (introduced in ONNX opset 7). - - - **OpenVINO Model Server** - - * The following has been improved in OpenAI API text generation: - - * Performance results, due to OpenVINO Runtime and sampling algorithms. - * Reporting generation engine metrics in the logs. - * Extra sampling parameters added. - * Request parameters affecting memory consumption now have value restrictions, within a - configurable range. - - * The following has been fixed in OpenAI API text generation: - - * Generating streamer responses impacting incomplete utf-8 sequences. - * A sporadic generation hang. - * Incompatibility of the last response from the ``completions`` endpoint stream with the vLLM - benchmarking script. - - **Neural Network Compression Framework** - - * The `MXFP4 `__ - data format is now supported in the Weight Compression method, compressing weights to 4-bit - with the e2m1 data type without a zero point and with 8-bit e8m0 scales. This feature - is enabled by setting ``mode=CompressWeightsMode.E2M1`` in nncf.compress_weights(). - * The AWQ algorithm in the Weight Compression method has been extended for patterns: - Act->MatMul and Act->MUltiply->MatMul to cover the Phi family models. - * The representation of symmetrically quantized weights has been updated to a signed data type - with no zero point. This allows NPU to support compressed LLMs with the symmetric mode. - * BF16 models in Post-Training Quantization are now supported; nncf.quantize(). - * `Activation Sparsity `__ (Contextual Sparsity) algorithm in - the Weight Compression method is now supported (preview), speeding up LLM inference. - The algorithm is enabled by setting the ``target_sparsity_by_scope`` option in - nncf.compress_weights() and supports Torch models only. - - - **OpenVINO Tokenizers** - - * The following is now supported: - - * Full Regex syntax with the PCRE2 library for text normalization and splitting. - * Left padding side for all tokenizer types. - - * GLM-4 tokenizer support, as well as detokenization support for Phi-3 and Gemma have been - improved. - - - - - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - * `Stable Diffusion V3 `__ - * `Depth Anything V2 `__ - * `RAG System with LLamaIndex `__ - * `Image Synthesis with Pixart `__ - * `Function calling LLM agent with Qwen-Agent `__ - * `Jina-CLIP `__ - * `MiniCPM -V2 Visual Language Assistant `__ - * `OpenVINO XAI: first steps `__ - * `OpenVINO XAI: deep dive `__ - * `LLM Agent with LLamaIndex `__ - * `Stable Audio `__ - * `Phi-3-vision `__ - - *OpenVINO.GenAI* - - * Performance counters have been added. - * Preview support for NPU is now available. - - *Hugging Face* - - OpenVINO pre-optimized models are now available on Hugging Face: - - * Phi-3-mini-128k-instruct ( - `INT4 `__, - `INT8 `__, - `FP16 `__) - * Mistral-7B-Instruct-v0.2 ( - `INT4 `__, - `INT8 `__, - `FP16 `__) - * Mixtral-8x7b-Instruct-v0.1 ( - `INT4 `__, - `INT8 `__) - * LCM_Dreamshaper_v7 ( - `INT8 `__, - `FP16 `__) - * starcoder2-7b ( - `INT4 `__, - `INT8 `__, - `FP16 `__) - * For all the models see `HuggingFace `__ - - - - - *Known Issues* - - | **Component: OpenVINO.GenAI** - | ID: 148308 - | Description: - | The OpenVINO.GenAI archive distribution doesn't include debug libraries for OpenVINO - Tokenizers and OpenVINO.GenAI. 
- - | **Component: GPU** - | ID: 146283 - | Description: - | For some LLM models, longer prompts, such as several thousand tokens, may result in - decreased accuracy on the GPU plugin. - | Workaround: - | It is recommended to run the model in the FP32 precision to avoid the issue. - - - - - -.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -.. dropdown:: 2024.2 - 17 June 2024 - :animate: fade-in-slide-down - :color: secondary - - **What's new** - - * More :doc:`Gen AI <../learn-openvino/llm_inference_guide/genai-guide>` coverage and framework - integrations to minimize code changes. - - * Llama 3 optimizations for CPUs, built-in GPUs, and discrete GPUs for improved performance - and efficient memory usage. - * Support for Phi-3-mini, a family of AI models that leverages the power of small language - models for faster, more accurate and cost-effective text processing. - * Python Custom Operation is now enabled in OpenVINO making it easier for Python developers - to code their custom operations instead of using C++ custom operations (also supported). - Python Custom Operation empowers users to implement their own specialized operations into - any model. - * Notebooks expansion to ensure better coverage for new models. Noteworthy notebooks added: - DynamiCrafter, YOLOv10, Chatbot notebook with Phi-3, and QWEN2. - - - * Broader Large Language Model (LLM) support and more model compression techniques. - - * GPTQ method for 4-bit weight compression added to NNCF for more efficient inference and - improved performance of compressed LLMs. - * Significant LLM performance improvements and reduced latency for both built-in GPUs and - discrete GPUs. - * Significant improvement in 2nd token latency and memory footprint of FP16 weight LLMs on - AVX2 (13th Gen Intel® Core™ processors) and AVX512 (3rd Gen Intel® Xeon® Scalable Processors) - based CPU platforms, particularly for small batch sizes. - - * More portability and performance to run AI at the edge, in the cloud, or locally. - - * Model Serving Enhancements: - - * Preview: OpenVINO Model Server (OVMS) now supports OpenAI-compatible API along with Continuous - Batching and PagedAttention, enabling significantly higher throughput for parallel - inferencing, especially on Intel® Xeon® processors, when serving LLMs to many concurrent - users. - * OpenVINO backend for Triton Server now supports dynamic input shapes. - * Integration of TorchServe through torch.compile OpenVINO backend for easy model deployment, - provisioning to multiple instances, model versioning, and maintenance. - - * Preview: addition of the :doc:`Generate API <../learn-openvino/llm_inference_guide/genai-guide>`, - a simplified API for text generation using large language models with only a few lines of - code. The API is available through the newly launched OpenVINO GenAI package. - * Support for Intel® Atom® Processor X Series. For more details, see :doc:`System Requirements <./release-notes-openvino/system-requirements>`. - * Preview: Support for Intel® Xeon® 6 processor. - - **OpenVINO™ Runtime** - - *Common* - - * Operations and data types using UINT2, UINT3, and UINT6 are now supported, to allow for a more - efficient LLM weight compression. - * Common OV headers have been optimized, improving binary compilation time and reducing binary - size. 
- - *AUTO Inference Mode* - - * AUTO takes model caching into account when choosing the device for fast first-inference latency. - If model cache is already in place, AUTO will directly use the selected device instead of - temporarily leveraging CPU as first-inference device. - * Dynamic models are now loaded to the selected device, instead of loading to CPU without - considering device priority. - * Fixed the exceptions when use AUTO with stateful models having dynamic input or output. - - *CPU Device Plugin* - - * Performance when using latency mode in FP32 precision has been improved on Intel client - platforms, including Intel® Core™ Ultra (formerly codenamed Meteor Lake) and 13th Gen Core - processors (formerly codenamed Raptor Lake). - * 2nd token latency and memory footprint for FP16 LLMs have been improved significantly on AVX2 - and AVX512 based CPU platforms, particularly for small batch sizes. - * PagedAttention has been optimized on AVX2, AVX512 and AMX platforms together with INT8 KV cache - support to improve the performance when serving LLM workloads on Intel CPUs. - * LLMs with shared embeddings have been optimized to improve performance and memory consumption - on several models including Gemma. - * Performance on ARM-based servers is significantly improved with upgrade to TBB 2021.2.5. - * Improved FP32 and FP16 performance on ARM CPU. - - *GPU Device Plugin* - - * Both first token and average token latency of LLMs is improved on all GPU platforms, most - significantly on discrete GPUs. Memory usage of LLMs has been reduced as well. - * Stable Diffusion FP16 performance improved on Intel® Core™ Ultra platforms, with significant - pipeline improvement for models with dynamic-shaped input. Memory usage of the pipeline - has been reduced, as well. - * Optimized permute_f_y kernel performance has been improved. - - *NPU Device Plugin* - - * A new set of configuration options is now available. - * Performance increase has been unlocked, with the new `2408 NPU driver `__. - - *OpenVINO Python API* - - * Writing custom Python operators is now supported for basic scenarios (alignment with OpenVINO - C++ API.) This empowers users to implement their own specialized operations into any model. - Full support with more advanced features is within the scope of upcoming releases. - - *OpenVINO C API* - - * More element types are now supported to algin with the OpenVINO C++ API. - - *OpenVINO Node.js API* - - * OpenVINO node.js packages now support the electron.js framework. - * Extended and improved JS API documentation for more complete usage guidelines. - * Better JS API alignment with OpenVINO C++ API, delivering more advanced features to JS users. - - *TensorFlow Framework Support* - - * 3 new operations are now supported. See operations marked as `NEW here `__. - * LookupTableImport has received better support, required for 2 models from TF Hub: - - * mil-nce - * openimages-v4-ssd-mobilenet-v2 - - *TensorFlow Lite Framework Support* - - * The GELU operation required for customer model is now supported. - - *PyTorch Framework Support* - - * 9 new operations are now supported. - * aten::set_item now supports negative indices. - * Issue with adaptive pool when shape is list has been fixed (PR `#24586 `__). 
- - *ONNX Support* - - * The InputModel interface should be used from now on, instead of a number of deprecated APIs - and class symbols - * Translation for ReduceMin-18 and ReduceSumSquare-18 operators has been added, to address - customer model requests - * Behavior of the Gelu-20 operator has been fixed for the case when “none” is set as the - default value. - - **OpenVINO Model Server** - - * OpenVINO Model server can be now used for text generation use cases using OpenAI compatible API. - * Added support for continuous batching and PagedAttention algorithms for text generation with - fast and efficient in high concurrency load especially on Intel® Xeon® processors. - `Learn more about it `__. - - **Neural Network Compression Framework** - - * GPTQ method is now supported in nncf.compress_weights() for data-aware 4-bit weight - compression of LLMs. Enabled by `gptq=True`` in nncf.compress_weights(). - * Scale Estimation algorithm for more accurate 4-bit compressed LLMs. Enabled by - `scale_estimation=True`` in nncf.compress_weights(). - * Added support for models with BF16 weights in nncf.compress_weights(). - * nncf.quantize() method is now the recommended path for quantization initialization of - PyTorch models in Quantization-Aware Training. See example for more details. - * compressed_model.nncf.get_config() and nncf.torch.load_from_config() API have been added to - save and restore quantized PyTorch models. See example for more details. - * Automatic support for int8 quantization of PyTorch models with custom modules has been added. - Now it is not needed to register such modules before quantization. - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - * Latest notebooks along with the GitHub validation status can be found in the - `OpenVINO notebook section `__ - * The following notebooks have been updated or newly added: - - * `Image to Video Generation with Stable Video Diffusion `__ - * `Image generation with Stable Cascade `__ - * `One Step Sketch to Image translation with pix2pix-turbo and OpenVINO `__ - * `Animating Open-domain Images with DynamiCrafter and OpenVINO `__ - * `Text-to-Video retrieval with S3D MIL-NCE and OpenVINO `__ - * `Convert and Optimize YOLOv10 with OpenVINO `__ - * `Visual-language assistant with nanoLLaVA and OpenVINO `__ - * `Person Counting System using YOLOV8 and OpenVINO™ `__ - * `Quantization-Sparsity Aware Training with NNCF, using PyTorch framework `__ - * `Create an LLM-powered Chatbot using OpenVINO `__ - - *Known Issues* - - | **Component: TBB** - | ID: TBB-1400/ TBB-1401 - | Description: - | In 2024.2, oneTBB 2021.2.x is used for Intel Distribution of OpenVINO Ubuntu and Red Hat - archives, instead of system TBB/oneTBB. This improves performance on the new generation of - Intel® Xeon® platforms but may increase latency of some models on the previous generation. - You can build OpenVINO with **-DSYSTEM_TBB=ON** to get better latency performance for - these models. - - | **Component: python API** - | ID: CVS-141744 - | Description: - | During post commit tests we found problem related with custom operations. Fix is ready and - will be delivered with 2024.3 release. - | - Initial problem: test_custom_op hanged on destruction because it was waiting for a - thread which tried to acquire GIL. - | - The second problem is that pybind11 doesn't allow to work with GIL besides of current - scope and it's impossible to release GIL for destructors. 
Blocking destructors and the - GIL pybind/pybind11#1446 - | - Current solution allows to release GIL for InferRequest and all called by chain destructors. - - | **Component: CPU runtime** - | *ID:* MFDNN-11428 - | *Description:* - | Due to adopting a new OneDNN library, improving performance for most use cases, - particularly for AVX2 BRGEMM kernels with the latency hint, the following regressions may - be noticed: - | a. latency regression on certain models, such as unet-camvid-onnx-0001 and mask_rcnn_resnet50_atrous_coco on MTL Windows latency mode - | b. performance regression on Intel client platforms if the throughput hint is used - | The issue is being investigated and planned to be resolved in the following releases. - - | **Component: Hardware Configuration** - | *ID:* N/A - | *Description:* - | Reduced performance for LLMs may be observed on newer CPUs. To mitigate, modify the default settings in BIOS to change the system into 2 NUMA node system: - | 1. Enter the BIOS configuration menu. - | 2. Select EDKII Menu -> Socket Configuration -> Uncore Configuration -> Uncore General Configuration -> SNC. - | 3. The SNC setting is set to *AUTO* by default. Change the SNC setting to *disabled* to configure one NUMA node per processor socket upon boot. - | 4. After system reboot, confirm the NUMA node setting using: `numatcl -H`. Expect to see only nodes 0 and 1 on a 2-socket system with the following mapping: - | Node - 0 - 1 - | 0 - 10 - 21 - | 1 - 21 - 10 - - - - - - - - - -.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -.. ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -.. dropdown:: 2024.1 - 24 April 2024 - :animate: fade-in-slide-down - :color: secondary - - **What's new** - - * More Gen AI coverage and framework integrations to minimize code changes. - - * Mixtral and URLNet models optimized for performance improvements on Intel® Xeon® processors. - * Stable Diffusion 1.5, ChatGLM3-6B, and Qwen-7B models optimized for improved inference speed - on Intel® Core™ Ultra processors with integrated GPU. - * Support for Falcon-7B-Instruct, a GenAI Large Language Model (LLM) ready-to-use chat/instruct - model with superior performance metrics. - * New Jupyter Notebooks added: YOLO V9, YOLO V8 Oriented Bounding Boxes Detection (OOB), Stable - Diffusion in Keras, MobileCLIP, RMBG-v1.4 Background Removal, Magika, TripoSR, AnimateAnyone, - LLaVA-Next, and RAG system with OpenVINO and LangChain. - - * Broader LLM model support and more model compression techniques. - - * LLM compilation time reduced through additional optimizations with compressed embedding. - Improved 1st token performance of LLMs on 4th and 5th generations of Intel® Xeon® processors - with Intel® Advanced Matrix Extensions (Intel® AMX). - * Better LLM compression and improved performance with oneDNN, INT4, and INT8 support for - Intel® Arc™ GPUs. - * Significant memory reduction for select smaller GenAI models on Intel® Core™ Ultra processors - with integrated GPU. - - * More portability and performance to run AI at the edge, in the cloud, or locally. - - * The preview NPU plugin for Intel® Core™ Ultra processors is now available in the OpenVINO - open-source GitHub repository, in addition to the main OpenVINO package on PyPI. - * The JavaScript API is now more easily accessible through the npm repository, enabling - JavaScript developers' seamless access to the OpenVINO API. 
- * FP16 inference on ARM processors now enabled for the Convolutional Neural Network (CNN) by - default. - - **OpenVINO™ Runtime** - - *Common* - - * Unicode file paths for cached models are now supported on Windows. - * Pad pre-processing API to extend input tensor on edges with constants. - * A fix for inference failures of certain image generation models has been implemented - (fused I/O port names after transformation). - * Compiler's warnings-as-errors option is now on, improving the coding criteria and quality. - Build warnings will not be allowed for new OpenVINO code and the existing warnings have been - fixed. - - *AUTO Inference Mode* - - * Returning the ov::enable_profiling value from ov::CompiledModel is now supported. - - *CPU Device Plugin* - - * 1st token performance of LLMs has been improved on the 4th and 5th generations of Intel® Xeon® - processors with Intel® Advanced Matrix Extensions (Intel® AMX). - * LLM compilation time and memory footprint have been improved through additional optimizations - with compressed embeddings. - * Performance of MoE (e.g. Mixtral), Gemma, and GPT-J has been improved further. - * Performance has been improved significantly for a wide set of models on ARM devices. - * FP16 inference precision is now the default for all types of models on ARM devices. - * CPU architecture-agnostic build has been implemented, to enable unified binary distribution - on different ARM devices. - - *GPU Device Plugin* - - * LLM first token latency has been improved on both integrated and discrete GPU platforms. - * For the ChatGLM3-6B model, average token latency has been improved on integrated GPU platforms. - * For Stable Diffusion 1.5 FP16 precision, performance has been improved on Intel® Core™ Ultra - processors. - - *NPU Device Plugin* - - * NPU Plugin is now part of the OpenVINO GitHub repository. All the most recent plugin changes - will be immediately available in the repo. Note that NPU is part of Intel® Core™ Ultra - processors. - * New OpenVINO™ notebook “Hello, NPU!” introducing NPU usage with OpenVINO has been added. - * Version 22H2 or later is required for Microsoft Windows® 11 64-bit to run inference on NPU. - - *OpenVINO Python API* - - * GIL-free creation of RemoteTensors is now used - holding GIL means that the process is not suited - for multithreading and removing the GIL lock will increase performance which is critical for - the concept of Remote Tensors. - * Packed data type BF16 on the Python API level has been added, opening a new way of supporting - data types not handled by numpy. - * 'pad' operator support for ov::preprocess::PrePostProcessorItem has been added. - * ov.PartialShape.dynamic(int) definition has been provided. - - *OpenVINO C API* - - * Two new pre-processing APIs for scale and mean have been added. - - *OpenVINO Node.js API* - - * New methods to align JavaScript API with CPP API have been added, such as - CompiledModel.exportModel(), core.import_model(), Core set/get property and Tensor.get_size(), - and Model.is_dynamic(). - * Documentation has been extended to help developers start integrating JavaScript applications - with OpenVINO™. - - *TensorFlow Framework Support* - - * `tf.keras.layers.TextVectorization tokenizer `__ - is now supported. - * Conversion of models with Variable and HashTable (dictionary) resources has been improved. - * 8 NEW operations have been added - (`see the list here, marked as NEW `__). - * 10 operations have received complex tensor support. 
- * Input tensor names for TF1 models have been adjusted to have a single name per input. - * Hugging Face model support coverage has increased significantly, due to: - - * extraction of input signature of a model in memory has been fixed, - * reading of variable values for a model in memory has been fixed. - - *PyTorch Framework Support* - - * ModuleExtension, a new type of extension for PyTorch models is now supported - (`PR #23536 `__). - * 22 NEW operations have been added. - * Experimental support for models produced by torch.export (FX graph) has been added - (`PR #23815 `__). - - *ONNX Framework Support* - - * 8 new operations have been added. - - **OpenVINO Model Server** - - * OpenVINO™ Runtime backend used is now 2024.1 - * OpenVINO™ models with String data type on output are supported. Now, OpenVINO™ Model Server - can support models with input and output of the String type, so developers can take advantage - of the tokenization built into the model as the first layer. Developers can also rely on any - postprocessing embedded into the model which returns text only. Check the - `demo on string input data with the universal-sentence-encoder model `__ - and the - `String output model demo `__. - * MediaPipe Python calculators have been updated to support relative paths for all related - configuration and Python code files. Now, the complete graph configuration folder can be - deployed in an arbitrary path without any code changes. - * KServe REST API support has been extended to properly handle the string format in JSON body, - just like the binary format compatible with NVIDIA Triton™. - * `A demo showcasing a full RAG algorithm `__ - fully delegated to the model server has been added. - - **Neural Network Compression Framework** - - * Model subgraphs can now be defined in the ignored scope for INT8 Post-training Quantization, - nncf.quantize(), which simplifies excluding accuracy-sensitive layers from quantization. - * A batch size of more than 1 is now partially supported for INT8 Post-training Quantization, - speeding up the process. Note that it is not recommended for transformer-based models as it - may impact accuracy. Here is an - `example demo `__. - * Now it is possible to apply fine-tuning on INT8 models after Post-training Quantization to - improve model accuracy and make it easier to move from post-training to training-aware - quantization. Here is an - `example demo `__. - - **OpenVINO Tokenizers** - - * TensorFlow support has been extended - TextVectorization layer translation: - - * Aligned existing ops with TF ops and added a translator for them. - * Added new ragged tensor ops and string ops. - - * A new tokenizer type, RWKV is now supported: - - * Added Trie tokenizer and Fuse op for ragged tensors. - * A new way to get OV Tokenizers: build a vocab from file. - - * Tokenizer caching has been redesigned to work with the OpenVINO™ model caching mechanism. - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - The default branch for the OpenVINO™ Notebooks repository has been changed from 'main' to - 'latest'. The 'main' branch of the notebooks repository is now deprecated and will be maintained - until September 30, 2024. - - The new branch, 'latest', offers a better user experience and simplifies maintenance due to - significant refactoring and an improved directory naming structure. - - Use the local - `README.md `__ - file and OpenVINO™ Notebooks at - `GitHub Pages `__ - to navigate through the content. 
- - The following notebooks have been updated or newly added: - - * `Grounded Segment Anything `__ - * `Visual Content Search with MobileCLIP `__ - * `YOLO V8 Oriented Bounding Box Detection Optimization `__ - * `Magika: AI-powered fast and efficient file type identification `__ - * `Keras Stable Diffusion `__ - * `RMBG background removal `__ - * `AnimateAnyone: pose guided image to video generation `__ - * `LLaVA-Next visual-language assistant `__ - * `TripoSR: single image 3d reconstruction `__ - * `RAG system with OpenVINO and LangChain `__ - - *Known Issues* - - | **Component: CPU Plugin** - | *ID:* N/A - | *Description:* - | Default CPU pinning policy on Windows has been changed to follow Windows' policy - instead of controlling the CPU pinning in the OpenVINO plugin. This brings certain dynamic or - performance variance on Windows. Developers can use ov::hint::enable_cpu_pinning to enable - or disable CPU pinning explicitly. - - | **Component: Hardware Configuration** - | *ID:* N/A - | *Description:* - | Reduced performance for LLMs may be observed on newer CPUs. To mitigate, modify the default settings in BIOS to - | change the system into 2 NUMA node system: - | 1. Enter the BIOS configuration menu. - | 2. Select EDKII Menu -> Socket Configuration -> Uncore Configuration -> Uncore General Configuration -> SNC. - | 3. The SNC setting is set to *AUTO* by default. Change the SNC setting to *disabled* to configure one NUMA node per processor socket upon boot. - | 4. After system reboot, confirm the NUMA node setting using: `numatcl -H`. Expect to see only nodes 0 and 1 on a - | 2-socket system with the following mapping: - | Node - 0 - 1 - | 0 - 10 - 21 - | 1 - 21 - 10 - - - - - - - - - - -.. dropdown:: 2024.0 - 06 March 2024 - :animate: fade-in-slide-down - :color: secondary - - **What's new** - - * More Generative AI coverage and framework integrations to minimize code changes. - - * Improved out-of-the-box experience for TensorFlow sentence encoding models through the - installation of OpenVINO™ toolkit Tokenizers. - * New and noteworthy models validated: - Mistral, StableLM-tuned-alpha-3b, and StableLM-Epoch-3B. - * OpenVINO™ toolkit now supports Mixture of Experts (MoE), a new architecture that helps - process more efficient generative models through the pipeline. - * JavaScript developers now have seamless access to OpenVINO API. This new binding enables a - smooth integration with JavaScript API. - - * Broader Large Language Model (LLM) support and more model compression techniques. - - * Broader Large Language Model (LLM) support and more model compression techniques. - * Improved quality on INT4 weight compression for LLMs by adding the popular technique, - Activation-aware Weight Quantization, to the Neural Network Compression Framework (NNCF). - This addition reduces memory requirements and helps speed up token generation. - * Experience enhanced LLM performance on Intel® CPUs, with internal memory state enhancement, - and INT8 precision for KV-cache. Specifically tailored for multi-query LLMs like ChatGLM. - * The OpenVINO™ 2024.0 release makes it easier for developers, by integrating more OpenVINO™ - features with the Hugging Face ecosystem. Store quantization configurations for popular - models directly in Hugging Face to compress models into INT4 format while preserving - accuracy and performance. - - * More portability and performance to run AI at the edge, in the cloud, or locally. 
- - * A preview plugin architecture of the integrated Neural Processor Unit (NPU) as part of - Intel® Core™ Ultra processor (formerly codenamed Meteor Lake) is now included in the - main OpenVINO™ package on PyPI. - * Improved performance on ARM by enabling the ARM threading library. In addition, we now - support multi-core ARM processors and enabled FP16 precision by default on MacOS. - * New and improved LLM serving samples from OpenVINO Model Server for multi-batch inputs and - Retrieval Augmented Generation (RAG). - - **OpenVINO™ Runtime** - - *Common* - - * The legacy API for CPP and Python bindings has been removed. - * StringTensor support has been extended by operators such as ``Gather``, ``Reshape``, and - ``Concat``, as a foundation to improve support for tokenizer operators and compliance with - the TensorFlow Hub. - * oneDNN has been updated to v3.3. - (`see oneDNN release notes `__). - - *CPU Device Plugin* - - * LLM performance on Intel® CPU platforms has been improved for systems based on AVX2 and - AVX512, using dynamic quantization and internal memory state optimization, such as INT8 - precision for KV-cache. 13th and 14th generations of Intel® Core™ processors and Intel® Core™ - Ultra processors use AVX2 for CPU execution, and these platforms will benefit from speedup. - Enable these features by setting ``"DYNAMIC_QUANTIZATION_GROUP_SIZE":"32"`` and - ``"KV_CACHE_PRECISION":"u8"`` in the configuration file. - * The ``ov::affinity`` API configuration is now deprecated and will be removed in release - 2025.0. - * The following have been improved and optimized: - - * Multi-query structure LLMs (such as ChatGLM 2/3) for BF16 on the 4th and 5th generation - Intel® Xeon® Scalable processors. - * `Mixtral `__ model performance. - * 8-bit compressed LLM compilation time and memory usage, valuable for models with large - embeddings like `Qwen `__. - * Convolutional networks in FP16 precision on ARM processors. - - *GPU Device Plugin* - - * The following have been improved and optimized: - - * Average token latency for LLMs on integrated GPU (iGPU) platforms, using INT4-compressed - models with large context size on Intel® Core™ Ultra processors. - * LLM beam search performance on iGPU. Both average and first-token latency decrease may be - expected for larger context sizes. - * Multi-batch performance of YOLOv5 on iGPU platforms. - - * Memory usage for LLMs has been optimized, enabling '7B' models with larger context on - 16Gb platforms. - - *NPU Device Plugin (preview feature)* - - * The NPU plugin for OpenVINO™ is now available through PyPI (run “pip install openvino”). - - *OpenVINO Python API* - - * ``.add_extension`` method signatures have been aligned, improving API behavior for better - user experience. - - *OpenVINO C API* - - * ov_property_key_cache_mode (C++ ov::cache_mode) now enables the ``optimize_size`` and - ``optimize_speed`` modes to set/get model cache. - * The VA surface on Windows exception has been fixed. - - *OpenVINO Node.js API* - - * OpenVINO - `JS bindings `__ - are consistent with the OpenVINO C++ API. - * A new distribution channel is now available: Node Package Manager (npm) software registry - (:doc:`check the installation guide <../get-started/install-openvino/install-openvino-npm>`). - * JavaScript API is now available for Windows users, as some limitations for platforms other - than Linux have been removed. 
- - *TensorFlow Framework Support* - - * String tensors are now natively supported, handled on input, output, and intermediate layers - (`PR #22024 `__). - - * TensorFlow Hub universal-sentence-encoder-multilingual inferred out of the box - * string tensors supported for ``Gather``, ``Concat``, and ``Reshape`` operations - * integration with openvino-tokenizers module - importing openvino-tokenizers automatically - patches TensorFlow FE with the required translators for models with tokenization - - * Fallback for Model Optimizer by operation to the legacy Frontend is no longer available. - Fallback by .json config will remain until Model Optimizer is discontinued - (`PR #21523 `__). - * Support for the following has been added: - - * Mutable variables and resources such as HashTable*, Variable, VariableV2 - (`PR #22270 `__). - * New tensor types: tf.u16, tf.u32, and tf.u64 - (`PR #21864 `__). - * 14 NEW Ops*. - `Check the list here (marked as NEW) `__. - * TensorFlow 2.15 - (`PR #22180 `__). - - * The following issues have been fixed: - - * UpSampling2D conversion crashed when input type as int16 - (`PR #20838 `__). - * IndexError list index for Squeeze - (`PR #22326 `__). - * Correct FloorDiv computation for signed integers - (`PR #22684 `__). - * Fixed bad cast error for tf.TensorShape to ov.PartialShape - (`PR #22813 `__). - * Fixed reading tf.string attributes for models in memory - (`PR #22752 `__). - - *ONNX Framework Support* - - * ONNX Frontend now uses the OpenVINO API 2.0. - - *PyTorch Framework Support* - - * Names for outputs unpacked from dict or tuple are now clearer - (`PR #22821 `__). - * FX Graph (torch.compile) now supports kwarg inputs, improving data type coverage. - (`PR #22397 `__). - - **OpenVINO Model Server** - - * OpenVINO™ Runtime backend used is now 2024.0. - * Text generation demo now supports multi batch size, with streaming and unary clients. - * The REST client now supports servables based on mediapipe graphs, including python pipeline - nodes. - * Included dependencies have received security-related updates. - * Reshaping a model in runtime based on the incoming requests (auto shape and auto batch size) - is deprecated and will be removed in the future. Using OpenVINO's dynamic shape models is - recommended instead. - - **Neural Network Compression Framework (NNCF)** - - * The `Activation-aware Weight Quantization (AWQ) `__ - algorithm for data-aware 4-bit weights compression is now available. It facilitates better - accuracy for compressed LLMs with high ratio of 4-bit weights. To enable it, use the - dedicated ``awq`` optional parameter of ``the nncf.compress_weights()`` API. - * ONNX models are now supported in Post-training Quantization with Accuracy Control, through - the ``nncf.quantize_with_accuracy_control()``, method. It may be used for models in the - OpenVINO IR and ONNX formats. - * A `weight compression example tutorial `__ - is now available, demonstrating how to find the appropriate hyperparameters for the TinyLLama - model from the Hugging Face Transformers, as well as other LLMs, with some modifications. - - **OpenVINO Tokenizer** - - * Regex support has been improved. - * Model coverage has been improved. - * Tokenizer metadata has been added to rt_info. - * Limited support for Tensorflow Text models has been added: convert MUSE for TF Hub with - string inputs. 
- * OpenVINO Tokenizers have their own repository now: - `/openvino_tokenizers `__ - - **Other Changes and Known Issues** - - *Jupyter Notebooks* - - The following notebooks have been updated or newly added: - - * `Mobile language assistant with MobileVLM `__ - * `Depth estimation with DepthAnything `__ - * `Kosmos-2 `__ - * `Zero-shot Image Classification with SigLIP `__ - * `Personalized image generation with PhotoMaker `__ - * `Voice tone cloning with OpenVoice `__ - * `Line-level text detection with Surya `__ - * `InstantID: Zero-shot Identity-Preserving Generation using OpenVINO `__ - * `Tutorial for Big Image Transfer (BIT) model quantization using NNCF `__ - * `Tutorial for OpenVINO Tokenizers integration into inference pipelines `__ - * `LLM chatbot `__ and - `LLM RAG pipeline `__ - have received integration with new models: minicpm-2b-dpo, gemma-7b-it, qwen1.5-7b-chat, baichuan2-7b-chat - - *Known issues* - - | **Component: CPU Plugin** - | *ID:* N/A - | *Description:* - | Starting with 24.0, model inputs and outputs will no longer have tensor names, unless - explicitly set to align with the PyTorch framework behavior. - - | **Component: GPU runtime** - | *ID:* 132376 - | *Description:* - | First-inference latency slow down for LLMs on Intel® Core™ Ultra processors. Up to 10-20% - drop may occur due to radical memory optimization for processing long sequences - (about 1.5-2 GB reduced memory usage). - - | **Component: CPU runtime** - | *ID:* N/A - | *Description:* - | Performance results (first token latency) may vary from those offered by the previous - OpenVINO version, for “latency” hint inference of LLMs with long prompts on Intel® Xeon® - platforms with 2 or more sockets. The reason is that all CPU cores of just the single - socket running the application are employed, lowering the memory overhead for LLMs when - numa control is not used. - | *Workaround:* - | The behavior is expected but stream and thread configuration may be used to include cores - from all sockets. - @@ -1666,12 +117,15 @@ Previous 2024 releases Deprecation And Support +++++++++++++++++++++++++++++ + Using deprecated features and components is not advised. They are available to enable a smooth transition to new solutions and will be discontinued in the future. To keep using discontinued features, you will have to revert to the last LTS OpenVINO version supporting them. For more details, refer to the `OpenVINO Legacy Features and Components __` page. + + Discontinued in 2024 ----------------------------- @@ -1730,97 +184,18 @@ Deprecated and to be removed in the future `model conversion transition guide `__. * OpenVINO property Affinity API will be discontinued with OpenVINO 2025.0. It will be replaced with CPU binding configurations (``ov::hint::enable_cpu_pinning``). -* OpenVINO Model Server components: - - * “auto shape” and “auto batch size” (reshaping a model in runtime) will be removed in the - future. OpenVINO's dynamic shape models are recommended instead. - -* Starting with 2025.0 MacOS x86 will no longer be recommended for use due to the discontinuation - of validation. Full support will be removed later in 2025. -* A number of notebooks have been deprecated. For an up-to-date listing of available notebooks, - refer to the `OpenVINO™ Notebook index (openvinotoolkit.github.io) `__. - .. 
dropdown:: See the deprecated notebook list - :animate: fade-in-slide-down - :color: muted - * `Handwritten OCR with OpenVINO™ `__ - * See alternative: `Optical Character Recognition (OCR) with OpenVINO™ `__, - * See alternative: `PaddleOCR with OpenVINO™ `__, - * See alternative: `Handwritten Text Recognition Demo `__ - * `Image In-painting with OpenVINO™ `__ - - * See alternative: `Image Inpainting Python Demo `__ - - * `Interactive Machine Translation with OpenVINO `__ - - * See alternative: `Machine Translation Python* Demo `__ - - * `Super Resolution with OpenVINO™ `__ - - * See alternative: `Super Resolution with PaddleGAN and OpenVINO `__ - * See alternative: `Image Processing C++ Demo `__ - - * `Image Colorization with OpenVINO Tutorial `__ - * `Interactive Question Answering with OpenVINO™ `__ - - * See alternative: `BERT Question Answering Embedding Python* Demo `__ - * See alternative: `BERT Question Answering Python* Demo `__ - - * `Vehicle Detection And Recognition with OpenVINO™ `__ - - * See alternative: `Security Barrier Camera C++ Demo `__ - - * `The attention center model with OpenVINO™ `_ - * `Image Generation with DeciDiffusion `_ - * `Image generation with DeepFloyd IF and OpenVINO™ `_ - * `Depth estimation using VI-depth with OpenVINO™ `_ - * `Instruction following using Databricks Dolly 2.0 and OpenVINO™ `_ - - * See alternative: `LLM Instruction-following pipeline with OpenVINO `__ - - * `Image generation with FastComposer and OpenVINO™ `__ - * `Video Subtitle Generation with OpenAI Whisper `__ - - * See alternative: `Automatic speech recognition using Distil-Whisper and OpenVINO `__ - - * `Introduction to Performance Tricks in OpenVINO™ `__ - * `Speaker Diarization with OpenVINO™ `__ - * `Subject-driven image generation and editing using BLIP Diffusion and OpenVINO `__ - * `Text Prediction with OpenVINO™ `__ - * `Training to Deployment with TensorFlow and OpenVINO™ `__ - * `Speech to Text with OpenVINO™ `__ - * `Convert and Optimize YOLOv7 with OpenVINO™ `__ - * `Quantize Data2Vec Speech Recognition Model using NNCF PTQ API `__ - - * See alternative: `Quantize Speech Recognition Models with accuracy control using NNCF PTQ API `__ - - * `Semantic segmentation with LRASPP MobileNet v3 and OpenVINO `__ - * `Video Recognition using SlowFast and OpenVINO™ `__ - - * See alternative: `Live Action Recognition with OpenVINO™ `__ - - * `Semantic Segmentation with OpenVINO™ using Segmenter `__ - * `Programming Language Classification with OpenVINO `__ - * `Stable Diffusion Text-to-Image Demo `__ - - * See alternative: `Stable Diffusion v2.1 using Optimum-Intel OpenVINO and multiple Intel Hardware `__ - - * `Text-to-Image Generation with Stable Diffusion v2 and OpenVINO™ `__ - - * See alternative: `Stable Diffusion v2.1 using Optimum-Intel OpenVINO and multiple Intel Hardware `__ + * “auto shape” and “auto batch size” (reshaping a model in runtime) will be removed in the + future. OpenVINO's dynamic shape models are recommended instead. - * `Image generation with Segmind Stable Diffusion 1B (SSD-1B) model and OpenVINO `__ - * `Data Preparation for 2D Medical Imaging `__ - * `Train a Kidney Segmentation Model with MONAI and PyTorch Lightning `__ - * `Live Inference and Benchmark CT-scan Data with OpenVINO™ `__ +* Starting with 2025.0 MacOS x86 is no longer recommended for use due to the discontinuation + of validation. Full support will be removed later in 2025. 
- * See alternative: `Quantize a Segmentation Model and Show Live Inference `__ - * `Live Style Transfer with OpenVINO™ `__ @@ -1855,7 +230,7 @@ of Intel Corporation in the U.S. and/or other countries. Other names and brands may be claimed as the property of others. -Copyright © 2024, Intel Corporation. All rights reserved. +Copyright © 2025, Intel Corporation. All rights reserved. For more complete information about compiler optimizations, see our Optimization Notice. diff --git a/docs/articles_en/about-openvino/release-notes-openvino/system-requirements.rst b/docs/articles_en/about-openvino/release-notes-openvino/system-requirements.rst index 79a9f63821c16f..9f2a4e691bae4f 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino/system-requirements.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino/system-requirements.rst @@ -135,9 +135,9 @@ Operating systems and developer environment Build environment components: - * `Microsoft Visual Studio 2019 or later `__ + * `Microsoft Visual Studio 2019 or later `__ * `CMake `__ 3.16 or higher - * `Python `__ 3.9-3.12 + * `Python `__ 3.9-3.12 * `Intel® HD Graphics Driver `__ required for inference on GPU @@ -149,7 +149,7 @@ Operating systems and developer environment * `Xcode `__ 10.3 * `CMake `__ 3.13 or higher - * `Python `__ 3.9-3.12 + * `Python `__ 3.9-3.12 .. tab-item:: DL framework versions: diff --git a/docs/articles_en/assets/images/openvino-overview-diagram.jpg b/docs/articles_en/assets/images/openvino-overview-diagram.jpg index bfd3c6533446f3..982b65f5eab254 100644 --- a/docs/articles_en/assets/images/openvino-overview-diagram.jpg +++ b/docs/articles_en/assets/images/openvino-overview-diagram.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:739d604dc4b8bae082e9c70e24328bcf9c30fa3fe5b1f884b9bd129509302b4e -size 1465073 +oid sha256:bf3adb1b6fafa18ecf6c5cf2944e687695953605de7e7e4e4315d108fbfb608e +size 124217 diff --git a/docs/articles_en/documentation.rst b/docs/articles_en/documentation.rst index c1dd34f5373429..8222a870c91a3b 100644 --- a/docs/articles_en/documentation.rst +++ b/docs/articles_en/documentation.rst @@ -16,6 +16,7 @@ Documentation Tool Ecosystem OpenVINO Extensibility OpenVINO™ Security + Legacy Features This section provides reference documents that guide you through the OpenVINO toolkit workflow, from preparing models, optimizing them, to deploying them in your own deep learning applications. diff --git a/docs/articles_en/documentation/legacy-features.rst b/docs/articles_en/documentation/legacy-features.rst new file mode 100644 index 00000000000000..0b09b23c081134 --- /dev/null +++ b/docs/articles_en/documentation/legacy-features.rst @@ -0,0 +1,112 @@ +Legacy Features and Components +============================== + +.. meta:: + :description: A list of deprecated OpenVINO™ components. + +Since OpenVINO has grown very rapidly in recent years, a number of its features +and components have been replaced by other solutions. Some of them are still +supported to assure OpenVINO users are given enough time to adjust their projects, +before the features are fully discontinued. + +This section will give you an overview of these major changes and tell you how +you can proceed to get the best experience and results with the current OpenVINO +offering. + + +Discontinued: +############# + +.. 
dropdown:: OpenVINO Development Tools Package
+
+   | *New solution:* OpenVINO Runtime includes all supported components
+   | *Old solution:* `See how to install Development Tools `__
+   |
+   | OpenVINO Development Tools used to be the OpenVINO package with tools for
+     advanced operations on models, such as Model conversion API, Benchmark Tool,
+     Accuracy Checker, Annotation Converter, Post-Training Optimization Tool,
+     and Open Model Zoo tools. Most of these tools have been either removed,
+     replaced by other solutions, or moved to the OpenVINO Runtime package.
+
+.. dropdown:: Model Optimizer / Conversion API
+
+   | *New solution:* :doc:`Direct model support and OpenVINO Converter (OVC) <../openvino-workflow/model-preparation>`
+   | *Old solution:* `Legacy Conversion API `__
+   |
+   | The role of Model Optimizer and later the Conversion API was largely reduced
+     when all major model frameworks became supported directly. For converting model
+     files explicitly, it has been replaced with a more lightweight and efficient
+     solution, the OpenVINO Converter (launched with OpenVINO 2023.1).
+
+.. dropdown:: Open Model Zoo
+
+   | *New solution:* users are encouraged to use public model repositories such as `Hugging Face `__
+   | *Old solution:* `Open Model Zoo `__
+   |
+   | Open Model Zoo provided a collection of models prepared for use with OpenVINO,
+     and a small set of tools enabling a level of automation for the process.
+     Since the tools have been mostly replaced by other solutions and several
+     other model repositories have recently grown in size and popularity,
+     Open Model Zoo will no longer be maintained. You may still use its resources
+     until they are fully removed. `Check the OMZ GitHub project `__
+
+.. dropdown:: Multi-Device Execution
+
+   | *New solution:* :doc:`Automatic Device Selection <../openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection>`
+   | *Old solution:* `Check the legacy solution `__
+   |
+   | The behavior and results of the Multi-Device Execution mode are covered by the ``CUMULATIVE_THROUGHPUT``
+     option of the Automatic Device Selection. The only difference is that ``CUMULATIVE_THROUGHPUT`` uses
+     the devices specified by AUTO, which means that adding devices manually is not mandatory,
+     while with MULTI, the devices had to be specified before the inference.
+
+.. dropdown:: Apache MXNet, Caffe, and Kaldi model formats
+
+   | *New solution:* conversion to ONNX via external tools
+   | *Old solution:* model support discontinued with OpenVINO 2024.0
+   | `The last version supporting Apache MXNet, Caffe, and Kaldi model formats `__
+   | :doc:`See the currently supported frameworks <../openvino-workflow/model-preparation>`
+
+.. dropdown:: Post-training Optimization Tool (POT)
+
+   | *New solution:* Neural Network Compression Framework (NNCF) now offers the same functionality
+   | *Old solution:* POT discontinued with OpenVINO 2024.0
+   | :doc:`See how to use NNCF for model optimization <../openvino-workflow/model-optimization>`
+   | `Check the NNCF GitHub project, including documentation `__
+
+.. dropdown:: Inference API 1.0
+
+   | *New solution:* API 2.0 launched in OpenVINO 2022.1
+   | *Old solution:* discontinued with OpenVINO 2024.0
+   | `2023.2 is the last version supporting API 1.0 `__
+
+.. dropdown:: Compile tool
+
+   | *New solution:* the tool is no longer needed
+   | *Old solution:* discontinued with OpenVINO 2023.0
+   | If you need to compile a model for inference on a specific device, use the following script:
+
+   .. tab-set::
+
+      .. tab-item:: Python
+         :sync: py
+
+         .. doxygensnippet:: docs/articles_en/assets/snippets/export_compiled_model.py
+            :language: python
+            :fragment: [export_compiled_model]
+
+      .. tab-item:: C++
+         :sync: cpp
+
+         .. doxygensnippet:: docs/articles_en/assets/snippets/export_compiled_model.cpp
+            :language: cpp
+            :fragment: [export_compiled_model]
+
+.. dropdown:: TensorFlow integration (OVTF)
+
+   | *New solution:* Direct model support and OpenVINO Converter (OVC)
+   | *Old solution:* discontinued in OpenVINO 2023.0
+   |
+   | OpenVINO now features native TensorFlow support, with no need for explicit model
+     conversion.
+
diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst b/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst
index 475f623ef86598..f8cf4badf932fc 100644
--- a/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst
+++ b/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst
@@ -109,5 +109,5 @@ Additional Resources
 - `BitBake Tool `__
 - `Poky `__
 - `Meta-intel `__
-- `Meta-openembedded `__
+- `Meta-openembedded `__
 - `Meta-clang `__
\ No newline at end of file
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide.rst
index 372c3b6d652bfc..8401923b8c7ac6 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide.rst
@@ -55,7 +55,10 @@ options:
   as well as conversion on the fly. For integration with the final product it may offer
   lower performance, though.
 
-
+Note that the base version of OpenVINO may also be used to run generative AI. Although it may
+offer a simpler environment, with fewer dependencies, it has significant limitations and a more
+demanding implementation process. For reference, see
+`the article on generative AI usage of OpenVINO 2024.6 `__.
 
 The advantages of using OpenVINO for generative model deployment:
 
diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst
index 60253779b0f3dc..8fb6ad27c4232f 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst
@@ -44,7 +44,7 @@ You select one of the methods by setting the ``--group-size`` parameter to eithe
 .. code-block:: console
    :name: group-quant
 
-    optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 --weight-format int4 --sym --ratio 1.0 --group_size 128 TinyLlama-1.1B-Chat-v1.0
+    optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-Chat-v1.0 --weight-format int4 --sym --ratio 1.0 --group-size 128 TinyLlama-1.1B-Chat-v1.0
 
 .. tab-item:: Channel-wise quantization
 
@@ -63,12 +63,12 @@ You select one of the methods by setting the ``--group-size`` parameter to eithe
 
    If you want to improve accuracy, make sure you:
 
    1. Update NNCF: ``pip install nncf==2.13``
-    2. Use ``--scale_estimation --dataset=`` and accuracy aware quantization ``--awq``:
+    2. Use ``--scale_estimation --dataset `` and accuracy aware quantization ``--awq``:
 
      .. 
code-block:: console :name: channel-wise-data-aware-quant - optimum-cli export openvino -m meta-llama/Llama-2-7b-chat-hf --weight-format int4 --sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset=wikitext2 Llama-2-7b-chat-hf + optimum-cli export openvino -m meta-llama/Llama-2-7b-chat-hf --weight-format int4 --sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2 Llama-2-7b-chat-hf .. important:: diff --git a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst index 5a706061777594..cde0eef055d5cb 100644 --- a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst +++ b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst @@ -8,7 +8,10 @@ Benchmark Tool devices. -This page demonstrates how to use the Benchmark Tool to estimate deep learning inference performance on supported devices. +This page demonstrates how to use the Benchmark Tool to estimate deep learning inference +performance on supported devices. Note that the MULTI plugin mentioned here is considered +a legacy tool and currently is just a mapping of the +:doc:`AUTO plugin <../../openvino-workflow/running-inference/inference-devices-and-modes/auto-device-selection>`. .. note:: diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst index 046dde9661c3bb..4b752b74187768 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/weight-compression.rst @@ -354,7 +354,7 @@ To find the optimal weight compression parameters for a particular model, refer `example `__ , where weight compression parameters are being searched from the subset of values. To speed up the search, a self-designed validation pipeline called -`WhoWhatBench `__ +`WhoWhatBench `__ is used. The pipeline can quickly evaluate the changes in the accuracy of the optimized model compared to the baseline. @@ -491,7 +491,7 @@ Additional Resources - `OpenVINO GenAI Repo `__ : Repository containing example pipelines that implement image and text generation tasks. It also provides a tool to benchmark LLMs. -- `WhoWhatBench `__ +- `WhoWhatBench `__ - `NNCF GitHub `__ - :doc:`Post-training Quantization ` - :doc:`Training-time Optimization ` diff --git a/docs/articles_en/openvino-workflow/model-preparation/convert-model-keras.rst b/docs/articles_en/openvino-workflow/model-preparation/convert-model-keras.rst new file mode 100644 index 00000000000000..a3e456e4495354 --- /dev/null +++ b/docs/articles_en/openvino-workflow/model-preparation/convert-model-keras.rst @@ -0,0 +1,87 @@ +Converting a Keras Model +======================== + + +.. meta:: + :description: Learn how to convert a model from the + Keras format to the OpenVINO Model. + + +This document explains the process of converting Keras 3 models to the OpenVINO Intermediate Representation (IR) format. +For instructions on converting Keras 2 models, refer to :doc:`TensorFlow Model Conversion `. + +To convert a Keras 3 model, first export it to a lightweight TensorFlow SavedModel artifact, +and then convert it to an OpenVINO model, using the ``convert_model`` function. +Here is a code example of how to do this: + +.. 
code-block:: py + :force: + + import keras_hub + import openvino as ov + + model = keras_hub.models.BertTextClassifier.from_preset( + "bert_base_en_uncased", + num_classes=4, + preprocessor=None, + ) + + # export to SavedModel + model.export("bert_base") + + # convert to OpenVINO model + ov_model = ov.convert_model("bert_base") + + +.. note:: + + The resulting OpenVINO IR model can be saved to drive with no additional, Keras-specific steps. + Use the standard ``ov.save_model(ov_model,'model.xml')`` command. + +Alternatively, a model exported to TensorFlow SavedModel format can also be converted to OpenVINO IR using the ``ovc`` tool. Here is an example: + +.. code-block:: sh + :force: + + ovc bert_base + + +Run inference in Keras 3 with the OpenVINO backend +################################################## + +Starting with release 3.8, Keras provides native integration with the OpenVINO backend for accelerated inference. +This integration enables you to leverage OpenVINO performance optimizations directly within the Keras workflow, enabling faster inference on OpenVINO supported hardware. + +To switch to the OpenVINO backend in Keras 3, set the ``KERAS_BACKEND`` environment variable to ``"openvino"`` +or specify the backend in the local configuration file at ``~/.keras/keras.json``. +Here is an example of how to infer a model (trained with PyTorch, JAX, or TensorFlow backends) in Keras 3, using the OpenVINO backend: + +.. code-block:: py + :force: + + import os + + os.environ["KERAS_BACKEND"] = "openvino" + import numpy as np + import keras + import keras_hub + + features = { + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), + } + + # take a model from KerasHub + bert = keras_hub.models.BertTextClassifier.from_preset( + "bert_base_en_uncased", + num_classes=4, + preprocessor=None, + ) + + predictions = bert.predict(features) + +.. note:: + + The OpenVINO backend may currently lack support for some operations. + This will be addressed in upcoming Keras releases as operation coverage is being expanded. diff --git a/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst b/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst index dd2fc35c56e92b..f36a235cf79f77 100644 --- a/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst +++ b/docs/articles_en/openvino-workflow/model-preparation/convert-model-to-ir.rst @@ -14,6 +14,7 @@ Convert to OpenVINO IR Convert from TensorFlow Lite Convert from PaddlePaddle Convert from JAX/Flax + Convert from Keras diff --git a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst index b020797c4b2e60..39658fcefca109 100644 --- a/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst +++ b/docs/articles_en/openvino-workflow/running-inference/changing-input-shape.rst @@ -206,7 +206,7 @@ Additional Resources #################### * :doc:`Extensibility documentation <../../documentation/openvino-extensibility>` - describes a special mechanism in OpenVINO that allows adding support of shape inference for custom operations. -* `ov::Model::reshape `__ - in OpenVINO Runtime C++ API +* `ov::Model::reshape `__ - in OpenVINO Runtime C++ API * `Model.reshape `__ - in OpenVINO Runtime Python API. 
* :doc:`Dynamic Shapes ` * :doc:`OpenVINO samples <../../learn-openvino/openvino-samples>` diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.rst index bbc23297282afe..e641192ae9bd0e 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/hetero-execution.rst @@ -16,8 +16,8 @@ Its purpose is to: Execution via the heterogeneous mode can be divided into two independent steps: -1. Setting hardware affinity to operations (`ov::Core::query_model `__ is used internally by the Hetero device). -2. Compiling a model to the Heterogeneous device assumes splitting the model to parts, compiling them on the specified devices (via `ov::device::priorities `__), and executing them in the Heterogeneous mode. The model is split to subgraphs in accordance with the affinities, where a set of connected operations with the same affinity is to be a dedicated subgraph. Each subgraph is compiled on a dedicated device and multiple `ov::CompiledModel `__ objects are made, which are connected via automatically allocated intermediate tensors. +1. Setting hardware affinity to operations (`ov::Core::query_model `__ is used internally by the Hetero device). +2. Compiling a model to the Heterogeneous device assumes splitting the model to parts, compiling them on the specified devices (via `ov::device::priorities `__), and executing them in the Heterogeneous mode. The model is split to subgraphs in accordance with the affinities, where a set of connected operations with the same affinity is to be a dedicated subgraph. Each subgraph is compiled on a dedicated device and multiple `ov::CompiledModel `__ objects are made, which are connected via automatically allocated intermediate tensors. If you set pipeline parallelism (via ``ov::hint::model_distribution_policy``), the model is split into multiple stages, and each stage is assigned to a different device. The output of one stage is fed as input to the next stage. @@ -51,7 +51,7 @@ Manual and Automatic Modes for Assigning Affinities The Manual Mode +++++++++++++++++++++ -It assumes setting affinities explicitly for all operations in the model using `ov::Node::get_rt_info `__ with the ``"affinity"`` key. +It assumes setting affinities explicitly for all operations in the model using `ov::Node::get_rt_info `__ with the ``"affinity"`` key. If you assign specific operation to a specific device, make sure that the device actually supports the operation. Randomly selecting operations and setting affinities may lead to decrease in model accuracy. To avoid that, try to set the related operations or subgraphs of this operation to the same affinity, such as the constant operation that will be folded into this operation. @@ -156,14 +156,14 @@ In some cases you may need to consider manually adjusting affinities which were Importantly, the automatic mode will not work if any operation in a model has its ``"affinity"`` already initialized. -.. note: +.. note:: - `ov::Core::query_model `__ does not depend on affinities set by a user. Instead, it queries for an operation support based on device capabilities. + `ov::Core::query_model `__ does not depend on affinities set by a user. Instead, it queries for an operation support based on device capabilities. 
Configure fallback devices ########################## -If you want different devices in Hetero execution to have different device-specific configuration options, you can use the special helper property `ov::device::properties `__: +If you want different devices in Hetero execution to have different device-specific configuration options, you can use the special helper property `ov::device::properties `__: .. tab-set:: diff --git a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst b/docs/notebooks/latent-consistency-models-image-generation-with-output.rst index 523afca76dd660..37dd96826cd038 100644 --- a/docs/notebooks/latent-consistency-models-image-generation-with-output.rst +++ b/docs/notebooks/latent-consistency-models-image-generation-with-output.rst @@ -694,7 +694,7 @@ generative models as it already includes all the core functionality. ``openvino_genai.Text2ImagePipeline`` class supports inference of `Diffusers -models `__. +models `__. For pipeline initialization, we should provide directory with converted by Optimum Intel pipeline and specify inference device. Optionally, we can provide configuration for LoRA Adapters using ``adapter_config``. diff --git a/docs/notebooks/multilora-image-generation-with-output.rst b/docs/notebooks/multilora-image-generation-with-output.rst index 7b6f4bc381ff27..2efe1aaab50908 100644 --- a/docs/notebooks/multilora-image-generation-with-output.rst +++ b/docs/notebooks/multilora-image-generation-with-output.rst @@ -210,7 +210,7 @@ generative models as it already includes all the core functionality. ``openvino_genai.Text2ImagePipeline`` class supports inference of `Diffusers -models `__. +models `__. For pipeline initialization, we should provide directory with converted by Optimum Intel pipeline and specify inference device. Optionally, we can provide configuration for LoRA Adapters using ``adapter_config``. 
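Both notebook pages above initialize ``openvino_genai.Text2ImagePipeline`` from a directory produced by Optimum Intel but stop short of a complete call sequence. The following is a minimal, hedged sketch of that flow; the model directory name, target device, and generation parameters are illustrative placeholders, and the exact keyword arguments accepted by ``generate`` may differ between OpenVINO GenAI releases.

.. code-block:: py

   import openvino_genai as ov_genai
   from PIL import Image

   # Directory previously produced by `optimum-cli export openvino ...` (placeholder name).
   pipe = ov_genai.Text2ImagePipeline("stable-diffusion-ov", "CPU")

   # Generation parameters are illustrative; consult the installed GenAI version for the full list.
   image_tensor = pipe.generate(
       "a watercolor painting of a lighthouse at dawn",
       width=512,
       height=512,
       num_inference_steps=20,
   )

   # The pipeline returns an ov.Tensor holding a batch of uint8 images (NHWC layout assumed here).
   Image.fromarray(image_tensor.data[0]).save("result.png")

When LoRA adapters are required, the notebooks additionally pass an ``adapter_config`` object at pipeline construction time; it is omitted from this sketch for brevity.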
diff --git a/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf index 5f24d28643598e..c5632a7e3f9627 100644 Binary files a/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf and b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf differ diff --git a/samples/python/benchmark/bert_benchmark/bert_benchmark.py b/samples/python/benchmark/bert_benchmark/bert_benchmark.py index 11055d7a70b163..bd59c2bd2b3c7b 100755 --- a/samples/python/benchmark/bert_benchmark/bert_benchmark.py +++ b/samples/python/benchmark/bert_benchmark/bert_benchmark.py @@ -11,7 +11,7 @@ import openvino as ov import datasets -from openvino.runtime import get_version +from openvino import get_version from transformers import AutoTokenizer from transformers.onnx import export from transformers.onnx.features import FeaturesManager diff --git a/samples/python/benchmark/sync_benchmark/sync_benchmark.py b/samples/python/benchmark/sync_benchmark/sync_benchmark.py index 6aed1a489e12f4..db877b7238fb37 100755 --- a/samples/python/benchmark/sync_benchmark/sync_benchmark.py +++ b/samples/python/benchmark/sync_benchmark/sync_benchmark.py @@ -10,8 +10,8 @@ import numpy as np import openvino as ov -from openvino.runtime import get_version -from openvino.runtime.utils.types import get_dtype +from openvino import get_version +from openvino.utils.types import get_dtype def fill_tensor_random(tensor): diff --git a/samples/python/benchmark/throughput_benchmark/throughput_benchmark.py b/samples/python/benchmark/throughput_benchmark/throughput_benchmark.py index ce9431e3e5121d..c96179d74edb9c 100755 --- a/samples/python/benchmark/throughput_benchmark/throughput_benchmark.py +++ b/samples/python/benchmark/throughput_benchmark/throughput_benchmark.py @@ -10,8 +10,8 @@ import numpy as np import openvino as ov -from openvino.runtime import get_version -from openvino.runtime.utils.types import get_dtype +from openvino import get_version +from openvino.utils.types import get_dtype def fill_tensor_random(tensor): diff --git a/samples/python/model_creation_sample/model_creation_sample.py b/samples/python/model_creation_sample/model_creation_sample.py index 1a9fdac84fa1a1..8572051b577f8a 100755 --- a/samples/python/model_creation_sample/model_creation_sample.py +++ b/samples/python/model_creation_sample/model_creation_sample.py @@ -9,7 +9,7 @@ import numpy as np import openvino as ov -from openvino.runtime import op, opset1, opset8 +from openvino import op, opset1, opset8 from data import digits diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt index 90222b06f1f964..c136c232391b00 100644 --- a/src/bindings/python/constraints.txt +++ b/src/bindings/python/constraints.txt @@ -21,6 +21,5 @@ h5py>=3.1.0,<3.13.0 docopt~=0.6.2 paddlepaddle==2.6.2 tensorflow>=1.15.5,<2.18.0 -six~=1.16.0 -protobuf>=3.18.1,<4.0.0 +protobuf>=3.18.1,<6.0.0 onnx==1.17.0 diff --git a/src/bindings/python/requirements_test.txt b/src/bindings/python/requirements_test.txt index 1aa2ff24b1b948..f95d5bab53b465 100644 --- a/src/bindings/python/requirements_test.txt +++ b/src/bindings/python/requirements_test.txt @@ -5,7 +5,7 @@ black flake8<=7.1.1 flake8-annotations-complexity<=0.0.8 flake8-broken-line<=1.0.0 -flake8-bugbear<=24.8.19 +flake8-bugbear<=24.12.12 flake8-class-attributes-order<=0.1.3 flake8-comprehensions<=3.16.0 flake8-debugger<=4.1.2 diff --git a/src/bindings/python/src/openvino/_ov_api.py b/src/bindings/python/src/openvino/_ov_api.py index 
da31fab4c95d8e..4d7c349b03167e 100644 --- a/src/bindings/python/src/openvino/_ov_api.py +++ b/src/bindings/python/src/openvino/_ov_api.py @@ -22,7 +22,12 @@ ) -class Model: +class ModelMeta(type): + def __dir__(cls) -> list: + return list(set(cls.__dict__.keys()) | set(dir(ModelBase))) + + +class Model(object, metaclass=ModelMeta): def __init__(self, *args: Any, **kwargs: Any) -> None: if args and not kwargs: if isinstance(args[0], ModelBase): @@ -65,6 +70,10 @@ def __exit__(self, exc_type: Type[BaseException], exc_value: BaseException, trac def __repr__(self) -> str: return self.__model.__repr__() + def __dir__(self) -> list: + wrapper_methods = ["__copy__", "__deepcopy__", "__dict__", "__enter__", "__exit__", "__getattr__", "__weakref__"] + return dir(self.__model) + wrapper_methods + class InferRequest(_InferRequestWrapper): """InferRequest class represents infer request which can be run in asynchronous or synchronous manners.""" diff --git a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py index 37b938491a673d..1006666374e778 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py @@ -38,6 +38,7 @@ def __init__( skip_freeze=False, constant_cache=None, module_extensions=None, + trace_kwargs=None, ): super().__init__() # We store every decoder created by this decoder so that all them are not deleted until the first decoder is deleted @@ -57,7 +58,7 @@ def __init__( self.config = pt_module.config.to_dict() try: pt_module = self._get_scripted_model( - pt_module, example_input, skip_freeze) + pt_module, example_input, skip_freeze, trace_kwargs) except Exception as e: if example_input is not None: msg = "tracing" @@ -109,7 +110,7 @@ def _get_preserved_attributes(model) -> list: preserved_attributes.append(name) return preserved_attributes - def _get_scripted_model(self, pt_module, example_inputs=None, skip_freeze=False): + def _get_scripted_model(self, pt_module, example_inputs=None, skip_freeze=False, trace_kwargs=None): freeze_by_default = False if isinstance(pt_module, torch.nn.Module): pt_module.eval() @@ -154,9 +155,11 @@ def _get_scripted_model(self, pt_module, example_inputs=None, skip_freeze=False) quantized.unpatch_quantized(pt_module) patched = False + if trace_kwargs is None: + trace_kwargs = {} try: scripted = torch.jit.trace( - pt_module, **input_parameters, strict=False) + pt_module, **input_parameters, strict=False, **trace_kwargs) finally: if patched: quantized.unpatch_quantized(pt_module) diff --git a/src/bindings/python/src/openvino/properties/intel_npu/__init__.py b/src/bindings/python/src/openvino/properties/intel_npu/__init__.py new file mode 100644 index 00000000000000..7a9b8c777907f0 --- /dev/null +++ b/src/bindings/python/src/openvino/properties/intel_npu/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Properties +import openvino._pyopenvino.properties.intel_npu as __intel_npu +from openvino.properties._properties import __make_properties + +__make_properties(__intel_npu, __name__) diff --git a/src/bindings/python/src/pyopenvino/core/offline_transformations.cpp b/src/bindings/python/src/pyopenvino/core/offline_transformations.cpp index 90aece1803f4b4..15a609f26d8fe9 100644 --- a/src/bindings/python/src/pyopenvino/core/offline_transformations.cpp +++ 
b/src/bindings/python/src/pyopenvino/core/offline_transformations.cpp @@ -143,15 +143,18 @@ void regmodule_offline_transformations(py::module m) { m_offline_transformations.def( "paged_attention_transformation", - [](py::object& ie_api_model, bool use_block_indices_inputs, bool use_score_outputs) { + [](py::object& ie_api_model, bool use_block_indices_inputs, bool use_score_outputs, bool allow_cache_rotation) { const auto model = Common::utils::convert_to_model(ie_api_model); ov::pass::Manager manager; - manager.register_pass(use_block_indices_inputs, use_score_outputs); + manager.register_pass(use_block_indices_inputs, + use_score_outputs, + allow_cache_rotation); manager.run_passes(model); }, py::arg("model"), py::arg("use_block_indices_inputs") = false, - py::arg("use_score_outputs") = false); + py::arg("use_score_outputs") = false, + py::arg("allow_cache_rotation") = false); m_offline_transformations.def( "stateful_to_stateless_transformation", diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index 2e3a2c8d1c9be8..d434cd3f573d9c 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -4,6 +4,10 @@ #include "pyopenvino/core/properties/properties.hpp" +#include "openvino/runtime/auto/properties.hpp" +#include "openvino/runtime/intel_cpu/properties.hpp" +#include "openvino/runtime/intel_gpu/properties.hpp" +#include "openvino/runtime/intel_npu/properties.hpp" #include "pyopenvino/core/common.hpp" #include "pyopenvino/graph/any.hpp" #include "pyopenvino/utils/utils.hpp" @@ -316,4 +320,19 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_intel_auto, ov::intel_auto::enable_startup_fallback, "enable_startup_fallback"); wrap_property_RW(m_intel_auto, ov::intel_auto::enable_runtime_fallback, "enable_runtime_fallback"); wrap_property_RW(m_intel_auto, ov::intel_auto::schedule_policy, "schedule_policy"); + + // Submodule npu + py::module m_intel_npu = + m_properties.def_submodule("intel_npu", "openvino.properties.intel_npu submodule that simulates ov::intel_npu"); + + wrap_property_RO(m_intel_npu, ov::intel_npu::device_alloc_mem_size, "device_alloc_mem_size"); + wrap_property_RO(m_intel_npu, ov::intel_npu::device_total_mem_size, "device_total_mem_size"); + wrap_property_RO(m_intel_npu, ov::intel_npu::driver_version, "driver_version"); + + wrap_property_RW(m_intel_npu, ov::intel_npu::compilation_mode_params, "compilation_mode_params"); + wrap_property_RW(m_intel_npu, ov::intel_npu::turbo, "turbo"); + wrap_property_RW(m_intel_npu, ov::intel_npu::tiles, "tiles"); + wrap_property_RW(m_intel_npu, ov::intel_npu::max_tiles, "max_tiles"); + wrap_property_RW(m_intel_npu, ov::intel_npu::bypass_umd_caching, "bypass_umd_caching"); + wrap_property_RW(m_intel_npu, ov::intel_npu::defer_weights_load, "defer_weights_load"); } diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.hpp b/src/bindings/python/src/pyopenvino/core/properties/properties.hpp index 831c8336de8d65..955e5711ff38e2 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.hpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.hpp @@ -8,10 +8,6 @@ #include #include "openvino/runtime/properties.hpp" -#include "openvino/runtime/intel_cpu/properties.hpp" -#include "openvino/runtime/intel_gpu/properties.hpp" -#include "openvino/runtime/auto/properties.hpp" -#include 
"pyopenvino/core/properties/properties.hpp" namespace py = pybind11; diff --git a/src/bindings/python/tests/test_runtime/test_model.py b/src/bindings/python/tests/test_runtime/test_model.py index 425cdb97129c69..cf4cad0d025247 100644 --- a/src/bindings/python/tests/test_runtime/test_model.py +++ b/src/bindings/python/tests/test_runtime/test_model.py @@ -849,3 +849,11 @@ def test_tempdir_save_load_error(): with tempfile.TemporaryDirectory() as model_save_dir: save_model(mem_model, f"{model_save_dir}/model.xml") _ = Core().read_model(f"{model_save_dir}/model.xml") + + +def test_model_dir(): + model = generate_add_model() + num_of_attrs = 83 + + assert type(dir(model)) == list + assert len(dir(model)) >= num_of_attrs diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 15e2d86ead4653..4cdd598e0eee4b 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -13,6 +13,7 @@ import openvino.properties.intel_auto as intel_auto import openvino.properties.intel_gpu as intel_gpu import openvino.properties.intel_gpu.hint as intel_gpu_hint +import openvino.properties.intel_npu as intel_npu import openvino.properties.device as device import openvino.properties.log as log import openvino.properties.streams as streams @@ -190,6 +191,9 @@ def test_conflicting_enum(proxy_enums, expected_values): (intel_gpu.uarch_version, "GPU_UARCH_VERSION"), (intel_gpu.execution_units_count, "GPU_EXECUTION_UNITS_COUNT"), (intel_gpu.memory_statistics, "GPU_MEMORY_STATISTICS"), + (intel_npu.device_alloc_mem_size, "NPU_DEVICE_ALLOC_MEM_SIZE"), + (intel_npu.device_total_mem_size, "NPU_DEVICE_TOTAL_MEM_SIZE"), + (intel_npu.driver_version, "NPU_DRIVER_VERSION"), ], ) def test_properties_ro(ov_property_ro, expected_value): @@ -417,6 +421,36 @@ def test_properties_ro(ov_property_ro, expected_value): "AVAILABLE_DEVICE_MEM_SIZE", ((128, 128),), ), + ( + intel_npu.compilation_mode_params, + "NPU_COMPILATION_MODE_PARAMS", + (("dummy-op-replacement=true", "dummy-op-replacement=true"),), + ), + ( + intel_npu.turbo, + "NPU_TURBO", + ((True, True),), + ), + ( + intel_npu.tiles, + "NPU_TILES", + ((128, 128),), + ), + ( + intel_npu.max_tiles, + "NPU_MAX_TILES", + ((128, 128),), + ), + ( + intel_npu.bypass_umd_caching, + "NPU_BYPASS_UMD_CACHING", + ((True, True),), + ), + ( + intel_npu.defer_weights_load, + "NPU_DEFER_WEIGHTS_LOAD", + ((True, True),), + ), ], ) def test_properties_rw(ov_property_rw, expected_value, test_values): diff --git a/src/common/snippets/include/snippets/lowered/loop_info.hpp b/src/common/snippets/include/snippets/lowered/loop_info.hpp index 23e1f14a8b7f5e..3bb8407594bfcd 100644 --- a/src/common/snippets/include/snippets/lowered/loop_info.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_info.hpp @@ -25,8 +25,6 @@ using LoopInfoPtr = std::shared_ptr; */ class LoopInfo : public std::enable_shared_from_this { public: - enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()}; - LoopInfo() = default; LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); LoopInfo(size_t work_amount, size_t increment, const std::vector& entries, const std::vector& exits); @@ -66,7 +64,7 @@ class LoopInfo : public std::enable_shared_from_this { /** * @brief Returns dimension index if dimension indices for all input and output ports are equal. - * Otherwise returns UNDEFINED_DIM_IDX. 
+ * Otherwise returns LoopPort::UNDEFINED_DIM_IDX. * @return index */ size_t get_dim_idx() const; diff --git a/src/common/snippets/include/snippets/lowered/loop_port.hpp b/src/common/snippets/include/snippets/lowered/loop_port.hpp index 394a396c73ae99..5055ea419ce020 100644 --- a/src/common/snippets/include/snippets/lowered/loop_port.hpp +++ b/src/common/snippets/include/snippets/lowered/loop_port.hpp @@ -6,17 +6,37 @@ #include "snippets/lowered/expression_port.hpp" #include "snippets/lowered/expression.hpp" +#include "snippets/utils/utils.hpp" namespace ov { namespace snippets { namespace lowered { -/* The structure describes port of Loop: expression port that connected to Expressions from other Loops. +/* The class describes a Loop port: an expression port that is connected to Expressions from other Loops. */ -struct LoopPort { +class LoopPort { +public: + enum {UNDEFINED_DIM_IDX = std::numeric_limits::max()}; + enum class Type { + Incremented, // Loop port whose data ptr should be incremented after each Loop iteration + NotIncremented, // Loop port whose data ptr should not be incremented (for example, to avoid a double increment) + NotProcessed, // LoopPort that doesn't process the dim denoted by `dim_idx` (UNDEFINED_DIM_IDX) and is used only for Loop bound definition + }; + LoopPort() = default; - LoopPort(const ExpressionPort& port, bool is_incremented = true, size_t dim_idx = 0); + + template::type = true> + static LoopPort create(const ExpressionPort& port, size_t dim_idx = 0) { + return LoopPort(port, dim_idx, T); + } + + template::type = true> + static LoopPort create(const ExpressionPort& port) { + return LoopPort(port, UNDEFINED_DIM_IDX, Type::NotProcessed); + } std::shared_ptr clone_with_new_expr(const ExpressionPtr& new_expr) const; @@ -24,13 +44,33 @@ struct LoopPort { friend bool operator!=(const LoopPort& lhs, const LoopPort& rhs); friend bool operator<(const LoopPort& lhs, const LoopPort& rhs); - std::shared_ptr expr_port = {}; - // True if after each Loop iteration the corresponding data pointer should be incremented.
- // Otherwise, the data pointer shift is skipped - bool is_incremented = true; - size_t dim_idx = 0; // The numeration starts from the end (dim_idx = 0 -> is the most inner dimension) + const std::shared_ptr& get_expr_port() const { return m_expr_port; } + Type get_type() const { return m_type; } + size_t get_dim_idx() const; + + void set_expr_port(std::shared_ptr p); + void set_dim_idx(size_t idx); + + template::type = true> + void convert_to_type() { + OPENVINO_ASSERT(is_processed(), "NotProcessed LoopPort cannot change type!"); + m_type = T; + } + + bool is_processed() const; + bool is_incremented() const; + +private: + LoopPort(const ExpressionPort& port, size_t dim_idx, Type type); + + std::shared_ptr m_expr_port = {}; + size_t m_dim_idx = 0; // The numeration starts from the end (dim_idx = 0 -> is the most inner dimension) + Type m_type = Type::Incremented; }; +std::ostream& operator<<(std::ostream& out, const LoopPort::Type& type); + } // namespace lowered } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp index 40a2611b80ef48..32b5e241ba4cf8 100644 --- a/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/insert_buffers.hpp @@ -32,8 +32,8 @@ class InsertBuffers : public RangedPass { const LinearIR::constExprIt& begin_it, const LinearIR::constExprIt& end_it, const LoopManagerPtr& loop_manager, - const std::vector& loop_entries, - const std::vector& loop_exits) const; + const std::vector& loop_entries, + const std::vector& loop_exits) const; static LinearIR::constExprIt insertion_position(const LinearIR& linear_ir, const LoopManagerPtr& loop_manager, diff --git a/src/common/snippets/src/lowered/expressions/buffer_expression.cpp b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp index a8b3bb2034b105..0a1b9758a6c371 100644 --- a/src/common/snippets/src/lowered/expressions/buffer_expression.cpp +++ b/src/common/snippets/src/lowered/expressions/buffer_expression.cpp @@ -77,10 +77,10 @@ void BufferExpression::init_allocation_size(const std::shared_ptr& const auto& subtensor = ov::snippets::utils::get_projected_subtensor(parent_port); auto hard_equal = [&parent_port](const LoopPort& port) { - return *port.expr_port == parent_port; + return *port.get_expr_port() == parent_port; }; auto soft_equal = [&](const LoopPort& loop_port) { - const auto& port = *loop_port.expr_port; + const auto& port = *loop_port.get_expr_port(); // Check semantic of LoopPort if (parent_port.get_index() != port.get_index() || port.get_expr()->get_node()->get_type_info() != parent_port.get_expr()->get_node()->get_type_info()) @@ -109,8 +109,10 @@ void BufferExpression::init_allocation_size(const std::shared_ptr& OPENVINO_ASSERT(it != output_ports.end(), "compute_allocation_shape: output port of parent loop can not be found"); } const auto& loop_port = *it; - const auto& dim_idx = loop_port.dim_idx; - if (loop_port.is_incremented && dim_idx < rank) { + if (!loop_port.is_processed()) + continue; + const auto& dim_idx = loop_port.get_dim_idx(); + if (dim_idx < rank) { if (const auto& unified_loop_info = ov::as_type_ptr(loop_info)) m_allocation_size = utils::dynamic_safe_mul(m_allocation_size, unified_loop_info->get_work_amount()); else if (const auto& expanded_loop_info = ov::as_type_ptr(loop_info)) diff --git a/src/common/snippets/src/lowered/loop_info.cpp 
b/src/common/snippets/src/lowered/loop_info.cpp index 1c856869878b80..3e9da0e15ada44 100644 --- a/src/common/snippets/src/lowered/loop_info.cpp +++ b/src/common/snippets/src/lowered/loop_info.cpp @@ -19,9 +19,9 @@ LoopInfo::LoopInfo(size_t work_amount, size_t increment, const std::vector(port)); for (const auto& port : exits) - m_output_ports.emplace_back(port); + m_output_ports.push_back(LoopPort::create(port)); } bool LoopInfo::is_dynamic() const { @@ -30,14 +30,22 @@ bool LoopInfo::is_dynamic() const { size_t LoopInfo::get_dim_idx() const { OPENVINO_ASSERT(!m_input_ports.empty(), "Loop info must have at least one input port"); - auto equal_dim_idxes = [&](const LoopPort& p) { - return !p.is_incremented || p.dim_idx == m_input_ports[0].dim_idx; - }; + + auto is_processed = [](const LoopPort& p) { return p.is_processed(); }; + auto is_processed_it = std::find_if(m_input_ports.begin(), m_input_ports.end(), is_processed); + if (is_processed_it == m_input_ports.end()) { + is_processed_it = std::find_if(m_output_ports.begin(), m_output_ports.end(), is_processed); + if (is_processed_it == m_output_ports.end()) + return LoopPort::UNDEFINED_DIM_IDX; + } + const auto dim_idx = is_processed_it->get_dim_idx(); + + auto equal_dim_idxes = [&](const LoopPort& p) { return !p.is_processed() || p.get_dim_idx() == dim_idx; }; if (std::all_of(m_input_ports.begin(), m_input_ports.end(), equal_dim_idxes) && std::all_of(m_output_ports.begin(), m_output_ports.end(), equal_dim_idxes)) { - return m_input_ports[0].dim_idx; + return dim_idx; } else { - return UNDEFINED_DIM_IDX; + return LoopPort::UNDEFINED_DIM_IDX; } } @@ -60,7 +68,7 @@ size_t LoopInfo::get_increment() const { std::vector LoopInfo::get_is_incremented() const { std::vector values; values.reserve(get_input_count() + get_output_count()); - iterate_through_ports([&values](const LoopPort& port) { values.push_back(port.is_incremented); }); + iterate_through_ports([&values](const LoopPort& port) { values.push_back(port.is_incremented()); }); return values; } @@ -81,14 +89,14 @@ void LoopInfo::set_increment(size_t increment) { } void LoopInfo::set_dim_idx(size_t dim_idx) { - auto setter = [dim_idx](LoopPort& port) { port.dim_idx = dim_idx; }; + auto setter = [dim_idx](LoopPort& port) { if (port.is_processed()) port.set_dim_idx(dim_idx); }; std::for_each(m_input_ports.begin(), m_input_ports.end(), setter); std::for_each(m_output_ports.begin(), m_output_ports.end(), setter); } template<> std::vector::iterator LoopInfo::find_loop_port(const LoopPort& loop_port) { - auto& ports = loop_port.expr_port->get_type() == ExpressionPort::Input ? m_input_ports : m_output_ports; + auto& ports = loop_port.get_expr_port()->get_type() == ExpressionPort::Input ? m_input_ports : m_output_ports; const auto it = std::find_if(ports.begin(), ports.end(), [&loop_port](const LoopPort& port) { return port == loop_port; }); OPENVINO_ASSERT(it != ports.end(), "Failed find_loop_port: existing loop port has not been found"); @@ -99,7 +107,7 @@ template<> std::vector::iterator LoopInfo::find_loop_port(const ExpressionPort& expr_port) { auto& ports = expr_port.get_type() == ExpressionPort::Input ? 
m_input_ports : m_output_ports; const auto it = std::find_if(ports.begin(), ports.end(), - [&expr_port](const LoopPort& port) { return *port.expr_port == expr_port; }); + [&expr_port](const LoopPort& port) { return *port.get_expr_port() == expr_port; }); return it; } @@ -118,7 +126,7 @@ namespace { void validate_new_target_ports(const std::vector& target_ports, ExpressionPort::Type target_type) { OPENVINO_ASSERT(target_ports.empty() || std::all_of(target_ports.cbegin(), target_ports.cend(), - [&target_type](const LoopPort& target_port) { return target_type == target_port.expr_port->get_type(); })); + [&target_type](const LoopPort& target_port) { return target_type == target_port.get_expr_port()->get_type(); })); } void validate_new_target_ports(const std::vector& target_ports, ExpressionPort::Type target_type) { OPENVINO_ASSERT(target_ports.empty() || @@ -128,7 +136,7 @@ void validate_new_target_ports(const std::vector& target_ports, } // namespace void LoopInfo::replace_with_new_ports(const LoopPort& actual_port, const std::vector& target_ports) { - const auto& actual_port_type = actual_port.expr_port->get_type(); + const auto& actual_port_type = actual_port.get_expr_port()->get_type(); validate_new_target_ports(target_ports, actual_port_type); auto& ports = actual_port_type == ExpressionPort::Input ? m_input_ports : m_output_ports; @@ -153,7 +161,7 @@ void LoopInfo::replace_with_new_ports(const ExpressionPort& actual_port, const s std::transform(target_loop_ports.begin(), target_loop_ports.end(), target_ports.begin(), target_loop_ports.begin(), [](LoopPort loop_port, const ExpressionPort& expr_port) { LoopPort copy = std::move(loop_port); // to save loop port parameters - copy.expr_port = std::make_shared(expr_port); + copy.set_expr_port(std::make_shared(expr_port)); return copy; }); port_it = ports.erase(port_it); @@ -164,7 +172,7 @@ std::vector LoopInfo::clone_loop_ports(const ExpressionMap& expr_map, std::vector cloned_port_points; cloned_port_points.reserve(loop_ports.size()); for (const auto& p : loop_ports) { - const auto& expr = p.expr_port->get_expr().get(); + const auto& expr = p.get_expr_port()->get_expr().get(); OPENVINO_ASSERT(expr_map.count(expr), "Can't clone LoopInfo: old expression is not in the map"); const auto& new_expr = expr_map.at(expr); cloned_port_points.emplace_back(*p.clone_with_new_expr(new_expr)); @@ -309,8 +317,8 @@ std::vector get_port_index_order(const std::vector& ports) { std::iota(new_indexes.begin(), new_indexes.end(), 0); std::sort(new_indexes.begin(), new_indexes.end(), [ports](size_t l, size_t r) { - const auto& expr_port_l = ports[l].expr_port; - const auto& expr_port_r = ports[r].expr_port; + const auto& expr_port_l = ports[l].get_expr_port(); + const auto& expr_port_r = ports[r].get_expr_port(); if (expr_port_l->get_expr() == expr_port_r->get_expr()) return expr_port_l->get_index() < expr_port_r->get_index(); return expr_port_l->get_expr()->get_exec_num() < expr_port_r->get_expr()->get_exec_num(); @@ -340,7 +348,7 @@ UnifiedLoopInfo::LoopPortInfo UnifiedLoopInfo::get_loop_port_info(const Expressi const auto& ports = is_input ? m_input_ports : m_output_ports; const auto& descs = is_input ? 
m_input_port_descs : m_output_port_descs; const auto it = std::find_if(ports.begin(), ports.end(), - [&expr_port](const LoopPort& port) { return *port.expr_port == expr_port; }); + [&expr_port](const LoopPort& port) { return *port.get_expr_port() == expr_port; }); const auto index = static_cast(std::distance(ports.cbegin(), it)); OPENVINO_ASSERT(index < ports.size() && index < descs.size(), "LoopPortInfo has not been found!"); return {ports[index], descs[index]}; @@ -354,10 +362,10 @@ void UnifiedLoopInfo::replace_with_cloned_descs(size_t actual_port_idx, size_t n } void UnifiedLoopInfo::replace_with_new_ports(const LoopPort& actual_port, const std::vector& target_ports) { - const auto& actual_port_type = actual_port.expr_port->get_type(); + const auto& actual_port_type = actual_port.get_expr_port()->get_type(); validate_new_target_ports(target_ports, actual_port_type); - const auto is_input = actual_port.expr_port->get_type() == ExpressionPort::Input; + const auto is_input = actual_port.get_expr_port()->get_type() == ExpressionPort::Input; auto& ports = is_input ? m_input_ports : m_output_ports; auto port_it = find_loop_port(actual_port); diff --git a/src/common/snippets/src/lowered/loop_manager.cpp b/src/common/snippets/src/lowered/loop_manager.cpp index f0e5306c5878bc..b4816b3d9efaa7 100644 --- a/src/common/snippets/src/lowered/loop_manager.cpp +++ b/src/common/snippets/src/lowered/loop_manager.cpp @@ -93,7 +93,7 @@ std::pair LoopManager::get_loop_bo OPENVINO_ASSERT(!entries.empty(), "Loop must have input ports"); OPENVINO_ASSERT(!exits.empty(), "Loop must have output ports"); - const auto& entry_expr = entries.front().expr_port->get_expr(); + const auto& entry_expr = entries.front().get_expr_port()->get_expr(); auto loop_begin_pos = linear_ir.find(entry_expr); // Some operations in Loop can be before first input ports: Scalars, VectorBuffer. // We should iterate by them till the expr is in the corresponding Loop @@ -103,7 +103,7 @@ std::pair LoopManager::get_loop_bo prev_loop_ids = (*std::prev(loop_begin_pos))->get_loop_ids(); } - const auto& exit_expr = exits.back().expr_port->get_expr(); + const auto& exit_expr = exits.back().get_expr_port()->get_expr(); auto loop_end_pos = std::next(linear_ir.find_after(loop_begin_pos, exit_expr)); // There might be LoopEnd with another `loop_id` but in the target Loop as well. 
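For reference, a minimal sketch of how the reworked LoopPort API above is intended to be used (illustration only, not part of this patch; it assumes code inside the ov::snippets::lowered namespace, and `in_port`/`out_port` are placeholder ExpressionPort objects):

    using PortType = LoopPort::Type;

    // Previously: LoopPort(in_port, /*is_incremented=*/true, /*dim_idx=*/0).
    // Now the port kind is chosen via the factory's template argument.
    auto incremented     = LoopPort::create<PortType::Incremented>(in_port, /*dim_idx=*/0);
    auto not_incremented = LoopPort::create<PortType::NotIncremented>(in_port, /*dim_idx=*/1);
    auto not_processed   = LoopPort::create<PortType::NotProcessed>(out_port);  // dim_idx stays UNDEFINED_DIM_IDX

    // Direct field access is replaced by getters.
    if (incremented.is_processed()) {
        const size_t dim_idx = incremented.get_dim_idx();            // asserts is_processed()
        const auto& expr = incremented.get_expr_port()->get_expr();  // former `expr_port` member
    }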
auto current_loop_ids = (*loop_end_pos)->get_loop_ids(); @@ -312,14 +312,14 @@ void LoopManager::fuse_loop_ports(std::vector& output_ports, std::vector new_output_ports; for (const auto& output_port : output_ports) { - const auto consumers_inputs = output_port.expr_port->get_connected_ports(); + const auto consumers_inputs = output_port.get_expr_port()->get_connected_ports(); std::set mapped_input_ports; std::set outside_consumers; for (const auto& consumer_input : consumers_inputs) { const auto input_port_it = std::find_if(input_ports.begin(), input_ports.end(), [&consumer_input](const LoopPort& port) { - return *port.expr_port.get() == consumer_input; + return *port.get_expr_port().get() == consumer_input; }); if (input_port_it != input_ports.end()) { mapped_input_ports.insert(*input_port_it); diff --git a/src/common/snippets/src/lowered/loop_port.cpp b/src/common/snippets/src/lowered/loop_port.cpp index 52f59bb1fa4d35..4290ee1adefcdb 100644 --- a/src/common/snippets/src/lowered/loop_port.cpp +++ b/src/common/snippets/src/lowered/loop_port.cpp @@ -11,26 +11,64 @@ namespace ov { namespace snippets { namespace lowered { -LoopPort::LoopPort(const ExpressionPort& port, bool is_incremented, size_t dim_idx) - : expr_port(std::make_shared(port)), is_incremented(is_incremented), dim_idx(dim_idx) { - OPENVINO_ASSERT(dim_idx < port.get_descriptor_ptr()->get_shape().size(), - "LoopPort dim_idx (", - dim_idx, - ") must be less than the corresponding expression port shape rank (", - port.get_descriptor_ptr()->get_shape().size(), - ")"); +LoopPort::LoopPort(const ExpressionPort& port, size_t dim_idx, Type type) + : m_expr_port(std::make_shared(port)), m_type(type) { + if (is_processed()) { + set_dim_idx(dim_idx); + } else { + OPENVINO_ASSERT(dim_idx == UNDEFINED_DIM_IDX, "NotProcessed LoopPort can have only UNDEFINED_DIM_IDX"); + m_dim_idx = dim_idx; + } } std::shared_ptr LoopPort::clone_with_new_expr(const ExpressionPtr& new_expr) const { auto new_loop_port = std::make_shared(*this); - new_loop_port->expr_port = expr_port->clone_with_new_expr(new_expr); + new_loop_port->m_expr_port = m_expr_port->clone_with_new_expr(new_expr); return new_loop_port; } +bool LoopPort::is_processed() const { + switch (m_type) { + case Type::Incremented: + case Type::NotIncremented: + return true; + case Type::NotProcessed: + return false; + default: + OPENVINO_THROW("Unknown LoopPort type"); + } +} + +bool LoopPort::is_incremented() const { + return m_type == Type::Incremented; +} + +size_t LoopPort::get_dim_idx() const { + OPENVINO_ASSERT(is_processed(), "NotProcessed LoopPort cannot call `get_dim_idx()`"); + return m_dim_idx; +} + +void LoopPort::set_expr_port(std::shared_ptr p) { + OPENVINO_ASSERT(p, "Expression port is missing"); + m_expr_port = std::move(p); +} + +void LoopPort::set_dim_idx(size_t idx) { + OPENVINO_ASSERT(is_processed(), "NotProcessed LoopPort cannot call `set_dim_idx()`"); + OPENVINO_ASSERT(idx < m_expr_port->get_descriptor_ptr()->get_shape().size(), + "LoopPort dim_idx (", + idx, + ") must be less than the corresponding expression port shape rank (", + m_expr_port->get_descriptor_ptr()->get_shape().size(), + ")"); + + m_dim_idx = idx; +} + bool operator==(const LoopPort& lhs, const LoopPort& rhs) { if (&lhs == &rhs) return true; - return *lhs.expr_port == *rhs.expr_port && lhs.is_incremented == rhs.is_incremented && lhs.dim_idx == rhs.dim_idx; + return *lhs.m_expr_port == *rhs.m_expr_port && lhs.m_type == rhs.m_type && lhs.m_dim_idx == rhs.m_dim_idx; } bool operator!=(const LoopPort& lhs, const
LoopPort& rhs) { @@ -38,12 +76,30 @@ bool operator!=(const LoopPort& lhs, const LoopPort& rhs) { } bool operator<(const LoopPort& lhs, const LoopPort& rhs) { - return (*lhs.expr_port < *rhs.expr_port) || - (*lhs.expr_port == *rhs.expr_port && - (lhs.is_incremented < rhs.is_incremented || - (lhs.is_incremented == rhs.is_incremented && lhs.dim_idx < rhs.dim_idx))); + return (*lhs.m_expr_port < *rhs.m_expr_port) || + (*lhs.m_expr_port == *rhs.m_expr_port && + (lhs.m_type < rhs.m_type || + (lhs.m_type == rhs.m_type && lhs.m_dim_idx < rhs.m_dim_idx))); } +std::ostream& operator<<(std::ostream& out, const LoopPort::Type& type) { + switch (type) { + case LoopPort::Type::Incremented: + out << "Incremented"; + break; + case LoopPort::Type::NotIncremented: + out << "NotIncremented"; + break; + case LoopPort::Type::NotProcessed: + out << "NotProcessed"; + break; + default: + OPENVINO_THROW("Unknown LoopPort Type"); + } + return out; +} + + } // namespace lowered } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp b/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp index 4309986a9fe3de..663b4d1fe05b84 100644 --- a/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp +++ b/src/common/snippets/src/lowered/pass/brgemm_blocking.cpp @@ -18,21 +18,21 @@ namespace snippets { namespace lowered { namespace pass { using namespace ov::snippets::utils; +using PortType = LoopPort::Type; -snippets::lowered::SpecificIterationHandlers BrgemmBlockingBase::get_default_blocking_loop_handlers(size_t work_amount, - size_t block_size) { +lowered::SpecificIterationHandlers BrgemmBlockingBase::get_default_blocking_loop_handlers(size_t work_amount, size_t block_size) { OPENVINO_ASSERT(block_size != 0, "block size must be non zero"); SpecificIterationHandlers handlers; - const auto tail_size = snippets::utils::is_dynamic_value(work_amount) ? snippets::utils::get_dynamic_value() : work_amount % block_size; + const auto tail_size = utils::is_dynamic_value(work_amount) ? 
utils::get_dynamic_value() : work_amount % block_size; if (tail_size != 0) - handlers.register_pass(tail_size); + handlers.register_pass(tail_size); return handlers; } -bool BrgemmBlockingBase::blocking_loop_exists(const snippets::lowered::LoopManagerPtr& loop_manager, +bool BrgemmBlockingBase::blocking_loop_exists(const lowered::LoopManagerPtr& loop_manager, const ExpressionPtr& brgemm_expr) { auto check_port = [&](const LoopPort& p) { - return p.expr_port->get_expr() == brgemm_expr && ov::snippets::utils::one_of(p.dim_idx, 0ul, 1ul); + return p.get_expr_port()->get_expr() == brgemm_expr && one_of(p.get_dim_idx(), 0ul, 1ul); }; const auto& loop_ids = brgemm_expr->get_loop_ids(); for (const auto& id : loop_ids) { @@ -45,37 +45,37 @@ bool BrgemmBlockingBase::blocking_loop_exists(const snippets::lowered::LoopManag return false; } -void BrgemmBlockingBase::mark_m_blocking(const snippets::lowered::LoopManagerPtr& loop_manager, - snippets::lowered::LinearIR::constExprIt loop_begin, - snippets::lowered::LinearIR::constExprIt loop_end, - const std::vector& entries, - const std::vector& exits, +void BrgemmBlockingBase::mark_m_blocking(const LoopManagerPtr& loop_manager, + LinearIR::constExprIt loop_begin, + LinearIR::constExprIt loop_end, + const std::vector& entries, + const std::vector& exits, size_t block_size_m) { - const auto planar_dims = ov::snippets::utils::get_planar_vdims(*entries[0].expr_port); + const auto planar_dims = get_planar_vdims(*entries[0].get_expr_port()); const auto m = *++planar_dims.rbegin(); - const auto id = loop_manager->mark_loop(loop_begin, loop_end, m, block_size_m, 1, entries, exits, false); + const auto id = loop_manager->mark_loop(loop_begin, loop_end, m, block_size_m, entries, exits, false); loop_manager->get_loop_info(id)->set_handlers(get_m_loop_handlers(m, block_size_m)); } -void BrgemmBlockingBase::mark_n_blocking(const snippets::lowered::LoopManagerPtr& loop_manager, - snippets::lowered::LinearIR::constExprIt loop_begin, - snippets::lowered::LinearIR::constExprIt loop_end, - const std::vector& entries, - const std::vector& exits, +void BrgemmBlockingBase::mark_n_blocking(const LoopManagerPtr& loop_manager, + LinearIR::constExprIt loop_begin, + LinearIR::constExprIt loop_end, + const std::vector& entries, + const std::vector& exits, size_t block_size_n) { - const auto planar_dims = ov::snippets::utils::get_planar_vdims(*entries[1].expr_port); + const auto planar_dims = get_planar_vdims(*entries[1].get_expr_port()); const auto n = *planar_dims.rbegin(); - const auto id = loop_manager->mark_loop(loop_begin, loop_end, n, block_size_n, 0, entries, exits, false); + const auto id = loop_manager->mark_loop(loop_begin, loop_end, n, block_size_n, entries, exits, false); loop_manager->get_loop_info(id)->set_handlers(get_n_loop_handlers(n, block_size_n)); } -void BrgemmBlockingBase::mark_k_blocking(const snippets::lowered::LoopManagerPtr& loop_manager, - snippets::lowered::LinearIR::constExprIt loop_begin, - snippets::lowered::LinearIR::constExprIt loop_end, - const std::vector& entries, - const std::vector& exits, +void BrgemmBlockingBase::mark_k_blocking(const LoopManagerPtr& loop_manager, + LinearIR::constExprIt loop_begin, + LinearIR::constExprIt loop_end, + const std::vector& entries, + const std::vector& exits, size_t block_size_k) { - const auto planar_dims = ov::snippets::utils::get_planar_vdims(*entries[0].expr_port); + const auto planar_dims = get_planar_vdims(*entries[0].get_expr_port()); const auto k = *planar_dims.rbegin(); const auto id = 
loop_manager->mark_loop(loop_begin, loop_end, k, block_size_k, entries, exits, false); loop_manager->get_loop_info(id)->set_handlers(get_k_loop_handlers(k, block_size_k)); @@ -101,21 +101,21 @@ size_t BrgemmBlockingBase::get_default_k_blk(size_t k) const { return !utils::is_dynamic_value(k) && k > 1024 ? 1024 : 512; } -std::tuple BrgemmBlockingBase::get_blocking_params(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) const { +std::tuple BrgemmBlockingBase::get_blocking_params(const ExpressionPtr& brgemm_expr) const { size_t m, n, k; std::tie(m, n, k) = get_brgemm_dimensions(brgemm_expr); // Ticket: 113745 // TODO: extend block size selection heuristics auto get_block_size = [](const size_t dim, const size_t default_blk) { - if (!snippets::utils::is_dynamic_value(dim) && dim <= default_blk) + if (!utils::is_dynamic_value(dim) && dim <= default_blk) return get_full_dim_value(); return default_blk; }; return std::make_tuple(get_block_size(m, get_default_m_blk(m)), get_block_size(n, get_default_n_blk(n)), get_block_size(k, get_default_k_blk(k))); } -std::tuple BrgemmBlockingBase::get_brgemm_dimensions(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { +std::tuple BrgemmBlockingBase::get_brgemm_dimensions(const ExpressionPtr& brgemm_expr) { OPENVINO_ASSERT(brgemm_expr, "Brgemm expression is nullptr!"); const auto& in_0_desc = brgemm_expr->get_input_port_descriptor(0); const auto& in_1_desc = brgemm_expr->get_input_port_descriptor(1); @@ -134,33 +134,33 @@ std::tuple BrgemmBlockingBase::get_brgemm_dimensions(con return std::make_tuple(m, n, k); } -bool BrgemmBlockingBase::mark_blocking_loops(snippets::lowered::LinearIR& linear_ir, - const snippets::lowered::LinearIR::constExprIt& brgemm_it, +bool BrgemmBlockingBase::mark_blocking_loops(LinearIR& linear_ir, + const LinearIR::constExprIt& brgemm_it, size_t m_block, size_t n_block, size_t k_block) { const auto& brgemm_expr = *brgemm_it; - brgemm_expr->get_input_port_descriptor(0)->set_subtensor(ov::snippets::VectorDims{m_block, k_block}); - brgemm_expr->get_input_port_descriptor(1)->set_subtensor(ov::snippets::VectorDims{k_block, n_block}); - brgemm_expr->get_output_port_descriptor(0)->set_subtensor(ov::snippets::VectorDims{m_block, n_block}); + brgemm_expr->get_input_port_descriptor(0)->set_subtensor(VectorDims{m_block, k_block}); + brgemm_expr->get_input_port_descriptor(1)->set_subtensor(VectorDims{k_block, n_block}); + brgemm_expr->get_output_port_descriptor(0)->set_subtensor(VectorDims{m_block, n_block}); const auto& loop_manager = linear_ir.get_loop_manager(); - if (!ov::snippets::utils::is_full_dim_value(k_block)) { - const std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 0), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}; - const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), false)}; + if (!is_full_dim_value(k_block)) { + const std::vector entries{LoopPort::create(brgemm_expr->get_input_port(0), 0), + LoopPort::create(brgemm_expr->get_input_port(1), 1)}; + const std::vector exits{LoopPort::create(brgemm_expr->get_output_port(0))}; mark_k_blocking(loop_manager, brgemm_it, std::next(brgemm_it), entries, exits, k_block); } - if (!ov::snippets::utils::is_full_dim_value(n_block)) { - const std::vector entries{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1), true)}; - const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + if (!is_full_dim_value(n_block)) { + const std::vector 
entries{LoopPort::create(brgemm_expr->get_input_port(0)), + LoopPort::create(brgemm_expr->get_input_port(1))}; + const std::vector exits{LoopPort::create(brgemm_expr->get_output_port(0))}; mark_n_blocking(loop_manager, brgemm_it, std::next(brgemm_it), entries, exits, n_block); } - if (!ov::snippets::utils::is_full_dim_value(m_block)) { - const std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true), - LoopPort(brgemm_expr->get_input_port(1), false)}; - const std::vector exits{LoopPort(brgemm_expr->get_output_port(0), true)}; + if (!is_full_dim_value(m_block)) { + const std::vector entries{LoopPort::create(brgemm_expr->get_input_port(0), 1), + LoopPort::create(brgemm_expr->get_input_port(1))}; + const std::vector exits{LoopPort::create(brgemm_expr->get_output_port(0), 1)}; mark_m_blocking(loop_manager, brgemm_it, std::next(brgemm_it), entries, exits, m_block); } return true; diff --git a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp index e0397b03224bc3..9090b0a0cc7ef7 100644 --- a/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp +++ b/src/common/snippets/src/lowered/pass/clean_repeated_ptr_shifts.cpp @@ -96,10 +96,10 @@ bool CleanRepeatedDataPointerShifts::reuse_increments(const LoopManagerPtr& loop const auto loop_info = loop_manager->get_loop_info(loop_end->get_id()); size_t loop_port_idx = 0; loop_info->iterate_through_infos([&resetting_data_indexes, &loop_port_idx](LoopPort& loop_port, UnifiedLoopInfo::LoopPortDesc& shifts) { - if (resetting_data_indexes.count(loop_port_idx)) { + if (resetting_data_indexes.count(loop_port_idx) && loop_port.is_processed()) { shifts.ptr_increment = 0; shifts.finalization_offset = 0; - loop_port.is_incremented = false; + loop_port.convert_to_type(); } ++loop_port_idx; }); diff --git a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp index e8132d62be0cc9..937babf4a19bf0 100644 --- a/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp +++ b/src/common/snippets/src/lowered/pass/define_buffer_clusters.cpp @@ -74,7 +74,7 @@ std::pair Defi BufferMap input_buffers; const auto& loop_inputs = loop_info->get_input_ports_info(); for (const auto& port_info : loop_inputs) { - const auto& buffer_expr = ov::as_type_ptr(port_info.port.expr_port->get_port_connector_ptr()->get_source().get_expr()); + const auto& buffer_expr = ov::as_type_ptr(port_info.port.get_expr_port()->get_port_connector_ptr()->get_source().get_expr()); if (!is_direct_buffer(buffer_expr, loop_expr)) continue; if (input_buffers.count(buffer_expr) > 0) { @@ -89,7 +89,7 @@ std::pair Defi BufferMap output_buffers; const auto& loop_outputs = loop_info->get_output_ports_info(); for (const auto& port_info : loop_outputs) { - const auto& consumer_inputs = port_info.port.expr_port->get_port_connector_ptr()->get_consumers(); + const auto& consumer_inputs = port_info.port.get_expr_port()->get_port_connector_ptr()->get_consumers(); for (const auto& consumer_input : consumer_inputs) { const auto& buffer_expr = ov::as_type_ptr(consumer_input.get_expr()); if (!is_direct_buffer(buffer_expr, loop_expr)) @@ -130,11 +130,13 @@ void DefineBufferClusters::parse_loop(const LoopManagerPtr& loop_manager, const if ((input_buffer_expr->get_data_type().size() < output_buffer_expr->get_data_type().size())) continue; - const auto in_path = 
MarkInvariantShapePath::getInvariantPortShapePath(*input_buffer_port_info.port.expr_port); - const auto out_path = MarkInvariantShapePath::getInvariantPortShapePath(*output_buffer_port_info.port.expr_port); + const auto in_path = MarkInvariantShapePath::getInvariantPortShapePath(*input_buffer_port_info.port.get_expr_port()); + const auto out_path = MarkInvariantShapePath::getInvariantPortShapePath(*output_buffer_port_info.port.get_expr_port()); // - Memory can be reused if there are the same loop pointer increments (data size, final offsets, ptr increments). // For that, loop ports with buffers should be on the same shape-path and have the same value of `is_incremented`. - if (in_path != out_path || input_buffer_port_info.port.is_incremented != output_buffer_port_info.port.is_incremented) + const auto in_is_incremented = input_buffer_port_info.port.is_incremented(); + const auto out_is_incremented = output_buffer_port_info.port.is_incremented(); + if (in_path != out_path || in_is_incremented != out_is_incremented) continue; // - Memory can be shared if Buffer has the same allocation size. @@ -174,13 +176,13 @@ void DefineBufferClusters::parse_nested_loops(const LoopManagerPtr& loop_manager auto can_be_data_ptr_proportionally_shifted = [](const LoopPortInfo& outer_port_info, const LoopPortInfo& inner_port_info) { // Outer Buffer ptr should be shifted to emulate "window" sliding const auto& outer_desc = outer_port_info.desc; - if (!outer_port_info.port.is_incremented || (!utils::is_dynamic_value(outer_desc.ptr_increment) && outer_desc.ptr_increment == 0)) + if (!outer_port_info.port.is_incremented() || (!utils::is_dynamic_value(outer_desc.ptr_increment) && outer_desc.ptr_increment == 0)) return false; - OPENVINO_ASSERT(inner_port_info.port.expr_port && outer_port_info.port.expr_port, "Expression ports are nullptr!"); + OPENVINO_ASSERT(inner_port_info.port.get_expr_port() && outer_port_info.port.get_expr_port(), "Expression ports are nullptr!"); // we can be sure that these data pointers will be proportionally shifted if they're on the same invariant shape path - return MarkInvariantShapePath::getInvariantPortShapePath(*inner_port_info.port.expr_port) == - MarkInvariantShapePath::getInvariantPortShapePath(*outer_port_info.port.expr_port); + return MarkInvariantShapePath::getInvariantPortShapePath(*inner_port_info.port.get_expr_port()) == + MarkInvariantShapePath::getInvariantPortShapePath(*outer_port_info.port.get_expr_port()); }; const auto outer_loop_begin = ov::as_type_ptr(outer_loop_end_expr_it->get()->get_node())->get_loop_begin(); @@ -242,7 +244,7 @@ bool DefineBufferClusters::init_buffer_last_loop_port_info(const LoopManagerPtr& const auto consumers = buffer_out->get_consumers(); for (const auto& consumer : consumers) { if (const auto& direct_loop = get_direct_loop_for_buffer_out(buffer_expr, consumer.get_expr())) { - const auto loop_order = direct_loop->get_output_ports().back().expr_port->get_expr()->get_exec_num(); + const auto loop_order = direct_loop->get_output_ports().back().get_expr_port()->get_expr()->get_exec_num(); if (loop_order > last_loop_exec_order) { OPENVINO_ASSERT(direct_loop->is_loop_port(consumer), "Consumer of Buffer from another loop must be loop port"); port_info = direct_loop->get_loop_port_info(consumer); diff --git a/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp b/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp index 27f4713810bfe0..f873a782b088b5 100644 --- a/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp 
+++ b/src/common/snippets/src/lowered/pass/extract_loop_invariants.cpp @@ -24,8 +24,8 @@ std::vector get_reordered_loop_ids(const LoopManagerPtr& loop_manager) { loop_ids_need_extract.push_back(p.first); auto sorter = [&](size_t lhs, size_t rhs) { - const auto lhs_last_expr = loop_manager->get_loop_info(lhs)->get_output_ports().back().expr_port->get_expr(); - const auto rhs_last_expr = loop_manager->get_loop_info(rhs)->get_output_ports().back().expr_port->get_expr(); + const auto lhs_last_expr = loop_manager->get_loop_info(lhs)->get_output_ports().back().get_expr_port()->get_expr(); + const auto rhs_last_expr = loop_manager->get_loop_info(rhs)->get_output_ports().back().get_expr_port()->get_expr(); // If last output loop ports are the same expressions - first executive Loop has inner ID in expression loop IDs. if (lhs_last_expr == rhs_last_expr) { for (const auto& id : lhs_last_expr->get_loop_ids()) { @@ -50,9 +50,9 @@ void remove_last_loop_id(const std::shared_ptr& expr) { } int64_t get_stride_after_move_outer(const LoopPort& loop_port) { - const auto& expr_port = loop_port.expr_port; + const auto& expr_port = loop_port.get_expr_port(); const auto& shape = expr_port->get_descriptor_ptr()->get_shape(); - size_t shape_dim_idx = utils::get_dim_idx(*expr_port, loop_port.dim_idx); + size_t shape_dim_idx = utils::get_dim_idx(*expr_port, loop_port.get_dim_idx()); int64_t stride = utils::get_stride(shape_dim_idx, shape); if (utils::is_dynamic_value(stride) || utils::is_dynamic_value(shape[shape_dim_idx])) { return utils::get_dynamic_value(); @@ -89,7 +89,7 @@ bool is_extraction_applicable(const ExpressionPtr& expr, const UnifiedLoopInfoPt if (is_loop_port) { // stride is not 1 after move to outside, then should not extract. const auto& loop_port = inner_loop_info->get_loop_port(expr_input_ports[i]); - if (get_stride_after_move_outer(loop_port) != 1) { + if (!loop_port.is_processed() || get_stride_after_move_outer(loop_port) != 1) { return false; } } @@ -150,7 +150,7 @@ std::vector get_loop_input_exprs(const std::vector& loo std::vector input_exprs; std::unordered_set seen_exprs; for (size_t port_num = 0; port_num < loop_in_ports.size(); ++port_num) { - const auto& expr = loop_in_ports[port_num].expr_port->get_expr(); + const auto& expr = loop_in_ports[port_num].get_expr_port()->get_expr(); if (seen_exprs.count(expr) == 0) { input_exprs.push_back(expr); seen_exprs.insert(expr); diff --git a/src/common/snippets/src/lowered/pass/fuse_loops.cpp b/src/common/snippets/src/lowered/pass/fuse_loops.cpp index 3708896f5abf39..e2dc1451aa9de0 100644 --- a/src/common/snippets/src/lowered/pass/fuse_loops.cpp +++ b/src/common/snippets/src/lowered/pass/fuse_loops.cpp @@ -26,18 +26,18 @@ bool FuseLoops::loop_ports_are_compatible(const LoopInfoPtr& loop_upper, const LoopInfoPtr& loop_lower) { auto found_port = [](const std::vector& loop_ports, const ExpressionPort& target_port) { return std::find_if(loop_ports.cbegin(), loop_ports.cend(), - [&target_port](const LoopPort& loop_port) {return *(loop_port.expr_port.get()) == target_port; }); + [&target_port](const LoopPort& loop_port) {return *(loop_port.get_expr_port().get()) == target_port; }); }; const auto& upper_exit_ports = loop_upper->get_output_ports(); const auto& lower_entry_ports = loop_lower->get_input_ports(); for (const auto& lower_entry_port : lower_entry_ports) { - const auto& src_port = lower_entry_port.expr_port->get_port_connector_ptr()->get_source(); + const auto& src_port = lower_entry_port.get_expr_port()->get_port_connector_ptr()->get_source(); 
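Several of the updated passes (e.g. extract_loop_invariants above) follow one pattern: a dimension index is only defined for processed ports, so any dim-based logic is now guarded by `is_processed()`. A minimal sketch of that guard, assuming `loop_info` is a UnifiedLoopInfo pointer obtained from the LoopManager:

    for (const auto& port : loop_info->get_input_ports()) {
        if (!port.is_processed())
            continue;  // NotProcessed ports have no valid dim_idx; get_dim_idx() would assert
        const size_t dim_idx = port.get_dim_idx();
        const auto& expr_port = port.get_expr_port();
        // ... dimension-dependent handling of *expr_port using dim_idx ...
    }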
const auto upper_exit_port_it = found_port(upper_exit_ports, src_port); if (upper_exit_port_it != upper_exit_ports.cend()) { const auto& upper_exit_port = *upper_exit_port_it; - if (!lower_entry_port.is_incremented || !upper_exit_port.is_incremented) + if (!lower_entry_port.is_incremented() || !upper_exit_port.is_incremented()) return false; - if (lower_entry_port.dim_idx != upper_exit_port.dim_idx) + if (lower_entry_port.get_dim_idx() != upper_exit_port.get_dim_idx()) return false; } } @@ -113,7 +113,7 @@ bool FuseLoops::fuse_upper_into_current(LinearIR& linear_ir, const LoopManagerPt bool is_fusion_allowed = true; for (size_t i = 0; i < loop_target->get_output_ports().size() && is_fusion_allowed; ++i) { const auto target_output_port = loop_target->get_output_ports()[i]; - const auto consumer_inputs = target_output_port.expr_port->get_connected_ports(); + const auto consumer_inputs = target_output_port.get_expr_port()->get_connected_ports(); for (const auto& consumer_input : consumer_inputs) { const auto& consumer = consumer_input.get_expr(); if (ov::is_type(consumer->get_node()) || consumer == current_input_port->get_expr()) @@ -157,7 +157,7 @@ bool FuseLoops::fuse_lower_into_current(LinearIR& linear_ir, const LoopManagerPt bool is_fusion_allowed = true; for (size_t i = 0; i < loop_target->get_input_ports().size() && is_fusion_allowed; ++i) { const auto target_entry_port = loop_target->get_input_ports()[i]; - const auto parent_expr_output = *target_entry_port.expr_port->get_connected_ports().begin(); + const auto parent_expr_output = *target_entry_port.get_expr_port()->get_connected_ports().begin(); const auto& parent_expr = parent_expr_output.get_expr(); if (ov::is_type(parent_expr->get_node()) || parent_expr == current_output_port->get_expr()) continue; @@ -221,7 +221,7 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l bool was_fusion_up = false; for (size_t in_port = 0; !was_fusion_up && in_port < input_ports.size(); ++in_port) { const auto& input_port = input_ports[in_port]; - const auto parent_expr_output = *input_port.expr_port->get_connected_ports().begin(); + const auto parent_expr_output = *input_port.get_expr_port()->get_connected_ports().begin(); const auto& parent_expr = parent_expr_output.get_expr(); const auto parent = parent_expr->get_node(); if (ov::is_type(parent) || @@ -247,7 +247,7 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l const auto upper_loop_id = upper_loop_ids[loop_idx]; OPENVINO_ASSERT(current_loop_id != upper_loop_id, "Loops cannot have parents of input ports with the same identifier (", upper_loop_id, ")"); - if (fuse_upper_into_current(linear_ir, loop_manager, input_port.expr_port, current_loop_id, upper_loop_id, + if (fuse_upper_into_current(linear_ir, loop_manager, input_port.get_expr_port(), current_loop_id, upper_loop_id, current_loop_begin_pos, current_loop_end_pos)) { was_fusion_up = true; prev_fused_loops.insert(current_loop_id); @@ -266,7 +266,7 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l const auto& output_ports = current_loop_info->get_output_ports(); for (size_t out_port = 0; !was_fusion_down && out_port < output_ports.size(); ++out_port) { const auto& output_port = output_ports[out_port]; - const auto consumer_exprs_inputs = output_port.expr_port->get_connected_ports(); + const auto consumer_exprs_inputs = output_port.get_expr_port()->get_connected_ports(); for (const auto& consumer_expr_input : consumer_exprs_inputs) { const auto& 
consumer_expr = consumer_expr_input.get_expr(); const auto consumer = consumer_expr->get_node(); @@ -294,7 +294,7 @@ bool FuseLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, l if (current_loop_id == lower_loop_id) continue; - if (fuse_lower_into_current(linear_ir, loop_manager, output_port.expr_port, current_loop_id, lower_loop_id, + if (fuse_lower_into_current(linear_ir, loop_manager, output_port.get_expr_port(), current_loop_id, lower_loop_id, current_loop_begin_pos, current_loop_end_pos)) { was_fusion_down = true; prev_fused_loops.insert(current_loop_id); diff --git a/src/common/snippets/src/lowered/pass/init_loops.cpp b/src/common/snippets/src/lowered/pass/init_loops.cpp index 69d336094f1a14..07143ec8b59e6d 100644 --- a/src/common/snippets/src/lowered/pass/init_loops.cpp +++ b/src/common/snippets/src/lowered/pass/init_loops.cpp @@ -17,14 +17,14 @@ namespace pass { namespace { inline void init_is_incremented(LoopPort& port) { - const auto& expr = port.expr_port->get_expr(); + const auto& expr = port.get_expr_port()->get_expr(); if (!std::dynamic_pointer_cast(expr->get_node())) { - port.is_incremented = false; + port.convert_to_type(); } } inline int64_t get_data_size(const LoopPort& loop_port) { - const auto& expr_port = loop_port.expr_port; + const auto& expr_port = loop_port.get_expr_port(); if (expr_port->get_type() == ExpressionPort::Input) { return static_cast(expr_port->get_expr()->get_node()->get_input_element_type(expr_port->get_index()).size()); } else if (expr_port->get_type() == ExpressionPort::Output) { diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index fabb6573ab3b14..1014517a47c14f 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -72,12 +72,11 @@ void InsertBuffers::insertion(LinearIR& linear_ir, const LinearIR::constExprIt& begin_it, const LinearIR::constExprIt& end_it, const LoopManagerPtr& loop_manager, - const std::vector& loop_entries, - const std::vector& loop_exits) const { - for (const auto& input_port : loop_entries) { - const auto& entry_port = input_port.expr_port; - const auto& expr = entry_port->get_expr(); - const auto port_idx = entry_port->get_index(); + const std::vector& loop_entries, + const std::vector& loop_exits) const { + for (const auto& entry_port : loop_entries) { + const auto& expr = entry_port.get_expr(); + const auto port_idx = entry_port.get_index(); const auto node = expr->get_node(); auto parent_expr_output = expr->get_input_port_connector(port_idx)->get_source(); auto parent_expr = parent_expr_output.get_expr(); @@ -116,17 +115,16 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // Need to insert between 2nd and 4th Loops - after 2nd Loop const auto pos = insertion_position(linear_ir, loop_manager, parent_expr, expr); const auto buffer = std::make_shared(parent->output(parent_port)); - const auto buffer_consumer = has_shape_infer_parent ? top_shape_infer_expr->get_input_port(0) : *entry_port; + const auto buffer_consumer = has_shape_infer_parent ? 
top_shape_infer_expr->get_input_port(0) : entry_port; linear_ir.insert_node(buffer, std::vector{ parent_expr_output }, buffer_loop_ids, false, pos, { buffer_consumer }); } } - for (const auto& output_port : loop_exits) { - const auto& exit_port = output_port.expr_port; - const auto& expr = exit_port->get_expr(); - const auto port_idx = exit_port->get_index(); + for (const auto& exit_port : loop_exits) { + const auto& expr = exit_port.get_expr(); + const auto port_idx = exit_port.get_index(); const auto node = expr->get_node(); - const auto output_connector = exit_port->get_port_connector_ptr(); + const auto output_connector = exit_port.get_port_connector_ptr(); const auto child_exprs_inputs = output_connector->get_consumers(); const auto& current_loops = expr->get_loop_ids(); @@ -200,7 +198,7 @@ void InsertBuffers::insertion(LinearIR& linear_ir, // | <- It should be new PortConnector // Relu // Output port connector is automatically filled from PortDescriptor - linear_ir.insert_node(buffer, std::vector{ *exit_port }, buffer_loop_ids, false, pos, { potential_consumers }); + linear_ir.insert_node(buffer, std::vector{ exit_port }, buffer_loop_ids, false, pos, { potential_consumers }); } } } @@ -213,8 +211,15 @@ bool InsertBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begi const auto loop_info = loop_data.second; const auto loop_entries = loop_info->get_input_ports(); const auto loop_exits = loop_info->get_output_ports(); + + auto cvt_to_expr_ports = [](const std::vector& loop_ports) { + std::vector expr_ports(loop_ports.size()); + std::transform(loop_ports.cbegin(), loop_ports.cend(), expr_ports.begin(), + [](const LoopPort& loop_port) { return *loop_port.get_expr_port(); }); + return expr_ports; + }; // using begin() as expr_it because we work with LoopInfo, not expressions in Linear IR - insertion(linear_ir, begin, end, loop_manager, loop_entries, loop_exits); + insertion(linear_ir, begin, end, loop_manager, cvt_to_expr_ports(loop_entries), cvt_to_expr_ports(loop_exits)); } for (auto expr_it = begin; expr_it != end; expr_it++) { @@ -226,7 +231,7 @@ bool InsertBuffers::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begi const auto input_ports = ma->get_memory_access_input_ports(); const auto output_ports = ma->get_memory_access_output_ports(); - std::vector loop_entries(input_ports.size()), loop_exits(output_ports.size()); + std::vector loop_entries(input_ports.size()), loop_exits(output_ports.size()); for (const auto& p : input_ports) { loop_entries[p.first] = expr->get_input_port(p.first); } diff --git a/src/common/snippets/src/lowered/pass/insert_loops.cpp b/src/common/snippets/src/lowered/pass/insert_loops.cpp index b4d06c1669c043..d663017d39efe4 100644 --- a/src/common/snippets/src/lowered/pass/insert_loops.cpp +++ b/src/common/snippets/src/lowered/pass/insert_loops.cpp @@ -25,7 +25,7 @@ void InsertLoops::insertion(LinearIR& linear_ir, const LoopManagerPtr& loop_mana std::vector loop_end_inputs; loop_end_inputs.reserve(in_num + out_num); loop_info->iterate_through_ports([&loop_end_inputs](const LoopPort& port) { - loop_end_inputs.push_back(port.expr_port->get_port_connector_ptr()); + loop_end_inputs.push_back(port.get_expr_port()->get_port_connector_ptr()); }); const auto is_incremented = loop_info->get_is_incremented(); diff --git a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp index 9ca5181883af36..570e878fd87954 100644 --- 
a/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp +++ b/src/common/snippets/src/lowered/pass/insert_specific_iterations.cpp @@ -131,7 +131,7 @@ LoopManager::LoopBounds InsertSpecificIterations::insert_copy_loop(LinearIR& lin new_ports.resize(ports.size()); for (size_t i = 0; i < ports.size(); ++i) { const auto& port = ports[i]; - new_ports[i] = *port.clone_with_new_expr(expression_map[port.expr_port->get_expr().get()]); + new_ports[i] = *port.clone_with_new_expr(expression_map[port.get_expr_port()->get_expr().get()]); } }; const auto original_loop_info = loop_manager->get_loop_info(loop_id); diff --git a/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp index 2f57d6422cf11d..c75d1e86abbfa5 100644 --- a/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp +++ b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp @@ -105,7 +105,7 @@ std::unordered_set MHAParallelWAOptimizer::find_applicab return false; bool loop_by_m = true; outermost_loop->iterate_through_ports([&loop_by_m](const lowered::LoopPort& port) { - if (port.is_incremented && port.dim_idx != m_dim_M_idx) + if (port.is_processed() && port.get_dim_idx() != m_dim_M_idx) loop_by_m = false; }); return loop_by_m; diff --git a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp index 53fb344f4d9e8a..f53f23be6e1fe9 100644 --- a/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp +++ b/src/common/snippets/src/lowered/pass/propagate_subtensors.cpp @@ -56,42 +56,40 @@ void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir, // First step: set new dim value to the corresponding input_ports' dimensions if (most_outer_loop) { for (const auto& port : loop_info->get_input_ports()) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - const auto& expr = port.expr_port->get_expr(); - const auto& desc = port.expr_port->get_descriptor_ptr(); + if (port.is_processed()) { + const auto& expr = port.get_expr_port()->get_expr(); + const auto& desc = port.get_expr_port()->get_descriptor_ptr(); auto subtensor = desc->get_subtensor(); - if (port.dim_idx < desc->get_subtensor().size()) { - desc->set_subtensor_dim(port.dim_idx, new_dim_value); + if (port.get_dim_idx() < desc->get_subtensor().size()) { + desc->set_subtensor_dim(port.get_dim_idx(), new_dim_value); } - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + const auto parent_desc = expr->get_input_port_connector(port.get_expr_port()->get_index())->get_source().get_descriptor_ptr(); const auto& parent_shape = parent_desc->get_shape(); if (original_shapes.find(parent_desc) == original_shapes.end()) { original_shapes[parent_desc] = parent_shape; } auto new_shape = parent_shape; - new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = new_dim_value; + new_shape[*(desc->get_layout().rbegin() + port.get_dim_idx())] = new_dim_value; parent_desc->set_shape(new_shape); } } } auto update_only_dim_idx_with_subtensor_value = [&](const LoopPort& port) { - const auto& reg_type = port.expr_port->get_descriptor_ptr()->get_reg().type; - if ((port.is_incremented && reg_type == RegType::gpr) || (reg_type == RegType::vec)) { - const auto desc = port.expr_port->get_descriptor_ptr(); - const auto expr = 
port.expr_port->get_expr(); - const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr(); + if (port.is_processed()) { + const auto desc = port.get_expr_port()->get_descriptor_ptr(); + const auto expr = port.get_expr_port()->get_expr(); + const auto parent_desc = expr->get_input_port_connector(port.get_expr_port()->get_index())->get_source().get_descriptor_ptr(); const auto& parent_shape = parent_desc->get_shape(); const auto& desc_subtensor = desc->get_subtensor(); - if (port.dim_idx < desc_subtensor.size()) { + if (port.get_dim_idx() < desc_subtensor.size()) { if (original_shapes.find(parent_desc) == original_shapes.end()) { original_shapes[parent_desc] = parent_shape; } auto new_shape = parent_shape; - new_shape[*(desc->get_layout().rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx); + new_shape[*(desc->get_layout().rbegin() + port.get_dim_idx())] = *(desc_subtensor.rbegin() + port.get_dim_idx()); parent_desc->set_shape(new_shape); } } diff --git a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp index 46248b9c277818..a8144bce9a1d80 100644 --- a/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp +++ b/src/common/snippets/src/lowered/pass/set_buffer_reg_group.cpp @@ -30,13 +30,15 @@ size_t SetBufferRegGroup::get_buffer_idx(const BufferExpressionPtr& target, cons bool SetBufferRegGroup::can_be_in_one_reg_group(const UnifiedLoopInfo::LoopPortInfo& lhs_info, const UnifiedLoopInfo::LoopPortInfo& rhs_info) { const auto equal_element_type_sizes = lhs_info.desc.data_size == rhs_info.desc.data_size; - OPENVINO_ASSERT(lhs_info.port.expr_port && rhs_info.port.expr_port, "Expression ports are nullptr!"); + OPENVINO_ASSERT(lhs_info.port.get_expr_port() && rhs_info.port.get_expr_port(), "Expression ports are nullptr!"); const auto equal_invariant_shape_paths = - MarkInvariantShapePath::getInvariantPortShapePath(*lhs_info.port.expr_port) == - MarkInvariantShapePath::getInvariantPortShapePath(*rhs_info.port.expr_port); - const auto equal_is_incremented = lhs_info.port.is_incremented == rhs_info.port.is_incremented; + MarkInvariantShapePath::getInvariantPortShapePath(*lhs_info.port.get_expr_port()) == + MarkInvariantShapePath::getInvariantPortShapePath(*rhs_info.port.get_expr_port()); + const auto lhs_is_incremented = lhs_info.port.is_incremented(); + const auto rhs_is_incremented = rhs_info.port.is_incremented(); + const auto equal_is_incremented = lhs_is_incremented == rhs_is_incremented; return equal_invariant_shape_paths && equal_is_incremented && - (equal_element_type_sizes || !lhs_info.port.is_incremented || (lhs_info.desc.ptr_increment == 0 && lhs_info.desc.finalization_offset == 0)); + (equal_element_type_sizes || !lhs_is_incremented || (lhs_info.desc.ptr_increment == 0 && lhs_info.desc.finalization_offset == 0)); } bool SetBufferRegGroup::are_adjacent(const BufferMap::value_type& lhs, const BufferMap::value_type& rhs) { @@ -113,7 +115,7 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const const auto& loop_inputs = loop_info->get_input_ports_info(); for (const auto& port_info : loop_inputs) { - const auto& parent_output = port_info.port.expr_port->get_port_connector_ptr()->get_source().get_expr(); + const auto& parent_output = port_info.port.get_expr_port()->get_port_connector_ptr()->get_source().get_expr(); if (const auto buffer_expr = ov::as_type_ptr(parent_output)) { if 
(buffer_neighbours.count(buffer_expr) > 0) { const auto& port_desc = port_info.desc; @@ -127,7 +129,7 @@ SetBufferRegGroup::BufferMap SetBufferRegGroup::get_buffer_loop_neighbours(const const auto& loop_outputs = loop_info->get_output_ports_info(); for (const auto& port_info : loop_outputs) { - const auto& consumer_inputs = port_info.port.expr_port->get_port_connector_ptr()->get_consumers(); + const auto& consumer_inputs = port_info.port.get_expr_port()->get_port_connector_ptr()->get_consumers(); for (const auto& consumer_input : consumer_inputs) { const auto& child_expr = consumer_input.get_expr(); if (const auto buffer_expr = ov::as_type_ptr(child_expr)) diff --git a/src/common/snippets/src/lowered/pass/split_loops.cpp b/src/common/snippets/src/lowered/pass/split_loops.cpp index 144159acca8d62..a663cc1ed564f5 100644 --- a/src/common/snippets/src/lowered/pass/split_loops.cpp +++ b/src/common/snippets/src/lowered/pass/split_loops.cpp @@ -23,7 +23,7 @@ bool SplitLoops::can_be_split(const UnifiedLoopInfoPtr& loop_to_split, const Uni const auto current_dim_idx = loop_to_split->get_dim_idx(); const auto parent_dim_idx = loop_to_fuse->get_dim_idx(); const auto& handlers = loop_to_split->get_handlers(); - const bool equal_dim_idxes = current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; + const bool equal_dim_idxes = current_dim_idx != LoopPort::UNDEFINED_DIM_IDX && current_dim_idx == parent_dim_idx; const bool only_main_body = handlers.get_passes().empty() && handlers.get_passes().empty(); return loop_to_split->get_work_amount() == loop_to_fuse->get_work_amount() && @@ -47,7 +47,7 @@ bool SplitLoops::run(LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, const auto& loop_id = loop_ids.front(); const auto loop = loop_manager->get_loop_info(loop_id); for (const auto& input_port : loop->get_input_ports()) { - const auto& parent_port = input_port.expr_port->get_port_connector_ptr()->get_source(); + const auto& parent_port = input_port.get_expr_port()->get_port_connector_ptr()->get_source(); const auto& parent_expr = parent_port.get_expr(); const auto& parent_loop_ids = parent_expr->get_loop_ids(); if (parent_loop_ids.empty()) @@ -141,7 +141,7 @@ bool SplitLoops::TransformInnerSplitLoop::run(LinearIR& linear_ir, LinearIR::con const auto& loop_manager = linear_ir.get_loop_manager(); const auto& outer_loop_info = loop_manager->get_loop_info(loop_end->get_id()); const auto current_dim_idx = outer_loop_info->get_dim_idx(); - OPENVINO_ASSERT(current_dim_idx != LoopInfo::UNDEFINED_DIM_IDX, + OPENVINO_ASSERT(current_dim_idx != LoopPort::UNDEFINED_DIM_IDX, "Outer splitted loop unexpectedly iterates by several dimension indices"); bool modified = false; diff --git a/src/common/snippets/src/lowered/pass/validate.cpp b/src/common/snippets/src/lowered/pass/validate.cpp index 2e9e5813c03264..e76f994d1284e1 100644 --- a/src/common/snippets/src/lowered/pass/validate.cpp +++ b/src/common/snippets/src/lowered/pass/validate.cpp @@ -117,7 +117,7 @@ void validate_loop_end(const ExpressionPtr& expr, const LinearIR& linear_ir) { const auto& final_offsets = loop_end->get_finalization_offsets(); auto validate_loop_ports = [&](const std::vector& loop_port_infos, size_t shift = 0) { for (size_t i = 0; i < loop_port_infos.size(); ++i) { - OPENVINO_ASSERT(is_incremented[i + shift] == loop_port_infos[i].port.is_incremented && + OPENVINO_ASSERT(is_incremented[i + shift] == loop_port_infos[i].port.is_incremented() && ptr_increments[i + shift] == loop_port_infos[i].desc.ptr_increment && 
final_offsets[i + shift] == loop_port_infos[i].desc.finalization_offset, "Incompatible data ptr shifts in LoopEnd and the corresponding LoopInfo"); diff --git a/src/common/snippets/src/lowered/pass/validate_unified_loops.cpp b/src/common/snippets/src/lowered/pass/validate_unified_loops.cpp index ec43f02d28792f..e127aaea0c11d3 100644 --- a/src/common/snippets/src/lowered/pass/validate_unified_loops.cpp +++ b/src/common/snippets/src/lowered/pass/validate_unified_loops.cpp @@ -32,7 +32,7 @@ void ValidateUnifiedLoops::validate_loop_infos(const LoopManagerPtr& loop_manage std::vector dim_indexes; auto validate_loop_port = [&loop_manager, &dim_indexes, &validated_nested_loops, &is_already_verified](const LoopPort& loop_port) { - const auto expr = loop_port.expr_port->get_expr(); + const auto expr = loop_port.get_expr_port()->get_expr(); const auto& loop_ids = expr->get_loop_ids(); // If loop_ids of the current port is subsequence of already validated IDs, skip if (is_already_verified(loop_ids)) @@ -45,7 +45,7 @@ void ValidateUnifiedLoops::validate_loop_infos(const LoopManagerPtr& loop_manage const auto id = loop_ids[i]; const auto dim_idx = loop_manager->get_loop_info(id)->get_dim_idx(); // if the loop has different dimension indexes, it don't have to meet the split loop related requirements - if (dim_idx == LoopInfo::UNDEFINED_DIM_IDX) + if (dim_idx == LoopPort::UNDEFINED_DIM_IDX) continue; if (i > 0) { if (std::find(dim_indexes.cbegin(), dim_indexes.cend(), dim_idx) != dim_indexes.cend()) { @@ -65,14 +65,14 @@ void ValidateUnifiedLoops::validate_loop_infos(const LoopManagerPtr& loop_manage OPENVINO_ASSERT(loop_info, "ValidateUnifiedLoops expects only UnifiedLoopInfo in LoopManager"); loop_info->iterate_through_ports(validate_loop_port); - // Validate that iteration dimnsion is broadcastable + // Validate that iteration dimension is broadcastable std::set unique_dimensions; loop_info->iterate_through_ports([&unique_dimensions](const LoopPort& loop_port) { - if (loop_port.is_incremented) { - const auto is_input = loop_port.expr_port->get_type() == ExpressionPort::Input; - const auto planar_shape = is_input ? ov::snippets::utils::get_planar_vdims(*loop_port.expr_port) - : ov::snippets::utils::get_preordered_vdims(*loop_port.expr_port); - const auto& dim = *(planar_shape.rbegin() + loop_port.dim_idx); + if (loop_port.is_processed()) { + const auto is_input = loop_port.get_expr_port()->get_type() == ExpressionPort::Input; + const auto planar_shape = is_input ? 
ov::snippets::utils::get_planar_vdims(*loop_port.get_expr_port()) + : ov::snippets::utils::get_preordered_vdims(*loop_port.get_expr_port()); + const auto& dim = *(planar_shape.rbegin() + loop_port.get_dim_idx()); // Since dim == 1 can be broadcasted to any value, it's not necessary to add it to unique dims if (!utils::is_dynamic_value(dim) && dim != 1) unique_dimensions.insert(dim); diff --git a/src/common/snippets/src/utils/loop_utils.cpp b/src/common/snippets/src/utils/loop_utils.cpp index 3d6b274c7613a8..4683006d49432b 100644 --- a/src/common/snippets/src/utils/loop_utils.cpp +++ b/src/common/snippets/src/utils/loop_utils.cpp @@ -13,17 +13,17 @@ namespace utils { using namespace ov::snippets::lowered; namespace { inline int64_t get_ptr_increment(const LoopPort& loop_port, size_t work_amount, size_t port_count) { - if (!loop_port.is_incremented) + if (!loop_port.is_incremented()) return 0; - const auto& expr_port = loop_port.expr_port; + const auto& expr_port = loop_port.get_expr_port(); const auto& layout = expr_port->get_descriptor_ptr()->get_layout(); const auto& shape = expr_port->get_descriptor_ptr()->get_shape(); size_t dim = 0; if (expr_port->get_type() == ExpressionPort::Input) { - dim = get_input_dim_idx(layout, loop_port.dim_idx); + dim = get_input_dim_idx(layout, loop_port.get_dim_idx()); } else if (expr_port->get_type() == ExpressionPort::Output) { - dim = get_output_dim_idx(layout, loop_port.dim_idx); + dim = get_output_dim_idx(layout, loop_port.get_dim_idx()); } else { OPENVINO_THROW("Unsupported expression port type!"); } @@ -47,12 +47,12 @@ inline int64_t get_finalization_offset(size_t work_amount, int64_t ptr_increment inline void init_work_amount(const LoopInfoPtr& loop_info) { size_t work_amount = 1; loop_info->iterate_through_ports([&work_amount](const LoopPort& loop_port) { - if (loop_port.is_incremented) { - const auto& desc = loop_port.expr_port->get_descriptor_ptr(); + if (loop_port.is_processed()) { + const auto& desc = loop_port.get_expr_port()->get_descriptor_ptr(); const auto& shape = desc->get_shape(); const auto& layout = desc->get_layout(); - const auto is_input = loop_port.expr_port->get_type() == ExpressionPort::Input; - const auto dim_idx = is_input ? get_input_dim_idx(layout, loop_port.dim_idx) : get_output_dim_idx(layout, loop_port.dim_idx); + const auto is_input = loop_port.get_expr_port()->get_type() == ExpressionPort::Input; + const auto dim_idx = is_input ? get_input_dim_idx(layout, loop_port.get_dim_idx()) : get_output_dim_idx(layout, loop_port.get_dim_idx()); OPENVINO_ASSERT(broadcast_merge_dim(work_amount, work_amount, shape[dim_idx]), "Failed to broadcast work_amount"); } @@ -69,7 +69,7 @@ void update_data_pointer_shifts(const UnifiedLoopInfoPtr& loop_info) { auto update_shifts = [&work_amount, &input_count, &output_count](LoopPort& loop_port, UnifiedLoopInfo::LoopPortDesc& ptr_shifts_params) { ptr_shifts_params.ptr_increment = get_ptr_increment(loop_port, work_amount, - loop_port.expr_port->get_type() == ExpressionPort::Input ? input_count : output_count); + loop_port.get_expr_port()->get_type() == ExpressionPort::Input ? 
input_count : output_count); ptr_shifts_params.finalization_offset = get_finalization_offset(work_amount, ptr_shifts_params.ptr_increment); }; loop_info->iterate_through_infos(update_shifts); diff --git a/src/common/snippets/tests/src/lir_comparator.cpp b/src/common/snippets/tests/src/lir_comparator.cpp index 82d5b9dcf91441..b547638eb798a3 100644 --- a/src/common/snippets/tests/src/lir_comparator.cpp +++ b/src/common/snippets/tests/src/lir_comparator.cpp @@ -31,6 +31,12 @@ inline string to_string(const SpecificLoopIterType& type) { ss << type; return ss.str(); } + +inline string to_string(const LoopPort::Type& type) { + stringstream ss; + ss << type; + return ss.str(); +} } // namespace std namespace ov { @@ -175,9 +181,10 @@ LIRComparator::Result LIRComparator::compare_loop_ports(const std::vectorget_loop_manager()->mark_loop(begin, end, 512, vector_size, - std::vector{LoopPort((*multiply.first)->get_input_port(0)), - LoopPort((*multiply.first)->get_input_port(1)), - LoopPort((*sub.first)->get_input_port(0))}, - std::vector{LoopPort((*sub.first)->get_output_port(0))}); + std::vector{LoopPort::create((*multiply.first)->get_input_port(0)), + LoopPort::create((*multiply.first)->get_input_port(1)), + LoopPort::create((*sub.first)->get_input_port(0))}, + std::vector{LoopPort::create((*sub.first)->get_output_port(0))}); linear_ir->set_loop_depth(1); } { @@ -82,9 +83,9 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithParams) { auto begin = sub.first; auto end = result.first; linear_ir_ref->get_loop_manager()->mark_loop(begin, end, 512, vector_size, - std::vector{LoopPort((*sub.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(1))}, - std::vector{LoopPort((*sub.first)->get_output_port(0))}); + std::vector{LoopPort::create((*sub.first)->get_input_port(0)), + LoopPort::create((*sub.first)->get_input_port(1))}, + std::vector{LoopPort::create((*sub.first)->get_output_port(0))}); } } @@ -121,9 +122,9 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithScalar) { auto begin = scalar.first; auto end = result.first; linear_ir->get_loop_manager()->mark_loop(begin, end, 512, vector_size, - std::vector{LoopPort((*multiply.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(0))}, - std::vector{LoopPort((*sub.first)->get_output_port(0))}); + std::vector{LoopPort::create((*multiply.first)->get_input_port(0)), + LoopPort::create((*sub.first)->get_input_port(0))}, + std::vector{LoopPort::create((*sub.first)->get_output_port(0))}); linear_ir->set_loop_depth(1); } { @@ -139,9 +140,9 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsWithScalar) { auto begin = sub.first; auto end = result.first; linear_ir_ref->get_loop_manager()->mark_loop(begin, end, 512, vector_size, - std::vector{LoopPort((*sub.first)->get_input_port(0)), - LoopPort((*sub.first)->get_input_port(1))}, - std::vector{LoopPort((*sub.first)->get_output_port(0))}); + std::vector{LoopPort::create((*sub.first)->get_input_port(0)), + LoopPort::create((*sub.first)->get_input_port(1))}, + std::vector{LoopPort::create((*sub.first)->get_output_port(0))}); } } @@ -184,19 +185,19 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsOutputLoopUpdateNotNeed auto begin = multiply.first; auto end = result1.first; linear_ir->get_loop_manager()->mark_loop(begin, end, 16, vector_size, - std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 0), - LoopPort((*multiply.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(0), true, 0), - 
LoopPort((*sub.first)->get_input_port(0), true, 0)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 0), - LoopPort((*sub.first)->get_output_port(0), true, 0)}); + std::vector{LoopPort::create((*multiply.first)->get_input_port(0)), + LoopPort::create((*multiply.first)->get_input_port(1)), + LoopPort::create((*add.first)->get_input_port(0)), + LoopPort::create((*sub.first)->get_input_port(0))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0)), + LoopPort::create((*sub.first)->get_output_port(0))}); linear_ir->get_loop_manager()->mark_loop(begin, end, 3, 1, - std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 1), - LoopPort((*multiply.first)->get_input_port(1), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*sub.first)->get_input_port(0), true, 1)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 1), - LoopPort((*sub.first)->get_output_port(0), true, 1)}); + std::vector{LoopPort::create((*multiply.first)->get_input_port(0), 1), + LoopPort::create((*multiply.first)->get_input_port(1), 1), + LoopPort::create((*add.first)->get_input_port(0), 1), + LoopPort::create((*sub.first)->get_input_port(0), 1)}, + std::vector{LoopPort::create((*add.first)->get_output_port(0), 1), + LoopPort::create((*sub.first)->get_output_port(0), 1)}); linear_ir->set_loop_depth(2); } { @@ -214,21 +215,25 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsOutputLoopUpdateNotNeed auto result1 = linear_ir_ref->push_node(sub.second); auto begin_inner = add.first; auto end_inner = result1.first; - linear_ir_ref->get_loop_manager()->mark_loop(begin_inner, end_inner, 16, vector_size, - std::vector{LoopPort((*add.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0), - LoopPort((*sub.first)->get_input_port(0), true, 0)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 0), - LoopPort((*sub.first)->get_output_port(0), true, 0)}); - auto begin_outer = multiply.first; - auto end_outer = result1.first; - linear_ir_ref->get_loop_manager()->mark_loop(begin_outer, end_outer, 3, 1, - std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 1), - LoopPort((*multiply.first)->get_input_port(1), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*sub.first)->get_input_port(0), true, 1)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 1), - LoopPort((*sub.first)->get_output_port(0), true, 1)}); + { + const auto entry_ports = std::vector{LoopPort::create((*add.first)->get_input_port(0), 0), + LoopPort::create((*add.first)->get_input_port(1), 0), + LoopPort::create((*sub.first)->get_input_port(0), 0)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 0), + LoopPort::create((*sub.first)->get_output_port(0), 0)}; + linear_ir_ref->get_loop_manager()->mark_loop(begin_inner, end_inner, 16, vector_size, entry_ports, exit_ports); + } + { + auto begin_outer = multiply.first; + auto end_outer = result1.first; + const auto entry_ports = std::vector{LoopPort::create((*multiply.first)->get_input_port(0), 1), + LoopPort::create((*multiply.first)->get_input_port(1), 1), + LoopPort::create((*add.first)->get_input_port(0), 1), + LoopPort::create((*sub.first)->get_input_port(0), 1)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 1), + LoopPort::create((*sub.first)->get_output_port(0), 1)}; + linear_ir_ref->get_loop_manager()->mark_loop(begin_outer, end_outer, 3, 1, entry_ports, 
exit_ports); + } } } @@ -259,14 +264,20 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsFromInnermostToLoopOuts auto add = linear_ir->push_node(param_0.second, broadcastmove.second); init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); auto result = linear_ir->push_node(add.second); - linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 512, vector_size, - std::vector{LoopPort((*broadcastmove.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(0), true, 0)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 0)}); - linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 3, 1, - std::vector{LoopPort((*broadcastmove.first)->get_input_port(0), true, 1), - LoopPort((*add.first)->get_input_port(0), true, 1)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 1)}); + + { + const auto entry_ports = std::vector{LoopPort::create((*broadcastmove.first)->get_input_port(0), 0), + LoopPort::create((*add.first)->get_input_port(0), 0)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 0)}; + linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 512, vector_size, entry_ports, exit_ports); + } + { + const auto entry_ports = std::vector{LoopPort::create((*broadcastmove.first)->get_input_port(0), 1), + LoopPort::create((*add.first)->get_input_port(0), 1)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 1)}; + linear_ir->get_loop_manager()->mark_loop(broadcastmove.first, result.first, 3, 1, entry_ports, exit_ports); + } + linear_ir->set_loop_depth(2); } { @@ -277,14 +288,19 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsFromInnermostToLoopOuts auto add = linear_ir_ref->push_node(param_0.second, broadcastmove.second); init_expr_descriptors(*add.first, {subtensor, subtensor, subtensor}, {layout, layout, layout}); auto result = linear_ir_ref->push_node(add.second); - linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 512, vector_size, - std::vector{LoopPort((*add.first)->get_input_port(0), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 0)}); - linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 3, 1, - std::vector{LoopPort((*add.first)->get_input_port(0), true, 1), - LoopPort((*add.first)->get_input_port(1), true, 1)}, - std::vector{LoopPort((*add.first)->get_output_port(0), true, 1)}); + + { + const auto entry_ports = std::vector{LoopPort::create((*add.first)->get_input_port(0), 0), + LoopPort::create((*add.first)->get_input_port(1), 0)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 0)}; + linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 512, vector_size, entry_ports, exit_ports); + } + { + const auto entry_ports = std::vector{LoopPort::create((*add.first)->get_input_port(0), 1), + LoopPort::create((*add.first)->get_input_port(1), 1)}; + const auto exit_ports = std::vector{LoopPort::create((*add.first)->get_output_port(0), 1)}; + linear_ir_ref->get_loop_manager()->mark_loop(add.first, result.first, 3, 1, entry_ports, exit_ports); + } } } @@ -312,12 +328,18 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsImpossible) { init_expr_descriptors(*load_reshape.first, {subtensor, subtensor}, {order, layout}); init_expr_descriptors(*store.first, {subtensor, subtensor}, 
{layout, layout}); auto result = linear_ir->push_node(store.second); - linear_ir->get_loop_manager()->mark_loop(load_reshape.first, result.first, 32, 1, - std::vector{LoopPort((*load_reshape.first)->get_input_port(0), true, 0)}, - std::vector{LoopPort((*store.first)->get_output_port(0), true, 0)}); - linear_ir->get_loop_manager()->mark_loop(load_reshape.first, result.first, 1, 1, - std::vector{LoopPort((*load_reshape.first)->get_input_port(0), true, 1)}, - std::vector{LoopPort((*store.first)->get_output_port(0), true, 1)}); + + { + const auto entry_ports = std::vector{LoopPort::create((*load_reshape.first)->get_input_port(0), 0)}; + const auto exit_ports = std::vector{LoopPort::create((*store.first)->get_output_port(0), 0)}; + linear_ir->get_loop_manager()->mark_loop(load_reshape.first, result.first, 32, 1, entry_ports, exit_ports); + } + { + const auto entry_ports = std::vector{LoopPort::create((*load_reshape.first)->get_input_port(0), 1)}; + const auto exit_ports = std::vector{LoopPort::create((*store.first)->get_output_port(0), 1)}; + linear_ir->get_loop_manager()->mark_loop(load_reshape.first, result.first, 1, 1, entry_ports, exit_ports); + } + linear_ir->set_loop_depth(2); } } @@ -352,17 +374,17 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsSplitLoops) { const auto result = linear_ir->push_node(add.second); const auto& loop_manager = linear_ir->get_loop_manager(); loop_manager->mark_loop(matmul.first, broadcastmove.first, 128, block_size, 1, - std::vector{LoopPort((*matmul.first)->get_input_port(0)), - LoopPort((*matmul.first)->get_input_port(1), false)}, - std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + std::vector{LoopPort::create((*matmul.first)->get_input_port(0)), + LoopPort::create((*matmul.first)->get_input_port(1))}, + std::vector{LoopPort::create((*matmul.first)->get_output_port(0))}); loop_manager->mark_loop(broadcastmove.first, result.first, 64, vector_size, 0, - std::vector{LoopPort((*broadcastmove.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(0))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*broadcastmove.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(0))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); loop_manager->mark_loop(broadcastmove.first, result.first, 128, 1, 1, - std::vector{LoopPort((*broadcastmove.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(0))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*broadcastmove.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(0))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); ov::snippets::lowered::pass::SplitLoops().run(*linear_ir, linear_ir->begin(), linear_ir->end()); } { @@ -377,17 +399,17 @@ TEST_F(ExtractLoopInvariantsTest, ExtractedLoopInvariantsSplitLoops) { auto result = linear_ir_ref->push_node(add.second); const auto& loop_manager = linear_ir_ref->get_loop_manager(); loop_manager->mark_loop(matmul.first, add.first, 128, block_size, 1, - std::vector{LoopPort((*matmul.first)->get_input_port(0)), - LoopPort((*matmul.first)->get_input_port(1), false)}, - std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + std::vector{LoopPort::create((*matmul.first)->get_input_port(0)), + LoopPort::create((*matmul.first)->get_input_port(1))}, + std::vector{LoopPort::create((*matmul.first)->get_output_port(0))}); loop_manager->mark_loop(add.first, result.first, 64, 
vector_size, 0, - std::vector{LoopPort((*add.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(1))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*add.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(1))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); loop_manager->mark_loop(add.first, result.first, 128, 1, 1, - std::vector{LoopPort((*add.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(1))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*add.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(1))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); ov::snippets::lowered::pass::SplitLoops().run(*linear_ir_ref, linear_ir_ref->begin(), linear_ir_ref->end()); } } @@ -457,19 +479,19 @@ TEST_F(ExtractLoopInvariantsRemoveLoopsTest, ExtractedLoopInvariantsAllExprsInLo auto result = linear_ir->push_node(multiply.second); // 3 inner loop linear_ir->get_loop_manager()->mark_loop(max.first, hmax.first, 1, vector_size, - std::vector{LoopPort((*max.first)->get_input_port(0), true, 0), - LoopPort((*max.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*max.first)->get_output_port(0), true, 0)}); + std::vector{LoopPort::create((*max.first)->get_input_port(0), 0), + LoopPort::create((*max.first)->get_input_port(1), 0)}, + std::vector{LoopPort::create((*max.first)->get_output_port(0), 0)}); linear_ir->get_loop_manager()->mark_loop(sub.first, hsum.first, 1, vector_size, - std::vector{LoopPort((*sub.first)->get_input_port(0), true, 0), - LoopPort((*sub.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*exp.first)->get_output_port(0), true, 0), - LoopPort((*add.first)->get_output_port(0), true, 0)}); + std::vector{LoopPort::create((*sub.first)->get_input_port(0), 0), + LoopPort::create((*sub.first)->get_input_port(1), 0), + LoopPort::create((*add.first)->get_input_port(1), 0)}, + std::vector{LoopPort::create((*exp.first)->get_output_port(0), 0), + LoopPort::create((*add.first)->get_output_port(0), 0)}); linear_ir->get_loop_manager()->mark_loop(multiply.first, result.first, 1, vector_size, - std::vector{LoopPort((*multiply.first)->get_input_port(0), true, 0), - LoopPort((*multiply.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 0)}); + std::vector{LoopPort::create((*multiply.first)->get_input_port(0), 0), + LoopPort::create((*multiply.first)->get_input_port(1), 0)}, + std::vector{LoopPort::create((*multiply.first)->get_output_port(0), 0)}); // outer loop info const auto loop_begin = std::make_shared(); auto loop_begin_expr = linear_ir->insert_node(loop_begin, std::vector{}, {}, false, max.first); @@ -477,10 +499,10 @@ TEST_F(ExtractLoopInvariantsRemoveLoopsTest, ExtractedLoopInvariantsAllExprsInLo std::vector loop_end_inputs{(*loop_begin_expr)->get_output_port_connector(0)}; auto loop_end_expr = linear_ir->insert_node(loop_end, loop_end_inputs, {}, false, result.first); linear_ir->get_loop_manager()->mark_loop(loop_begin_expr, result.first, 10, 1, - std::vector{LoopPort((*max.first)->get_input_port(0), true, 1), - LoopPort((*max.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 1)}); + std::vector{LoopPort::create((*max.first)->get_input_port(0), 1), + 
LoopPort::create((*max.first)->get_input_port(1), 0), + LoopPort::create((*add.first)->get_input_port(1), 0)}, + std::vector{LoopPort::create((*multiply.first)->get_output_port(0), 1)}); loop_end->set_id((*loop_end_expr)->get_loop_ids().back()); linear_ir->set_loop_depth(2); } @@ -510,10 +532,10 @@ TEST_F(ExtractLoopInvariantsRemoveLoopsTest, ExtractedLoopInvariantsAllExprsInLo std::vector loop_end_inputs{(*loop_begin_expr)->get_output_port_connector(0)}; auto loop_end_expr = linear_ir_ref->insert_node(loop_end, loop_end_inputs, {}, false, result.first); linear_ir_ref->get_loop_manager()->mark_loop(loop_begin_expr, result.first, 10, 1, - std::vector{LoopPort((*max.first)->get_input_port(0), true, 1), - LoopPort((*max.first)->get_input_port(1), true, 0), - LoopPort((*add.first)->get_input_port(1), true, 0)}, - std::vector{LoopPort((*multiply.first)->get_output_port(0), true, 1)}); + std::vector{LoopPort::create((*max.first)->get_input_port(0), 1), + LoopPort::create((*max.first)->get_input_port(1), 0), + LoopPort::create((*add.first)->get_input_port(1), 0)}, + std::vector{LoopPort::create((*multiply.first)->get_output_port(0), 1)}); loop_end->set_id((*loop_end_expr)->get_loop_ids().back()); } } diff --git a/src/common/snippets/tests/src/lowered/pass/loop.cpp b/src/common/snippets/tests/src/lowered/pass/loop.cpp index 610b98e558760f..b229b89108b316 100644 --- a/src/common/snippets/tests/src/lowered/pass/loop.cpp +++ b/src/common/snippets/tests/src/lowered/pass/loop.cpp @@ -24,6 +24,7 @@ using Snippets_TailProcessingTransformation = ::testing::Test; // [Inserted Loop number, [ptr_increments, final_offsets] using ref_map = std::map, std::vector>>; using namespace ov::snippets::lowered; +using PortType = LoopPort::Type; constexpr static size_t vector_size = 16; @@ -41,18 +42,18 @@ static void init_linear_ir(const std::vector& in_shapes, LinearIR& li const auto result = linear_ir.push_node(add.second); const auto loop_manager = linear_ir.get_loop_manager(); - linear_ir.get_loop_manager()->mark_loop(matmul.first, add.first, in_shapes[0].front(), block_size, 1, - std::vector{LoopPort((*matmul.first)->get_input_port(0)), - LoopPort((*matmul.first)->get_input_port(1), false)}, - std::vector{LoopPort((*matmul.first)->get_output_port(0))}); + linear_ir.get_loop_manager()->mark_loop(matmul.first, add.first, in_shapes[0].front(), block_size, + std::vector{LoopPort::create((*matmul.first)->get_input_port(0), 1), + LoopPort::create((*matmul.first)->get_input_port(1))}, + std::vector{LoopPort::create((*matmul.first)->get_output_port(0), 1)}); linear_ir.get_loop_manager()->mark_loop(add.first, result.first, in_shapes[2].back(), vector_size, 0, - std::vector{LoopPort((*add.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(1))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*add.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(1))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); linear_ir.get_loop_manager()->mark_loop(add.first, result.first, in_shapes[2].front(), 1, 1, - std::vector{LoopPort((*add.first)->get_input_port(0)), - LoopPort((*add.first)->get_input_port(1))}, - std::vector{LoopPort((*add.first)->get_output_port(0))}); + std::vector{LoopPort::create((*add.first)->get_input_port(0)), + LoopPort::create((*add.first)->get_input_port(1))}, + std::vector{LoopPort::create((*add.first)->get_output_port(0))}); } static void apply_transformations(LinearIR& linear_ir, const std::shared_ptr& config) 
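Illustrative note (not part of the patch): the LoopPort changes above replace direct access to the public fields `is_incremented`, `dim_idx` and `expr_port` with the accessors `is_incremented()`, `is_processed()`, `get_dim_idx()` and `get_expr_port()`, move `UNDEFINED_DIM_IDX` from LoopInfo to LoopPort, and build ports through the `LoopPort::create(...)` factory instead of the old `LoopPort(port, is_incremented, dim_idx)` constructor. The standalone sketch below uses a simplified stand-in class, not the real ov::snippets::lowered::LoopPort, purely to show the call-site pattern; the stand-in's type name, enum values and UNDEFINED_DIM_IDX value are assumptions made only for this sketch.

#include <cstddef>
#include <iostream>
#include <limits>

// Hypothetical stand-in for the LoopPort interface referenced above; the real class
// also stores the ExpressionPort and supports cloning, which is omitted here.
class LoopPortSketch {
public:
    enum class Type { Incremented, NotIncremented, NotProcessed };
    static constexpr std::size_t UNDEFINED_DIM_IDX = std::numeric_limits<std::size_t>::max();

    // Factory in the spirit of LoopPort::create, replacing the old constructor call sites.
    static LoopPortSketch create(Type type, std::size_t dim_idx = 0) {
        LoopPortSketch p;
        p.m_type = type;
        p.m_dim_idx = (type == Type::NotProcessed) ? UNDEFINED_DIM_IDX : dim_idx;
        return p;
    }

    // Accessors replacing the removed public fields.
    bool is_incremented() const { return m_type == Type::Incremented; }
    bool is_processed() const { return m_type != Type::NotProcessed; }
    std::size_t get_dim_idx() const { return m_dim_idx; }

private:
    Type m_type = Type::NotProcessed;
    std::size_t m_dim_idx = UNDEFINED_DIM_IDX;
};

int main() {
    const auto port = LoopPortSketch::create(LoopPortSketch::Type::Incremented, 1);
    // Old call sites read port.is_incremented / port.dim_idx directly; new ones go through accessors.
    std::cout << port.is_incremented() << ' ' << port.is_processed() << ' ' << port.get_dim_idx() << '\n';
    return 0;
}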
{ diff --git a/src/common/transformations/include/ov_ops/glu.hpp b/src/common/transformations/include/ov_ops/glu.hpp index add8c3a0582525..ab2c646d7a1896 100644 --- a/src/common/transformations/include/ov_ops/glu.hpp +++ b/src/common/transformations/include/ov_ops/glu.hpp @@ -77,4 +77,15 @@ class TRANSFORMATIONS_API GLU : public ov::op::Op { } // namespace internal } // namespace op + +std::ostream& operator<<(std::ostream& s, const op::internal::GLU::GluType& reduction); + +template <> +class AttributeAdapter : public EnumAttributeAdapterBase { +public: + AttributeAdapter(op::internal::GLU::GluType& value) : EnumAttributeAdapterBase(value) {} + + OPENVINO_RTTI("AttributeAdapter"); +}; + } // namespace ov diff --git a/src/common/transformations/include/transformations/sdpa_to_paged_attention/state_management_pattern.hpp b/src/common/transformations/include/transformations/sdpa_to_paged_attention/state_management_pattern.hpp index 79b4f444cfa791..2e090a4aabaa30 100644 --- a/src/common/transformations/include/transformations/sdpa_to_paged_attention/state_management_pattern.hpp +++ b/src/common/transformations/include/transformations/sdpa_to_paged_attention/state_management_pattern.hpp @@ -24,8 +24,12 @@ class ov::pass::StateManagementPattern : public ov::pass::MatcherPass { ParameterVector& parameters_to_remove, int& layer_index, ov::Output max_context_len, - ParameterVector& block_indices_inputs, + ParameterVector& block_indices_inputs_for_each_layer, ResultVector& score_results, - bool use_block_indices, - bool use_score_outputs); + bool use_per_layer_block_indices_inputs, + bool use_score_outputs, + bool allow_cache_rotation, + ParameterVector& rotated_block_indices_inputs_for_each_layer, + ParameterVector& rotation_deltas_inputs_for_each_layer, + std::shared_ptr model_rotation_trig_lut); }; diff --git a/src/common/transformations/src/ov_ops/glu.cpp b/src/common/transformations/src/ov_ops/glu.cpp index 9b5fb780d36bb8..5aac8489101bc5 100644 --- a/src/common/transformations/src/ov_ops/glu.cpp +++ b/src/common/transformations/src/ov_ops/glu.cpp @@ -30,6 +30,8 @@ GLU::GLU(const Output& data, bool GLU::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("axis", m_axis); visitor.on_attribute("split_lengths", m_split_lengths); + visitor.on_attribute("glu_type", m_glu_type); + visitor.on_attribute("split_to_glu_idx", m_split_to_glu_idx); visitor.on_attribute("output_type", m_output_type); return true; } @@ -53,4 +55,15 @@ std::shared_ptr GLU::clone_with_new_inputs(const ov::OutputVector& new_arg } } // namespace internal } // namespace op + +template <> +OPENVINO_API EnumNames& EnumNames::get() { + static auto enum_names = + EnumNames("op::internal::GLU::GluType", + {{"Swish", op::internal::GLU::GluType::Swish}, + {"Gelu", op::internal::GLU::GluType::Gelu}, + {"Gelu_Tanh", op::internal::GLU::GluType::Gelu_Tanh}}); + return enum_names; +} + } // namespace ov diff --git a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp index a36085c34237a4..7b896463fdd51b 100644 --- a/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp +++ b/src/common/transformations/src/transformations/sdpa_to_paged_attention/state_management_pattern.cpp @@ -15,6 +15,7 @@ #include "openvino/op/gather.hpp" #include "openvino/op/multiply.hpp" #include "openvino/op/paged_attention.hpp" +#include 
"openvino/op/parameter.hpp" #include "openvino/op/reshape.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/select.hpp" @@ -70,10 +71,14 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par ParameterVector& parameters_to_remove, int& layer_index, Output max_context_len, - ParameterVector& block_indices_inputs, + ParameterVector& block_indices_inputs_for_each_layer, ResultVector& score_results, - bool use_block_indices_inputs, - bool use_score_outputs) { + bool use_per_layer_block_indices_inputs, + bool use_score_outputs, + bool allow_cache_rotation, + ParameterVector& rotated_block_indices_inputs_for_each_layer, + ParameterVector& rotation_deltas_inputs_for_each_layer, + std::shared_ptr model_rotation_trig_lut) { MATCHER_SCOPE(StateManagementPattern); auto k_current = pattern::any_input(); @@ -176,9 +181,11 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par &model_remaining_params, &sliding_window, ¶meters_to_remove, - &block_indices_inputs, + &block_indices_inputs_for_each_layer, &score_results, - &layer_index](ov::pass::pattern::Matcher& m) { + &layer_index, + &rotated_block_indices_inputs_for_each_layer, + &rotation_deltas_inputs_for_each_layer](ov::pass::pattern::Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); auto real_q = pattern_map.at(q); @@ -382,11 +389,27 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par max_context_len.get_node_shared_ptr()}; pa_arguments.insert(pa_arguments.end(), additional_params.begin(), additional_params.end()); - if (use_block_indices_inputs) { + if (use_per_layer_block_indices_inputs) { auto block_indices = setName(std::make_shared(element::i32, PartialShape{-1}), "block_indices." + std::to_string(layer_index - 1)); pa_arguments.insert(pa_arguments.begin() + 7, block_indices); - block_indices_inputs.push_back(block_indices); + block_indices_inputs_for_each_layer.push_back(block_indices); + } + + OPENVINO_ASSERT(pa_arguments.size() == 13); + + if (allow_cache_rotation) { + auto rotated_block_indices = setName(std::make_shared(element::i32, PartialShape{-1}), + "rotated_block_indices." + std::to_string(layer_index - 1)); + auto rotation_deltas = setName(std::make_shared(element::i32, PartialShape{-1, -1}), + "rotation_deltas." 
+ std::to_string(layer_index - 1)); + + pa_arguments.insert(pa_arguments.begin() + 13, rotated_block_indices); + pa_arguments.insert(pa_arguments.begin() + 14, rotation_deltas); + pa_arguments.insert(pa_arguments.begin() + 15, model_rotation_trig_lut); + + rotated_block_indices_inputs_for_each_layer.push_back(rotated_block_indices); + rotation_deltas_inputs_for_each_layer.push_back(rotation_deltas); } auto paged_attention = std::make_shared(pa_arguments); @@ -444,4 +467,4 @@ ov::pass::StateManagementPattern::StateManagementPattern(ParameterVector& kv_par auto m = std::make_shared(sdpa_variants, matcher_name); register_matcher(m, callback); -} \ No newline at end of file +} diff --git a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp index d52e78dbd6a489..b1b0bb6078d987 100644 --- a/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp +++ b/src/core/include/openvino/pass/sdpa_to_paged_attention.hpp @@ -19,12 +19,15 @@ class OPENVINO_API SDPAToPagedAttention : public ModelPass { public: OPENVINO_MODEL_PASS_RTTI("SDPAToPagedAttention"); - explicit SDPAToPagedAttention(bool use_block_indices_inputs = false, bool use_score_outputs = false); + explicit SDPAToPagedAttention(bool use_per_layer_block_indices_inputs = false, + bool use_score_outputs = false, + bool allow_cache_rotation = false); bool run_on_model(const std::shared_ptr& model) override; private: - bool m_use_block_indices_inputs; + bool m_use_per_layer_block_indices_inputs; bool m_use_score_outputs; + bool m_allow_cache_rotation; }; } // namespace pass } // namespace ov diff --git a/src/core/src/op/paged_attention.cpp b/src/core/src/op/paged_attention.cpp index cdcb66e86ee33e..1feeab44b7f018 100644 --- a/src/core/src/op/paged_attention.cpp +++ b/src/core/src/op/paged_attention.cpp @@ -19,8 +19,8 @@ void PagedAttentionExtension::validate_and_infer_types() { OV_OP_SCOPE(PagedAttentionExtension_validate_and_infer_types); NODE_VALIDATION_CHECK(this, - get_input_size() == 13, - "PagedAttensionExtension expects 13 inputs, but it has ", + get_input_size() == 13 || get_input_size() == 16, + "PagedAttensionExtension expects 13 or 16 inputs, but it has ", get_input_size()); NODE_VALIDATION_CHECK( @@ -147,6 +147,42 @@ void PagedAttentionExtension::validate_and_infer_types() { get_input_element_type(12), "."); + if (get_input_size() == 16) { + NODE_VALIDATION_CHECK( + this, + get_input_partial_shape(13).rank().is_dynamic() || get_input_partial_shape(13).rank().get_length() == 1, + "Input `rotated_block_indices` should either have rank 1 or be omitted, but it has rank ", + get_input_partial_shape(13).rank().get_length(), + "."); + NODE_VALIDATION_CHECK(this, + get_input_element_type(13).is_dynamic() || get_input_element_type(13) == element::i32, + "Element type of `rotated_block_indices` input should be i32, but it is ", + get_input_element_type(13), + "."); + NODE_VALIDATION_CHECK( + this, + get_input_partial_shape(14).rank().is_dynamic() || get_input_partial_shape(14).rank().get_length() == 2, + "Input `rotation_deltas` should either have rank 2 or be omitted, but it has rank ", + get_input_partial_shape(14).rank().get_length(), + "."); + NODE_VALIDATION_CHECK(this, + get_input_element_type(14).is_dynamic() || get_input_element_type(14) == element::i32, + "Element type of `rotation_deltas` input should be i32, but it is ", + get_input_element_type(14), + "."); + NODE_VALIDATION_CHECK( + this, + get_input_partial_shape(15).rank().is_dynamic() || 
get_input_partial_shape(15).rank().get_length() == 2, + "Input `rotation_trig_lut` should either have rank 2 or be omitted, but it has rank ", + get_input_partial_shape(15).rank().get_length(), + "."); + NODE_VALIDATION_CHECK(this, + get_input_element_type(15).is_dynamic() || get_input_element_type(15) == element::f32, + "Element type of `rotation_trig_lut` input should be f32, but it is ", + get_input_element_type(15), + "."); + } + // value head_size may be not same with key auto out_ps = get_input_partial_shape(0); const auto& key_ps = get_input_partial_shape(1); diff --git a/src/core/src/pass/sdpa_to_paged_attention.cpp b/src/core/src/pass/sdpa_to_paged_attention.cpp index e6fc744bb5ef4f..ea3f3c3e79e196 100644 --- a/src/core/src/pass/sdpa_to_paged_attention.cpp +++ b/src/core/src/pass/sdpa_to_paged_attention.cpp @@ -20,9 +20,12 @@ using namespace ov::op; -ov::pass::SDPAToPagedAttention::SDPAToPagedAttention(bool use_block_indices_inputs, bool use_score_outputs) - : m_use_block_indices_inputs(use_block_indices_inputs), - m_use_score_outputs(use_score_outputs) {} +ov::pass::SDPAToPagedAttention::SDPAToPagedAttention(bool use_per_layer_block_indices_inputs, + bool use_score_outputs, + bool allow_cache_rotation) + : m_use_per_layer_block_indices_inputs(use_per_layer_block_indices_inputs), + m_use_score_outputs(use_score_outputs), + m_allow_cache_rotation(allow_cache_rotation) {} static std::shared_ptr setName(std::shared_ptr node, const char* name) { // Set name for both node and output tensor (should be only one tensor, and any other names will be overriden by a @@ -46,11 +49,18 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr(element::i32, PartialShape{-1}), "subsequence_begins"), setName(std::make_shared(element::i32, PartialShape{-1}), "block_indices_begins"), }; - if (!m_use_block_indices_inputs) { + if (!m_use_per_layer_block_indices_inputs) { auto block_indices = setName(std::make_shared(element::i32, PartialShape{-1}), "block_indices"); model_remaining_params.insert(model_remaining_params.begin() + 2, block_indices); } + std::shared_ptr model_rotation_trig_lut; + + if (m_allow_cache_rotation) { + model_rotation_trig_lut = + setName(std::make_shared(element::f32, PartialShape{-1, -1}), "rotation_trig_lut"); + } + auto sliding_window = v0::Constant::create(element::i32, Shape{}, {0}); // sliding_window auto get_parameter = [=](const std::shared_ptr& model, @@ -91,7 +101,10 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptr position_ids; @@ -120,11 +133,14 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptroutput(0), - block_indices_inputs, + block_indices_inputs_for_each_layer, score_results, - m_use_block_indices_inputs, - m_use_score_outputs); - + m_use_per_layer_block_indices_inputs, + m_use_score_outputs, + m_allow_cache_rotation, + rotated_block_indices_inputs_for_each_layer, + rotation_deltas_inputs_for_each_layer, + model_rotation_trig_lut); manager.register_pass(unsqueezed_input_ids, max_context_len, position_ids); manager.register_pass(max_context_len); manager.register_pass(max_context_len); @@ -174,14 +190,20 @@ bool ov::pass::SDPAToPagedAttention::run_on_model(const std::shared_ptrremove_parameter(parameter); } - if (m_use_block_indices_inputs) { - model->add_parameters(block_indices_inputs); + if (m_use_per_layer_block_indices_inputs) { + model->add_parameters(block_indices_inputs_for_each_layer); } if (m_use_score_outputs) { model->add_results(score_results); } + if (m_allow_cache_rotation) { + 
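For reference (sketch only, not from the repository): the optional PagedAttention inputs 13-15 validated above must be i32 of rank 1 (`rotated_block_indices`), i32 of rank 2 (`rotation_deltas`) and f32 of rank 2 (`rotation_trig_lut`). A minimal standalone snippet declaring parameters that satisfy those checks, mirroring the Parameter construction used elsewhere in this patch, could look like the following; the surrounding main() is an addition made only for the sketch.

#include "openvino/core/partial_shape.hpp"
#include "openvino/op/parameter.hpp"

#include <memory>

int main() {
    using namespace ov;
    // input 13: `rotated_block_indices` -- element type i32, rank 1
    auto rotated_block_indices = std::make_shared<op::v0::Parameter>(element::i32, PartialShape{-1});
    // input 14: `rotation_deltas` -- element type i32, rank 2
    auto rotation_deltas = std::make_shared<op::v0::Parameter>(element::i32, PartialShape{-1, -1});
    // input 15: `rotation_trig_lut` -- element type f32, rank 2
    auto rotation_trig_lut = std::make_shared<op::v0::Parameter>(element::f32, PartialShape{-1, -1});
    return 0;
}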
model->add_parameters(rotated_block_indices_inputs_for_each_layer); + model->add_parameters(rotation_deltas_inputs_for_each_layer); + model->add_parameters({model_rotation_trig_lut}); + } + model->add_parameters(kv_parameters); model->add_parameters(model_remaining_params); model->add_parameters({std::move(max_context_len)}); diff --git a/src/core/tests/type_prop/paged_attention.cpp b/src/core/tests/type_prop/paged_attention.cpp new file mode 100644 index 00000000000000..64fe26b32041ef --- /dev/null +++ b/src/core/tests/type_prop/paged_attention.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/paged_attention.hpp" + +#include + +#include "openvino/op/parameter.hpp" + +namespace ov { +namespace testing { + +TEST(type_prop, paged_attention_static_13_inputs) { + const auto query = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key = std::make_shared(element::f32, PartialShape{3, 4}); + const auto value = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto value_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto past_lens = std::make_shared(element::i32, PartialShape{5}); + const auto subsequence_begins = std::make_shared(element::i32, PartialShape{5}); + const auto block_indices = std::make_shared(element::i32, PartialShape{15}); + const auto block_indices_begins = std::make_shared(element::i32, PartialShape{8}); + const auto scale = std::make_shared(element::f32, PartialShape{}); + const auto sliding_window = std::make_shared(element::i32, PartialShape{}); + const auto alibi_slopes = std::make_shared(element::f32, PartialShape{9}); + const auto max_context_len = std::make_shared(element::i32, PartialShape{}); + + ov::OutputVector args = {query, + key, + value, + key_cache, + value_cache, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len}; + const auto op = std::make_shared(args); + EXPECT_EQ(op->get_output_element_type(0), element::f32); + EXPECT_EQ(op->get_output_partial_shape(0), (PartialShape{3, 4})); +} + +TEST(type_prop, paged_attention_static_16_inputs_eviction_per_block) { + const auto query = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key = std::make_shared(element::f32, PartialShape{3, 4}); + const auto value = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto value_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto past_lens = std::make_shared(element::i32, PartialShape{5}); + const auto subsequence_begins = std::make_shared(element::i32, PartialShape{5}); + const auto block_indices = std::make_shared(element::i32, PartialShape{15}); + const auto block_indices_begins = std::make_shared(element::i32, PartialShape{8}); + const auto scale = std::make_shared(element::f32, PartialShape{}); + const auto sliding_window = std::make_shared(element::i32, PartialShape{}); + const auto alibi_slopes = std::make_shared(element::f32, PartialShape{9}); + const auto max_context_len = std::make_shared(element::i32, PartialShape{}); + + const auto rotated_block_indices = std::make_shared(element::i32, PartialShape{3}); + const auto rotation_deltas = std::make_shared(element::i32, PartialShape{12, 1}); + const auto rotation_trig_lut = 
std::make_shared(element::f32, PartialShape{256, 4}); + + ov::OutputVector args = {query, + key, + value, + key_cache, + value_cache, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len, + rotated_block_indices, + rotation_deltas, + rotation_trig_lut}; + + const auto op = std::make_shared(args); + EXPECT_EQ(op->get_output_element_type(0), element::f32); + EXPECT_EQ(op->get_output_partial_shape(0), (PartialShape{3, 4})); +} + +TEST(type_prop, paged_attention_static_16_inputs_eviction_per_token) { + const auto query = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key = std::make_shared(element::f32, PartialShape{3, 4}); + const auto value = std::make_shared(element::f32, PartialShape{3, 4}); + const auto key_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto value_cache = std::make_shared(element::f32, PartialShape{6, 2, 5, 4}); + const auto past_lens = std::make_shared(element::i32, PartialShape{5}); + const auto subsequence_begins = std::make_shared(element::i32, PartialShape{5}); + const auto block_indices = std::make_shared(element::i32, PartialShape{15}); + const auto block_indices_begins = std::make_shared(element::i32, PartialShape{8}); + const auto scale = std::make_shared(element::f32, PartialShape{}); + const auto sliding_window = std::make_shared(element::i32, PartialShape{}); + const auto alibi_slopes = std::make_shared(element::f32, PartialShape{9}); + const auto max_context_len = std::make_shared(element::i32, PartialShape{}); + + const auto rotated_block_indices = std::make_shared(element::i32, PartialShape{3}); + const auto rotation_deltas = std::make_shared(element::i32, PartialShape{12, 5}); + const auto rotation_trig_lut = std::make_shared(element::f32, PartialShape{256, 4}); + + ov::OutputVector args = {query, + key, + value, + key_cache, + value_cache, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len, + rotated_block_indices, + rotation_deltas, + rotation_trig_lut}; + + const auto op = std::make_shared(args); + EXPECT_EQ(op->get_output_element_type(0), element::f32); + EXPECT_EQ(op->get_output_partial_shape(0), (PartialShape{3, 4})); +} + +} // namespace testing +} // namespace ov diff --git a/src/core/tests/visitors/op/glu.cpp b/src/core/tests/visitors/op/glu.cpp new file mode 100644 index 00000000000000..f5de54ce9da1ed --- /dev/null +++ b/src/core/tests/visitors/op/glu.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/glu.hpp" + +#include + +#include "visitors/visitors.hpp" + +using ov::op::internal::GLU; +using ov::op::v0::Parameter; +using ov::test::NodeBuilder; + +TEST(attributes, glu_attr_Swish) { + NodeBuilder::opset().insert(); + + int64_t axis = -1; + int64_t split_lenghts = 3; + auto data = std::make_shared(ov::element::f32, ov::PartialShape{2, 1, 6}); + auto op = std::make_shared(data, axis, split_lenghts, GLU::GluType::Swish, 0); + + NodeBuilder builder(op, {data}); + auto g_op = ov::as_type_ptr(builder.create()); + + EXPECT_EQ(g_op->get_axis(), op->get_axis()); + EXPECT_EQ(g_op->get_split_lengths(), op->get_split_lengths()); + EXPECT_EQ(g_op->get_glu_type(), op->get_glu_type()); + EXPECT_EQ(g_op->get_split_to_glu_idx(), op->get_split_to_glu_idx()); + + EXPECT_EQ(g_op->get_output_element_type(0), op->get_output_element_type(0)); + 
EXPECT_EQ(g_op->get_output_partial_shape(0), op->get_output_partial_shape(0)); +} + +TEST(attributes, glu_attr_Gelu) { + NodeBuilder::opset().insert(); + + int64_t axis = 2; + int64_t split_lenghts = 3; + auto data = std::make_shared(ov::element::f16, ov::PartialShape{2, 1, 6}); + auto op = std::make_shared(data, axis, split_lenghts, GLU::GluType::Gelu, 1, ov::element::f16); + + NodeBuilder builder(op, {data}); + auto g_op = ov::as_type_ptr(builder.create()); + + EXPECT_EQ(g_op->get_axis(), op->get_axis()); + EXPECT_EQ(g_op->get_split_lengths(), op->get_split_lengths()); + EXPECT_EQ(g_op->get_glu_type(), op->get_glu_type()); + EXPECT_EQ(g_op->get_split_to_glu_idx(), op->get_split_to_glu_idx()); + + EXPECT_EQ(g_op->get_output_element_type(0), op->get_output_element_type(0)); + EXPECT_EQ(g_op->get_output_partial_shape(0), op->get_output_partial_shape(0)); +} + +TEST(attributes, glu_attr_Gelu_Tanh) { + NodeBuilder::opset().insert(); + + int64_t axis = 2; + int64_t split_lenghts = 3; + auto data = std::make_shared(ov::element::f16, ov::PartialShape{2, 1, 6}); + auto op = std::make_shared(data, axis, split_lenghts, GLU::GluType::Gelu_Tanh, 1, ov::element::f16); + + NodeBuilder builder(op, {data}); + auto g_op = ov::as_type_ptr(builder.create()); + + EXPECT_EQ(g_op->get_axis(), op->get_axis()); + EXPECT_EQ(g_op->get_split_lengths(), op->get_split_lengths()); + EXPECT_EQ(g_op->get_glu_type(), op->get_glu_type()); + EXPECT_EQ(g_op->get_split_to_glu_idx(), op->get_split_to_glu_idx()); + + EXPECT_EQ(g_op->get_output_element_type(0), op->get_output_element_type(0)); + EXPECT_EQ(g_op->get_output_partial_shape(0), op->get_output_partial_shape(0)); +} diff --git a/src/frontends/paddle/src/op/elementwise_ops.cpp b/src/frontends/paddle/src/op/elementwise_ops.cpp index fe13fa6425e8e1..0708be5a263227 100644 --- a/src/frontends/paddle/src/op/elementwise_ops.cpp +++ b/src/frontends/paddle/src/op/elementwise_ops.cpp @@ -54,6 +54,10 @@ NamedOutputs not_equal(const NodeContext& node_context) { return elementwise_ops(node_context); } +NamedOutputs less_equal(const NodeContext& node) { + return elementwise_ops(node); +} + NamedOutputs elementwise_floordiv(const NodeContext& node_context) { auto x = node_context.get_input("X"); auto y = node_context.get_input("Y"); diff --git a/src/frontends/paddle/src/op/expand_as_v2.cpp b/src/frontends/paddle/src/op/expand_as_v2.cpp new file mode 100644 index 00000000000000..1b9ee667495d24 --- /dev/null +++ b/src/frontends/paddle/src/op/expand_as_v2.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "default_opset.hpp" +#include "openvino/frontend/paddle/node_context.hpp" + +namespace ov { +namespace frontend { +namespace paddle { +namespace op { +NamedOutputs expand_as_v2(const NodeContext& node) { + using namespace default_opset; + auto x = node.get_input("X"); + Output shape_expected_node; + if (node.has_input("Y")) { + shape_expected_node = std::make_shared(node.get_input("Y"), element::i32); + } else { + std::vector shape_expected; + if (node.has_attribute("target_shape")) { + shape_expected = node.get_attribute>("target_shape"); + } else { + throw std::runtime_error("expand: has no target_shape attribute"); + } + shape_expected_node = Constant::create(element::i32, {shape_expected.size()}, shape_expected); + } + return node.default_single_output_mapping({std::make_shared(x, shape_expected_node)}, {"Out"}); +} + +} // namespace op +} // namespace paddle +} // namespace frontend +} // namespace ov diff 
--git a/src/frontends/paddle/src/op/expand_v2.cpp b/src/frontends/paddle/src/op/expand_v2.cpp index ea174efa3a9920..80f42066e0f1c9 100644 --- a/src/frontends/paddle/src/op/expand_v2.cpp +++ b/src/frontends/paddle/src/op/expand_v2.cpp @@ -62,4 +62,4 @@ NamedOutputs expand_v2(const NodeContext& node) { } // namespace op } // namespace paddle } // namespace frontend -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/frontends/paddle/src/op_table.cpp b/src/frontends/paddle/src/op_table.cpp index e092f16095abe0..ba422253c2b7b1 100644 --- a/src/frontends/paddle/src/op_table.cpp +++ b/src/frontends/paddle/src/op_table.cpp @@ -43,6 +43,7 @@ OP_CONVERTER(elu); OP_CONVERTER(embedding); OP_CONVERTER(exp); OP_CONVERTER(expand_v2); +OP_CONVERTER(expand_as_v2); OP_CONVERTER(eye); OP_CONVERTER(flip); OP_CONVERTER(flatten_contiguous_range); @@ -62,6 +63,7 @@ OP_CONVERTER(index_select); OP_CONVERTER(layer_norm); OP_CONVERTER(leaky_relu); OP_CONVERTER(less_than); +OP_CONVERTER(less_equal); OP_CONVERTER(linear_interp_v2); OP_CONVERTER(linspace); OP_CONVERTER(lod_array_length); @@ -179,6 +181,7 @@ std::map get_supported_ops() { {"equal", op::equal}, {"exp", op::exp}, {"expand_v2", op::expand_v2}, + {"expand_as_v2", op::expand_as_v2}, {"eye", op::eye}, {"fill_any_like", op::fill_any_like}, {"fill_constant", op::fill_constant}, @@ -200,6 +203,7 @@ std::map get_supported_ops() { {"layer_norm", op::layer_norm}, {"leaky_relu", op::leaky_relu}, {"less_than", op::less_than}, + {"less_equal", op::less_equal}, {"linear_interp_v2", op::linear_interp_v2}, {"linspace", op::linspace}, {"lod_array_length", op::lod_array_length}, diff --git a/src/frontends/paddle/tests/op_fuzzy.cpp b/src/frontends/paddle/tests/op_fuzzy.cpp index 53ea7852604376..39d20947219e6e 100644 --- a/src/frontends/paddle/tests/op_fuzzy.cpp +++ b/src/frontends/paddle/tests/op_fuzzy.cpp @@ -201,6 +201,8 @@ static const std::vector models{ std::string("expand_v2_tensor/expand_v2_tensor.pdmodel"), std::string("expand_v2_tensor_list/expand_v2_tensor_list.pdmodel"), std::string("expand_v2_tensor_list2/expand_v2_tensor_list2.pdmodel"), + std::string("expand_as_v2_1/expand_as_v2_1.pdmodel"), + std::string("expand_as_v2_2/expand_as_v2_2.pdmodel"), std::string("exp_test_float32/exp_test_float32.pdmodel"), std::string("eye/eye.pdmodel"), std::string("eye_int32/eye_int32.pdmodel"), @@ -283,6 +285,9 @@ static const std::vector models{ std::string("less_than_float32/less_than_float32.pdmodel"), std::string("less_than_int32/less_than_int32.pdmodel"), std::string("less_than_int64/less_than_int64.pdmodel"), + std::string("less_equal_float32/less_equal_float32.pdmodel"), + std::string("less_equal_int32/less_equal_int32.pdmodel"), + std::string("less_equal_int64/less_equal_int64.pdmodel"), std::string("linear_downsample_false_0/linear_downsample_false_0.pdmodel"), std::string("linear_downsample_false_1/linear_downsample_false_1.pdmodel"), std::string("linear_downsample_true_0/linear_downsample_true_0.pdmodel"), diff --git a/src/frontends/paddle/tests/requirements.txt b/src/frontends/paddle/tests/requirements.txt index 9b527dc1266957..5976576009ce68 100644 --- a/src/frontends/paddle/tests/requirements.txt +++ b/src/frontends/paddle/tests/requirements.txt @@ -2,5 +2,4 @@ -c ../../../bindings/python/constraints.txt protobuf numpy -six paddlepaddle diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_elementwise_ops.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_elementwise_ops.py index 
682dae0bc19282..9709bed3a84213 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_elementwise_ops.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_elementwise_ops.py @@ -284,6 +284,39 @@ def elementwise_floordiv(name: str, x, y, in_dtype, axis=-1): return outs[0] +def elementwise_less_equal(name: str, x, y, in_dtype, cast_to_fp32=False): + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): + node_x = paddle.static.data( + name='input_x', shape=x.shape, dtype=in_dtype) + node_y = paddle.static.data( + name='input_y', shape=y.shape, dtype=in_dtype) + if paddle.__version__ >= '2.0.0': + out = paddle.less_equal(x=node_x, y=node_y, name='less_equal') + else: + out = paddle.fluid.layers.less_equal(x=node_x, y=node_y, name='less_equal') + # FuzzyTest framework doesn't support boolean so cast to fp32/int32 + + if cast_to_fp32: + in_dtype = "float32" + + out = paddle.cast(out, in_dtype) + cpu = paddle.static.cpu_places(1) + exe = paddle.static.Executor(cpu[0]) + # startup program will call initializer to initialize the parameters. + exe.run(paddle.static.default_startup_program()) + + outs = exe.run( + feed={'input_x': x, 'input_y': y}, + fetch_list=[out]) + + saveModel(name, exe, feed_vars=[node_x, node_y], fetchlist=[out], + inputs=[x, y], outputs=[outs[0]], target_dir=sys.argv[1]) + + return outs[0] + + def elementwise_ops(name: str, data_x, data_y, in_dtype, axis=-1): elementwise_add("elementwise_add" + name, data_x, data_y, in_dtype, axis) elementwise_sub("elementwise_sub" + name, data_x, data_y, in_dtype, axis) @@ -350,6 +383,20 @@ def main(): data_y = np.random.choice(sample_arr, size=(1, 3, 4)) elementwise_mul_bool("elementwise_mul_bool1", data_x, data_y) + test_cases = [ + "float32", + "int32", + "int64" + ] + + for test in test_cases: + x = np.array([0, 1, 2, 3]).astype(test) + y = np.array([1, 0, 2, 4]).astype(test) + if ((test == "float64") or (test == "int64")): + elementwise_less_equal("less_equal_" + test, x, y, test, True) + else: + elementwise_less_equal("less_equal_" + test, x, y, test, False) + if __name__ == "__main__": main() diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_expand_v2.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_expand_v2.py index 03f9737343e6fa..2f711da912dabf 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_expand_v2.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_expand_v2.py @@ -61,6 +61,27 @@ def expand_v2_tensor(name:str, x, out_shape, use_tensor_in_list): return outs[0] +def expand_as_v2(name:str, x, y): + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): + node_x = paddle.static.data(name='x', shape=x.shape, dtype=data_type) + node_y = paddle.static.data(name='y', shape=y.shape, dtype=data_type) + out = paddle.expand_as(node_x, node_y, name='expand_as_v2') + + cpu = paddle.static.cpu_places(1) + exe = paddle.static.Executor(cpu[0]) + # startup program will call initializer to initialize the parameters. 
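# After the one-off startup run below, the main program is executed with the
# numpy feeds for 'x' and 'y', and saveModel() serializes the traced graph as
# expand_as_v2_*.pdmodel under sys.argv[1], which the fuzzy tests in
# op_fuzzy.cpp then load.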
+ exe.run(paddle.static.default_startup_program()) + + outs = exe.run( + feed={'x': x, 'y': y}, + fetch_list=[out]) + + saveModel(name, exe, feed_vars=[node_x, node_y], fetchlist=[out], + inputs=[x, y], outputs=[outs[0]], target_dir=sys.argv[1]) + + return outs[0] def main(): data = np.random.rand(1, 1, 6).astype(data_type) @@ -70,6 +91,12 @@ def main(): expand_v2_tensor("expand_v2_tensor_list", data, [2, 3, -1], True) expand_v2_tensor("expand_v2_tensor_list2", data, [2, 2, 2, 3, -1], True) + # expand_as_v2 + data_x = np.random.rand(1, 1, 6).astype(data_type) + data_y1 = np.random.rand(2, 3, 6).astype(data_type) + data_y2 = np.random.rand(4, 2, 3, 6).astype(data_type) + expand_as_v2("expand_as_v2_1", data_x, data_y1) + expand_as_v2("expand_as_v2_2", data_x, data_y2) if __name__ == "__main__": main() diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 2eebfe88a2c803..c6ccbcf375746a 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -298,21 +298,21 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 ANY + ARCH AVX512F AVX2 SVE ANY src/nodes/kernels/scaled_attn/executor_pa.cpp API src/nodes/kernels/scaled_attn/executor_pa.hpp NAME make_pa_executor NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 ANY + ARCH AVX512F AVX2 SVE ANY src/nodes/kernels/scaled_attn/attn_memcpy.cpp API src/nodes/kernels/scaled_attn/attn_memcpy.hpp NAME attn_memcpy paged_attn_memcpy attn_memcpy2d_kernel NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 ANY + ARCH AVX512F AVX2 SVE ANY src/nodes/kernels/scaled_attn/attn_quant.cpp API src/nodes/kernels/scaled_attn/attn_quant.hpp NAME attn_quantkv paged_attn_quantkv attn_quant_u8 attn_dequant_u8 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp index 008237780de3f6..8b3ed792fce535 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp @@ -192,14 +192,12 @@ void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::Expres // Quick validation check: Should we check that port is really Brgemm port? // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` // to avoid extra checks, we validate only first input port - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { - return p.dim_idx == 1; + return p.get_dim_idx() == 1 && p.is_processed(); }; - OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension M"); + OPENVINO_ASSERT( + in_ports.size() > 1 && check_port(in_ports[0]) && out_ports.size() == 1 && check_port(out_ports[0]), + "Incorrect Loop by Brgemm dimension M"); M = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; input_pds[0]->set_subtensor_dim(1, M); output_pds[0]->set_subtensor_dim(1, M); @@ -213,13 +211,11 @@ void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::Expres const auto& in_ports = current_expanded_loop_info->get_input_ports(); const auto& out_ports = current_expanded_loop_info->get_output_ports(); // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { - return p.dim_idx == 0; + return p.get_dim_idx() == 0 && p.is_processed(); }; - OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && - std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && out_ports.size() == 1 && + OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_processed() && + std::all_of(in_ports.cbegin() + 1, in_ports.cend(), check_port) && out_ports.size() == 1 && check_port(out_ports.back()), "Incorrect Loop by Brgemm dimension N"); N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; @@ -240,10 +236,10 @@ void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::Expres const auto& in_ports = current_expanded_loop_info->get_input_ports(); const auto& out_ports = current_expanded_loop_info->get_output_ports(); // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && - out_ports.size() == 1 && !out_ports.front().is_incremented, + OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().get_dim_idx() == 0 && + in_ports.front().is_processed() && in_ports.back().get_dim_idx() == 1 && + in_ports.back().is_processed() && out_ports.size() == 1 && + !out_ports.front().is_processed(), "Incorrect Loop by Brgemm dimension K"); K = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; input_pds[0]->set_subtensor_dim(0, K); diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index ddf8d068f920a2..4aeffce7591839 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -337,7 +337,7 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector& pr bool Node::isOneDimShape(const ov::PartialShape& pshape) { int value_1_num = 0; int sz = static_cast(pshape.size()); - for (auto s : pshape) { + for (const auto& s : pshape) { if (s.is_static() && s.get_length() == 1) { value_1_num++; } @@ -345,7 +345,7 @@ bool Node::isOneDimShape(const ov::PartialShape& pshape) { return value_1_num >= sz - 1; } -bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2) { +bool Node::isReorderRequired(const ov::intel_cpu::MemoryDescPtr& desc1, const ov::intel_cpu::MemoryDescPtr& desc2) { bool samePrec = desc1->getPrecision() == desc2->getPrecision(); bool isOneDimShape1 = isOneDimShape(desc1->getShape().toPartialShape()); bool isOneDimShape2 = isOneDimShape(desc2->getShape().toPartialShape()); diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 9166e87dbf50e1..6b08fc54728375 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -752,7 +752,7 @@ class Node { void selectPreferPrimitiveDescriptor(const std::vector& priority, bool ignoreConstInputs); void selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, bool ignoreConstInputs); bool isOneDimShape(const ov::PartialShape& pshape); - bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2); + bool isReorderRequired(const ov::intel_cpu::MemoryDescPtr& desc1, const ov::intel_cpu::MemoryDescPtr& desc2); bool isConfigDefined(const NodeConfig& config) const; virtual bool canBeInPlace() const; diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp index 983c4410083beb..fc08ddcfac4819 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/subgraph.cpp @@ -80,12 +80,16 @@ SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snip m_repacking_impl_type = snippet_config->repacking_impl_type; m_repacked_inputs = snippet_config->repacked_inputs; - auto external_buffer_size = std::accumulate(m_repacked_inputs.begin(), - m_repacked_inputs.end(), - size_t(0), - [](size_t sum, const std::pair& p) { - return sum + p.second.desc()->getCurrentMemSize(); - }); + auto external_buffer_size = + std::accumulate(m_repacked_inputs.begin(), + m_repacked_inputs.end(), + size_t(0), + [](size_t sum, const std::pair& p) { + auto curr_mem_size = p.second.desc()->getCurrentMemSize(); + OPENVINO_ASSERT(curr_mem_size != ov::intel_cpu::MemoryDesc::UNDEFINED_SIZE, + "Current repacking buffer memory size is undefined"); + return sum + curr_mem_size; + }); if (get_repacking_impl_type() == RepackingImplType::IN_PARALLEL) { // When external repacking is applied in parallel section, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.cpp new file mode 100644 index 00000000000000..59b54f47024adf --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.cpp @@ -0,0 +1,333 @@ +// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2024 FUJITSU 
LIMITED +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_kernel.hpp" + +#include +#include + +#include "dnnl_extension_utils.h" +#include "utils/cpu_utils.hpp" + +using namespace dnnl::impl::cpu::aarch64; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::aarch64::matmul; + +#define THROW_ERROR(...) OPENVINO_THROW("brgemm executor Init Failure '", __VA_ARGS__) +namespace ov { +namespace intel_cpu { + +BrgemmKernel::BrgemmKernel(size_t M, + size_t N, + size_t K, + size_t lda, + size_t ldb, + size_t ldc, + bool b_transposed, + ov::element::Type inType, + bool b_accumulate) + : M(M), + K(K), + N(N), + lda(lda), + ldb(ldb), + ldc(ldc), + b_transposed(b_transposed), + inType(inType) { + // blocking M + M_blk = matmulOptimalM; + M_tail = M % M_blk; + kBlkStep = 4 / inType.size(); + size_t vlen; + vlen = mayiuse(sve_512) ? cpu_isa_traits::vlen + : mayiuse(sve_256) ? cpu_isa_traits::vlen + : cpu_isa_traits::vlen; + // blocking N + N_blk = std::max(N, vlen / inType.size()); + N_tail = N % N_blk; + + // blocking K + K_blk = K; + K_tail = K % K_blk; + // copied K must be round up by vlen / inType.size(), otherwise copy B kernel may access wrong memory + packedBSize = rnd_up(K, vlen / inType.size()) * rnd_up(N, N_blk) * inType.size(); + size_t brg0BaseIdx = std::numeric_limits::max(); + for (size_t m = 0; m < 2; m++) { + for (size_t k = 0; k < 2; k++) { + for (size_t n = 0; n < 2; n++) { + auto& brgemmCtx = brgCtxs[getBrgIdx(m, k, n)]; + + auto M_ = m ? M_tail : M < M_blk ? 0 : M_blk; + auto N_ = n ? N_tail : N - N_tail; + auto K_ = k ? K_tail : K - K % K_blk; + auto beta = (b_accumulate || (k && brgCtxs[getBrgIdx(m, 0, n)].K != 0)) ? 1.0f : 0.0f; + + brgemmCtx.M = M_; + brgemmCtx.N = N_; + brgemmCtx.K = K_; + brgemmCtx.LDA = k ? K_blk : lda; + brgemmCtx.LDB = b_transposed ? rnd_up(N, N_blk) : ldb; // b_transposed needs copy + brgemmCtx.LDC = ldc; + brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); + brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(inType)); + brgemmCtx.beta = beta; + + // don't create brgemm kernels for empty tiles + if (M_ != 0 && K_ != 0 && N_ != 0) { + if (brg0BaseIdx == std::numeric_limits::max()) + brg0BaseIdx = getBrgIdx(m, k, n); + init_brgemm(brgemmCtx, brgKernels[getBrgIdx(m, k, n)]); + } + } + } + } + + auto& brgemmCtx0 = brgCtxs[brg0BaseIdx]; + if (b_transposed) { + size_t b_stride = 0; + b_stride = ldb * inType.size(); + // K should use the original K + init_brgemm_copy_b(brgCopyBKernel, + N, + N_blk, + N_tail, + brgemmCtx0.LDB, + K, + brgemmCtx0.dt_in0, + brgemmCtx0.dt_in1, + b_transposed, + b_stride); + } +} + +const size_t BrgemmKernel::get_scratch_a_size() const { + return packedASize; +} + +const size_t BrgemmKernel::get_scratch_b_size() const { + return packedBSize; +} + +void BrgemmKernel::init_brgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel) { + brgemm_t brgDesc; + cpu_isa_t isa; + isa = mayiuse(sve_512) ? cpu_isa_t::sve_512 : mayiuse(sve_256) ? 
cpu_isa_t::sve_256 : cpu_isa_t::sve_128; + auto status = brgemm_desc_init(&brgDesc, + isa, + brgemm_addr, + ctx.dt_in0, + ctx.dt_in1, + ctx.transpose_a, + ctx.transpose_b, + brgemm_row_major, + 1.f, + ctx.beta, + ctx.LDA, + ctx.LDB, + ctx.LDC, + ctx.M, + ctx.N, + ctx.K, + nullptr); + if (status != dnnl_success) { + THROW_ERROR("cannot be executed due to invalid brgconv params"); + } + + brgemm_kernel_t* brgKernel_ = nullptr; + status = brgemm_kernel_create(&brgKernel_, brgDesc); + if (status != dnnl_success) { + THROW_ERROR("cannot be executed due to invalid brgconv params"); + } + brgKernel.reset(brgKernel_); +} +void BrgemmKernel::init_brgemm_copy_a( + std::unique_ptr& brgCopyKernel, + size_t K, + size_t K_blk, + size_t K_tail, + size_t LDA, + dnnl_data_type_t dt_in0, + bool transpose, + size_t copy_A_src_stride) { + brgemm_matmul_conf_t brgCopyKernelConf; + brgCopyKernelConf.src_tag = dnnl_abcd; + brgCopyKernelConf.K = K; + brgCopyKernelConf.K_tail = K_tail; + brgCopyKernelConf.K_blk = K_blk; + brgCopyKernelConf.use_buffer_a_tail_only = false; + // padding K tail to K_blk, LDA is the stride for target tensor + brgCopyKernelConf.LDA = LDA; + brgCopyKernelConf.has_zero_point_b = false; + brgCopyKernelConf.s8s8_compensation_required = false; + brgCopyKernelConf.wei_zp_type = dnnl::impl::cpu::aarch64::none; + brgCopyKernelConf.src_zp_type = dnnl::impl::cpu::aarch64::none; + brgCopyKernelConf.src_dt = dt_in0; + brgCopyKernelConf.copy_A_src_stride = copy_A_src_stride; + brgCopyKernelConf.a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + // copied A has the same precision of original + brgCopyKernelConf.tr_a_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.transposed_A = transpose; + brgCopyKernelConf.isa = mayiuse(sve_512) ? cpu_isa_t::sve_512 + : mayiuse(sve_256) ? cpu_isa_t::sve_256 + : cpu_isa_t::sve_128; + + create_brgemm_matmul_copy_a(brgCopyKernel, &brgCopyKernelConf); +} + +void BrgemmKernel::init_brgemm_copy_b( + std::unique_ptr& brgCopyKernel, + size_t N, + size_t N_blk, + size_t N_tail, + size_t LDB, + size_t K, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + bool transpose, + size_t copy_B_wei_stride) { + brgemm_matmul_conf_t brgCopyKernelConf; + brgCopyKernelConf.src_dt = dt_in0; + brgCopyKernelConf.wei_dt = dt_in1; + brgCopyKernelConf.wei_n_blk = N_blk; + brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; + brgCopyKernelConf.copy_B_wei_stride = copy_B_wei_stride; + + // LDB here is for the target tensor, not source tensor + brgCopyKernelConf.LDB = LDB; + brgCopyKernelConf.N = N; + brgCopyKernelConf.N_tail = N_tail; + brgCopyKernelConf.N_blk = N_blk; + brgCopyKernelConf.K = K; + brgCopyKernelConf.K_blk = K; + brgCopyKernelConf.K_tail = 0; + brgCopyKernelConf.N_chunk_elems = brgCopyKernelConf.N_blk; + brgCopyKernelConf.b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.tr_b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.req_wei_vnni_downconvert = false; + brgCopyKernelConf.isa = mayiuse(sve_512) ? cpu_isa_t::sve_512 + : mayiuse(sve_256) ? 
cpu_isa_t::sve_256 + : cpu_isa_t::sve_128; + + brgCopyKernelConf.has_zero_point_a = false; + brgCopyKernelConf.has_zero_point_b = false; + brgCopyKernelConf.src_zp_type = dnnl::impl::cpu::aarch64::none; + auto ret = create_brgemm_matmul_copy_b(brgCopyKernel, &brgCopyKernelConf); + if (ret != dnnl::impl::status_t::dnnl_success) + THROW_ERROR("cannot create_brgemm_matmul_copy_b kernel"); +} + +void BrgemmKernel::copy_buffer_b(void* b, void* scratch_b) { + auto ptr_b = reinterpret_cast(b); + auto ptr_scartch_b = reinterpret_cast(scratch_b); + if (brgCopyBKernel) { + for (size_t nb = 0; nb < div_up(N, N_blk); nb++) { + auto N_stride = b_transposed ? ldb : 1; + auto pCopyKernel0In = ptr_b + nb * N_blk * inType.size() * N_stride; + auto pCopyKernel0Out = ptr_scartch_b + nb * N_blk * kBlkStep * inType.size(); + + auto ctx = jit_brgemm_matmul_copy_b_t::ctx_t(); + + const bool is_N_tail = (N - nb * N_blk < N_blk); + ctx.current_N_blk = is_N_tail ? N_tail : N_blk; + ctx.src = pCopyKernel0In; + ctx.tr_src = pCopyKernel0Out; + ctx.compensation_ptr = nullptr; + ctx.zp_a_compensation_ptr = nullptr; + ctx.zp_a_neg_value_ptr = nullptr; + ctx.current_K_start = 0; + ctx.current_K_iters = K; + (*brgCopyBKernel)(&ctx); + } + } +} + +void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* wsp, void* scratch_a) { + auto ptr_A = reinterpret_cast(a); + auto ptr_C = reinterpret_cast(c); + auto ptr_scartch_a = reinterpret_cast(scratch_a); + auto ptr_scartch_b = reinterpret_cast(b); + uint8_t* ptr_a_tail = nullptr; + + size_t brgIdx0 = getBrgIdx(0, 0, 0); + // The step for matrix A over main K dimension + size_t K0_step0 = brgCtxs[brgIdx0].K; + auto cur_M_blk = is_M_tail ? M_tail : M_blk; + if (brgCopyAKernel) { + // only copy tailed data; + size_t K_offset = K < K_blk ? 0 : K0_step0 * inType.size(); + auto pCopyKernelIn = ptr_A + K_offset; + auto pCopyKernelOut = ptr_scartch_a; + + auto ctx = jit_brgemm_matmul_copy_a_t::ctx_t(); + + ctx.current_M_blk = cur_M_blk; + ctx.zp_b_compensation_buffer_ptr = nullptr; + ctx.zp_a_compensation_result_ptr = nullptr; + ctx.zp_b_neg_value_ptr = nullptr; + ctx.zp_ab_comp_ptr = nullptr; + ctx.src = pCopyKernelIn; + ctx.tr_src = pCopyKernelOut; + ctx.current_K_start = 0; + ctx.current_K_blk = K % K_blk; + + (*brgCopyAKernel)(&ctx); + + ptr_a_tail = pCopyKernelOut; + } + size_t count_N = 0; + for (size_t n = 0; n < 2; n++) { + size_t count_K = 0; + for (size_t k = 0; k < 2; k++) { + size_t mIdx = is_M_tail ? 1 : 0; + auto& brgemmCtx = brgCtxs[getBrgIdx(mIdx, k, n)]; + if (brgemmCtx.K != 0 && brgemmCtx.N != 0 && brgemmCtx.M != 0) { + auto local_a_ptr = k > 0 ? ptr_a_tail : ptr_A; + auto B_stride = (k * count_K + n * count_N * kBlkStep) * inType.size(); + auto weight_ptr = ptr_scartch_b + B_stride; + auto C_stride = n * count_N * ov::element::f32.size(); + auto out_ptr = ptr_C + C_stride; + callBrgemm(brgemmCtx, brgKernels[getBrgIdx(mIdx, k, n)], local_a_ptr, weight_ptr, out_ptr, wsp); + // stride K, N if body kernel is executed. 
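// count_K is the number of packed-B elements consumed by the main-K kernel
// (K * LDB), so the K-tail pass (k == 1) starts reading B right after them;
// count_N is the number of output columns produced by the main-N kernel and is
// used to offset both the packed-B pointer and the C pointer for the N-tail pass.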
+ if (k == 0) { + count_K = brgemmCtx.K * brgemmCtx.LDB; + } + if (n == 0) { + count_N = brgemmCtx.N; + } + } + } + } +} + +void BrgemmKernel::executeGemm(void* a, void* b, void* c, void* wsp, void* scratch_a, void* scratch_b) { + auto ptr_A = reinterpret_cast(a); + auto ptr_B = reinterpret_cast(b); + auto ptr_C = reinterpret_cast(c); + + copy_buffer_b(ptr_B, scratch_b); + + for (size_t mb = 0; mb < div_up(M, M_blk); mb++) { + const bool is_M_tail = (M - mb * M_blk < M_blk); + auto ptr_a = ptr_A + (mb * M_blk * lda) * inType.size(); + auto ptr_c = ptr_C + (mb * M_blk * ldc) * ov::element::f32.size(); + executeGemm(is_M_tail, ptr_a, scratch_b, wsp, ptr_c, scratch_a); + } +} +void BrgemmKernel::callBrgemm(brgemmCtx& ctx, + std::unique_ptr& brgKernel, + const void* pin0, + const void* pin1, + void* pout, + void* wsp) { + brgemm_batch_element_t addr_batch; + addr_batch.ptr.A = pin0; + addr_batch.ptr.B = pin1; + brgemm_kernel_execute(brgKernel.get(), 1, &addr_batch, pout, wsp); +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.hpp new file mode 100644 index 00000000000000..06236ec1a9b775 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/brgemm_kernel.hpp @@ -0,0 +1,105 @@ +// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2024 FUJITSU LIMITED +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include +#include +#include +#include +#include + +namespace ov { +namespace intel_cpu { + +class BrgemmKernel { +public: + // Construct brgemm kernel for matmul (M, K) * (K, N)/(N, K)^T + // FP32 * FP32 -> FP32 + // lda is the leading dimension for A matrix + // ldb is the leading dimension for B matrix + // ldc is the leading dimension for C matrix + // b_transpose indicates wheter B matrix is transposed. 
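// Typical call sequence (illustrative sketch only; A/B/C and the sizes are caller-provided):
//   ov::intel_cpu::BrgemmKernel gemm(M, N, K, lda, ldb, ldc, /*b_transposed=*/true);
//   std::vector<uint8_t> scratch_a(gemm.get_scratch_a_size());
//   std::vector<uint8_t> scratch_b(gemm.get_scratch_b_size());
//   std::vector<uint8_t> wsp(gemm.get_wsp_size());
//   // packs (and, when b_transposed is set, transposes) B into scratch_b,
//   // then runs the blocked FP32 GEMM over all M
//   gemm.executeGemm(A, B, C, wsp.data(), scratch_a.data(), scratch_b.data());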
+ BrgemmKernel(size_t M, + size_t N, + size_t K, + size_t lda, + size_t ldb, + size_t ldc, + bool b_transposed = false, + ov::element::Type inType = ov::element::f32, + bool b_accumulate = false); + // execute all M + void executeGemm(void* a, void* b, void* c, void* wsp, void* scratch_a, void* scratch_b); + // execute by m_blk + void executeGemm(bool is_M_tail, void* a, void* b, void* c, void* wsp, void* scratch_a); + + void copy_buffer_b(void* b, void* scratch_b); + // bytes needed to place scratch buffer a + const size_t get_scratch_a_size() const; + // bytes needed to place scratch buffer b + const size_t get_scratch_b_size() const; + const size_t get_wsp_size() const { + return 4 * 1024; + } + +private: + size_t M = 0, M_blk = 0, M_tail = 0; + size_t K = 0, K_blk = 0, K_tail = 0, N = 0, N_blk = 0, N_tail = 0; + size_t lda = 0, ldb = 0, ldc = 0; + bool b_transposed = false; + size_t kBlkStep = 0; + size_t packedBSize = 0; + size_t packedASize = 0; + ov::element::Type inType; + static constexpr size_t MHA_BRGEMM_KERNELS_NUM = 8; + static constexpr size_t matmulOptimalM = 32; + struct brgemmCtx { + size_t M = 0, N = 0, K = 0, LDA = 0, LDB = 0, LDC = 0; + dnnl_data_type_t dt_in0 = dnnl_data_type_undef; + dnnl_data_type_t dt_in1 = dnnl_data_type_undef; + bool transpose_a = false; + bool transpose_b = false; + float beta = 0.0f; + }; + brgemmCtx brgCtxs[MHA_BRGEMM_KERNELS_NUM]; + std::unique_ptr brgKernels[MHA_BRGEMM_KERNELS_NUM]; + std::unique_ptr brgCopyAKernel; + std::unique_ptr brgCopyBKernel; + size_t getBrgIdx(size_t mIdx, size_t kIdx, size_t nIdx) { + return mIdx * 4 + kIdx * 2 + nIdx; + } + void init_brgemm(brgemmCtx& ctx, std::unique_ptr& brgKernel); + // LDA, LDB is used for stride of target memory + void init_brgemm_copy_a( + std::unique_ptr& brgCopyKernel, + size_t K, + size_t K_blk, + size_t K_tail, + size_t LDA, + dnnl_data_type_t dt_in0, + bool transpose = false, + size_t copy_A_src_stride = 0); + + void init_brgemm_copy_b( + std::unique_ptr& brgCopyKernel, + size_t N, + size_t N_blk, + size_t N_tail, + size_t LDB, + size_t K, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + bool transpose = false, + size_t copy_B_wei_stride = 0); + + void callBrgemm(brgemmCtx& ctx, + std::unique_ptr& brgKernel, + const void* pin0, + const void* pin1, + void* pout, + void* wsp); +}; +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp index 761a136eda2997..c7b2b13123c7d5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp @@ -11,6 +11,9 @@ #include #include +#if defined(HAVE_SVE) +# include "arm_sve.h" +#endif namespace ov { namespace Extensions { @@ -138,7 +141,30 @@ void attn_dequant_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, f } } +#if defined(HAVE_SVE) +void inline attn_dequant_u8_kernel(const uint8_t* src, float* dst, size_t n, float scale, float zp) { + size_t i = 0; + uint8_t* src_nc = const_cast(src); + size_t nvec = n / svcntw(); + size_t lvec = svcntw(); + auto sve_pg = svptrue_b32(); + for (size_t j = 0; j < nvec; ++j) { + svuint32_t reg1 = svld1ub_u32(sve_pg, src_nc + j * lvec); + svfloat32_t reg2 = svcvt_f32_u32_z(sve_pg, reg1); + svfloat32_t reg3 = svsub_f32_z(sve_pg, reg2, svdup_n_f32(zp)); + svfloat32_t reg4 = svmul_f32_z(sve_pg, reg3, svdup_n_f32(scale)); + 
svst1_f32(sve_pg, dst + j * lvec, reg4); + } + i = n - n % svcntw(); + for (; i < n; ++i) { + float tmp = src_nc[i]; + tmp = (tmp - zp) * scale; + dst[i] = tmp; + } +} +#endif + } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/cache_rotation.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/cache_rotation.hpp new file mode 100644 index 00000000000000..552be63bd29a36 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/cache_rotation.hpp @@ -0,0 +1,234 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "common.hpp" +#include "openvino/openvino.hpp" + +#if defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif + +#if defined(HAVE_AVX512F) +template +inline static void rotate_kv_cache_chunk_avx512(CT* current_x_values_ptr, + CT* current_y_values_ptr, + float* current_rotation_coeffts_cos_ptr, + float* current_rotation_coeffts_sin_ptr, + size_t num_vectorized_elements_per_iteration, + bool is_tail) { + using namespace ov::Extensions::Cpu::XARCH; + + auto result_x = _mm512_setzero_ps(); + auto result_y = _mm512_setzero_ps(); + + auto coeffts_cos = _mm512_undefined_ps(); + auto coeffts_sin = _mm512_undefined_ps(); + + auto cache_values_x = _mm512_undefined_ps(); + auto cache_values_y = _mm512_undefined_ps(); + + if (!is_tail) { + coeffts_cos = mm512_uni_loadu_ps(current_rotation_coeffts_cos_ptr); + coeffts_sin = mm512_uni_loadu_ps(current_rotation_coeffts_sin_ptr); + + cache_values_x = mm512_uni_loadu_ps(current_x_values_ptr); + cache_values_y = mm512_uni_loadu_ps(current_y_values_ptr); + } else { + coeffts_cos = mm512_uni_loadu_tail_ps(current_rotation_coeffts_cos_ptr, num_vectorized_elements_per_iteration); + coeffts_sin = mm512_uni_loadu_tail_ps(current_rotation_coeffts_sin_ptr, num_vectorized_elements_per_iteration); + + cache_values_x = mm512_uni_loadu_tail_ps(current_x_values_ptr, num_vectorized_elements_per_iteration); + cache_values_y = mm512_uni_loadu_tail_ps(current_y_values_ptr, num_vectorized_elements_per_iteration); + } + + result_x = _mm512_fmadd_ps(cache_values_x, coeffts_cos, result_x); + result_x = _mm512_fnmadd_ps(cache_values_y, coeffts_sin, result_x); // negative multiply-add + + result_y = _mm512_fmadd_ps(cache_values_x, coeffts_sin, result_y); + result_y = _mm512_fmadd_ps(cache_values_y, coeffts_cos, result_y); + + if (!is_tail) { + mm512_uni_storeu_ps(current_x_values_ptr, result_x); + mm512_uni_storeu_ps(current_y_values_ptr, result_y); + } else { + mm512_uni_storeu_tail_ps(current_x_values_ptr, result_x, num_vectorized_elements_per_iteration); + mm512_uni_storeu_tail_ps(current_y_values_ptr, result_y, num_vectorized_elements_per_iteration); + } +} +#endif + +#if defined(HAVE_AVX2) +template +inline static void rotate_kv_cache_chunk_avx2(CT* current_x_values_ptr, + CT* current_y_values_ptr, + float* current_rotation_coeffts_cos_ptr, + float* current_rotation_coeffts_sin_ptr, + size_t num_vectorized_elements_per_iteration, + size_t is_tail) { + using namespace ov::Extensions::Cpu::XARCH; + + auto result_x = _mm256_setzero_ps(); + auto result_y = _mm256_setzero_ps(); + + auto coeffts_cos = _mm256_undefined_ps(); + auto coeffts_sin = _mm256_undefined_ps(); + + auto cache_values_x = _mm256_undefined_ps(); + auto cache_values_y = _mm256_undefined_ps(); + + if (!is_tail) { + coeffts_cos = 
mm256_uni_loadu_ps(current_rotation_coeffts_cos_ptr); + coeffts_sin = mm256_uni_loadu_ps(current_rotation_coeffts_sin_ptr); + + cache_values_x = mm256_uni_loadu_ps(current_x_values_ptr); + cache_values_y = mm256_uni_loadu_ps(current_y_values_ptr); + } else { + coeffts_cos = mm256_uni_loadu_tail_ps(current_rotation_coeffts_cos_ptr, num_vectorized_elements_per_iteration); + coeffts_sin = mm256_uni_loadu_tail_ps(current_rotation_coeffts_sin_ptr, num_vectorized_elements_per_iteration); + + cache_values_x = mm256_uni_loadu_tail_ps(current_x_values_ptr, num_vectorized_elements_per_iteration); + cache_values_y = mm256_uni_loadu_tail_ps(current_y_values_ptr, num_vectorized_elements_per_iteration); + } + + result_x = _mm256_fmadd_ps(cache_values_x, coeffts_cos, result_x); + result_x = _mm256_fnmadd_ps(cache_values_y, coeffts_sin, result_x); // negative multiply-add + + result_y = _mm256_fmadd_ps(cache_values_x, coeffts_sin, result_y); + result_y = _mm256_fmadd_ps(cache_values_y, coeffts_cos, result_y); + + if (!is_tail) { + mm256_uni_storeu_ps(current_x_values_ptr, result_x); + mm256_uni_storeu_ps(current_y_values_ptr, result_y); + } else { + mm256_uni_storeu_tail_ps(current_x_values_ptr, result_x, num_vectorized_elements_per_iteration); + mm256_uni_storeu_tail_ps(current_y_values_ptr, result_y, num_vectorized_elements_per_iteration); + } +} +#endif + +template +inline static void rotate_kv_cache_block_opt(CT* cache_block_ptr, + float* block_rotation_coefficients_ptr, + size_t num_heads, + size_t block_size, + size_t embedding_size) { +#if !defined(HAVE_AVX2) && !defined(HAVE_AVX512F) + OPENVINO_THROW("host CPU must support either AVX2 or AVX512 instructions"); +#else + bool is_tail = false; + +# if defined(HAVE_AVX512F) + constexpr size_t vec_len_in_f32_elts = ov::Extensions::Cpu::XARCH::vec_len_f32_avx512; +# else // HAVE_AVX2 + constexpr size_t vec_len_in_f32_elts = ov::Extensions::Cpu::XARCH::vec_len_f32_avx2; +# endif // defined(HAVE_AVX512F) + + size_t num_processed_elements_per_iteration = + 2 * vec_len_in_f32_elts; // implementations act on pairs of cache values at once using separate registers, each + // elt is expanded to f32 on load + size_t num_iterations = embedding_size / num_processed_elements_per_iteration; + + if (embedding_size >= num_processed_elements_per_iteration) { + OPENVINO_ASSERT(!(num_processed_elements_per_iteration % vec_len_in_f32_elts)); + } else { + is_tail = true; + OPENVINO_ASSERT(!(embedding_size % 2)); + num_processed_elements_per_iteration = embedding_size; + num_iterations = 1; + } + + CT* current_cache_element_ptr = cache_block_ptr; + + for (size_t head_idx = 0; head_idx < num_heads; head_idx++) { + // the rotation coefficients are taken to be the same for all heads + float* current_rotation_coeffts_ptr = block_rotation_coefficients_ptr; + for (size_t tok_idx = 0; tok_idx < block_size; + tok_idx++, current_cache_element_ptr += embedding_size, current_rotation_coeffts_ptr += embedding_size) { + CT* current_x_values_ptr = current_cache_element_ptr; + CT* current_y_values_ptr = current_cache_element_ptr + embedding_size / 2; + + float* current_rotation_coeffts_cos_ptr = current_rotation_coeffts_ptr; + float* current_rotation_coeffts_sin_ptr = current_rotation_coeffts_ptr + embedding_size / 2; + + for (size_t iter_idx = 0; iter_idx < num_iterations; iter_idx++, + current_x_values_ptr += vec_len_in_f32_elts, + current_y_values_ptr += vec_len_in_f32_elts, + current_rotation_coeffts_cos_ptr += vec_len_in_f32_elts, + current_rotation_coeffts_sin_ptr += 
vec_len_in_f32_elts) { +# if defined(HAVE_AVX512F) + rotate_kv_cache_chunk_avx512(current_x_values_ptr, + current_y_values_ptr, + current_rotation_coeffts_cos_ptr, + current_rotation_coeffts_sin_ptr, + num_processed_elements_per_iteration / 2, + is_tail); +# else // HAVE_AVX2 + rotate_kv_cache_chunk_avx2(current_x_values_ptr, + current_y_values_ptr, + current_rotation_coeffts_cos_ptr, + current_rotation_coeffts_sin_ptr, + num_processed_elements_per_iteration / 2, + is_tail); +# endif // defined(HAVE_AVX512F) + } + } + } +#endif // !defined(HAVE_AVX512F) && !defined(HAVE_AVX2F) +} + +template +inline static void rotate_kv_cache_block_ref(CT* cache_block_ptr, + float* block_rotation_coefficients_ptr, + size_t num_heads, + size_t block_size, + size_t embedding_size) { + for (size_t head_idx = 0; head_idx < num_heads; head_idx++) { + for (size_t tok_idx = 0; tok_idx < block_size; tok_idx++) { + size_t token_offset = embedding_size * tok_idx; + CT* token_embedding_data_start_in_cache = + cache_block_ptr + head_idx * embedding_size * block_size + embedding_size * tok_idx; + float* token_data_start_in_rotation_coefficients = block_rotation_coefficients_ptr + token_offset; + for (size_t embedding_pair_idx = 0; embedding_pair_idx < embedding_size / 2; embedding_pair_idx++) { + // NB: below is the llama-style rotation (x-like values are in the first half of the embedding vector, + // y-like values are in the second half), which is different from the original RoFormer style (x- and y- + // values are interleaved), but still preserves the relative positional encoding property + CT* cache_value_0_ptr = token_embedding_data_start_in_cache + embedding_pair_idx; + CT* cache_value_1_ptr = cache_value_0_ptr + (embedding_size / 2); + + float rotation_value_cos = token_data_start_in_rotation_coefficients[embedding_pair_idx]; + float rotation_value_sin = + token_data_start_in_rotation_coefficients[embedding_pair_idx + (embedding_size / 2)]; + + CT cache_value_0 = *cache_value_0_ptr; + CT cache_value_1 = *cache_value_1_ptr; + + *cache_value_0_ptr = cache_value_0 * rotation_value_cos - cache_value_1 * rotation_value_sin; + *cache_value_1_ptr = cache_value_0 * rotation_value_sin + cache_value_1 * rotation_value_cos; + } + } + } +} + +template +inline static void rotate_kv_cache_block(CT* cache_block_ptr, + float* block_rotation_coefficients_ptr, + size_t num_heads, + size_t block_size, + size_t embedding_size) { +#if defined(HAVE_AVX512F) || defined(HAVE_AVX2) + rotate_kv_cache_block_opt(cache_block_ptr, block_rotation_coefficients_ptr, num_heads, block_size, embedding_size); +#else + rotate_kv_cache_block_ref(cache_block_ptr, block_rotation_coefficients_ptr, num_heads, block_size, embedding_size); +#endif // defined(HAVE_AVX512F) || defined(HAVE_AVX2) +} + +template <> +inline void rotate_kv_cache_block(uint8_t* cache_block_ptr, + float* block_rotation_coefficients_ptr, + size_t num_heads, + size_t block_size, + size_t embedding_size) { + OPENVINO_THROW("cache rotation is not implemented for INT8"); +} diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index cb1cd24f840bfd..8b17b3ba8fb544 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -151,19 +151,30 @@ inline void mm512_uni_storeu_tail_ps(ov::float16* addr, __m512 v, size_t count) #endif #ifdef HAVE_AVX2 +inline __m128i get_8bit_tail_mask_for_16bit_elts(size_t 
num_16bit_tail_elts) { + // num_tail_elts may take from 0 to 8 + static int8_t masks[9][16] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}}; + return _mm_loadu_si128(reinterpret_cast<__m128i*>(masks[num_16bit_tail_elts])); +} inline __m256i get_mask(int N7) { - static __m256i mask[] = { - _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1), - }; - return _mm256_loadu_si256(&mask[N7]); + static int32_t masks[9][8] = {{0, 0, 0, 0, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1}}; + return _mm256_loadu_si256(reinterpret_cast<__m256i*>(masks[N7])); } // load addr to __m256 reg @@ -207,7 +218,7 @@ inline void mm256_uni_storeu_ps(float* a, __m256 v) { _mm256_storeu_ps(a, v); } -inline void mm256_uni_storeu_ps(ov::bfloat16* addr, __m256 xps) { +inline __m128i __convert_avx2_packed_float_to_packed_ov_bfloat16(__m256 xps) { __m256i xpi32 = _mm256_castps_si256(xps); __m256i nan = _mm256_set1_epi32(0xffff); __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(xps, xps, _CMP_ORD_Q)); @@ -220,6 +231,11 @@ inline void mm256_uni_storeu_ps(ov::bfloat16* addr, __m256 xps) { x = _mm256_packus_epi32(x, x); x = _mm256_permute4x64_epi64(x, 0xd8); __m128i bf16_o = _mm256_extractf128_si256(x, 0); + return bf16_o; +} + +inline void mm256_uni_storeu_ps(ov::bfloat16* addr, __m256 xps) { + __m128i bf16_o = __convert_avx2_packed_float_to_packed_ov_bfloat16(xps); _mm_storeu_si128(reinterpret_cast<__m128i*>(addr), bf16_o); } @@ -230,10 +246,22 @@ inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { // store __m256 to addr inline void mm256_uni_storeu_tail_ps(float* addr, __m256 v, size_t count) { - const auto mask = get_mask(count); + auto mask = get_mask(count); return _mm256_maskstore_ps(addr, mask, v); } +inline void mm256_uni_storeu_tail_ps(ov::float16* addr, __m256 v, size_t count) { + auto mask = get_8bit_tail_mask_for_16bit_elts(count); + __m128i vec_f16 = _mm256_cvtps_ph(v, 0); + return _mm_maskmoveu_si128(vec_f16, mask, reinterpret_cast(addr)); +} + +inline void mm256_uni_storeu_tail_ps(ov::bfloat16* addr, __m256 v, size_t count) { + auto mask = get_8bit_tail_mask_for_16bit_elts(count); + __m128i bf16_o = __convert_avx2_packed_float_to_packed_ov_bfloat16(v); + return _mm_maskmoveu_si128(bf16_o, mask, reinterpret_cast(addr)); +} + inline void hsum(__m256& x) { __m256 y; // x: 0 1 2 3 4 5 6 7 y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp 
b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index e67c0312bf67cc..dec4650dc548c1 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -16,16 +16,22 @@ #include "attn_memcpy.hpp" #include "attn_quant.hpp" #include "attn_quant_kernel.hpp" +#include "cache_rotation.hpp" #include "common.hpp" #include "executor_pa.hpp" #include "executor_pa_common.hpp" -#include "nodes/kernels/x64/brgemm_kernel.hpp" #include "openvino/core/parallel.hpp" #include "openvino/core/type/bfloat16.hpp" #include "openvino/core/type/float16.hpp" #include "softmax_kernel.hpp" #include "transpose_kernel.hpp" #include "utils/plain_tensor.hpp" +#if defined(OPENVINO_ARCH_X86_64) +# include "nodes/kernels/x64/brgemm_kernel.hpp" +#elif defined(OPENVINO_ARCH_ARM64) && defined(HAVE_SVE) +# include "arm_sve.h" +# include "nodes/kernels/aarch64/brgemm_kernel.hpp" +#endif namespace ov { namespace Extensions { @@ -36,7 +42,7 @@ using namespace ov; using namespace ov::intel_cpu; // currently depends on brgemm which only support x64 -#ifdef OPENVINO_ARCH_X86_64 +#if defined(OPENVINO_ARCH_X86_64) || (defined(OPENVINO_ARCH_ARM64) && defined(HAVE_SVE)) # if defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -1150,6 +1156,66 @@ static void pack_32NxK(TDST* dst, OPENVINO_THROW("pack_32NxK: should not be called."); } +template +void fill_rotation_coefficients_from_lut(T* rotation_coefficients_block_data, + const int32_t* rotation_deltas_block_data, + size_t rotation_deltas_token_stride, + const T* rotation_trig_lut, + size_t block_size, + size_t embedding_size) { + size_t dst_offset = 0; + for (size_t tok_idx = 0; tok_idx < block_size; tok_idx++) { + size_t gather_idx = *(rotation_deltas_block_data + rotation_deltas_token_stride * tok_idx); + size_t src_offset = gather_idx * embedding_size; + std::memcpy(rotation_coefficients_block_data + dst_offset, + rotation_trig_lut + src_offset, + embedding_size * sizeof(T)); + dst_offset += embedding_size; + } +} + +template +void rotate_kv_cache(PlainTensor& key_cache, + const PlainTensor& rotated_block_indices, + const PlainTensor& rotation_deltas, + const PlainTensor& rotation_trig_lut, + PlainTensor& rotation_coefficients_scratch) { + size_t num_blocks_in_total = key_cache.size(0); + size_t num_heads = key_cache.size(1); // H; + size_t block_size = key_cache.size(2); + size_t embedding_size = key_cache.size(3); // S; + + size_t num_rotated_blocks = rotated_block_indices.size(0); + int32_t* rotated_block_indices_data = rotated_block_indices.ptr(); + float* rotation_trig_lut_data = rotation_trig_lut.ptr(); + + size_t rotation_deltas_token_stride = 0; + size_t rotation_deltas_block_stride = 1; + + bool is_per_token = (rotation_deltas.shape()[1] == block_size); + if (is_per_token) { + rotation_deltas_token_stride = 1; + rotation_deltas_block_stride = block_size; + } + + for (size_t i = 0; i < num_rotated_blocks; i++) { + size_t rotated_block_index = *(rotated_block_indices_data + i); + OPENVINO_ASSERT(rotated_block_index < num_blocks_in_total); + + int32_t* rotation_deltas_block_data = rotation_deltas.ptr() + i * rotation_deltas_block_stride; + + float* rotation_coefficient_block_data = rotation_coefficients_scratch.ptr(); + fill_rotation_coefficients_from_lut(rotation_coefficient_block_data, + rotation_deltas_block_data, + rotation_deltas_token_stride, + rotation_trig_lut_data, + block_size, + embedding_size); + KVCACHE_TYPE* cache_block_ptr = 
key_cache.ptr(rotated_block_index); + rotate_kv_cache_block(cache_block_ptr, rotation_coefficient_block_data, num_heads, block_size, embedding_size); + } +} + template struct MHAHelper { // initialize once @@ -1180,8 +1246,10 @@ struct MHAHelper { std::vector> _wv_gemm; // will accumulate C buffer std::vector> _wv_gemm_acc; - // second token +// second token +# if defined(OPENVINO_ARCH_X86_64) std::shared_ptr _gemv; +# endif ov::element::Type _fastpath_valid_prec = ov::element::undefined; // second token for bhl loop PlainTensor _weight_bhl; @@ -1189,6 +1257,8 @@ struct MHAHelper { PlainTensor _score_offsets_aligned; PlainTensor _score_offsets; + PlainTensor _block_rotation_coefficient_scratch; + MHAHelper() { _weight.resize({size_t{1}, size_t{1}, size_t{1}, size_t{1}}); } @@ -1208,7 +1278,8 @@ struct MHAHelper { size_t sliding_window, float d_scale, size_t kv_len, - bool init_alibi_lookup) { + bool init_alibi_lookup, + bool init_rotation_coefficient_scratch) { // query shape: [B, H, L, S] // present_key shape: [block, H, 32, S] // Q*K': [M1, S] * [M2, S]' @@ -1283,6 +1354,7 @@ struct MHAHelper { _wv_scratch_a.resize( {_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); +# if defined(OPENVINO_ARCH_X86_64) if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && precision_of::value == ov::element::bf16 && @@ -1299,6 +1371,7 @@ struct MHAHelper { static_cast(block_size), _fastpath_valid_prec); } +# endif } if (init_alibi_lookup && (!_alibi_lookup || _alibi_lookup.m_dims[0] < kv_len)) { @@ -1306,6 +1379,10 @@ struct MHAHelper { for (size_t i = 0; i < _alibi_lookup.m_dims[0]; i++) _alibi_lookup.ptr()[i] = -static_cast((_alibi_lookup.m_dims[0] - 1 - i)); } + + if (init_rotation_coefficient_scratch) { + _block_rotation_coefficient_scratch.resize({_block_size, S}); + } } void init_reorder_buffers(size_t batch, size_t kv_len_in_blocks) { @@ -1496,6 +1573,7 @@ struct MHAHelper { size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { +# if defined(OPENVINO_ARCH_X86_64) if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { @@ -1510,6 +1588,7 @@ struct MHAHelper { } _gemv->tile_release(); } else { +# endif for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { @@ -1523,7 +1602,9 @@ struct MHAHelper { } } } +# if defined(OPENVINO_ARCH_X86_64) } +# endif for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { @@ -1584,12 +1665,13 @@ struct MHAHelper { // batch tokens. It will assume NO mixture execution of first and second token. all tensors such as query... 
have // batch dimension which is DIFFERENT from above // query: [B, H, L, S] - // present_*: [block_number, H, 32, S] + // key_cache: [block_number, H, _block_size, S] + // value_cache: [block_number, H, _block_size, Sv] // output_emb: [B, L, H * S] // 3 loops along batch, head, kv cache length dimensions void exec_loop_bhl(const PlainTensor& query, - const PlainTensor& present_key, - const PlainTensor& present_value, + PlainTensor& key_cache, + PlainTensor& value_cache, const PlainTensor& output_emb, const PlainTensor& output_score, size_t max_context_len, @@ -1642,28 +1724,32 @@ struct MHAHelper { auto pk = pk_in_blocks * _block_size; if (pk < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pk_in_blocks]; +# if defined(OPENVINO_ARCH_X86_64) if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { (*_gemv)(query.ptr(b, h, pq), - present_key.ptr(block_number, hk), + key_cache.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk); } } _gemv->tile_release(); } else { +# endif for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { dot_product_block(query.ptr(b, h, pq), - present_key.ptr(block_number, hk), + key_cache.ptr(block_number, hk), _weight_bhl.ptr(b, h, pq) + pk, _S, std::min(_block_size, context_len - pk), _key_group_size); } } +# if defined(OPENVINO_ARCH_X86_64) } +# endif } }; @@ -1729,12 +1815,11 @@ struct MHAHelper { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = hq_beg; h < hq_end; h++) { - auto sub_byte_multiplier = get_sub_byte_multiplier(present_value.get_precision()); - size_t v_stride = - (block_number * present_value.m_strides[0] + hk * present_value.m_strides[1]) * - present_value.get_precision().size() / sub_byte_multiplier; + auto sub_byte_multiplier = get_sub_byte_multiplier(value_cache.get_precision()); + size_t v_stride = (block_number * value_cache.m_strides[0] + hk * value_cache.m_strides[1]) * + value_cache.get_precision().size() / sub_byte_multiplier; auto* v_ptr = reinterpret_cast::value_type*>( - present_value.m_ptr.get() + v_stride); + value_cache.m_ptr.get() + v_stride); attn_acc_value_block::value_type, VALUE_PREC>( _output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, @@ -1868,7 +1953,7 @@ struct MHA { // one loop to handle first and second tokens void exec_loop_mixed(const PlainTensor& q, - const PlainTensor& k_cache, + PlainTensor& k_cache, const PlainTensor& v_cache, const PlainTensor& output_emb, const PlainTensor& output_score, @@ -2117,6 +2202,9 @@ struct AttentionExecutor : public PagedAttentionExecutor { size_t& sliding_window, PlainTensor& alibi_slopes, size_t& max_context_len, + PlainTensor& rotated_block_indices, + PlainTensor& rotation_deltas, + PlainTensor& rotation_trig_lut, PlainTensor& output_emb, PlainTensor& output_score) { q.reset(inputs[ID_Q]); // [B_token, H * S] @@ -2133,6 +2221,19 @@ struct AttentionExecutor : public PagedAttentionExecutor { if (!inputs[ID_ALIBI_SLOPES]->getShape().hasZeroDims()) alibi_slopes.reset(inputs[ID_ALIBI_SLOPES]); max_context_len = static_cast(*inputs[ID_MAX_CONTEXT_LEN]->getDataAs()); + + size_t inputs_size = inputs.size(); + if (inputs_size > ID_ROTATED_BLOCK_INDICES) { + OPENVINO_ASSERT(inputs_size >= ID_ROTATION_TRIG_LUT); + if (!inputs[ID_ROTATED_BLOCK_INDICES]->getShape().hasZeroDims()) + 
rotated_block_indices.reset(inputs[ID_ROTATED_BLOCK_INDICES]); // [num_blocks] + if (!inputs[ID_ROTATION_DELTAS]->getShape().hasZeroDims()) + rotation_deltas.reset(inputs[ID_ROTATION_DELTAS]); // [num_blocks, block_size (32) || 1] + if (!inputs[ID_ROTATION_TRIG_LUT]->getShape().hasZeroDims()) + rotation_trig_lut.reset( + inputs[ID_ROTATION_TRIG_LUT]); // [max_context_len * embedding_size], row-major layout + } + output_emb.reset(outputs[0]); if (outputs.size() == 2) output_score.reset(outputs[1]); @@ -2189,13 +2290,34 @@ struct AttentionExecutor : public PagedAttentionExecutor { if (alibi_slopes) { alibi_slopes.assert_dims({H}); } + + bool init_rotation_coefficient_scratch = false; + if (rotated_block_indices) { + // Only K entries are needed to be rotated, since position is encoded at the Q^T @ (effective_RoPE_matrix) @ + // K matrix multiplication + rotation_deltas.assert_dims({rotated_block_indices.size(0), 0}, /* special_zero = */ true); + OPENVINO_ASSERT(rotation_deltas.shape()[1] == 1 || + rotation_deltas.shape()[1] == block_size); // per-block or per-token granularity + rotation_trig_lut.assert_dims({0, S}, /* special_zero = */ true); + init_rotation_coefficient_scratch = true; + } output_emb.assert_dims({B_token, H * SV}); output_emb = output_emb.reshape({B_token, 1, H * SV}); // TODO: enable block_size to be multiple of 32 OPENVINO_ASSERT(block_size == 32, "CPU: block size must be 32, current: ", block_size); - _helper.init(H, S, SV, Hk, h_each_group_len, block_size, sliding_window, scale, max_context_len, alibi_slopes); + _helper.init(H, + S, + SV, + Hk, + h_each_group_len, + block_size, + sliding_window, + scale, + max_context_len, + alibi_slopes, + init_rotation_coefficient_scratch); } void concat_pastkv(const PlainTensor& k, @@ -2244,6 +2366,10 @@ struct AttentionExecutor : public PagedAttentionExecutor { size_t sliding_window; PlainTensor alibi_slopes; size_t max_context_len; + PlainTensor rotated_block_indices; + PlainTensor rotation_deltas; + PlainTensor rotation_trig_lut; + PlainTensor output_emb; PlainTensor output_score; @@ -2262,8 +2388,20 @@ struct AttentionExecutor : public PagedAttentionExecutor { sliding_window, alibi_slopes, max_context_len, + rotated_block_indices, + rotation_deltas, + rotation_trig_lut, output_emb, output_score); + + if (rotated_block_indices) { + rotate_kv_cache(k_cache, + rotated_block_indices, + rotation_deltas, + rotation_trig_lut, + _helper._block_rotation_coefficient_scratch); + } + concat_pastkv(k, v, k_cache, v_cache, past_lens, subsequence_begins, block_indices, block_indices_begins); _kernel(q, @@ -2288,7 +2426,7 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ size_t value_group_size) { std::shared_ptr executor; -#ifdef OPENVINO_ARCH_X86_64 +#if defined(OPENVINO_ARCH_X86_64) if (data_type == ov::element::bf16) { # if defined(HAVE_AVX512F) if (key_cache_type == ov::element::u8) { @@ -2358,8 +2496,18 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ } else { OPENVINO_THROW("make_pa_executor: unsupported precision: ", data_type); } +#elif (defined(OPENVINO_ARCH_ARM64) && defined(HAVE_SVE)) + if (data_type == ov::element::f32) { + if (key_cache_type == ov::element::u8 && value_cache_type == ov::element::u8) { + executor = + std::make_shared>(key_group_size, value_group_size); + } else { + OPENVINO_THROW("make_pa_executor: key_cache_type and value_cache_type of u8 is only support"); + } + } + #else - OPENVINO_THROW("make_pa_executor: only support x64 platform"); + OPENVINO_THROW("make_pa_executor: only support x64 
platform or ARM with SVE support"); #endif return executor; } @@ -2367,4 +2515,4 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp index 81c54c84d9453a..66911d4f4e7b1f 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp @@ -21,19 +21,22 @@ namespace Cpu { struct PagedAttentionExecutor { // PagedAttention input index - static const size_t ID_Q = 0; // [B_token, H * S], float - static const size_t ID_K = 1; // [B_token, Hk * S], float - static const size_t ID_V = 2; // [B_token, Hk * S], float - static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float - static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float - static const size_t ID_PAST_LENS = 5; // [B_seq] - static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] - static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] - static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] - static const size_t ID_SCALE = 9; // [], float - static const size_t ID_SLIDING_WINDOW = 10; // [] - static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float - static const size_t ID_MAX_CONTEXT_LEN = 12; // [] + static const size_t ID_Q = 0; // [B_token, H * S], float + static const size_t ID_K = 1; // [B_token, Hk * S], float + static const size_t ID_V = 2; // [B_token, Hk * S], float + static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float + static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float + static const size_t ID_PAST_LENS = 5; // [B_seq] + static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] + static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] + static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] + static const size_t ID_SCALE = 9; // [], float + static const size_t ID_SLIDING_WINDOW = 10; // [] + static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float + static const size_t ID_MAX_CONTEXT_LEN = 12; // [] + static const size_t ID_ROTATED_BLOCK_INDICES = 13; // [num_rotated_blocks || 0], int32 + static const size_t ID_ROTATION_DELTAS = 14; // [num_rotated_blocks * block_size || 0], int32 + static const size_t ID_ROTATION_TRIG_LUT = 15; // [max_context_length * S || 0], f32 virtual void execute(const std::vector& inputs, const std::vector outputs) = 0; virtual ~PagedAttentionExecutor() = default; @@ -107,4 +110,4 @@ class JitMatMulVecAMX : public dnnl::impl::cpu::x64::jit_generator { } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index f42f15ce1e065a..27782970323bdd 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -50,7 +50,7 @@ using namespace ov; #endif template -void cvt_copy(TA* dst, TB* src, size_t n) { +static void cvt_copy(TA* dst, TB* src, size_t n) { size_t i = 0; #if defined(HAVE_AVX512F) for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { @@ -1561,4 +1561,4 
@@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp index 93d7db55107951..c89e807bae7fa5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp @@ -11,6 +11,10 @@ #include "common.hpp" #include "openvino/core/type/element_type.hpp" +#if defined(HAVE_SVE) +# include "arm_sve.h" +#endif + namespace ov { namespace Extensions { namespace Cpu { @@ -593,6 +597,116 @@ inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_strid } } +#elif defined(HAVE_SVE) +template +inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size_t src_stride) { + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + dst[i * dst_stride + j] = static_cast(src[i + j * src_stride]); + } + } +} + +template +inline void transpose_16xK_kernel(TDST* dst, TSRC* src, size_t K, size_t dst_stride, size_t src_stride) { + for (size_t i = 0; i < K; i++) { + for (size_t j = 0; j < 16; j++) { + dst[i * dst_stride + j] = static_cast(src[i + j * src_stride]); + } + } +} + +inline void transpose_8x8_kernel(float* src, size_t ld_src, float* dst, size_t ld_dst) { + // load from src to registers + // a: a0 a1 a2 a3 a4 a5 a6 a7 + // b: b0 b1 b2 b3 b4 b5 b6 b7 + // c: c0 c1 c2 c3 c4 c5 c6 c7 + // d: d0 d1 d2 d3 d4 d5 d6 d7 + // e: e0 e1 e2 e3 e4 e5 e6 e7 + // f: f0 f1 f2 f3 f4 f5 f6 f7 + // g: g0 g1 g2 g3 g4 g5 g6 g7 + // h: h0 h1 h2 h3 h4 h5 h6 h7 + svfloat32_t a = svld1_f32(svptrue_b8(), &src[0 * ld_src]); + svfloat32_t b = svld1_f32(svptrue_b8(), &src[1 * ld_src]); + svfloat32_t c = svld1_f32(svptrue_b8(), &src[2 * ld_src]); + svfloat32_t d = svld1_f32(svptrue_b8(), &src[3 * ld_src]); + svfloat32_t e = svld1_f32(svptrue_b8(), &src[4 * ld_src]); + svfloat32_t f = svld1_f32(svptrue_b8(), &src[5 * ld_src]); + svfloat32_t g = svld1_f32(svptrue_b8(), &src[6 * ld_src]); + svfloat32_t h = svld1_f32(svptrue_b8(), &src[7 * ld_src]); + // unpacking and interleaving 32-bit elements + // a0 b0 a1 b1 a4 b4 a5 b5 + // a2 b2 a3 b3 a6 b6 a7 b7 + // c0 d0 c1 d1 ... + // c2 d2 c3 d3 ... + // e0 f0 e1 f1 ... + // e2 f2 e3 f3 ... + // g0 h0 g1 h1 ... + // g2 h2 g3 h3 ... + svfloat32_t ta = svtrn1_f32(a, b); + svfloat32_t tb = svtrn2_f32(a, b); + svfloat32_t tc = svtrn1_f32(c, d); + svfloat32_t td = svtrn2_f32(c, d); + svfloat32_t te = svtrn1_f32(e, f); + svfloat32_t tf = svtrn2_f32(e, f); + svfloat32_t tg = svtrn1_f32(g, h); + svfloat32_t th = svtrn2_f32(g, h); + // unpacking and interleaving 64-bit elements + // a0 b0 c0 d0 a4 b4 c4 d4 + // a1 b1 c1 d1 ... + // a2 b2 c2 d2 ... + // a3 b3 c3 d3 ... + // e0 f0 g0 h0 e4 f4 g4 h4 + // e1 f1 g1 h1 ... + // e2 f2 g2 h2 ... + // e3 f3 g3 h3 ... 
+ a = svreinterpret_f32_f64(svtrn1_f64(svreinterpret_f64_f32(ta), svreinterpret_f64_f32(tc))); + b = svreinterpret_f32_f64(svtrn2_f64(svreinterpret_f64_f32(ta), svreinterpret_f64_f32(tc))); + c = svreinterpret_f32_f64(svtrn1_f64(svreinterpret_f64_f32(tb), svreinterpret_f64_f32(td))); + d = svreinterpret_f32_f64(svtrn2_f64(svreinterpret_f64_f32(tb), svreinterpret_f64_f32(td))); + e = svreinterpret_f32_f64(svtrn1_f64(svreinterpret_f64_f32(te), svreinterpret_f64_f32(tg))); + f = svreinterpret_f32_f64(svtrn2_f64(svreinterpret_f64_f32(te), svreinterpret_f64_f32(tg))); + g = svreinterpret_f32_f64(svtrn1_f64(svreinterpret_f64_f32(tf), svreinterpret_f64_f32(th))); + h = svreinterpret_f32_f64(svtrn2_f64(svreinterpret_f64_f32(tf), svreinterpret_f64_f32(th))); + // shuffle 128-bits (composed of 4 32-bit elements) + // a0 b0 c0 d0 e0 f0 g0 h0 + // a1 b1 c1 d1 ... + // a2 b2 c2 d2 ... + // a3 b3 c3 d3 ... + // a4 b4 c4 d4 ... + // a5 b5 c5 d5 ... + // a6 b6 c6 d6 ... + // a7 b7 c7 d7 ... + svfloat32_t t1a = svext_f32(a, a, 4); + svfloat32_t t1b = svext_f32(b, b, 4); + svfloat32_t t1c = svext_f32(c, c, 4); + svfloat32_t t1d = svext_f32(d, d, 4); + ta = svext_f32(t1a, e, 4); + tb = svext_f32(t1b, f, 4); + tc = svext_f32(t1c, g, 4); + td = svext_f32(t1d, h, 4); + te = svsel_f32(svptrue_pat_b32(SV_VL4), t1a, e); + tf = svsel_f32(svptrue_pat_b32(SV_VL4), t1b, f); + tg = svsel_f32(svptrue_pat_b32(SV_VL4), t1c, g); + th = svsel_f32(svptrue_pat_b32(SV_VL4), t1d, h); + // Store the transposed result in destination + svst1_f32(svptrue_b8(), &dst[0 * ld_dst], ta); + svst1_f32(svptrue_b8(), &dst[1 * ld_dst], tc); + svst1_f32(svptrue_b8(), &dst[2 * ld_dst], tb); + svst1_f32(svptrue_b8(), &dst[3 * ld_dst], td); + svst1_f32(svptrue_b8(), &dst[4 * ld_dst], te); + svst1_f32(svptrue_b8(), &dst[5 * ld_dst], tg); + svst1_f32(svptrue_b8(), &dst[6 * ld_dst], tf); + svst1_f32(svptrue_b8(), &dst[7 * ld_dst], th); +} +template <> +inline void transpose_16x16_kernel(float* dst, float* src, size_t dst_stride, size_t src_stride) { + transpose_8x8_kernel(src, src_stride, dst, dst_stride); + transpose_8x8_kernel(src + 8, src_stride, dst + 8 * dst_stride, dst_stride); + transpose_8x8_kernel(src + 8 * src_stride, src_stride, dst + 8, dst_stride); + transpose_8x8_kernel(src + 8 * src_stride + 8, src_stride, dst + 8 * dst_stride + 8, dst_stride); +} + #else template diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index 54aa80e9dff7c0..98be1a12517441 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -82,7 +82,8 @@ void PagedAttention::initSupportedPrimitiveDescriptors() { creatorsMap.at(LayoutType::ncsp) ->createSharedDesc(rtPrecision, getInputShapeAtPort(PagedAttentionExecutor::ID_V))); - OPENVINO_ASSERT(orgInputNumber == 13, "The input number of PagedAttention should be 13."); + OPENVINO_ASSERT(orgInputNumber == 13 || orgInputNumber == 16, + "The input number of PagedAttention should be 13 or 16."); // kvcache, float, [] auto past_key_input_mem_precision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); auto past_value_input_mem_precision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE); @@ -130,6 +131,23 @@ void PagedAttention::initSupportedPrimitiveDescriptors() { config.outConfs[1].setMemDesc( creatorsMap.at(LayoutType::ncsp)->createSharedDesc(ov::element::f32, getOutputShapeAtPort(1))); + if (orgInputNumber == 16) { + // rotated_block_indices, int, 
[num_rotated_blocks || 0] + config.inConfs[PagedAttentionExecutor::ID_ROTATED_BLOCK_INDICES].setMemDesc( + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(ov::element::i32, + getInputShapeAtPort(PagedAttentionExecutor::ID_ROTATED_BLOCK_INDICES))); + // rotation_deltas, int, [num_rotated_blocks, block_size || 1] || [0] + config.inConfs[PagedAttentionExecutor::ID_ROTATION_DELTAS].setMemDesc( + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(ov::element::i32, getInputShapeAtPort(PagedAttentionExecutor::ID_ROTATION_DELTAS))); + // rotation_trig_lut, float, [max_context_len, embedding_size (aka S) || 0] + config.inConfs[PagedAttentionExecutor::ID_ROTATION_TRIG_LUT].setMemDesc( + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(ov::element::f32, + getInputShapeAtPort(PagedAttentionExecutor::ID_ROTATION_TRIG_LUT))); + } + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref_any); } @@ -140,7 +158,7 @@ void PagedAttention::createPrimitive() { PagedAttentionKey key = {rtPrecision}; auto builder = [&](const PagedAttentionKey& key) -> std::shared_ptr { -#ifdef OPENVINO_ARCH_X86_64 +#if defined(OPENVINO_ARCH_X86_64) || (defined(OPENVINO_ARCH_ARM64)) // Since we are quantize only last dim it's safe to use the last dim of KV. auto kCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); auto vCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE); diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp b/src/plugins/intel_cpu/src/nodes_factory.cpp index ff27a0e4246baf..400e4946312330 100644 --- a/src/plugins/intel_cpu/src/nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/nodes_factory.cpp @@ -232,6 +232,8 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") { INTEL_CPU_NODE(MHA, Type::MHA); INTEL_CPU_NODE(PagedAttention, Type::PagedAttention); INTEL_CPU_NODE(RMSNorm, Type::RMS); +#elif defined(OPENVINO_ARCH_ARM64) + INTEL_CPU_NODE(PagedAttention, Type::PagedAttention); #endif } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 16df97bb209ed9..8282fec8fdf6c3 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -20,7 +20,7 @@ bool pass::AdjustBrgemmCopyBLoopPorts::update_loop_info( bool modified = false; auto caller = [&](snippets::lowered::LoopPort& loop_port, snippets::lowered::UnifiedLoopInfo::LoopPortDesc& loop_desc) { - const auto& p = *loop_port.expr_port; + const auto& p = *loop_port.get_expr_port(); if (p.get_type() == snippets::lowered::ExpressionPort::Input && p.get_index() == 1) { const auto& node = p.get_expr()->get_node(); if (auto brg = as_type_ptr(node)) { @@ -31,9 +31,9 @@ bool pass::AdjustBrgemmCopyBLoopPorts::update_loop_info( * 2) Zero padding is applied if N4k < 256 or N2k < 64 */ if (brgemm_utils::with_repacking(brg->get_type()) && precision != element::f32 && - loop_port.is_incremented) { + loop_port.is_incremented()) { // K blocking loop: account for zero padding - if (loop_port.dim_idx == 1) { + if (loop_port.get_dim_idx() == 1) { const auto ptr_incr = loop_desc.ptr_increment; const auto blocked_shape_ptr_inc = brgemm_utils::repacking::compute_LDB(ptr_incr, precision); if (ptr_incr != 0 && ptr_incr != blocked_shape_ptr_inc) { @@ 
-44,7 +44,7 @@ bool pass::AdjustBrgemmCopyBLoopPorts::update_loop_info( loop_desc.ptr_increment * (loop_desc.finalization_offset / ptr_incr); } // N blocking loop: account for the VNNI format - } else if (loop_port.dim_idx == 0) { + } else if (loop_port.get_dim_idx() == 0) { auto k_blk_size = static_cast(brgemm_utils::compute_vnni_factor(precision)); loop_desc.ptr_increment = snippets::utils::dynamic_safe_mul(loop_desc.ptr_increment, k_blk_size); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index f0f87457364e0f..9a7dd2dbe727c7 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -122,13 +122,13 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, loop_info->replace_with_new_ports(in_ports[1], {in_ports[1], new_port}); }; if (!is_full_dim_value(m_block)) - update_loop_info({compens_port, false, 1}); + update_loop_info(LoopPort::create(compens_port)); if (!is_full_dim_value(n_block)) - update_loop_info({compens_port, true, 0}); + update_loop_info(LoopPort::create(compens_port, 0)); if (!is_full_dim_value(k_block)) - update_loop_info({compens_port, false, 1}); + update_loop_info(LoopPort::create(compens_port, 1)); } return true; } diff --git a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp index 2a85714a792655..dcd97fdd74b638 100644 --- a/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp +++ b/src/plugins/intel_cpu/src/transformations/tpp/x64/pass/lowered/set_tpp_leading_dim.cpp @@ -16,6 +16,7 @@ namespace tpp { namespace pass { namespace { using ExpressionPort = snippets::lowered::ExpressionPort; +using LoopPort = snippets::lowered::LoopPort; // Note: Buffer is directly connected to the port if it remains in the same loops with the port's expression // Directly connected Buffers store data densely, so strides are defined by subtensor dims // Indirectly connected Buffers (with loops between the expr and Buffer) store data according @@ -23,8 +24,8 @@ using ExpressionPort = snippets::lowered::ExpressionPort; bool has_directly_connected_buffer(const ExpressionPort& port, const snippets::lowered::LoopManagerPtr& loop_mngr) { auto accepted_loops = [&loop_mngr, &port](const std::vector& orig, const std::vector& connect) { size_t connect_idx = 0; - auto pred = [&port](const snippets::lowered::LoopPort& loop_port ) { - return *loop_port.expr_port == port; + auto pred = [&port](const LoopPort& loop_port ) { - return *loop_port.expr_port == port; + return *loop_port.get_expr_port() == port; }; for (const auto orig_loop : orig) { if (connect_idx < connect.size() && orig_loop == connect[connect_idx]) { @@ -39,7 +40,7 @@ bool has_directly_connected_buffer(const ExpressionPort& port, const snippets::l loop_info->get_input_ports() : loop_info->get_output_ports(); const auto& found = std::find_if(border_points.begin(), border_points.end(), pred); - if (found == border_points.end() || found->is_incremented) + if (found == border_points.end() || found->is_incremented()) return false; } return true; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 
ddd546f151e006..dfa0315cbf1852 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1064,29 +1064,17 @@ void Transformations::MainSnippets(void) { ((in_type0 == element::f32 && in_type1 == ov::element::f32 && config.inferencePrecision == ov::element::bf16)); const auto is_int8 = in_type0 == ov::element::i8; - if (is_fp32) - return true; - // Only FP32 dynamic MHA is supported - if (matmul->is_dynamic()) - return false; if (matmul->get_transpose_a()) return false; - // [150842] The execution of Brgemm INT8/BF16/FP16 on AMX platforms depends on the value of "K % VNNIFactor". - // For more details, please teake a look at the ticket 150842 - if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { - const auto& b_shape = matmul->get_input_partial_shape(1); - const auto K = matmul->get_transpose_b() ? *b_shape.rbegin() : *++b_shape.rbegin(); - const size_t brgemm_vnni_factor_for_real16 = 2; // 4/2(size in term of byte for bf16/fp16) - if (is_bf16 || is_fp16) - return K.is_static() && (K.get_length() % brgemm_vnni_factor_for_real16 == 0); - if (is_int8) - return K.is_static(); - } + if (is_fp32) + return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2); if (is_int8) - return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni) || + return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) || + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_vnni) || dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni); if (is_bf16) - return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16); + return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) || + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16); if (is_fp16) return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx_fp16); return true; diff --git a/src/plugins/intel_cpu/tests/unit/CMakeLists.txt b/src/plugins/intel_cpu/tests/unit/CMakeLists.txt index 63441b504735b0..81645f4fc87553 100644 --- a/src/plugins/intel_cpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_cpu/tests/unit/CMakeLists.txt @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 # +add_subdirectory(vectorized) + set(TARGET_NAME ov_cpu_unit_tests) if(BUILD_SHARED_LIBS) @@ -52,6 +54,8 @@ ov_add_test_target( $/include EXCLUDED_SOURCE_PATHS ${EXCLUDED_SOURCE_PATHS_FOR_UNIT_TEST} + ${CMAKE_CURRENT_SOURCE_DIR}/vectorized + OBJECT_FILES ${OBJ_LIB} LINK_LIBRARIES @@ -78,6 +82,7 @@ if (ENABLE_SNIPPETS_LIBXSMM_TPP) target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $) endif() + # LTO set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp index 738afba6a101f9..fc6783f3b3ca45 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/brgemm_blocking.cpp @@ -24,6 +24,7 @@ using namespace ov::snippets::lowered; using namespace ov::snippets::lowered::pass; using namespace ov::snippets; using BRGEMM_TYPE = intel_cpu::brgemm_utils::BRGEMM_TYPE; +using PortType = LoopPort::Type; namespace { enum class BACKEND_TYPE{CPU, TPP}; @@ -56,28 +57,28 @@ void 
create_brgemm_loop_infos(const LinearIRPtr& linear_ir, if (k_block) { const auto loop_info = std::make_shared(k, k_blk, - std::vector{LoopPort(brgemm_expr->get_input_port(0)), - LoopPort(brgemm_expr->get_input_port(1), true, 1)}, - std::vector{LoopPort(brgemm_expr->get_output_port(0), false)}, + std::vector{LoopPort::create(brgemm_expr->get_input_port(0), 0), + LoopPort::create(brgemm_expr->get_input_port(1), 1)}, + std::vector{LoopPort::create(brgemm_expr->get_output_port(0))}, get_k_loop_handlers(k, k_block, backend)); linear_ir->get_loop_manager()->add_loop_info(loop_info); } if (n_block) { linear_ir->get_loop_manager()->add_loop_info( std::make_shared(n, n_blk, - std::vector{LoopPort(brgemm_expr->get_input_port(0), false), - LoopPort(brgemm_expr->get_input_port(1))}, - std::vector{LoopPort(brgemm_expr->get_output_port(0))}, + std::vector{LoopPort::create(brgemm_expr->get_input_port(0)), + LoopPort::create(brgemm_expr->get_input_port(1))}, + std::vector{LoopPort::create(brgemm_expr->get_output_port(0))}, BrgemmBlockingBase::get_default_blocking_loop_handlers(n, n_block))); } if (m_block) { - std::vector entries{LoopPort(brgemm_expr->get_input_port(0), true, 1)}; + std::vector entries{LoopPort::create(brgemm_expr->get_input_port(0), 1)}; for (size_t i = 1; i < brgemm_expr->get_input_count(); ++i) - entries.emplace_back(brgemm_expr->get_input_port(i), false, 1); + entries.push_back(LoopPort::create(brgemm_expr->get_input_port(i))); linear_ir->get_loop_manager()->add_loop_info( std::make_shared(m, m_blk, entries, - std::vector{LoopPort(brgemm_expr->get_output_port(0), true, 1)}, + std::vector{LoopPort::create(brgemm_expr->get_output_port(0), 1)}, BrgemmBlockingBase::get_default_blocking_loop_handlers(m, m_block))); } } @@ -269,8 +270,8 @@ TEST_F(BrgemmCPUBlockingTest, WithCompensations) { loop_info->replace_with_new_ports(in_ports[1], {in_ports[1], new_port}); }; const auto& compens_port = brgemm_expr->get_input_port(2); - update_loop_info(1, {compens_port, true, 0}); - update_loop_info(0, {compens_port, false, 1}); + update_loop_info(1, LoopPort::create(compens_port, 0)); + update_loop_info(0, LoopPort::create(compens_port, 1)); brgemm_expr->set_loop_ids({2, 1, 0}); auto result = linear_ir_ref->push_node(brgemm.second); @@ -314,9 +315,9 @@ TEST_F(BrgemmCPUBlockingTest, AMX) { init_expr_descriptors(brgemm_expr, {{m_blk, k_blk}, {k_blk, n_blk}, get_default_subtensor(), {m_blk, n_blk}}); create_brgemm_loop_infos(linear_ir_ref, brgemm_expr, m, 0, k, k_blk, n, n_blk); - std::vector entries {LoopPort(brgemm_expr->get_input_port(0), true, 1), - LoopPort(brgemm_expr->get_input_port(1), false, 1)}; - std::vector exits {LoopPort(brgemm_expr->get_output_port(0), true, 1)}; + std::vector entries {LoopPort::create(brgemm_expr->get_input_port(0), 1), + LoopPort::create(brgemm_expr->get_input_port(1))}; + std::vector exits {LoopPort::create(brgemm_expr->get_output_port(0), 1)}; auto handlers = BrgemmBlockingBase::get_default_blocking_loop_handlers(m, m_blk); linear_ir_ref->get_loop_manager()-> add_loop_info(std::make_shared(m, m_blk, entries, exits, handlers)); diff --git a/src/plugins/intel_cpu/tests/unit/vectorized/CMakeLists.txt b/src/plugins/intel_cpu/tests/unit/vectorized/CMakeLists.txt new file mode 100644 index 00000000000000..7e64c24f604a85 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/vectorized/CMakeLists.txt @@ -0,0 +1,89 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME ov_cpu_unit_tests_vectorized) + +if(BUILD_SHARED_LIBS) + 
set (OBJ_LIB $) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + ov_add_compiler_flags(/wd5051) +endif() + +if(NOT X86_64) + list(APPEND EXCLUDED_SOURCE_PATHS_FOR_UNIT_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/paged_attn_cache_rotation.cpp) +else() + list(APPEND EXCLUDED_SOURCE_PATHS_FOR_UNIT_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/stub.cpp) +endif() + +if (ENABLE_MLAS_FOR_CPU) + set(MLAS_LIBRARY "mlas") +endif() + +if (ENABLE_SHL_FOR_CPU) + set(SHL_LIBRARY "shl") +endif() + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + INCLUDES + PUBLIC + $/src + $/src/nodes + $ + PRIVATE + $/include + EXCLUDED_SOURCE_PATHS + ${EXCLUDED_SOURCE_PATHS_FOR_UNIT_TEST} + OBJECT_FILES + ${OBJ_LIB} + LINK_LIBRARIES + gtest + gtest_main + dnnl + gmock + openvino_runtime_s + unit_test_utils + ov_snippets_models + snippets_test_utils + ${MLAS_LIBRARY} + ${SHL_LIBRARY} + ADD_CPPLINT + LABELS + OV UNIT CPU +) + + +if (ENABLE_SNIPPETS_LIBXSMM_TPP) + add_definitions(-DSNIPPETS_LIBXSMM_TPP -DLIBXSMM_DEFAULT_CONFIG) + target_compile_definitions(xsmm PRIVATE __BLAS=0) + target_link_libraries(${TARGET_NAME} PRIVATE xsmm) + target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $) +endif() + +if (X86_64) + ov_avx2_optimization_flags(avx2_flags) + ov_avx512_optimization_flags(avx512_flags) + + target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags};${avx512_flags}") + target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_AVX2 HAVE_AVX512F) +endif() + + +if (WIN32) + # Prevents defining min/max as macros + target_compile_definitions(${TARGET_NAME} PRIVATE NOMINMAX) +endif() + +target_include_directories(${TARGET_NAME} SYSTEM PRIVATE + $) + +target_include_directories(${TARGET_NAME} SYSTEM PRIVATE + $/src/common + $/src/cpu + $/include) diff --git a/src/plugins/intel_cpu/tests/unit/vectorized/paged_attn_cache_rotation.cpp b/src/plugins/intel_cpu/tests/unit/vectorized/paged_attn_cache_rotation.cpp new file mode 100644 index 00000000000000..870c5c576a73e1 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/vectorized/paged_attn_cache_rotation.cpp @@ -0,0 +1,509 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include +#include +#include + +// the includes in the block below are necessary in order for the common.hpp header to be +// instantiated correctly +#include +#if defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif +#include "kernels/scaled_attn/common.hpp" +#include "nodes/kernels/scaled_attn/cache_rotation.hpp" +#include "perf_count.h" +#include "utils/plain_tensor.hpp" + +using namespace ov::intel_cpu; + +template +using Rank2Matrix = std::vector>; + +template +using Rank3Matrix = std::vector>>; + +// Expected layout: [block_size, embedding_size] +template +std::vector get_block_memory(size_t block_size, size_t embedding_size, const Rank2Matrix& init_values) { + auto mem = std::vector(block_size * embedding_size); + if (!init_values.empty()) { + assert(init_values.size() == block_size); + assert(init_values[0].size() == embedding_size); + for (size_t i = 0; i < block_size; i++) { + for (size_t j = 0; j < embedding_size; j++) { + mem[i * embedding_size + j] = init_values[i][j]; + } + } + } + return mem; +} + +// Expected layout: [num_heads, block_size, embedding_size] +template +std::vector get_block_memory(size_t num_heads, + size_t block_size, + size_t embedding_size, + const Rank3Matrix& init_values) { + auto mem = std::vector(num_heads * block_size * embedding_size); + if 
(!init_values.empty()) { + assert(init_values.size() == num_heads); + assert(init_values[0].size() == block_size); + assert(init_values[0][0].size() == embedding_size); + for (size_t i = 0; i < num_heads; i++) { + for (size_t j = 0; j < block_size; j++) { + for (size_t k = 0; k < embedding_size; k++) { + mem[i * embedding_size * block_size + j * embedding_size + k] = init_values[i][j][k]; + } + } + } + } + return mem; +} + +template +Rank3Matrix get_matrix_from_mem(std::vector mem_vec, + size_t num_heads, + size_t block_size, + size_t embedding_size) { + Rank3Matrix retval(num_heads); + for (size_t i = 0; i < num_heads; i++) { + retval[i].resize(block_size); + for (size_t j = 0; j < block_size; j++) { + retval[i][j].resize(embedding_size); + } + } + for (size_t i = 0; i < num_heads; i++) { + for (size_t j = 0; j < block_size; j++) { + for (size_t k = 0; k < embedding_size; k++) { + retval[i][j][k] = mem_vec[block_size * embedding_size * i + embedding_size * j + k]; + } + } + } + return retval; +} + +template +void compare_with_tolerance(const Rank3Matrix& test_data, const Rank3Matrix& ref_data, T abs_err) { + ASSERT_EQ(test_data.size(), ref_data.size()); + ASSERT_GT(test_data.size(), 0); + + ASSERT_EQ(test_data[0].size(), ref_data[0].size()); + ASSERT_GT(test_data[0].size(), 0); + + ASSERT_EQ(test_data[0][0].size(), ref_data[0][0].size()); + ASSERT_GT(test_data[0][0].size(), 0); + + for (size_t i = 0; i < test_data.size(); i++) { + for (size_t j = 0; j < test_data[0].size(); j++) { + for (size_t k = 0; k < test_data[0][0].size(); k++) { + T diff = test_data[i][j][k] - ref_data[i][j][k]; + if ((diff > abs_err) || (diff < -abs_err)) { + ADD_FAILURE() << std::setprecision(8) << "diff " << diff << " exceeding atol " << abs_err + << " at idx [" << i << ";" << j << ";" << k << "] --- test " << test_data[i][j][k] + << ", ref " << ref_data[i][j][k]; + } + } + } + } +} + +template +static T get_tolerance() { + return T{}; +} + +template <> +float get_tolerance() { + return 1e-6f; +} + +template <> +ov::float16 get_tolerance() { + return ov::float16{5e-3}; +} + +template <> +ov::bfloat16 get_tolerance() { + return ov::bfloat16{4e-2}; +} + +template +class CacheRotationKernelInputTypeParameterizedTest : public ::testing::Test { +public: + void SetUp() override { + Rank3Matrix values_before_rotation = { + { + {1.0f, 1.0f, 1.0f, 1.0f}, + {1.0f, 1.0f, 1.0f, 1.0f}, + {1.0f, 1.0f, 1.0f, 1.0f}, + {1.0f, 1.0f, 1.0f, 1.0f}, + }, + { + {-2.0f, -2.0f, -2.0f, -2.0f}, + {2.0f, 2.0f, 2.0f, 2.0f}, + {-1.0f, 2.0f, -3.0f, 4.0f}, + {2.0f, 2.0f, 2.0f, 2.0f}, + }, + }; + cache_mem = get_block_memory(num_heads, block_size, embedding_size, values_before_rotation); + + Rank2Matrix rotation_values = { + {0.5f, 0.70710678f, 0.86602540f, -0.70710678f}, + {0.86602540f, 1.0f, 0.5f, 0.0f}, + {-0.70710678f, 0.0f, 0.70710678f, 1.0f}, + {0.0f, 0.6f, -1.0f, -0.8f}, + }; + + rotation_coefficients_mem = get_block_memory(block_size, embedding_size, rotation_values); + } + size_t num_heads = 2; + size_t block_size = 4; + size_t embedding_size = 4; + std::vector cache_mem; + std::vector rotation_coefficients_mem; + Rank3Matrix ref_values_after_rotation = { + { + {-0.36602540f, 1.41421356f, 1.36602540f, 0.00000000f}, + {0.36602540f, 1.00000000f, 1.36602540f, 1.00000000f}, + {-1.41421356f, -1.00000000f, 0.00000000f, 1.00000000f}, + {1.00000000f, 1.40000000f, -1.00000000f, -0.20000000f}, + }, + { + {0.73205081f, -2.82842712f, -2.73205081f, 0.00000000f}, + {0.73205081f, 2.00000000f, 2.73205081f, 2.00000000f}, + {2.82842712f, -4.00000000f, 
1.41421356f, 2.00000000f}, + {2.00000000f, 2.80000000f, -2.00000000f, -0.40000000f}, + }, + }; + + void test_block_opt_vs_ref(size_t num_heads, size_t embedding_size, size_t block_size) { + auto cache_block_mem_ref = get_block_memory(num_heads, block_size, embedding_size, Rank3Matrix{}); + auto rotation_coeffts_block_mem = get_block_memory(block_size, embedding_size, Rank2Matrix{}); + + std::mt19937 engine; + engine.seed(0); + std::uniform_real_distribution rng(-2.0, 2.0); + + auto generate_fn = [&]() { + return TypeParam(rng(engine)); + }; + + std::generate(cache_block_mem_ref.begin(), cache_block_mem_ref.end(), generate_fn); + // coeffts are now not strictly sine-cosine pairs, but it does not matter for the kernels + std::generate(rotation_coeffts_block_mem.begin(), + rotation_coeffts_block_mem.end(), + generate_fn); + + + + auto cache_block_mem_hw = cache_block_mem_ref; + + auto raw_mem_ptr_ref = cache_block_mem_ref.data(); + auto raw_rotation_coefficients_mem_ptr = rotation_coeffts_block_mem.data(); + auto raw_mem_ptr_hw = cache_block_mem_hw.data(); + + ov::intel_cpu::PerfCount counter; + { + ov::intel_cpu::PerfHelper helper(counter); + rotate_kv_cache_block_opt(raw_mem_ptr_hw, + raw_rotation_coefficients_mem_ptr, + num_heads, + block_size, + embedding_size); + } + + { + ov::intel_cpu::PerfHelper helper(counter); + rotate_kv_cache_block_ref(raw_mem_ptr_ref, + raw_rotation_coefficients_mem_ptr, + num_heads, + block_size, + embedding_size); + } + + auto ref_values_after_rotation = get_matrix_from_mem(cache_block_mem_ref, num_heads, block_size, embedding_size); + auto opt_values_after_rotation = get_matrix_from_mem(cache_block_mem_hw, num_heads, block_size, embedding_size); + compare_with_tolerance(opt_values_after_rotation, ref_values_after_rotation, get_tolerance()); + } +}; + +using OV_FP_TYPES = ::testing::Types; + +TYPED_TEST_SUITE_P(CacheRotationKernelInputTypeParameterizedTest); + +TYPED_TEST_P(CacheRotationKernelInputTypeParameterizedTest, RefBlockRotationGivesReferenceResults) { + auto raw_cache_mem_ptr = this->cache_mem.data(); + auto raw_rotation_coefficients_mem_ptr = this->rotation_coefficients_mem.data(); + + rotate_kv_cache_block_ref(raw_cache_mem_ptr, + raw_rotation_coefficients_mem_ptr, + this->num_heads, + this->block_size, + this->embedding_size); + + auto test_values_after_rotation = + get_matrix_from_mem(this->cache_mem, this->num_heads, this->block_size, this->embedding_size); + compare_with_tolerance(test_values_after_rotation, this->ref_values_after_rotation, get_tolerance()); +} + +enum class TargetInstructionSet { AVX2, AVX512 }; + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsuggest-override" // false positive in gtest macro internals +#endif + +MATCHER_P3(IsNFirstValuesNear, ref_container, abs_err, n, "") { + if (ref_container.size() < n || arg.size() < n) + return false; + if (ref_container.size() != arg.size()) + return false; + + bool is_ok = true; + for (size_t i = 0; i < n; i++) { + if (!::testing::ExplainMatchResult(::testing::FloatNear(static_cast(arg[i]), abs_err), + static_cast(ref_container[i]), + result_listener)) { + *result_listener << " for element at idx " << i << '\n'; + is_ok = false; + } + } + return is_ok; +} + + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +class CacheRotationKernelInstructionParameterizedTest + : public ::testing::TestWithParam> { +protected: + constexpr static size_t MAX_CHUNK_SIZE_IN_ELEMENTS = 16; + template + using MemChunk = std::array; + + template + void 
test_chunk_rotation_for_type() { + auto instruction_set = std::get<0>(GetParam()); + if (instruction_set == TargetInstructionSet::AVX512 && (!ov::with_cpu_x86_avx512f())) { + GTEST_SKIP() << "test executor must have AVX512 support"; + } + if (instruction_set == TargetInstructionSet::AVX2 && (!ov::with_cpu_x86_avx2())) { + GTEST_SKIP() << "test executor must have AVX2 support"; + } + auto num_elements_to_process = std::get<1>(GetParam()); + + MemChunk chunk_x = {-0.76777814f, + 0.97583583f, + -0.23619731f, + 0.19022397f, + 0.56691264f, + 0.64870757f, + 0.63334306f, + 1.97307894f, + 0.72495168f, + 1.22328697f, + -0.6005607f, + 0.17189973f, + -0.92268487f, + 0.40205632f, + 0.85996431f, + 1.70078315f}; + + MemChunk chunk_y = {1.68812157f, + -0.90722836f, + 0.58474063f, + -0.64561766f, + 0.62651501f, + 1.55990472f, + 0.41571189f, + 0.38366555f, + 0.09841767f, + 0.02218336f, + -0.07657361f, + 1.6062845f, + -1.08282323f, + -0.92034808f, + -1.48428038f, + 0.43501142f}; + + MemChunk chunk_cos = {-0.87461971f, + 0.95630476f, + 0.08715574f, + 0.8480481f, + -0.9612617f, + 0.27563736f, + 0.97437006f, + 0.66913061f, + -0.89100652f, + 0.98480775f, + -0.7313537f, + -0.2419219f, + 0.10452846f, + 0.70710678f, + -0.32556815f, + -0.2923717f}; + + MemChunk chunk_sin = {-0.48480962f, + -0.2923717f, + 0.9961947f, + 0.52991926f, + 0.27563736f, + -0.9612617f, + -0.22495105f, + 0.74314483f, + 0.4539905f, + -0.17364818f, + -0.68199836f, + -0.97029573f, + -0.9945219f, + -0.70710678f, + -0.94551858f, + 0.95630476f}; + + MemChunk ref_chunk_cos = chunk_cos; + MemChunk ref_chunk_sin = chunk_sin; + + MemChunk ref_chunk_x = {1.48993147f, + 0.66794854f, + -0.60310147f, + 0.50344431f, + -0.71764235f, + 1.6782847f, + 0.71062535f, + 1.03512844f, + -0.69061736f, + 1.20855459f, + 0.38699921f, + 1.51698468f, + -1.17333824f, + -0.36648762f, + -1.68339166f, + -0.91326436f}; + + MemChunk ref_chunk_y = {-1.10423816f, + -1.15289358f, + -0.184335f, + -0.44671148f, + -0.44598258f, + -0.19360973f, + 0.26258603f, + 1.72300577f, + 0.24143039f, + -0.19057521f, + 0.46558381f, + -0.55538896f, + 0.80444446f, + -0.93508112f, + -0.32987781f, + 1.49928198f}; + + // unprocessed elements should remain untouched + std::copy(chunk_x.begin() + num_elements_to_process, + chunk_x.end(), + ref_chunk_x.begin() + num_elements_to_process); + std::copy(chunk_y.begin() + num_elements_to_process, + chunk_y.end(), + ref_chunk_y.begin() + num_elements_to_process); + + switch (instruction_set) { + using namespace ov::Extensions::Cpu::XARCH; + case TargetInstructionSet::AVX2: + rotate_kv_cache_chunk_avx2(chunk_x.data(), + chunk_y.data(), + chunk_cos.data(), + chunk_sin.data(), + num_elements_to_process, + /* is_tail = */ num_elements_to_process < vec_len_f32_avx2); + break; + case TargetInstructionSet::AVX512: + rotate_kv_cache_chunk_avx512(chunk_x.data(), + chunk_y.data(), + chunk_cos.data(), + chunk_sin.data(), + num_elements_to_process, + /* is_tail = */ num_elements_to_process < vec_len_f32_avx512); + break; + default: + FAIL() << "unknown target instruction set"; + } + + std::string type_name = ov::element::from().to_string(); + + EXPECT_THAT(chunk_x, IsNFirstValuesNear(ref_chunk_x, get_tolerance(), num_elements_to_process)) + << ", element type is: " << type_name; + EXPECT_THAT(chunk_y, IsNFirstValuesNear(ref_chunk_y, get_tolerance(), num_elements_to_process)) + << ", element type is: " << type_name; + + EXPECT_EQ(chunk_cos, ref_chunk_cos) << ", element type is: " << type_name; + EXPECT_EQ(chunk_sin, ref_chunk_sin) << ", element type is: " << type_name; + } 
+}; + +TEST_P(CacheRotationKernelInstructionParameterizedTest, OptChunkRotationGivesReferenceResults) { + test_chunk_rotation_for_type(); + test_chunk_rotation_for_type(); + test_chunk_rotation_for_type(); +} + +auto TEST_STRUCT_TO_NAME_FN = + [](const testing::TestParamInfo& info) { + size_t num_elts = std::get<1>(info.param); + switch (std::get<0>(info.param)) { + case TargetInstructionSet::AVX2: + return std::string("avx2-") + std::to_string(num_elts); + case TargetInstructionSet::AVX512: + return std::string("avx512-") + std::to_string(num_elts); + } + return std::string("unknown"); + }; + +INSTANTIATE_TEST_SUITE_P(AVX2, + CacheRotationKernelInstructionParameterizedTest, + ::testing::Combine(::testing::Values(TargetInstructionSet::AVX2), + ::testing::Range(size_t(0), + ov::Extensions::Cpu::XARCH::vec_len_f32_avx2 + 1)), + TEST_STRUCT_TO_NAME_FN); +INSTANTIATE_TEST_SUITE_P(AVX512, + CacheRotationKernelInstructionParameterizedTest, + ::testing::Combine(::testing::Values(TargetInstructionSet::AVX512), + ::testing::Range(size_t(0), + ov::Extensions::Cpu::XARCH::vec_len_f32_avx512 + 1)), + TEST_STRUCT_TO_NAME_FN); + +TYPED_TEST_P(CacheRotationKernelInputTypeParameterizedTest, OptBlockRotationGivesReferenceResults) { + auto raw_cache_mem_ptr = this->cache_mem.data(); + auto raw_rotation_coefficients_mem_ptr = this->rotation_coefficients_mem.data(); + + rotate_kv_cache_block_opt(raw_cache_mem_ptr, + raw_rotation_coefficients_mem_ptr, + this->num_heads, + this->block_size, + this->embedding_size); + + auto test_values_after_rotation = + get_matrix_from_mem(this->cache_mem, this->num_heads, this->block_size, this->embedding_size); + compare_with_tolerance(test_values_after_rotation, this->ref_values_after_rotation, get_tolerance()); +} + +TYPED_TEST_P(CacheRotationKernelInputTypeParameterizedTest, OptBlockRotationIsSimilarToRef) { + // short case + this->test_block_opt_vs_ref(/* num_heads = */ 4, /* embedding_size = */ 64, /* block_size = */ 2); + + // long case + this->test_block_opt_vs_ref(256, 1024, 32); +} + +REGISTER_TYPED_TEST_SUITE_P(CacheRotationKernelInputTypeParameterizedTest, + RefBlockRotationGivesReferenceResults, + OptBlockRotationGivesReferenceResults, + OptBlockRotationIsSimilarToRef); +INSTANTIATE_TYPED_TEST_SUITE_P(AllFPTypes, CacheRotationKernelInputTypeParameterizedTest, OV_FP_TYPES); diff --git a/src/plugins/intel_cpu/tests/unit/vectorized/stub.cpp b/src/plugins/intel_cpu/tests/unit/vectorized/stub.cpp new file mode 100644 index 00000000000000..2c6aba41d2231d --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/vectorized/stub.cpp @@ -0,0 +1,12 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +TEST(StubTest, AlwaysPass) { + // Some target platforms for the vectorized tests do not have any cases right now. + // In order to make the build pass on these platforms, the build system will include this + // file as the only source for the ov_cpu_unit_tests_vectorized, and the test binary + // will always pass the run. 
+} diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 5b7873c1500638..eaffaad2281b01 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -181,6 +181,7 @@ struct network { bool is_primary_stream() const { return _is_primary_stream; } bool is_dynamic() const { return _is_dynamic; } size_t get_weights_cache_capacity() const { return _weights_cache_capacity; } + bool contains_state(const std::string& variable_id); memory_pool& get_memory_pool() const { return *_memory_pool; @@ -192,6 +193,8 @@ struct network { const ov::intel_gpu::VariableStateInfo& get_variable_info(const std::string &variable_id) const; const ov::intel_gpu::VariablesMap& get_variables() const; const ov::intel_gpu::VariablesInfoMap& get_variables_info() const; + void set_reuse_variable_mem(bool reuse = false); + bool is_reuse_variable_mem() { return _reuse_variable_mem; } const ExecutionConfig& get_config() const { return _config; } @@ -215,6 +218,7 @@ struct network { bool _is_dynamic = false; bool _enable_profiling = false; bool _reset_arguments; + bool _reuse_variable_mem = false; std::unordered_map> _primitives; std::vector _in_out_shared_mem_types; @@ -225,6 +229,8 @@ struct network { ov::intel_gpu::VariablesMap _variables_states; ov::intel_gpu::VariablesInfoMap _variables_state_info; + std::vector> _read_values; + std::unordered_map>> _state_initializers; program::primitives_info _prims_info; size_t _weights_cache_capacity = 1; diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index 379d7b3b64a222..1a06231d61d618 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -173,6 +173,11 @@ struct program { program_node const& get_node(primitive_id const& id) const; std::shared_ptr get_node_ptr(const primitive_id& prim) { return nodes_map.at(prim); } std::shared_ptr get_node_ptr(const primitive_id& prim) const { return nodes_map.at(prim); } + void set_state_initializers(const std::string& variable_id, const primitive_id& id); + bool has_state_initializers(const std::string& variable_id, const primitive_id& id); + bool contains_state(const std::string& variable_id); + const std::vector& get_initializers(const std::string& variable_id) { return state_initializers.at(variable_id); } + const std::map>& get_state_initializers() const { return state_initializers; } // returns already existing program_node for given primitive 'prim' (lookup in 'nodes_map') // if it was previously created, otherwise creates and then returns program_node @@ -322,6 +327,8 @@ struct program { primitives_info prim_info; graph_optimizer_info optimizer_passes_info; + std::map> state_initializers; + primitives_info get_current_stage_info() const; /* ** High-level functions, in order of usage diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp index 2638f2ad60cf26..ad79e5178f21a8 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp @@ -21,7 +21,9 @@ struct paged_attention : public primitive_base { paged_attention(const primitive_id& id, const std::vector& inputs) : primitive_base(id, inputs) { - 
OPENVINO_ASSERT(inputs.size() == 13, "[GPU] Unexpected inputs number for PagedAttention primitive: ", inputs.size()); + OPENVINO_ASSERT((inputs.size() == 13) || (inputs.size() == 15), + "[GPU] Unexpected inputs number for PagedAttention primitive: ", + inputs.size()); } bool has_scores_output() const { @@ -38,6 +40,7 @@ struct paged_attention : public primitive_base { ob << heads_num; ob << kv_heads_num; ob << has_alibi; + ob << has_rotated_blocks; } void load(BinaryInputBuffer& ib) override { @@ -46,6 +49,7 @@ struct paged_attention : public primitive_base { ib >> heads_num; ib >> kv_heads_num; ib >> has_alibi; + ib >> has_rotated_blocks; } optional_value scale_val{}; @@ -53,5 +57,6 @@ struct paged_attention : public primitive_base { size_t heads_num = 0; size_t kv_heads_num = 0; bool has_alibi = false; + bool has_rotated_blocks = false; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_state_init_subgraphs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_state_init_subgraphs.cpp new file mode 100644 index 00000000000000..4146ace18efa25 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_state_init_subgraphs.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "read_value_inst.h" +#include "pass_manager.h" +#include + +#include "intel_gpu/graph/program.hpp" + +using namespace cldnn; + +void mark_state_init_subgraphs::mark_init_subgraph(program& p, read_value_node& node) { + const auto& variable_id = node.get_primitive()->variable_id; + if (p.contains_state(variable_id)) + return; + + std::queue q; + q.push(&node); + + auto can_be_marked = [&](const program_node* dep_node) { + if (p.has_state_initializers(variable_id, dep_node->id())) + return false; + + for (auto& u : dep_node->get_users()) { + if (u == &node) + continue; + if (p.has_state_initializers(variable_id, u->id())) + continue; + else + return false; + } + GPU_DEBUG_TRACE_DETAIL << "marked " << dep_node->id() << " as node in a init_subgraph for " << node.id() << std::endl; + return true; + }; + + while (!q.empty()) { + auto cur_size = q.size(); + for (size_t i = 0; i < cur_size; ++i) { + auto& cur_node = q.front(); + q.pop(); + for (auto& dep : cur_node->get_dependencies()) { + if (can_be_marked(dep.first)) { + p.set_state_initializers(variable_id, dep.first->id()); + q.push(dep.first); + } + } + } + } +} + +void mark_state_init_subgraphs::run(program& p) { + auto rit = p.get_processing_order().rbegin(); + for (; rit != p.get_processing_order().rend(); rit++) { + auto& node = *rit; + if (node->is_type()) { + mark_init_subgraph(p, node->as()); + } + } +} diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 09734a68acaffc..f08fa6a67b6303 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -3,6 +3,8 @@ // #include "impls/cpu/cpu_impl_helpers.hpp" +#include "assign_inst.h" +#include "kv_cache_inst.h" #include "read_value_inst.h" #include "impls/registry/implementation_map.hpp" #include "register.hpp" @@ -61,6 +63,13 @@ struct read_value_impl : public typed_primitive_impl { } else { variable.get_memory()->fill(stream); } + if (!instance.get_user_insts().empty()) { + auto user_inst = instance.get_user_insts().front(); + if (!(user_inst->get_node().is_type() || user_inst->get_node().is_type()) && + 
instance.get_network().contains_state(variable_id)) { + variable.set(); + } + } } if (!instance.can_be_optimized()) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index 2bc377f2c1459a..15a1632a8a2b1f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -204,6 +204,11 @@ struct paged_attention_impl : multi_stage_primitive { // dependency args.inputs.push_back(instance.subsequence_begins_memory_ptr()); } + if (desc->has_rotated_blocks) { + args.inputs.push_back(instance.rotated_block_indices_memory_ptr()); + args.inputs.push_back(instance.rotation_deltas_memory_ptr()); + args.inputs.push_back(instance.rotation_trig_lut_memory_ptr()); + } } else if (kernel_idx == 4) { // Output scores calculation kernel args.inputs = { instance.past_lens_memory_ptr(), diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 0b7c3d85c37e27..c4b32fa1549991 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -9,6 +9,7 @@ #include "quantize_inst.h" #include "eltwise_inst.h" #include "convolution_inst.h" +#include "read_value_inst.h" #include #include #include @@ -89,6 +90,16 @@ class mark_nodes : public base_pass { void run(program& p) override; }; +class mark_state_init_subgraphs : public base_pass { + // This optimization pass aggregates nodes into state initializer subgraphs +public: + mark_state_init_subgraphs() : base_pass("mark_state_init_subgraphs") {} + +private: + void run(program& p) override; + void mark_init_subgraph(program& p, read_value_node& node); +}; + class mark_shape_of_subgraphs : public base_pass { // This optimization pass aggregates nodes into shape_of subgraphs for further optimizations. 
// There are few key requirements to decide if node belongs to shape_of subgraph or not: diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 37152b0d9e3b4f..eef58068f1ab7e 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -661,6 +661,15 @@ void network::build_exec_order() { } } } + +bool network::contains_state(const std::string& variable_id) { + auto it = _state_initializers.find(variable_id); + if (it != _state_initializers.end()) + return true; + else + return false; +} + void network::add_to_exec_order(const primitive_id& id) { auto inst = get_primitive(id); _exec_order.push_back(inst); @@ -698,6 +707,19 @@ std::map network::execute(const std::vectorget_node().as().get_primitive(); + auto it = _state_initializers.find(prim->variable_id); + if (it != _state_initializers.end()) { + const auto& variable = get_variable(prim->variable_id); + if (variable.is_set()) { + for (auto& init_inst : it->second) { + init_inst->set_flag(ExecutionFlags::SKIP); + } + } + } + } + // We shouldn't call surfaces_lock::create() function constantly here, but due to // some changes in assembler code, performance drops in case if we move it under // `shared_mem_found` condition (it somehow connected with get_cl_queue() - this function call @@ -913,6 +935,15 @@ void network::allocate_primitive_instance(program_node const& node) { if (node.is_type()) _data_outputs.push_back(inst); } + if (node.is_type()) { + _read_values.push_back(inst); + const auto& variable_id = node.as().get_primitive()->variable_id; + if (_program->contains_state(variable_id)) { + for (const auto& id : _program->get_initializers(variable_id)) { + _state_initializers[variable_id].push_back(get_primitive(id)); + } + } + } if (auto state_prim = std::dynamic_pointer_cast(inst)) { auto prim = inst->get_node().get_primitive(); set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type(), prim.get()); @@ -997,5 +1028,9 @@ void network::set_variables_state_info(const std::string& variable_id, _variables_state_info.at(variable_id).m_primitives.insert(p); } +void network::set_reuse_variable_mem(bool reuse) { + _reuse_variable_mem = reuse; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/paged_attention.cpp index c761aaf63799cd..48ae46d83de34a 100644 --- a/src/plugins/intel_gpu/src/graph/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/paged_attention.cpp @@ -98,6 +98,7 @@ std::string paged_attention_inst::to_string(const paged_attention_node& node) { paged_attention_info.add("kv_heads_num", desc->kv_heads_num); paged_attention_info.add("scale", desc->scale_val.value_or(1.0f)); paged_attention_info.add("has_alibi", desc->has_alibi); + paged_attention_info.add("has_rotated_blocks", desc->has_rotated_blocks); node_info->add("paged_attention primitive info", paged_attention_info); node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index d1cb04375a7c49..480de4803f2e5c 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -624,16 +624,24 @@ void primitive_inst::realloc_if_needed(bool prev_execution_skipped) { _max_output_layout_count[j] = 0; } } else { - _outputs[0] = variable.get_memory(); + GPU_DEBUG_TRACE_DETAIL + << id() << " : 
realloc_if_needed: can_be_optimized = false and memories are not being shared" + << std::endl; + if (!get_network().is_reuse_variable_mem()) { + GPU_DEBUG_TRACE_DETAIL << "Update output mem with new variable mem" << std::endl; + _outputs[0] = variable.get_memory(); + _max_output_layout_count[0] = variable.get_actual_mem_size() / dt_sizes_in_B[0]; - if (auto compressed_cache_variable = dynamic_cast(&variable)) { - _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); - if (compressed_cache_variable->has_zp_state()) { - _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + } } + } else { + GPU_DEBUG_TRACE_DETAIL << "Can reuse variable mem of prev request" << std::endl; } - GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: can_be_optimized = false and memories are not being shared" << std::endl; } } else { variable.set_layout(_impl_params->output_layouts[0]); @@ -2660,12 +2668,21 @@ bool primitive_inst::is_valid_fusion() const { // Check if broadcast happens more than single axis. // Current gemm_tiled_opt kernel FUSED_OP_LOAD macro cannot support broadcast on dynamic dimension. - if (_node->is_type() && can_broadcast == true && merged_shape.rank().get_length() == outer_dep_pshape.rank().get_length()) { + if (_node->is_type() && can_broadcast == true && merged_shape.rank().get_length() >= outer_dep_pshape.rank().get_length()) { uint8_t broadcast_more_than_single_axis = 0; + auto updated_outer_dep_pshape = ov::PartialShape(outer_dep_pshape); + + // Update outer_dep_pshape to merged_shape rank + if (merged_shape.rank().get_length() > outer_dep_pshape.rank().get_length()) { + updated_outer_dep_pshape.insert(updated_outer_dep_pshape.begin(), + merged_shape.rank().get_length() - outer_dep_pshape.rank().get_length(), ov::Dimension(1)); + } + for (int64_t i = 0; i < merged_shape.rank().get_length(); i++) { - if (merged_shape.get_shape().at(i) != outer_dep_pshape.get_shape().at(i)) + if (merged_shape[i] != updated_outer_dep_pshape[i]) broadcast_more_than_single_axis++; } + if (broadcast_more_than_single_axis > 1) can_broadcast = false; } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index c938be22b816ed..5717864541e9a6 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -653,6 +653,8 @@ void program::post_optimize_graph(bool is_internal) { // for OOO queue if (_config.get_property(ov::intel_gpu::queue_type) == QueueTypes::out_of_order) get_processing_order().calculate_BFS_processing_order(); + + apply_opt_pass(); } // mark if the node is constant assuming that all dependencies are marked properly @@ -830,6 +832,27 @@ void program::reverse_connection(program_node& dep_node, program_node& user_node } } +void program::set_state_initializers(const std::string& variable_id, const primitive_id& id) { + state_initializers[variable_id].push_back(id); +} + +bool program::has_state_initializers(const std::string& variable_id, const primitive_id& id) { + auto it = state_initializers.find(variable_id); + if (it != state_initializers.end()) { + const auto& initializers = it->second; + return std::find(initializers.begin(), initializers.end(), id) 
!= initializers.end(); + } + return false; +} + +bool program::contains_state(const std::string& variable_id) { + auto it = state_initializers.find(variable_id); + if (it != state_initializers.end()) + return true; + else + return false; +} + program_node& program::get_or_create(std::shared_ptr prim) { auto itr = nodes_map.lower_bound(prim->id); if (itr != nodes_map.end() && itr->first == prim->id) @@ -1848,6 +1871,12 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const { for (auto const& node_id : allocating_order) { ob << node_id; } + + ob << state_initializers.size(); + for (auto& state_initializer : state_initializers) { + ob << state_initializer.first; + ob << state_initializer.second; + } } void program::load(cldnn::BinaryInputBuffer& ib) { @@ -2016,4 +2045,15 @@ void program::load(cldnn::BinaryInputBuffer& ib) { ib >> node_id; allocating_order.emplace_back(node_id); } + + size_t state_initializers_size; + ib >> state_initializers_size; + state_initializers.clear(); + for (size_t i = 0; i < state_initializers_size; i++) { + std::string variable_id; + std::vector initializers; + ib >> variable_id; + ib >> initializers; + state_initializers[variable_id] = initializers; + } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index f38aef64f6fe00..d740bfc099651e 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -17,8 +17,6 @@ #include "gemm_inst.h" #include "fully_connected_inst.h" #include "deconvolution_inst.h" -#include "quantize_inst.h" -#include "reorder_inst.h" #include "pooling_inst.h" #include "reduce_inst.h" #include diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl index 07486c1b9a1498..908e0e01439fdb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl @@ -290,6 +290,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)( } } + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + unroll_for(uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) { unroll_for(uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) { unroll_for (uint fx = 0; fx < FILTER_SIZE_X; fx++) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index 7e960afa4b87d3..2d6598e0a654cc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -42,6 +42,11 @@ KERNEL(pa_sdpa_opt)( #endif #if HAS_ALIBI const __global ALIBI_INPUT_TYPE* alibi_slopes, +#endif +#if HAS_ROTATED_BLOCKS + const __global INPUT8_TYPE* rotated_block_indices, + const __global INPUT9_TYPE* rotation_deltas, + const __global INPUT10_TYPE* rotation_trig_lut, #endif __global OUTPUT_TYPE* output, #if PAGED_ATTENTION_SCORES_OUTPUT @@ -62,7 +67,10 @@ KERNEL(pa_sdpa_opt)( // past_lens: [sequences_num] // subsequence_begins: [sequences_num + 1] // block_indices: [used_blocks_num] - // block_indices: [sequences_num + 1] + // block_indices_begins: [sequences_num + 1] + // rotated_block_indices: [num_rotated_blocks ] + // rotation_deltas [num_rotated_blocks, 1 || PAGED_ATTENTION_BLOCK_SIZE ] + // rotation_trig_lut [MAX_CONTEXT_LEN, HEAD_SIZE] // // 
Output shapes: // output: [sequences_num, HEADS_NUM * HEAD_SIZE] @@ -148,6 +156,10 @@ KERNEL(pa_sdpa_opt)( } #endif +#ifdef HAS_ROTATED_BLOCKS + // TODO (vshampor): add cache block rotation at this spot +#endif + const uint blocks_num_per_partition = min(total_blocks_num - partition_idx * PAGED_ATTENTION_BLOCKS_PER_PARTITION, (uint)PAGED_ATTENTION_BLOCKS_PER_PARTITION); uint blocks_num = blocks_num_per_partition / SUBGROUPS_PER_WG; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 01db635a5bd2ae..a8d8ba9799bbd7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -867,7 +867,11 @@ KERNEL(sdpa_opt)( #define b0_idx (batch_idx / NUM_HEADS) #define b1_idx (batch_idx % NUM_HEADS) #define target_seq_dim ((uint)get_global_id(1)) +#if IS_PAGED_ATTENTION + #define target_seq_idx ((uint)block_start_pos - subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]) +#else #define target_seq_idx ((uint)get_global_id(1) * TARGET_SEQ_LEN_BLOCK_SIZE) +#endif #define head_size_idx ((uint)get_local_id(2) % HEAD_SIZE) #define sglid (uint)get_sub_group_local_id() #define sgid (uint)get_sub_group_id() @@ -994,8 +998,15 @@ KERNEL(sdpa_opt)( __attribute__((opencl_unroll_hint(1))) for (uint start_partition_idx = 0; start_partition_idx < SOURCE_SEQ_LEN; start_partition_idx += SEQ_LEN_PARTITION_SIZE) { const uint seq_len = start_partition_idx + sgid * SUBGROUP_SIZE; +#if IS_CAUSAL + const uint partition_seq_len = min((uint)SEQ_LEN_PARTITION_SIZE, (uint)max(0, (int)(target_seq_idx + seq_idx_end) - (int)start_partition_idx)); +#else const uint partition_seq_len = min((uint)SOURCE_SEQ_LEN - start_partition_idx, (uint)SEQ_LEN_PARTITION_SIZE); +#endif +#if IS_CAUSAL + if (seq_len <= target_seq_idx) { // keep tril i.e. 
m >= n +#endif #if IS_PAGED_ATTENTION #ifdef BROADCAST_GROUP_SIZE const uint heads_dim = num_heads_dim / BROADCAST_GROUP_SIZE; @@ -1026,21 +1037,21 @@ KERNEL(sdpa_opt)( #endif int seq_len_calc_size = min((int)(SOURCE_SEQ_LEN) - (int)seq_len, (int)SUBGROUP_SIZE); +#if IS_CAUSAL + MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_acc = INPUT0_VAL_ZERO; +#else // !IS_CAUSAL MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_acc; qk_acc = FUNC_CALL(load_attn_mask)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, -#if IS_PAGED_ATTENTION - block_start_pos - subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]] + sglid, -#else target_seq_idx + sglid, -#endif // TODO: pass seq_len_calc_size here seq_len ATTN_MASK_BUFFER ATTN_SCALE_BUFFER PA_BUFFERS); +#endif // !IS_CAUSAL if (seq_len_calc_size >= SUBGROUP_SIZE) { #if IS_KV_COMPRESSED @@ -1157,6 +1168,10 @@ KERNEL(sdpa_opt)( { SOFTMAX_ACCUMULATOR_TYPE qk_max = SOFTMAX_ACCUMULATOR_VAL_MIN; unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { +#if IS_CAUSAL + // casual mask: valid only if m >= n + if (seq_len + i <= target_seq_idx + sglid) { +#endif // IS_CAUSAL #if !APPLY_SCALES_TO_QUERY #if HAS_SCALE_INPUT const OUTPUT_TYPE scale_val = *scale; @@ -1172,12 +1187,21 @@ KERNEL(sdpa_opt)( #endif qk_acc[i] = INPUT0_MIN_FUNC(INPUT0_MAX_FUNC(qk_acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); - +#if IS_CAUSAL + } else { + qk_acc[i] = INPUT0_VAL_MIN; + } +#endif // IS_CAUSAL qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(qk_acc[i])); slm_qk_vals[sglid][sgid * TARGET_SEQ_LEN_BLOCK_SIZE + i] = qk_acc[i]; } slm_qk_max_vals[sglid][sgid] = qk_max; } +#if IS_CAUSAL + } else { // skip triu + slm_qk_max_vals[sglid][sgid] = SOFTMAX_ACCUMULATOR_VAL_MIN; + } +#endif barrier(CLK_LOCAL_MEM_FENCE); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp index 909a40d677f535..bac6ebd11fbe9b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp @@ -237,6 +237,9 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1)); } + if (params.conf.has_rotated_blocks) + jit.AddConstant(MakeJitConstant("HAS_ROTATED_BLOCKS", 1)); + if (kernel_idx == KernelsTypes::MULTI_TOKENS || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) jit.AddConstant(MakeJitConstant("MULTI_TOKENS_PROCESSING", 1)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 8fcc4a16692d6c..7b9519395d88ca 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -100,6 +100,7 @@ struct sdpa_configuration { int64_t paged_attention_max_len = 0; bool has_const_scale_val = false; float scale_val = 0.f; + bool has_rotated_blocks = false; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index c3d74feffb5599..0a19ff1e3be5bd 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -593,10 
+593,10 @@ void Graph::update_profiling_info() { perfMap[executedID.first].first = executedID.first; pcIter = perfMap.find(executedID.first); auto& perfCount = pcIter->second.second; - - cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; - - collectTimings(cldnnInfo, perfCount); + if (executedID.second) { + cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; + collectTimings(cldnnInfo, perfCount); + } perfCount.num++; } } @@ -722,6 +722,8 @@ std::vector Graph::get_profiling_info() const { if ((!existInProfiling || (existInProfiling && perfIter->second.first.length() == 0)) && executedPrimitives.find(primId) != executedPrimitives.end()) { auto event = executedPrimitives.at(primId); + if (!event) + continue; cldnn::instrumentation::profiling_info cldnnInfo{primId, event->get_profiling_info()}; diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp index d82d3a66fed7f7..b56807d720b870 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp @@ -48,6 +48,7 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared const size_t scale_idx = 9; const size_t alibi_idx = 11; + const size_t rotated_block_indices_idx = 13; std::shared_ptr scale_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(scale_idx)); if (scale_const) { @@ -62,6 +63,12 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0; prim.num_outputs = 1; + + std::shared_ptr rotated_block_indices_const = + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(rotated_block_indices_idx)); + OPENVINO_ASSERT(rotated_block_indices_const != nullptr); + prim.has_rotated_blocks = ov::shape_size(rotated_block_indices_const->get_output_shape(0)) > 0; + if (op->get_output_size() > 1) { const auto scores_output_idx = 1; const auto& users = op->get_output_target_inputs(scores_output_idx); diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index f87f9af5275722..676e37294c818d 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -295,13 +295,21 @@ void SyncInferRequest::enqueue() { std::move(events.begin(), events.end(), std::back_inserter(dependencies)); } + auto network = m_graph->get_network(); for (const auto& it : m_variables) { const auto& name = it.first; const auto& variable = it.second; + if (network->has_variable(name)) { + const auto& prev_var = network->get_variable(name); + if (prev_var.get_memory() == variable->get_memory()) { + network->set_reuse_variable_mem(true); + continue; + } + } + network->set_reuse_variable_mem(false); prepare_state(name, variable); } - auto network = m_graph->get_network(); network->set_shape_predictor(m_shape_predictor); m_internal_outputs.clear(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp new file mode 100644 index 00000000000000..d9059e63338876 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.cpp @@ -0,0 +1,165 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 
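+// LoRAHorizontalFusion: when several LoRA adapters (MatMul(state_A) -> Multiply(state_alpha)
+// -> MatMul(state_B) -> Add) hang off the outputs of a VariadicSplit that follows a
+// horizontally fused FullyConnected, this pass concatenates the per-branch states and
+// replaces the adapters with single fused MatMul/Multiply/Add ops (see the diagrams in
+// lora_horizontal_fusion.hpp).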
+// + +#include "lora_horizontal_fusion.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/opsets/opset1.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" + +#include "intel_gpu/op/fully_connected_compressed.hpp" + +namespace ov { +namespace intel_gpu { + +LoRAHorizontalFusion::LoRAHorizontalFusion() { + using namespace ov::pass::pattern; + + auto is_target_pattern = [](const std::shared_ptr& split_node) { + auto is_lora_pattern = [](const std::shared_ptr& node) { + #define check(node) if (!node) return false; + + const auto& add = std::dynamic_pointer_cast(node); check(add) + + size_t matmul2_idx = ov::is_type(add->get_input_node_shared_ptr(0)) ? 0 : 1; + const auto& matmul2 = std::dynamic_pointer_cast(add->get_input_node_shared_ptr(matmul2_idx)); check(matmul2) + + const auto& multiply = std::dynamic_pointer_cast(matmul2->get_input_node_shared_ptr(0)); check(multiply) + + const auto& variable_b = std::dynamic_pointer_cast(matmul2->get_input_node_shared_ptr(1)); check(variable_b) + + size_t matmul1_idx = ov::is_type(multiply->get_input_node_shared_ptr(0)) ? 0 : 1; + const auto& matmul1 = std::dynamic_pointer_cast(multiply->get_input_node_shared_ptr(matmul1_idx)); check(matmul1) + + size_t alpha_idx = (matmul1_idx + 1) % 2; + const auto& variable_alpha = + std::dynamic_pointer_cast(multiply->get_input_node_shared_ptr(alpha_idx)); check(variable_alpha) + + const auto& variable_a = std::dynamic_pointer_cast(matmul1->get_input_node_shared_ptr(1)); check(variable_a) + + #undef check + return true; + }; + + for (const auto& user : split_node->get_users()) { + if (!is_lora_pattern(user)) { + return false; + } + } + + return true; + }; + + auto lora_input = any_input(); + auto main_flow_1 = wrap_type({lora_input, any_input(), any_input(), any_input()}); + auto main_flow_2 = wrap_type({lora_input, any_input(), any_input(), any_input(), any_input()}); + auto main_flow = std::make_shared(OutputVector{main_flow_1, main_flow_2}); + + auto axis_const = wrap_type(); + auto split_const = wrap_type(); + auto split = wrap_type({main_flow, axis_const, split_const}, ov::pass::pattern::op::as_value_predicate(is_target_pattern)); + + ov::matcher_pass_callback callback = [=](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + const auto& split = m.get_match_root(); + + ov::NodeVector add_nodes; + ov::NodeVector multiply_nodes; + ov::NodeVector variable_a_nodes; + ov::NodeVector variable_b_nodes; + ov::NodeVector variable_alpha_nodes; + ov::NodeVector matmul1_nodes; + ov::NodeVector matmul2_nodes; + + for (const auto& add : split->get_users()) { + add_nodes.emplace_back(add); + + size_t matmul2_idx = ov::is_type(add->get_input_node_shared_ptr(0)) ? 0 : 1; + matmul2_nodes.emplace_back(add->get_input_node_shared_ptr(matmul2_idx)); + } + for (const auto& matmul2 : matmul2_nodes) { + multiply_nodes.emplace_back(matmul2->get_input_node_shared_ptr(0)); + variable_b_nodes.emplace_back(matmul2->get_input_node_shared_ptr(1)); + } + for (const auto& multiply : multiply_nodes) { + size_t matmul1_idx = ov::is_type(multiply->get_input_node_shared_ptr(0)) ? 
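+            // The first MatMul may sit on either input of Multiply, so its index is resolved
+            // per node; the remaining input is the alpha state (ReadValue).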
0 : 1; + matmul1_nodes.emplace_back(multiply->get_input_node_shared_ptr(matmul1_idx)); + + size_t alpha_idx = (matmul1_idx + 1) % 2; + variable_alpha_nodes.emplace_back(multiply->get_input_node_shared_ptr(alpha_idx)); + } + for (const auto& matmul1 : matmul1_nodes) { + variable_a_nodes.emplace_back(matmul1->get_input_node_shared_ptr(1)); + } + + auto fused_variable_a = std::make_shared(variable_a_nodes, 0); + fused_variable_a->set_friendly_name(variable_a_nodes[0]->get_friendly_name() + + "_fused_" + std::to_string(variable_a_nodes.size()) + "_ReadValues"); + ov::copy_runtime_info(variable_a_nodes, fused_variable_a); + + auto fused_variable_alpha = std::make_shared(variable_alpha_nodes, 1); + fused_variable_alpha->set_friendly_name(variable_alpha_nodes[0]->get_friendly_name() + + "_fused_" + std::to_string(variable_alpha_nodes.size()) + "_ReadValues"); + ov::copy_runtime_info(variable_alpha_nodes, fused_variable_alpha); + + bool transpose_a1 = std::dynamic_pointer_cast(matmul1_nodes[0])->get_transpose_a(); + bool transpose_b1 = std::dynamic_pointer_cast(matmul1_nodes[0])->get_transpose_b(); + auto fused_matmul1 = std::make_shared(pattern_map.at(lora_input), fused_variable_a, transpose_a1, transpose_b1); + auto fused_matmul1_name = matmul1_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(matmul1_nodes.size()) + "_MatMuls"; + fused_matmul1->set_friendly_name(fused_matmul1_name); + ov::copy_runtime_info(matmul1_nodes, fused_matmul1); + for (const auto& old_matmul1 : matmul1_nodes) { + old_matmul1->clear_control_dependencies(); + } + + auto fused_multiply = std::make_shared(fused_matmul1, fused_variable_alpha); + auto multiply_name = multiply_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(multiply_nodes.size()) + "_Multiply"; + fused_multiply->set_friendly_name(multiply_name); + ov::copy_runtime_info(multiply_nodes, fused_multiply); + for (const auto& old_multiply : multiply_nodes) { + old_multiply->clear_control_dependencies(); + } + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {fused_multiply->get_output_partial_shape(0).size() - 1}); + auto output_split = std::make_shared(fused_multiply, axis_const, matmul2_nodes.size()); + auto split_name = fused_multiply->get_friendly_name() + "_split"; + copy_runtime_info(fused_multiply, output_split); + output_split->set_friendly_name(split_name); + for (size_t i = 0; i < matmul2_nodes.size(); ++i) { + matmul2_nodes[i]->input(0).replace_source_output(output_split->output(i)); + } + + auto fused_matmul2 = std::make_shared(matmul2_nodes, matmul2_nodes[0]->get_output_partial_shape(0).size() - 1); + auto matmul2_name = matmul2_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(matmul2_nodes.size()) + "_MatMuls_output"; + fused_matmul2->set_friendly_name(matmul2_name); + ov::copy_runtime_info(matmul2_nodes, fused_matmul2); + + auto fused_add = std::make_shared(split->get_input_node_shared_ptr(0), fused_matmul2); + auto fused_add_name = add_nodes[0]->get_friendly_name() + "_fused_" + std::to_string(add_nodes.size()) + "_Adds"; + fused_add->set_friendly_name(fused_add_name); + ov::copy_runtime_info(add_nodes, fused_add); + + for (size_t i = 0; i < add_nodes.size(); ++i) { + const auto& old_add = add_nodes[i]; + for (auto u : old_add->get_users()) { + for (size_t idx = 0; idx < u->inputs().size(); ++idx) { + if (u->get_input_node_shared_ptr(idx) == old_add) { + u->input(idx).replace_source_output(split->output(i)); + } + } + } + old_add->clear_control_dependencies(); + } + + 
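+        // Finally, feed the original VariadicSplit from the fused Add; the users of the old
+        // Add nodes were re-pointed to the split outputs above, so each keeps its
+        // per-branch shape.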
split->input(0).replace_source_output(fused_add->output(0)); + return true; + }; + + auto m = std::make_shared(split, "LoRAHorizontalFusion"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp new file mode 100644 index 00000000000000..631028d68baa7a --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/lora_horizontal_fusion.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +// Before: +// ┌─────────┐ ┌─────────┐ +// │ReadValue│ │ReadValue│ +// └────┬────┘ └────┬────┘ +// │ ┌───────────┐ │ +// │ ┌───────────────────────┼ LoraInput ┼───────────────────┐ │ +// │ │ └─────┬─────┘ │ │ +// │ ┌────▼───┐ │ ┌────▼───┐ │ +// └────► Gemm │ │ │ Gemm ◄──────┘ +// ┌─────────┐ └────┬───┘ │ └────┬───┘ ┌─────────┐ +// │ReadValue│ │ │ │ │ReadValue│ +// └────┬────┘ │ ┌───────────▼────────────┐ │ └────┬────┘ +// │ ┌────▼───┐ │FullyConnectedCompressed│ ┌────▼───┐ │ +// └─────────────►Multiply│ └───────────┬────────────┘ │Multiply◄────────────┘ +// └────┬───┘ │ └────────┘ +// ┌─────────┐ │ │ │ ┌─────────┐ +// │ReadValue│ │ │ │ │ReadValue│ +// └────┬────┘ │ │ │ └────┬────┘ +// │ ┌────▼───┐ ┌──────▼──────┐ ┌────▼───┐ │ +// └─────────────► Gemm │ ┌───────────┼VariadicSplit┼──────────┐ │ Gemm ◄────────────────┘ +// └────┬───┘ │ └──────┬──────┘ │ └────┬───┘ +// │ │ │ │ │ +// │ │ │ │ │ +// │ │ │ │ │ +// │ ┌──▼──┐ ▼ ┌──▼──┐ │ +// └───────► Add │ ... │ Add ◄────┘ +// └─────┘ └─────┘ +// After: +// ┌─────────┐ +// ┌────┼ReadValue│ +// ┌──────────┐ ┌──────┐ │ └─────────┘ +// │LoRA_Input┼────────────────────────────┐ ┌─────────────┼Concat◄─────┤ ... +// └────┬─────┘ │ │ └──────┘ │ ┌─────────┐ +// │ │ │ └────┼ReadValue│ +// │ │ │ └─────────┘ +// │ ┌────▼──▼───┐ +// │ │MatMulFused│ +// │ └───────────┘ +// │ │ ┌─────────┐ +// │ │ ┌────┼ReadValue│ +// │ │ ┌──────┐ │ └─────────┘ +// │ │ ┌────────┼Concat◄─────┤ ... +// │ │ │ └──────┘ │ ┌─────────┐ +// │ │ │ └────┼ReadValue│ +// ┌───────────▼────────────┐ ┌───▼──────▼──┐ └─────────┘ +// │FullyConnectedCompressed│ │MultiplyFused│ +// └───────────┬────────────┘ └──────┬──────┘ +// │ │ +// │ ┌─────────┐ │ ┌─────────┐ +// │ │ReadValue│ ┌──▼──┐ │ReadValue│ +// │ └────┬────┘ │Split│ └────┬────┘ +// │ │ └──┬──┘ │ +// │ │ │ │ +// │ │ ┌────────┼────────┐ │ +// │ │ │ │ │ +// │ ┌──▼──▼──┐ ┌──▼──▼──┐ +// │ │ MatMul │ ... 
│ MatMul │ +// │ └────┬───┘ └────┬───┘ +// │ └──────┐ ┌────────┘ +// │ │ │ +// │ ┌─────┐ ┌─▼────▼─┐ +// └─────────────► Add ◄─────────────┼ Concat │ +// └──┬──┘ └────────┘ +// │ +// │ +// ┌──────▼──────┐ +// │VariadicSplit│ +// └─────────────┘ + +class LoRAHorizontalFusion: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("LoRAHorizontalFusion", "0"); + LoRAHorizontalFusion(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 7c7c09adcd182f..662323813aa27f 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -82,6 +82,7 @@ #include "plugin/transformations/group_norm_composition.hpp" #include "plugin/transformations/dynamic_quantize_fully_connected.hpp" #include "plugin/transformations/optimize_subsequent_reshapes.hpp" +#include "plugin/transformations/lora_horizontal_fusion.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" @@ -929,8 +930,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { bool fuse_mlp_swiglu = !device_info.supports_immad && device_info.execution_units_count >= 128 && !disable_fc_swiglu_fusion; - if (!disable_horizontal_fc_fusion) + if (!disable_horizontal_fc_fusion) { manager.register_pass(fuse_mlp_swiglu); + manager.register_pass(); + } // ZP should not be folded for FC. But still, ZP should be folded for Gather. // Therefore, run MarkDequantization again to fold ZP constant. diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_fc_horizontal_fusion.cpp index 7c9994b8235661..3022d1e0362404 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_fc_horizontal_fusion.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_fc_horizontal_fusion.cpp @@ -31,7 +31,7 @@ struct ShapeParams { int weights_group_size; }; -using FullyConnectedHorizontalFusionParams = std::tuple; @@ -56,6 +57,7 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface(params.at(var_offset + 1), "var_alpha_" + std::to_string(idx)); + auto read_value_b = std::make_shared(params.at(var_offset + 2), "var_b_" + std::to_string(idx)); + auto matmul1 = std::make_shared(params.at(0), read_value_a, false, true); + auto multiply = std::make_shared(matmul1, read_value_alpha); + auto matmul2 = std::make_shared(multiply, read_value_b, false, true); + auto add = std::make_shared(connect_node, matmul2); + return add; + } + + std::shared_ptr init_subgraph(const std::vector& weights_shapes, const int group_size, const ov::element::Type data_precision, const ov::element::Type weights_precision, @@ -212,8 +230,13 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface(data_precision, data_shape)}; + const bool has_bias, + const uint64_t lora_rank) { + ov::ParameterVector params; + for (const auto& shape : inputDynamicShapes) { + params.push_back(std::make_shared(data_precision, shape)); + } + const auto weight1 = init_compressed_weights_subgraph(weights_shapes[0], group_size, data_precision, @@ -246,10 +269,20 @@ class FullyConnectedHorizontalFusion : public 
testing::WithParamInterfaceset_friendly_name("fully_connected_2"); auto matmul3 = std::make_shared(params[0], weight3, false, transpose_weights); matmul3->set_friendly_name("fully_connected_3"); + + std::shared_ptr matmul1_result = matmul1; + std::shared_ptr matmul2_result = matmul2; + std::shared_ptr matmul3_result = matmul3; + if (lora_rank != 0) { + matmul1_result = init_lora_subgraph(params, matmul1, 0); + matmul2_result = init_lora_subgraph(params, matmul2, 1); + matmul3_result = init_lora_subgraph(params, matmul3, 2); + } + if (!has_bias) { - auto matmul4 = std::make_shared(matmul1, matmul2, true, false); + auto matmul4 = std::make_shared(matmul1_result, matmul2_result, true, false); matmul4->set_friendly_name("gemm1"); - auto matmul5 = std::make_shared(matmul4, matmul3, true, true); + auto matmul5 = std::make_shared(matmul4, matmul3_result, true, true); matmul5->set_friendly_name("gemm2"); return std::make_shared(ov::NodeVector{matmul5}, params, "FCHorizontalFusion"); } else { @@ -261,17 +294,17 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface(bias1_tensor); - auto bias_add1 = std::make_shared(matmul1, bias1_const); + auto bias_add1 = std::make_shared(matmul1_result, bias1_const); bias_add1->set_friendly_name("add1"); auto bias2_shape = ov::Shape{1, weights_shapes[1].back()}; auto bias2_tensor = ov::test::utils::create_and_fill_tensor(data_precision, bias2_shape, in_data); auto bias2_const = std::make_shared(bias2_tensor); - auto bias_add2 = std::make_shared(matmul2, bias2_const); + auto bias_add2 = std::make_shared(matmul2_result, bias2_const); bias_add2->set_friendly_name("add2"); auto bias3_shape = ov::Shape{1, weights_shapes[2].back()}; auto bias3_tensor = ov::test::utils::create_and_fill_tensor(data_precision, bias3_shape, in_data); auto bias3_const = std::make_shared(bias3_tensor); - auto bias_add3 = std::make_shared(matmul3, bias3_const); + auto bias_add3 = std::make_shared(matmul3_result, bias3_const); bias_add3->set_friendly_name("add3"); auto matmul4 = std::make_shared(bias_add1, bias_add2, true, false); @@ -294,6 +327,7 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface input_shapes = {shape_params.data_shape}; + + if (lora_rank != 0) { + for (size_t i = 0; i < shape_params.weights_shapes.size(); ++i) { + // variable_A + input_shapes.push_back({{-1, *shape_params.data_shape.first.rbegin()}, {{lora_rank, shape_params.data_shape.second.front().back()}}}); + // variable_alpha + input_shapes.push_back({{1, -1}, {{1, lora_rank}}}); + // variable_B + input_shapes.push_back({{ov::Dimension(shape_params.weights_shapes[i].back()), -1}, {{shape_params.weights_shapes[i].back(), lora_rank}}}); + } + } - init_input_shapes({shape_params.data_shape, {{}, shape_params.weights_shapes}}); + init_input_shapes(input_shapes); inType = outType = activations_precision; - function = init_subgraph(inputDynamicShapes[0], - shape_params.weights_shapes, + function = init_subgraph(shape_params.weights_shapes, shape_params.weights_group_size, activations_precision, weights_precision, @@ -317,7 +364,8 @@ class FullyConnectedHorizontalFusion : public testing::WithParamInterface(test_param); + uint64_t lora_rank = std::get<9>(test_param); + bool is_lora_fused = false; for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { if (n->get_friendly_name() == "Compressed_weights") { ASSERT_EQ(n->get_output_element_type(0), weights_precision); } + if (n->get_friendly_name().find("fused_3_MatMuls") != std::string::npos) { + is_lora_fused = 
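                    // Nodes named "*fused_3_MatMuls*" are produced by LoRAHorizontalFusion,
                    // so finding one confirms the three LoRA branches were merged.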
true; + } } + OPENVINO_ASSERT(lora_rank == 0 || is_lora_fused, "[GPU] LoRA fusion failed"); } }; @@ -376,6 +436,8 @@ const std::vector input_shapes = { {{{-1, -1, -1}, {{1, 4, 16}}}, weights4}, }; +const std::vector lora_rank = {0, 16}; // 0 means w/o LoRA + INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias, FullyConnectedHorizontalFusion, ::testing::Combine(::testing::ValuesIn(input_shapes), @@ -386,7 +448,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias, ::testing::Values(true), ::testing::ValuesIn(per_tensor_zp), ::testing::Values(false), - ::testing::Values(0) /* no dyn_quan */), + ::testing::Values(0) /* no dyn_quan */, + ::testing::ValuesIn(lora_rank)), FullyConnectedHorizontalFusion::get_test_case_name); INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_with_bias, @@ -399,7 +462,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_with_bias, ::testing::Values(true), ::testing::Values(true), ::testing::Values(true), - ::testing::Values(0) /* no dyn_quan */), + ::testing::Values(0) /* no dyn_quan */, + ::testing::ValuesIn(lora_rank)), FullyConnectedHorizontalFusion::get_test_case_name); std::vector dyn_quan_weights = {{1, 128, 32}, {1, 128, 4}, {1, 128, 32}}; @@ -417,7 +481,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_FCHorizontalFusion_no_bias_dyn_quan, ::testing::Values(true), ::testing::Values(true), ::testing::Values(false), - ::testing::Values(UINT64_MAX) /* dyn_quan */), + ::testing::Values(UINT64_MAX) /* dyn_quan */, + ::testing::ValuesIn(lora_rank)), FullyConnectedHorizontalFusion::get_test_case_name); diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_unfusion.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_unfusion.cpp index 1cc079a10b82f6..04c053c6dbd1c3 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_unfusion.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/dynamic_unfusion.cpp @@ -15,6 +15,8 @@ namespace { using ov::test::InputShape; using DynamicUnfusionsParams = std::tuple, // input shapes + bool, // Matmul transpose a + bool, // Matmul transpose b ov::element::Type>; // input precision class DynamicUnfusions : public testing::WithParamInterface, @@ -22,9 +24,11 @@ class DynamicUnfusions : public testing::WithParamInterface obj) { std::vector input_shapes; + bool transpose_a; + bool transpose_b; ov::element::Type input_precision; - std::tie(input_shapes, input_precision) = obj.param; + std::tie(input_shapes, transpose_a, transpose_b, input_precision) = obj.param; std::ostringstream result; result << "IS=("; @@ -42,18 +46,22 @@ class DynamicUnfusions : public testing::WithParamInterface input_shapes; + bool transpose_a; + bool transpose_b; ov::element::Type input_precision; - std::tie(input_shapes, input_precision) = GetParam(); + std::tie(input_shapes, transpose_a, transpose_b, input_precision) = GetParam(); init_input_shapes(input_shapes); inType = outType = input_precision; - function = init_subgraph(inputDynamicShapes, input_precision); + function = init_subgraph(inputDynamicShapes, transpose_a, transpose_b, input_precision); } }; @@ -83,13 +93,28 @@ TEST_P(DynamicUnfusions, Inference) { const std::vector input_precisions = {ov::element::f32}; -const std::vector> input_shapes_dyn = { +const std::vector> input_shapes_same_rank_fusing_dyn = { {{{1024, -1}, {{1024, 1024}}}, {{-1, 1024}, {{1024, 1024}}}, {{1, -1}, {{1, 1}}}}, + {{{1024, -1}, {{1024, 1024}}}, {{-1, 1024}, {{1024, 1024}}}, {{1, -1}, {{1, 1024}}}}, }; +const std::vector> 
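+// Shape sets mixing 2D and 3D MatMul inputs with a lower-rank eltwise operand,
+// presumably exercising the rank-alignment path added to is_valid_fusion above.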
input_shapes_diff_rank_fusing_dyn = { + {{{1024, -1}, {{1024, 1024}}}, {{-1, 1024}, {{1024, 1024}}}, {{1, -1}, {{1, 1}}}}, + {{{-1, -1, 1024}, {{1, 1024, 1024}}}, {{-1, 1024}, {{1024, 1024}}}, {{1, -1}, {{1, 1024}}}}, +}; + +INSTANTIATE_TEST_SUITE_P(DynamicUnfusions_basic_same_rank, + DynamicUnfusions, + ::testing::Combine(::testing::ValuesIn(input_shapes_same_rank_fusing_dyn), + ::testing::Values(false), + ::testing::Values(false), + ::testing::ValuesIn(input_precisions)), + DynamicUnfusions::getTestCaseName); -INSTANTIATE_TEST_SUITE_P(DynamicUnfusions_basic, +INSTANTIATE_TEST_SUITE_P(DynamicUnfusions_basic_diff_rank, DynamicUnfusions, - ::testing::Combine(::testing::ValuesIn(input_shapes_dyn), + ::testing::Combine(::testing::ValuesIn(input_shapes_diff_rank_fusing_dyn), + ::testing::Values(false), + ::testing::Values(true), ::testing::ValuesIn(input_precisions)), DynamicUnfusions::getTestCaseName); } // namespace diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp index fe923135550e5b..89612039fb788f 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp @@ -351,6 +351,22 @@ std::vector get_test_params() { p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); p.push_back({with_rearrange, with_mask, !with_scale, !causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); + + /* -- causal mask -- */ + + p.push_back({with_rearrange, !with_mask, !with_scale, causal, !compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + p.push_back({!with_rearrange, with_mask, !with_scale, causal, !compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + + // Beam search + p.push_back({with_rearrange, !with_mask, !with_scale, causal, !compressed, 2, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, !with_mask, !with_scale, causal, !compressed, 4, ov::element::Type_t::f16, 5, 16, 1, {0, 2, 1, 3}}); + + // Compressed + p.push_back({with_rearrange, with_mask, !with_scale, causal, compressed, 1, ov::element::Type_t::f16, 10, 1, 1, {0, 1, 2, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 2, 1, 3}}); + p.push_back({with_rearrange, with_mask, !with_scale, causal, compressed, 1, ov::element::Type_t::f16, 10, 4, 1, {0, 1, 2, 3}}); return p; } diff --git a/src/plugins/intel_gpu/tests/unit/fusions/mvn_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/mvn_fusion_test.cpp index 24aa5d31e1d76e..723103c4b5d4b6 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/mvn_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/mvn_fusion_test.cpp @@ -170,7 +170,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_quantize_i8, ::testing::ValuesIn mvn_test_params{ CASE_MVN_I8_2, 2, 2, 4 }, mvn_test_params{ CASE_MVN_I8_3, 2, 2, 4 }, mvn_test_params{ CASE_MVN_I8_4, 2, 
2, 4 }, - mvn_test_params{ CASE_MVN_I8_8, 3, 3, 4 }, + // mvn_test_params{ CASE_MVN_I8_8, 3, 3, 4 }, // TODO: It will be fix soon, test reference is wrong in new driver. mvn_test_params{ CASE_MVN_3D_I8_1, 2, 2, 4 }, mvn_test_params{ CASE_MVN_3D_I8_2, 2, 2, 4 }, mvn_test_params{ CASE_MVN_U8_1, 2, 2, 4 }, @@ -221,7 +221,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_activation_eltwise_fp32_quantize mvn_test_params{ CASE_MVN_I8_5, 2, 4, 6 }, mvn_test_params{ CASE_MVN_I8_6, 2, 4, 6 }, mvn_test_params{ CASE_MVN_I8_7, 3, 4, 6 }, - mvn_test_params{ CASE_MVN_I8_8, 3, 5, 6 }, + // mvn_test_params{ CASE_MVN_I8_8, 3, 5, 6 }, // TODO: It will be fix soon, test reference is wrong in new driver. mvn_test_params{ CASE_MVN_3D_I8_1, 2, 4, 6 }, mvn_test_params{ CASE_MVN_3D_I8_2, 2, 4, 6 }, mvn_test_params{ CASE_MVN_3D_I8_3, 2, 4, 6 }, diff --git a/src/plugins/intel_gpu/tests/unit/passes/mark_state_init_subgraphs_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/mark_state_init_subgraphs_test.cpp new file mode 100644 index 00000000000000..81f2f39765879c --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/passes/mark_state_init_subgraphs_test.cpp @@ -0,0 +1,249 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include "intel_gpu/runtime/engine.hpp" + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "activation_inst.h" +#include "data_inst.h" +#include "eltwise_inst.h" +#include "gemm_inst.h" +#include "fully_connected_inst.h" +#include "read_value_inst.h" +#include "reorder_inst.h" +#include "reshape_inst.h" +#include "permute_inst.h" + +#include "program_wrapper.h" + +using namespace cldnn; +using namespace ::tests; + +class mark_state_init_subgraphs_test: public ::testing::Test { +public: + static bool check_subgraph(const program_node& node, program& program, std::vector expected_subgraph) { + const auto& variable_id = node.as().get_primitive()->variable_id; + if (!program.contains_state(variable_id)) + return false; + + auto& state_initializers = program.get_initializers(variable_id); + if (state_initializers.size() != expected_subgraph.size()) + return false; + + for (auto& pid : expected_subgraph) { + if (std::find(state_initializers.begin(), state_initializers.end(), pid) == state_initializers.end()) + return false; + } + return true; + } + + void test_cross_attn_key_state_init_subgraphs(bool is_caching_test) { + auto& engine = get_test_engine(); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + auto input_k_layout_dynamic = layout{ov::PartialShape{-1, -1, 512}, data_types::f16, format::bfyx}; + auto input_q_layout_dynamic = layout{ov::PartialShape{-1, 8, -1, 64}, data_types::f16, format::bfyx}; + auto weights = engine.allocate_memory({ ov::PartialShape{512, 512}, data_types::f32, format::bfyx }); + ov::op::util::VariableInfo info{ov::PartialShape{-1, 8, -1, 64}, data_types::f16, "v0"}; + auto kv_layout = layout{info.data_shape, info.data_type, format::bfyx}; + activation_additional_params params = {-65504, 65504}; + + topology topology; + topology.add(input_layout("input_k", input_k_layout_dynamic)); + topology.add(input_layout("input_q", input_q_layout_dynamic)); + topology.add(data("weights", weights)); + topology.add(reorder("convert", + input_info("weights"), + format::any, + data_types::f16, + std::vector(), + reorder_mean_mode::subtract, + 
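+                            // weights are reordered from f32 to f16 before feeding the
+                            // fully_connected that produces the key projection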
padding(), + true)); + topology.add(fully_connected("fc", input_info("input_k"), { "convert" }, "", data_types::f16, 3, 2)); + topology.add(reshape("reshape", + input_info("fc"), + true, + {0, 0, 8, 64}, + ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension::dynamic(), 8, 64}, + reshape::reshape_mode::base)); + topology.add(permute("transpose", input_info("reshape"), {0, 2, 1, 3})); + topology.add(read_value("read_value", {input_info("transpose")}, info.variable_id, {kv_layout}, data_types::f32)); + topology.add(gemm("gemm", {input_info("input_q"), input_info("read_value")}, data_types::f16, {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 1, 2, 3}, 1.0f, 0.0f)); + topology.add(activation("clamp", input_info("gemm"), activation_func::clamp, params)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + auto prog = network->get_program(); + ASSERT_NE(prog, nullptr); + + ASSERT_TRUE(check_subgraph(prog->get_node("read_value"), *prog, {"transpose", "reshape", "fc", "input_k", "convert"})); + } + + void test_cross_attn_value_state_init_subgraphs(bool is_caching_test) { + auto& engine = get_test_engine(); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + auto input_v_layout_dynamic = layout{ov::PartialShape{-1, -1, 512}, data_types::f16, format::bfyx}; + auto input_qk_layout_dynamic = layout{ov::PartialShape{-1, 8, -1, -1}, data_types::f16, format::bfyx}; + auto weights = engine.allocate_memory({ ov::PartialShape{512, 512}, data_types::f32, format::bfyx }); + auto add_data = engine.allocate_memory({ ov::PartialShape{1, 1, 512}, data_types::f16, format::bfyx }); + ov::op::util::VariableInfo info{ov::PartialShape{-1, 8, -1, 64}, data_types::f16, "v1"}; + auto kv_layout = layout{info.data_shape, info.data_type, format::bfyx}; + + topology topology; + topology.add(input_layout("input_v", input_v_layout_dynamic)); + topology.add(input_layout("input_qk", input_qk_layout_dynamic)); + topology.add(data("weights", weights)); + topology.add(data("add_data", add_data)); + topology.add(reorder("convert", + input_info("weights"), + format::any, + data_types::f16, + std::vector(), + reorder_mean_mode::subtract, + padding(), + true)); + topology.add(fully_connected("fc", input_info("input_v"), { "convert" }, "", data_types::f16, 3, 2)); + topology.add(eltwise("add", + {input_info("fc"), input_info("add_data")}, + eltwise_mode::sum, + std::vector{}, + data_types::f16, + ov::op::AutoBroadcastType::NUMPY, + true)); + topology.add(reshape("reshape1", + input_info("add"), + true, + {0, 0, 8, 64}, + ov::PartialShape{-1, -1, 8, 64}, + reshape::reshape_mode::base)); + topology.add(permute("transpose", input_info("reshape1"), {0, 2, 1, 3})); + topology.add(read_value("read_value", {input_info("transpose")}, info.variable_id, {kv_layout}, data_types::f32)); + topology.add(gemm("gemm", {input_info("input_qk"), input_info("read_value")}, data_types::f16, {0, 1, 2, 3}, {0, 1, 2, 3}, {0, 2, 1, 3}, 1.0f, 0.0f)); + topology.add(reshape("reshape2", + input_info("gemm"), + true, + {0, 0, 512}, + ov::PartialShape{-1, -1, 512}, + reshape::reshape_mode::base)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + auto prog = network->get_program(); + ASSERT_NE(prog, nullptr); + + ASSERT_TRUE(check_subgraph(prog->get_node("read_value"), *prog, {"transpose", "reshape1", "fc", "input_v", 
"convert", "add_data"})); + } + + void test_cross_attn_multiple_state_init_subgraphs(bool is_caching_test) { + auto& engine = get_test_engine(); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + auto param_layout_dynamic = layout{ov::PartialShape{-1, -1, 512}, data_types::f16, format::bfyx}; + auto input_q_layout_dynamic = layout{ov::PartialShape{-1, 8, -1, 64}, data_types::f16, format::bfyx}; + auto weights = engine.allocate_memory({ ov::PartialShape{512, 512}, data_types::f32, format::bfyx }); + auto add_data = engine.allocate_memory({ ov::PartialShape{1, 1, 512}, data_types::f16, format::bfyx }); + auto kv_layout = layout{ov::PartialShape{-1, 8, -1, 64}, data_types::f16, format::bfyx}; + activation_additional_params act_params = {-65504, 65504}; + + topology topology; + topology.add(input_layout("param", param_layout_dynamic)); + topology.add(input_layout("input_q", input_q_layout_dynamic)); + topology.add(data("weights_k_proj", weights)); + topology.add(data("weights_v_proj", weights)); + topology.add(data("add_data", add_data)); + topology.add(reorder("convert_k_proj", + input_info("weights_k_proj"), + format::any, + data_types::f16, + std::vector(), + reorder_mean_mode::subtract, + padding(), + true)); + topology.add(reorder("convert_v_proj", + input_info("weights_v_proj"), + format::any, + data_types::f16, + std::vector(), + reorder_mean_mode::subtract, + padding(), + true)); + topology.add(fully_connected("fc_k_proj", input_info("param"), { "convert_k_proj" }, "", data_types::f16, 3, 2)); + topology.add(reshape("reshape_k_proj", + input_info("fc_k_proj"), + true, + {0, 0, 8, 64}, + ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension::dynamic(), 8, 64}, + reshape::reshape_mode::base)); + topology.add(permute("transpose_k_proj", input_info("reshape_k_proj"), {0, 2, 1, 3})); + topology.add(read_value("read_value_1", {input_info("transpose_k_proj")}, "v1", {kv_layout}, data_types::f32)); + topology.add(gemm("gemm_k_proj", {input_info("input_q"), input_info("read_value_1")}, data_types::f16, {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 1, 2, 3}, 1.0f, 0.0f)); + topology.add(activation("clamp", input_info("gemm_k_proj"), activation_func::clamp, act_params)); + topology.add(fully_connected("fc_v_proj", input_info("param"), { "convert_v_proj" }, "", data_types::f16, 3, 2)); + topology.add(eltwise("add_v_proj", + {input_info("fc_v_proj"), input_info("add_data")}, + eltwise_mode::sum, + std::vector{}, + data_types::f16, + ov::op::AutoBroadcastType::NUMPY, + true)); + topology.add(reshape("reshape_v_proj", + input_info("add_v_proj"), + true, + {0, 0, 8, 64}, + ov::PartialShape{-1, -1, 8, 64}, + reshape::reshape_mode::base)); + topology.add(permute("transpose_v_proj", input_info("reshape_v_proj"), {0, 2, 1, 3})); + topology.add(read_value("read_value_2", {input_info("transpose_v_proj")}, "v2", {kv_layout}, data_types::f32)); + topology.add(gemm("gemm_qkv", {input_info("clamp"), input_info("read_value_2")}, data_types::f16, {0, 1, 2, 3}, {0, 1, 2, 3}, {0, 2, 1, 3}, 1.0f, 0.0f)); + topology.add(reshape("reshape_qkv", + input_info("gemm_qkv"), + true, + {0, 0, 512}, + ov::PartialShape{-1, -1, 512}, + reshape::reshape_mode::base)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + auto prog = network->get_program(); + ASSERT_NE(prog, nullptr); + + ASSERT_TRUE(check_subgraph(prog->get_node("read_value_1"), 
*prog, {"transpose_k_proj", "reshape_k_proj", "fc_k_proj", "convert_k_proj"})); + ASSERT_TRUE(check_subgraph(prog->get_node("read_value_2"), *prog, {"transpose_v_proj", "reshape_v_proj", "fc_v_proj", "convert_v_proj", "add_data"})); + } +}; + +TEST_F(mark_state_init_subgraphs_test, cross_attn_key_state_init_subgraphs) { + this->test_cross_attn_key_state_init_subgraphs(false); +} + +TEST_F(mark_state_init_subgraphs_test, cross_attn_value_state_init_subgraphs) { + this->test_cross_attn_value_state_init_subgraphs(false); +} + +TEST_F(mark_state_init_subgraphs_test, cross_attn_multiple_state_init_subgraphs) { + this->test_cross_attn_multiple_state_init_subgraphs(false); +} + +TEST_F(mark_state_init_subgraphs_test, cross_attn_key_state_init_subgraphs_cached) { + this->test_cross_attn_key_state_init_subgraphs(true); +} + +TEST_F(mark_state_init_subgraphs_test, cross_attn_value_state_init_subgraphs_cached) { + this->test_cross_attn_value_state_init_subgraphs(true); +} + +TEST_F(mark_state_init_subgraphs_test, cross_attn_multiple_state_init_subgraphs_cached) { + this->test_cross_attn_multiple_state_init_subgraphs(true); +} diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 13934020bfdf66..340a49a14caa49 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -7684,7 +7684,9 @@ INSTANTIATE_TEST_SUITE_P(convolution_grouped_fsv4_fsv16, TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, false, false, format::b_fs_yx_fsv4, ""), TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, false, true, false, format::b_fs_yx_fsv4, ""), TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, false, format::b_fs_yx_fsv4, ""), - TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, true, false, format::b_fs_yx_fsv4, ""), + + // TODO: It will be fix soon, test reference is wrong in new driver. + // TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, true, false, format::b_fs_yx_fsv4, ""), TestParamType_grouped_convolution_gpu(3, 1, 1, 80, 252, 3, 1, 1, 4, 1, 1, true, false, true, format::b_fs_yx_fsv4, ""), // Format: b_fs_yx_fsv16 diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp index 385226ad7eb11f..54ed130bc6a3cc 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp @@ -63,34 +63,32 @@ struct reduce_accumulator { }; AccT accumulate(AccT& acc, AccT& input_val, cldnn::reduce_mode reduce_mode, bool sum_only) { - if (reduce_mode == cldnn::reduce_mode::sum || reduce_mode == cldnn::reduce_mode::mean || - reduce_mode == cldnn::reduce_mode::log_sum) - acc += input_val; - else if (reduce_mode == cldnn::reduce_mode::max) + if (reduce_mode == cldnn::reduce_mode::max) { acc = input_val > acc ? input_val : acc; - else if (reduce_mode == cldnn::reduce_mode::min) + } else if (reduce_mode == cldnn::reduce_mode::sum || reduce_mode == cldnn::reduce_mode::mean || + reduce_mode == cldnn::reduce_mode::log_sum) { + acc += input_val; + } else if (reduce_mode == cldnn::reduce_mode::min) { acc = input_val < acc ? 
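            // min keeps the smaller of the running accumulator and the incoming value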
input_val : acc; - else if (reduce_mode == cldnn::reduce_mode::prod) + } else if (reduce_mode == cldnn::reduce_mode::prod) { acc = acc * input_val; - else if (reduce_mode == cldnn::reduce_mode::logical_and) + } else if (reduce_mode == cldnn::reduce_mode::logical_and) { acc = acc && input_val; - else if (reduce_mode == cldnn::reduce_mode::logical_or) + } else if (reduce_mode == cldnn::reduce_mode::logical_or) { acc = acc || input_val; - else if (reduce_mode == cldnn::reduce_mode::sum_square) { + } else if (reduce_mode == cldnn::reduce_mode::sum_square) { if (sum_only) acc += input_val; else acc += input_val * input_val; - } - else if (reduce_mode == cldnn::reduce_mode::l1) + } else if (reduce_mode == cldnn::reduce_mode::l1) { acc += abs(input_val); - else if (reduce_mode == cldnn::reduce_mode::l2) { + } else if (reduce_mode == cldnn::reduce_mode::l2) { if (sum_only) acc += input_val; else acc += input_val * input_val; - } - else if (reduce_mode == cldnn::reduce_mode::log_sum_exp) { + } else if (reduce_mode == cldnn::reduce_mode::log_sum_exp) { if (sum_only) acc += input_val; else diff --git a/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp b/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp new file mode 100644 index 00000000000000..38d2365c9e0545 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/lora_horizontal_fusion.cpp @@ -0,0 +1,569 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/opsets/opset6.hpp" + +#include "plugin/transformations/lora_horizontal_fusion.hpp" +#include "intel_gpu/op/placeholder.hpp" +#include "intel_gpu/op/fully_connected_compressed.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, LoRAHorizontalFusion_default) { + ov::element::Type model_dt = ov::element::f16; + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2560, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2560, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {2048, 256, 256}); + auto split = std::make_shared(fc_fused, axis_const, split_const); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_b_0 = std::make_shared(variable_b_0); + auto matmul1_0 = std::make_shared(lora_input, read_value_a_0, false, true); + auto multiply_0 = std::make_shared(matmul1_0, read_value_alpha_0); + auto matmul2_0 = std::make_shared(multiply_0, read_value_b_0, false, true); + auto add_0 = std::make_shared(split->output(0), matmul2_0); + + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, 
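+        // Second LoRA branch: its own A/alpha/B states feed MatMul -> Multiply -> MatMul,
+        // and the result is added to the next VariadicSplit output.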
"var_a_1"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto matmul1_1 = std::make_shared(lora_input, read_value_a_1, false, true); + auto multiply_1 = std::make_shared(matmul1_1, read_value_alpha_1); + auto matmul2_1 = std::make_shared(multiply_1, read_value_b_1, false, true); + auto add_1 = std::make_shared(split->output(1), matmul2_1); + + auto variable_a_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"}); + auto variable_alpha_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"}); + auto variable_b_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_2"}); + auto read_value_a_2 = std::make_shared(variable_a_2); + auto read_value_alpha_2 = std::make_shared(variable_alpha_2); + auto read_value_b_2 = std::make_shared(variable_b_2); + auto matmul1_2 = std::make_shared(lora_input, read_value_a_2, false, true); + auto multiply_2 = std::make_shared(matmul1_2, read_value_alpha_2); + auto matmul2_2 = std::make_shared(multiply_2, read_value_b_2, false, true); + auto add_2 = std::make_shared(split->output(2), matmul2_2); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape_pattern2 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(add_0, reshape_pattern0, true); + auto reshape1 = std::make_shared(add_1, reshape_pattern1, true); + auto reshape2 = std::make_shared(add_2, reshape_pattern2, true); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + + model = std::make_shared(ov::NodeVector{result0, result1, result2}, ov::ParameterVector{lora_input}); + manager.register_pass(); + } + + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2560, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2560, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + auto variable_a_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"}); + + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_a_2 = std::make_shared(variable_a_2); + auto concat_variable_a = std::make_shared(NodeVector{read_value_a_0, read_value_a_1, read_value_a_2}, 0); + + auto fused_matmul1 = std::make_shared(lora_input, concat_variable_a, false, true); + + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto 
variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_alpha_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"}); + + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_alpha_2 = std::make_shared(variable_alpha_2); + auto concat_variable_alpha = std::make_shared(NodeVector{read_value_alpha_0, read_value_alpha_1, read_value_alpha_2}, 1); + + auto multiply = std::make_shared(fused_matmul1, concat_variable_alpha); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); + auto split = std::make_shared(multiply, split_axis, 3); + + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto variable_b_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_2"}); + + auto read_value_b_0 = std::make_shared(variable_b_0); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto read_value_b_2 = std::make_shared(variable_b_2); + + auto matmul2_0 = std::make_shared(split->output(0), read_value_b_0, false, true); + auto matmul2_1 = std::make_shared(split->output(1), read_value_b_1, false, true); + auto matmul2_2 = std::make_shared(split->output(2), read_value_b_2, false, true); + + auto concat_matmul2 = std::make_shared(NodeVector{matmul2_0, matmul2_1, matmul2_2}, 2); + + auto add = std::make_shared(fc_fused, concat_matmul2); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {2048, 256, 256}); + auto var_split = std::make_shared(add, axis_const, split_const); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape_pattern2 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(var_split->output(0), reshape_pattern0, true); + auto reshape1 = std::make_shared(var_split->output(1), reshape_pattern1, true); + auto reshape2 = std::make_shared(var_split->output(2), reshape_pattern2, true); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + + model_ref = std::make_shared(ov::NodeVector{result0, result1, result2}, ov::ParameterVector{lora_input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, LoRAHorizontalFusion_swap_add_and_multiply_inputs) { + ov::element::Type model_dt = ov::element::f16; + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2560, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2560, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {2048, 256, 256}); + auto split = std::make_shared(fc_fused, axis_const, 
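+        // The fused FC output is split into {2048, 256, 256} slices along the last axis;
+        // each slice receives its own LoRA correction below, with Multiply and Add inputs
+        // swapped relative to the default test.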
split_const); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_b_0 = std::make_shared(variable_b_0); + auto matmul1_0 = std::make_shared(lora_input, read_value_a_0, false, true); + auto multiply_0 = std::make_shared(read_value_alpha_0, matmul1_0); + auto matmul2_0 = std::make_shared(multiply_0, read_value_b_0, false, true); + auto add_0 = std::make_shared(matmul2_0, split->output(0)); + + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto matmul1_1 = std::make_shared(lora_input, read_value_a_1, false, true); + auto multiply_1 = std::make_shared(read_value_alpha_1, matmul1_1); + auto matmul2_1 = std::make_shared(multiply_1, read_value_b_1, false, true); + auto add_1 = std::make_shared(matmul2_1, split->output(1)); + + auto variable_a_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"}); + auto variable_alpha_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"}); + auto variable_b_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_2"}); + auto read_value_a_2 = std::make_shared(variable_a_2); + auto read_value_alpha_2 = std::make_shared(variable_alpha_2); + auto read_value_b_2 = std::make_shared(variable_b_2); + auto matmul1_2 = std::make_shared(lora_input, read_value_a_2, false, true); + auto multiply_2 = std::make_shared(read_value_alpha_2, matmul1_2); + auto matmul2_2 = std::make_shared(multiply_2, read_value_b_2, false, true); + auto add_2 = std::make_shared(matmul2_2, split->output(2)); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape_pattern2 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(add_0, reshape_pattern0, true); + auto reshape1 = std::make_shared(add_1, reshape_pattern1, true); + auto reshape2 = std::make_shared(add_2, reshape_pattern2, true); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + + model = std::make_shared(ov::NodeVector{result0, result1, result2}, ov::ParameterVector{lora_input}); + manager.register_pass(); + } + + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2560, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, 
ov::Shape{2560, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + auto variable_a_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_2"}); + + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_a_2 = std::make_shared(variable_a_2); + auto concat_variable_a = std::make_shared(NodeVector{read_value_a_0, read_value_a_1, read_value_a_2}, 0); + + auto fused_matmul1 = std::make_shared(lora_input, concat_variable_a, false, true); + + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_alpha_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_2"}); + + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_alpha_2 = std::make_shared(variable_alpha_2); + auto concat_variable_alpha = std::make_shared(NodeVector{read_value_alpha_0, read_value_alpha_1, read_value_alpha_2}, 1); + + auto multiply = std::make_shared(fused_matmul1, concat_variable_alpha); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); + auto split = std::make_shared(multiply, split_axis, 3); + + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto variable_b_2 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_2"}); + + auto read_value_b_0 = std::make_shared(variable_b_0); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto read_value_b_2 = std::make_shared(variable_b_2); + + auto matmul2_0 = std::make_shared(split->output(0), read_value_b_0, false, true); + auto matmul2_1 = std::make_shared(split->output(1), read_value_b_1, false, true); + auto matmul2_2 = std::make_shared(split->output(2), read_value_b_2, false, true); + + auto concat_matmul2 = std::make_shared(NodeVector{matmul2_0, matmul2_1, matmul2_2}, 2); + + auto add = std::make_shared(fc_fused, concat_matmul2); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {2048, 256, 256}); + auto var_split = std::make_shared(add, axis_const, split_const); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape_pattern2 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(var_split->output(0), reshape_pattern0, true); + auto reshape1 = std::make_shared(var_split->output(1), reshape_pattern1, true); + auto reshape2 = std::make_shared(var_split->output(2), reshape_pattern2, true); + + auto result0 = std::make_shared(reshape0); + auto 
result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + + model_ref = std::make_shared(ov::NodeVector{result0, result1, result2}, ov::ParameterVector{lora_input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, LoRAHorizontalFusion_split_two_outputs) { + ov::element::Type model_dt = ov::element::f16; + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2304, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2304, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256}); + auto split = std::make_shared(fc_fused, axis_const, split_const); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_b_0 = std::make_shared(variable_b_0); + auto matmul1_0 = std::make_shared(lora_input, read_value_a_0, false, true); + auto multiply_0 = std::make_shared(matmul1_0, read_value_alpha_0); + auto matmul2_0 = std::make_shared(multiply_0, read_value_b_0, false, true); + auto add_0 = std::make_shared(split->output(0), matmul2_0); + + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto matmul1_1 = std::make_shared(lora_input, read_value_a_1, false, true); + auto multiply_1 = std::make_shared(matmul1_1, read_value_alpha_1); + auto matmul2_1 = std::make_shared(multiply_1, read_value_b_1, false, true); + auto add_1 = std::make_shared(split->output(1), matmul2_1); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(add_0, reshape_pattern0, true); + auto reshape1 = std::make_shared(add_1, reshape_pattern1, true); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + + model = std::make_shared(ov::NodeVector{result0, result1}, ov::ParameterVector{lora_input}); + manager.register_pass(); + } + + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2304, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2304, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto variable_a_0 = 
std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto concat_variable_a = std::make_shared(NodeVector{read_value_a_0, read_value_a_1}, 0); + + auto fused_matmul1 = std::make_shared(lora_input, concat_variable_a, false, true); + + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto concat_variable_alpha = std::make_shared(NodeVector{read_value_alpha_0, read_value_alpha_1}, 1); + + auto multiply = std::make_shared(fused_matmul1, concat_variable_alpha); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); + auto split = std::make_shared(multiply, split_axis, 2); + + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + + auto read_value_b_0 = std::make_shared(variable_b_0); + auto read_value_b_1 = std::make_shared(variable_b_1); + + auto matmul2_0 = std::make_shared(split->output(0), read_value_b_0, false, true); + auto matmul2_1 = std::make_shared(split->output(1), read_value_b_1, false, true); + + auto concat_matmul2 = std::make_shared(NodeVector{matmul2_0, matmul2_1}, 2); + + auto add = std::make_shared(fc_fused, concat_matmul2); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256}); + auto var_split = std::make_shared(add, axis_const, split_const); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(var_split->output(0), reshape_pattern0, true); + auto reshape1 = std::make_shared(var_split->output(1), reshape_pattern1, true); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + + model_ref = std::make_shared(ov::NodeVector{result0, result1}, ov::ParameterVector{lora_input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, LoRAHorizontalFusion_multiple_split_output_users) { + ov::element::Type model_dt = ov::element::f16; + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2304, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2304, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256}); + auto split = std::make_shared(fc_fused, axis_const, split_const); + + auto variable_a_0 = std::make_shared( + 
ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_0"}); + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_b_0 = std::make_shared(variable_b_0); + auto matmul1_0 = std::make_shared(lora_input, read_value_a_0, false, true); + auto multiply_0 = std::make_shared(matmul1_0, read_value_alpha_0); + auto matmul2_0 = std::make_shared(multiply_0, read_value_b_0, false, true); + auto add_0 = std::make_shared(split->output(0), matmul2_0); + + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto read_value_b_1 = std::make_shared(variable_b_1); + auto matmul1_1 = std::make_shared(lora_input, read_value_a_1, false, true); + auto multiply_1 = std::make_shared(matmul1_1, read_value_alpha_1); + auto matmul2_1 = std::make_shared(multiply_1, read_value_b_1, false, true); + auto add_1 = std::make_shared(split->output(1), matmul2_1); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(add_0, reshape_pattern0, true); + auto reshape1 = std::make_shared(add_1, reshape_pattern1, true); + + auto shape_of0 = std::make_shared(add_0); + auto shape_of1 = std::make_shared(add_0); + auto shape_of2 = std::make_shared(add_1); + auto shape_of3 = std::make_shared(add_1); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(shape_of0); + auto result3 = std::make_shared(shape_of1); + auto result4 = std::make_shared(shape_of2); + auto result5 = std::make_shared(shape_of3); + + model = std::make_shared(ov::NodeVector{result0, result1, result2, result3, result4, result5}, ov::ParameterVector{lora_input}); + manager.register_pass(); + } + + { + auto lora_input = std::make_shared(model_dt, ov::PartialShape{-1, -1, 2048}); + auto weights = std::make_shared(ov::element::u8, ov::Shape{2304, 2048}); + auto bias = std::make_shared(); + auto scale = std::make_shared(model_dt, ov::Shape{2304, 1}); + auto fc_fused = std::make_shared(lora_input, weights, bias, scale); + + auto variable_a_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_0"}); + auto variable_a_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, 2048}), model_dt, "var_a_1"}); + + auto read_value_a_0 = std::make_shared(variable_a_0); + auto read_value_a_1 = std::make_shared(variable_a_1); + auto concat_variable_a = std::make_shared(NodeVector{read_value_a_0, read_value_a_1}, 0); + + auto fused_matmul1 = std::make_shared(lora_input, concat_variable_a, false, true); + + auto variable_alpha_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), 
model_dt, "var_alpha_0"}); + auto variable_alpha_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), model_dt, "var_alpha_1"}); + + auto read_value_alpha_0 = std::make_shared(variable_alpha_0); + auto read_value_alpha_1 = std::make_shared(variable_alpha_1); + auto concat_variable_alpha = std::make_shared(NodeVector{read_value_alpha_0, read_value_alpha_1}, 1); + + auto multiply = std::make_shared(fused_matmul1, concat_variable_alpha); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); + auto split = std::make_shared(multiply, split_axis, 2); + + auto variable_b_0 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({2048, -1}), model_dt, "var_b_0"}); + auto variable_b_1 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({256, -1}), model_dt, "var_b_1"}); + + auto read_value_b_0 = std::make_shared(variable_b_0); + auto read_value_b_1 = std::make_shared(variable_b_1); + + auto matmul2_0 = std::make_shared(split->output(0), read_value_b_0, false, true); + auto matmul2_1 = std::make_shared(split->output(1), read_value_b_1, false, true); + + auto concat_matmul2 = std::make_shared(NodeVector{matmul2_0, matmul2_1}, 2); + + auto add = std::make_shared(fc_fused, concat_matmul2); + + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}); + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {2048, 256}); + auto var_split = std::make_shared(add, axis_const, split_const); + + auto reshape_pattern0 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 32, 64}); + auto reshape_pattern1 = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{0, 0, 4, 64}); + auto reshape0 = std::make_shared(var_split->output(0), reshape_pattern0, true); + auto reshape1 = std::make_shared(var_split->output(1), reshape_pattern1, true); + + auto shape_of0 = std::make_shared(var_split->output(0)); + auto shape_of1 = std::make_shared(var_split->output(0)); + auto shape_of2 = std::make_shared(var_split->output(1)); + auto shape_of3 = std::make_shared(var_split->output(1)); + + auto result0 = std::make_shared(reshape0); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(shape_of0); + auto result3 = std::make_shared(shape_of1); + auto result4 = std::make_shared(shape_of2); + auto result5 = std::make_shared(shape_of3); + + model_ref = std::make_shared(ov::NodeVector{result0, result1, result2, result3, result4, result5}, ov::ParameterVector{lora_input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp index a274c8d1c1cae6..371091b6bd1f3d 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp @@ -423,6 +423,8 @@ class Config final { std::string toString() const; + void fromString(const std::string& str); + private: std::shared_ptr _desc; ImplMap _impl; diff --git a/src/plugins/intel_npu/src/al/src/config/config.cpp b/src/plugins/intel_npu/src/al/src/config/config.cpp index a4e2b515b8e3f6..c9c26451d6f7d5 100644 --- a/src/plugins/intel_npu/src/al/src/config/config.cpp +++ b/src/plugins/intel_npu/src/al/src/config/config.cpp @@ -244,6 +244,31 @@ std::string Config::toString() const { return resultStream.str(); } +void 
Config::fromString(const std::string& str) { + std::map config; + std::string str_cfg(str); + + auto parse_token = [&](const std::string& token) { + auto pos_eq = token.find('='); + auto key = token.substr(0, pos_eq); + auto value = token.substr(pos_eq + 2, token.size() - pos_eq - 3); + config[key] = value; + }; + + size_t pos = 0; + std::string token, key, value; + while ((pos = str_cfg.find(' ')) != std::string::npos) { + token = str_cfg.substr(0, pos); + parse_token(token); + str_cfg.erase(0, pos + 1); + } + + // Process tail + parse_token(str_cfg); + + update(config); +} + // // envVarStrToBool // diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index a85277b636b2e6..66e1e8e55fde2a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "compiled_model.hpp" @@ -21,6 +21,7 @@ #include "openvino/util/common_util.hpp" #include "partitioning/patterns/opt.hpp" #include "plugin.hpp" +#include "serialization.hpp" #include "unfold_sync_infer_request.hpp" #include "util.hpp" @@ -486,6 +487,222 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, report_io(); } +ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const bool serialized) + : ov::npuw::ICompiledModel(model, plugin), + m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), + m_cfg(m_options_desc), + m_name(model->get_friendly_name()), + m_loaded_from_cache(serialized) { + ::intel_npu::registerNPUWOptions(*m_options_desc); + NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!"); + LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow..."); +} + +void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const { + using namespace ov::npuw::s11n; + + LOG_DEBUG("Serializing CompiledModelDesc..."); + LOG_BLOCK(); + + write(stream, replaced_by); + + write(stream, param_base); + write(stream, forced_to_fcall); + + write(stream, host_gather.dst_idx); + write(stream, host_gather.src_idx); + write(stream, host_gather.idx_idx); + + write(stream, spatial); + + write(stream, scales); + write(stream, zerops); + write(stream, is_remote); + + // NOTE: for closure only serialize uids - full flow + write(stream, closure_uid); + + // Some tensors might be present in CPU closure already - need to serialize as is + // FIXME: When weightless serialization is introduced, this should be handled differently + write(stream, closure.size()); + std::vector cpu_closures; + std::vector cpu_closure_ids; + for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) { + if (closure_uid[cidx] == -1) { // CPU closure, not in the bank + cpu_closure_ids.push_back(cidx); + cpu_closures.push_back(closure[cidx]); + } + } + + write(stream, cpu_closure_ids); + + for (const auto& tensor : cpu_closures) { + write(stream, tensor); + } + + // FIXME: support weightless flow! 
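For reference, the `Config::fromString` helper added in config.cpp above assumes the exact layout produced by `Config::toString`: space-separated `KEY="VALUE"` tokens, which is why the substring arithmetic uses `pos_eq + 2` and `token.size() - pos_eq - 3` to strip the leading `="` and the trailing quote. Below is a minimal standalone sketch of that parsing, for illustration only; it is not part of the patch and the option keys used in the demo are made up.

    #include <iostream>
    #include <map>
    #include <string>

    // Parse a space-separated list of KEY="VALUE" tokens, the format emitted by
    // Config::toString(). Mirrors the parse_token lambda from the diff above.
    std::map<std::string, std::string> parse_config_string(const std::string& str) {
        std::map<std::string, std::string> out;
        auto take = [&](const std::string& token) {
            const auto eq = token.find('=');
            if (eq == std::string::npos || token.size() < eq + 3) {
                return;  // malformed token - ignored in this sketch
            }
            const auto key = token.substr(0, eq);
            // Drop the `="` prefix and the closing quote around the value.
            const auto value = token.substr(eq + 2, token.size() - eq - 3);
            out[key] = value;
        };
        std::string rest = str;
        std::size_t pos = 0;
        while ((pos = rest.find(' ')) != std::string::npos) {
            take(rest.substr(0, pos));
            rest.erase(0, pos + 1);
        }
        take(rest);  // tail token has no trailing space
        return out;
    }

    int main() {
        // Hypothetical keys, used only to illustrate the expected wire format.
        const auto cfg = parse_config_string("LOG_LEVEL=\"LOG_INFO\" NPUW_DEVICES=\"NPU,CPU\"");
        for (const auto& kv : cfg) {
            std::cout << kv.first << " -> " << kv.second << '\n';
        }
    }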
+ + LOG_DEBUG("DONE."); +} + +void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) { + using namespace ov::npuw::s11n; + + LOG_DEBUG("Deserializing CompiledModelDesc..."); + LOG_BLOCK(); + + read(stream, replaced_by); + + read(stream, param_base); + read(stream, forced_to_fcall); + + read(stream, host_gather.dst_idx); + read(stream, host_gather.src_idx); + read(stream, host_gather.idx_idx); + + read(stream, spatial); + + read(stream, scales); + read(stream, zerops); + read(stream, is_remote); + + // NOTE: for closure only deserialize uids - full flow + read(stream, closure_uid); + + // Some tensors might be present in CPU closure already - need to deserialize as is + // FIXME: When weightless serialization is introduced, this should be handled differently + std::size_t closure_size = 0; + read(stream, closure_size); + std::vector cpu_closure_ids; + read(stream, cpu_closure_ids); + closure.resize(closure_size); + for (const auto& cidx : cpu_closure_ids) { + read(stream, closure[cidx]); + } + + // FIXME: support weightless flow! + + LOG_DEBUG("DONE."); +} + +void ov::npuw::CompiledModel::serialize(std::ostream& stream) const { + LOG_INFO("Serializing CompiledModel..."); + LOG_BLOCK(); + + using namespace ov::npuw::s11n; + + // Serialize name + write(stream, m_name); + + // Serialize inputs and outputs + write(stream, inputs()); + write(stream, outputs()); + + // Serialize meta + write(stream, m_inputs_to_submodels_inputs); + write(stream, m_outputs_to_submodels_outputs); + write(stream, m_param_subscribers); + write(stream, m_submodels_input_to_prev_output); + + // Write device list + write(stream, m_dev_list); + + // Write config + write(stream, m_cfg); + + // Serialize compiled submodels + write(stream, m_compiled_submodels.size()); + for (const auto& subm : m_compiled_submodels) { + // Write device idx + std::size_t device_idx = subm.device_it - m_dev_list.begin(); + write(stream, device_idx); + // Write ICompiledModel if it's there + if (subm.compiled_model) { + write(stream, true); + // FIXME: workaround for import/export model since import model seem to reset the file pointer + std::stringstream ss; + subm.compiled_model->export_model(ss); + write(stream, ss.str()); + } else { + write(stream, false); + } + // Write the rest of the submodel desc + subm.serialize(stream); + } + + LOG_INFO("Done."); +} + +std::shared_ptr ov::npuw::CompiledModel::deserialize( + std::istream& stream, + const std::shared_ptr& plugin) { + LOG_INFO("Deserializing CompiledModel..."); + LOG_BLOCK(); + + using namespace ov::npuw::s11n; + + // Deserialize model name first + std::string model_name; + read(stream, model_name); + + // Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow + // to continue deserialization + ov::ParameterVector parameters; + ov::NodeVector results; + + read(stream, parameters); + read(stream, results); + + auto ov_model = std::make_shared(results, parameters, model_name); + + auto compiled = std::make_shared(ov_model, plugin, true); + + // Deserialize meta + compiled->m_name = model_name; + read(stream, compiled->m_inputs_to_submodels_inputs); + read(stream, compiled->m_outputs_to_submodels_outputs); + read(stream, compiled->m_param_subscribers); + read(stream, compiled->m_submodels_input_to_prev_output); + + // Deserialize device list + read(stream, compiled->m_dev_list); + + // Deserialize config + read(stream, compiled->m_cfg); + + // Deserialize compiled submodels + std::size_t subm_size = 0; + read(stream, 
subm_size); + compiled->m_compiled_submodels.resize(subm_size); + for (std::size_t i = 0; i < subm_size; ++i) { + std::size_t device_idx = 0; + read(stream, device_idx); + + bool has_compiled_model = false; + read(stream, has_compiled_model); + if (has_compiled_model) { + // Import model from the plugin + // FIXME: workaround for import/export model since import model seems to reset the file pointer + std::string buf; + read(stream, buf); + std::stringstream buffer(buf); + compiled->m_compiled_submodels[i].compiled_model = + plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]); + } + compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx; + compiled->m_compiled_submodels[i].deserialize(stream); + } + + compiled->implement_properties(); + compiled->report_io(); + + LOG_INFO("Done."); + + return compiled; +} + void ov::npuw::CompiledModel::finalize_weights_bank() { LOG_INFO("Finalizing weights bank..."); // Register lazy tensors @@ -541,6 +758,33 @@ void ov::npuw::CompiledModel::finalize_weights_bank() { LOG_INFO("Done."); } +void ov::npuw::CompiledModel::reconstruct_closure() { + for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) { + auto& comp_model_desc = m_compiled_submodels[idx]; + + // Skip optimized out and non-functions + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + continue; + } + + const auto real_idx = comp_model_desc.replaced_by.value_or(idx); + auto& func_desc = m_compiled_submodels[real_idx]; + + // At this point closure size should have already been deserialized + NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!"); + for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) { + if (comp_model_desc.closure[cidx]) { + // host-side closure - already set, do nothing + NPUW_ASSERT(!comp_model_desc.is_remote[cidx]); + continue; + } + NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1); + comp_model_desc.closure[cidx] = + m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it); + } + } +} + void ov::npuw::CompiledModel::detach_memory() { LOG_INFO("Detaching model & weight memory..."); LOG_BLOCK(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index f1de81f51d8c6a..b4faf9d417b003 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -40,6 +40,9 @@ class CompiledModel : public ov::npuw::ICompiledModel { CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::AnyMap& properties); + CompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const bool serialized); void export_model(std::ostream& model) const override; std::shared_ptr get_runtime_model() const override; @@ -56,6 +59,7 @@ class CompiledModel : public ov::npuw::ICompiledModel { friend class UnfoldInferRequest; friend class MemAccessSim; friend class FuncMemMgr; + friend class LLMCompiledModel; bool compile_for_success(std::size_t id); bool compile_for_device(std::size_t id, const std::string& device_to_try); @@ -66,6 +70,10 @@ class CompiledModel : public ov::npuw::ICompiledModel { void report_io() const; + void serialize(std::ostream& stream) const; + static std::shared_ptr 
deserialize(std::istream& stream, + const std::shared_ptr& plugin); + // This is used for removing too long output tensor names to fix some compilation issues // NB: These two methods has nothing to do with this particular class and should be // moved elsewhere @@ -83,6 +91,9 @@ class CompiledModel : public ov::npuw::ICompiledModel { void log_device_dist() const; void implement_properties(); + // For full deserialization flow with weights + void reconstruct_closure(); + void finalize_weights_bank(); void detach_memory(); std::string global_mem_device() const; @@ -141,7 +152,7 @@ class CompiledModel : public ov::npuw::ICompiledModel { // lazy_closure is used for weights sharing and allocating device memory. std::vector closure; std::vector lazy_closure; - std::vector closure_uid; + std::vector closure_uid; // Note: value -1 is considered uninitialized std::vector scales; std::vector zerops; std::vector is_remote; @@ -154,6 +165,9 @@ class CompiledModel : public ov::npuw::ICompiledModel { // Metrics execution_stats stat; + + void serialize(std::ostream& stream) const; + void deserialize(std::istream& stream); }; std::vector m_compiled_submodels; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 4eedaceb3235d1..f05cf0509e7531 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "llm_compiled_model.hpp" @@ -14,6 +14,7 @@ #include "openvino/pass/stateful_to_stateless.hpp" #include "openvino/pass/validate.hpp" #include "openvino/runtime/iasync_infer_request.hpp" +#include "serialization.hpp" namespace opp = ov::pass::pattern; class TransposeValueTensors : public ov::pass::MatcherPass { @@ -301,6 +302,11 @@ struct NPUDesc { }; std::optional extract_npu_descriptor(const std::shared_ptr& plugin) { + const auto all_devices = plugin->get_core()->get_available_devices(); + if (std::find(all_devices.begin(), all_devices.end(), "NPU") == all_devices.end()) { + return std::nullopt; + } + const std::string arch = plugin->get_property(ov::device::architecture.name(), ov::AnyMap{}).as(); const int64_t max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), ov::AnyMap{}).as(); @@ -418,6 +424,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m const std::shared_ptr& plugin, const ov::AnyMap& properties) : ov::npuw::ICompiledModel(model, plugin), + m_name(model->get_friendly_name()), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), m_cfg(m_options_desc) { LOG_DEBUG("Creating LLMCompiledModel"); @@ -478,12 +485,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m auto prefill_config = prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as(); - const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>(); - LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint))); // NB: GENERATE_HINT is only applicable for default generate config! 
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) { - OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!"); + OPENVINO_THROW("GENERATE_HINT only works with default generate config!"); } + const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>(); auto generate_config = generate_config_opt.value_or(get_default_generate_config(kvcache_model, npudesc, generate_hint)) .as(); @@ -503,11 +509,147 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m "model and its config, please check passed config."); implement_properties(); + LOG_DEBUG("Done"); } -void ov::npuw::LLMCompiledModel::export_model(std::ostream& model) const { - OPENVINO_NOT_IMPLEMENTED; +ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const bool serialized) + : ov::npuw::ICompiledModel(model, plugin), + m_name(model->get_friendly_name()), + m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), + m_cfg(m_options_desc) { + NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!"); + LOG_DEBUG("LLMCompiledModel is being deserialized, skipping the full constructor flow..."); +} + +void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { + LOG_INFO("Serializing LLMCompiledModel..."); + LOG_BLOCK(); + + using namespace ov::npuw::s11n; + + // Serialize magic number first + write(stream, NPUW_SERIALIZATION_INDICATOR); + + // Serialize general meta info + write(stream, OPENVINO_VERSION_MAJOR); + write(stream, OPENVINO_VERSION_MINOR); + write(stream, OPENVINO_VERSION_PATCH); + write(stream, std::string(NPUW_SERIALIZATION_VERSION)); + + // Serialize name + write(stream, m_name); + + // Serialize inputs and outputs + write(stream, inputs()); + write(stream, outputs()); + + // Serialize LLMCompiledModel-specific data + write(stream, m_kvcache_desc.max_prompt_size); + write(stream, m_kvcache_desc.total_size); + write(stream, m_kvcache_desc.num_stored_tokens); + write(stream, m_kvcache_desc.dim); + + // Serialize CompiledModels + m_kvcache_compiled->serialize(stream); + m_prefill_compiled->serialize(stream); + + // Serialize weights bank (if required) + const auto& kv_bank = m_kvcache_compiled->m_weights_bank; + const auto& p_bank = m_prefill_compiled->m_weights_bank; + NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!"); + // FIXME: support weightless flow + write(stream, kv_bank->get_name()); + kv_bank->serialize(stream); + + LOG_INFO("Done."); +} + +std::shared_ptr ov::npuw::LLMCompiledModel::deserialize( + std::istream& stream, + const std::shared_ptr& plugin) { + LOG_INFO("Deserializing LLMCompiledModel..."); + LOG_BLOCK(); + + using namespace ov::npuw::s11n; + + // Sanity check magic number + std::array serialization_indicator; + read(stream, serialization_indicator); + NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!"); + + // Deserialize general meta info + int vmajor, vminor, vpatch; + std::string s11n_version; + read(stream, vmajor); + read(stream, vminor); + read(stream, vpatch); + read(stream, s11n_version); + + if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH || + s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) { + OPENVINO_THROW("This blobs was serialized with 
different OV version!", + " Serialized by OV ", + vmajor, + '.', + vminor, + '.', + vpatch, + " Current OV version ", + OPENVINO_VERSION_MAJOR, + '.', + OPENVINO_VERSION_MINOR, + '.', + OPENVINO_VERSION_PATCH, + " NPUW serialized by version ", + s11n_version, + " NPUW current serialization version ", + NPUW_SERIALIZATION_VERSION); + } + + // Deserialize model name first + std::string model_name; + read(stream, model_name); + + // Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow + // to continue deserialization + ov::ParameterVector parameters; + ov::NodeVector results; + + read(stream, parameters); + read(stream, results); + + auto ov_model = std::make_shared(results, parameters, model_name); + + auto compiled = std::make_shared(ov_model, plugin, true); + + // Deserialize LLMCompiledModel-specific data + read(stream, compiled->m_kvcache_desc.max_prompt_size); + read(stream, compiled->m_kvcache_desc.total_size); + read(stream, compiled->m_kvcache_desc.num_stored_tokens); + read(stream, compiled->m_kvcache_desc.dim); + + // Deserialize CompiledModels + compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); + compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); + + // Deserialize weights bank (if required) + std::string bank_name; + read(stream, bank_name); + auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name); + + // FIXME: support weightless option + compiled->m_kvcache_compiled->m_weights_bank = bank; + compiled->m_prefill_compiled->m_weights_bank = bank; + + // After bank deserialization - reconstruct NPU closures from the bank + compiled->m_kvcache_compiled->reconstruct_closure(); + compiled->m_prefill_compiled->reconstruct_closure(); + + LOG_INFO("Done."); + return compiled; } std::shared_ptr ov::npuw::LLMCompiledModel::get_runtime_model() const { diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp index e37a47b2c77948..5003ccce40bb9d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -28,8 +28,15 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::AnyMap& properties); + LLMCompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const bool serialized); LLMCompiledModel() = delete; + void export_model(std::ostream& model) const override; + static std::shared_ptr deserialize(std::istream& stream, + const std::shared_ptr& plugin); + std::shared_ptr get_runtime_model() const override; void set_property(const ov::AnyMap& properties) override; @@ -42,6 +49,7 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel { std::shared_ptr create_sync_infer_request() const override; void implement_properties(); + std::string m_name; std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; ::intel_npu::Config m_cfg; GetPropertiesMap m_prop_to_opt; diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp new file mode 100644 index 00000000000000..5ff28204b4b6ca --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.cpp @@ 
-0,0 +1,158 @@ +// Copyright (C) 2024-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "serialization.hpp" + +#include "intel_npu/config/config.hpp" +#include "logging.hpp" +#include "openvino/op/constant.hpp" +#include "spatial.hpp" + +void ov::npuw::s11n::write(std::ostream& stream, const std::streampos& var) { + stream.write(reinterpret_cast(&var), sizeof var); +} + +void ov::npuw::s11n::write(std::ostream& stream, const std::string& var) { + auto var_size = var.size(); + stream.write(reinterpret_cast(&var_size), sizeof var_size); + stream.write(&var[0], var.size()); +} + +void ov::npuw::s11n::write(std::ostream& stream, const bool& var) { + stream.write(reinterpret_cast(&var), sizeof var); +} + +void ov::npuw::s11n::write(std::ostream& stream, const ov::npuw::compiled::Spatial& var) { + using ov::npuw::s11n::write; + + write(stream, var.params.size()); + for (const auto& p : var.params) { + write(stream, p.idx); + write(stream, p.dim); + } + write(stream, var.range); + write(stream, var.nway); + write(stream, var.out_dim); + write(stream, var.nway_iters); + write(stream, var.tail_size); +} + +void ov::npuw::s11n::write(std::ostream& stream, const ov::Tensor& var) { + using ov::npuw::s11n::write; + + auto type_str = var.get_element_type().to_string(); + write(stream, type_str); + write(stream, var.get_shape()); + write(stream, var.get_byte_size()); + + ov::Tensor tensor; + if (var.is_continuous()) { + tensor = var; + } else { + // Just copy strided tensor to a non-strided one + tensor = ov::Tensor(var.get_element_type(), var.get_shape()); + var.copy_to(tensor); + } + NPUW_ASSERT(tensor); + stream.write(reinterpret_cast(var.data()), var.get_byte_size()); +} + +void ov::npuw::s11n::write(std::ostream& stream, const ::intel_npu::Config& var) { + write(stream, var.toString()); +} + +void ov::npuw::s11n::write(std::ostream& stream, const ov::Output& var) { + write(stream, var.get_element_type().to_string()); + write(stream, var.get_partial_shape().to_string()); + write(stream, var.get_names()); +} + +void ov::npuw::s11n::read(std::istream& stream, std::streampos& var) { + stream.read(reinterpret_cast(&var), sizeof var); +} + +void ov::npuw::s11n::read(std::istream& stream, std::string& var) { + std::size_t var_size = 0; + stream.read(reinterpret_cast(&var_size), sizeof var_size); + var.resize(var_size); + stream.read(&var[0], var_size); +} + +void ov::npuw::s11n::read(std::istream& stream, bool& var) { + stream.read(reinterpret_cast(&var), sizeof var); +} + +void ov::npuw::s11n::read(std::istream& stream, ov::npuw::compiled::Spatial& var) { + using ov::npuw::s11n::read; + + ov::npuw::compiled::Spatial spat; + std::size_t params_size = 0; + read(stream, params_size); + for (std::size_t i = 0; i < params_size; ++i) { + ov::npuw::compiled::Spatial::Param p; + read(stream, p.idx); + read(stream, p.dim); + spat.params.push_back(p); + } + read(stream, spat.range); + read(stream, spat.nway); + read(stream, spat.out_dim); + read(stream, spat.nway_iters); + read(stream, spat.tail_size); +} + +void ov::npuw::s11n::read(std::istream& stream, ov::Tensor& var) { + std::string type_str; + read(stream, type_str); + ov::element::Type type(type_str); + + ov::Shape shape; + read(stream, shape); + + std::size_t byte_size = 0; + read(stream, byte_size); + + var = ov::Tensor(type, shape); + + stream.read(reinterpret_cast(var.data()), byte_size); +} + +void ov::npuw::s11n::read(std::istream& stream, ::intel_npu::Config& var) { + std::string str; + read(stream, str); + 
var.fromString(str); +} + +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + var = std::make_shared(ov::element::Type(elem_type_str), ov::PartialShape(part_shape_str)); + var->set_friendly_name(*names.begin()); // FIXME: any_name ? + var->output(0).get_tensor().set_names(names); +} + +void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr& var) { + std::string elem_type_str; + std::string part_shape_str; + std::unordered_set names; + read(stream, elem_type_str); + read(stream, part_shape_str); + read(stream, names); + // NOTE: the code below is taken from NPU plugin's create_dummy_model() + std::shared_ptr res = + std::make_shared(ov::element::Type(elem_type_str), std::vector{1}); + // FIXME: serialize names as well? + const std::shared_ptr& tensor_dummy = + std::make_shared(ov::element::Type(elem_type_str), + ov::PartialShape(part_shape_str), + names); + var = std::make_shared(res); + var->output(0).set_tensor_ptr(tensor_dummy); + var->set_friendly_name(*names.begin()); // any_name ? +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp new file mode 100644 index 00000000000000..77a6b3aa865254 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/serialization.hpp @@ -0,0 +1,207 @@ +// Copyright (C) 2024-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const constexpr std::array NPUW_SERIALIZATION_INDICATOR = + {char{0x13}, char{0x37}, char{0x6e}, char{0x70}, char{0x75}, char{0x77}}; + +const constexpr char* NPUW_SERIALIZATION_VERSION = "0.0"; + +// Forward declaration +namespace intel_npu { +class Config; +} // namespace intel_npu + +namespace ov { + +// Forward declaration +class Node; +class Tensor; +template +class Output; + +// Forward declaration +namespace op { +namespace v0 { +class Parameter; +} // namespace v0 +} // namespace op + +namespace npuw { + +// Forward declaration +namespace compiled { +struct Spatial; +} // namespace compiled + +namespace s11n { + +// Specific type overloads +void write(std::ostream& stream, const std::streampos& var); +void write(std::ostream& stream, const std::string& var); +void write(std::ostream& stream, const bool& var); +void write(std::ostream& stream, const ov::npuw::compiled::Spatial& var); +void write(std::ostream& stream, const ov::Tensor& var); +void write(std::ostream& stream, const ::intel_npu::Config& var); +void write(std::ostream& stream, const ov::Output& var); + +void read(std::istream& stream, std::streampos& var); +void read(std::istream& stream, std::string& var); +void read(std::istream& stream, bool& var); +void read(std::istream& stream, ov::npuw::compiled::Spatial& var); +void read(std::istream& stream, ov::Tensor& var); +void read(std::istream& stream, ::intel_npu::Config& var); +void read(std::istream& stream, std::shared_ptr& var); +void read(std::istream& stream, std::shared_ptr& var); + +// Forward declaration +template +void write(std::ostream& stream, const std::pair& var); +template +void write(std::ostream& stream, const std::vector& var); +template +void write(std::ostream& stream, const std::array& var); +template +void 
read(std::istream& stream, std::pair& var); +template +void read(std::istream& stream, std::vector& var); +template +void read(std::istream& stream, std::array& var); + +// Serialization +template ::value, bool> = true> +void write(std::ostream& stream, const T& var) { + stream.write(reinterpret_cast(&var), sizeof var); +} + +template +void write(std::ostream& stream, const std::pair& var) { + write(stream, var.first); + write(stream, var.second); +} + +template +void write(std::ostream& stream, const std::vector& var) { + write(stream, var.size()); + for (const auto& el : var) { + write(stream, el); + } +} + +template +void write(std::ostream& stream, const std::array& var) { + for (const auto& el : var) { + write(stream, el); + } +} + +template +void write(std::ostream& stream, const std::unordered_set& var) { + write(stream, var.size()); + for (const auto& el : var) { + write(stream, el); + } +} + +template +void write(std::ostream& stream, const std::map& var) { + write(stream, var.size()); + for (const auto& el : var) { + write(stream, el); + } +} + +template +void write(std::ostream& stream, const std::optional& var) { + if (var) { + write(stream, true); + write(stream, var.value()); + } else { + write(stream, false); + } +} + +// Deserialization +template ::value, bool> = true> +void read(std::istream& stream, T& var) { + stream.read(reinterpret_cast(&var), sizeof var); +} + +template +void read(std::istream& stream, std::pair& var) { + read(stream, var.first); + read(stream, var.second); +} + +template +void read(std::istream& stream, std::vector& var) { + var.clear(); + std::size_t var_size = 0; + stream.read(reinterpret_cast(&var_size), sizeof var_size); + var.reserve(var_size); + for (std::size_t i = 0; i < var_size; ++i) { + T elem; + read(stream, elem); + var.push_back(elem); + } +} + +template +void read(std::istream& stream, std::array& var) { + for (std::size_t i = 0; i < N; ++i) { + T elem; + read(stream, elem); + var[i] = elem; + } +} + +template +void read(std::istream& stream, std::unordered_set& var) { + var.clear(); + std::size_t var_size = 0; + stream.read(reinterpret_cast(&var_size), sizeof var_size); + for (std::size_t i = 0; i < var_size; ++i) { + T elem; + read(stream, elem); + var.insert(elem); + } +} + +template +void read(std::istream& stream, std::map& var) { + var.clear(); + std::size_t var_size = 0; + stream.read(reinterpret_cast(&var_size), sizeof var_size); + for (std::size_t i = 0; i < var_size; ++i) { + std::pair elem; + read(stream, elem); + var[elem.first] = elem.second; + } +} + +template +void read(std::istream& stream, std::optional& var) { + bool has_value = false; + read(stream, has_value); + if (has_value) { + T val; + read(stream, val); + var = val; + } +} + +} // namespace s11n +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp index fce2f63db4e807..2dc7eeaac3c538 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/spatial.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -48,6 +48,7 @@ struct Spatial { std::size_t nway_iters = 0u; std::size_t tail_size = 0u; + Spatial() = default; Spatial(const function::Spatial& s, const std::shared_ptr& m) : range(s._range), nway(s._slice), diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 4cc804f7b7e399..21b575fe54a53b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,6 +6,7 @@ #include "logging.hpp" #include "openvino/core/parallel.hpp" +#include "serialization.hpp" #include "util.hpp" using ov::npuw::weights::Bank; @@ -84,6 +85,7 @@ void Bank::evaluate_and_allocate() { std::unique_lock storage_guard(device_bank.mutex); vec.reserve(device_bank.storage.size()); + // FIXME: only add non-allocated tensors here for (const auto& el : device_bank.storage) { vec.push_back(el.second.lt); } @@ -155,6 +157,109 @@ bool Bank::is_remote(int64_t uid) const { return false; } +void Bank::serialize(std::ostream& stream) const { + using namespace ov::npuw::s11n; + + LOG_INFO("Serializing weights bank..."); + LOG_BLOCK(); + + std::lock_guard guard(m_mutex); + + write(stream, m_device_banks.size()); + + for (const auto& elem : m_device_banks) { + const auto& device = elem.first; + const auto& device_bank = elem.second; + std::lock_guard dev_guard(device_bank.mutex); + write(stream, device); + write(stream, device_bank.storage.size()); + for (const auto& t_pair : device_bank.storage) { + write(stream, t_pair.first); + write(stream, t_pair.second.tensor); + } + } + + LOG_INFO("DONE."); +} + +std::shared_ptr Bank::deserialize(std::istream& stream, + const std::shared_ptr& core, + const std::string& name) { + using namespace ov::npuw::s11n; + + LOG_INFO("Deserializing weights bank..."); + LOG_BLOCK(); + + auto bank = ov::npuw::weights::bank(name, core, ""); + + std::size_t bank_size = 0; + read(stream, bank_size); + + for (std::size_t i = 0; i < bank_size; ++i) { + std::string device; + read(stream, device); + std::size_t storage_size = 0; + read(stream, storage_size); + for (std::size_t j = 0; j < storage_size; ++j) { + int64_t uid = -1; + read(stream, uid); + bank->read_and_add_tensor(stream, uid, device); + } + } + + LOG_INFO("DONE."); + + return bank; +} + +void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device) { + using namespace ov::npuw::s11n; + + // This method is supposed to be used only during deserialization + std::lock_guard guard(m_mutex); + + auto& device_bank = m_device_banks[device]; + std::lock_guard dev_guard(device_bank.mutex); + + auto iter_device = device_bank.storage.find(uid); + + if (iter_device != device_bank.storage.end()) { + // Already allocated + return; + } + + if (device == "CPU") { + // Just read deserialized tensor into the bank + read(stream, device_bank.storage[uid].tensor); + return; + } + + // Need to allocate on device and copy deserialized tensor to that memory + ov::SoPtr remote_tensor; + ov::Tensor allocated_tensor; + + // FIXME: reading not via a dedicated function + std::string type_str; + read(stream, type_str); + ov::element::Type type(type_str); + + ov::Shape shape; + read(stream, shape); + + std::size_t byte_size = 0; + read(stream, byte_size); + + auto remote_ctx = m_core->get_default_context(device)._ptr; + remote_tensor = remote_ctx->create_host_tensor(type, shape); + allocated_tensor = ov::make_tensor(remote_tensor); + device_bank.storage[uid] = {LazyTensor(), allocated_tensor}; + stream.read(reinterpret_cast(allocated_tensor.data()), byte_size); +} + +std::string Bank::get_name() const { + return m_bank_name; +} + 
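The Bank serialization above is built entirely from the `ov::npuw::s11n` primitives introduced in serialization.hpp: containers are written as an element count followed by each element, strings as a length plus raw characters, and tensors as an element-type string, shape, byte size and raw bytes. The sketch below is an illustrative round trip through the generic helpers only (it is not part of the patch, assumes the new serialization.hpp is on the include path, and uses made-up data).

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <optional>
    #include <sstream>
    #include <string>
    #include <vector>

    #include "serialization.hpp"  // header introduced in this patch

    int main() {
        using namespace ov::npuw::s11n;

        std::stringstream blob;

        // Each write() emits a length prefix (for containers/strings) plus the payload.
        std::map<std::string, std::vector<int64_t>> uids = {{"prefill", {0, 1, 2}},
                                                            {"kvcache", {3, 4}}};
        std::optional<std::string> device = "NPU";
        write(blob, uids);
        write(blob, device);

        // Reads must happen in exactly the same order with matching types.
        std::map<std::string, std::vector<int64_t>> uids_back;
        std::optional<std::string> device_back;
        read(blob, uids_back);
        read(blob, device_back);

        std::cout << uids_back["prefill"].size() << " uids, device: "
                  << (device_back ? *device_back : "none") << '\n';
    }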
std::shared_ptr BankManager::getBank(const std::string& bank_name, const std::shared_ptr& core, const std::string& alloc_device) { @@ -162,7 +267,7 @@ std::shared_ptr BankManager::getBank(const std::string& bank_name, auto iter = m_bank_map.find(bank_name); if (iter == m_bank_map.end() || iter->second.expired()) { - auto bank = std::make_shared(core, alloc_device); + auto bank = std::make_shared(core, alloc_device, bank_name); m_bank_map[bank_name] = bank; return bank; } @@ -174,7 +279,7 @@ std::shared_ptr ov::npuw::weights::bank(const std::string& bank_name, const std::string& alloc_device) { if (bank_name.empty()) { // Don't share this bank in manager - return std::make_shared(core, alloc_device); + return std::make_shared(core, alloc_device, bank_name); } auto& instance = BankManager::getInstance(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp index 0d1d84b490c5e2..fd9f0e39841b7a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -18,13 +18,19 @@ namespace ov { namespace npuw { +// Forward declaration +class LLMCompiledModel; +class CompiledModel; namespace weights { class Bank { public: - explicit Bank(const std::shared_ptr& core, const std::string& alloc_device) + explicit Bank(const std::shared_ptr& core, + const std::string& alloc_device, + const std::string& bank_name) : m_core(core), - m_alloc_device(alloc_device) {} + m_alloc_device(alloc_device), + m_bank_name(bank_name) {} // Register LazyTensor in a bank if it's not there. Returns LazyTensor's unique id int64_t registerLT(const LazyTensor& tensor, const std::string& device); @@ -37,7 +43,12 @@ class Bank { bool is_remote(int64_t uid) const; + std::string get_name() const; + private: + friend class ov::npuw::LLMCompiledModel; + friend class ov::npuw::CompiledModel; + struct StoredTensor { LazyTensor lt; ov::Tensor tensor; @@ -52,10 +63,18 @@ class Bank { ov::Tensor eval_and_alloc(const LazyTensor& tensor, DeviceBank& dbank, const std::string& device); + void serialize(std::ostream& stream) const; + static std::shared_ptr deserialize(std::istream& stream, + const std::shared_ptr& core, + const std::string& name); + // Used during deserialization + void read_and_add_tensor(std::istream& stream, int64_t uid, const std::string& device); + mutable std::mutex m_mutex; std::shared_ptr m_core = nullptr; std::string m_alloc_device; int64_t uid_count = 0; + std::string m_bank_name; }; std::shared_ptr bank(const std::string& bank_name, diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index fa641dfdcd9641..c5f5ba436785cd 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 Intel Corporation +// Copyright (C) 2018-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,6 +7,10 @@ #include #include "compiled_model.hpp" +#include "npuw/compiled_model.hpp" +#include "npuw/llm_compiled_model.hpp" +#include "npuw/serialization.hpp" +#include "driver_compiler_adapter.hpp" #include "compiler_adapter_factory.hpp" #include "intel_npu/common/device_helpers.hpp" #include "intel_npu/common/icompiler_adapter.hpp" @@ -752,7 +756,25 @@ std::shared_ptr 
Plugin::import_model(std::istream& stream, c OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model"); OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs"); - const std::map propertiesMap = any_copy(properties); + // If was exported via NPUW + auto stream_start_pos = stream.tellg(); + std::array serialization_indicator; + ov::npuw::s11n::read(stream, serialization_indicator); + if (serialization_indicator == NPUW_SERIALIZATION_INDICATOR) { + stream.seekg(stream_start_pos); + return ov::npuw::LLMCompiledModel::deserialize(stream, shared_from_this()); + } + stream.seekg(stream_start_pos); + + // Drop NPUW properties if there are any + ov::AnyMap npu_plugin_properties; + for (auto it = properties.begin(); it != properties.end(); ++it) { + if (it->first.find("NPUW") == it->first.npos) { + npu_plugin_properties.insert(*it); + } + } + const std::map propertiesMap = any_copy(npu_plugin_properties); + auto localConfig = merge_configs(_globalConfig, propertiesMap, OptionMode::RunTime); _logger.setLevel(localConfig.get()); const auto platform = _backends->getCompilationPlatform(localConfig.get(), localConfig.get()); diff --git a/src/plugins/intel_npu/src/utils/src/zero/CMakeLists.txt b/src/plugins/intel_npu/src/utils/src/zero/CMakeLists.txt index 5faf2c9f01c4a0..f54a78b2b6eebd 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/CMakeLists.txt +++ b/src/plugins/intel_npu/src/utils/src/zero/CMakeLists.txt @@ -48,8 +48,4 @@ if(TARGET ze_loader) ov_install_static_lib(ze_loader ${NPU_PLUGIN_COMPONENT}) add_dependencies(${TARGET_NAME} ze_loader) - - # TODO: remove once https://github.com/oneapi-src/level-zero/pull/243 is merged and made a part of official release - ov_developer_package_export_targets(TARGET utils) - ov_install_static_lib(utils ${NPU_PLUGIN_COMPONENT}) endif() diff --git a/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py b/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py index 72051783fa7422..36d1c0e863635f 100644 --- a/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py +++ b/tests/model_hub_tests/transformation_tests/generate_ref_diffs.py @@ -56,7 +56,7 @@ def get_models_list_type(file_name: str, cls: Union[Type[OVModelForCausalLM], Ty models.append((model_name, model_link, None, None, cls)) elif len(line_items) == 4: model_name, model_link, mark, reason = line_items - models.append((model_name, model_link, mark, reason)) + models.append((model_name, model_link, mark, reason, cls)) elif len(line_items) > 4: model_name, model_link, mark, reason, *other = line_items if not mark: @@ -106,7 +106,7 @@ def main(): # wrapping in try/catch block to continue printing models even if one has failed try: - paged_attention_transformation(ov_model, use_cache_eviction, use_cache_eviction) + paged_attention_transformation(ov_model, use_cache_eviction, use_cache_eviction, use_cache_eviction) except: print(f"Couldn't run SDPAToPA transformation on {model_id} and generate diffs.") continue @@ -117,10 +117,12 @@ def main(): after_map[op.get_type_name()] = after_map.get(op.get_type_name(), 0) + 1 print(f'\t"{model_id}" : {{', file=file) - for op in set(after_map.keys()) | set(before_map.keys()): + for op in sorted(set(after_map.keys()) | set(before_map.keys())): print(f'\t\t"{op}" : {after_map.get(op, 0) - before_map.get(op, 0)},', file=file) print('\t},', file=file) print('}', file=file) + print(f"output written to {OUTPUT_FILE}") + if __name__ == "__main__": - main() \ No newline at end of file + 
main() diff --git a/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py b/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py index 43ef49d9b5a226..aac6c3765aca3b 100644 --- a/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py +++ b/tests/model_hub_tests/transformation_tests/sdpa2pa_ref_diff.py @@ -5,666 +5,665 @@ ref_diff_map = { "hf-internal-testing/tiny-random-LlamaForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-CohereForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-GPTJForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 13, "ReadValue" : -10, - "Assign" : -10, - }, - "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { - "PagedAttentionExtension" : 4, - "ScaledDotProductAttention" : -4, - "Parameter" : 11, - "ReadValue" : -8, - "Assign" : -8, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-GPTNeoXForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 13, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-MistralForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-CodeGenForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 13, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/Mixtral-tiny" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM" : { + "Assign" : -5, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 13, "ReadValue" : -5, - "Assign" : -5, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-Starcoder2ForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-BloomForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 14, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-gpt2" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 13, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-BlenderbotForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 8, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-PegasusForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 8, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : 
-2, }, "hf-internal-testing/tiny-random-PhiForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-MptForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 14, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-StableLmForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-PersimmonForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-FalconForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "hf-tiny-model-private/tiny-random-OPTForCausalLM" : { + "Assign" : -10, "PagedAttentionExtension" : 5, - "ScaledDotProductAttention" : -5, "Parameter" : 14, "ReadValue" : -10, - "Assign" : -10, + "ScaledDotProductAttention" : -5, }, "katuni4ka/tiny-random-xverse" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-baichuan2-13b" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-qwen" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-aquilachat" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-aquila2" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-qwen1.5-moe" : { + "Assign" : -8, "PagedAttentionExtension" : 4, - "ScaledDotProductAttention" : -4, "Parameter" : 11, "ReadValue" : -8, - "Assign" : -8, + "ScaledDotProductAttention" : -4, }, "katuni4ka/tiny-random-codegen2" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-olmo-hf" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-baichuan2" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-jais" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-internlm" : 
{ + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-internlm2" : { + "Assign" : -8, "PagedAttentionExtension" : 4, - "ScaledDotProductAttention" : -4, "Parameter" : 11, "ReadValue" : -8, + "ScaledDotProductAttention" : -4, + }, + "katuni4ka/tiny-random-minicpm" : { "Assign" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 11, + "ReadValue" : -8, + "ScaledDotProductAttention" : -4, }, - "katuni4ka/tiny-random-minicpm" : { - "ReadValue" : -8, - "ScaledDotProductAttention" : -4, - "Assign" : -8, - "PagedAttentionExtension" : 4, - "Parameter" : 11, - }, "katuni4ka/tiny-random-falcon-40b" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-dbrx" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "fxmarty/tiny-random-GemmaForCausalLM" : { + "Assign" : -2, "PagedAttentionExtension" : 1, - "ScaledDotProductAttention" : -1, "Parameter" : 5, "ReadValue" : -2, - "Assign" : -2, + "ScaledDotProductAttention" : -1, }, "fxmarty/tiny-dummy-qwen2" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "fxmarty/really-tiny-falcon-testing" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "Xenova/tiny-random-Phi3ForCausalLM" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "facebook/opt-125m" : { + "Assign" : -24, "PagedAttentionExtension" : 12, - "ScaledDotProductAttention" : -12, "Parameter" : 28, "ReadValue" : -24, - "Assign" : -24, + "ScaledDotProductAttention" : -12, }, "facebook/opt-350m" : { + "Assign" : -48, "PagedAttentionExtension" : 24, - "ScaledDotProductAttention" : -24, "Parameter" : 52, "ReadValue" : -48, - "Assign" : -48, + "ScaledDotProductAttention" : -24, }, "katuni4ka/tiny-random-chatglm2" : { + "Assign" : -4, "PagedAttentionExtension" : 2, - "ScaledDotProductAttention" : -2, "Parameter" : 7, "ReadValue" : -4, - "Assign" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-glm4" : { + "Assign" : -12, "PagedAttentionExtension" : 6, - "ScaledDotProductAttention" : -6, "Parameter" : 15, "ReadValue" : -12, - "Assign" : -12, + "ScaledDotProductAttention" : -6, }, "katuni4ka/tiny-random-llava-next" : { + "Assign" : -4, "PagedAttentionExtension" : 2, "Parameter" : 7, "ReadValue" : -4, "ScaledDotProductAttention" : -2, - "Assign" : -4, }, "katuni4ka/tiny-random-minicpmv-2_6" : { + "Assign" : -4, "PagedAttentionExtension" : 2, "Parameter" : 7, "ReadValue" : -4, "ScaledDotProductAttention" : -2, - "Assign" : -4, }, "katuni4ka/tiny-random-llava" : { "Assign" : -4, + "PagedAttentionExtension" : 2, "Parameter" : 7, "ReadValue" : -4, "ScaledDotProductAttention" : -2, - "PagedAttentionExtension" : 2, }, - # "katuni4ka/tiny-random-nanollava" : { # "Assign" : -4, + # "PagedAttentionExtension" : 2, # "Parameter" : 7, # "ReadValue" : -4, # "ScaledDotProductAttention" : -2, - # 
"PagedAttentionExtension" : 2, # }, + "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { + "ScaledDotProductAttention" : -4, + "ReadValue" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 11, + "Assign" : -8, + } } ref_diff_map_cache_eviction = { "hf-internal-testing/tiny-random-LlamaForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-CohereForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-GPTJForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 17, "Assign" : -10, - }, - "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { - "ScaledDotProductAttention" : -4, - "ReadValue" : -8, - "PagedAttentionExtension" : 4, - "Parameter" : 14, - "Assign" : -8, + "PagedAttentionExtension" : 5, + "Parameter" : 28, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-GPTNeoXForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 17, "Assign" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 28, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-MistralForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-CodeGenForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 17, "Assign" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 28, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/Mixtral-tiny" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -5, - "PagedAttentionExtension" : 5, - "Parameter" : 17, "Assign" : -5, + "PagedAttentionExtension" : 5, + "Parameter" : 28, + "ReadValue" : -5, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-Starcoder2ForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-BloomForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 18, "Assign" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 29, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-gpt2" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 17, "Assign" : -10, + 
"PagedAttentionExtension" : 5, + "Parameter" : 28, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-BlenderbotForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 9, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 14, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-PegasusForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 9, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 14, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-PhiForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-MptForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 18, "Assign" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 29, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "hf-internal-testing/tiny-random-StableLmForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-PersimmonForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-internal-testing/tiny-random-FalconForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "hf-tiny-model-private/tiny-random-OPTForCausalLM" : { - "ScaledDotProductAttention" : -5, - "ReadValue" : -10, - "PagedAttentionExtension" : 5, - "Parameter" : 18, "Assign" : -10, + "PagedAttentionExtension" : 5, + "Parameter" : 29, + "ReadValue" : -10, + "ScaledDotProductAttention" : -5, }, "katuni4ka/tiny-random-xverse" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-baichuan2-13b" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-qwen" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-aquilachat" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, 
"katuni4ka/tiny-random-aquila2" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-qwen1.5-moe" : { - "ScaledDotProductAttention" : -4, - "ReadValue" : -8, - "PagedAttentionExtension" : 4, - "Parameter" : 14, "Assign" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 23, + "ReadValue" : -8, + "ScaledDotProductAttention" : -4, }, "katuni4ka/tiny-random-codegen2" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-olmo-hf" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-baichuan2" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-jais" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-internlm" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-internlm2" : { - "ScaledDotProductAttention" : -4, - "ReadValue" : -8, - "PagedAttentionExtension" : 4, - "Parameter" : 14, "Assign" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 23, + "ReadValue" : -8, + "ScaledDotProductAttention" : -4, }, "katuni4ka/tiny-random-minicpm" : { - "ScaledDotProductAttention" : -4, - "Parameter" : 14, + "Assign" : -8, "PagedAttentionExtension" : 4, + "Parameter" : 23, "ReadValue" : -8, - "Assign" : -8, + "ScaledDotProductAttention" : -4, }, "katuni4ka/tiny-random-falcon-40b" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-dbrx" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "fxmarty/tiny-random-GemmaForCausalLM" : { - "ScaledDotProductAttention" : -1, - "ReadValue" : -2, - "PagedAttentionExtension" : 1, - "Parameter" : 5, "Assign" : -2, + "PagedAttentionExtension" : 1, + "Parameter" : 8, + "ReadValue" : -2, + "ScaledDotProductAttention" : -1, }, "fxmarty/tiny-dummy-qwen2" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "fxmarty/really-tiny-falcon-testing" : { - 
"ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "Xenova/tiny-random-Phi3ForCausalLM" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "facebook/opt-125m" : { - "ScaledDotProductAttention" : -12, - "ReadValue" : -24, - "PagedAttentionExtension" : 12, - "Parameter" : 39, "Assign" : -24, + "PagedAttentionExtension" : 12, + "Parameter" : 64, + "ReadValue" : -24, + "ScaledDotProductAttention" : -12, }, "facebook/opt-350m" : { - "ScaledDotProductAttention" : -24, - "ReadValue" : -48, - "PagedAttentionExtension" : 24, - "Parameter" : 75, "Assign" : -48, + "PagedAttentionExtension" : 24, + "Parameter" : 124, + "ReadValue" : -48, + "ScaledDotProductAttention" : -24, }, "katuni4ka/tiny-random-chatglm2" : { - "ScaledDotProductAttention" : -2, - "ReadValue" : -4, - "PagedAttentionExtension" : 2, - "Parameter" : 8, "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, + "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-glm4" : { - "ScaledDotProductAttention" : -6, - "ReadValue" : -12, - "PagedAttentionExtension" : 6, - "Parameter" : 20, "Assign" : -12, + "PagedAttentionExtension" : 6, + "Parameter" : 33, + "ReadValue" : -12, + "ScaledDotProductAttention" : -6, }, "katuni4ka/tiny-random-llava-next" : { - "Parameter" : 8, "Assign" : -4, - "ReadValue" : -4, "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-minicpmv-2_6" : { - "Parameter" : 8, "Assign" : -4, - "ReadValue" : -4, "PagedAttentionExtension" : 2, + "Parameter" : 13, + "ReadValue" : -4, "ScaledDotProductAttention" : -2, }, "katuni4ka/tiny-random-llava" : { + "Assign" : -4, + "PagedAttentionExtension" : 2, + "Parameter" : 13, "ReadValue" : -4, - "Parameter" : 8, "ScaledDotProductAttention" : -2, - "PagedAttentionExtension" : 2, - "Assign" : -4, }, - - # "katuni4ka/tiny-random-nanollava" : { + # "katuni4ka/tiny-random-nanollava" : { + # "Assign" : -4, + # "PagedAttentionExtension" : 2, + # "Parameter" : 13, # "ReadValue" : -4, - # "Parameter" : 8, # "ScaledDotProductAttention" : -2, - # "PagedAttentionExtension" : 2, - # "Assign" : -4, # }, + + "hf-internal-testing/tiny-random-GPTNeoForCausalLM" : { + "ScaledDotProductAttention" : -4, + "ReadValue" : -8, + "PagedAttentionExtension" : 4, + "Parameter" : 23, + "Assign" : -8, + } } diff --git a/tests/model_hub_tests/transformation_tests/test_pa_transformation.py b/tests/model_hub_tests/transformation_tests/test_pa_transformation.py index 2bc6726dff030f..fc6e8c1e65903f 100644 --- a/tests/model_hub_tests/transformation_tests/test_pa_transformation.py +++ b/tests/model_hub_tests/transformation_tests/test_pa_transformation.py @@ -17,13 +17,14 @@ def compare_diffs(ov_model: ov.Model, model_id: str, use_block_indices_inputs: bool, - use_score_outputs: bool): + use_score_outputs: bool, + allow_cache_rotation: bool): before_map = {} for op in ov_model.get_ordered_ops(): if op.get_type_name() in nodes_to_compare: before_map[op.get_type_name()] = before_map.get(op.get_type_name(), 0) + 1 - paged_attention_transformation(ov_model, use_block_indices_inputs, use_score_outputs) + paged_attention_transformation(ov_model, 
use_block_indices_inputs, use_score_outputs, allow_cache_rotation) after_map = {} for op in ov_model.get_ordered_ops(): @@ -36,7 +37,7 @@ def compare_diffs(ov_model: ov.Model, for op in set(after_map.keys()) | set(before_map.keys()): resulting_map[op] = after_map.get(op, 0) - before_map.get(op, 0) - use_cache_eviction = use_block_indices_inputs and use_score_outputs + use_cache_eviction = use_block_indices_inputs and use_score_outputs and allow_cache_rotation reference_map = ref_diff_map_cache_eviction[model_id] if use_cache_eviction else ref_diff_map[model_id] assert reference_map == resulting_map @@ -51,32 +52,47 @@ def compare_diffs(ov_model: ov.Model, assert shape[-1].is_static, f"Dimension {len(shape) - 1} of input '{name}' in '{model_id}' is not static: {shape}" assert shape[-2].is_static, f"Dimension {len(shape) - 2} of input '{name}' in '{model_id}' is not static: {shape}" - # Test for block_indices inputs and scores outputs to appear in the model + interesting_input_patterns = {} + interesting_output_patterns = {} + + + # Test for block_indices inputs and scores outputs to appear in the model if (use_block_indices_inputs): - block_indices_pattern = r'block_indices\.[0-9]+' - block_indices_counter = 0 - - model_inputs = ov_model.inputs - for input in model_inputs: - for name in list(input.get_names()): - if re.search(block_indices_pattern, name): - block_indices_counter += 1 - - assert block_indices_counter == resulting_map["PagedAttentionExtension"], \ - f"The number of block_indices inputs doesn't correspond to the expected value. Expected {resulting_map['PagedAttentionExtension']}, received {block_indices_counter}" - + interesting_input_patterns["block_indices"] = r'^block_indices\.[0-9]+' + if (use_score_outputs): - score_pattern = r'scores\.[0-9]+' - score_outputs_counter = 0 + interesting_output_patterns["scores"] = r'^scores\.[0-9]+' + + if (allow_cache_rotation): + interesting_input_patterns["rotated_block_indices"] = r'^rotated_block_indices\.[0-9]+'; + interesting_input_patterns["rotation_deltas"] = r'^rotation_deltas\.[0-9]+'; + interesting_input_patterns["rotation_trig_lut"] = r'rotation_trig_lut'; + + input_counters = {k: 0 for k in interesting_input_patterns} + output_counters = {k: 0 for k in interesting_output_patterns} + + for pattern_dict, counter_dict, io_set in zip([interesting_input_patterns, interesting_output_patterns], + [input_counters, output_counters], + [ov_model.inputs, ov_model.outputs]): + for input_id in counter_dict: + pattern = pattern_dict[input_id] + for model_io in io_set: + for name in list(model_io.get_names()): + if re.search(pattern, name): + counter_dict[input_id] += 1 + + if allow_cache_rotation: + assert input_counters["rotation_trig_lut"] == 1 + input_counters.pop("rotation_trig_lut") + + for input_id, count in input_counters.items(): + assert count == resulting_map["PagedAttentionExtension"], \ + f"The number of {input_id} inputs doesn't correspond to the expected value. Expected {resulting_map['PagedAttentionExtension']}, received {count}" - model_outputs = ov_model.outputs - for output in model_outputs: - for name in list(output.get_names()): - if re.search(score_pattern, name): - score_outputs_counter += 1 + for output_id, count in output_counters.items(): + assert count == resulting_map["PagedAttentionExtension"], \ + f"The number of {output_id} outputs doesn't correspond to the expected value. 
Expected {resulting_map['PagedAttentionExtension']}, received {count}" - assert block_indices_counter == resulting_map["PagedAttentionExtension"], \ - f"The number of scores outputs doesn't correspond to the expected value. Expected {resulting_map['PagedAttentionExtension']}, received {block_indices_counter}" @retry(3, exceptions=(OSError,), delay=1) def run_pa(tmp_path, @@ -84,11 +100,12 @@ def run_pa(tmp_path, model_link, cls: Union[Type[OVModelForCausalLM], Type[OVModelForVisualCausalLM]], use_block_indices_inputs, - use_score_outputs): + use_score_outputs, + allow_cache_rotation): model = cls.from_pretrained(model_id, export=True, trust_remote_code=True) ov_model = model.model if cls is OVModelForCausalLM else model.lm_model - compare_diffs(ov_model, model_id, use_block_indices_inputs, use_score_outputs) + compare_diffs(ov_model, model_id, use_block_indices_inputs, use_score_outputs, allow_cache_rotation) @pytest.mark.precommit @pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-models-precommit"))) @@ -99,7 +116,7 @@ def test_pa_precommit(tmp_path, model_name, model_link, mark, reason, ie_device) pytest.skip(reason) elif mark == 'xfail': pytest.xfail(reason) - run_pa(tmp_path, model_name, model_link, OVModelForCausalLM, False, False) + run_pa(tmp_path, model_name, model_link, OVModelForCausalLM, False, False, False) @pytest.mark.precommit @pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-models-precommit"))) @@ -110,7 +127,7 @@ def test_pa_precommit_use_cache_eviction(tmp_path, model_name, model_link, mark, pytest.skip(reason) elif mark == 'xfail': pytest.xfail(reason) - run_pa(tmp_path, model_name, model_link, OVModelForCausalLM, True, True) + run_pa(tmp_path, model_name, model_link, OVModelForCausalLM, True, True, True) @pytest.mark.precommit @pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-vl-models-precommit"))) @@ -121,7 +138,7 @@ def test_pa_vlm(tmp_path, model_name, model_link, mark, reason, ie_device): pytest.skip(reason) elif mark == 'xfail': pytest.xfail(reason) - run_pa(tmp_path, model_name, model_link, OVModelForVisualCausalLM, False, False) + run_pa(tmp_path, model_name, model_link, OVModelForVisualCausalLM, False, False, False) @pytest.mark.precommit @pytest.mark.parametrize("model_name, model_link, mark, reason", utils.get_models_list(os.path.join(os.path.dirname(__file__), "models", "hf-tiny-random-vl-models-precommit"))) @@ -132,4 +149,4 @@ def test_pa_vlm_use_cache_eviction(tmp_path, model_name, model_link, mark, reaso pytest.skip(reason) elif mark == 'xfail': pytest.xfail(reason) - run_pa(tmp_path, model_name, model_link, OVModelForVisualCausalLM, True, True) \ No newline at end of file + run_pa(tmp_path, model_name, model_link, OVModelForVisualCausalLM, True, True, True) diff --git a/thirdparty/level_zero/level-zero b/thirdparty/level_zero/level-zero index 91e28669b464c3..0d1f19d2a8d23e 160000 --- a/thirdparty/level_zero/level-zero +++ b/thirdparty/level_zero/level-zero @@ -1 +1 @@ -Subproject commit 91e28669b464c32eced6b0afc84bd08ce77d17c6 +Subproject commit 0d1f19d2a8d23e74465a18168cb00af4c10d0d9c diff --git a/tools/ovc/openvino/tools/ovc/cli_parser.py b/tools/ovc/openvino/tools/ovc/cli_parser.py index 48a396c508bc49..6163658f00e846 100644 --- 
a/tools/ovc/openvino/tools/ovc/cli_parser.py +++ b/tools/ovc/openvino/tools/ovc/cli_parser.py @@ -10,7 +10,7 @@ from typing import List, Union import openvino -from openvino.runtime import PartialShape, Dimension, Type # pylint: disable=no-name-in-module,import-error +from openvino import PartialShape, Dimension, Type # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.error import Error from openvino.tools.ovc.help import get_convert_model_help_specifics from openvino.tools.ovc.moc_frontend.shape_utils import to_partial_shape, is_shape_type diff --git a/tools/ovc/openvino/tools/ovc/convert.py b/tools/ovc/openvino/tools/ovc/convert.py index 1bd61ff567e5d0..9aa6d36359898d 100644 --- a/tools/ovc/openvino/tools/ovc/convert.py +++ b/tools/ovc/openvino/tools/ovc/convert.py @@ -4,7 +4,7 @@ import pathlib from typing import Any -from openvino.runtime import Model # pylint: disable=no-name-in-module,import-error +from openvino import Model # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.cli_parser import get_all_cli_parser from openvino.tools.ovc.convert_impl import _convert from openvino.tools.ovc.logger import get_logger_state, restore_logger_state @@ -97,7 +97,7 @@ def convert_model( are reused for weights in the converted model. Returns: - openvino.runtime.Model + openvino.Model """ params = locals() logger_state = get_logger_state() diff --git a/tools/ovc/openvino/tools/ovc/convert_impl.py b/tools/ovc/openvino/tools/ovc/convert_impl.py index c690df9d4a00f6..0c01145533e57d 100644 --- a/tools/ovc/openvino/tools/ovc/convert_impl.py +++ b/tools/ovc/openvino/tools/ovc/convert_impl.py @@ -44,8 +44,8 @@ # pylint: disable=no-name-in-module,import-error from openvino.frontend import FrontEndManager, OpConversionFailure, TelemetryExtension -from openvino.runtime import get_version as get_rt_version -from openvino.runtime import PartialShape +from openvino import get_version as get_rt_version +from openvino import PartialShape try: from openvino.frontend.tensorflow.utils import create_tf_graph_iterator, type_supported_by_tf_fe, \ diff --git a/tools/ovc/openvino/tools/ovc/main.py b/tools/ovc/openvino/tools/ovc/main.py index 1118999dcd5a7b..762ecb258f0c11 100644 --- a/tools/ovc/openvino/tools/ovc/main.py +++ b/tools/ovc/openvino/tools/ovc/main.py @@ -17,7 +17,7 @@ import_openvino_tokenizers() # pylint: disable=no-name-in-module,import-error -from openvino.runtime import save_model +from openvino import save_model def main(): diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/analysis.py b/tools/ovc/openvino/tools/ovc/moc_frontend/analysis.py index 827a8e4338af96..c4b40f3c062b8a 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/analysis.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/analysis.py @@ -3,8 +3,8 @@ import json -from openvino.runtime import PartialShape, Model, Type # pylint: disable=no-name-in-module,import-error -from openvino.runtime.utils.types import get_dtype # pylint: disable=no-name-in-module,import-error +from openvino import PartialShape, Model, Type # pylint: disable=no-name-in-module,import-error +from openvino.utils.types import get_dtype # pylint: disable=no-name-in-module,import-error def json_model_analysis_dump(framework_model: Model): diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/extractor.py b/tools/ovc/openvino/tools/ovc/moc_frontend/extractor.py index 3693a54230f9c2..344d5f70e81d65 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/extractor.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/extractor.py 
@@ -368,7 +368,7 @@ def convert_params_lists_to_dicts(input_model, """ Convert lists of unnamed params to dicts using input names from input_model. - :param input_model: openvino.runtime.InputModel + :param input_model: openvino.InputModel :param input_user_shapes: list of input shapes or dictionary where key is input name, value is input shape from user. :param input_user_data_types: list of input types or dictionary where key is input name, value is input type from user. @@ -377,7 +377,7 @@ def convert_params_lists_to_dicts(input_model, input_user_data_types_dict - dictionary where key is input name, value is type from user; freeze_placeholder - dictionary where key is input name, value is input value from user; """ - from openvino.runtime import PartialShape # pylint: disable=no-name-in-module,import-error + from openvino import PartialShape # pylint: disable=no-name-in-module,import-error model_inputs = input_model.get_inputs() input_user_data_types_dict = {} input_user_shapes_dict = {} @@ -396,7 +396,7 @@ def convert_params_lists_to_dicts(input_model, # input_user_data_types is list only if unnamed inputs were used if isinstance(input_user_data_types, list): - from openvino.runtime import Type # pylint: disable=no-name-in-module,import-error + from openvino import Type # pylint: disable=no-name-in-module,import-error if input_user_shapes_dict is None: input_user_shapes_dict = {} @@ -404,7 +404,7 @@ def convert_params_lists_to_dicts(input_model, # this cycle adds each unnamed type to dictionary using name from model_inputs for idx, node_type in enumerate(input_user_data_types): assert isinstance(node_type, (type, np.dtype, Type)), "Got incorrect format of input types. " \ - "Expected numpy type or openvino.runtime.Type, " \ + "Expected numpy type or openvino.Type, " \ "got {}.".format(type(node_type)) inp_name = find_first_unused_input(model_inputs, input_user_data_types_dict, "type") diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/layout_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/layout_utils.py index 68c4406622f9b6..62d85d18d0df81 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/layout_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/layout_utils.py @@ -3,7 +3,7 @@ from typing import Callable -from openvino.runtime import PartialShape # pylint: disable=no-name-in-module,import-error +from openvino import PartialShape # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.error import Error from openvino.tools.ovc.utils import refer_to_faq_msg @@ -62,7 +62,7 @@ def get_dimension_index_by_label(input_shape: PartialShape, input_names: list, l layout = layout_value.get('source_layout', None) if layout is None: return default_dim, True - from openvino.runtime import Layout # pylint: disable=no-name-in-module,import-error + from openvino import Layout # pylint: disable=no-name-in-module,import-error layout_parsed = Layout(layout) if layout_parsed.has_name(dimension_label): return layout_parsed.get_index_by_name(dimension_label), False diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/moc_emit_ir.py b/tools/ovc/openvino/tools/ovc/moc_frontend/moc_emit_ir.py index 9e8e2507dea1a3..b786a59b02a1e8 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/moc_emit_ir.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/moc_emit_ir.py @@ -3,7 +3,7 @@ import argparse -from openvino.runtime import Model # pylint: disable=no-name-in-module,import-error +from openvino import Model # pylint: disable=no-name-in-module,import-error from 
openvino.tools.ovc.moc_frontend.preprocessing import apply_preprocessing diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/offline_transformations.py b/tools/ovc/openvino/tools/ovc/moc_frontend/offline_transformations.py index 9b7ea2bbf35065..2e85d44e21f77f 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/offline_transformations.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/offline_transformations.py @@ -78,7 +78,7 @@ def get_available_transformations(): return {} -# net should be openvino.runtime.Model type, but OV Engine is still optional dependency +# net should be openvino.Model type, but OV Engine is still optional dependency def apply_user_transformations(func: object, transforms: list): available_transformations = get_available_transformations() diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py index 4a297707a0e537..58cc07ed5dc319 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py @@ -11,8 +11,8 @@ from openvino.frontend import FrontEnd, InputModel, NotImplementedFailure, \ Place # pylint: disable=no-name-in-module,import-error -from openvino.runtime import PartialShape, Type # pylint: disable=no-name-in-module,import-error -from openvino.runtime.utils.types import get_element_type, \ +from openvino import PartialShape, Type # pylint: disable=no-name-in-module,import-error +from openvino.utils.types import get_element_type, \ get_numpy_ctype # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.moc_frontend.analysis import json_model_analysis_dump from openvino.tools.ovc.moc_frontend.extractor import fe_user_data_repack, convert_params_lists_to_dicts, \ diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/preprocessing.py b/tools/ovc/openvino/tools/ovc/moc_frontend/preprocessing.py index 0195960065d88e..a182cfa7f4481c 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/preprocessing.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/preprocessing.py @@ -6,7 +6,7 @@ from openvino.preprocess import PrePostProcessor # pylint: disable=no-name-in-module,import-error # pylint: disable=no-name-in-module,import-error -from openvino.runtime import Model, Layout, PartialShape +from openvino import Model, Layout, PartialShape from openvino.tools.ovc.error import Error from openvino.tools.ovc.moc_frontend.layout_utils import update_layout_to_dict from openvino.tools.ovc.utils import refer_to_faq_msg diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index 486f72d87fd89d..830a066562a6e0 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -8,7 +8,7 @@ import numpy as np # pylint: disable=no-name-in-module,import-error -from openvino.runtime import Tensor, PartialShape +from openvino import Tensor, PartialShape from openvino.tools.ovc.cli_parser import single_input_to_input_cut_info, _InputCutInfo from openvino.tools.ovc.error import Error diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/shape_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/shape_utils.py index 739f0f53b0ef16..9145fad3b1bb06 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/shape_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/shape_utils.py @@ -4,7 +4,7 @@ import sys import numpy as np -from openvino.runtime import 
Shape, PartialShape, Dimension # pylint: disable=no-name-in-module,import-error +from openvino import Shape, PartialShape, Dimension # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.error import Error diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/type_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/type_utils.py index b7f7c8b0abc1ce..50a3ffdc3e9fce 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/type_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/type_utils.py @@ -6,7 +6,7 @@ import numpy as np import openvino as ov -from openvino.runtime import Type +from openvino import Type def is_type(val): diff --git a/tools/ovc/openvino/tools/ovc/telemetry_utils.py b/tools/ovc/openvino/tools/ovc/telemetry_utils.py index f68a92be5d2de5..98769357d69046 100644 --- a/tools/ovc/openvino/tools/ovc/telemetry_utils.py +++ b/tools/ovc/openvino/tools/ovc/telemetry_utils.py @@ -4,7 +4,7 @@ import argparse import numbers import os -from openvino.runtime import get_version as get_rt_version # pylint: disable=no-name-in-module,import-error +from openvino import get_version as get_rt_version # pylint: disable=no-name-in-module,import-error from openvino.tools.ovc.cli_parser import get_params_with_paths_list from openvino.tools.ovc.telemetry_params import telemetry_params from openvino.tools.ovc.utils import check_values_equal @@ -95,7 +95,7 @@ def send_conversion_result(conversion_result: str, need_shutdown=False): def arg_to_str(arg): # This method converts to string only known types, otherwise returns string with name of the type - from openvino.runtime import PartialShape, Shape, Type, Layout # pylint: disable=no-name-in-module,import-error + from openvino import PartialShape, Shape, Type, Layout # pylint: disable=no-name-in-module,import-error if isinstance(arg, (PartialShape, Shape, Type, Layout)): return str(arg) if isinstance(arg, (str, numbers.Number, bool)): diff --git a/tools/ovc/openvino/tools/ovc/version.py b/tools/ovc/openvino/tools/ovc/version.py index 34e0128af929da..f7509efa7ab4c5 100644 --- a/tools/ovc/openvino/tools/ovc/version.py +++ b/tools/ovc/openvino/tools/ovc/version.py @@ -3,7 +3,7 @@ import re -from openvino.runtime import get_version as get_ie_version # pylint: disable=no-name-in-module,import-error +from openvino import get_version as get_ie_version # pylint: disable=no-name-in-module,import-error def extract_release_version(version: str): @@ -29,7 +29,7 @@ def simplify_version(version: str): def get_simplified_ie_version(version=None): - from openvino.runtime import get_version # pylint: disable=no-name-in-module,import-error + from openvino import get_version # pylint: disable=no-name-in-module,import-error if version is None: version = get_version() diff --git a/tools/ovc/unit_tests/moc_tf_fe/conversion_basic_models_test.py b/tools/ovc/unit_tests/moc_tf_fe/conversion_basic_models_test.py index dc29b439b6151d..f10b5d32209e33 100644 --- a/tools/ovc/unit_tests/moc_tf_fe/conversion_basic_models_test.py +++ b/tools/ovc/unit_tests/moc_tf_fe/conversion_basic_models_test.py @@ -4,7 +4,7 @@ import numpy as np import os import pytest -from openvino.runtime import Core +from openvino import Core from openvino.tools.ovc.convert import convert_model diff --git a/tools/ovc/unit_tests/moc_tf_fe/utils.py b/tools/ovc/unit_tests/moc_tf_fe/utils.py index f68c3ef0559b45..78fa88d26f5891 100644 --- a/tools/ovc/unit_tests/moc_tf_fe/utils.py +++ b/tools/ovc/unit_tests/moc_tf_fe/utils.py @@ -5,7 +5,7 @@ import numpy as np -from openvino.runtime import 
Core +from openvino import Core from openvino.tools.ovc.convert import convert_model diff --git a/tools/ovc/unit_tests/ovc/convert/import_from_mo_test.py b/tools/ovc/unit_tests/ovc/convert/import_from_mo_test.py index 526942ccf712fe..1b7ddcc675f41d 100644 --- a/tools/ovc/unit_tests/ovc/convert/import_from_mo_test.py +++ b/tools/ovc/unit_tests/ovc/convert/import_from_mo_test.py @@ -3,7 +3,7 @@ import os import tempfile -from openvino.runtime import serialize +from openvino import serialize from pathlib import Path from unit_tests.ovc.convert.utils import create_onnx_model, save_to_onnx from unit_tests.ovc.unit_test_with_mocked_telemetry import UnitTestWithMockedTelemetry diff --git a/tools/ovc/unit_tests/ovc/convert/meta_data_test.py b/tools/ovc/unit_tests/ovc/convert/meta_data_test.py index d28afb507d3ea5..cf10d217523a80 100644 --- a/tools/ovc/unit_tests/ovc/convert/meta_data_test.py +++ b/tools/ovc/unit_tests/ovc/convert/meta_data_test.py @@ -3,8 +3,8 @@ import os import tempfile -from openvino.runtime import get_version as get_rt_version -from openvino.runtime import serialize +from openvino import get_version as get_rt_version +from openvino import serialize from openvino.tools.ovc import convert_model from pathlib import Path from unit_tests.ovc.unit_test_with_mocked_telemetry import UnitTestWithMockedTelemetry diff --git a/tools/ovc/unit_tests/ovc/convert/meta_data_test_actual.py b/tools/ovc/unit_tests/ovc/convert/meta_data_test_actual.py index 337853942d87bb..0413b3467c7f86 100644 --- a/tools/ovc/unit_tests/ovc/convert/meta_data_test_actual.py +++ b/tools/ovc/unit_tests/ovc/convert/meta_data_test_actual.py @@ -6,8 +6,8 @@ import unittest from pathlib import Path -from openvino.runtime import get_version as get_rt_version -from openvino.runtime import serialize, convert_model +from openvino import get_version as get_rt_version +from openvino import serialize, convert_model from openvino.tools.mo.utils.ir_reader.restore_graph import restore_graph_from_ir, save_restored_graph @@ -74,7 +74,7 @@ def ref_meta_data(): serialize(ov_model, out_xml.encode('utf-8'), out_xml.replace('.xml', '.bin').encode('utf-8')) - from openvino.runtime import Core + from openvino import Core core = Core() deserialized_model = core.read_model(out_xml) self.check_meta_data(deserialized_model, ref_meta) diff --git a/tools/ovc/unit_tests/ovc/utils/cli_parser_test.py b/tools/ovc/unit_tests/ovc/utils/cli_parser_test.py index 735acf6544d164..9eab60dcdab0ed 100644 --- a/tools/ovc/unit_tests/ovc/utils/cli_parser_test.py +++ b/tools/ovc/unit_tests/ovc/utils/cli_parser_test.py @@ -7,8 +7,8 @@ import tempfile import unittest -import openvino.runtime as ov -from openvino.runtime import PartialShape +import openvino as ov +from openvino import PartialShape from openvino.tools.ovc.cli_parser import _InputCutInfo from openvino.tools.ovc.cli_parser import input_to_input_cut_info, \