diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a9b468dff4..9ab4587c2a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,3 +8,7 @@ updates: directory: "image_generation/lcm_dreamshaper_v7/cpp/scripts/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "text_generation/causal_lm/cpp/" + schedule: + interval: "weekly" diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml new file mode 100644 index 0000000000..9faa853a2f --- /dev/null +++ b/.github/workflows/bandit.yml @@ -0,0 +1,16 @@ +name: python -m bandit --recursive --configfile bandit.yml . +on: + pull_request: + paths-ignore: + - 'thirdparty' + - '**.md' +jobs: + bandit: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: 3.11 + - run: python -m pip install bandit + - run: python -m bandit --recursive --configfile bandit.yml . diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f3ff8a6ee7..df03bab7c6 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -3,7 +3,6 @@ on: pull_request: paths: - .github/workflows/causal_lm_cpp.yml - - llm_bench/python/** - text_generation/causal_lm/cpp/* - thirdparty/openvino_tokenizers - "!**.md" @@ -24,20 +23,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id openlm-research/open_llama_3b_v2 --output_dir ./open_llama_3b_v2/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - - name: convert_tokenizer and run + - name: greedy_causal_lm run: | source ./ov/setupvars.sh - convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer - ./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0" + ./build/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -51,21 +51,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl 
https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?" > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -81,7 +82,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -97,7 +98,7 @@ jobs: " echo "69" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ Hi > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -113,7 +114,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "return 0" > ./pred.txt + timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -129,7 +130,7 @@ jobs: " echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "你好! 你好嗎?" > ./pred.txt + ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -145,7 +146,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -178,23 +179,23 @@ jobs: - name: Install OpenVINO shell: bash run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/windows/w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64.zip unzip ov.zip - name: Download, convert and build shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 + call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat + python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64\setupvars.bat - convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --with-detokenizer + call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "69" > .\pred.txt + .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py @@ -219,20 +220,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id Qwen/Qwen-7B-Chat --output_dir ./Qwen-7B-Chat/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake 
-DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./Qwen-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -246,20 +248,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id Qwen/Qwen1.5-7B-Chat --output_dir ./Qwen1.5-7B-Chat/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Run run: | source ./ov/setupvars.sh - convert_tokenizer ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/pytorch/dldt/FP16/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
> ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -273,20 +276,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id microsoft/phi-2 --output_dir ./Phi-2/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./Phi-2/pytorch/dldt/FP16/ --output ./Phi-2/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./Phi-2/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 50s ./build/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores steps: @@ -299,20 +304,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id argilla/notus-7b-v1 --output_dir ./notus-7b-v1/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./notus-7b-v1/pytorch/dldt/FP16/ --output ./notus-7b-v1/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 
50s ./build/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -331,19 +337,18 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id databricks/dolly-v2-3b --output_dir ./dolly-v2-3b/ --precision FP16 - python ./llm_bench/python/convert.py --model_id databricks/dolly-v2-7b --output_dir ./dolly-v2-7b/ --precision FP16 - convert_tokenizer ./dolly-v2-3b/pytorch/dldt/FP16/ --output ./dolly-v2-3b/pytorch/dldt/FP16/ --with-detokenizer - convert_tokenizer ./dolly-v2-7b/pytorch/dldt/FP16/ --output ./dolly-v2-7b/pytorch/dldt/FP16/ --with-detokenizer + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: run and compare run: | source ./ov/setupvars.sh - ./build/speculative_decoding_lm ./dolly-v2-3b/pytorch/dldt/FP16/ ./dolly-v2-7b/pytorch/dldt/FP16/ "Alan Turing was a" > predictions_speculative.txt - ./build/greedy_causal_lm ./dolly-v2-7b/pytorch/dldt/FP16/ "Alan Turing was a" > predictions_greedy.txt + ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -366,17 +371,17 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt "transformers<4.38" ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 - convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 
TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait - name: run and compare run: | source ./ov/setupvars.sh @@ -388,8 +393,8 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "$( predictions_prompt_lookup.txt - ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "$( predictions_greedy.txt + ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -411,21 +416,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id microsoft/phi-1_5 --output_dir ./Phi-1_5/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j 15 - wait - name: Run Generation run: | source ./ov/setupvars.sh - convert_tokenizer ./Phi-1_5/pytorch/dldt/FP16/ --output ./Phi-1_5/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/greedy_causal_lm ./Phi-1_5/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/beam_search_causal_lm ./Phi-1_5/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -444,7 +450,7 @@ jobs: echo Phi-1_5 passed cpp-greedy_causal_lm-redpajama-3b-chat: - runs-on: ubuntu-20.04 + runs-on: ubuntu-20.04-4-cores steps: - uses: actions/checkout@v4 with: @@ -455,20 +461,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | 
source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager "optimum>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ./llm_bench/python/convert.py --model_id ikala/redpajama-3b-chat --output_dir ./redpajama-3b-chat/ --precision FP16 & + python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + sudo apt-get install libtbb-dev + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ cmake --build ./build/ --config Release -j - wait + - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - convert_tokenizer ./redpajama-3b-chat/pytorch/dldt/FP16/ --output ./redpajama-3b-chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/pytorch/dldt/FP16/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 0d21e42f95..de06153570 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -1,4 +1,5 @@ name: lcm_dreamshaper + on: pull_request: paths: @@ -7,75 +8,106 @@ on: - .github/workflows/lcm_dreamshaper_cpp.yml - thirdparty/openvino_tokenizers - "!**.md" + +env: + working_directory: "./image_generation/lcm_dreamshaper_v7/cpp/" + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true + jobs: lcm_dreamshaper_v7_cpp-linux: runs-on: ubuntu-20.04 + defaults: + run: + # Do not ignore bash profile files. 
From: + # https://github.com/marketplace/actions/setup-miniconda#important + shell: bash -l {0} steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: actions/setup-python@v4 + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 with: - python-version: 3.8 - - name: Initialize OpenVINO + miniconda-version: "latest" + activate-environment: openvino_lcm_cpp + python-version: "3.10" + + - name: Install OpenVINO and other conda dependencies run: | - mkdir openvino - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./openvino/ --strip-components 1 -xz - sudo ./openvino/install_dependencies/install_openvino_dependencies.sh - - name: Download / convert a model / tokenizer + conda activate openvino_lcm_cpp + conda update -c conda-forge --all + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH + + - name: Install python dependencies + working-directory: ${{ env.working_directory }} run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/scripts/ - python -m pip install -U pip - python -m pip install -r ./requirements.txt - python -m pip install ../../../../thirdparty/openvino_tokenizers/ - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t "FP16" + conda activate openvino_lcm_cpp + python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + + - name: Download and convert model and tokenizer + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + - name: Build app + working-directory: ${{ env.working_directory }} run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/ + conda activate openvino_lcm_cpp cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --parallel + - name: Run app - run: | - source ./openvino/setupvars.sh - cd ./image_generation/lcm_dreamshaper_v7/cpp/build/ - ./lcm_dreamshaper + working-directory: ${{ env.working_directory }} + run: ./build/lcm_dreamshaper + lcm_dreamshaper_v7_cpp-windows: - runs-on: windows-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: Initialize OpenVINO - shell: cmd - run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64.zip - unzip ov.zip - - name: Download / convert a model / tokenizer - shell: cmd - run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/scripts/ - python -m pip install -r ./requirements.txt - python -m pip install ../../../../thirdparty/openvino_tokenizers/ - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t "FP16" - - name: Build app - shell: cmd - run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/ - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --parallel - - name: Run app - shell: cmd - 
run: | - call w_openvino_toolkit_windows_2024.1.0.dev20240304_x86_64/setupvars.bat - cd ./image_generation/lcm_dreamshaper_v7/cpp/build/ - call "./Release/lcm_dreamshaper.exe" + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup conda + uses: conda-incubator/setup-miniconda@v3 + with: + miniconda-version: "latest" + activate-environment: openvino_lcm_cpp + python-version: "3.10" + + - name: Install OpenVINO and other conda dependencies + run: | + conda activate openvino_lcm_cpp + conda update -c conda-forge --all + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH + + - name: Install python dependencies + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + + - name: Download and convert model and tokenizer + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16 + + - name: Build app + working-directory: ${{ env.working_directory }} + run: | + conda activate openvino_lcm_cpp + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + + - name: Run app + working-directory: ${{ env.working_directory }} + run: '& "./build/Release/lcm_dreamshaper.exe" -r --dynamic' diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 41c68becd8..38a2022e1d 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -39,7 +39,7 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies @@ -53,9 +53,7 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - export MODEL_PATH="models/stable_diffusion_v1_5_ov/FP16" - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --convert-tokenizer --weight-format fp16 $MODEL_PATH - convert_tokenizer $MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $MODEL_PATH/tokenizer/ + optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --weight-format fp16 models/stable_diffusion_v1_5_ov/FP16 - name: Build app working-directory: ${{ env.working_directory }} @@ -66,8 +64,7 @@ jobs: - name: Run app working-directory: ${{ env.working_directory }} - run: | - ./build/stable_diffusion -m ./models/stable_diffusion_v1_5_ov -t FP16 + run: ./build/stable_diffusion -m ./models/stable_diffusion_v1_5_ov -t FP16 stable_diffusion_1_5_cpp-windows: runs-on: windows-latest @@ -86,7 +83,7 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake + conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake - name: Install python dependencies working-directory: ${{ 
env.working_directory }} @@ -99,9 +96,7 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - $env:MODEL_PATH='models/stable_diffusion_v1_5_ov/FP16' - optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --convert-tokenizer --weight-format fp16 $env:MODEL_PATH - convert_tokenizer $env:MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $env:MODEL_PATH/tokenizer/ + optimum-cli export openvino --model runwayml/stable-diffusion-v1-5 --task stable-diffusion --weight-format fp16 models/stable_diffusion_v1_5_ov/FP16 - name: Build app working-directory: ${{ env.working_directory }} @@ -112,5 +107,4 @@ jobs: - name: Run app working-directory: ${{ env.working_directory }} - run: | - & "./build/Release/stable_diffusion.exe" -m ./models/stable_diffusion_v1_5_ov -t FP16 --dynamic + run: '& "./build/Release/stable_diffusion.exe" -m ./models/stable_diffusion_v1_5_ov -t FP16 --dynamic' diff --git a/bandit.yml b/bandit.yml new file mode 100644 index 0000000000..be2fd3da5b --- /dev/null +++ b/bandit.yml @@ -0,0 +1,398 @@ +### This config may optionally select a subset of tests to run or skip by +### filling out the 'tests' and 'skips' lists given below. If no tests are +### specified for inclusion then it is assumed all tests are desired. The skips +### set will remove specific tests from the include set. This can be controlled +### using the -t/-s CLI options. Note that the same test ID should not appear +### in both 'tests' and 'skips', this would be nonsensical and is detected by +### Bandit at runtime. + +# Available tests: +# B101 : assert_used +# B102 : exec_used +# B103 : set_bad_file_permissions +# B104 : hardcoded_bind_all_interfaces +# B105 : hardcoded_password_string +# B106 : hardcoded_password_funcarg +# B107 : hardcoded_password_default +# B108 : hardcoded_tmp_directory +# B110 : try_except_pass +# B112 : try_except_continue +# B201 : flask_debug_true +# B301 : pickle +# B302 : marshal +# B303 : md5 +# B304 : ciphers +# B305 : cipher_modes +# B306 : mktemp_q +# B307 : eval +# B308 : mark_safe +# B310 : urllib_urlopen +# B311 : random +# B312 : telnetlib +# B313 : xml_bad_cElementTree +# B314 : xml_bad_ElementTree +# B315 : xml_bad_expatreader +# B316 : xml_bad_expatbuilder +# B317 : xml_bad_sax +# B318 : xml_bad_minidom +# B319 : xml_bad_pulldom +# B320 : xml_bad_etree +# B321 : ftplib +# B323 : unverified_context +# B324 : hashlib_new_insecure_functions +# B401 : import_telnetlib +# B402 : import_ftplib +# B403 : import_pickle +# B404 : import_subprocess +# B405 : import_xml_etree +# B406 : import_xml_sax +# B407 : import_xml_expat +# B408 : import_xml_minidom +# B409 : import_xml_pulldom +# B410 : import_lxml +# B411 : import_xmlrpclib +# B412 : import_httpoxy +# B413 : import_pycrypto +# B501 : request_with_no_cert_validation +# B502 : ssl_with_bad_version +# B503 : ssl_with_bad_defaults +# B504 : ssl_with_no_version +# B505 : weak_cryptographic_key +# B506 : yaml_load +# B507 : ssh_no_host_key_verification +# B601 : paramiko_calls +# B602 : subprocess_popen_with_shell_equals_true +# B603 : subprocess_without_shell_equals_true +# B604 : any_other_function_with_shell_equals_true +# B605 : start_process_with_a_shell +# B606 : start_process_with_no_shell +# B607 : start_process_with_partial_path +# B608 : hardcoded_sql_expressions +# B609 : linux_commands_wildcard_injection +# B610 : django_extra_used +# B611 : django_rawsql_used +# B701 : jinja2_autoescape_false +# B702 : use_of_mako_templates +# B703 : 
django_mark_safe + +# (optional) list included test IDs here, eg '[B101, B406]': +# IPAS Required Checkers. Do not disable these +# Additional checkers may be added if desired +tests: + [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413'] + +# (optional) list skipped test IDs here, eg '[B101, B406]': +# The following checkers are not required but be added to tests list if desired +skips: + [ 'B101', 'B102', 'B103', 'B104', 'B105', 'B106', 'B107', 'B108', 'B110', 'B112', 'B201', 'B501', 'B502', 'B503', 'B504', 'B505', 'B506', 'B507', 'B601', 'B602', 'B603', 'B604', 'B605', 'B606', 'B607', 'B608', 'B609', 'B610', 'B611', 'B701', 'B702', 'B703'] + +### (optional) plugin settings - some test plugins require configuration data +### that may be given here, per-plugin. All bandit test plugins have a built in +### set of sensible defaults and these will be used if no configuration is +### provided. It is not necessary to provide settings for every (or any) plugin +### if the defaults are acceptable. + +any_other_function_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +assert_used: + skips: [] +hardcoded_tmp_directory: + tmp_dirs: + - /tmp + - /var/tmp + - /dev/shm +linux_commands_wildcard_injection: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +ssl_with_bad_defaults: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +ssl_with_bad_version: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +start_process_with_a_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - 
subprocess.check_call + - subprocess.check_output + - subprocess.run +start_process_with_no_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +start_process_with_partial_path: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_popen_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_without_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +try_except_continue: + check_typed_exception: false +try_except_pass: + check_typed_exception: false +weak_cryptographic_key: + weak_key_size_dsa_high: 1024 + weak_key_size_dsa_medium: 2048 + weak_key_size_ec_high: 160 + weak_key_size_ec_medium: 224 + weak_key_size_rsa_high: 1024 + weak_key_size_rsa_medium: 2048 +exclude_dirs: + - thirdparty diff --git a/image_generation/common/diffusers/src/scheduler_lcm.cpp b/image_generation/common/diffusers/src/scheduler_lcm.cpp index af82c981a4..d5f97b6772 100644 --- a/image_generation/common/diffusers/src/scheduler_lcm.cpp +++ b/image_generation/common/diffusers/src/scheduler_lcm.cpp @@ -192,7 +192,7 @@ std::map LCMScheduler::step(ov::Tensor noise_pred, ov:: if (inference_step != num_inference_steps - 1) { std::vector noise; if (read_torch_noise) { - std::string noise_file = "../scripts/torch_noise_step_" + std::to_string(inference_step) + ".txt"; + 
std::string noise_file = "./latents/torch_noise_step_" + std::to_string(inference_step) + ".txt"; noise = read_vector_from_txt(noise_file); } else { noise = randn_function(noise_pred.get_size(), seed); diff --git a/image_generation/lcm_dreamshaper_v7/cpp/.gitignore b/image_generation/lcm_dreamshaper_v7/cpp/.gitignore new file mode 100644 index 0000000000..cf7dbce266 --- /dev/null +++ b/image_generation/lcm_dreamshaper_v7/cpp/.gitignore @@ -0,0 +1,2 @@ +images +models diff --git a/image_generation/lcm_dreamshaper_v7/cpp/README.md b/image_generation/lcm_dreamshaper_v7/cpp/README.md index 1d4f1f7ace..6077b8a1c7 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/README.md +++ b/image_generation/lcm_dreamshaper_v7/cpp/README.md @@ -2,19 +2,25 @@ The pure C++ text-to-image pipeline, driven by the OpenVINO native API for SD v1.5 Latent Consistency Model with LCM Scheduler. It includes advanced features like LoRA integration with safetensors and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. [The common folder](../../common/) contains schedulers for image generation and `imwrite()` for saving `bmp` images. This demo has been tested for Linux platform only. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/263-latent-consistency-models-image-generation/263-lcm-lora-controlnet.ipynb) which provides an example of image generaztion in Python. > [!NOTE] ->This tutorial assumes that the current working directory is `/image_generation/lcm_dreamshaper_v7/cpp/` and all paths are relative to this folder. +> This tutorial assumes that the current working directory is `/image_generation/lcm_dreamshaper_v7/cpp/` and all paths are relative to this folder. ## Step 1: Prepare build environment +Prerequisites: +- Conda ([installation guide](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)) + C++ Packages: * [CMake](https://cmake.org/download/): Cross-platform build tool -* [OpenVINO](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html): Model inference +* [OpenVINO](https://docs.openvino.ai/2024/get-started/install-openvino.html): Model inference Prepare a python environment and install dependencies: ```shell conda create -n openvino_lcm_cpp python==3.10 conda activate openvino_lcm_cpp -conda install -c conda-forge openvino c-compiler cxx-compiler make +conda update -c conda-forge --all +conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +# Ensure that Conda standard libraries are used +conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` ## Step 2: Latent Consistency Model and Tokenizer models @@ -26,21 +32,18 @@ conda install -c conda-forge openvino c-compiler cxx-compiler make ```shell git submodule update --init conda activate openvino_lcm_cpp - python -m pip install -r scripts/requirements.txt + python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] ``` -2. Run model conversion script to download and convert PyTorch model to OpenVINO IR via [optimum-intel](https://github.com/huggingface/optimum-intel). Please, use the script `scripts/convert_model.py` to convert the model: +2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). 
Example command for downloading and exporting FP16 model: - ```shell - cd scripts - python convert_model.py -lcm "SimianLuo/LCM_Dreamshaper_v7" -t FP16 - ``` + `optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16` If https://huggingface.co/ is down, the script won't be able to download the model. > [!NOTE] ->Only static model is currently supported for this sample. +> Only static model is currently supported for this sample. ### LoRA enabling with safetensors @@ -67,19 +70,20 @@ Usage: lcm_dreamshaper [OPTION...] ``` -* `-p, --posPrompt arg` Initial positive prompt for SD (default: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting) +* `-p, --posPrompt arg` Initial positive prompt for LCM (default: a beautiful pink unicorn) * `-d, --device arg` AUTO, CPU, or GPU. Doesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only (default: CPU) -* `--step arg` Number of diffusion step ( default: 20) +* `--step arg` Number of diffusion step (default: 4) * `-s, --seed arg` Number of random seed to generate latent (default: 42) -* `--num arg` Number of image output(default: 1) +* `--num arg` Number of image output (default: 1) * `--height arg` Height of output image (default: 512) * `--width arg` Width of output image (default: 512) * `-c, --useCache` Use model caching -* `-r, --readNPLatent` Read numpy generated latents from file -* `-m, --modelPath arg` Specify path of SD model IR (default: ../scripts/SimianLuo/LCM_Dreamshaper_v7) -* `-t, --type arg` Specify the type of SD model IR (FP16_static or FP16_dyn) (default: FP16_static) -* `-l, --loraPath arg` Specify path of lora file. (*.safetensors). (default: ) -* `-a, --alpha arg` alpha for lora (default: 0.75) +* `-r, --readNPLatent` Read numpy generated latents from file, only supported for one output image +* `-m, --modelPath arg` Specify path to LCM model IRs (default: ./models/lcm_dreamshaper_v7) +* `-t, --type arg` Specify the type of LCM model IRs (e.g., FP32, FP16 or INT8) (default: FP16) +* `--dynamic` Specify the model input shape to use dynamic shape +* `-l, --loraPath arg` Specify path to LoRA file (*.safetensors) (default: ) +* `-a, --alpha arg` Specify alpha for LoRA (default: 0.75) * `-h, --help` Print usage > [!NOTE] @@ -91,15 +95,15 @@ Positive prompt: a beautiful pink unicorn Read the numpy latent input and noise for scheduler instead of C++ std lib for the alignment with Python pipeline. 
-* Generate image with random data generated by Python `./build/lcm_dreamshaper -r` +* Generate image with random data generated by Python: `./build/lcm_dreamshaper -r` ![image](./python_random.bmp) -* Generate image with C++ lib generated latent and noise : `./build/lcm_dreamshaper` +* Generate image with C++ lib generated latent and noise: `./build/lcm_dreamshaper` ![image](./cpp_random.bmp) -* Generate image with soulcard lora and C++ generated latent and noise `./stable_diffusion -r -l path/to/soulcard.safetensors` +* Generate image with soulcard lora and C++ generated latent and noise: `./stable_diffusion -r -l path/to/soulcard.safetensors` ![image](./lora_cpp_random.bmp) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/np_latents_512x512.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/np_latents_512x512.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/np_latents_512x512.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/np_latents_512x512.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_0.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_0.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_0.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_0.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_1.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_1.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_1.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_1.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_2.txt b/image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_2.txt similarity index 100% rename from image_generation/lcm_dreamshaper_v7/cpp/scripts/torch_noise_step_2.txt rename to image_generation/lcm_dreamshaper_v7/cpp/latents/torch_noise_step_2.txt diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt new file mode 100644 index 0000000000..047e0d826f --- /dev/null +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.2.2+cpu +diffusers==0.27.2 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py b/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py deleted file mode 100644 index c55ec0ecc9..0000000000 --- a/image_generation/lcm_dreamshaper_v7/cpp/scripts/convert_model.py +++ /dev/null @@ -1,41 +0,0 @@ -from pathlib import Path -import argparse -from optimum.intel.openvino import OVLatentConsistencyModelPipeline -from transformers import AutoTokenizer -from openvino_tokenizers import convert_tokenizer -from openvino import Type, save_model - - -def parse_args() -> argparse.Namespace: - """Parse and return command line arguments.""" - parser = argparse.ArgumentParser(add_help=False) - args = parser.add_argument_group('Options') - args.add_argument('-h', '--help', action = 'help', - help='Show this help message and exit.') - args.add_argument('-t', '--type', type = str, default = "FP32", required = True, - help='Required. 
data type, FP32, FP16.') - args.add_argument('-lcm','--lcm_weights', type = str, default="SimianLuo/LCM_Dreamshaper_v7", required = True, - help='Specify the path of lcm model') - return parser.parse_args() - -args = parse_args() -output_path = Path(args.lcm_weights) / (args.type + "_static") - -###convert LCM model to IR - -model = OVLatentConsistencyModelPipeline.from_pretrained(args.lcm_weights, trust_remote_code=True, export=True, compile=False) -if args.type == "FP16": - model.half() - -model.reshape(1, 512, 512, 1) - -model.compile() -model.save_pretrained(output_path) - -# convert tokenizer - -tokenizer_path = output_path / "tokenizer" -hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) -ov_tokenizer_encoder = convert_tokenizer(hf_tokenizer, tokenizer_output_type=Type.i32) - -save_model(ov_tokenizer_encoder, tokenizer_path / "openvino_tokenizer.xml", compress_to_fp16=False) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt deleted file mode 100644 index f7003f7218..0000000000 --- a/image_generation/lcm_dreamshaper_v7/cpp/scripts/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -torch==2.3.0+cpu -diffusers==0.27.2 -optimum-intel[nncf,openvino]==1.16.1 diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 1df11bee29..0b06d22067 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -21,11 +21,14 @@ #include "lora.hpp" #include "imwrite.hpp" -ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, 4, height / 8, width / 8}); +const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' +const size_t VAE_SCALE_FACTOR = 8; + +ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { + ov::Tensor noise(ov::element::f32, shape); if (use_np_latents) { // read np generated latents with defaut seed 42 - const char * latent_file_name = "../scripts/np_latents_512x512.txt"; + const char * latent_file_name = "./latents/np_latents_512x512.txt"; std::ifstream latent_copy_file(latent_file_name, std::ios::ate); OPENVINO_ASSERT(latent_copy_file.is_open(), "Cannot open ", latent_file_name); @@ -60,13 +63,67 @@ void apply_lora(std::shared_ptr model, InsertLoRA::LoRAMap& lora_map) } } -StableDiffusionModels compile_models(const std::string& model_path, const std::string& device, - const std::string& lora_path, const float alpha, const bool use_cache) { +void reshape_text_encoder(std::shared_ptr model, size_t batch_size, size_t tokenizer_model_max_length) { + ov::PartialShape input_shape = model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = tokenizer_model_max_length; + std::map idx_to_shape{{0, input_shape}}; + model->reshape(idx_to_shape); +} + +void reshape_unet(std::shared_ptr model, + int64_t batch_size, + int64_t height, + int64_t width, + int64_t tokenizer_model_max_length) { + height = height / VAE_SCALE_FACTOR; + width = width / VAE_SCALE_FACTOR; + + std::map name_to_shape; + + for (auto input : model->inputs()) { + std::string input_name = input.get_any_name(); + name_to_shape[input_name] = input.get_partial_shape(); + if (input_name == "timestep") { + name_to_shape[input_name][0] = 1; + } else if 
(input_name == "sample") { + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; + } else if (input_name == "time_ids") { + name_to_shape[input_name][0] = batch_size; + } else if (input_name == "timestep_cond") { + name_to_shape[input_name][0] = batch_size; + } else { + name_to_shape[input_name][0] = batch_size; + name_to_shape[input_name][1] = TOKENIZER_MODEL_MAX_LENGTH; + } + } + + model->reshape(name_to_shape); +} + +void reshape_vae_decoder(std::shared_ptr model, int64_t height, int64_t width) { + height = height / VAE_SCALE_FACTOR; + width = width / VAE_SCALE_FACTOR; + ov::Dimension vae_decoder_latent_channels = model->input(0).get_partial_shape()[1]; + std::map idx_to_shape{{0, {1, vae_decoder_latent_channels, height, width}}}; + model->reshape(idx_to_shape); +} + +StableDiffusionModels compile_models(const std::string& model_path, + const std::string& device, + const std::string& lora_path, + const float alpha, + const bool use_cache, + const bool use_dynamic_shapes, + const size_t batch_size, + const size_t height, + const size_t width) { StableDiffusionModels models; ov::Core core; if (use_cache) core.set_property(ov::cache_dir("./cache_dir")); + core.add_extension(TOKENIZERS_LIBRARY_PATH); // read LoRA weights @@ -78,6 +135,9 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s // Text encoder { auto text_encoder_model = core.read_model(model_path + "/text_encoder/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_text_encoder(text_encoder_model, batch_size, TOKENIZER_MODEL_MAX_LENGTH); + } apply_lora(text_encoder_model, lora_weights["text_encoder"]); models.text_encoder = core.compile_model(text_encoder_model, device); } @@ -85,6 +145,9 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s // UNet { auto unet_model = core.read_model(model_path + "/unet/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_unet(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH); + } apply_lora(unet_model, lora_weights["unet"]); models.unet = core.compile_model(unet_model, device); } @@ -92,6 +155,9 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s // VAE decoder { auto vae_decoder_model = core.read_model(model_path + "/vae_decoder/openvino_model.xml"); + if (!use_dynamic_shapes) { + reshape_vae_decoder(vae_decoder_model, height, width); + } ov::preprocess::PrePostProcessor ppp(vae_decoder_model); ppp.output().model().set_layout("NCHW"); ppp.output().tensor().set_layout("NHWC"); @@ -108,15 +174,14 @@ StableDiffusionModels compile_models(const std::string& model_path, const std::s } ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt) { - const size_t MAX_LENGTH = 77; // 'model_max_length' from 'tokenizer_config.json' const size_t HIDDEN_SIZE = static_cast(models.text_encoder.output(0).get_partial_shape()[2].get_length()); const int32_t EOS_TOKEN_ID = 49407, PAD_TOKEN_ID = EOS_TOKEN_ID; - const ov::Shape input_ids_shape({1, MAX_LENGTH}); + const ov::Shape input_ids_shape({1, TOKENIZER_MODEL_MAX_LENGTH}); ov::InferRequest tokenizer_req = models.tokenizer.create_infer_request(); ov::InferRequest text_encoder_req = models.text_encoder.create_infer_request(); - ov::Tensor text_embeddings(ov::element::f32, {1, MAX_LENGTH, HIDDEN_SIZE}); + ov::Tensor text_embeddings(ov::element::f32, {1, TOKENIZER_MODEL_MAX_LENGTH, HIDDEN_SIZE}); ov::Tensor input_ids(ov::element::i32, input_ids_shape); std::fill_n(input_ids.data(), 
input_ids.get_size(), PAD_TOKEN_ID); @@ -124,7 +189,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt) { tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &pos_prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); @@ -192,20 +257,23 @@ ov::Tensor postprocess_image(ov::Tensor decoded_image) { } int32_t main(int32_t argc, char* argv[]) try { - cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in C++ using OpenVINO\n"); + cxxopts::Options options("lcm_dreamshaper", "LCM_Dreamshaper_v7 implementation in C++ using OpenVINO\n"); options.add_options() - ("p,posPrompt", "Initial positive prompt for LCM ", cxxopts::value()->default_value("a beautiful pink unicorn")) + ("p,posPrompt", "Initial positive prompt for LCM", cxxopts::value()->default_value("a beautiful pink unicorn")) ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) ("step", "Number of diffusion steps", cxxopts::value()->default_value("4")) ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) ("num", "Number of image output", cxxopts::value()->default_value("1")) + ("height","Height of output image",cxxopts::value()->default_value("512")) + ("width", "Width of output image", cxxopts::value()->default_value("512")) ("c,useCache", "Use model caching", cxxopts::value()->default_value("false")) ("r,readNPLatent", "Read numpy generated latents from file, only supported for one output image", cxxopts::value()->default_value("false")) - ("m,modelPath", "Specify path of LCM model IRs", cxxopts::value()->default_value("../scripts/SimianLuo/LCM_Dreamshaper_v7")) - ("t,type", "Specify the type of LCM model IRs (e.g., FP16_static or FP16_dyn)", cxxopts::value()->default_value("FP16_static")) + ("m,modelPath", "Specify path to LCM model IRs", cxxopts::value()->default_value("./models/lcm_dreamshaper_v7")) + ("t,type", "Specify the type of LCM model IRs (e.g., FP32, FP16 or INT8)", cxxopts::value()->default_value("FP16")) + ("dynamic","Specify the model input shape to use dynamic shape",cxxopts::value()->default_value("false")) ("l,loraPath", "Specify path of LoRA file. 
(*.safetensors).", cxxopts::value()->default_value("")) - ("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75")) + ("a,alpha", "Specify alpha for LoRA", cxxopts::value()->default_value("0.75")) ("h,help", "Print usage"); cxxopts::ParseResult result; @@ -227,13 +295,15 @@ int32_t main(int32_t argc, char* argv[]) try { const uint32_t num_inference_steps = result["step"].as(); const uint32_t user_seed = result["seed"].as(); const uint32_t num_images = result["num"].as(); + const uint32_t height = result["height"].as(); + const uint32_t width = result["width"].as(); const bool use_cache = result["useCache"].as(); const bool read_np_latent = result["readNPLatent"].as(); const std::string model_base_path = result["modelPath"].as(); const std::string model_type = result["type"].as(); + const bool use_dynamic_shapes = result["dynamic"].as(); const std::string lora_path = result["loraPath"].as(); const float alpha = result["alpha"].as(); - const uint32_t height = 512, width = 512; OPENVINO_ASSERT(!read_np_latent || (read_np_latent && (num_images == 1)), "\"readNPLatent\" option is only supported for one output image. Number of image output was set to: " + std::to_string(num_images)); @@ -248,14 +318,23 @@ int32_t main(int32_t argc, char* argv[]) try { std::cout << "OpenVINO version: " << ov::get_openvino_version() << std::endl; std::cout << "Running (may take some time) ..." << std::endl; - // Stable Diffusion pipeline + const std::string model_path = model_base_path + "/" + model_type; + if (!std::filesystem::exists(model_path)) { + std::cerr << "Model IRs for type " << model_type << " don't exist in directory " << model_path << "\n"; + std::cerr << "Refer to README.md to know how to export OpenVINO model with particular data type." << std::endl; + return EXIT_FAILURE; + } - StableDiffusionModels models = compile_models(model_base_path + "/" + model_type, device, lora_path, alpha, use_cache); + // Stable Diffusion pipeline + const size_t batch_size = 1; + StableDiffusionModels models = + compile_models(model_path, device, lora_path, alpha, use_cache, use_dynamic_shapes, batch_size, height, width); ov::InferRequest unet_infer_request = models.unet.create_infer_request(); ov::PartialShape sample_shape = models.unet.input("sample").get_partial_shape(); - OPENVINO_ASSERT(sample_shape.is_dynamic() || (sample_shape[2] * 8 == width && sample_shape[3] * 8 == height), - "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); + OPENVINO_ASSERT(sample_shape.is_dynamic() || + (sample_shape[2] * VAE_SCALE_FACTOR == height && sample_shape[3] * VAE_SCALE_FACTOR == width), + "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); // no negative prompt for LCM model: // https://huggingface.co/docs/diffusers/api/pipelines/latent_consistency_models#diffusers.LatentConsistencyModelPipeline @@ -269,12 +348,17 @@ int32_t main(int32_t argc, char* argv[]) try { std::vector timesteps = scheduler->get_timesteps(); float guidance_scale = 8.0; - ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, 256); + const size_t unet_time_cond_proj_dim = static_cast(models.unet.input("timestep_cond").get_partial_shape()[1].get_length()); + ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, unet_time_cond_proj_dim); + + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); + ov::Shape latent_model_input_shape = ov::Shape({1, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + + ov::Tensor 
denoised(ov::element::f32, latent_model_input_shape); - ov::Tensor denoised(ov::element::f32, {1, 4, height / 8, width / 8}); for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? user_seed: user_seed + n; - ov::Tensor latent_model_input = randn_tensor(height, width, read_np_latent, seed); + ov::Tensor latent_model_input = randn_tensor(latent_model_input_shape, read_np_latent, seed); for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index 0ff3ad0906..fb01326ea5 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: ```shell conda create -n openvino_sd_cpp python==3.10 conda activate openvino_sd_cpp -conda install -c conda-forge openvino=2024.0.0 c-compiler cxx-compiler make cmake +conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` @@ -40,13 +40,8 @@ python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) to run Stable Diffusion with LoRA adapters. Example command for downloading and exporting FP16 model: - ```shell - export MODEL_PATH="models/dreamlike_anime_1_0_ov/FP16" - # Using optimum-cli for exporting model to OpenVINO format - optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --convert-tokenizer --weight-format fp16 $MODEL_PATH - # Converting tokenizer manually (`--convert-tokenizer` flag of `optimum-cli` results in "OpenVINO Tokenizer export for CLIPTokenizer is not supported.") - convert_tokenizer $MODEL_PATH/tokenizer/ --tokenizer-output-type i32 -o $MODEL_PATH/tokenizer/ - ``` + + `optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike_anime_1_0_ov/FP16` You can also choose other precision and export FP32 or INT8 model. 
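As an alternative to the `optimum-cli` command above, the export can also be scripted with the `optimum-intel` Python API. The snippet below is a minimal sketch, not part of the sample: it assumes `optimum-intel[openvino]` from `requirements.txt` is installed and follows the same `export=True` / `half()` / `save_pretrained()` pattern that the removed LCM conversion script used; the output path is illustrative only.

```python
# Minimal sketch: export dreamlike-anime-1.0 to OpenVINO IR from Python.
# Assumes optimum-intel[openvino] is installed (see requirements.txt).
from pathlib import Path

from optimum.intel import OVStableDiffusionPipeline

model_id = "dreamlike-art/dreamlike-anime-1.0"
output_path = Path("models/dreamlike_anime_1_0_ov/FP16")  # illustrative output directory

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True, compile=False)
pipe.half()  # keep FP16 weights, matching --weight-format fp16
pipe.save_pretrained(output_path)
```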
diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 289149d134..29b40d70c4 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[nncf,openvino]==1.16.0 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 huggingface_hub[cli]==0.22.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index d1c24c32a8..7f9f9afc3b 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -17,8 +17,6 @@ #include "scheduler_lms_discrete.hpp" const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' -const int64_t UNET_IN_CHANNELS = 4; // 'in_channels' parameter from 'unet/config.json' -const int64_t VAE_DECODER_LATENT_CHANNELS = 4; // 'latent_channels' parameter from 'vae_decoder/config.json' const size_t VAE_SCALE_FACTOR = 8; class Timer { @@ -35,8 +33,8 @@ class Timer { } }; -ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, UNET_IN_CHANNELS, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); +ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { + ov::Tensor noise(ov::element::f32, shape); if (use_np_latents) { // read np generated latents with defaut seed 42 const char* latent_file_name = "../np_latents_512x512.txt"; @@ -111,7 +109,7 @@ void reshape_unet_encoder(std::shared_ptr model, if (input_name == "timestep") { name_to_shape[input_name][0] = 1; } else if (input_name == "sample") { - name_to_shape[input_name] = {batch_size, UNET_IN_CHANNELS, height, width}; + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; } else if (input_name == "time_ids") { name_to_shape[input_name][0] = batch_size; } else { @@ -127,7 +125,8 @@ void reshape_vae_decoder(std::shared_ptr model, int64_t height, int64 height = height / VAE_SCALE_FACTOR; width = width / VAE_SCALE_FACTOR; - std::map idx_to_shape{{0, {1, VAE_DECODER_LATENT_CHANNELS, height, width}}}; + ov::PartialShape input_shape = model->input(0).get_partial_shape(); + std::map idx_to_shape{{0, {1, input_shape[1], height, width}}}; model->reshape(idx_to_shape); } @@ -216,7 +215,7 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, s tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &prompt}); tokenizer_req.infer(); ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); + std::copy_n(input_ids_token.data(), input_ids_token.get_size(), input_ids.data()); // text embeddings text_encoder_req.set_tensor("input_ids", input_ids); @@ -286,44 +285,22 @@ ov::Tensor postprocess_image(ov::Tensor decoded_image) { int32_t main(int32_t argc, char* argv[]) try { cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in C++ using OpenVINO\n"); - options.add_options()( - "p,posPrompt", - "Initial positive prompt for SD ", - cxxopts::value()->default_value( - "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic 
lighting"))( - "n,negPrompt", - "Defaut is empty with space", - cxxopts::value()->default_value(" "))( - "d,device", - "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device " - "only", - cxxopts::value()->default_value( - "CPU"))("step", "Number of diffusion steps", cxxopts::value()->default_value("20"))( - "s,seed", - "Number of random seed to generate latent for one image output", - cxxopts::value()->default_value( - "42"))("num", "Number of image output", cxxopts::value()->default_value("1"))( - "height", - "Destination image height", - cxxopts::value()->default_value( - "512"))("width", "Destination image width", cxxopts::value()->default_value("512"))( - "c,useCache", - "Use model caching", - cxxopts::value()->default_value("false"))("r,readNPLatent", - "Read numpy generated latents from file", - cxxopts::value()->default_value("false"))( - "m,modelPath", - "Specify path of SD model IRs", - cxxopts::value()->default_value("../models/dreamlike_anime_1_0_ov"))( - "t,type", - "Specify the type of SD model IRs (FP32, FP16 or INT8)", - cxxopts::value()->default_value("FP16"))("dynamic", - "Specify the model input shape to use dynamic shape", - cxxopts::value()->default_value("false"))( - "l,loraPath", - "Specify path of LoRA file. (*.safetensors).", - cxxopts::value()->default_value( - ""))("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75"))("h,help", "Print usage"); + options.add_options() + ("p,posPrompt", "Initial positive prompt for SD ", cxxopts::value()->default_value("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")) + ("n,negPrompt", "Defaut is empty with space", cxxopts::value()->default_value(" ")) + ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) + ("step", "Number of diffusion steps", cxxopts::value()->default_value("20")) + ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) + ("num", "Number of image output", cxxopts::value()->default_value("1")) + ("height", "Destination image height", cxxopts::value()->default_value("512")) + ("width", "Destination image width", cxxopts::value()->default_value("512")) + ("c,useCache", "Use model caching", cxxopts::value()->default_value("false")) + ("r,readNPLatent", "Read numpy generated latents from file", cxxopts::value()->default_value("false")) + ("m,modelPath", "Specify path of SD model IRs", cxxopts::value()->default_value("./models/dreamlike_anime_1_0_ov")) + ("t,type", "Specify the type of SD model IRs (FP32, FP16 or INT8)", cxxopts::value()->default_value("FP16")) + ("dynamic", "Specify the model input shape to use dynamic shape", cxxopts::value()->default_value("false")) + ("l,loraPath", "Specify path of LoRA file. (*.safetensors).", cxxopts::value()->default_value("")) + ("a,alpha", "alpha for LoRA", cxxopts::value()->default_value("0.75"))("h,help", "Print usage"); cxxopts::ParseResult result; try { @@ -397,10 +374,13 @@ int32_t main(int32_t argc, char* argv[]) try { for (uint32_t n = 0; n < num_images; n++) { std::uint32_t seed = num_images == 1 ? 
user_seed : user_seed + n; - ov::Tensor noise = randn_tensor(height, width, read_np_latent, seed); + + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); // latents are multiplied by 'init_noise_sigma' - ov::Shape latent_shape = noise.get_shape(), latent_model_input_shape = latent_shape; + ov::Shape latent_shape = ov::Shape({batch_size, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + ov::Shape latent_model_input_shape = latent_shape; + ov::Tensor noise = randn_tensor(latent_shape, read_np_latent, seed); latent_model_input_shape[0] = 2; // Unet accepts batch 2 ov::Tensor latent(ov::element::f32, latent_shape), latent_model_input(ov::element::f32, latent_model_input_shape); diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 3f1d1fa118..a9a696c68b 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -34,7 +34,6 @@ DEFAULT_SUPER_RESOLUTION_STEPS = 50 DEFAULT_SUPER_RESOLUTION_WIDTH = 128 DEFAULT_SUPER_RESOLUTION_HEIGHT = 128 -DEFAULT_OUTPUT_TOKEN_SIZE = 512 MAX_OUTPUT_TOKEN_SIZE = 64 * 1024 mem_consumption = MemConsumption() @@ -88,22 +87,22 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Remove `token_type_ids` from inputs input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data input_token_size = input_tokens[0].numel() - - max_output_token_size = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] - max_output_token_size = MAX_OUTPUT_TOKEN_SIZE if max_output_token_size > MAX_OUTPUT_TOKEN_SIZE else max_output_token_size if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - out_str += 'all max_output_token_size: {} * {}'.format(max_output_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) log.info(out_str) max_rss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() + min_gen_tokens = 0 if args['infer_count'] is None else args['infer_count'] + max_gen_tokens = MAX_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] start = time.perf_counter() - result = model.generate(**input_data, max_new_tokens=int(max_output_token_size), num_beams=args['num_beams'], use_cache=True) + result = model.generate(**input_data, min_new_tokens=int(min_gen_tokens), max_new_tokens=int(max_gen_tokens), num_beams=args['num_beams'], use_cache=True) end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() @@ -124,19 +123,23 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, else: generated_text_len = len(result[bs_idx]) num_tokens += generated_text_len - if generated_text_len > max_output_token_size: + if generated_text_len > max_gen_tokens: log.error('Output token size is over max output token size!') result_text = generated_text[bs_idx] if args["output_dir"] is not None: utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) - 
result_md5_list.append(hashlib.md5(result_text.encode()).hexdigest()) + result_md5_list.append(hashlib.md5(result_text.encode(), usedforsecurity=False).hexdigest()) if num == 0: warmup_md5[prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = bench_hook.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tm_infer_list = bench_hook.get_time_infer_list() iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], - max_output_token_size, + len(tm_infer_list), num_tokens, generation_time, per_token_time, @@ -147,8 +150,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, tokenization_time=(tok_encode_time, tok_decode_time) ) iter_data_list.append(iter_data) - tm_list = bench_hook.get_time_list() - tm_infer_list = bench_hook.get_time_infer_list() utils.metrics_print.print_metrics( num, iter_data, @@ -239,7 +240,7 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, mem_consumption.clear_max_memory_consumption() for bs_idx in range(args['batch_size']): rslt_img_fn = utils.output_file.output_gen_image(res[bs_idx], args, image_id, num, bs_idx, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( iter_idx=num, @@ -339,7 +340,7 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im result_md5_list = [] if framework == 'ov': rslt_img_fn = utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png') - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( @@ -412,6 +413,15 @@ def num_iters_type(x): return x +def num_infer_count_type(x): + x = int(x) + if x < 1: + raise argparse.ArgumentTypeError('Minimum input value is 1') + elif x > MAX_OUTPUT_TOKEN_SIZE: + raise argparse.ArgumentTypeError(f'Max input value is {MAX_OUTPUT_TOKEN_SIZE}') + return x + + def get_argprser(): parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError) @@ -425,9 +435,8 @@ def get_argprser(): '-ic', '--infer_count', default=None, - type=int, - help='limit the output token size ' - f'(default {DEFAULT_OUTPUT_TOKEN_SIZE}) of text_gen and code_gen models.', + type=num_infer_count_type, + help='set the output token size, the value must be greater than 0.' 
) parser.add_argument( '-n', @@ -501,7 +510,7 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout) args = get_argprser() model_path, framework, model_args, model_name = utils.model_utils.analyze_args(args) diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index 01312b4ebe..221752bcfc 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -972,7 +972,7 @@ def ts_patched_forward( remote_code = False pt_model = None try: - config = AutoConfig.from_pretrained(args.model_id) + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) except Exception: config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) remote_code = True @@ -1215,14 +1215,13 @@ def convert_falcon(args): def convert_phi(args): trust_remote_code = False try: - config = AutoConfig.from_pretrained(args.model_id) + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) except Exception: config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) trust_remote_code = True cuda, post_init = patch_gptq(config) model_kwargs = {} - if trust_remote_code: - model_kwargs["trust_remote_code"] = trust_remote_code + model_kwargs["trust_remote_code"] = trust_remote_code precision = args.precision compression_only = ( args.compress_weights @@ -1238,7 +1237,7 @@ def convert_phi(args): if not compression_only: pt_model = AutoModelForCausalLM.from_pretrained( args.model_id, - config=AutoConfig.from_pretrained(args.model_id), + config=config, **model_kwargs, ) pt_model.config.use_cache = True diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index 0dc3476328..87224e5d85 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy -openvino>=2024.0.0 +openvino>=2024.1.0 auto-gptq>=0.5.1 # for gptq pillow torch diff --git a/llm_bench/python/requirements_2024.1.txt b/llm_bench/python/requirements_2024.1.txt deleted file mode 100644 index a0d4388870..0000000000 --- a/llm_bench/python/requirements_2024.1.txt +++ /dev/null @@ -1,111 +0,0 @@ -nncf @ git+https://github.com/openvinotoolkit/nncf.git@ec497ce0781fe867d73d5c5bdf8310fdb40604a4#egg=nncf -about-time==4.2.1 -accelerate==0.29.2 -aiohttp==3.9.4 -aiosignal==1.3.1 -alive-progress==3.1.5 -async-timeout==4.0.3 -attrs==23.2.0 -auto_gptq==0.7.1 -autograd==1.6.2 -bitsandbytes==0.43.1 -blobfile==2.1.1 -certifi==2019.11.28 -chardet==3.0.4 -charset-normalizer==3.3.2 -cma==3.2.2 -coloredlogs==15.0.1 -contourpy==1.2.1 -cycler==0.12.1 -datasets==2.18.0 -Deprecated==1.2.14 -diffusers==0.27.2 -dill==0.3.8 -einops==0.7.0 -filelock==3.13.4 -fonttools==4.51.0 -frozenlist==1.4.1 -fsspec==2024.2.0 -future==1.0.0 -gekko==1.1.1 -grapheme==0.6.0 -huggingface-hub==0.22.2 -humanfriendly==10.0 -idna==2.8 -importlib_metadata==7.1.0 -influxdb-client==1.41.0 -Jinja2==3.1.3 -joblib==1.4.0 -jsonschema==4.21.1 -jsonschema-specifications==2023.12.1 -jstyleson==0.0.2 -kiwisolver==1.4.5 -lxml==4.9.4 -Mako==1.1.0 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matplotlib==3.8.4 -mdurl==0.1.2 -mpmath==1.3.0 -multidict==6.0.5 -multiprocess==0.70.16 -natsort==8.4.0 -networkx==3.3 -ninja==1.11.1.1 -numpy==1.26.4 -onnx==1.16.0 -openvino-telemetry==2024.1.0 -optimum-intel @ 
git+https://github.com/huggingface/optimum-intel.git@ff792c278502a85444dd116413dbca71aa660599 -packaging==24.0 -pandas==2.2.2 -peft==0.10.0 -pillow==10.3.0 -pip==24.0 -protobuf==5.26.1 -psutil==5.9.8 -py-cpuinfo==9.0.0 -pyarrow==15.0.2 -pyarrow-hotfix==0.6 -pycryptodomex==3.20.0 -pydot==2.0.0 -Pygments==2.17.2 -pymoo==0.6.1.1 -pyparsing==3.1.2 -python-dateutil==2.9.0.post0 -python-git==2018.2.1 -pytz==2024.1 -PyYAML==6.0.1 -reactivex==4.0.4 -referencing==0.34.0 -regex==2023.12.25 -requests==2.31.0 -requests-unixsocket==0.2.0 -rich==13.7.1 -rouge==1.0.1 -rpds-py==0.18.0 -safetensors==0.4.3 -scikit-learn==1.4.2 -scipy==1.13.0 -Send2Trash==1.8.3 -sentencepiece==0.2.0 -setuptools==65.5.0 -six==1.14.0 -sympy==1.12 -tabulate==0.9.0 -threadpoolctl==3.4.0 -tiktoken==0.6.0 -timm==0.9.16 -tokenizers==0.15.2 -torch==2.2.2 -torchvision==0.17.2 -tqdm==4.66.2 -transformers==4.39.3 -transformers-stream-generator==0.0.5 -typing_extensions==4.11.0 -tzdata==2024.1 -urllib3==2.2.1 -wheel==0.41.2 -wrapt==1.16.0 -xxhash==3.4.1 -yarl==1.9.4 -zipp==3.18.1 diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index 7142abd3bd..b0d0d93aa1 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -33,7 +33,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5}, + "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, @@ -64,4 +64,8 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, + "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, + "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, + "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, } diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index ed62498fc6..a2416ccb92 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -143,7 +143,7 @@ def create_text_gen_model(model_path, device, **kwargs): else: remote_code = False try: - model_config = AutoConfig.from_pretrained(model_path) + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) except Exception: model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) remote_code = True diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index ff4132e08f..87cbbda618 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -27,18 +27,22 @@ add_executable(${TARGET_NAME} speculative_decoding_lm.cpp) target_include_directories(${TARGET_NAME} PRIVATE 
"${CMAKE_CURRENT_SOURCE_DIR}") target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) +set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) set(TARGET_NAME prompt_lookup_decoding_lm) add_executable(${TARGET_NAME} prompt_lookup_decoding_lm.cpp) target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") target_include_directories(${TARGET_NAME} PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17) -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD_REQUIRED ON) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) +set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) +find_package(TBB REQUIRED COMPONENTS tbb) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) set(TARGET_NAME generate_sample) add_executable(${TARGET_NAME} generate_pipeline/generate_sample.cpp) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 55bc919ce0..08b91ab70e 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -1,6 +1,6 @@ # Text generation C++ samples that support most popular models like LLaMA 2 -These examples showcase inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `convert_tokenizer` to generate IRs for the samples. [group_beam_searcher.hpp](group_beam_searcher.hpp) implements the algorithm of the same name, which is used by `beam_search_causal_lm`. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These examples showcase inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. [group_beam_searcher.hpp](group_beam_searcher.hpp) implements the algorithm of the same name, which is used by `beam_search_causal_lm`. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of LLM-powered Chatbot in Python. 
## How it works @@ -53,7 +53,16 @@ This approach reduces the need for multiple infer requests to the main model, en ## Install OpenVINO -Install [OpenVINO Archives >= 2024.0](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. +Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. + +## Install `libtbb-dev` on Linux + +> [!NOTE] +> `tbb` development files are installed with OpenVINO Archive on Windows and macOS. + +```sh +sudo apt-get install libtbb-dev +``` ## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` @@ -81,18 +90,20 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upg ```sh source /setupvars.sh -python3 -m pip install --upgrade-strategy eager "transformers<4.38" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu -python3 ../../../llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir ./TinyLlama-1.1B-Chat-v1.0/ --precision FP16 -convert_tokenizer ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code +python3 -m pip install --upgrade-strategy eager -r requirements.txt +# Update openvino_tokenizers from the submodule +python3 -m pip install ./../../../thirdparty/openvino_tokenizers/[transformers] +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` #### Windows ```bat \setupvars.bat -python -m pip install --upgrade-strategy eager "transformers<4.38" -r ..\..\..\llm_bench\python\requirements.txt ..\..\..\thirdparty\openvino_tokenizers\[transformers] --extra-index-url https://download.pytorch.org/whl/cpu -python ..\..\..\llm_bench\python\convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir .\TinyLlama-1.1B-Chat-v1.0\ --precision FP16 -convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --with-detokenizer --trust-remote-code +python -m pip install --upgrade-strategy eager -r requirements.txt +REM Update openvino_tokenizers from the submodule +python -m pip install .\..\..\..\thirdparty\openvino_tokenizers\[transformers] +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run @@ -106,19 +117,21 @@ convert_tokenizer .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ --output .\TinyL ### Examples: #### Linux/MacOS: -1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ ./Llama-2-7b-chat-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -4. 
`./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Why is the Sun yellow?"` +1. `./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` +2. `./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` +3. `./build/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ ./Llama-2-7b-chat-hf/ "Why is the Sun yellow?"` +4. `./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` #### Windows: -1. `.\build\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -2. `.\build\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -3. `.\build\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ .\Llama-2-7b-chat-hf\pytorch\dldt\FP16\ "Why is the Sun yellow?"` -4. `.\build\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\pytorch\dldt\FP16\ "Why is the Sun yellow?"` +1. `.\build\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` +2. `.\build\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` +3. `.\build\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ .\Llama-2-7b-chat-hf\ "Why is the Sun yellow?"` +4. `.\build\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + ## Supported models 1. 
chatglm diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp index f4a50e94bb..5060b88642 100644 --- a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include #include namespace { @@ -94,10 +95,11 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state : request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); - } + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); } class PromptLookupCandidateGenerator { diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt new file mode 100644 index 0000000000..e1c10930ad --- /dev/null +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum[openvino]==1.19.2 +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +einops==0.8.0 # For Qwen +transformers_stream_generator==0.0.5 # For Qwen diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp index 92523f82a5..b0c40a7a9f 100644 --- a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp +++ b/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp @@ -1,18 +1,17 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include #include +#include +#include #include constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually SEQ_LEN_AXIS = 2 constexpr size_t SEQ_LEN_AXIS = 2; -int64_t SPECIAL_EOS_TOKEN; - namespace { std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -43,7 +42,7 @@ struct TextStreamer { std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; token_cache.clear(); print_len = 0; - return; + return; } if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { // Don't print incomplete text @@ -60,22 +59,24 @@ struct TextStreamer { print_len = 0; } }; -} ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { // Copy elements from the old to a new tensor and return it. // It's assumed that key/values tensor has a shape [BATCH_SIZE, num_kv_heads, seq_len, head_size] or [seq_len, ...], // It that's not the case for your model please implement your own trim method. 
- OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, "Cannot trim key/values with sequence length axis = ", seq_len_axis); - + OPENVINO_ASSERT(seq_len_axis == 2 || seq_len_axis == 0, + "Cannot trim key/values with sequence length axis = ", + seq_len_axis); + auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); + size_t batch_size = shape[0]; size_t num_kv_heads = shape[1]; size_t old_seq_len = shape[2]; size_t head_size = shape[3]; - + OPENVINO_ASSERT(new_seq_len <= old_seq_len); - + // if new_seq_len equal to old one no need to copy tensor, return as is if (old_seq_len == new_seq_len) return tensor; @@ -83,31 +84,133 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (seq_len_axis == 0) { shape[0] = new_seq_len; tensor.set_shape(shape); + return tensor; } - // if seq_len_axis == 2, then data is not contiguous, in order to trim need to repack tensor - auto new_tensor = ov::Tensor{ov::element::f32, {BATCH_SIZE, num_kv_heads, new_seq_len, head_size}}; - auto new_tensor_data = new_tensor.data(); - for (size_t batch = 0; batch < BATCH_SIZE; ++batch){ - for (size_t i = 0; i < num_kv_heads; ++i) { - for (size_t j = 0; j < new_seq_len; ++j) { - auto dst_ptr = new_tensor_data + num_kv_heads * new_seq_len * head_size * batch + new_seq_len * head_size * i + head_size * j; - auto src_ptr = old_tensor_data + num_kv_heads * new_seq_len * head_size * batch + old_seq_len * head_size * i + head_size * j; - std::memcpy(dst_ptr, src_ptr, head_size * sizeof(float)); - } - } - } + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); + return new_tensor; } void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { // trim kv_cache values up to the new_seq_len - for (auto& state: request.query_state()) { - ov::Tensor old_tensor = state.get_state(); - state.set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); +} + +class AssistedCandidateGenerator { +private: + ov::InferRequest draft_model; + size_t max_seq_length; + size_t num_pred_tokens = 5; + const size_t max_pred_tokens = 10; + int64_t out_of_kv_cache_token = -1; + size_t draft_model_seq_length = 0; + +public: + AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) + : draft_model{draft_model}, + max_seq_length{max_seq_length}, + num_pred_tokens{num_pred_tokens} {}; + + int64_t generate_next_token(const std::vector tokens) { + size_t tokens_size = tokens.size(); + auto input_ids = draft_model.get_tensor("input_ids"); + input_ids.set_shape({BATCH_SIZE, tokens_size}); + std::copy_n(tokens.begin(), tokens_size, input_ids.data()); + + auto attention_mask = draft_model.get_tensor("attention_mask"); + attention_mask.set_shape({BATCH_SIZE, draft_model_seq_length + tokens_size}); + std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + + auto position_ids = draft_model.get_tensor("position_ids"); + position_ids.set_shape({BATCH_SIZE, tokens_size}); + std::iota(position_ids.data(), + position_ids.data() + position_ids.get_size(), + draft_model_seq_length); + + draft_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + 
draft_model.get_tensor("beam_idx").data()[0] = 0; + + draft_model.infer(); + + auto logits = draft_model.get_tensor("logits"); + size_t vocab_size = logits.get_shape().back(); + auto sequence_logits = logits.data() + (tokens_size - 1) * vocab_size; + + draft_model_seq_length += tokens_size; + + return std::max_element(sequence_logits, sequence_logits + vocab_size) - sequence_logits; + } + + std::vector generate_candidates(int64_t out_token) { + std::vector candidates; + + // limit candidates size by num_pred_tokens or by max_seq_length + size_t candidates_to_generate = std::min(num_pred_tokens, max_seq_length - draft_model_seq_length - 1); + + candidates.reserve(candidates_to_generate); + + // generate cadidates + for (size_t i = 0; i < candidates_to_generate; i++) { + // if out_of_kv_cache_token is present, prepend it to out_token in order to collect kv cache for it + if (out_of_kv_cache_token != -1) { + out_token = generate_next_token(std::vector{out_of_kv_cache_token, out_token}); + out_of_kv_cache_token = -1; + } else { + out_token = generate_next_token(std::vector{out_token}); + } + + candidates.push_back(out_token); + } + + out_of_kv_cache_token = candidates.back(); + return candidates; + } + + void update_candidate_strategy(const size_t num_matches) { + // dynamically adjust number of generated candidates based on number of matches + // we want to balance the benefits of getting candidates tokens correct with the + // cost of forecasting incorrect candidates tokens. + if (num_matches == num_pred_tokens) { + num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens); + } else { + num_pred_tokens = std::max(int64_t(num_pred_tokens) - 1, int64_t(1)); + } } + + void update_kv_cache(const size_t seq_length) { + // this is the case when main model accepted all candidates from draft model + // we need to collect kv cache for out_of_kv_cache_token by infering it + // on next candidates generation cycle out_of_kv_cache_token will be prefixed + // to main models's latest out token + if (draft_model_seq_length < seq_length) { + return; + } + + out_of_kv_cache_token = -1; + ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); + draft_model_seq_length = seq_length; + } +}; + +int64_t get_eos_token(const std::shared_ptr tokenizer) { + auto rt_info = tokenizer->get_rt_info(); // Get the runtime info for the model + + auto it = rt_info.find("eos_token_id"); + if (it == rt_info.end()) { + throw std::runtime_error("EOS token ID not found in model's runtime information."); + } + return it->second.as(); } +} // namespace + int main(int argc, char* argv[]) try { if (argc != 4) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + "
''"); @@ -118,150 +221,130 @@ int main(int argc, char* argv[]) try { core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - auto [draft_input_ids, draft_attention_mask] = tokenize(tokenizer, argv[3]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); + auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); + ov::InferRequest detokenizer = + core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; - // draft model - ov::InferRequest draft_model = core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); + // draft model (which is smaller, less accurate but faster) + ov::InferRequest draft_model = + core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); + + uint64_t seq_len = input_ids.get_shape()[1]; - draft_model.set_tensor("input_ids", draft_input_ids); - draft_model.set_tensor("attention_mask", draft_attention_mask); - - ov::Tensor draft_position_ids = draft_model.get_tensor("position_ids"); - draft_position_ids.set_shape(draft_input_ids.get_shape()); - std::iota(draft_position_ids.data(), draft_position_ids.data() + draft_position_ids.get_size(), 0); - uint64_t seq_len = draft_input_ids.get_shape()[1]; + // main model (which is bigger, more accurate but slower) + ov::InferRequest main_model = + core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - // main model - ov::InferRequest main_model = core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); + size_t max_sequence_length = 100; - // Input tensors for the main model should not be mixed with draft. 
- // Do not feed the same draft_postion_ids to the main, but copy input_ids from the draft_input_ids - auto input_ids = main_model.get_tensor("input_ids"); - input_ids.set_shape(draft_input_ids.get_shape()); - draft_input_ids.copy_to(input_ids); + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; - auto attention_mask = main_model.get_tensor("attention_mask"); - attention_mask.set_shape(draft_input_ids.get_shape()); - std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); + main_model.set_tensor("input_ids", input_ids); + main_model.set_tensor("attention_mask", attention_mask); auto position_ids = main_model.get_tensor("position_ids"); - position_ids.set_shape(draft_input_ids.get_shape()); + position_ids.set_shape(input_ids.get_shape()); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 - draft_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - draft_model.get_tensor("beam_idx").data()[0] = 0; main_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); main_model.get_tensor("beam_idx").data()[0] = 0; // To coollect kv-cache for the and to get the next token run the very first infer request - draft_model.infer(); + candidateGenerator.generate_next_token( + std::vector(input_ids.data(), input_ids.data() + input_ids.get_size())); + main_model.infer(); size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); - OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), "vocab size should be the same for the both models"); - + OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), + "vocab size should be the same for the both models"); + // logits shape is [BATCH_SIZE, seq_len, vocab_size] auto logits = main_model.get_tensor("logits"); auto data_logits = logits.data() + (seq_len - 1) * vocab_size; int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; - - // the first token which is fed to both draft and main netwoks on each iteration - auto first_token = out_token; - text_streamer.put(out_token); - - // run K infer requests on draft model and get next K prediction tokens on each iteration - uint64_t K = 5; - std::vector draft_tokens; - // The draft model predicts tokens one by one in an auto-regressive manner, draft_input_ids length should be 1. - draft_input_ids.set_shape({BATCH_SIZE, 1}); - draft_position_ids.set_shape({BATCH_SIZE, 1}); - - auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model - - if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } + text_streamer.put(out_token); -/* Speculative decoding works the following way. The draft model predicts the next K - tokens one by one in an autoregressive manner, while the main model validates these - predictions and corrects them if necessary. We go through each predicted token, and - if a difference is detected between the draft and main model, we stop and keep the - last token predicted by the main model. Then the draft model gets the latest main - prediction and again tries to predict the next K tokens, repeating the cycle. - - This approach reduces the need for multiple infer requests to the main model, - enhancing performance. 
For instance, in more predictable parts of text generation, - the draft model can, in best-case scenarios, generate the next K tokens that exactly - match the target. In tha caste the are validated in a single inference request to - the main model (which is bigger, more accurate but slower) instead of running K - subsequent requests. - */ - int max_sequence_length = 100; - while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { - // infer the K next tokens with draft model - for (int i = 0; i < K; ++i) { - draft_input_ids.data()[0] = out_token; - draft_attention_mask.set_shape({BATCH_SIZE, seq_len + i + 1}); - std::fill_n(draft_attention_mask.data(), draft_attention_mask.get_size(), 1); - draft_position_ids.data()[0] = int64_t(draft_attention_mask.get_size() - 1); - - draft_model.infer(); - - auto draft_logits = draft_model.get_tensor("logits").data(); - int64_t arg_max_token = std::max_element(draft_logits, draft_logits + vocab_size) - draft_logits; - out_token = arg_max_token; - draft_tokens.emplace_back(arg_max_token); + const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + + /* Speculative decoding works the following way. The draft model predicts the next K + tokens one by one in an autoregressive manner, while the main model validates these + predictions and corrects them if necessary. We go through each predicted token, and + if a difference is detected between the draft and main model, we stop and keep the + last token predicted by the main model. Then the draft model gets the latest main + prediction and again tries to predict the next K tokens, repeating the cycle. + + This approach reduces the need for multiple infer requests to the main model, + enhancing performance. For instance, in more predictable parts of text generation, + the draft model can, in best-case scenarios, generate the next K tokens that exactly + match the target. In that case they are validated in a single inference call to + the main model instead of running K subsequent requests. + */ + + while (out_token != EOS_TOKEN && seq_len < max_sequence_length) { + // generate candidates from the draft model + std::vector candidates = candidateGenerator.generate_candidates(out_token); + size_t candidates_size = candidates.size(); + + // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request. + input_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + + input_ids.data()[0] = out_token; + if (candidates_size > 0) { + std::copy_n(candidates.begin(), candidates_size, input_ids.data() + 1); } - // For the main network, K tokens will be fed at once in a single infer request. - input_ids.set_shape({BATCH_SIZE, K}); - // Set the first token for the main model to be the same as for the draft model. - input_ids.data()[0] = first_token; - for (int i = 0; i < K - 1; i++) - input_ids.data()[i + 1] = draft_tokens[i]; - - attention_mask.set_shape({BATCH_SIZE, seq_len + K}); + attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1}); std::fill_n(attention_mask.data(), attention_mask.get_size(), 1); - position_ids.set_shape({BATCH_SIZE, K}); + position_ids.set_shape({BATCH_SIZE, candidates_size + 1}); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), seq_len); main_model.infer(); data_logits = logits.data(); // [BATCH_SIZE, K, vocab_size] - size_t disagree_idx = K - 1; - // Iterate through the predicted tokens from the main model and compare them with draft predictions. 
- // In the worst-case scenario (disagreement at the beginning), iter will increase by 1. - // In the best-case scenario, all elements match, and K predicted tokens will be taken. - for (size_t i = 0; i < K; i++) { + + // match model tokens with candidate tokens + // 1. accept current out token (if not eos) + // 2. check if it matches apropriate candidate + // 2.1 if it's match, continue - accept next token + // 2.2 it it's mismatch, stop iteration but still accept current token as it was last token generated by + // model from a valid sequence. + size_t accepted_tokens_number = 0; + for (size_t i = 0; i < candidates_size + 1; i++) { auto start = data_logits + vocab_size * i; auto stop = data_logits + vocab_size * (i + 1); out_token = std::max_element(start, stop) - start; + + if (out_token == EOS_TOKEN) { + break; + } + text_streamer.put(out_token); + accepted_tokens_number++; - disagree_idx = i; - if (out_token != draft_tokens[i] || out_token == SPECIAL_EOS_TOKEN || seq_len + disagree_idx + 1 >= max_sequence_length) + if (i == candidates_size || out_token != candidates[i]) { break; + } } // After the inference request, key/values have shape [BATCH_SIZE, seq_len + K, vocab_size]. // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. - seq_len += disagree_idx + 1; - update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_len); + seq_len += accepted_tokens_number; + + if (accepted_tokens_number > 0) { + candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1); + } + + candidateGenerator.update_kv_cache(seq_len); update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); - - draft_tokens.clear(); - first_token = out_token; + + candidates.clear(); } text_streamer.end(); // Model is stateful which means that context (kv-cache) which belongs to a particular