forked from openvinotoolkit/openvino.genai

Commit 41f1e7b (parent: b11f0d9): LoRA in Text2ImagePipeline (openvinotoolkit#911)
Co-authored-by: Ilya Lavrenov <[email protected]>
28 changed files, with 318 additions and 111 deletions.
# Text to Image C++ Generation Pipeline

Examples in this folder showcase inference of text-to-image models like Stable Diffusion 1.5, 2.1, and LCM. The application intentionally has few configuration options, to encourage the reader to explore and modify the source code, for example by changing the inference device to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as its input source.

There are two sample files:
- [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline
- [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline

Users can change the sample code and play with the following generation parameters:

- Change the width or height of the generated image
- Generate multiple images per prompt
- Adjust the number of inference steps
- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598))
- (SD 1.x, 2.x only) Add a negative prompt when the guidance scale is > 1
- Apply multiple different LoRA adapters and mix them with different blending coefficients

## Download and convert the models and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16
```

## Run

`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'`

### Examples

Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting`

![](./512x512.bmp)
## Supported models

Models can be downloaded from [HuggingFace](https://huggingface.co/models). This sample can run the following list of models, but is not limited to them:

- [botp/stable-diffusion-v1-5](https://huggingface.co/botp/stable-diffusion-v1-5)
- [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2)
- [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1)
- [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0)
- [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)

## Run with optional LoRA adapters

LoRA adapters can be connected to the pipeline to modify generated images toward a certain style, level of detail, or quality. Adapters are supported in the Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models), or trained by the user. Only adapters compatible with the base model should be used. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters on the command line. Check the `lora.cpp` source code to learn how to enable adapters and specify them in each `generate` call.

Here is an example of how to run the sample with a single adapter. First, download the adapter file from the https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`, or download it from the command line:

`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591`

Then run the `lora_stable_diffusion` executable:

`./lora_stable_diffusion dreamlike_anime_1_0_ov/FP16 'curly-haired unicorn in the forest, anime, line' soulcard.safetensors 0.7`

The sample generates two images from the same prompt:
- `lora.bmp` with adapters applied
- `baseline.bmp` without adapters applied

Check the difference:

With adapter | Without adapter
:---:|:---:
![](./lora.bmp) | ![](./baseline.bmp)

## Note

- An image generated with HuggingFace / Optimum Intel is not the same as the one generated by this C++ sample:

  C++ random generation with MT19937 differs from `numpy.random.randn()` and `diffusers.utils.randn_tensor`, so it is expected that the Python and C++ versions produce different images, because the latent tensors are initialized differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to the `Text2ImagePipeline::generate` method.
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <cstdlib>
#include <iostream>

#include "openvino/genai/text2image/pipeline.hpp"

#include "imwrite.hpp"

int main(int argc, char* argv[]) try {
    OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' [<LORA_SAFETENSORS> <ALPHA> ...]");

    const std::string models_path = argv[1], prompt = argv[2];
    const std::string device = "CPU";  // GPU, NPU can be used as well

    ov::genai::AdapterConfig adapter_config;
    // Multiple LoRA adapters applied simultaneously are supported; parse them
    // all, with their corresponding alphas, from the command-line parameters:
    for (int i = 0; i < (argc - 3) / 2; ++i) {
        ov::genai::Adapter adapter(argv[3 + 2 * i]);
        float alpha = std::atof(argv[3 + 2 * i + 1]);
        adapter_config.add(adapter, alpha);
    }

    // LoRA adapters passed to the constructor are activated by default in subsequent generate() calls
    ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config));

    std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n";
    ov::Tensor image = pipe.generate(prompt,
        ov::genai::random_generator(std::make_shared<ov::genai::CppStdGenerator>(42)),
        ov::genai::width(512),
        ov::genai::height(896),
        ov::genai::num_inference_steps(20));
    imwrite("lora.bmp", image, true);

    std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n";
    image = pipe.generate(prompt,
        ov::genai::adapters(),  // passing adapters in generate() overrides adapters set in the constructor; adapters() means no adapters
        ov::genai::random_generator(std::make_shared<ov::genai::CppStdGenerator>(42)),
        ov::genai::width(512),
        ov::genai::height(896),
        ov::genai::num_inference_steps(20));
    imwrite("baseline.bmp", image, true);

    return EXIT_SUCCESS;
} catch (const std::exception& error) {
    try {
        std::cerr << error.what() << '\n';
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
} catch (...) {
    try {
        std::cerr << "Non-exception object thrown\n";
    } catch (const std::ios_base::failure&) {}
    return EXIT_FAILURE;
}