[Image generation] Non-public API changes for image generation (openvinotoolkit#1146)

Now the LCM, SD and SDXL pipelines for image-to-image generation work correctly. The public API will be added later.
sungeunk · Nov 6, 2024 · a99dc93 · a99dc93
1 parent c36fe02
commit a99dc93
Show file tree

Hide file tree

Showing 14 changed files with 285 additions and 94 deletions.
diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py
@@ -36,7 +36,7 @@ def read_image(path: str) -> Tensor:
 
     '''
     pic = Image.open(path).convert("RGB")
-    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte)
+    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
     return Tensor(image_data)
 
 

diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
@@ -13,6 +13,7 @@
 #include "openvino/runtime/properties.hpp"
 
 #include "openvino/genai/visibility.hpp"
+#include "openvino/genai/image_generation/generation_config.hpp"
 
 namespace ov {
 namespace genai {
@@ -74,7 +75,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
 
     ov::Tensor decode(ov::Tensor latent);
 
-    ov::Tensor encode(ov::Tensor image);
+    ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator);
 
     const Config& get_config() const;
 

diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp
@@ -5,6 +5,7 @@
 
 #include <string>
 #include <iostream>
+#include <fstream>
 
 #include <openvino/runtime/tensor.hpp>
 
@@ -31,3 +32,42 @@ inline void print_tensor(std::string name, ov::Tensor tensor) {
         print_array(tensor.data<ov::float16>(), tensor.get_size());
     }
 }
+
+template <typename tensor_T, typename file_T>
+void _read_tensor_step(tensor_T* data, size_t i, std::ifstream& file, size_t& printed_elements, bool assign) {
+    const size_t print_size = 10;
+
+    file_T value;
+    file >> value;
+
+    // this mode is used to fall back to reference data so that further execution can be checked
+    if (assign)
+        data[i] = value;
+
+    if (std::abs(value - data[i]) > 1e-7 && printed_elements < print_size) {
+        std::cout << i << ") ref = " << value << " act = " << static_cast<file_T>(data[i]) << std::endl;
+        ++printed_elements;
+    }
+}
+
+inline void read_tensor(const std::string& file_name, ov::Tensor tensor, bool assign = false) {
+    std::ifstream file(file_name.c_str());
+    OPENVINO_ASSERT(file.is_open(), "Failed to open file ", file_name);
+
+    std::cout << "Opening " << file_name << std::endl;
+    std::cout << "tensor shape " << tensor.get_shape() << std::endl;
+
+    for (size_t i = 0, printed_elements = 0; i < tensor.get_size(); ++i) {
+        if (tensor.get_element_type() == ov::element::f32)
+            _read_tensor_step<float, float>(tensor.data<float>(), i, file, printed_elements, assign);
+        else if (tensor.get_element_type() == ov::element::f64)
+            _read_tensor_step<double, double>(tensor.data<double>(), i, file, printed_elements, assign);
+        else if (tensor.get_element_type() == ov::element::u8)
+            _read_tensor_step<uint8_t, float>(tensor.data<uint8_t>(), i, file, printed_elements, assign);
+        else {
+            OPENVINO_THROW("Unsupported tensor type ", tensor.get_element_type(), " by read_tensor");
+        }
+    }
+
+    std::cout << "Closing " << file_name << std::endl;
+}
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -22,6 +22,48 @@
 namespace ov {
 namespace genai {
 
+class DiagonalGaussianDistribution {
+public:
+    explicit DiagonalGaussianDistribution(ov::Tensor parameters)
+        : m_parameters(parameters) {
+        ov::Shape shape = parameters.get_shape();
+        OPENVINO_ASSERT(shape[0] == 1, "Batch size must be 1");
+        shape[1] /= 2;
+
+        m_mean = ov::Tensor(parameters.get_element_type(), shape, parameters.data());
+        m_std = ov::Tensor(m_mean.get_element_type(), shape);
+        ov::Tensor logvar(parameters.get_element_type(), shape, m_mean.data<float>() + m_mean.get_size());
+
+        float * logvar_data = logvar.data<float>();
+        float * std_data = m_std.data<float>();
+
+        for (size_t i = 0; i < logvar.get_size(); ++i) {
+            logvar_data[i] = std::min(std::max(logvar_data[i], -30.0f), 20.0f);
+            std_data[i] = std::exp(0.5 * logvar_data[i]);
+        }
+    }
+
+    ov::Tensor sample(std::shared_ptr<Generator> generator) const {
+        OPENVINO_ASSERT(generator, "Generator must not be nullptr");
+
+        ov::Tensor rand_tensor = generator->randn_tensor(m_mean.get_shape());
+
+        float * rand_tensor_data = rand_tensor.data<float>();
+        const float * mean_data = m_mean.data<float>();
+        const float * std_data = m_std.data<float>();
+
+        for (size_t i = 0; i < rand_tensor.get_size(); ++i) {
+            rand_tensor_data[i] = mean_data[i] + std_data[i] * rand_tensor_data[i];
+        }
+
+        return rand_tensor;
+    }
+
+private:
+    ov::Tensor m_parameters;
+    ov::Tensor m_mean, m_std;
+};
+
 size_t get_vae_scale_factor(const std::filesystem::path& vae_config_path) {
     std::ifstream file(vae_config_path);
     OPENVINO_ASSERT(file.is_open(), "Failed to open ", vae_config_path);
@@ -141,12 +183,34 @@ ov::Tensor AutoencoderKL::decode(ov::Tensor latent) {
     return m_decoder_request.get_output_tensor();
 }
 
-ov::Tensor AutoencoderKL::encode(ov::Tensor image) {
+ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr<Generator> generator) {
     OPENVINO_ASSERT(m_encoder_request, "VAE encoder model must be compiled first. Cannot infer non-compiled model");
 
     m_encoder_request.set_input_tensor(image);
     m_encoder_request.infer();
-    return m_encoder_request.get_output_tensor();
+
+    ov::Tensor output = m_encoder_request.get_output_tensor(), latent;
+
+    ov::CompiledModel compiled_model = m_encoder_request.get_compiled_model();
+    auto outputs = compiled_model.outputs();
+    OPENVINO_ASSERT(outputs.size() == 1, "AutoencoderKL encoder model is expected to have a single output");
+
+    const std::string output_name = outputs[0].get_any_name();
+    if (output_name == "latent_sample") {
+        latent = output;
+    } else if (output_name == "latent_parameters") {
+        latent = DiagonalGaussianDistribution(output).sample(generator);
+    } else {
+        OPENVINO_THROW("Unexpected output name for AutoencoderKL encoder '", output_name, "'");
+    }
+
+    // apply shift and scaling factor
+    float * latent_data = latent.data<float>();
+    for (size_t i = 0; i < latent.get_size(); ++i) {
+        latent_data[i] = (latent_data[i] - m_config.shift_factor) * m_config.scaling_factor;
+    }
+
+    return latent;
 }
 
 const AutoencoderKL::Config& AutoencoderKL::get_config() const {
@@ -171,25 +235,21 @@ void AutoencoderKL::merge_vae_image_pre_processing() const {
     ppp.input().preprocess()
         .convert_layout()
         .convert_element_type(ov::element::f32)
-        .scale(255.0f / 2.0f)
+        // this is less accurate than in VaeImageProcessor::normalize
+        .scale(255.0 / 2.0)
         .mean(1.0f);
 
-    // apply m_config.scaling_factor as last step
-    ppp.output().postprocess().custom([scaling_factor = m_config.scaling_factor](const ov::Output<ov::Node>& port) {
-        auto c_scaling_factor = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, scaling_factor);
-        return std::make_shared<ov::op::v1::Multiply>(port, c_scaling_factor);
-    });
-
     ppp.build();
 }
 
 void AutoencoderKL::merge_vae_image_post_processing() const {
     ov::preprocess::PrePostProcessor ppp(m_decoder_model);
 
     // scale and shift input before VAE decoder
-    ppp.input().preprocess()
-        .scale(m_config.scaling_factor)
-        .mean(-m_config.shift_factor);
+    if (m_config.scaling_factor != 1.0f)
+        ppp.input().preprocess().scale(m_config.scaling_factor);
+    if (m_config.shift_factor != 0.0f)
+        ppp.input().preprocess().mean(-m_config.shift_factor);
 
     // apply VaeImageProcessor normalization steps
     // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159

diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -29,7 +29,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
 SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
     : m_config(root_dir / "config.json") {
     m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
-    m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
+    m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
 }
 
 SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir,

diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp
@@ -114,14 +114,22 @@ void DDIMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
         default:
             OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
     }
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+    }
 }
 
 std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
     // noise_pred - model_output
     // latents - sample
     // inference_step
 
-    size_t timestep = get_timesteps()[inference_step];
+    size_t timestep = m_timesteps[inference_step];
 
     // get previous step value (=t-1)
     int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps;
@@ -205,7 +213,7 @@ void DDIMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator>
     int64_t latent_timestep = m_timesteps.front();
 
     float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]);
-    float sqrt_one_minus_alpha_prod = std::sqrt(1.0f - m_alphas_cumprod[latent_timestep]);
+    float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]);
 
     ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());
 

diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp
@@ -102,13 +102,14 @@ EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) :
     m_sigmas.push_back(0);
 
     m_step_index = -1;
+    m_begin_index = -1;
 }
 
 void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) {
     // TODO: support `timesteps` and `sigmas` inputs
     m_timesteps.clear();
     m_sigmas.clear();
-    m_step_index = -1;
+    m_step_index = m_begin_index = -1;
 
     m_num_inference_steps = num_inference_steps;
     std::vector<float> sigmas;
@@ -192,17 +193,29 @@ void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float str
         OPENVINO_THROW("Unsupported value for 'final_sigmas_type'");
     }
     m_sigmas.push_back(sigma_last);
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        // keep original timesteps
+        m_schedule_timesteps = m_timesteps;
+        // while m_timesteps keeps only the ones left after applying the 'strength' parameter
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+        m_begin_index = t_start;
+    }
 }
 
 std::map<std::string, ov::Tensor> EulerDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
     // noise_pred - model_output
     // latents - sample
     // inference_step
 
-    size_t timestep = get_timesteps()[inference_step];
+    size_t timestep = m_timesteps[inference_step];
 
     if (m_step_index == -1)
-        m_step_index = 0;
+        m_step_index = m_begin_index;
 
     float sigma = m_sigmas[m_step_index];
     // TODO: hardcoded gamma
@@ -273,7 +286,7 @@ float EulerDiscreteScheduler::get_init_noise_sigma() const {
 
 void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
     if (m_step_index == -1)
-        m_step_index = 0;
+        m_step_index = m_begin_index;
 
     float sigma = m_sigmas[m_step_index];
     float* sample_data = sample.data<float>();
@@ -282,9 +295,28 @@ void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inferen
     }
 }
 
+size_t EulerDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
+    for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
+        if (timestep == m_schedule_timesteps[i]) {
+            return i;
+        }
+    }
+
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
 void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator> generator) const {
-    // use https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_euler_discrete.py#L686
-    OPENVINO_THROW("Not implemented");
+    const int64_t latent_timestep = m_timesteps.front();
+    const float sigma = m_sigmas[_index_for_timestep(latent_timestep)];
+
+    ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());
+
+    float * init_latent_data = init_latent.data<float>();
+    const float * rand_tensor_data = rand_tensor.data<float>();
+
+    for (size_t i = 0; i < init_latent.get_size(); ++i) {
+        init_latent_data[i] = init_latent_data[i] + sigma * rand_tensor_data[i];
+    }
 }
 
 }  // namespace genai

diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp
@@ -55,10 +55,12 @@ class EulerDiscreteScheduler : public IScheduler {
     Config m_config;
 
     std::vector<float> m_alphas_cumprod, m_sigmas;
-    std::vector<int64_t> m_timesteps;
+    std::vector<int64_t> m_timesteps, m_schedule_timesteps;
     size_t m_num_inference_steps;
 
-    size_t m_step_index;
+    int m_step_index, m_begin_index;
+
+    size_t _index_for_timestep(int64_t timestep) const;
 };
 
 } // namespace genai

diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -261,6 +261,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
+        if (!initial_image) {
+            // in case of typical text to image generation, we need to ignore 'strength'
+            generation_config.strength = 1.0f;
+        }
+
         const auto& transformer_config = m_transformer->get_config();
         const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale)
                                                  ? 2
@@ -558,7 +563,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         // 6. Denoising loop
         ov::Tensor noisy_residual_tensor(ov::element::f32, {});
 
-        for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) {
+        for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
                 batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
@@ -650,16 +655,14 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt,
                         "Negative prompt 3 is not used when guidance scale < 1.0");
 
-        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-            if (initial_image) {
-                ov::Shape initial_image_shape = initial_image.get_shape();
-                size_t height = initial_image_shape[1], width = initial_image_shape[2];
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) {
+            ov::Shape initial_image_shape = initial_image.get_shape();
+            size_t height = initial_image_shape[1], width = initial_image_shape[2];
 
-                OPENVINO_ASSERT(generation_config.height == height,
-                    "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
-                OPENVINO_ASSERT(generation_config.width == width,
-                    "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-            }
+            OPENVINO_ASSERT(generation_config.height == height,
+                "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
+            OPENVINO_ASSERT(generation_config.width == width,
+                "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
 
             OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
                 "'Strength' generation parameter must be withion [0, 1] range");