Skip to content

Commit

Permalink
[Image generation] Non public API changes for image generation (openv…
Browse files Browse the repository at this point in the history
…inotoolkit#1146)

Now, LCM, SD and SDXL pipelines for image to image generation work
correctly.

The public API will be added later.
  • Loading branch information
ilya-lavrenov authored Nov 6, 2024
1 parent c36fe02 commit a99dc93
Show file tree
Hide file tree
Showing 14 changed files with 285 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def read_image(path: str) -> Tensor:
'''
pic = Image.open(path).convert("RGB")
image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte)
image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
return Tensor(image_data)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "openvino/runtime/properties.hpp"

#include "openvino/genai/visibility.hpp"
#include "openvino/genai/image_generation/generation_config.hpp"

namespace ov {
namespace genai {
Expand Down Expand Up @@ -74,7 +75,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {

ov::Tensor decode(ov::Tensor latent);

ov::Tensor encode(ov::Tensor image);
ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator);

const Config& get_config() const;

Expand Down
40 changes: 40 additions & 0 deletions src/cpp/src/debug_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <string>
#include <iostream>
#include <fstream>

#include <openvino/runtime/tensor.hpp>

Expand All @@ -31,3 +32,42 @@ inline void print_tensor(std::string name, ov::Tensor tensor) {
print_array(tensor.data<ov::float16>(), tensor.get_size());
}
}

/**
 * @brief Reads the next reference value from `file` and compares it with element `i` of `data`.
 *
 * Debug helper: every call consumes exactly one whitespace-separated value from the stream.
 * A mismatch larger than 1e-7 is printed to std::cout, but only while fewer than 10
 * mismatches have been reported through `printed_elements`.
 *
 * @tparam tensor_T element type stored in `data`
 * @tparam file_T   type used to parse the value from the text stream
 * @param data             pointer to the elements being checked; element `i` is overwritten when `assign` is true
 * @param i                index of the element to check
 * @param file             stream the reference value is extracted from
 * @param printed_elements in/out counter of mismatches printed so far
 * @param assign           when true, the reference value is stored into `data[i]` before the comparison
 * @throws std::ios_base::failure if a value cannot be extracted from `file` (end of file or malformed text)
 */
template <typename tensor_T, typename file_T>
void _read_tensor_step(tensor_T* data, size_t i, std::ifstream& file, size_t& printed_elements, bool assign) {
    const size_t print_size = 10;

    // value-initialize and verify the extraction: a failed read must not be
    // compared (or assigned) as if it were valid reference data
    file_T value{};
    if (!(file >> value)) {
        throw std::ios_base::failure("Failed to read reference value for element " + std::to_string(i));
    }

    // this mode is used to fallback to reference data to check further execution
    if (assign)
        data[i] = static_cast<tensor_T>(value);

    if (std::abs(value - data[i]) > 1e-7 && printed_elements < print_size) {
        std::cout << i << ") ref = " << value << " act = " << static_cast<file_T>(data[i]) << std::endl;
        ++printed_elements;
    }
}

/**
 * @brief Compares (or overwrites) the content of `tensor` with values stored in a text file.
 *
 * One value is read from the file per tensor element; progress messages and mismatches
 * are written to std::cout. Only f32, f64 and u8 tensors are handled.
 *
 * @param file_name path to the text file with reference values
 * @param tensor    tensor whose elements are checked
 * @param assign    forwarded to _read_tensor_step: when true, reference values are written into the tensor
 */
inline void read_tensor(const std::string& file_name, ov::Tensor tensor, bool assign = false) {
    std::ifstream file(file_name);
    OPENVINO_ASSERT(file.is_open(), "Failed to open file ", file_name);

    std::cout << "Opening " << file_name << std::endl;
    std::cout << "tensor shape " << tensor.get_shape() << std::endl;

    const auto element_type = tensor.get_element_type();
    const size_t element_count = tensor.get_size();
    size_t printed_elements = 0;

    // the type is dispatched per element on purpose: an empty tensor of an
    // unsupported type must pass through without throwing
    for (size_t idx = 0; idx < element_count; ++idx) {
        if (element_type == ov::element::f32) {
            _read_tensor_step<float, float>(tensor.data<float>(), idx, file, printed_elements, assign);
            continue;
        }
        if (element_type == ov::element::f64) {
            _read_tensor_step<double, double>(tensor.data<double>(), idx, file, printed_elements, assign);
            continue;
        }
        if (element_type == ov::element::u8) {
            _read_tensor_step<uint8_t, float>(tensor.data<uint8_t>(), idx, file, printed_elements, assign);
            continue;
        }

        OPENVINO_THROW("Unsupported tensor type ", tensor.get_element_type(), " by read_tensor");
    }

    std::cout << "Closing " << file_name << std::endl;
}
84 changes: 72 additions & 12 deletions src/cpp/src/image_generation/models/autoencoder_kl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,48 @@
namespace ov {
namespace genai {

/**
 * @brief Diagonal Gaussian distribution built from a tensor of packed parameters.
 *
 * The parameters tensor is split in two along dimension 1: the first half is used as the
 * mean, the second half as the log-variance. The mean is a view over the memory of the
 * parameters tensor (no copy), which is why the tensor is kept as a member.
 * The log-variance is clamped to [-30, 20] in place, i.e. the second half of the
 * parameters tensor is modified by the constructor.
 */
class DiagonalGaussianDistribution {
public:
    /**
     * @param parameters f32 tensor with batch size 1 and an even size of dimension 1
     */
    explicit DiagonalGaussianDistribution(ov::Tensor parameters)
        : m_parameters(parameters) {
        ov::Shape shape = parameters.get_shape();
        // validate everything the pointer arithmetic below relies on
        OPENVINO_ASSERT(shape.size() >= 2, "Parameters tensor must have at least 2 dimensions");
        OPENVINO_ASSERT(shape[0] == 1, "Batch size must be 1");
        OPENVINO_ASSERT(shape[1] % 2 == 0, "Dimension 1 of parameters tensor must be even to hold mean and logvar");
        OPENVINO_ASSERT(parameters.get_element_type() == ov::element::f32, "Parameters tensor must have f32 element type");
        shape[1] /= 2;

        m_mean = ov::Tensor(parameters.get_element_type(), shape, parameters.data());
        m_std = ov::Tensor(m_mean.get_element_type(), shape);
        // log-variance starts right after the mean values in the same buffer
        ov::Tensor logvar(parameters.get_element_type(), shape, m_mean.data<float>() + m_mean.get_size());

        float * logvar_data = logvar.data<float>();
        float * std_data = m_std.data<float>();

        for (size_t i = 0; i < logvar.get_size(); ++i) {
            logvar_data[i] = std::min(std::max(logvar_data[i], -30.0f), 20.0f);
            // std = exp(logvar / 2)
            std_data[i] = std::exp(0.5 * logvar_data[i]);
        }
    }

    /**
     * @brief Draws a sample as mean + std * noise, with noise produced by `generator`.
     * @param generator source of the random tensor; must not be nullptr
     * @return newly generated tensor with the same shape as the mean
     */
    ov::Tensor sample(std::shared_ptr<Generator> generator) const {
        OPENVINO_ASSERT(generator, "Generator must not be nullptr");

        ov::Tensor rand_tensor = generator->randn_tensor(m_mean.get_shape());

        float * rand_tensor_data = rand_tensor.data<float>();
        const float * mean_data = m_mean.data<float>();
        const float * std_data = m_std.data<float>();

        // the random tensor is reused as the output buffer
        for (size_t i = 0; i < rand_tensor.get_size(); ++i) {
            rand_tensor_data[i] = mean_data[i] + std_data[i] * rand_tensor_data[i];
        }

        return rand_tensor;
    }

private:
    // keeps the memory referenced by m_mean alive
    ov::Tensor m_parameters;
    ov::Tensor m_mean, m_std;
};

size_t get_vae_scale_factor(const std::filesystem::path& vae_config_path) {
std::ifstream file(vae_config_path);
OPENVINO_ASSERT(file.is_open(), "Failed to open ", vae_config_path);
Expand Down Expand Up @@ -141,12 +183,34 @@ ov::Tensor AutoencoderKL::decode(ov::Tensor latent) {
return m_decoder_request.get_output_tensor();
}

ov::Tensor AutoencoderKL::encode(ov::Tensor image) {
ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr<Generator> generator) {
OPENVINO_ASSERT(m_encoder_request, "VAE encoder model must be compiled first. Cannot infer non-compiled model");

m_encoder_request.set_input_tensor(image);
m_encoder_request.infer();
return m_encoder_request.get_output_tensor();

// 'output' is the raw encoder result; 'latent' is filled below depending on the output name
ov::Tensor output = m_encoder_request.get_output_tensor(), latent;

// the name of the single model output selects the required post-processing
ov::CompiledModel compiled_model = m_encoder_request.get_compiled_model();
auto outputs = compiled_model.outputs();
OPENVINO_ASSERT(outputs.size() == 1, "AutoencoderKL encoder model is expected to have a single output");

const std::string output_name = outputs[0].get_any_name();
if (output_name == "latent_sample") {
// the output is used as the latent directly
latent = output;
} else if (output_name == "latent_parameters") {
// the output holds distribution parameters, a sample is drawn from them using 'generator'
latent = DiagonalGaussianDistribution(output).sample(generator);
} else {
OPENVINO_THROW("Unexpected output name for AutoencoderKL encoder '", output_name, "'");
}

// apply shift and scaling factor
// (values of 'latent' are rewritten in place)
float * latent_data = latent.data<float>();
for (size_t i = 0; i < latent.get_size(); ++i) {
latent_data[i] = (latent_data[i] - m_config.shift_factor) * m_config.scaling_factor;
}

return latent;
}

const AutoencoderKL::Config& AutoencoderKL::get_config() const {
Expand All @@ -171,25 +235,21 @@ void AutoencoderKL::merge_vae_image_pre_processing() const {
ppp.input().preprocess()
.convert_layout()
.convert_element_type(ov::element::f32)
.scale(255.0f / 2.0f)
// this is less accurate than in VaeImageProcessor::normalize
.scale(255.0 / 2.0)
.mean(1.0f);

// apply m_config.scaling_factor as last step
ppp.output().postprocess().custom([scaling_factor = m_config.scaling_factor](const ov::Output<ov::Node>& port) {
auto c_scaling_factor = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, scaling_factor);
return std::make_shared<ov::op::v1::Multiply>(port, c_scaling_factor);
});

ppp.build();
}

void AutoencoderKL::merge_vae_image_post_processing() const {
ov::preprocess::PrePostProcessor ppp(m_decoder_model);

// scale and shift input before VAE decoder
ppp.input().preprocess()
.scale(m_config.scaling_factor)
.mean(-m_config.shift_factor);
if (m_config.scaling_factor != 1.0f)
ppp.input().preprocess().scale(m_config.scaling_factor);
if (m_config.shift_factor != 0.0f)
ppp.input().preprocess().mean(-m_config.shift_factor);

// apply VaeImageProcessor normalization steps
// https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
: m_config(root_dir / "config.json") {
m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
}

SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir,
Expand Down
12 changes: 10 additions & 2 deletions src/cpp/src/image_generation/schedulers/ddim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,22 @@ void DDIMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
default:
OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
}

// apply 'strength' used in image generation
// in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711
{
size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
}
}

std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
// noise_pred - model_output
// latents - sample
// inference_step

size_t timestep = get_timesteps()[inference_step];
size_t timestep = m_timesteps[inference_step];

// get previous step value (=t-1)
int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps;
Expand Down Expand Up @@ -205,7 +213,7 @@ void DDIMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator>
int64_t latent_timestep = m_timesteps.front();

float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]);
float sqrt_one_minus_alpha_prod = std::sqrt(1.0f - m_alphas_cumprod[latent_timestep]);
float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]);

ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());

Expand Down
44 changes: 38 additions & 6 deletions src/cpp/src/image_generation/schedulers/euler_discrete.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,14 @@ EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) :
m_sigmas.push_back(0);

m_step_index = -1;
m_begin_index = -1;
}

void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) {
// TODO: support `timesteps` and `sigmas` inputs
m_timesteps.clear();
m_sigmas.clear();
m_step_index = -1;
m_step_index = m_begin_index = -1;

m_num_inference_steps = num_inference_steps;
std::vector<float> sigmas;
Expand Down Expand Up @@ -192,17 +193,29 @@ void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float str
OPENVINO_THROW("Unsupported value for 'final_sigmas_type'");
}
m_sigmas.push_back(sigma_last);

// apply 'strength' used in image generation
// in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650
{
size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
// keep original timesteps
m_schedule_timesteps = m_timesteps;
// while return patched ones by 'strength' parameter
m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
m_begin_index = t_start;
}
}

std::map<std::string, ov::Tensor> EulerDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
// noise_pred - model_output
// latents - sample
// inference_step

size_t timestep = get_timesteps()[inference_step];
size_t timestep = m_timesteps[inference_step];

if (m_step_index == -1)
m_step_index = 0;
m_step_index = m_begin_index;

float sigma = m_sigmas[m_step_index];
// TODO: hardcoded gamma
Expand Down Expand Up @@ -273,7 +286,7 @@ float EulerDiscreteScheduler::get_init_noise_sigma() const {

void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
if (m_step_index == -1)
m_step_index = 0;
m_step_index = m_begin_index;

float sigma = m_sigmas[m_step_index];
float* sample_data = sample.data<float>();
Expand All @@ -282,9 +295,28 @@ void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inferen
}
}

/**
 * @brief Returns the position of the first entry of m_schedule_timesteps equal to `timestep`.
 * @param timestep timestep value to look up
 * @return zero-based index of the first match
 * @note Raises via OPENVINO_THROW when no entry matches.
 */
size_t EulerDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
    size_t position = 0;
    for (const int64_t candidate : m_schedule_timesteps) {
        if (candidate == timestep)
            return position;
        ++position;
    }

    OPENVINO_THROW("Failed to find index for timestep ", timestep);
}

void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator> generator) const {
// use https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_euler_discrete.py#L686
OPENVINO_THROW("Not implemented");
// sigma is looked up for the first entry of m_timesteps
const int64_t latent_timestep = m_timesteps.front();
const float sigma = m_sigmas[_index_for_timestep(latent_timestep)];

// random tensor with the same shape as init_latent
ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());

float * init_latent_data = init_latent.data<float>();
const float * rand_tensor_data = rand_tensor.data<float>();

// init_latent is overwritten element-wise with init_latent + sigma * noise
for (size_t i = 0; i < init_latent.get_size(); ++i) {
init_latent_data[i] = init_latent_data[i] + sigma * rand_tensor_data[i];
}
}

} // namespace genai
Expand Down
6 changes: 4 additions & 2 deletions src/cpp/src/image_generation/schedulers/euler_discrete.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,12 @@ class EulerDiscreteScheduler : public IScheduler {
Config m_config;

std::vector<float> m_alphas_cumprod, m_sigmas;
std::vector<int64_t> m_timesteps;
std::vector<int64_t> m_timesteps, m_schedule_timesteps;
size_t m_num_inference_steps;

size_t m_step_index;
int m_step_index, m_begin_index;

size_t _index_for_timestep(int64_t timestep) const;
};

} // namespace genai
Expand Down
23 changes: 13 additions & 10 deletions src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
ImageGenerationConfig generation_config = m_generation_config;
generation_config.update_generation_config(properties);

if (!initial_image) {
// in case of typical text to image generation, we need to ignore 'strength'
generation_config.strength = 1.0f;
}

const auto& transformer_config = m_transformer->get_config();
const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale)
? 2
Expand Down Expand Up @@ -558,7 +563,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
// 6. Denoising loop
ov::Tensor noisy_residual_tensor(ov::element::f32, {});

for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) {
for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
// concat the same latent twice along a batch dimension in case of CFG
if (batch_size_multiplier > 1) {
batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
Expand Down Expand Up @@ -650,16 +655,14 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt,
"Negative prompt 3 is not used when guidance scale < 1.0");

if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
if (initial_image) {
ov::Shape initial_image_shape = initial_image.get_shape();
size_t height = initial_image_shape[1], width = initial_image_shape[2];
if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) {
ov::Shape initial_image_shape = initial_image.get_shape();
size_t height = initial_image_shape[1], width = initial_image_shape[2];

OPENVINO_ASSERT(generation_config.height == height,
"Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
OPENVINO_ASSERT(generation_config.width == width,
"Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
}
OPENVINO_ASSERT(generation_config.height == height,
"Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
OPENVINO_ASSERT(generation_config.width == width,
"Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");

OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
"'Strength' generation parameter must be withion [0, 1] range");
Expand Down
Loading

0 comments on commit a99dc93

Please sign in to comment.