diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py index 4cde753f11..5dd7b83b3b 100755 --- a/samples/python/visual_language_chat/visual_language_chat.py +++ b/samples/python/visual_language_chat/visual_language_chat.py @@ -36,7 +36,7 @@ def read_image(path: str) -> Tensor: ''' pic = Image.open(path).convert("RGB") - image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte) + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) return Tensor(image_data) diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp index ed163242be..b838fbfd97 100644 --- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp +++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp @@ -13,6 +13,7 @@ #include "openvino/runtime/properties.hpp" #include "openvino/genai/visibility.hpp" +#include "openvino/genai/image_generation/generation_config.hpp" namespace ov { namespace genai { @@ -74,7 +75,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { ov::Tensor decode(ov::Tensor latent); - ov::Tensor encode(ov::Tensor image); + ov::Tensor encode(ov::Tensor image, std::shared_ptr generator); const Config& get_config() const; diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp index 73a027ab32..415f8c0480 100644 --- a/src/cpp/src/debug_utils.hpp +++ b/src/cpp/src/debug_utils.hpp @@ -5,6 +5,7 @@ #include #include +#include #include @@ -31,3 +32,42 @@ inline void print_tensor(std::string name, ov::Tensor tensor) { print_array(tensor.data(), tensor.get_size()); } } + +template +void _read_tensor_step(tensor_T* data, size_t i, std::ifstream& file, size_t& printed_elements, bool assign) { + const size_t print_size = 10; + + file_T value; + file >> value; + + // this mode is used to fallback to reference data to 
check further execution + if (assign) + data[i] = value; + + if (std::abs(value - data[i]) > 1e-7 && printed_elements < print_size) { + std::cout << i << ") ref = " << value << " act = " << static_cast(data[i]) << std::endl; + ++printed_elements; + } +} + +inline void read_tensor(const std::string& file_name, ov::Tensor tensor, bool assign = false) { + std::ifstream file(file_name.c_str()); + OPENVINO_ASSERT(file.is_open(), "Failed to open file ", file_name); + + std::cout << "Opening " << file_name << std::endl; + std::cout << "tensor shape " << tensor.get_shape() << std::endl; + + for (size_t i = 0, printed_elements = 0; i < tensor.get_size(); ++i) { + if (tensor.get_element_type() == ov::element::f32) + _read_tensor_step(tensor.data(), i, file, printed_elements, assign); + else if (tensor.get_element_type() == ov::element::f64) + _read_tensor_step(tensor.data(), i, file, printed_elements, assign); + else if (tensor.get_element_type() == ov::element::u8) + _read_tensor_step(tensor.data(), i, file, printed_elements, assign); + else { + OPENVINO_THROW("Unsupported tensor type ", tensor.get_element_type(), " by read_tensor"); + } + } + + std::cout << "Closing " << file_name << std::endl; +} diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp index 3ffbdd442a..d7eaf18bf4 100644 --- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp +++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp @@ -22,6 +22,48 @@ namespace ov { namespace genai { +class DiagonalGaussianDistribution { +public: + explicit DiagonalGaussianDistribution(ov::Tensor parameters) + : m_parameters(parameters) { + ov::Shape shape = parameters.get_shape(); + OPENVINO_ASSERT(shape[0] == 1, "Batch size must be 1"); + shape[1] /= 2; + + m_mean = ov::Tensor(parameters.get_element_type(), shape, parameters.data()); + m_std = ov::Tensor(m_mean.get_element_type(), shape); + ov::Tensor logvar(parameters.get_element_type(), shape, 
m_mean.data() + m_mean.get_size()); + + float * logvar_data = logvar.data(); + float * std_data = m_std.data(); + + for (size_t i = 0; i < logvar.get_size(); ++i) { + logvar_data[i] = std::min(std::max(logvar_data[i], -30.0f), 20.0f); + std_data[i] = std::exp(0.5 * logvar_data[i]); + } + } + + ov::Tensor sample(std::shared_ptr generator) const { + OPENVINO_ASSERT(generator, "Generator must not be nullptr"); + + ov::Tensor rand_tensor = generator->randn_tensor(m_mean.get_shape()); + + float * rand_tensor_data = rand_tensor.data(); + const float * mean_data = m_mean.data(); + const float * std_data = m_std.data(); + + for (size_t i = 0; i < rand_tensor.get_size(); ++i) { + rand_tensor_data[i] = mean_data[i] + std_data[i] * rand_tensor_data[i]; + } + + return rand_tensor; + } + +private: + ov::Tensor m_parameters; + ov::Tensor m_mean, m_std; +}; + size_t get_vae_scale_factor(const std::filesystem::path& vae_config_path) { std::ifstream file(vae_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", vae_config_path); @@ -141,12 +183,34 @@ ov::Tensor AutoencoderKL::decode(ov::Tensor latent) { return m_decoder_request.get_output_tensor(); } -ov::Tensor AutoencoderKL::encode(ov::Tensor image) { +ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr generator) { OPENVINO_ASSERT(m_encoder_request, "VAE encoder model must be compiled first. 
Cannot infer non-compiled model"); m_encoder_request.set_input_tensor(image); m_encoder_request.infer(); - return m_encoder_request.get_output_tensor(); + + ov::Tensor output = m_encoder_request.get_output_tensor(), latent; + + ov::CompiledModel compiled_model = m_encoder_request.get_compiled_model(); + auto outputs = compiled_model.outputs(); + OPENVINO_ASSERT(outputs.size() == 1, "AutoencoderKL encoder model is expected to have a single output"); + + const std::string output_name = outputs[0].get_any_name(); + if (output_name == "latent_sample") { + latent = output; + } else if (output_name == "latent_parameters") { + latent = DiagonalGaussianDistribution(output).sample(generator); + } else { + OPENVINO_THROW("Unexpected output name for AutoencoderKL encoder '", output_name, "'"); + } + + // apply shift and scaling factor + float * latent_data = latent.data(); + for (size_t i = 0; i < latent.get_size(); ++i) { + latent_data[i] = (latent_data[i] - m_config.shift_factor) * m_config.scaling_factor; + } + + return latent; } const AutoencoderKL::Config& AutoencoderKL::get_config() const { @@ -171,15 +235,10 @@ void AutoencoderKL::merge_vae_image_pre_processing() const { ppp.input().preprocess() .convert_layout() .convert_element_type(ov::element::f32) - .scale(255.0f / 2.0f) + // this is less accurate than in VaeImageProcessor::normalize + .scale(255.0 / 2.0) .mean(1.0f); - // apply m_config.scaling_factor as last step - ppp.output().postprocess().custom([scaling_factor = m_config.scaling_factor](const ov::Output& port) { - auto c_scaling_factor = std::make_shared(ov::element::f32, ov::Shape{1}, scaling_factor); - return std::make_shared(port, c_scaling_factor); - }); - ppp.build(); } @@ -187,9 +246,10 @@ void AutoencoderKL::merge_vae_image_post_processing() const { ov::preprocess::PrePostProcessor ppp(m_decoder_model); // scale and shift input before VAE decoder - ppp.input().preprocess() - .scale(m_config.scaling_factor) - .mean(-m_config.shift_factor); + if
(m_config.scaling_factor != 1.0f) + ppp.input().preprocess().scale(m_config.scaling_factor); + if (m_config.shift_factor != 0.0f) + ppp.input().preprocess().mean(-m_config.shift_factor); // apply VaeImageProcessor normalization steps // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159 diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp index 4f15cf97c4..38e3dad290 100644 --- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp +++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp @@ -29,7 +29,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path) SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir) : m_config(root_dir / "config.json") { m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string()); - m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json"); + m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json"); } SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir, diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp index be2f951679..414390aaf7 100644 --- a/src/cpp/src/image_generation/schedulers/ddim.cpp +++ b/src/cpp/src/image_generation/schedulers/ddim.cpp @@ -114,6 +114,14 @@ void DDIMScheduler::set_timesteps(size_t num_inference_steps, float strength) { default: OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); } + + // apply 'strength' used in image generation + // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711 + { + size_t init_timestep = std::min(num_inference_steps * strength, num_inference_steps); + 
size_t t_start = std::max(num_inference_steps - init_timestep, 0); + m_timesteps = std::vector(m_timesteps.begin() + t_start, m_timesteps.end()); + } } std::map DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) { @@ -121,7 +129,7 @@ std::map DDIMScheduler::step(ov::Tensor noise_pred, ov: // latents - sample // inference_step - size_t timestep = get_timesteps()[inference_step]; + size_t timestep = m_timesteps[inference_step]; // get previous step value (=t-1) int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps; @@ -205,7 +213,7 @@ void DDIMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr int64_t latent_timestep = m_timesteps.front(); float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]); - float sqrt_one_minus_alpha_prod = std::sqrt(1.0f - m_alphas_cumprod[latent_timestep]); + float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]); ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape()); diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp index 334c7bd372..54d0497fcb 100644 --- a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp @@ -102,13 +102,14 @@ EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) : m_sigmas.push_back(0); m_step_index = -1; + m_begin_index = -1; } void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) { // TODO: support `timesteps` and `sigmas` inputs m_timesteps.clear(); m_sigmas.clear(); - m_step_index = -1; + m_step_index = m_begin_index = -1; m_num_inference_steps = num_inference_steps; std::vector sigmas; @@ -192,6 +193,18 @@ void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float str OPENVINO_THROW("Unsupported value for 
'final_sigmas_type'"); } m_sigmas.push_back(sigma_last); + + // apply 'strength' used in image generation + // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650 + { + size_t init_timestep = std::min(num_inference_steps * strength, num_inference_steps); + size_t t_start = std::max(num_inference_steps - init_timestep, 0); + // keep original timesteps + m_schedule_timesteps = m_timesteps; + // while return patched ones by 'strength' parameter + m_timesteps = std::vector(m_timesteps.begin() + t_start, m_timesteps.end()); + m_begin_index = t_start; + } } std::map EulerDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr generator) { @@ -199,10 +212,10 @@ std::map EulerDiscreteScheduler::step(ov::Tensor noise_ // latents - sample // inference_step - size_t timestep = get_timesteps()[inference_step]; + size_t timestep = m_timesteps[inference_step]; if (m_step_index == -1) - m_step_index = 0; + m_step_index = m_begin_index; float sigma = m_sigmas[m_step_index]; // TODO: hardcoded gamma @@ -273,7 +286,7 @@ float EulerDiscreteScheduler::get_init_noise_sigma() const { void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { if (m_step_index == -1) - m_step_index = 0; + m_step_index = m_begin_index; float sigma = m_sigmas[m_step_index]; float* sample_data = sample.data(); @@ -282,9 +295,28 @@ void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inferen } } +size_t EulerDiscreteScheduler::_index_for_timestep(int64_t timestep) const { + for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) { + if (timestep == m_schedule_timesteps[i]) { + return i; + } + } + + OPENVINO_THROW("Failed to find index for timestep ", timestep); +} + void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr generator) const { - // use 
https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_euler_discrete.py#L686 - OPENVINO_THROW("Not implemented"); + const int64_t latent_timestep = m_timesteps.front(); + const float sigma = m_sigmas[_index_for_timestep(latent_timestep)]; + + ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape()); + + float * init_latent_data = init_latent.data(); + const float * rand_tensor_data = rand_tensor.data(); + + for (size_t i = 0; i < init_latent.get_size(); ++i) { + init_latent_data[i] = init_latent_data[i] + sigma * rand_tensor_data[i]; + } } } // namespace genai diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp index d7b20363cd..3aafd9fc6c 100644 --- a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp @@ -55,10 +55,12 @@ class EulerDiscreteScheduler : public IScheduler { Config m_config; std::vector m_alphas_cumprod, m_sigmas; - std::vector m_timesteps; + std::vector m_timesteps, m_schedule_timesteps; size_t m_num_inference_steps; - size_t m_step_index; + int m_step_index, m_begin_index; + + size_t _index_for_timestep(int64_t timestep) const; }; } // namespace genai diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index d392582e0d..03216935a4 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -261,6 +261,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + const auto& transformer_config = 
m_transformer->get_config(); const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 @@ -558,7 +563,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { // 6. Denoising loop ov::Tensor noisy_residual_tensor(ov::element::f32, {}); - for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) { + for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); @@ -650,16 +655,14 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used when guidance scale < 1.0"); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - if (initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { + ov::Shape initial_image_shape = initial_image.get_shape(); + size_t height = initial_image_shape[1], width = initial_image_shape[2]; - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); - } + OPENVINO_ASSERT(generation_config.height == height, + "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); + OPENVINO_ASSERT(generation_config.width == width, + "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); 
OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be withion [0, 1] range"); diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 99343c37a6..7142f3dbe5 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -168,10 +168,10 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent(ov::element::f32, {}); + ov::Tensor latent; if (initial_image) { - latent = m_vae->encode(initial_image); + latent = m_vae->encode(initial_image, generation_config.generator); if (generation_config.num_images_per_prompt > 1) { ov::Tensor batched_latent(ov::element::f32, latent_shape); for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { @@ -179,7 +179,6 @@ class StableDiffusionPipeline : public DiffusionPipeline { } latent = batched_latent; } - m_scheduler->add_noise(latent, generation_config.generator); } else { latent = generation_config.generator->randn_tensor(latent_shape); @@ -199,6 +198,11 @@ class StableDiffusionPipeline : public DiffusionPipeline { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + // Stable Diffusion pipeline // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline @@ -261,7 +265,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg); ov::Tensor denoised, 
noisy_residual_tensor(ov::element::f32, {}); - for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) { + for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { @@ -355,22 +359,19 @@ class StableDiffusionPipeline : public DiffusionPipeline { OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by ", pipeline_name); OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - if (initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; - - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); - } + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { + ov::Shape initial_image_shape = initial_image.get_shape(); + size_t height = initial_image_shape[1], width = initial_image_shape[2]; + OPENVINO_ASSERT(generation_config.height == height, + "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); + OPENVINO_ASSERT(generation_config.width == width, + "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be withion [0, 1] range"); } else { - 
OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline"); OPENVINO_ASSERT(!initial_image, "Internal error: initial_image must be empty for Text 2 image pipeline"); + OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline"); } } diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp index 42ee49a19d..25a7fe8889 100644 --- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp @@ -174,10 +174,17 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; - ov::Tensor latent(ov::element::f32, {}); + ov::Tensor latent; if (initial_image) { - latent = m_vae->encode(initial_image); + latent = m_vae->encode(initial_image, generation_config.generator); + if (generation_config.num_images_per_prompt > 1) { + ov::Tensor batched_latent(ov::element::f32, latent_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(latent, batched_latent, 0, n); + } + latent = batched_latent; + } m_scheduler->add_noise(latent, generation_config.generator); } else { latent = generation_config.generator->randn_tensor(latent_shape); @@ -197,6 +204,11 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ImageGenerationConfig generation_config = m_generation_config; generation_config.update_generation_config(properties); + if (!initial_image) { + // in case of typical text to image generation, we need to ignore 'strength' + generation_config.strength = 1.0f; + } + // Stable Diffusion pipeline // see 
https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline @@ -387,7 +399,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg); ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {}); - for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) { + for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) { batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); // concat the same latent twice along a batch dimension in case of CFG if (batch_size_multiplier > 1) { @@ -471,22 +483,19 @@ class StableDiffusionXLPipeline : public DiffusionPipeline { OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used when guidance scale <= 1.0"); OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - if (initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; - - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); - } + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) { + ov::Shape initial_image_shape = initial_image.get_shape(); + size_t height = initial_image_shape[1], width = initial_image_shape[2]; + OPENVINO_ASSERT(generation_config.height == height, + "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); + 
OPENVINO_ASSERT(generation_config.width == width, + "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be withion [0, 1] range"); } else { - OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline"); OPENVINO_ASSERT(!initial_image, "Internal error: initial_image must be empty for Text 2 image pipeline"); + OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline"); } } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b01f45917b..d61ab57f60 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -161,7 +161,7 @@ class InputsEmbedder::IInputsEmbedder { ov::Shape reshaped_image_shape = reshaped_image.get_shape(); for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { ov::Tensor single_image{ - ov::element::u8, + reshaped_image.get_element_type(), {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) }; diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 18eef72b23..df8fce094d 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -11,34 +11,69 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) + from .py_openvino_genai import ( - ContinuousBatchingPipeline, DecodedResults, EncodedResults, - GenerationConfig, - GenerationResult, + RawPerfMetrics, + PerfMetrics, + StreamerBase, +) + +# VLM pipeline + +from .py_openvino_genai 
import ( + VLMPipeline, +) + +# LLM pipeline +from .py_openvino_genai import ( + LLMPipeline, + draft_model +) + +# LoRA +from .py_openvino_genai import ( Adapter, - AdapterConfig, + AdapterConfig +) + +# Generation config +from .py_openvino_genai import ( + GenerationConfig, + StopCriteria +) + +# Tokenizers +from .py_openvino_genai import ( + TokenizedInputs, + Tokenizer +) + +# Whisper +from .py_openvino_genai import ( + WhisperGenerationConfig, + WhisperPipeline, +) + +# Image generation +from .py_openvino_genai import ( CLIPTextModel, CLIPTextModelWithProjection, UNet2DConditionModel, AutoencoderKL, - LLMPipeline, - VLMPipeline, Text2ImagePipeline, - PerfMetrics, - RawPerfMetrics, - SchedulerConfig, Scheduler, - StopCriteria, - StreamerBase, - TokenizedInputs, - Tokenizer, - WhisperGenerationConfig, - WhisperPipeline, - CacheEvictionConfig, - AggregationMode, + ImageGenerationConfig, Generator, CppStdGenerator, - draft_model +) + +# Continuous batching +from .py_openvino_genai import ( + ContinuousBatchingPipeline, + GenerationResult, + SchedulerConfig, + CacheEvictionConfig, + AggregationMode, ) diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index 16a66cd84d..e535e9f2d8 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -190,6 +190,17 @@ void init_image_generation_pipelines(py::module_& m) { .def("next", &ov::genai::CppStdGenerator::next) .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor); + auto image_generation_scheduler = py::class_>(m, "Scheduler", "Scheduler for image generation pipelines.") + .def("from_config", &ov::genai::Scheduler::from_config); + + py::enum_(image_generation_scheduler, "Type") + .value("AUTO", ov::genai::Scheduler::Type::AUTO) + .value("LCM", ov::genai::Scheduler::Type::LCM) + .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE) + .value("DDIM", ov::genai::Scheduler::Type::DDIM) +
.value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) + .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE); + py::class_(m, "ImageGenerationConfig", "This class is used for storing generation config for image generation pipeline.") .def(py::init<>()) .def_readwrite("prompt_2", &ov::genai::ImageGenerationConfig::prompt_2) @@ -274,15 +285,4 @@ void init_image_generation_pipelines(py::module_& m) { py::arg("prompt"), "Input string", (text2image_generate_docstring + std::string(" \n ")).c_str() ); - - auto image_generation_scheduler = py::class_>(m, "Scheduler", "Scheduler for image generation pipelines.") - .def("from_config", &ov::genai::Scheduler::from_config); - - py::enum_(image_generation_scheduler, "Type") - .value("AUTO", ov::genai::Scheduler::Type::AUTO) - .value("LCM", ov::genai::Scheduler::Type::LCM) - .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE) - .value("DDIM", ov::genai::Scheduler::Type::DDIM) - .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE) - .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE); }