diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py
index 4cde753f11..5dd7b83b3b 100755
--- a/samples/python/visual_language_chat/visual_language_chat.py
+++ b/samples/python/visual_language_chat/visual_language_chat.py
@@ -36,7 +36,7 @@ def read_image(path: str) -> Tensor:
 
     '''
     pic = Image.open(path).convert("RGB")
-    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte)
+    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
     return Tensor(image_data)
 
 
diff --git a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
index ed163242be..b838fbfd97 100644
--- a/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/autoencoder_kl.hpp
@@ -13,6 +13,7 @@
 #include "openvino/runtime/properties.hpp"
 
 #include "openvino/genai/visibility.hpp"
+#include "openvino/genai/image_generation/generation_config.hpp"
 
 namespace ov {
 namespace genai {
@@ -74,7 +75,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
 
     ov::Tensor decode(ov::Tensor latent);
 
-    ov::Tensor encode(ov::Tensor image);
+    ov::Tensor encode(ov::Tensor image, std::shared_ptr<Generator> generator);
 
     const Config& get_config() const;
 
diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp
index 73a027ab32..415f8c0480 100644
--- a/src/cpp/src/debug_utils.hpp
+++ b/src/cpp/src/debug_utils.hpp
@@ -5,6 +5,7 @@
 
 #include <string>
 #include <iostream>
+#include <fstream>
 
 #include <openvino/runtime/tensor.hpp>
 
@@ -31,3 +32,42 @@ inline void print_tensor(std::string name, ov::Tensor tensor) {
         print_array(tensor.data<ov::float16>(), tensor.get_size());
     }
 }
+
+// Reads the next whitespace-separated value from `file`, optionally stores it into data[i], and prints it if it differs from data[i].
+template <typename tensor_T, typename file_T>
+void _read_tensor_step(tensor_T* data, size_t i, std::ifstream& file, size_t& printed_elements, bool assign) {
+    const size_t print_size = 10;  // upper bound on the number of mismatches printed (tracked via printed_elements)
+    file_T value;
+    // a short or malformed file must fail loudly instead of being compared (or assigned) as zeros
+    OPENVINO_ASSERT(!(file >> value).fail(), "Failed to read reference value for element ", i);
+    // this mode is used to fall back to reference data to check further execution
+    if (assign)
+        data[i] = value;
+
+    if (std::abs(value - data[i]) > 1e-7 && printed_elements < print_size) {
+        std::cout << i << ") ref = " << value << " act = " << static_cast<file_T>(data[i]) << std::endl;
+        ++printed_elements;
+    }
+}
+
+// Debug helper: compares `tensor` element by element with values read from text file `file_name`; with `assign`, overwrites the tensor with them.
+inline void read_tensor(const std::string& file_name, ov::Tensor tensor, bool assign = false) {
+    std::ifstream file(file_name.c_str());
+    OPENVINO_ASSERT(file.is_open(), "Failed to open file ", file_name);
+    std::cout << "Opening " << file_name << std::endl;
+    std::cout << "tensor shape " << tensor.get_shape() << std::endl;
+
+    for (size_t i = 0, printed_elements = 0; i < tensor.get_size(); ++i) {  // printed_elements counts mismatches reported so far
+        if (tensor.get_element_type() == ov::element::f32)
+            _read_tensor_step<float, float>(tensor.data<float>(), i, file, printed_elements, assign);
+        else if (tensor.get_element_type() == ov::element::f64)
+            _read_tensor_step<double, double>(tensor.data<double>(), i, file, printed_elements, assign);
+        else if (tensor.get_element_type() == ov::element::u8)
+            _read_tensor_step<uint8_t, float>(tensor.data<uint8_t>(), i, file, printed_elements, assign);  // u8 data is parsed from the file as float
+        else {
+            OPENVINO_THROW("Unsupported tensor type ", tensor.get_element_type(), " by read_tensor");
+        }
+    }
+
+    std::cout << "Closing " << file_name << std::endl;
+}
diff --git a/src/cpp/src/image_generation/models/autoencoder_kl.cpp b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
index 3ffbdd442a..d7eaf18bf4 100644
--- a/src/cpp/src/image_generation/models/autoencoder_kl.cpp
+++ b/src/cpp/src/image_generation/models/autoencoder_kl.cpp
@@ -22,6 +22,48 @@
 namespace ov {
 namespace genai {
 
+// Gaussian built from one tensor accessed as float: the first half along dim 1 is used as mean, the second half as log-variance.
+class DiagonalGaussianDistribution {
+public:
+    explicit DiagonalGaussianDistribution(ov::Tensor parameters)
+        : m_parameters(parameters) {
+        ov::Shape shape = parameters.get_shape();
+        OPENVINO_ASSERT(shape[0] == 1, "Batch size must be 1");
+        shape[1] /= 2;  // shape of each half (mean / log-variance)
+        // m_mean is constructed on top of the `parameters` buffer, m_std gets storage of its own
+        m_mean = ov::Tensor(parameters.get_element_type(), shape, parameters.data());
+        m_std = ov::Tensor(m_mean.get_element_type(), shape);
+        ov::Tensor logvar(parameters.get_element_type(), shape, m_mean.data<float>() + m_mean.get_size());
+
+        float * logvar_data = logvar.data<float>();
+        float * std_data = m_std.data<float>();
+
+        for (size_t i = 0; i < logvar.get_size(); ++i) {
+            logvar_data[i] = std::min(std::max(logvar_data[i], -30.0f), 20.0f);  // clamped value is written back into the `parameters` buffer
+            std_data[i] = std::exp(0.5 * logvar_data[i]);  // std = exp(logvar / 2)
+        }
+    }
+
+    // Returns mean + std * noise, with noise taken from generator->randn_tensor(); asserts that `generator` is not null.
+    ov::Tensor sample(std::shared_ptr<Generator> generator) const {
+        OPENVINO_ASSERT(generator, "Generator must not be nullptr");
+        ov::Tensor rand_tensor = generator->randn_tensor(m_mean.get_shape());
+        float * rand_tensor_data = rand_tensor.data<float>();
+        const float * mean_data = m_mean.data<float>();
+        const float * std_data = m_std.data<float>();
+
+        for (size_t i = 0; i < rand_tensor.get_size(); ++i) {
+            rand_tensor_data[i] = mean_data[i] + std_data[i] * rand_tensor_data[i];
+        }
+
+        return rand_tensor;
+    }
+
+private:
+    ov::Tensor m_parameters;  // NOTE(review): presumably kept so the buffer behind m_mean stays alive - confirm
+    ov::Tensor m_mean, m_std;
+};
+
 size_t get_vae_scale_factor(const std::filesystem::path& vae_config_path) {
     std::ifstream file(vae_config_path);
     OPENVINO_ASSERT(file.is_open(), "Failed to open ", vae_config_path);
@@ -141,12 +183,34 @@ ov::Tensor AutoencoderKL::decode(ov::Tensor latent) {
     return m_decoder_request.get_output_tensor();
 }
 
-ov::Tensor AutoencoderKL::encode(ov::Tensor image) {
+// Runs the VAE encoder on `image` and returns the latent after (x - shift_factor) * scaling_factor is applied element-wise.
+ov::Tensor AutoencoderKL::encode(ov::Tensor image, std::shared_ptr<Generator> generator) {
     OPENVINO_ASSERT(m_encoder_request, "VAE encoder model must be compiled first. Cannot infer non-compiled model");
 
     m_encoder_request.set_input_tensor(image);
     m_encoder_request.infer();
-    return m_encoder_request.get_output_tensor();
+
+    ov::Tensor output = m_encoder_request.get_output_tensor(), latent;
+    ov::CompiledModel compiled_model = m_encoder_request.get_compiled_model();
+    auto outputs = compiled_model.outputs();
+    OPENVINO_ASSERT(outputs.size() == 1, "AutoencoderKL encoder model is expected to have a single output");
+
+    const std::string output_name = outputs[0].get_any_name();  // the output name selects how the latent is obtained
+    if (output_name == "latent_sample") {  // output is used as the latent directly; `generator` is not used
+        latent = output;
+    } else if (output_name == "latent_parameters") {  // output is treated as distribution parameters and sampled with `generator`
+        latent = DiagonalGaussianDistribution(output).sample(generator);
+    } else {
+        OPENVINO_THROW("Unexpected output name for AutoencoderKL encoder '", output_name, "'");
+    }
+
+    // apply shift and scaling factor
+    float * latent_data = latent.data<float>();
+    for (size_t i = 0; i < latent.get_size(); ++i) {
+        latent_data[i] = (latent_data[i] - m_config.shift_factor) * m_config.scaling_factor;
+    }
+
+    return latent;
 }
 
 const AutoencoderKL::Config& AutoencoderKL::get_config() const {
@@ -171,15 +235,10 @@ void AutoencoderKL::merge_vae_image_pre_processing() const {
     ppp.input().preprocess()
         .convert_layout()
         .convert_element_type(ov::element::f32)
-        .scale(255.0f / 2.0f)
+        // this is less accurate than in VaeImageProcessor::normalize
+        .scale(255.0 / 2.0)
         .mean(1.0f);
 
-    // apply m_config.scaling_factor as last step
-    ppp.output().postprocess().custom([scaling_factor = m_config.scaling_factor](const ov::Output<ov::Node>& port) {
-        auto c_scaling_factor = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, scaling_factor);
-        return std::make_shared<ov::op::v1::Multiply>(port, c_scaling_factor);
-    });
-
     ppp.build();
 }
 
@@ -187,9 +246,10 @@ void AutoencoderKL::merge_vae_image_post_processing() const {
     ov::preprocess::PrePostProcessor ppp(m_decoder_model);
 
     // scale and shift input before VAE decoder
-    ppp.input().preprocess()
-        .scale(m_config.scaling_factor)
-        .mean(-m_config.shift_factor);
+    if (m_config.scaling_factor != 1.0f)
+        ppp.input().preprocess().scale(m_config.scaling_factor);
+    if (m_config.shift_factor != 0.0f)
+        ppp.input().preprocess().mean(-m_config.shift_factor);
 
     // apply VaeImageProcessor normalization steps
     // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159
diff --git a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
index 4f15cf97c4..38e3dad290 100644
--- a/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
+++ b/src/cpp/src/image_generation/models/sd3_transformer_2d_model.cpp
@@ -29,7 +29,7 @@ SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path)
 SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir)
     : m_config(root_dir / "config.json") {
     m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string());
-    m_vae_scale_factor = ov::genai::get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
+    m_vae_scale_factor = get_vae_scale_factor(root_dir.parent_path() / "vae_decoder" / "config.json");
 }
 
 SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir,
diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp
index be2f951679..414390aaf7 100644
--- a/src/cpp/src/image_generation/schedulers/ddim.cpp
+++ b/src/cpp/src/image_generation/schedulers/ddim.cpp
@@ -114,6 +114,14 @@ void DDIMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
         default:
             OPENVINO_THROW("Unsupported value for 'timestep_spacing'");
     }
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L711
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+    }
 }
 
 std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
@@ -121,7 +129,7 @@ std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov:
     // latents - sample
     // inference_step
 
-    size_t timestep = get_timesteps()[inference_step];
+    size_t timestep = m_timesteps[inference_step];
 
     // get previous step value (=t-1)
     int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps;
@@ -205,7 +213,7 @@ void DDIMScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator>
     int64_t latent_timestep = m_timesteps.front();
 
     float sqrt_alpha_prod = std::sqrt(m_alphas_cumprod[latent_timestep]);
-    float sqrt_one_minus_alpha_prod = std::sqrt(1.0f - m_alphas_cumprod[latent_timestep]);
+    float sqrt_one_minus_alpha_prod = std::sqrt(1.0 - m_alphas_cumprod[latent_timestep]);
 
     ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());
 
diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp
index 334c7bd372..54d0497fcb 100644
--- a/src/cpp/src/image_generation/schedulers/euler_discrete.cpp
+++ b/src/cpp/src/image_generation/schedulers/euler_discrete.cpp
@@ -102,13 +102,14 @@ EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) :
     m_sigmas.push_back(0);
 
     m_step_index = -1;
+    m_begin_index = -1;
 }
 
 void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float strength) {
     // TODO: support `timesteps` and `sigmas` inputs
     m_timesteps.clear();
     m_sigmas.clear();
-    m_step_index = -1;
+    m_step_index = m_begin_index = -1;
 
     m_num_inference_steps = num_inference_steps;
     std::vector<float> sigmas;
@@ -192,6 +193,18 @@ void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps, float str
         OPENVINO_THROW("Unsupported value for 'final_sigmas_type'");
     }
     m_sigmas.push_back(sigma_last);
+
+    // apply 'strength' used in image generation
+    // in diffusers, it's https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L650
+    {
+        size_t init_timestep = std::min<size_t>(num_inference_steps * strength, num_inference_steps);
+        size_t t_start = std::max<size_t>(num_inference_steps - init_timestep, 0);
+        // keep original timesteps
+        m_schedule_timesteps = m_timesteps;
+        // and expose only the ones selected by the 'strength' parameter
+        m_timesteps = std::vector<int64_t>(m_timesteps.begin() + t_start, m_timesteps.end());
+        m_begin_index = t_start;
+    }
 }
 
 std::map<std::string, ov::Tensor> EulerDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step, std::shared_ptr<Generator> generator) {
@@ -199,10 +212,10 @@ std::map<std::string, ov::Tensor> EulerDiscreteScheduler::step(ov::Tensor noise_
     // latents - sample
     // inference_step
 
-    size_t timestep = get_timesteps()[inference_step];
+    size_t timestep = m_timesteps[inference_step];
 
     if (m_step_index == -1)
-        m_step_index = 0;
+        m_step_index = m_begin_index;
 
     float sigma = m_sigmas[m_step_index];
     // TODO: hardcoded gamma
@@ -273,7 +286,7 @@ float EulerDiscreteScheduler::get_init_noise_sigma() const {
 
 void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) {
     if (m_step_index == -1)
-        m_step_index = 0;
+        m_step_index = m_begin_index;
 
     float sigma = m_sigmas[m_step_index];
     float* sample_data = sample.data<float>();
@@ -282,9 +295,28 @@ void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inferen
     }
 }
 
+// Returns the position of `timestep` within m_schedule_timesteps; throws if the value is not present.
+size_t EulerDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
+    for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
+        if (timestep == m_schedule_timesteps[i]) {
+            return i;  // first match wins
+        }
+    }
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
 void EulerDiscreteScheduler::add_noise(ov::Tensor init_latent, std::shared_ptr<Generator> generator) const {
-    // use https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_euler_discrete.py#L686
-    OPENVINO_THROW("Not implemented");
+    // Adds sigma * noise from `generator` to `init_latent` in place, with sigma taken for the first entry of m_timesteps.
+    const int64_t latent_timestep = m_timesteps.front();
+    const float sigma = m_sigmas[_index_for_timestep(latent_timestep)];  // index is looked up in m_schedule_timesteps
+
+    ov::Tensor rand_tensor = generator->randn_tensor(init_latent.get_shape());
+    float * init_latent_data = init_latent.data<float>();
+    const float * rand_tensor_data = rand_tensor.data<float>();
+
+    for (size_t i = 0; i < init_latent.get_size(); ++i) {
+        init_latent_data[i] = init_latent_data[i] + sigma * rand_tensor_data[i];
+    }
 }
 
 }  // namespace genai
diff --git a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp
index d7b20363cd..3aafd9fc6c 100644
--- a/src/cpp/src/image_generation/schedulers/euler_discrete.hpp
+++ b/src/cpp/src/image_generation/schedulers/euler_discrete.hpp
@@ -55,10 +55,12 @@ class EulerDiscreteScheduler : public IScheduler {
     Config m_config;
 
     std::vector<float> m_alphas_cumprod, m_sigmas;
-    std::vector<int64_t> m_timesteps;
+    std::vector<int64_t> m_timesteps, m_schedule_timesteps;
     size_t m_num_inference_steps;
 
-    size_t m_step_index;
+    int m_step_index, m_begin_index;
+
+    size_t _index_for_timestep(int64_t timestep) const;
 };
 
 } // namespace genai
diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
index d392582e0d..03216935a4 100644
--- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -261,6 +261,11 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
+        if (!initial_image) {
+            // in case of typical text to image generation, we need to ignore 'strength'
+            generation_config.strength = 1.0f;
+        }
+
         const auto& transformer_config = m_transformer->get_config();
         const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale)
                                                  ? 2
@@ -558,7 +563,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         // 6. Denoising loop
         ov::Tensor noisy_residual_tensor(ov::element::f32, {});
 
-        for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) {
+        for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
                 batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
@@ -650,16 +655,14 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
         OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt,
                         "Negative prompt 3 is not used when guidance scale < 1.0");
 
-        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-            if (initial_image) {
-                ov::Shape initial_image_shape = initial_image.get_shape();
-                size_t height = initial_image_shape[1], width = initial_image_shape[2];
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) {
+            ov::Shape initial_image_shape = initial_image.get_shape();
+            size_t height = initial_image_shape[1], width = initial_image_shape[2];
 
-                OPENVINO_ASSERT(generation_config.height == height,
-                    "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
-                OPENVINO_ASSERT(generation_config.width == width,
-                    "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-            }
+            OPENVINO_ASSERT(generation_config.height == height,
+                "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
+            OPENVINO_ASSERT(generation_config.width == width,
+                "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
 
             OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
                 "'Strength' generation parameter must be withion [0, 1] range");
diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
index 99343c37a6..7142f3dbe5 100644
--- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -168,10 +168,10 @@ class StableDiffusionPipeline : public DiffusionPipeline {
 
         ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels,
                                generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor};
-        ov::Tensor latent(ov::element::f32, {});
+        ov::Tensor latent;
 
         if (initial_image) {
-            latent = m_vae->encode(initial_image);
+            latent = m_vae->encode(initial_image, generation_config.generator);
             if (generation_config.num_images_per_prompt > 1) {
                 ov::Tensor batched_latent(ov::element::f32, latent_shape);
                 for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) {
@@ -179,7 +179,6 @@ class StableDiffusionPipeline : public DiffusionPipeline {
                 }
                 latent = batched_latent;
             }
-
             m_scheduler->add_noise(latent, generation_config.generator);
         } else {
             latent = generation_config.generator->randn_tensor(latent_shape);
@@ -199,6 +198,11 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
+        if (!initial_image) {
+            // in case of typical text to image generation, we need to ignore 'strength'
+            generation_config.strength = 1.0f;
+        }
+
         // Stable Diffusion pipeline
         // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline
 
@@ -261,7 +265,7 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg);
 
         ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {});
-        for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) {
+        for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) {
             batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
@@ -355,22 +359,19 @@ class StableDiffusionPipeline : public DiffusionPipeline {
         OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by ", pipeline_name);
         OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name);
 
-        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-            if (initial_image) {
-                ov::Shape initial_image_shape = initial_image.get_shape();
-                size_t height = initial_image_shape[1], width = initial_image_shape[2];
-
-                OPENVINO_ASSERT(generation_config.height == height,
-                    "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
-                OPENVINO_ASSERT(generation_config.width == width,
-                    "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-            }
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) {
+            ov::Shape initial_image_shape = initial_image.get_shape();
+            size_t height = initial_image_shape[1], width = initial_image_shape[2];
 
+            OPENVINO_ASSERT(generation_config.height == height,
+                "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
+            OPENVINO_ASSERT(generation_config.width == width,
+                "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
             OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
                 "'Strength' generation parameter must be withion [0, 1] range");
         } else {
-            OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline");
             OPENVINO_ASSERT(!initial_image, "Internal error: initial_image must be empty for Text 2 image pipeline");
+            OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline");
         }
     }
 
diff --git a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
index 42ee49a19d..25a7fe8889 100644
--- a/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
+++ b/src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
@@ -174,10 +174,17 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {
 
         ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels,
                                generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor};
-        ov::Tensor latent(ov::element::f32, {});
+        ov::Tensor latent;
 
         if (initial_image) {
-            latent = m_vae->encode(initial_image);
+            latent = m_vae->encode(initial_image, generation_config.generator);
+            if (generation_config.num_images_per_prompt > 1) {
+                ov::Tensor batched_latent(ov::element::f32, latent_shape);
+                for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) {
+                    batch_copy(latent, batched_latent, 0, n);
+                }
+                latent = batched_latent;
+            }
             m_scheduler->add_noise(latent, generation_config.generator);
         } else {
             latent = generation_config.generator->randn_tensor(latent_shape);
@@ -197,6 +204,11 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {
         ImageGenerationConfig generation_config = m_generation_config;
         generation_config.update_generation_config(properties);
 
+        if (!initial_image) {
+            // in case of typical text to image generation, we need to ignore 'strength'
+            generation_config.strength = 1.0f;
+        }
+
         // Stable Diffusion pipeline
         // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline
 
@@ -387,7 +399,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {
         ov::Tensor latent_cfg(ov::element::f32, latent_shape_cfg);
 
         ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {});
-        for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) {
+        for (size_t inference_step = 0; inference_step < timesteps.size(); inference_step++) {
             batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
             // concat the same latent twice along a batch dimension in case of CFG
             if (batch_size_multiplier > 1) {
@@ -471,22 +483,19 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {
         OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used when guidance scale <= 1.0");
         OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name);
 
-        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-            if (initial_image) {
-                ov::Shape initial_image_shape = initial_image.get_shape();
-                size_t height = initial_image_shape[1], width = initial_image_shape[2];
-
-                OPENVINO_ASSERT(generation_config.height == height,
-                    "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
-                OPENVINO_ASSERT(generation_config.width == width,
-                    "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-            }
+        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE && initial_image) {
+            ov::Shape initial_image_shape = initial_image.get_shape();
+            size_t height = initial_image_shape[1], width = initial_image_shape[2];
 
+            OPENVINO_ASSERT(generation_config.height == height,
+                "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
+            OPENVINO_ASSERT(generation_config.width == width,
+                "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
             OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
                 "'Strength' generation parameter must be withion [0, 1] range");
         } else {
-            OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline");
             OPENVINO_ASSERT(!initial_image, "Internal error: initial_image must be empty for Text 2 image pipeline");
+            OPENVINO_ASSERT(generation_config.strength == 1.0f, "'Strength' generation parameter must be 1.0f for Text 2 image pipeline");
         }
     }
 
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index b01f45917b..d61ab57f60 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -161,7 +161,7 @@ class InputsEmbedder::IInputsEmbedder {
             ov::Shape reshaped_image_shape = reshaped_image.get_shape();
             for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) {
                 ov::Tensor single_image{
-                    ov::element::u8,
+                    reshaped_image.get_element_type(),
                     {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)},
                     reshaped_image.data<uint8_t>() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3)
                 };
diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py
index 18eef72b23..df8fce094d 100644
--- a/src/python/openvino_genai/__init__.py
+++ b/src/python/openvino_genai/__init__.py
@@ -11,34 +11,69 @@
 if hasattr(os, "add_dll_directory"):
     os.add_dll_directory(os.path.dirname(__file__))
 
+
 from .py_openvino_genai import (
-    ContinuousBatchingPipeline,
     DecodedResults,
     EncodedResults,
-    GenerationConfig,
-    GenerationResult,
+    RawPerfMetrics,
+    PerfMetrics,
+    StreamerBase,
+)
+
+# VLM pipeline
+
+from .py_openvino_genai import (
+    VLMPipeline,
+)
+
+# LLM pipeline
+from .py_openvino_genai import (
+    LLMPipeline, 
+    draft_model
+)
+
+# LoRA
+from .py_openvino_genai import (
     Adapter,
-    AdapterConfig,
+    AdapterConfig
+)
+
+# Generation config
+from .py_openvino_genai import (
+    GenerationConfig,
+    StopCriteria
+)
+
+# Tokenizers
+from .py_openvino_genai import (
+    TokenizedInputs,
+    Tokenizer
+)
+
+# Whisper
+from .py_openvino_genai import (
+    WhisperGenerationConfig,
+    WhisperPipeline,
+)
+
+# Image generation
+from .py_openvino_genai import (
     CLIPTextModel,
     CLIPTextModelWithProjection,
     UNet2DConditionModel,
     AutoencoderKL,
-    LLMPipeline, 
-    VLMPipeline,
     Text2ImagePipeline,
-    PerfMetrics,
-    RawPerfMetrics,
-    SchedulerConfig,
     Scheduler,
-    StopCriteria,
-    StreamerBase,
-    TokenizedInputs,
-    Tokenizer,
-    WhisperGenerationConfig,
-    WhisperPipeline,
-    CacheEvictionConfig,
-    AggregationMode,
+    ImageGenerationConfig,
     Generator,
     CppStdGenerator,
-    draft_model
+)
+
+# Continuous batching
+from .py_openvino_genai import (
+    ContinuousBatchingPipeline,
+    GenerationResult,
+    SchedulerConfig,
+    CacheEvictionConfig,
+    AggregationMode,
 )
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index 16a66cd84d..e535e9f2d8 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -190,6 +190,17 @@ void init_image_generation_pipelines(py::module_& m) {
         .def("next", &ov::genai::CppStdGenerator::next)
         .def("randn_tensor", &ov::genai::CppStdGenerator::randn_tensor);
 
+    auto image_generation_scheduler = py::class_<ov::genai::Scheduler, std::shared_ptr<ov::genai::Scheduler>>(m, "Scheduler", "Scheduler for image generation pipelines.")
+        .def("from_config", &ov::genai::Scheduler::from_config);
+
+    py::enum_<ov::genai::Scheduler::Type>(image_generation_scheduler, "Type")
+        .value("AUTO", ov::genai::Scheduler::Type::AUTO)
+        .value("LCM", ov::genai::Scheduler::Type::LCM)
+        .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE)
+        .value("DDIM", ov::genai::Scheduler::Type::DDIM)
+        .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE)
+        .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE);
+
     py::class_<ov::genai::ImageGenerationConfig>(m, "ImageGenerationConfig", "This class is used for storing generation config for image generation pipeline.")
         .def(py::init<>())
         .def_readwrite("prompt_2", &ov::genai::ImageGenerationConfig::prompt_2)
@@ -274,15 +285,4 @@ void init_image_generation_pipelines(py::module_& m) {
             py::arg("prompt"), "Input string",
             (text2image_generate_docstring + std::string(" \n ")).c_str()
         );
-
-    auto image_generation_scheduler = py::class_<ov::genai::Scheduler, std::shared_ptr<ov::genai::Scheduler>>(m, "Scheduler", "Scheduler for image generation pipelines.")
-        .def("from_config", &ov::genai::Scheduler::from_config);
-
-    py::enum_<ov::genai::Scheduler::Type>(image_generation_scheduler, "Type")
-        .value("AUTO", ov::genai::Scheduler::Type::AUTO)
-        .value("LCM", ov::genai::Scheduler::Type::LCM)
-        .value("LMS_DISCRETE", ov::genai::Scheduler::Type::LMS_DISCRETE)
-        .value("DDIM", ov::genai::Scheduler::Type::DDIM)
-        .value("EULER_DISCRETE", ov::genai::Scheduler::Type::EULER_DISCRETE)
-        .value("FLOW_MATCH_EULER_DISCRETE", ov::genai::Scheduler::Type::FLOW_MATCH_EULER_DISCRETE);
 }